Improved the string literal parser: \xHH now works as expected (it matches exactly two hex digits), and \uHHHH and \UHHHHHHHH are now supported (16-bit and 32-bit Unicode characters).

This commit is contained in:
River Tarnell 2008-08-10 12:53:21 +00:00
parent 9911042c66
commit 10e89fda5c
2 changed files with 65 additions and 1 deletions

View file

@ -12,6 +12,32 @@
#ifndef AST_H
#define AST_H
/*
 * NOTE(review): this unnamed namespace lives in a header (see the AST_H
 * include guard), so every translation unit gets its own copy of the helper.
 */
namespace {

/**
 * Decode a fixed-width run of hexadecimal digits.
 *
 * Reads exactly @p ndigits characters starting at @p str and accumulates
 * them as a base-16 number, most significant digit first.  Upper- and
 * lower-case digits are both accepted.  A character that is not a hex
 * digit contributes zero to its position (it is not reported as an error).
 *
 * @param str      first character to decode; the caller must guarantee that
 *                 at least @p ndigits characters are readable.
 * @param ndigits  number of characters to consume; zero or a negative
 *                 count yields 0.
 * @return         the decoded value.  Eight-digit inputs with the top bit
 *                 set come back as the corresponding negative int.
 */
template<typename charT>
int
hex2int(charT const *str, int ndigits)
{
	/*
	 * Accumulate in an unsigned type: an eight-digit value such as
	 * "ffffffff" would overflow a signed int, which is undefined.
	 */
	unsigned int value = 0;

	for (; ndigits > 0; --ndigits, ++str) {
		charT const c = *str;

		value *= 0x10u;
		if (c >= 'a' && c <= 'f')
			value += 10u + static_cast<unsigned int>(c - 'a');
		else if (c >= 'A' && c <= 'F')
			value += 10u + static_cast<unsigned int>(c - 'A');
		else if (c >= '0' && c <= '9')
			value += static_cast<unsigned int>(c - '0');
	}

	return static_cast<int>(value);
}

}
namespace afp {
template<typename T> struct parser_grammar;
@ -234,6 +260,27 @@ ast_evaluator<charT, iterator>::ast_eval_string(basic_fray<charT> const &s)
case 'v':
ret.push_back('\v');
break;
case 'x':
if (i + 3 >= end)
break;
ret.push_back(hex2int(s.data() + i + 2, 2));
i += 2;
break;
case 'u':
if (i + 5 >= end)
break;
ret.push_back(hex2int(s.data() + i + 2, 4));
i += 4;
break;
case 'U':
if (i + 9 >= end)
break;
ret.push_back(hex2int(s.data() + i + 2, 8));
i += 8;
break;
default:
ret.push_back(s[i + 1]);
break;

View file

@ -22,6 +22,7 @@
#include <boost/spirit/core.hpp>
#include <boost/spirit/utility/confix.hpp>
#include <boost/spirit/utility/chset.hpp>
#include <boost/spirit/utility/loops.hpp>
#include <boost/spirit/tree/ast.hpp>
#include <boost/spirit/tree/tree_to_xml.hpp>
#include <boost/spirit/symbols.hpp>
@ -211,13 +212,28 @@ struct parser_grammar : public grammar<parser_grammar<charT> >
| string
;
hexchar = chset<>("a-fA-F0-9")
;
octchar = chset<>("0-7")
;
c_string_char =
"\\x" >> hexchar >> hexchar
| "\\u" >> repeat_p(4)[hexchar]
| "\\U" >> repeat_p(8)[hexchar]
| "\\o" >> octchar >> octchar >> octchar
| "\\" >> anychar_p - (ch_p('x') | 'u' | 'o')
| anychar_p - (ch_p('"') | '\\')
;
/*
* confix_p can't be used here, because it will rewrite
* *(c_escape_ch_p[x]) into (*c_escape_ch_p)[x]
*/
string = inner_node_d[
'"'
>> leaf_node_d[ *(lex_escape_ch_p - '"') ]
>> leaf_node_d[ *(c_string_char) ]
>> '"'
]
;
@ -341,6 +357,7 @@ struct parser_grammar : public grammar<parser_grammar<charT> >
return tern_expr;
}
rule<ScannerT> c_string_char, hexchar, octchar;
rule<ScannerT, parser_context<>, parser_tag<id_value> > value;
rule<ScannerT, parser_context<>, parser_tag<id_variable> > variable;
rule<ScannerT, parser_context<>, parser_tag<id_basic> > basic;