blob: 754c7add41cd691d87489ff02129979391e77d7f (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
|
#include "quoted_pair.h"
#include "unicode_set_lexer.h"
#include <util/string/cast.h>
#include <util/charset/wide.h>
namespace NUnicode {
namespace NPrivate {
// Throws a yexception whose message names the lexer's most recent token.
// Expands to a throw-expression and requires a variable called `lexer`
// to be in scope at the point of use.
#define UNEXPECTED_TOKEN throw yexception() << "Unexpected token: " << lexer.GetLastToken()

// Pulls the next token from `lexer` and raises UNEXPECTED_TOKEN unless it equals `type`.
// The do { } while (false) wrapper makes `EXPECT_TOKEN(x);` exactly one statement, so the
// macro stays correct inside an un-braced if/else; `type` is parenthesised so that any
// expression can be passed safely.
#define EXPECT_TOKEN(type)                \
    do {                                  \
        if (lexer.GetToken() != (type)) { \
            UNEXPECTED_TOKEN;             \
        }                                 \
    } while (false)
// Forward declaration: ParseCharSequence() below calls this overload to parse a nested
// bracketed set, while this overload in turn calls ParseCharSequence().
void ParseUnicodeSet(TUnicodeSet& set, TUnicodeSetLexer& lexer);
// Parses the body of a bracketed set: single characters, ranges ("a-z"), quoted pairs,
// hexadecimal code points and nested bracketed sets, adding everything to `set`.
// Reading stops at the closing bracket, which is pushed back so the caller can consume it.
// Throws yexception on an unexpected token, an out-of-range code point, or a range whose
// start is not strictly less than its end.
void ParseCharSequence(TUnicodeSet& set, TUnicodeSetLexer& lexer) {
    // Last stand-alone character that may start a range; 0 means "none available".
    wchar32 rangeStart = 0;
    // True after a range token has been seen and its right-hand side is still pending.
    bool pendingRange = false;

    while (true) {
        const EUnicodeSetTokenType token = lexer.GetToken();
        if (USTT_RBRACKET == token) {
            break;
        }

        if (USTT_RANGE == token) {
            // A range needs a left-hand character.
            if (!rangeStart) {
                UNEXPECTED_TOKEN;
            }
            pendingRange = true;
            continue;
        }

        // Character produced by this token; stays 0 if the token yields no single character.
        wchar32 symbol = 0;
        switch (token) {
            case USTT_SYMBOL:
                symbol = lexer.GetLastToken().Symbol;
                break;

            case USTT_NEGATION:
                // Here the negation token is taken as a literal caret.
                symbol = '^';
                break;

            case USTT_QUOTED_PAIR:
                // The resolver receives both the output character and the set itself.
                ResolveUnicodeQuotedPair(lexer.GetLastToken().Symbol, symbol, set);
                break;

            case USTT_CODEPOINT8:
            case USTT_CODEPOINT16:
            case USTT_CODEPOINT32:
                symbol = IntFromString<ui32, 16>(lexer.GetLastToken().Data);
                if (symbol >= TUnicodeSet::CODEPOINT_HIGH) {
                    throw yexception() << "Invalid unicode codepoint: " << lexer.GetLastToken();
                }
                break;

            case USTT_LBRACKET: {
                // Nested set: hand the bracket back and parse it as a complete set.
                lexer.PushBack();
                TUnicodeSet nested;
                ParseUnicodeSet(nested, lexer);
                set.Add(nested);
                break;
            }

            default:
                UNEXPECTED_TOKEN;
        }

        if (!symbol) {
            // No single character came out of this token, so it cannot close a range.
            if (pendingRange) {
                UNEXPECTED_TOKEN;
            }
            rangeStart = 0;
        } else if (pendingRange) {
            if (rangeStart >= symbol) {
                throw yexception() << "Invalid character range";
            }
            set.Add(rangeStart, symbol);
            // A finished range cannot serve as the start of another one.
            rangeStart = 0;
        } else {
            set.Add(symbol);
            rangeStart = symbol;
        }
        pendingRange = false;
    }

    // A range token right before the closing bracket has no right-hand side.
    if (pendingRange) {
        UNEXPECTED_TOKEN;
    }
    lexer.PushBack();
}
// Parses one bracketed set from `lexer` into `set`: an opening bracket, an optional
// negation token, then either a single category token or a character sequence, and a
// closing bracket. When the negation token is present, `set` is inverted after parsing.
// Throws yexception if a required bracket is missing.
void ParseUnicodeSet(TUnicodeSet& set, TUnicodeSetLexer& lexer) {
    EXPECT_TOKEN(USTT_LBRACKET);

    // Optional leading negation; any other token is handed back to the lexer.
    const bool negated = (lexer.GetToken() == USTT_NEGATION);
    if (!negated) {
        lexer.PushBack();
    }

    if (lexer.GetToken() != USTT_CATEGORY) {
        // Not a category: return the token and parse an ordinary character sequence.
        lexer.PushBack();
        ParseCharSequence(set, lexer);
    } else {
        set.AddCategory(WideToUTF8(lexer.GetLastToken().Data));
    }

    EXPECT_TOKEN(USTT_RBRACKET);

    if (negated) {
        set.Invert();
    }
}
// Parses the whole of `data` as a single bracketed set into `set`.
// Throws yexception if the set is malformed or if anything other than end-of-stream
// follows the closing bracket.
void ParseUnicodeSet(TUnicodeSet& set, const TWtringBuf& data) {
    TUnicodeSetLexer lexer(data);
    ParseUnicodeSet(set, lexer);

    // Nothing may follow the set.
    if (USTT_EOS != lexer.GetToken()) {
        UNEXPECTED_TOKEN;
    }
}
} // NPrivate
}
|