aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/unicode/set/unicode_set_parser.cpp
blob: 754c7add41cd691d87489ff02129979391e77d7f (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#include "quoted_pair.h"
#include "unicode_set_lexer.h"

#include <util/string/cast.h>
#include <util/charset/wide.h>

namespace NUnicode {
    namespace NPrivate {
// Throws a yexception whose message names the token most recently returned by
// the variable `lexer`, which must be in scope at the point of use.
// Use as a statement: `UNEXPECTED_TOKEN;`
#define UNEXPECTED_TOKEN throw yexception() << "Unexpected token: " << lexer.GetLastToken()

// Reads the next token from the in-scope `lexer` and throws (via
// UNEXPECTED_TOKEN) unless it equals `type`.
// The do { } while (0) wrapper makes `EXPECT_TOKEN(x);` a single statement, so
// it stays correct inside an unbraced if/else; the argument is parenthesized
// so that arbitrary expressions can be passed safely.
#define EXPECT_TOKEN(type)                \
    do {                                  \
        if (lexer.GetToken() != (type)) { \
            UNEXPECTED_TOKEN;             \
        }                                 \
    } while (0)

        // Forward declaration: ParseCharSequence (below) calls this overload to
        // parse nested bracketed sets, and this overload in turn calls
        // ParseCharSequence, so one of the two has to be declared up front.
        void ParseUnicodeSet(TUnicodeSet& set, TUnicodeSetLexer& lexer);

        // Parses the items of a bracketed set up to (but not including) the
        // closing bracket and adds them to `set`.
        //
        // Items handled: single symbols, a literal '^', quoted pairs, hex code
        // point escapes, ranges written as <char> <range token> <char>, and
        // nested bracketed sets. The closing USTT_RBRACKET is pushed back to
        // the lexer so that the caller can consume it.
        //
        // Throws yexception on:
        //  - a range token that has no single character before it, that
        //    directly follows another range token, or that is not followed by
        //    a single character;
        //  - a range whose start is not strictly less than its end;
        //  - a code point escape >= TUnicodeSet::CODEPOINT_HIGH;
        //  - any other token (including end of input before the closing bracket).
        void ParseCharSequence(TUnicodeSet& set, TUnicodeSetLexer& lexer) {
            wchar32 prevChar = 0;
            // Presence is tracked separately from the value so that code point
            // zero is a usable character instead of doubling as "no character".
            bool hasPrevChar = false;
            bool range = false;
            for (EUnicodeSetTokenType type = lexer.GetToken(); type != USTT_RBRACKET; type = lexer.GetToken()) {
                wchar32 curChar = 0;
                // Set only for hex escapes, where a value of zero is an explicit request.
                bool explicitCodepoint = false;
                switch (type) {
                    case USTT_SYMBOL:
                        curChar = lexer.GetLastToken().Symbol;
                        break;
                    case USTT_NEGATION:
                        // Negation is only meaningful right after '['; here it is a literal.
                        curChar = '^';
                        break;
                    case USTT_QUOTED_PAIR:
                        // NOTE(review): presumably either stores a single character in curChar
                        // or adds a character class to `set` and leaves curChar at zero -
                        // confirm against quoted_pair.h.
                        ResolveUnicodeQuotedPair(lexer.GetLastToken().Symbol, curChar, set);
                        break;
                    case USTT_CODEPOINT8:
                    case USTT_CODEPOINT16:
                    case USTT_CODEPOINT32:
                        curChar = IntFromString<ui32, 16>(lexer.GetLastToken().Data);
                        if (curChar >= TUnicodeSet::CODEPOINT_HIGH) {
                            throw yexception() << "Invalid unicode codepoint: " << lexer.GetLastToken();
                        }
                        explicitCodepoint = true;
                        break;
                    case USTT_RANGE:
                        // A range needs a single character on its left, and a second
                        // range token in a row (e.g. "a--b") is malformed.
                        if (!hasPrevChar || range) {
                            UNEXPECTED_TOKEN;
                        }
                        range = true;
                        continue;
                    case USTT_LBRACKET: {
                        // Nested set: hand the bracket back and parse it recursively.
                        lexer.PushBack();
                        TUnicodeSet inner;
                        ParseUnicodeSet(inner, lexer);
                        set.Add(inner);
                        break;
                    }
                    default:
                        UNEXPECTED_TOKEN;
                }

                const bool hasCurChar = explicitCodepoint || 0 != curChar;
                if (hasCurChar) {
                    if (range) {
                        if (prevChar >= curChar) {
                            throw yexception() << "Invalid character range";
                        }
                        set.Add(prevChar, curChar);
                        // The end of a range cannot start another one ("a-c-e" is rejected).
                        hasPrevChar = false;
                    } else {
                        set.Add(curChar);
                        prevChar = curChar;
                        hasPrevChar = true;
                    }
                } else {
                    // The item was not a single character, so it cannot close a range
                    // and cannot start one either.
                    if (range) {
                        UNEXPECTED_TOKEN;
                    }
                    hasPrevChar = false;
                }
                range = false;
            }
            // A range token immediately before the closing bracket is dangling.
            if (range) {
                UNEXPECTED_TOKEN;
            }
            lexer.PushBack();
        }

        // Parses one bracketed set expression from `lexer` into `set`:
        //   '[' [negation] ( category | character sequence ) ']'
        // When the negation token directly follows the opening bracket,
        // set.Invert() is called after the contents have been added.
        // Throws yexception (through EXPECT_TOKEN) if either bracket is missing.
        void ParseUnicodeSet(TUnicodeSet& set, TUnicodeSetLexer& lexer) {
            EXPECT_TOKEN(USTT_LBRACKET);

            // Optional negation; any other token is handed back to the lexer.
            const bool negated = (lexer.GetToken() == USTT_NEGATION);
            if (!negated) {
                lexer.PushBack();
            }

            // The body is either a single category token or a character sequence.
            const bool isCategory = (lexer.GetToken() == USTT_CATEGORY);
            if (isCategory) {
                set.AddCategory(WideToUTF8(lexer.GetLastToken().Data));
            } else {
                lexer.PushBack();
                ParseCharSequence(set, lexer);
            }

            EXPECT_TOKEN(USTT_RBRACKET);

            if (negated) {
                set.Invert();
            }
        }

        // Parses the set pattern held in `data` and applies it to `set`.
        // Throws yexception if the pattern is malformed or if any token other
        // than end of input follows the bracketed expression.
        void ParseUnicodeSet(TUnicodeSet& set, const TWtringBuf& data) {
            // UNEXPECTED_TOKEN refers to this variable by the name `lexer`.
            TUnicodeSetLexer lexer(data);
            ParseUnicodeSet(set, lexer);

            // Nothing may follow the closing bracket.
            if (lexer.GetToken() != USTT_EOS) {
                UNEXPECTED_TOKEN;
            }
        }

    } // NPrivate
}