aboutsummaryrefslogtreecommitdiffstats
path: root/yql/essentials/sql/v1/lexer/regex/generic.cpp
blob: 2a451b4ef5c5911ae8775829759470b6e17cc8b4 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#include "generic.h"

#include <contrib/libs/re2/re2/re2.h>

namespace NSQLTranslationV1 {

    namespace {

        // Tries to match `regex` at the very beginning of `prefix`.
        // Returns the matched leading part of `prefix`, or Nothing() when
        // the regex does not match there.
        TMaybe<TStringBuf> Match(TStringBuf prefix, const RE2& regex) {
            re2::StringPiece rest(prefix.data(), prefix.size());
            if (!RE2::Consume(&rest, regex)) {
                return Nothing();
            }

            // Consume() advanced `rest` past the match, so its start is the match end.
            const char* matchEnd = rest.data();
            return TStringBuf(prefix.data(), matchEnd);
        }

    } // namespace

    class TGenericLexer: public IGenericLexer {
    private:
        static constexpr TStringBuf Utf8BOM = "\xEF\xBB\xBF";

    public:
        explicit TGenericLexer(TGenericLexerGrammar grammar)
            : Grammar_(std::move(grammar))
        {
        }

        virtual bool Tokenize(
            TStringBuf text,
            const TTokenCallback& onNext,
            size_t maxErrors) const override {
            Y_ENSURE(0 < maxErrors);
            size_t errors = 0;

            size_t pos = 0;
            if (text.StartsWith(Utf8BOM)) {
                pos += Utf8BOM.size();
            }

            while (pos < text.size() && errors < maxErrors) {
                TGenericToken matched = Match(TStringBuf(text, pos));
                matched.Begin = pos;

                pos += matched.Content.size();

                if (matched.Name == TGenericToken::Error) {
                    errors += 1;
                }

                onNext(std::move(matched));
            }

            if (errors == maxErrors) {
                return false;
            }

            onNext(TGenericToken{
                .Name = "EOF",
                .Content = "<EOF>",
                .Begin = pos,
            });

            return errors == 0;
        }

    private:
        TGenericToken Match(TStringBuf prefix) const {
            TMaybe<TGenericToken> max;
            Match(prefix, [&](TGenericToken&& token) {
                if (max.Empty() || max->Content.size() < token.Content.size()) {
                    max = std::move(token);
                }
            });

            if (max) {
                return *max;
            }

            return {
                .Name = TGenericToken::Error,
                .Content = prefix.substr(0, 1),
            };
        }

        void Match(TStringBuf prefix, auto onMatch) const {
            for (const auto& token : Grammar_) {
                if (auto content = token.Match(prefix)) {
                    onMatch(TGenericToken{
                        .Name = token.TokenName,
                        .Content = *content,
                    });
                }
            }
        }

        TGenericLexerGrammar Grammar_;
    };

    // Builds a matcher from a regex pattern: the matcher accepts a prefix when
    // `Body` matches at its start and `After` matches right after the body.
    // Only the body part is returned; the `After` part is not consumed.
    TTokenMatcher Compile(const TRegexPattern& regex) {
        RE2::Options reOptions;
        reOptions.set_case_sensitive(!regex.IsCaseInsensitive);

        auto bodyRe = MakeAtomicShared<RE2>(regex.Body, reOptions);
        auto afterRe = MakeAtomicShared<RE2>(regex.After, reOptions);

        return [bodyRe = std::move(bodyRe),
                afterRe = std::move(afterRe)](TStringBuf prefix) -> TMaybe<TStringBuf> {
            TMaybe<TStringBuf> body = Match(prefix, *bodyRe);
            if (!body) {
                return Nothing();
            }

            // The text following the body must satisfy the `After` regex.
            const TStringBuf rest = prefix.Tail(body->size());
            if (!Match(rest, *afterRe)) {
                return Nothing();
            }

            return body;
        };
    }

    // Creates a generic lexer that owns the given grammar.
    IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) {
        IGenericLexer::TPtr lexer(new TGenericLexer(std::move(grammar)));
        return lexer;
    }

    // Runs `lexer` over `text` and returns all emitted tokens in order.
    // The boolean result of the lexer is not reported to the caller.
    TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text) {
        TVector<TGenericToken> collected;

        const auto collect = [&collected](TGenericToken&& token) {
            collected.emplace_back(std::move(token));
        };
        lexer->Tokenize(text, collect);

        return collected;
    }

} // namespace NSQLTranslationV1