1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
|
#include "generic.h"
#include <contrib/libs/re2/re2/re2.h>
namespace NSQLTranslationV1 {
namespace {
// Tries to consume `regex` at the very beginning of `prefix`.
// Returns the consumed leading part of `prefix`, or Nothing() when
// RE2::Consume reports no match.
TMaybe<TStringBuf> Match(TStringBuf prefix, const RE2& regex) {
    re2::StringPiece rest(prefix.data(), prefix.size());
    if (!RE2::Consume(&rest, regex)) {
        return Nothing();
    }
    // On success Consume advances `rest` past the matched text, so the
    // match is the range [prefix.data(), rest.data()).
    const char* matchEnd = rest.data();
    return TStringBuf(prefix.data(), matchEnd);
}
} // namespace
class TGenericLexer: public IGenericLexer {
private:
static constexpr TStringBuf Utf8BOM = "\xEF\xBB\xBF";
public:
explicit TGenericLexer(TGenericLexerGrammar grammar)
: Grammar_(std::move(grammar))
{
}
virtual bool Tokenize(
TStringBuf text,
const TTokenCallback& onNext,
size_t maxErrors) const override {
Y_ENSURE(0 < maxErrors);
size_t errors = 0;
size_t pos = 0;
if (text.StartsWith(Utf8BOM)) {
pos += Utf8BOM.size();
}
while (pos < text.size() && errors < maxErrors) {
TGenericToken matched = Match(TStringBuf(text, pos));
matched.Begin = pos;
pos += matched.Content.size();
if (matched.Name == TGenericToken::Error) {
errors += 1;
}
onNext(std::move(matched));
}
if (errors == maxErrors) {
return false;
}
onNext(TGenericToken{
.Name = "EOF",
.Content = "<EOF>",
.Begin = pos,
});
return errors == 0;
}
private:
TGenericToken Match(TStringBuf prefix) const {
TMaybe<TGenericToken> max;
Match(prefix, [&](TGenericToken&& token) {
if (max.Empty() || max->Content.size() < token.Content.size()) {
max = std::move(token);
}
});
if (max) {
return *max;
}
return {
.Name = TGenericToken::Error,
.Content = prefix.substr(0, 1),
};
}
void Match(TStringBuf prefix, auto onMatch) const {
for (const auto& token : Grammar_) {
if (auto content = token.Match(prefix)) {
onMatch(TGenericToken{
.Name = token.TokenName,
.Content = *content,
});
}
}
}
TGenericLexerGrammar Grammar_;
};
// Builds a matcher from `regex`.
// The matcher returns the text consumed by regex.Body at the start of its
// input, but only if regex.After also matches immediately behind that text;
// otherwise it returns Nothing(). Both patterns are compiled with the same
// case-sensitivity option, derived from regex.IsCaseInsensitive.
// Note: the compiled RE2 objects are not checked for compile errors here.
TTokenMatcher Compile(const TRegexPattern& regex) {
    RE2::Options options;
    options.set_case_sensitive(!regex.IsCaseInsensitive);

    auto compiledBody = MakeAtomicShared<RE2>(regex.Body, options);
    auto compiledAfter = MakeAtomicShared<RE2>(regex.After, options);

    return [bodyRe = std::move(compiledBody),
            afterRe = std::move(compiledAfter)](TStringBuf prefix) -> TMaybe<TStringBuf> {
        const TMaybe<TStringBuf> body = Match(prefix, *bodyRe);
        if (!body) {
            return Nothing();
        }

        // The trailing context is only verified; it is not part of the token.
        const TStringBuf remainder = prefix.Tail(body->size());
        if (!Match(remainder, *afterRe)) {
            return Nothing();
        }

        return body;
    };
}
// Factory: allocates a TGenericLexer for `grammar` and hands it out as IGenericLexer::TPtr.
IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) {
    IGenericLexer::TPtr lexer(new TGenericLexer(std::move(grammar)));
    return lexer;
}
// Convenience wrapper: runs lexer->Tokenize over `text` and gathers every
// emitted token into a vector. The lexer's boolean result is ignored.
TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text) {
    TVector<TGenericToken> collected;
    const auto collect = [&collected](TGenericToken&& token) {
        collected.push_back(std::move(token));
    };
    lexer->Tokenize(text, collect);
    return collected;
}
} // namespace NSQLTranslationV1
|