summary | refs | log | tree | commit | diff | stats
path: root/yql/essentials/sql/v1/lexer/regex/generic.cpp
diff options
context:
space:
mode:
author: vitya-smirnov <[email protected]> 2025-10-07 09:34:39 +0300
committer: vitya-smirnov <[email protected]> 2025-10-07 09:52:14 +0300
commit: babe7533f18c11be1f8a195ed2324d2d9a89436a (patch)
tree: 45b7627141bf5a52b45a3d61fd1fbdd564bb8dd9 /yql/essentials/sql/v1/lexer/regex/generic.cpp
parent: 8fe7cfe254fde2772477a8933a163b5f303716b4 (diff)
YQL-20086 sql/v1
commit_hash:55bc611cdaa0d8a0fc3c4c7708ed9f17cc4976cf
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex/generic.cpp')
-rw-r--r-- yql/essentials/sql/v1/lexer/regex/generic.cpp 282
1 file changed, 141 insertions, 141 deletions
diff --git a/yql/essentials/sql/v1/lexer/regex/generic.cpp b/yql/essentials/sql/v1/lexer/regex/generic.cpp
index 50b2d78cf77..c27eec99b28 100644
--- a/yql/essentials/sql/v1/lexer/regex/generic.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/generic.cpp
@@ -6,176 +6,176 @@
namespace NSQLTranslationV1 {
- namespace {
+namespace {
- TMaybe<TStringBuf> Match(TStringBuf prefix, const RE2& regex) {
- re2::StringPiece input(prefix.data(), prefix.size());
- if (RE2::Consume(&input, regex)) {
- return TStringBuf(prefix.data(), input.data());
- }
- return Nothing();
- }
-
- } // namespace
-
- class TGenericLexer: public IGenericLexer {
- private:
- static constexpr TStringBuf Utf8BOM = "\xEF\xBB\xBF";
-
- public:
- explicit TGenericLexer(TGenericLexerGrammar grammar)
- : Grammar_(std::move(grammar))
- {
- }
-
- virtual bool Tokenize(
- TStringBuf text,
- const TTokenCallback& onNext,
- size_t maxErrors) const override {
- Y_ENSURE(0 < maxErrors);
- size_t errors = 0;
+TMaybe<TStringBuf> Match(TStringBuf prefix, const RE2& regex) {
+ re2::StringPiece input(prefix.data(), prefix.size());
+ if (RE2::Consume(&input, regex)) {
+ return TStringBuf(prefix.data(), input.data());
+ }
+ return Nothing();
+}
- size_t pos = 0;
- if (text.StartsWith(Utf8BOM)) {
- pos += Utf8BOM.size();
- }
+} // namespace
- while (pos < text.size() && errors < maxErrors) {
- TMaybe<TGenericToken> prev;
- TGenericToken next = Match(TStringBuf(text, pos));
+class TGenericLexer: public IGenericLexer {
+private:
+ static constexpr TStringBuf Utf8BOM = "\xEF\xBB\xBF";
- size_t skipped = next.Begin;
- next.Begin = skipped + pos;
+public:
+ explicit TGenericLexer(TGenericLexerGrammar grammar)
+ : Grammar_(std::move(grammar))
+ {
+ }
- if (skipped != 0) {
- prev = Match(TStringBuf(text, pos, skipped));
- prev->Begin = pos;
- }
+ virtual bool Tokenize(
+ TStringBuf text,
+ const TTokenCallback& onNext,
+ size_t maxErrors) const override {
+ Y_ENSURE(0 < maxErrors);
+ size_t errors = 0;
- pos += skipped + next.Content.size();
+ size_t pos = 0;
+ if (text.StartsWith(Utf8BOM)) {
+ pos += Utf8BOM.size();
+ }
- if (next.Name == TGenericToken::Error) {
- errors += 1;
- }
+ while (pos < text.size() && errors < maxErrors) {
+ TMaybe<TGenericToken> prev;
+ TGenericToken next = Match(TStringBuf(text, pos));
- if (prev) {
- onNext(std::move(*prev));
- }
- onNext(std::move(next));
- }
+ size_t skipped = next.Begin;
+ next.Begin = skipped + pos;
- if (errors == maxErrors) {
- return false;
+ if (skipped != 0) {
+ prev = Match(TStringBuf(text, pos, skipped));
+ prev->Begin = pos;
}
- onNext(TGenericToken{
- .Name = "EOF",
- .Content = "<EOF>",
- .Begin = pos,
- });
-
- return errors == 0;
- }
+ pos += skipped + next.Content.size();
- private:
- TGenericToken Match(TStringBuf prefix) const {
- TMaybe<TGenericToken> max;
- Match(prefix, [&](TGenericToken&& token) {
- if (max.Empty() || max->Content.size() < token.Content.size()) {
- max = std::move(token);
- }
- });
-
- if (max) {
- return *max;
+ if (next.Name == TGenericToken::Error) {
+ errors += 1;
}
- return {
- .Name = TGenericToken::Error,
- .Content = prefix.substr(0, 1),
- };
+ if (prev) {
+ onNext(std::move(*prev));
+ }
+ onNext(std::move(next));
}
- void Match(TStringBuf prefix, auto onMatch) const {
- for (const auto& matcher : Grammar_) {
- if (auto token = matcher(prefix)) {
- onMatch(std::move(*token));
- }
- }
+ if (errors == maxErrors) {
+ return false;
}
- TGenericLexerGrammar Grammar_;
- };
+ onNext(TGenericToken{
+ .Name = "EOF",
+ .Content = "<EOF>",
+ .Begin = pos,
+ });
- TTokenMatcher Compile(TString name, const TRegexPattern& regex) {
- RE2::Options options;
- options.set_case_sensitive(!regex.IsCaseInsensitive);
-
- return [beforeRe = MakeAtomicShared<RE2>(regex.Before, options),
- bodyRe = MakeAtomicShared<RE2>(regex.Body, options),
- afterRe = MakeAtomicShared<RE2>(regex.After, options),
- name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> {
- TMaybe<TStringBuf> before, body, after;
- if ((before = Match(prefix, *beforeRe)) &&
- (body = Match(prefix.Tail(before->size()), *bodyRe)) &&
- (after = Match(prefix.Tail(before->size() + body->size()), *afterRe))) {
- return TGenericToken{
- .Name = name,
- .Content = *body,
- .Begin = before->size(),
- };
- }
- return Nothing();
- };
+ return errors == 0;
}
- TRegexPattern Merged(TVector<TRegexPattern> patterns) {
- Y_ENSURE(!patterns.empty());
-
- const TRegexPattern& sample = patterns.back();
- Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) {
- return std::tie(pattern.After, pattern.Before, pattern.IsCaseInsensitive) ==
- std::tie(sample.After, sample.Before, sample.IsCaseInsensitive);
- }));
-
- Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) {
- const auto lhs_length = lhs.Body.length();
- const auto rhs_length = rhs.Body.length();
-
- // Note: do not compare After and Before here as they are equal.
- return std::tie(lhs_length, lhs.Body) > std::tie(rhs_length, rhs.Body);
+private:
+ TGenericToken Match(TStringBuf prefix) const {
+ TMaybe<TGenericToken> max;
+ Match(prefix, [&](TGenericToken&& token) {
+ if (max.Empty() || max->Content.size() < token.Content.size()) {
+ max = std::move(token);
+ }
});
- TStringBuilder body;
- for (const auto& pattern : patterns) {
- TString regex = pattern.Body;
- if (pattern.Body.Contains('|')) {
- regex.prepend('(');
- regex.append(')');
- }
- body << regex << "|";
+ if (max) {
+ return *max;
}
- Y_ENSURE(body.back() == '|');
- body.pop_back();
-
- return TRegexPattern{
- .Body = std::move(body),
- .After = sample.After,
- .Before = sample.Before,
- .IsCaseInsensitive = sample.IsCaseInsensitive,
+
+ return {
+ .Name = TGenericToken::Error,
+ .Content = prefix.substr(0, 1),
};
}
- IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) {
- return IGenericLexer::TPtr(new TGenericLexer(std::move(grammar)));
+ void Match(TStringBuf prefix, auto onMatch) const {
+ for (const auto& matcher : Grammar_) {
+ if (auto token = matcher(prefix)) {
+ onMatch(std::move(*token));
+ }
+ }
}
- TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text) {
- TVector<TGenericToken> tokens;
- lexer->Tokenize(text, [&](TGenericToken&& token) {
- tokens.emplace_back(std::move(token));
- });
- return tokens;
+ TGenericLexerGrammar Grammar_;
+};
+
+TTokenMatcher Compile(TString name, const TRegexPattern& regex) {
+ RE2::Options options;
+ options.set_case_sensitive(!regex.IsCaseInsensitive);
+
+ return [beforeRe = MakeAtomicShared<RE2>(regex.Before, options),
+ bodyRe = MakeAtomicShared<RE2>(regex.Body, options),
+ afterRe = MakeAtomicShared<RE2>(regex.After, options),
+ name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> {
+ TMaybe<TStringBuf> before, body, after;
+ if ((before = Match(prefix, *beforeRe)) &&
+ (body = Match(prefix.Tail(before->size()), *bodyRe)) &&
+ (after = Match(prefix.Tail(before->size() + body->size()), *afterRe))) {
+ return TGenericToken{
+ .Name = name,
+ .Content = *body,
+ .Begin = before->size(),
+ };
+ }
+ return Nothing();
+ };
+}
+
+TRegexPattern Merged(TVector<TRegexPattern> patterns) {
+ Y_ENSURE(!patterns.empty());
+
+ const TRegexPattern& sample = patterns.back();
+ Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) {
+ return std::tie(pattern.After, pattern.Before, pattern.IsCaseInsensitive) ==
+ std::tie(sample.After, sample.Before, sample.IsCaseInsensitive);
+ }));
+
+ Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) {
+ const auto lhs_length = lhs.Body.length();
+ const auto rhs_length = rhs.Body.length();
+
+ // Note: do not compare After and Before here as they are equal.
+ return std::tie(lhs_length, lhs.Body) > std::tie(rhs_length, rhs.Body);
+ });
+
+ TStringBuilder body;
+ for (const auto& pattern : patterns) {
+ TString regex = pattern.Body;
+ if (pattern.Body.Contains('|')) {
+ regex.prepend('(');
+ regex.append(')');
+ }
+ body << regex << "|";
}
+ Y_ENSURE(body.back() == '|');
+ body.pop_back();
+
+ return TRegexPattern{
+ .Body = std::move(body),
+ .After = sample.After,
+ .Before = sample.Before,
+ .IsCaseInsensitive = sample.IsCaseInsensitive,
+ };
+}
+
+IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) {
+ return IGenericLexer::TPtr(new TGenericLexer(std::move(grammar)));
+}
+
+TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text) {
+ TVector<TGenericToken> tokens;
+ lexer->Tokenize(text, [&](TGenericToken&& token) {
+ tokens.emplace_back(std::move(token));
+ });
+ return tokens;
+}
} // namespace NSQLTranslationV1