diff options
| author | vitya-smirnov <[email protected]> | 2025-10-07 09:34:39 +0300 |
|---|---|---|
| committer | vitya-smirnov <[email protected]> | 2025-10-07 09:52:14 +0300 |
| commit | babe7533f18c11be1f8a195ed2324d2d9a89436a (patch) | |
| tree | 45b7627141bf5a52b45a3d61fd1fbdd564bb8dd9 /yql/essentials/sql/v1/lexer/regex/generic.cpp | |
| parent | 8fe7cfe254fde2772477a8933a163b5f303716b4 (diff) | |
YQL-20086 sql/v1
commit_hash:55bc611cdaa0d8a0fc3c4c7708ed9f17cc4976cf
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex/generic.cpp')
| -rw-r--r-- | yql/essentials/sql/v1/lexer/regex/generic.cpp | 282 |
1 files changed, 141 insertions, 141 deletions
diff --git a/yql/essentials/sql/v1/lexer/regex/generic.cpp b/yql/essentials/sql/v1/lexer/regex/generic.cpp index 50b2d78cf77..c27eec99b28 100644 --- a/yql/essentials/sql/v1/lexer/regex/generic.cpp +++ b/yql/essentials/sql/v1/lexer/regex/generic.cpp @@ -6,176 +6,176 @@ namespace NSQLTranslationV1 { - namespace { +namespace { - TMaybe<TStringBuf> Match(TStringBuf prefix, const RE2& regex) { - re2::StringPiece input(prefix.data(), prefix.size()); - if (RE2::Consume(&input, regex)) { - return TStringBuf(prefix.data(), input.data()); - } - return Nothing(); - } - - } // namespace - - class TGenericLexer: public IGenericLexer { - private: - static constexpr TStringBuf Utf8BOM = "\xEF\xBB\xBF"; - - public: - explicit TGenericLexer(TGenericLexerGrammar grammar) - : Grammar_(std::move(grammar)) - { - } - - virtual bool Tokenize( - TStringBuf text, - const TTokenCallback& onNext, - size_t maxErrors) const override { - Y_ENSURE(0 < maxErrors); - size_t errors = 0; +TMaybe<TStringBuf> Match(TStringBuf prefix, const RE2& regex) { + re2::StringPiece input(prefix.data(), prefix.size()); + if (RE2::Consume(&input, regex)) { + return TStringBuf(prefix.data(), input.data()); + } + return Nothing(); +} - size_t pos = 0; - if (text.StartsWith(Utf8BOM)) { - pos += Utf8BOM.size(); - } +} // namespace - while (pos < text.size() && errors < maxErrors) { - TMaybe<TGenericToken> prev; - TGenericToken next = Match(TStringBuf(text, pos)); +class TGenericLexer: public IGenericLexer { +private: + static constexpr TStringBuf Utf8BOM = "\xEF\xBB\xBF"; - size_t skipped = next.Begin; - next.Begin = skipped + pos; +public: + explicit TGenericLexer(TGenericLexerGrammar grammar) + : Grammar_(std::move(grammar)) + { + } - if (skipped != 0) { - prev = Match(TStringBuf(text, pos, skipped)); - prev->Begin = pos; - } + virtual bool Tokenize( + TStringBuf text, + const TTokenCallback& onNext, + size_t maxErrors) const override { + Y_ENSURE(0 < maxErrors); + size_t errors = 0; - pos += skipped + next.Content.size(); + size_t pos = 0; + if (text.StartsWith(Utf8BOM)) { + pos += Utf8BOM.size(); + } - if (next.Name == TGenericToken::Error) { - errors += 1; - } + while (pos < text.size() && errors < maxErrors) { + TMaybe<TGenericToken> prev; + TGenericToken next = Match(TStringBuf(text, pos)); - if (prev) { - onNext(std::move(*prev)); - } - onNext(std::move(next)); - } + size_t skipped = next.Begin; + next.Begin = skipped + pos; - if (errors == maxErrors) { - return false; + if (skipped != 0) { + prev = Match(TStringBuf(text, pos, skipped)); + prev->Begin = pos; } - onNext(TGenericToken{ - .Name = "EOF", - .Content = "<EOF>", - .Begin = pos, - }); - - return errors == 0; - } + pos += skipped + next.Content.size(); - private: - TGenericToken Match(TStringBuf prefix) const { - TMaybe<TGenericToken> max; - Match(prefix, [&](TGenericToken&& token) { - if (max.Empty() || max->Content.size() < token.Content.size()) { - max = std::move(token); - } - }); - - if (max) { - return *max; + if (next.Name == TGenericToken::Error) { + errors += 1; } - return { - .Name = TGenericToken::Error, - .Content = prefix.substr(0, 1), - }; + if (prev) { + onNext(std::move(*prev)); + } + onNext(std::move(next)); } - void Match(TStringBuf prefix, auto onMatch) const { - for (const auto& matcher : Grammar_) { - if (auto token = matcher(prefix)) { - onMatch(std::move(*token)); - } - } + if (errors == maxErrors) { + return false; } - TGenericLexerGrammar Grammar_; - }; + onNext(TGenericToken{ + .Name = "EOF", + .Content = "<EOF>", + .Begin = pos, + }); - TTokenMatcher Compile(TString name, const TRegexPattern& regex) { - RE2::Options options; - options.set_case_sensitive(!regex.IsCaseInsensitive); - - return [beforeRe = MakeAtomicShared<RE2>(regex.Before, options), - bodyRe = MakeAtomicShared<RE2>(regex.Body, options), - afterRe = MakeAtomicShared<RE2>(regex.After, options), - name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> { - TMaybe<TStringBuf> before, body, after; - if ((before = Match(prefix, *beforeRe)) && - (body = Match(prefix.Tail(before->size()), *bodyRe)) && - (after = Match(prefix.Tail(before->size() + body->size()), *afterRe))) { - return TGenericToken{ - .Name = name, - .Content = *body, - .Begin = before->size(), - }; - } - return Nothing(); - }; + return errors == 0; } - TRegexPattern Merged(TVector<TRegexPattern> patterns) { - Y_ENSURE(!patterns.empty()); - - const TRegexPattern& sample = patterns.back(); - Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) { - return std::tie(pattern.After, pattern.Before, pattern.IsCaseInsensitive) == - std::tie(sample.After, sample.Before, sample.IsCaseInsensitive); - })); - - Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) { - const auto lhs_length = lhs.Body.length(); - const auto rhs_length = rhs.Body.length(); - - // Note: do not compare After and Before here as they are equal. - return std::tie(lhs_length, lhs.Body) > std::tie(rhs_length, rhs.Body); +private: + TGenericToken Match(TStringBuf prefix) const { + TMaybe<TGenericToken> max; + Match(prefix, [&](TGenericToken&& token) { + if (max.Empty() || max->Content.size() < token.Content.size()) { + max = std::move(token); + } }); - TStringBuilder body; - for (const auto& pattern : patterns) { - TString regex = pattern.Body; - if (pattern.Body.Contains('|')) { - regex.prepend('('); - regex.append(')'); - } - body << regex << "|"; + if (max) { + return *max; } - Y_ENSURE(body.back() == '|'); - body.pop_back(); - - return TRegexPattern{ - .Body = std::move(body), - .After = sample.After, - .Before = sample.Before, - .IsCaseInsensitive = sample.IsCaseInsensitive, + + return { + .Name = TGenericToken::Error, + .Content = prefix.substr(0, 1), }; } - IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) { - return IGenericLexer::TPtr(new TGenericLexer(std::move(grammar))); + void Match(TStringBuf prefix, auto onMatch) const { + for (const auto& matcher : Grammar_) { + if (auto token = matcher(prefix)) { + onMatch(std::move(*token)); + } + } } - TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text) { - TVector<TGenericToken> tokens; - lexer->Tokenize(text, [&](TGenericToken&& token) { - tokens.emplace_back(std::move(token)); - }); - return tokens; + TGenericLexerGrammar Grammar_; +}; + +TTokenMatcher Compile(TString name, const TRegexPattern& regex) { + RE2::Options options; + options.set_case_sensitive(!regex.IsCaseInsensitive); + + return [beforeRe = MakeAtomicShared<RE2>(regex.Before, options), + bodyRe = MakeAtomicShared<RE2>(regex.Body, options), + afterRe = MakeAtomicShared<RE2>(regex.After, options), + name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> { + TMaybe<TStringBuf> before, body, after; + if ((before = Match(prefix, *beforeRe)) && + (body = Match(prefix.Tail(before->size()), *bodyRe)) && + (after = Match(prefix.Tail(before->size() + body->size()), *afterRe))) { + return TGenericToken{ + .Name = name, + .Content = *body, + .Begin = before->size(), + }; + } + return Nothing(); + }; +} + +TRegexPattern Merged(TVector<TRegexPattern> patterns) { + Y_ENSURE(!patterns.empty()); + + const TRegexPattern& sample = patterns.back(); + Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) { + return std::tie(pattern.After, pattern.Before, pattern.IsCaseInsensitive) == + std::tie(sample.After, sample.Before, sample.IsCaseInsensitive); + })); + + Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) { + const auto lhs_length = lhs.Body.length(); + const auto rhs_length = rhs.Body.length(); + + // Note: do not compare After and Before here as they are equal. + return std::tie(lhs_length, lhs.Body) > std::tie(rhs_length, rhs.Body); + }); + + TStringBuilder body; + for (const auto& pattern : patterns) { + TString regex = pattern.Body; + if (pattern.Body.Contains('|')) { + regex.prepend('('); + regex.append(')'); + } + body << regex << "|"; } + Y_ENSURE(body.back() == '|'); + body.pop_back(); + + return TRegexPattern{ + .Body = std::move(body), + .After = sample.After, + .Before = sample.Before, + .IsCaseInsensitive = sample.IsCaseInsensitive, + }; +} + +IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) { + return IGenericLexer::TPtr(new TGenericLexer(std::move(grammar))); +} + +TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text) { + TVector<TGenericToken> tokens; + lexer->Tokenize(text, [&](TGenericToken&& token) { + tokens.emplace_back(std::move(token)); + }); + return tokens; +} } // namespace NSQLTranslationV1 |
