diff options
author | vityaman <vityaman.dev@yandex.ru> | 2025-05-19 11:17:12 +0300 |
---|---|---|
committer | robot-piglet <robot-piglet@yandex-team.com> | 2025-05-19 11:31:23 +0300 |
commit | 50dbbb6a1e90cf9d1da40a92d563b02712b00b9e (patch) | |
tree | c9c2952f8521851540e08338d093f2067a68fdb4 /yql/essentials/sql/v1/lexer/regex/generic.cpp | |
parent | 511e56c14b85e20b29e77f9da53d5bb29a3e996c (diff) | |
download | ydb-50dbbb6a1e90cf9d1da40a92d563b02712b00b9e.tar.gz |
YQL-19616: Fix TRegexLexer performance
Fix `TRegexLexer` performance. Now it is just 2 times slower than a reference ANTLR implementation on Release mode, so merged regexes are 3 times better than scan&compare.

---
- Related to `YQL-19616`
- Related to https://github.com/ydb-platform/ydb/issues/15129
- Related to https://github.com/vityaman/ydb/issues/42
---
Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1278
commit_hash:1529f641172fea13f0d33fbfd06a4827c6efde01
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex/generic.cpp')
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/generic.cpp | 48 |
1 files changed, 39 insertions, 9 deletions
diff --git a/yql/essentials/sql/v1/lexer/regex/generic.cpp b/yql/essentials/sql/v1/lexer/regex/generic.cpp index 2a451b4ef5c..83ad5b4155d 100644 --- a/yql/essentials/sql/v1/lexer/regex/generic.cpp +++ b/yql/essentials/sql/v1/lexer/regex/generic.cpp @@ -2,6 +2,8 @@ #include <contrib/libs/re2/re2/re2.h> +#include <util/string/builder.h> + namespace NSQLTranslationV1 { namespace { @@ -84,12 +86,9 @@ namespace NSQLTranslationV1 { } void Match(TStringBuf prefix, auto onMatch) const { - for (const auto& token : Grammar_) { - if (auto content = token.Match(prefix)) { - onMatch(TGenericToken{ - .Name = token.TokenName, - .Content = *content, - }); + for (const auto& matcher : Grammar_) { + if (auto token = matcher(prefix)) { + onMatch(std::move(*token)); } } } @@ -97,21 +96,52 @@ namespace NSQLTranslationV1 { TGenericLexerGrammar Grammar_; }; - TTokenMatcher Compile(const TRegexPattern& regex) { + TTokenMatcher Compile(TString name, const TRegexPattern& regex) { RE2::Options options; options.set_case_sensitive(!regex.IsCaseInsensitive); return [bodyRe = MakeAtomicShared<RE2>(regex.Body, options), - afterRe = MakeAtomicShared<RE2>(regex.After, options)](TStringBuf prefix) -> TMaybe<TStringBuf> { + afterRe = MakeAtomicShared<RE2>(regex.After, options), + name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> { TMaybe<TStringBuf> body, after; if ((body = Match(prefix, *bodyRe)) && (after = Match(prefix.Tail(body->size()), *afterRe))) { - return body; + return TGenericToken{ + .Name = name, + .Content = *body, + }; } return Nothing(); }; } + TRegexPattern Merged(TVector<TRegexPattern> patterns) { + Y_ENSURE(!patterns.empty()); + + const TRegexPattern& sample = patterns.back(); + Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) { + return std::tie(pattern.After, pattern.IsCaseInsensitive) == + std::tie(sample.After, sample.IsCaseInsensitive); + })); + + Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) { + return lhs.Body.length() > rhs.Body.length(); + }); + + TStringBuilder body; + for (const auto& pattern : patterns) { + body << "(" << pattern.Body << ")|"; + } + Y_ENSURE(body.back() == '|'); + body.pop_back(); + + return TRegexPattern{ + .Body = std::move(body), + .After = sample.After, + .IsCaseInsensitive = sample.IsCaseInsensitive, + }; + } + IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) { return IGenericLexer::TPtr(new TGenericLexer(std::move(grammar))); } |