aboutsummaryrefslogtreecommitdiffstats
path: root/yql/essentials/sql/v1/lexer/regex/generic.cpp
diff options
context:
space:
mode:
authorvityaman <vityaman.dev@yandex.ru>2025-05-19 11:17:12 +0300
committerrobot-piglet <robot-piglet@yandex-team.com>2025-05-19 11:31:23 +0300
commit50dbbb6a1e90cf9d1da40a92d563b02712b00b9e (patch)
treec9c2952f8521851540e08338d093f2067a68fdb4 /yql/essentials/sql/v1/lexer/regex/generic.cpp
parent511e56c14b85e20b29e77f9da53d5bb29a3e996c (diff)
downloadydb-50dbbb6a1e90cf9d1da40a92d563b02712b00b9e.tar.gz
YQL-19616: Fix TRegexLexer performance
Fix `TRegexLexer` performance. Now it is just 2 times slower than a reference ANTLR implementation on Release mode, so merged regexes are 3 times better than scan&compare. ![image](https://github.com/user-attachments/assets/4e0cb27a-491d-4dbd-b10a-5725ffa6d902) --- - Related to `YQL-19616` - Related to https://github.com/ydb-platform/ydb/issues/15129 - Related to https://github.com/vityaman/ydb/issues/42 --- Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1278 commit_hash:1529f641172fea13f0d33fbfd06a4827c6efde01
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex/generic.cpp')
-rw-r--r--yql/essentials/sql/v1/lexer/regex/generic.cpp48
1 files changed, 39 insertions, 9 deletions
diff --git a/yql/essentials/sql/v1/lexer/regex/generic.cpp b/yql/essentials/sql/v1/lexer/regex/generic.cpp
index 2a451b4ef5c..83ad5b4155d 100644
--- a/yql/essentials/sql/v1/lexer/regex/generic.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/generic.cpp
@@ -2,6 +2,8 @@
#include <contrib/libs/re2/re2/re2.h>
+#include <util/string/builder.h>
+
namespace NSQLTranslationV1 {
namespace {
@@ -84,12 +86,9 @@ namespace NSQLTranslationV1 {
}
void Match(TStringBuf prefix, auto onMatch) const {
- for (const auto& token : Grammar_) {
- if (auto content = token.Match(prefix)) {
- onMatch(TGenericToken{
- .Name = token.TokenName,
- .Content = *content,
- });
+ for (const auto& matcher : Grammar_) {
+ if (auto token = matcher(prefix)) {
+ onMatch(std::move(*token));
}
}
}
@@ -97,21 +96,52 @@ namespace NSQLTranslationV1 {
TGenericLexerGrammar Grammar_;
};
- TTokenMatcher Compile(const TRegexPattern& regex) {
+ TTokenMatcher Compile(TString name, const TRegexPattern& regex) {
RE2::Options options;
options.set_case_sensitive(!regex.IsCaseInsensitive);
return [bodyRe = MakeAtomicShared<RE2>(regex.Body, options),
- afterRe = MakeAtomicShared<RE2>(regex.After, options)](TStringBuf prefix) -> TMaybe<TStringBuf> {
+ afterRe = MakeAtomicShared<RE2>(regex.After, options),
+ name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> {
TMaybe<TStringBuf> body, after;
if ((body = Match(prefix, *bodyRe)) &&
(after = Match(prefix.Tail(body->size()), *afterRe))) {
- return body;
+ return TGenericToken{
+ .Name = name,
+ .Content = *body,
+ };
}
return Nothing();
};
}
+ TRegexPattern Merged(TVector<TRegexPattern> patterns) {
+ Y_ENSURE(!patterns.empty());
+
+ const TRegexPattern& sample = patterns.back();
+ Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) {
+ return std::tie(pattern.After, pattern.IsCaseInsensitive) ==
+ std::tie(sample.After, sample.IsCaseInsensitive);
+ }));
+
+ Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) {
+ return lhs.Body.length() > rhs.Body.length();
+ });
+
+ TStringBuilder body;
+ for (const auto& pattern : patterns) {
+ body << "(" << pattern.Body << ")|";
+ }
+ Y_ENSURE(body.back() == '|');
+ body.pop_back();
+
+ return TRegexPattern{
+ .Body = std::move(body),
+ .After = sample.After,
+ .IsCaseInsensitive = sample.IsCaseInsensitive,
+ };
+ }
+
IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) {
return IGenericLexer::TPtr(new TGenericLexer(std::move(grammar)));
}