diff options
author | vityaman <vityaman.dev@yandex.ru> | 2025-05-19 11:17:12 +0300 |
---|---|---|
committer | robot-piglet <robot-piglet@yandex-team.com> | 2025-05-19 11:31:23 +0300 |
commit | 50dbbb6a1e90cf9d1da40a92d563b02712b00b9e (patch) | |
tree | c9c2952f8521851540e08338d093f2067a68fdb4 | |
parent | 511e56c14b85e20b29e77f9da53d5bb29a3e996c (diff) | |
download | ydb-50dbbb6a1e90cf9d1da40a92d563b02712b00b9e.tar.gz |
YQL-19616: Fix TRegexLexer performance
Fix `TRegexLexer` performance. It is now only 2 times slower than the reference ANTLR implementation in Release mode, so merged regexes are 3 times faster than scan&compare.

---
- Related to `YQL-19616`
- Related to https://github.com/ydb-platform/ydb/issues/15129
- Related to https://github.com/vityaman/ydb/issues/42
---
Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1278
commit_hash:1529f641172fea13f0d33fbfd06a4827c6efde01
-rw-r--r-- | yql/essentials/sql/v1/highlight/sql_highlight.cpp | 30 | ||||
-rw-r--r-- | yql/essentials/sql/v1/highlight/sql_highlighter.cpp | 15 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/generic.cpp | 48 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/generic.h | 14 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/lexer.cpp | 95 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/lexer.h | 7 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/ya.make | 1 | ||||
-rw-r--r-- | yql/essentials/sql/v1/reflect/sql_reflect.cpp | 13 | ||||
-rw-r--r-- | yql/essentials/sql/v1/reflect/sql_reflect.h | 3 | ||||
-rw-r--r-- | yql/essentials/sql/v1/reflect/ya.make | 4 |
10 files changed, 144 insertions, 86 deletions
diff --git a/yql/essentials/sql/v1/highlight/sql_highlight.cpp b/yql/essentials/sql/v1/highlight/sql_highlight.cpp index a477ba542f8..e35bb5fb736 100644 --- a/yql/essentials/sql/v1/highlight/sql_highlight.cpp +++ b/yql/essentials/sql/v1/highlight/sql_highlight.cpp @@ -12,35 +12,9 @@ namespace NSQLHighlight { + using NSQLTranslationV1::Merged; using NSQLTranslationV1::TRegexPattern; - TRegexPattern Merged(TVector<TRegexPattern> patterns) { - Y_ENSURE(!patterns.empty()); - - const TRegexPattern& sample = patterns.back(); - Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) { - return std::tie(pattern.After, pattern.IsCaseInsensitive) == - std::tie(sample.After, sample.IsCaseInsensitive); - })); - - Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) { - return lhs.Body.length() > rhs.Body.length(); - }); - - TStringBuilder body; - for (const auto& pattern : patterns) { - body << "(" << pattern.Body << ")|"; - } - Y_ENSURE(body.back() == '|'); - body.pop_back(); - - return TRegexPattern{ - .Body = std::move(body), - .After = sample.After, - .IsCaseInsensitive = sample.IsCaseInsensitive, - }; - } - struct Syntax { const NSQLReflect::TLexerGrammar* Grammar; THashMap<TString, TString> RegexesDefault; @@ -81,7 +55,7 @@ namespace NSQLHighlight { TUnit unit = {.Kind = EUnitKind::Keyword}; for (const auto& keyword : s.Grammar->KeywordNames) { - const TStringBuf content = TLexerGrammar::KeywordBlock(keyword); + const TStringBuf content = TLexerGrammar::KeywordBlockByName(keyword); unit.Patterns.push_back(CaseInsensitive(content)); } diff --git a/yql/essentials/sql/v1/highlight/sql_highlighter.cpp b/yql/essentials/sql/v1/highlight/sql_highlighter.cpp index 23d17277e49..54513b1117f 100644 --- a/yql/essentials/sql/v1/highlight/sql_highlighter.cpp +++ b/yql/essentials/sql/v1/highlight/sql_highlighter.cpp @@ -13,7 +13,6 @@ namespace NSQLHighlight { using NSQLTranslationV1::IGenericLexer; using NSQLTranslationV1::TGenericLexerGrammar; using 
NSQLTranslationV1::TGenericToken; - using NSQLTranslationV1::TTokenRule; THashMap<EUnitKind, TString> NamesByUnitKind = [] { THashMap<EUnitKind, TString> names; @@ -51,20 +50,16 @@ namespace NSQLHighlight { patterns = unit.PatternsANSI.Get(); } + const auto& name = NamesByUnitKind.at(unit.Kind); + if (unit.Kind == EUnitKind::Comment && ansi) { Y_ENSURE(unit.Patterns.size() == 1); - const auto& pattern = unit.Patterns[0]; - grammar.emplace_back(TTokenRule{ - .TokenName = NamesByUnitKind.at(unit.Kind), - .Match = ANSICommentMatcher(Compile(pattern)), - }); + auto matcher = Compile(name, unit.Patterns[0]); + grammar.emplace_back(ANSICommentMatcher(name, std::move(matcher))); } for (const auto& pattern : *patterns) { - grammar.emplace_back(TTokenRule{ - .TokenName = NamesByUnitKind.at(unit.Kind), - .Match = Compile(pattern), - }); + grammar.emplace_back(Compile(name, pattern)); } } return grammar; diff --git a/yql/essentials/sql/v1/lexer/regex/generic.cpp b/yql/essentials/sql/v1/lexer/regex/generic.cpp index 2a451b4ef5c..83ad5b4155d 100644 --- a/yql/essentials/sql/v1/lexer/regex/generic.cpp +++ b/yql/essentials/sql/v1/lexer/regex/generic.cpp @@ -2,6 +2,8 @@ #include <contrib/libs/re2/re2/re2.h> +#include <util/string/builder.h> + namespace NSQLTranslationV1 { namespace { @@ -84,12 +86,9 @@ namespace NSQLTranslationV1 { } void Match(TStringBuf prefix, auto onMatch) const { - for (const auto& token : Grammar_) { - if (auto content = token.Match(prefix)) { - onMatch(TGenericToken{ - .Name = token.TokenName, - .Content = *content, - }); + for (const auto& matcher : Grammar_) { + if (auto token = matcher(prefix)) { + onMatch(std::move(*token)); } } } @@ -97,21 +96,52 @@ namespace NSQLTranslationV1 { TGenericLexerGrammar Grammar_; }; - TTokenMatcher Compile(const TRegexPattern& regex) { + TTokenMatcher Compile(TString name, const TRegexPattern& regex) { RE2::Options options; options.set_case_sensitive(!regex.IsCaseInsensitive); return [bodyRe = 
MakeAtomicShared<RE2>(regex.Body, options), - afterRe = MakeAtomicShared<RE2>(regex.After, options)](TStringBuf prefix) -> TMaybe<TStringBuf> { + afterRe = MakeAtomicShared<RE2>(regex.After, options), + name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> { TMaybe<TStringBuf> body, after; if ((body = Match(prefix, *bodyRe)) && (after = Match(prefix.Tail(body->size()), *afterRe))) { - return body; + return TGenericToken{ + .Name = name, + .Content = *body, + }; } return Nothing(); }; } + TRegexPattern Merged(TVector<TRegexPattern> patterns) { + Y_ENSURE(!patterns.empty()); + + const TRegexPattern& sample = patterns.back(); + Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) { + return std::tie(pattern.After, pattern.IsCaseInsensitive) == + std::tie(sample.After, sample.IsCaseInsensitive); + })); + + Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) { + return lhs.Body.length() > rhs.Body.length(); + }); + + TStringBuilder body; + for (const auto& pattern : patterns) { + body << "(" << pattern.Body << ")|"; + } + Y_ENSURE(body.back() == '|'); + body.pop_back(); + + return TRegexPattern{ + .Body = std::move(body), + .After = sample.After, + .IsCaseInsensitive = sample.IsCaseInsensitive, + }; + } + IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) { return IGenericLexer::TPtr(new TGenericLexer(std::move(grammar))); } diff --git a/yql/essentials/sql/v1/lexer/regex/generic.h b/yql/essentials/sql/v1/lexer/regex/generic.h index cde028cc599..efbac67315a 100644 --- a/yql/essentials/sql/v1/lexer/regex/generic.h +++ b/yql/essentials/sql/v1/lexer/regex/generic.h @@ -13,7 +13,7 @@ namespace NSQLTranslationV1 { struct TGenericToken { static constexpr const char* Error = "<ERROR>"; - TStringBuf Name; + TString Name; TStringBuf Content; size_t Begin = 0; // In bytes }; @@ -32,14 +32,9 @@ namespace NSQLTranslationV1 { size_t maxErrors = IGenericLexer::MaxErrorsLimit) const = 0; }; - using TTokenMatcher = 
std::function<TMaybe<TStringBuf>(TStringBuf prefix)>; + using TTokenMatcher = std::function<TMaybe<TGenericToken>(TStringBuf prefix)>; - struct TTokenRule { - TString TokenName; - TTokenMatcher Match; - }; - - using TGenericLexerGrammar = TVector<TTokenRule>; + using TGenericLexerGrammar = TVector<TTokenMatcher>; struct TRegexPattern { TString Body; @@ -47,7 +42,8 @@ namespace NSQLTranslationV1 { bool IsCaseInsensitive = false; }; - TTokenMatcher Compile(const TRegexPattern& regex); + TTokenMatcher Compile(TString name, const TRegexPattern& regex); + TRegexPattern Merged(TVector<TRegexPattern> patterns); IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar); diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp index 58c98edfd31..5d48c092716 100644 --- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp +++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp @@ -13,6 +13,7 @@ #include <util/generic/maybe.h> #include <util/string/subst.h> #include <util/string/ascii.h> +#include <util/string/join.h> namespace NSQLTranslationV1 { @@ -22,8 +23,8 @@ namespace NSQLTranslationV1 { size_t MatchANSIMultilineComment(TStringBuf remaining); - TTokenMatcher ANSICommentMatcher(TTokenMatcher defaultComment) { - return [defaultComment](TStringBuf prefix) -> TMaybe<TStringBuf> { + TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment) { + return [defaultComment, name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> { const auto basic = defaultComment(prefix); if (basic.Empty()) { return Nothing(); @@ -36,12 +37,15 @@ namespace NSQLTranslationV1 { size_t ll1Length = MatchANSIMultilineComment(prefix); TStringBuf ll1Content = prefix.SubString(0, ll1Length); - Y_ENSURE(ll1Content == 0 || basic <= ll1Content); + Y_ENSURE(ll1Content == 0 || basic->Content <= ll1Content); if (ll1Content == 0) { return basic; } - return ll1Content; + return TGenericToken{ + .Name = name, + .Content = ll1Content, + }; }; 
} @@ -89,38 +93,77 @@ namespace NSQLTranslationV1 { } } - TGenericLexerGrammar MakeGenericLexerGrammar( - bool ansi, - const TLexerGrammar& grammar, - const TVector<std::tuple<TString, TString>>& regexByOtherName) { - TGenericLexerGrammar generic; + TTokenMatcher KeywordMatcher(const NSQLReflect::TLexerGrammar& grammar) { + auto keyword = Compile("Keyword", KeywordPattern(grammar)); + return [keyword = std::move(keyword)](TStringBuf content) -> TMaybe<TGenericToken> { + if (auto token = keyword(content)) { + return TGenericToken{ + .Name = TLexerGrammar::KeywordNameByBlock(token->Content), + .Content = token->Content, + }; + } + return Nothing(); + }; + } - for (const auto& name : grammar.KeywordNames) { - auto matcher = Compile({ - .Body = TString(TLexerGrammar::KeywordBlock(name)), + TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar) { + TVector<TRegexPattern> patterns; + patterns.reserve(grammar.KeywordNames.size()); + for (const auto& keyword : grammar.KeywordNames) { + const TStringBuf content = TLexerGrammar::KeywordBlockByName(keyword); + patterns.push_back({ + .Body = TString(content), .IsCaseInsensitive = true, }); - generic.emplace_back(name, std::move(matcher)); } + return Merged(std::move(patterns)); + } + TTokenMatcher PuntuationMatcher(const NSQLReflect::TLexerGrammar& grammar) { + THashMap<TString, TString> nameByBlock; + nameByBlock.reserve(grammar.PunctuationNames.size()); for (const auto& name : grammar.PunctuationNames) { - generic.emplace_back( - name, Compile({RE2::QuoteMeta(grammar.BlockByName.at(name))})); + const auto& block = grammar.BlockByName.at(name); + nameByBlock[block] = name; } - for (const auto& [name, regex] : regexByOtherName) { - auto matcher = Compile({ - .Body = regex, - }); - generic.emplace_back(name, std::move(matcher)); + auto punct = Compile("Punctuation", PuntuationPattern(grammar)); + + return [nameByBlock = std::move(nameByBlock), + punct = std::move(punct)](TStringBuf content) -> 
TMaybe<TGenericToken> { + if (auto token = punct(content)) { + return TGenericToken{ + .Name = nameByBlock.at(token->Content), + .Content = token->Content, + }; + } + return Nothing(); + }; + } + + TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar) { + TVector<TRegexPattern> patterns; + patterns.reserve(grammar.PunctuationNames.size()); + for (const auto& name : grammar.PunctuationNames) { + patterns.push_back({RE2::QuoteMeta(grammar.BlockByName.at(name))}); } + return Merged(std::move(patterns)); + } - if (ansi) { - auto it = FindIf(generic, [](const auto& m) { - return m.TokenName == "COMMENT"; - }); - Y_ENSURE(it != std::end(generic)); - it->Match = ANSICommentMatcher(it->Match); + TGenericLexerGrammar MakeGenericLexerGrammar( + bool ansi, + const TLexerGrammar& grammar, + const TVector<std::tuple<TString, TString>>& regexByOtherName) { + TGenericLexerGrammar generic; + + generic.emplace_back(KeywordMatcher(grammar)); + generic.emplace_back(PuntuationMatcher(grammar)); + + for (const auto& [name, regex] : regexByOtherName) { + generic.emplace_back(Compile(name, {regex})); + if (name == "COMMENT" && ansi) { + generic.back() = ANSICommentMatcher(name, std::move(generic.back())); + } } return generic; diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.h b/yql/essentials/sql/v1/lexer/regex/lexer.h index 42d99a0a530..32c145c6484 100644 --- a/yql/essentials/sql/v1/lexer/regex/lexer.h +++ b/yql/essentials/sql/v1/lexer/regex/lexer.h @@ -3,10 +3,15 @@ #include "generic.h" #include <yql/essentials/parser/lexer_common/lexer.h> +#include <yql/essentials/sql/v1/reflect/sql_reflect.h> namespace NSQLTranslationV1 { - TTokenMatcher ANSICommentMatcher(TTokenMatcher defaultComment); + TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment); + + TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar); + + TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar); NSQLTranslation::TLexerFactoryPtr 
MakeRegexLexerFactory(bool ansi); diff --git a/yql/essentials/sql/v1/lexer/regex/ya.make b/yql/essentials/sql/v1/lexer/regex/ya.make index 3a7fe19b94c..ac2c2744d3d 100644 --- a/yql/essentials/sql/v1/lexer/regex/ya.make +++ b/yql/essentials/sql/v1/lexer/regex/ya.make @@ -2,6 +2,7 @@ LIBRARY() PEERDIR( contrib/libs/re2 + yql/essentials/public/issue yql/essentials/parser/lexer_common yql/essentials/sql/settings diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.cpp b/yql/essentials/sql/v1/reflect/sql_reflect.cpp index 262209cfc39..5e652db4fc0 100644 --- a/yql/essentials/sql/v1/reflect/sql_reflect.cpp +++ b/yql/essentials/sql/v1/reflect/sql_reflect.cpp @@ -1,9 +1,11 @@ #include "sql_reflect.h" #include <library/cpp/resource/resource.h> +#include <library/cpp/case_insensitive_string/case_insensitive_string.h> #include <util/string/split.h> #include <util/string/strip.h> +#include <util/charset/utf8.h> namespace NSQLReflect { @@ -15,13 +17,20 @@ namespace NSQLReflect { const TStringBuf SectionOther = "//! 
section:other"; const TStringBuf FragmentPrefix = "fragment "; - const TStringBuf TLexerGrammar::KeywordBlock(const TStringBuf name) { + const TStringBuf TLexerGrammar::KeywordBlockByName(const TStringBuf name Y_LIFETIME_BOUND) { if (name == "TSKIP") { return "SKIP"; } return name; } + const TString TLexerGrammar::KeywordNameByBlock(const TStringBuf block) { + if (TCaseInsensitiveStringBuf(block) == "SKIP") { + return "TSKIP"; + } + return ToUpperUTF8(block); + } + TVector<TString> GetResourceLines(const TStringBuf key) { TString text; Y_ENSURE(NResource::FindExact(key, &text)); @@ -133,7 +142,7 @@ namespace NSQLReflect { SubstGlobal(block, "'", ""); SubstGlobal(block, " ", ""); - Y_ENSURE(name == block || (name == "TSKIP" && block == TLexerGrammar::KeywordBlock("TSKIP"))); + Y_ENSURE(name == block || (name == "TSKIP" && block == TLexerGrammar::KeywordBlockByName("TSKIP"))); grammar.KeywordNames.emplace(std::move(name)); } diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.h b/yql/essentials/sql/v1/reflect/sql_reflect.h index 1f67a2f93a3..dec5ff98816 100644 --- a/yql/essentials/sql/v1/reflect/sql_reflect.h +++ b/yql/essentials/sql/v1/reflect/sql_reflect.h @@ -13,7 +13,8 @@ namespace NSQLReflect { TVector<TString> OtherNames; THashMap<TString, TString> BlockByName; - static const TStringBuf KeywordBlock(const TStringBuf name); + static const TStringBuf KeywordBlockByName(const TStringBuf name); + static const TString KeywordNameByBlock(const TStringBuf block); }; TLexerGrammar LoadLexerGrammar(); diff --git a/yql/essentials/sql/v1/reflect/ya.make b/yql/essentials/sql/v1/reflect/ya.make index 5865654c86e..1843aabf19b 100644 --- a/yql/essentials/sql/v1/reflect/ya.make +++ b/yql/essentials/sql/v1/reflect/ya.make @@ -4,6 +4,10 @@ SRCS( sql_reflect.cpp ) +PEERDIR( + library/cpp/case_insensitive_string +) + RESOURCE(DONT_PARSE yql/essentials/sql/v1/SQLv1Antlr4.g.in SQLv1Antlr4.g.in) END() |