diff options
author | vityaman <[email protected]> | 2025-05-19 11:17:12 +0300 |
---|---|---|
committer | robot-piglet <[email protected]> | 2025-05-19 11:31:23 +0300 |
commit | 50dbbb6a1e90cf9d1da40a92d563b02712b00b9e (patch) | |
tree | c9c2952f8521851540e08338d093f2067a68fdb4 /yql/essentials/sql/v1/lexer/regex/lexer.cpp | |
parent | 511e56c14b85e20b29e77f9da53d5bb29a3e996c (diff) |
YQL-19616: Fix TRegexLexer performance
Fix `TRegexLexer` performance. It is now only 2 times slower than the reference ANTLR implementation in Release mode, so merged regexes are 3 times faster than scan&compare.

---
- Related to `YQL-19616`
- Related to https://github.com/ydb-platform/ydb/issues/15129
- Related to https://github.com/vityaman/ydb/issues/42
---
Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1278
commit_hash:1529f641172fea13f0d33fbfd06a4827c6efde01
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex/lexer.cpp')
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/lexer.cpp | 95 |
1 files changed, 69 insertions, 26 deletions
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp index 58c98edfd31..5d48c092716 100644 --- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp +++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp @@ -13,6 +13,7 @@ #include <util/generic/maybe.h> #include <util/string/subst.h> #include <util/string/ascii.h> +#include <util/string/join.h> namespace NSQLTranslationV1 { @@ -22,8 +23,8 @@ namespace NSQLTranslationV1 { size_t MatchANSIMultilineComment(TStringBuf remaining); - TTokenMatcher ANSICommentMatcher(TTokenMatcher defaultComment) { - return [defaultComment](TStringBuf prefix) -> TMaybe<TStringBuf> { + TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment) { + return [defaultComment, name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> { const auto basic = defaultComment(prefix); if (basic.Empty()) { return Nothing(); @@ -36,12 +37,15 @@ namespace NSQLTranslationV1 { size_t ll1Length = MatchANSIMultilineComment(prefix); TStringBuf ll1Content = prefix.SubString(0, ll1Length); - Y_ENSURE(ll1Content == 0 || basic <= ll1Content); + Y_ENSURE(ll1Content == 0 || basic->Content <= ll1Content); if (ll1Content == 0) { return basic; } - return ll1Content; + return TGenericToken{ + .Name = name, + .Content = ll1Content, + }; }; } @@ -89,38 +93,77 @@ namespace NSQLTranslationV1 { } } - TGenericLexerGrammar MakeGenericLexerGrammar( - bool ansi, - const TLexerGrammar& grammar, - const TVector<std::tuple<TString, TString>>& regexByOtherName) { - TGenericLexerGrammar generic; + TTokenMatcher KeywordMatcher(const NSQLReflect::TLexerGrammar& grammar) { + auto keyword = Compile("Keyword", KeywordPattern(grammar)); + return [keyword = std::move(keyword)](TStringBuf content) -> TMaybe<TGenericToken> { + if (auto token = keyword(content)) { + return TGenericToken{ + .Name = TLexerGrammar::KeywordNameByBlock(token->Content), + .Content = token->Content, + }; + } + return Nothing(); + }; + } - for (const 
auto& name : grammar.KeywordNames) { - auto matcher = Compile({ - .Body = TString(TLexerGrammar::KeywordBlock(name)), + TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar) { + TVector<TRegexPattern> patterns; + patterns.reserve(grammar.KeywordNames.size()); + for (const auto& keyword : grammar.KeywordNames) { + const TStringBuf content = TLexerGrammar::KeywordBlockByName(keyword); + patterns.push_back({ + .Body = TString(content), .IsCaseInsensitive = true, }); - generic.emplace_back(name, std::move(matcher)); } + return Merged(std::move(patterns)); + } + TTokenMatcher PuntuationMatcher(const NSQLReflect::TLexerGrammar& grammar) { + THashMap<TString, TString> nameByBlock; + nameByBlock.reserve(grammar.PunctuationNames.size()); for (const auto& name : grammar.PunctuationNames) { - generic.emplace_back( - name, Compile({RE2::QuoteMeta(grammar.BlockByName.at(name))})); + const auto& block = grammar.BlockByName.at(name); + nameByBlock[block] = name; } - for (const auto& [name, regex] : regexByOtherName) { - auto matcher = Compile({ - .Body = regex, - }); - generic.emplace_back(name, std::move(matcher)); + auto punct = Compile("Punctuation", PuntuationPattern(grammar)); + + return [nameByBlock = std::move(nameByBlock), + punct = std::move(punct)](TStringBuf content) -> TMaybe<TGenericToken> { + if (auto token = punct(content)) { + return TGenericToken{ + .Name = nameByBlock.at(token->Content), + .Content = token->Content, + }; + } + return Nothing(); + }; + } + + TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar) { + TVector<TRegexPattern> patterns; + patterns.reserve(grammar.PunctuationNames.size()); + for (const auto& name : grammar.PunctuationNames) { + patterns.push_back({RE2::QuoteMeta(grammar.BlockByName.at(name))}); } + return Merged(std::move(patterns)); + } - if (ansi) { - auto it = FindIf(generic, [](const auto& m) { - return m.TokenName == "COMMENT"; - }); - Y_ENSURE(it != std::end(generic)); - it->Match = 
ANSICommentMatcher(it->Match); + TGenericLexerGrammar MakeGenericLexerGrammar( + bool ansi, + const TLexerGrammar& grammar, + const TVector<std::tuple<TString, TString>>& regexByOtherName) { + TGenericLexerGrammar generic; + + generic.emplace_back(KeywordMatcher(grammar)); + generic.emplace_back(PuntuationMatcher(grammar)); + + for (const auto& [name, regex] : regexByOtherName) { + generic.emplace_back(Compile(name, {regex})); + if (name == "COMMENT" && ansi) { + generic.back() = ANSICommentMatcher(name, std::move(generic.back())); + } } return generic; |