summary | refs | log | tree | commit | diff | stats
path: root/yql/essentials/sql/v1/lexer/regex/lexer.cpp
diff options
context:
space:
mode:
author: vityaman <[email protected]> 2025-05-19 11:17:12 +0300
committer: robot-piglet <[email protected]> 2025-05-19 11:31:23 +0300
commit: 50dbbb6a1e90cf9d1da40a92d563b02712b00b9e (patch)
tree: c9c2952f8521851540e08338d093f2067a68fdb4 /yql/essentials/sql/v1/lexer/regex/lexer.cpp
parent: 511e56c14b85e20b29e77f9da53d5bb29a3e996c (diff)
YQL-19616: Fix TRegexLexer performance
Fix `TRegexLexer` performance. In Release mode it is now only 2 times slower than the reference ANTLR implementation, so merged regexes are 3 times faster than scan&compare.

![image](https://github.com/user-attachments/assets/4e0cb27a-491d-4dbd-b10a-5725ffa6d902)

---

- Related to `YQL-19616`
- Related to https://github.com/ydb-platform/ydb/issues/15129
- Related to https://github.com/vityaman/ydb/issues/42

---

Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1278
commit_hash:1529f641172fea13f0d33fbfd06a4827c6efde01
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex/lexer.cpp')
-rw-r--r-- yql/essentials/sql/v1/lexer/regex/lexer.cpp 95
1 files changed, 69 insertions, 26 deletions
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
index 58c98edfd31..5d48c092716 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
@@ -13,6 +13,7 @@
#include <util/generic/maybe.h>
#include <util/string/subst.h>
#include <util/string/ascii.h>
+#include <util/string/join.h>
namespace NSQLTranslationV1 {
@@ -22,8 +23,8 @@ namespace NSQLTranslationV1 {
size_t MatchANSIMultilineComment(TStringBuf remaining);
- TTokenMatcher ANSICommentMatcher(TTokenMatcher defaultComment) {
- return [defaultComment](TStringBuf prefix) -> TMaybe<TStringBuf> {
+ TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment) {
+ return [defaultComment, name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> {
const auto basic = defaultComment(prefix);
if (basic.Empty()) {
return Nothing();
@@ -36,12 +37,15 @@ namespace NSQLTranslationV1 {
size_t ll1Length = MatchANSIMultilineComment(prefix);
TStringBuf ll1Content = prefix.SubString(0, ll1Length);
- Y_ENSURE(ll1Content == 0 || basic <= ll1Content);
+ Y_ENSURE(ll1Content == 0 || basic->Content <= ll1Content);
if (ll1Content == 0) {
return basic;
}
- return ll1Content;
+ return TGenericToken{
+ .Name = name,
+ .Content = ll1Content,
+ };
};
}
@@ -89,38 +93,77 @@ namespace NSQLTranslationV1 {
}
}
- TGenericLexerGrammar MakeGenericLexerGrammar(
- bool ansi,
- const TLexerGrammar& grammar,
- const TVector<std::tuple<TString, TString>>& regexByOtherName) {
- TGenericLexerGrammar generic;
+ TTokenMatcher KeywordMatcher(const NSQLReflect::TLexerGrammar& grammar) {
+ auto keyword = Compile("Keyword", KeywordPattern(grammar));
+ return [keyword = std::move(keyword)](TStringBuf content) -> TMaybe<TGenericToken> {
+ if (auto token = keyword(content)) {
+ return TGenericToken{
+ .Name = TLexerGrammar::KeywordNameByBlock(token->Content),
+ .Content = token->Content,
+ };
+ }
+ return Nothing();
+ };
+ }
- for (const auto& name : grammar.KeywordNames) {
- auto matcher = Compile({
- .Body = TString(TLexerGrammar::KeywordBlock(name)),
+ TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar) {
+ TVector<TRegexPattern> patterns;
+ patterns.reserve(grammar.KeywordNames.size());
+ for (const auto& keyword : grammar.KeywordNames) {
+ const TStringBuf content = TLexerGrammar::KeywordBlockByName(keyword);
+ patterns.push_back({
+ .Body = TString(content),
.IsCaseInsensitive = true,
});
- generic.emplace_back(name, std::move(matcher));
}
+ return Merged(std::move(patterns));
+ }
+ TTokenMatcher PuntuationMatcher(const NSQLReflect::TLexerGrammar& grammar) {
+ THashMap<TString, TString> nameByBlock;
+ nameByBlock.reserve(grammar.PunctuationNames.size());
for (const auto& name : grammar.PunctuationNames) {
- generic.emplace_back(
- name, Compile({RE2::QuoteMeta(grammar.BlockByName.at(name))}));
+ const auto& block = grammar.BlockByName.at(name);
+ nameByBlock[block] = name;
}
- for (const auto& [name, regex] : regexByOtherName) {
- auto matcher = Compile({
- .Body = regex,
- });
- generic.emplace_back(name, std::move(matcher));
+ auto punct = Compile("Punctuation", PuntuationPattern(grammar));
+
+ return [nameByBlock = std::move(nameByBlock),
+ punct = std::move(punct)](TStringBuf content) -> TMaybe<TGenericToken> {
+ if (auto token = punct(content)) {
+ return TGenericToken{
+ .Name = nameByBlock.at(token->Content),
+ .Content = token->Content,
+ };
+ }
+ return Nothing();
+ };
+ }
+
+ TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar) {
+ TVector<TRegexPattern> patterns;
+ patterns.reserve(grammar.PunctuationNames.size());
+ for (const auto& name : grammar.PunctuationNames) {
+ patterns.push_back({RE2::QuoteMeta(grammar.BlockByName.at(name))});
}
+ return Merged(std::move(patterns));
+ }
- if (ansi) {
- auto it = FindIf(generic, [](const auto& m) {
- return m.TokenName == "COMMENT";
- });
- Y_ENSURE(it != std::end(generic));
- it->Match = ANSICommentMatcher(it->Match);
+ TGenericLexerGrammar MakeGenericLexerGrammar(
+ bool ansi,
+ const TLexerGrammar& grammar,
+ const TVector<std::tuple<TString, TString>>& regexByOtherName) {
+ TGenericLexerGrammar generic;
+
+ generic.emplace_back(KeywordMatcher(grammar));
+ generic.emplace_back(PuntuationMatcher(grammar));
+
+ for (const auto& [name, regex] : regexByOtherName) {
+ generic.emplace_back(Compile(name, {regex}));
+ if (name == "COMMENT" && ansi) {
+ generic.back() = ANSICommentMatcher(name, std::move(generic.back()));
+ }
}
return generic;