aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author vityaman <vityaman.dev@yandex.ru> 2025-05-19 11:17:12 +0300
committer robot-piglet <robot-piglet@yandex-team.com> 2025-05-19 11:31:23 +0300
commit 50dbbb6a1e90cf9d1da40a92d563b02712b00b9e (patch)
tree c9c2952f8521851540e08338d093f2067a68fdb4
parent 511e56c14b85e20b29e77f9da53d5bb29a3e996c (diff)
download ydb-50dbbb6a1e90cf9d1da40a92d563b02712b00b9e.tar.gz
YQL-19616: Fix TRegexLexer performance
Fix `TRegexLexer` performance. It is now just 2 times slower than the reference ANTLR implementation in Release mode, so merged regexes are 3 times better than scan&compare.

![image](https://github.com/user-attachments/assets/4e0cb27a-491d-4dbd-b10a-5725ffa6d902)

---

- Related to `YQL-19616`
- Related to https://github.com/ydb-platform/ydb/issues/15129
- Related to https://github.com/vityaman/ydb/issues/42

---

Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1278
commit_hash:1529f641172fea13f0d33fbfd06a4827c6efde01
-rw-r--r-- yql/essentials/sql/v1/highlight/sql_highlight.cpp 30
-rw-r--r-- yql/essentials/sql/v1/highlight/sql_highlighter.cpp 15
-rw-r--r-- yql/essentials/sql/v1/lexer/regex/generic.cpp 48
-rw-r--r-- yql/essentials/sql/v1/lexer/regex/generic.h 14
-rw-r--r-- yql/essentials/sql/v1/lexer/regex/lexer.cpp 95
-rw-r--r-- yql/essentials/sql/v1/lexer/regex/lexer.h 7
-rw-r--r-- yql/essentials/sql/v1/lexer/regex/ya.make 1
-rw-r--r-- yql/essentials/sql/v1/reflect/sql_reflect.cpp 13
-rw-r--r-- yql/essentials/sql/v1/reflect/sql_reflect.h 3
-rw-r--r-- yql/essentials/sql/v1/reflect/ya.make 4
10 files changed, 144 insertions, 86 deletions
diff --git a/yql/essentials/sql/v1/highlight/sql_highlight.cpp b/yql/essentials/sql/v1/highlight/sql_highlight.cpp
index a477ba542f8..e35bb5fb736 100644
--- a/yql/essentials/sql/v1/highlight/sql_highlight.cpp
+++ b/yql/essentials/sql/v1/highlight/sql_highlight.cpp
@@ -12,35 +12,9 @@
namespace NSQLHighlight {
+ using NSQLTranslationV1::Merged;
using NSQLTranslationV1::TRegexPattern;
- TRegexPattern Merged(TVector<TRegexPattern> patterns) {
- Y_ENSURE(!patterns.empty());
-
- const TRegexPattern& sample = patterns.back();
- Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) {
- return std::tie(pattern.After, pattern.IsCaseInsensitive) ==
- std::tie(sample.After, sample.IsCaseInsensitive);
- }));
-
- Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) {
- return lhs.Body.length() > rhs.Body.length();
- });
-
- TStringBuilder body;
- for (const auto& pattern : patterns) {
- body << "(" << pattern.Body << ")|";
- }
- Y_ENSURE(body.back() == '|');
- body.pop_back();
-
- return TRegexPattern{
- .Body = std::move(body),
- .After = sample.After,
- .IsCaseInsensitive = sample.IsCaseInsensitive,
- };
- }
-
struct Syntax {
const NSQLReflect::TLexerGrammar* Grammar;
THashMap<TString, TString> RegexesDefault;
@@ -81,7 +55,7 @@ namespace NSQLHighlight {
TUnit unit = {.Kind = EUnitKind::Keyword};
for (const auto& keyword : s.Grammar->KeywordNames) {
- const TStringBuf content = TLexerGrammar::KeywordBlock(keyword);
+ const TStringBuf content = TLexerGrammar::KeywordBlockByName(keyword);
unit.Patterns.push_back(CaseInsensitive(content));
}
diff --git a/yql/essentials/sql/v1/highlight/sql_highlighter.cpp b/yql/essentials/sql/v1/highlight/sql_highlighter.cpp
index 23d17277e49..54513b1117f 100644
--- a/yql/essentials/sql/v1/highlight/sql_highlighter.cpp
+++ b/yql/essentials/sql/v1/highlight/sql_highlighter.cpp
@@ -13,7 +13,6 @@ namespace NSQLHighlight {
using NSQLTranslationV1::IGenericLexer;
using NSQLTranslationV1::TGenericLexerGrammar;
using NSQLTranslationV1::TGenericToken;
- using NSQLTranslationV1::TTokenRule;
THashMap<EUnitKind, TString> NamesByUnitKind = [] {
THashMap<EUnitKind, TString> names;
@@ -51,20 +50,16 @@ namespace NSQLHighlight {
patterns = unit.PatternsANSI.Get();
}
+ const auto& name = NamesByUnitKind.at(unit.Kind);
+
if (unit.Kind == EUnitKind::Comment && ansi) {
Y_ENSURE(unit.Patterns.size() == 1);
- const auto& pattern = unit.Patterns[0];
- grammar.emplace_back(TTokenRule{
- .TokenName = NamesByUnitKind.at(unit.Kind),
- .Match = ANSICommentMatcher(Compile(pattern)),
- });
+ auto matcher = Compile(name, unit.Patterns[0]);
+ grammar.emplace_back(ANSICommentMatcher(name, std::move(matcher)));
}
for (const auto& pattern : *patterns) {
- grammar.emplace_back(TTokenRule{
- .TokenName = NamesByUnitKind.at(unit.Kind),
- .Match = Compile(pattern),
- });
+ grammar.emplace_back(Compile(name, pattern));
}
}
return grammar;
diff --git a/yql/essentials/sql/v1/lexer/regex/generic.cpp b/yql/essentials/sql/v1/lexer/regex/generic.cpp
index 2a451b4ef5c..83ad5b4155d 100644
--- a/yql/essentials/sql/v1/lexer/regex/generic.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/generic.cpp
@@ -2,6 +2,8 @@
#include <contrib/libs/re2/re2/re2.h>
+#include <util/string/builder.h>
+
namespace NSQLTranslationV1 {
namespace {
@@ -84,12 +86,9 @@ namespace NSQLTranslationV1 {
}
void Match(TStringBuf prefix, auto onMatch) const {
- for (const auto& token : Grammar_) {
- if (auto content = token.Match(prefix)) {
- onMatch(TGenericToken{
- .Name = token.TokenName,
- .Content = *content,
- });
+ for (const auto& matcher : Grammar_) {
+ if (auto token = matcher(prefix)) {
+ onMatch(std::move(*token));
}
}
}
@@ -97,21 +96,52 @@ namespace NSQLTranslationV1 {
TGenericLexerGrammar Grammar_;
};
- TTokenMatcher Compile(const TRegexPattern& regex) {
+ TTokenMatcher Compile(TString name, const TRegexPattern& regex) {
RE2::Options options;
options.set_case_sensitive(!regex.IsCaseInsensitive);
return [bodyRe = MakeAtomicShared<RE2>(regex.Body, options),
- afterRe = MakeAtomicShared<RE2>(regex.After, options)](TStringBuf prefix) -> TMaybe<TStringBuf> {
+ afterRe = MakeAtomicShared<RE2>(regex.After, options),
+ name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> {
TMaybe<TStringBuf> body, after;
if ((body = Match(prefix, *bodyRe)) &&
(after = Match(prefix.Tail(body->size()), *afterRe))) {
- return body;
+ return TGenericToken{
+ .Name = name,
+ .Content = *body,
+ };
}
return Nothing();
};
}
+ TRegexPattern Merged(TVector<TRegexPattern> patterns) {
+ Y_ENSURE(!patterns.empty());
+
+ const TRegexPattern& sample = patterns.back();
+ Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) {
+ return std::tie(pattern.After, pattern.IsCaseInsensitive) ==
+ std::tie(sample.After, sample.IsCaseInsensitive);
+ }));
+
+ Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) {
+ return lhs.Body.length() > rhs.Body.length();
+ });
+
+ TStringBuilder body;
+ for (const auto& pattern : patterns) {
+ body << "(" << pattern.Body << ")|";
+ }
+ Y_ENSURE(body.back() == '|');
+ body.pop_back();
+
+ return TRegexPattern{
+ .Body = std::move(body),
+ .After = sample.After,
+ .IsCaseInsensitive = sample.IsCaseInsensitive,
+ };
+ }
+
IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) {
return IGenericLexer::TPtr(new TGenericLexer(std::move(grammar)));
}
diff --git a/yql/essentials/sql/v1/lexer/regex/generic.h b/yql/essentials/sql/v1/lexer/regex/generic.h
index cde028cc599..efbac67315a 100644
--- a/yql/essentials/sql/v1/lexer/regex/generic.h
+++ b/yql/essentials/sql/v1/lexer/regex/generic.h
@@ -13,7 +13,7 @@ namespace NSQLTranslationV1 {
struct TGenericToken {
static constexpr const char* Error = "<ERROR>";
- TStringBuf Name;
+ TString Name;
TStringBuf Content;
size_t Begin = 0; // In bytes
};
@@ -32,14 +32,9 @@ namespace NSQLTranslationV1 {
size_t maxErrors = IGenericLexer::MaxErrorsLimit) const = 0;
};
- using TTokenMatcher = std::function<TMaybe<TStringBuf>(TStringBuf prefix)>;
+ using TTokenMatcher = std::function<TMaybe<TGenericToken>(TStringBuf prefix)>;
- struct TTokenRule {
- TString TokenName;
- TTokenMatcher Match;
- };
-
- using TGenericLexerGrammar = TVector<TTokenRule>;
+ using TGenericLexerGrammar = TVector<TTokenMatcher>;
struct TRegexPattern {
TString Body;
@@ -47,7 +42,8 @@ namespace NSQLTranslationV1 {
bool IsCaseInsensitive = false;
};
- TTokenMatcher Compile(const TRegexPattern& regex);
+ TTokenMatcher Compile(TString name, const TRegexPattern& regex);
+ TRegexPattern Merged(TVector<TRegexPattern> patterns);
IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar);
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
index 58c98edfd31..5d48c092716 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
@@ -13,6 +13,7 @@
#include <util/generic/maybe.h>
#include <util/string/subst.h>
#include <util/string/ascii.h>
+#include <util/string/join.h>
namespace NSQLTranslationV1 {
@@ -22,8 +23,8 @@ namespace NSQLTranslationV1 {
size_t MatchANSIMultilineComment(TStringBuf remaining);
- TTokenMatcher ANSICommentMatcher(TTokenMatcher defaultComment) {
- return [defaultComment](TStringBuf prefix) -> TMaybe<TStringBuf> {
+ TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment) {
+ return [defaultComment, name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> {
const auto basic = defaultComment(prefix);
if (basic.Empty()) {
return Nothing();
@@ -36,12 +37,15 @@ namespace NSQLTranslationV1 {
size_t ll1Length = MatchANSIMultilineComment(prefix);
TStringBuf ll1Content = prefix.SubString(0, ll1Length);
- Y_ENSURE(ll1Content == 0 || basic <= ll1Content);
+ Y_ENSURE(ll1Content == 0 || basic->Content <= ll1Content);
if (ll1Content == 0) {
return basic;
}
- return ll1Content;
+ return TGenericToken{
+ .Name = name,
+ .Content = ll1Content,
+ };
};
}
@@ -89,38 +93,77 @@ namespace NSQLTranslationV1 {
}
}
- TGenericLexerGrammar MakeGenericLexerGrammar(
- bool ansi,
- const TLexerGrammar& grammar,
- const TVector<std::tuple<TString, TString>>& regexByOtherName) {
- TGenericLexerGrammar generic;
+ TTokenMatcher KeywordMatcher(const NSQLReflect::TLexerGrammar& grammar) {
+ auto keyword = Compile("Keyword", KeywordPattern(grammar));
+ return [keyword = std::move(keyword)](TStringBuf content) -> TMaybe<TGenericToken> {
+ if (auto token = keyword(content)) {
+ return TGenericToken{
+ .Name = TLexerGrammar::KeywordNameByBlock(token->Content),
+ .Content = token->Content,
+ };
+ }
+ return Nothing();
+ };
+ }
- for (const auto& name : grammar.KeywordNames) {
- auto matcher = Compile({
- .Body = TString(TLexerGrammar::KeywordBlock(name)),
+ TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar) {
+ TVector<TRegexPattern> patterns;
+ patterns.reserve(grammar.KeywordNames.size());
+ for (const auto& keyword : grammar.KeywordNames) {
+ const TStringBuf content = TLexerGrammar::KeywordBlockByName(keyword);
+ patterns.push_back({
+ .Body = TString(content),
.IsCaseInsensitive = true,
});
- generic.emplace_back(name, std::move(matcher));
}
+ return Merged(std::move(patterns));
+ }
+ TTokenMatcher PuntuationMatcher(const NSQLReflect::TLexerGrammar& grammar) {
+ THashMap<TString, TString> nameByBlock;
+ nameByBlock.reserve(grammar.PunctuationNames.size());
for (const auto& name : grammar.PunctuationNames) {
- generic.emplace_back(
- name, Compile({RE2::QuoteMeta(grammar.BlockByName.at(name))}));
+ const auto& block = grammar.BlockByName.at(name);
+ nameByBlock[block] = name;
}
- for (const auto& [name, regex] : regexByOtherName) {
- auto matcher = Compile({
- .Body = regex,
- });
- generic.emplace_back(name, std::move(matcher));
+ auto punct = Compile("Punctuation", PuntuationPattern(grammar));
+
+ return [nameByBlock = std::move(nameByBlock),
+ punct = std::move(punct)](TStringBuf content) -> TMaybe<TGenericToken> {
+ if (auto token = punct(content)) {
+ return TGenericToken{
+ .Name = nameByBlock.at(token->Content),
+ .Content = token->Content,
+ };
+ }
+ return Nothing();
+ };
+ }
+
+ TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar) {
+ TVector<TRegexPattern> patterns;
+ patterns.reserve(grammar.PunctuationNames.size());
+ for (const auto& name : grammar.PunctuationNames) {
+ patterns.push_back({RE2::QuoteMeta(grammar.BlockByName.at(name))});
}
+ return Merged(std::move(patterns));
+ }
- if (ansi) {
- auto it = FindIf(generic, [](const auto& m) {
- return m.TokenName == "COMMENT";
- });
- Y_ENSURE(it != std::end(generic));
- it->Match = ANSICommentMatcher(it->Match);
+ TGenericLexerGrammar MakeGenericLexerGrammar(
+ bool ansi,
+ const TLexerGrammar& grammar,
+ const TVector<std::tuple<TString, TString>>& regexByOtherName) {
+ TGenericLexerGrammar generic;
+
+ generic.emplace_back(KeywordMatcher(grammar));
+ generic.emplace_back(PuntuationMatcher(grammar));
+
+ for (const auto& [name, regex] : regexByOtherName) {
+ generic.emplace_back(Compile(name, {regex}));
+ if (name == "COMMENT" && ansi) {
+ generic.back() = ANSICommentMatcher(name, std::move(generic.back()));
+ }
}
return generic;
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.h b/yql/essentials/sql/v1/lexer/regex/lexer.h
index 42d99a0a530..32c145c6484 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.h
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.h
@@ -3,10 +3,15 @@
#include "generic.h"
#include <yql/essentials/parser/lexer_common/lexer.h>
+#include <yql/essentials/sql/v1/reflect/sql_reflect.h>
namespace NSQLTranslationV1 {
- TTokenMatcher ANSICommentMatcher(TTokenMatcher defaultComment);
+ TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment);
+
+ TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar);
+
+ TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar);
NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi);
diff --git a/yql/essentials/sql/v1/lexer/regex/ya.make b/yql/essentials/sql/v1/lexer/regex/ya.make
index 3a7fe19b94c..ac2c2744d3d 100644
--- a/yql/essentials/sql/v1/lexer/regex/ya.make
+++ b/yql/essentials/sql/v1/lexer/regex/ya.make
@@ -2,6 +2,7 @@ LIBRARY()
PEERDIR(
contrib/libs/re2
+
yql/essentials/public/issue
yql/essentials/parser/lexer_common
yql/essentials/sql/settings
diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.cpp b/yql/essentials/sql/v1/reflect/sql_reflect.cpp
index 262209cfc39..5e652db4fc0 100644
--- a/yql/essentials/sql/v1/reflect/sql_reflect.cpp
+++ b/yql/essentials/sql/v1/reflect/sql_reflect.cpp
@@ -1,9 +1,11 @@
#include "sql_reflect.h"
#include <library/cpp/resource/resource.h>
+#include <library/cpp/case_insensitive_string/case_insensitive_string.h>
#include <util/string/split.h>
#include <util/string/strip.h>
+#include <util/charset/utf8.h>
namespace NSQLReflect {
@@ -15,13 +17,20 @@ namespace NSQLReflect {
const TStringBuf SectionOther = "//! section:other";
const TStringBuf FragmentPrefix = "fragment ";
- const TStringBuf TLexerGrammar::KeywordBlock(const TStringBuf name) {
+ const TStringBuf TLexerGrammar::KeywordBlockByName(const TStringBuf name Y_LIFETIME_BOUND) {
if (name == "TSKIP") {
return "SKIP";
}
return name;
}
+ const TString TLexerGrammar::KeywordNameByBlock(const TStringBuf block) {
+ if (TCaseInsensitiveStringBuf(block) == "SKIP") {
+ return "TSKIP";
+ }
+ return ToUpperUTF8(block);
+ }
+
TVector<TString> GetResourceLines(const TStringBuf key) {
TString text;
Y_ENSURE(NResource::FindExact(key, &text));
@@ -133,7 +142,7 @@ namespace NSQLReflect {
SubstGlobal(block, "'", "");
SubstGlobal(block, " ", "");
- Y_ENSURE(name == block || (name == "TSKIP" && block == TLexerGrammar::KeywordBlock("TSKIP")));
+ Y_ENSURE(name == block || (name == "TSKIP" && block == TLexerGrammar::KeywordBlockByName("TSKIP")));
grammar.KeywordNames.emplace(std::move(name));
}
diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.h b/yql/essentials/sql/v1/reflect/sql_reflect.h
index 1f67a2f93a3..dec5ff98816 100644
--- a/yql/essentials/sql/v1/reflect/sql_reflect.h
+++ b/yql/essentials/sql/v1/reflect/sql_reflect.h
@@ -13,7 +13,8 @@ namespace NSQLReflect {
TVector<TString> OtherNames;
THashMap<TString, TString> BlockByName;
- static const TStringBuf KeywordBlock(const TStringBuf name);
+ static const TStringBuf KeywordBlockByName(const TStringBuf name);
+ static const TString KeywordNameByBlock(const TStringBuf block);
};
TLexerGrammar LoadLexerGrammar();
diff --git a/yql/essentials/sql/v1/reflect/ya.make b/yql/essentials/sql/v1/reflect/ya.make
index 5865654c86e..1843aabf19b 100644
--- a/yql/essentials/sql/v1/reflect/ya.make
+++ b/yql/essentials/sql/v1/reflect/ya.make
@@ -4,6 +4,10 @@ SRCS(
sql_reflect.cpp
)
+PEERDIR(
+ library/cpp/case_insensitive_string
+)
+
RESOURCE(DONT_PARSE yql/essentials/sql/v1/SQLv1Antlr4.g.in SQLv1Antlr4.g.in)
END()