aboutsummaryrefslogtreecommitdiffstats
path: root/yql/essentials/sql/v1/lexer/regex/generic.h
diff options
context:
space:
mode:
authorvityaman <vityaman.dev@yandex.ru>2025-05-19 11:17:12 +0300
committerrobot-piglet <robot-piglet@yandex-team.com>2025-05-19 11:31:23 +0300
commit50dbbb6a1e90cf9d1da40a92d563b02712b00b9e (patch)
treec9c2952f8521851540e08338d093f2067a68fdb4 /yql/essentials/sql/v1/lexer/regex/generic.h
parent511e56c14b85e20b29e77f9da53d5bb29a3e996c (diff)
downloadydb-50dbbb6a1e90cf9d1da40a92d563b02712b00b9e.tar.gz
YQL-19616: Fix TRegexLexer performance
Fix `TRegexLexer` performance. It is now only 2 times slower than the reference ANTLR implementation in Release mode, so merged regexes are 3 times better than scan&compare. ![image](https://github.com/user-attachments/assets/4e0cb27a-491d-4dbd-b10a-5725ffa6d902) --- - Related to `YQL-19616` - Related to https://github.com/ydb-platform/ydb/issues/15129 - Related to https://github.com/vityaman/ydb/issues/42 --- Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1278 commit_hash:1529f641172fea13f0d33fbfd06a4827c6efde01
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex/generic.h')
-rw-r--r--yql/essentials/sql/v1/lexer/regex/generic.h14
1 files changed, 5 insertions, 9 deletions
diff --git a/yql/essentials/sql/v1/lexer/regex/generic.h b/yql/essentials/sql/v1/lexer/regex/generic.h
index cde028cc599..efbac67315a 100644
--- a/yql/essentials/sql/v1/lexer/regex/generic.h
+++ b/yql/essentials/sql/v1/lexer/regex/generic.h
@@ -13,7 +13,7 @@ namespace NSQLTranslationV1 {
struct TGenericToken {
static constexpr const char* Error = "<ERROR>";
- TStringBuf Name;
+ TString Name;
TStringBuf Content;
size_t Begin = 0; // In bytes
};
@@ -32,14 +32,9 @@ namespace NSQLTranslationV1 {
size_t maxErrors = IGenericLexer::MaxErrorsLimit) const = 0;
};
- using TTokenMatcher = std::function<TMaybe<TStringBuf>(TStringBuf prefix)>;
+ using TTokenMatcher = std::function<TMaybe<TGenericToken>(TStringBuf prefix)>;
- struct TTokenRule {
- TString TokenName;
- TTokenMatcher Match;
- };
-
- using TGenericLexerGrammar = TVector<TTokenRule>;
+ using TGenericLexerGrammar = TVector<TTokenMatcher>;
struct TRegexPattern {
TString Body;
@@ -47,7 +42,8 @@ namespace NSQLTranslationV1 {
bool IsCaseInsensitive = false;
};
- TTokenMatcher Compile(const TRegexPattern& regex);
+ TTokenMatcher Compile(TString name, const TRegexPattern& regex);
+ TRegexPattern Merged(TVector<TRegexPattern> patterns);
IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar);