summaryrefslogtreecommitdiffstats
path: root/yql/essentials/sql
diff options
context:
space:
mode:
authorvityaman <[email protected]>2025-03-28 18:29:24 +0300
committerrobot-piglet <[email protected]>2025-03-28 18:50:04 +0300
commit60b99f11bcb2386c2a1c36ffd2e96e69d0105dac (patch)
tree08c15d732484c6accf16658b09ed8f07286a9338 /yql/essentials/sql
parent1e214be59cbf130bee433c422b42f16148e5acff (diff)
YQL-19616 Convert YQL lexer grammar to regexes
- [x] Parse YQL grammar to extract lexer grammar into `TLexerGrammar`. - [x] Translate `TLexerGrammar` into regexes. - [x] Implement a lexer via regexes `TRegexLexer` to test generated regexes validity. - [x] Test on `Default` syntax mode. - [x] Test on `ANSI` syntax mode. --- - Related to https://github.com/ydb-platform/ydb/issues/15129 - Requirement for https://github.com/ytsaurus/ytsaurus/pull/1112 --- Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1127 commit_hash:03ffffe81cdafe7f93a4d3fd9a3212fe67f1c72d
Diffstat (limited to 'yql/essentials/sql')
-rw-r--r--yql/essentials/sql/v1/SQLv1Antlr4.g.in33
-rw-r--r--yql/essentials/sql/v1/complete/sql_complete.cpp4
-rw-r--r--yql/essentials/sql/v1/complete/sql_complete_ut.cpp4
-rw-r--r--yql/essentials/sql/v1/lexer/lexer.cpp95
-rw-r--r--yql/essentials/sql/v1/lexer/lexer.h11
-rw-r--r--yql/essentials/sql/v1/lexer/lexer_ut.cpp46
-rw-r--r--yql/essentials/sql/v1/lexer/regex/lexer.cpp252
-rw-r--r--yql/essentials/sql/v1/lexer/regex/lexer.h9
-rw-r--r--yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp219
-rw-r--r--yql/essentials/sql/v1/lexer/regex/regex.cpp240
-rw-r--r--yql/essentials/sql/v1/lexer/regex/regex.h14
-rw-r--r--yql/essentials/sql/v1/lexer/regex/regex_ut.cpp90
-rw-r--r--yql/essentials/sql/v1/lexer/regex/ut/ya.make13
-rw-r--r--yql/essentials/sql/v1/lexer/regex/ya.make39
-rw-r--r--yql/essentials/sql/v1/lexer/ut/ya.make1
-rw-r--r--yql/essentials/sql/v1/reflect/sql_reflect.cpp173
-rw-r--r--yql/essentials/sql/v1/reflect/sql_reflect.h18
-rw-r--r--yql/essentials/sql/v1/reflect/sql_reflect_ut.cpp46
-rw-r--r--yql/essentials/sql/v1/reflect/ut/ya.make7
-rw-r--r--yql/essentials/sql/v1/reflect/ya.make13
20 files changed, 1263 insertions, 64 deletions
diff --git a/yql/essentials/sql/v1/SQLv1Antlr4.g.in b/yql/essentials/sql/v1/SQLv1Antlr4.g.in
index fb92a68f9ab..5c59ab61ea4 100644
--- a/yql/essentials/sql/v1/SQLv1Antlr4.g.in
+++ b/yql/essentials/sql/v1/SQLv1Antlr4.g.in
@@ -1775,9 +1775,7 @@ bool_value: (TRUE | FALSE);
real: REAL;
integer: DIGITS | INTEGER_VALUE;
-//
-// Lexer
-//
+//! section:punctuation
EQUALS: '=';
EQUALS2: '==';
@@ -1823,6 +1821,8 @@ fragment QUOTE_SINGLE: '\'';
fragment BACKTICK: '`';
fragment DOUBLE_COMMAT: '@@';
+//! section:letter
+
// http://www.antlr.org/wiki/pages/viewpage.action?pageId=1782
fragment A:('a'|'A');
fragment B:('b'|'B');
@@ -1851,6 +1851,8 @@ fragment X:('x'|'X');
fragment Y:('y'|'Y');
fragment Z:('z'|'Z');
+//! section:keyword
+
ABORT: A B O R T;
ACTION: A C T I O N;
ADD: A D D;
@@ -2144,13 +2146,7 @@ WRAPPER: W R A P P E R;
//WRITE: W R I T E;
XOR: X O R;
-// YQL Default Lexer:
-// GRAMMAR_STRING_CORE_SINGLE = ~(QUOTE_SINGLE | BACKSLASH) | (BACKSLASH .)
-// GRAMMAR_STRING_CORE_DOUBLE = ~(QUOTE_DOUBLE | BACKSLASH) | (BACKSLASH .)
-
-// ANSI Lexer:
-// GRAMMAR_STRING_CORE_SINGLE = ~QUOTE_SINGLE | (QUOTE_SINGLE QUOTE_SINGLE)
-// GRAMMAR_STRING_CORE_DOUBLE = ~QUOTE_DOUBLE | (QUOTE_DOUBLE QUOTE_DOUBLE)
+//! section:other
fragment STRING_CORE_SINGLE: @GRAMMAR_STRING_CORE_SINGLE@;
fragment STRING_CORE_DOUBLE: @GRAMMAR_STRING_CORE_DOUBLE@;
@@ -2163,7 +2159,7 @@ STRING_VALUE: ((STRING_SINGLE | STRING_DOUBLE | STRING_MULTILINE) (S | U | Y | J
ID_PLAIN: ('a'..'z' | 'A'..'Z' | '_') ('a'..'z' | 'A'..'Z' | '_' | DIGIT)*;
-fragment ID_QUOTED_CORE: '\\'. | '``' | ~('`' | '\\');
+fragment ID_QUOTED_CORE: '\\' . | '``' | ~('`' | '\\');
ID_QUOTED: BACKTICK ID_QUOTED_CORE* BACKTICK;
fragment DIGIT: '0'..'9';
@@ -2177,23 +2173,18 @@ DIGITS: DECDIGITS | HEXDIGITS | OCTDIGITS | BINDIGITS;
// not all combinations of P/U with L/S/T/I/B/N are actually valid - this is resolved in sql.cpp
INTEGER_VALUE: DIGITS ((P | U)? (L | S | T | I | B | N)?);
-fragment FLOAT_EXP : E (PLUS | MINUS)? DECDIGITS ;
+fragment FLOAT_EXP: E (PLUS | MINUS)? DECDIGITS;
REAL:
(
DECDIGITS DOT DIGIT* FLOAT_EXP?
| DECDIGITS FLOAT_EXP
// | DOT DECDIGITS FLOAT_EXP? // Conflicts with tuple element access through DOT
- ) (F | P (F ('4'|'8') | N)?)?
+ ) (F | P (F ('4' | '8') | N)?)?
;
BLOB: X QUOTE_SINGLE HEXDIGIT+ QUOTE_SINGLE;
-// YQL Default Lexer:
-// GRAMMAR_MULTILINE_COMMENT_CORE = .
-// ANSI Lexer:
-// GRAMMAR_MULTILINE_COMMENT_CORE = MULTILINE_COMMENT | .
-
fragment MULTILINE_COMMENT: '/*' ( @GRAMMAR_MULTILINE_COMMENT_CORE@ )*? '*/';
-fragment LINE_COMMENT: '--' ~('\n'|'\r')* ('\r' '\n'? | '\n' | EOF);
-WS: (' '|'\r'|'\t'|'\u000C'|'\n')->channel(HIDDEN);
-COMMENT: (MULTILINE_COMMENT|LINE_COMMENT)->channel(HIDDEN);
+fragment LINE_COMMENT: '--' ~('\n' | '\r')* ('\r' '\n'? | '\n' | EOF);
+WS: (' ' | '\r' | '\t' | '\u000C' | '\n') -> channel(HIDDEN);
+COMMENT: (MULTILINE_COMMENT | LINE_COMMENT) -> channel(HIDDEN);
diff --git a/yql/essentials/sql/v1/complete/sql_complete.cpp b/yql/essentials/sql/v1/complete/sql_complete.cpp
index 53cb4ada420..753d0a2835c 100644
--- a/yql/essentials/sql/v1/complete/sql_complete.cpp
+++ b/yql/essentials/sql/v1/complete/sql_complete.cpp
@@ -125,7 +125,9 @@ namespace NSQLComplete {
INameService::TPtr names = MakeStaticNameService(MakeDefaultNameSet());
return MakeSqlCompletionEngine([lexers = std::move(lexers)](bool ansi) {
- return NSQLTranslationV1::MakeLexer(lexers, ansi, /* antlr4 = */ true, /* pure = */ true);
+ return NSQLTranslationV1::MakeLexer(
+ lexers, ansi, /* antlr4 = */ true,
+ NSQLTranslationV1::ELexerFlavor::Pure);
}, std::move(names));
}
diff --git a/yql/essentials/sql/v1/complete/sql_complete_ut.cpp b/yql/essentials/sql/v1/complete/sql_complete_ut.cpp
index 4fb6dfea587..aa242d313cb 100644
--- a/yql/essentials/sql/v1/complete/sql_complete_ut.cpp
+++ b/yql/essentials/sql/v1/complete/sql_complete_ut.cpp
@@ -43,7 +43,9 @@ Y_UNIT_TEST_SUITE(SqlCompleteTests) {
lexers.Antlr4Pure = NSQLTranslationV1::MakeAntlr4PureLexerFactory();
lexers.Antlr4PureAnsi = NSQLTranslationV1::MakeAntlr4PureAnsiLexerFactory();
return [lexers = std::move(lexers)](bool ansi) {
- return NSQLTranslationV1::MakeLexer(lexers, ansi, /* antlr4 = */ true, /* pure = */ true);
+ return NSQLTranslationV1::MakeLexer(
+ lexers, ansi, /* antlr4 = */ true,
+ NSQLTranslationV1::ELexerFlavor::Pure);
};
}
diff --git a/yql/essentials/sql/v1/lexer/lexer.cpp b/yql/essentials/sql/v1/lexer/lexer.cpp
index 5621cc65d7b..88ced55ccf4 100644
--- a/yql/essentials/sql/v1/lexer/lexer.cpp
+++ b/yql/essentials/sql/v1/lexer/lexer.cpp
@@ -11,6 +11,7 @@
#include <util/string/ascii.h>
#include <util/string/builder.h>
#include <util/string/strip.h>
+#include <util/string/join.h>
#if defined(_tsan_enabled_)
#include <util/system/mutex.h>
@@ -29,8 +30,8 @@ using NSQLTranslation::MakeDummyLexerFactory;
class TV1Lexer : public ILexer {
public:
- explicit TV1Lexer(const TLexers& lexers, bool ansi, bool antlr4, bool pure)
- : Factory(GetFactory(lexers, ansi, antlr4, pure))
+ explicit TV1Lexer(const TLexers& lexers, bool ansi, bool antlr4, ELexerFlavor flavor)
+ : Factory(GetFactory(lexers, ansi, antlr4, flavor))
{
}
@@ -42,52 +43,70 @@ public:
}
private:
- static NSQLTranslation::TLexerFactoryPtr GetFactory(const TLexers& lexers, bool ansi, bool antlr4, bool pure = false) {
- if (!ansi && !antlr4 && !pure) {
- if (lexers.Antlr3) {
- return lexers.Antlr3;
- }
- return MakeDummyLexerFactory("antlr3");
- } else if (ansi && !antlr4 && !pure) {
- if (lexers.Antlr3Ansi) {
- return lexers.Antlr3Ansi;
- }
- return MakeDummyLexerFactory("antlr3_ansi");
- } else if (!ansi && antlr4 && !pure) {
- if (lexers.Antlr4) {
- return lexers.Antlr4;
- }
- return MakeDummyLexerFactory("antlr4");
- } else if (ansi && antlr4 && !pure) {
- if (lexers.Antlr4Ansi) {
- return lexers.Antlr4Ansi;
- }
- return MakeDummyLexerFactory("antlr4_ansi");
- } else if (!ansi && antlr4 && pure) {
- if (lexers.Antlr4Pure) {
- return lexers.Antlr4Pure;
- }
- return MakeDummyLexerFactory("antlr4_pure");
- } else if (ansi && antlr4 && pure) {
- if (lexers.Antlr4PureAnsi) {
- return lexers.Antlr4PureAnsi;
- }
- return MakeDummyLexerFactory("antlr4_pure_ansi");
- } else if (!ansi && !antlr4 && pure) {
- return MakeDummyLexerFactory("antlr3_pure");
+    // Resolves the factory for the requested configuration. When none is
+    // registered (or the combination is unknown), a dummy factory carrying
+    // the configuration's name is returned instead.
+    static NSQLTranslation::TLexerFactoryPtr GetFactory(const TLexers& lexers, bool ansi, bool antlr4, ELexerFlavor flavor) {
+        auto factory = GetMaybeFactory(lexers, ansi, antlr4, flavor);
+        if (!factory) {
+            return MakeDummyLexerFactory(GetLexerName(ansi, antlr4, flavor));
+        }
+        return factory;
+    }
+
+ static NSQLTranslation::TLexerFactoryPtr GetMaybeFactory(const TLexers& lexers, bool ansi, bool antlr4, ELexerFlavor flavor) {
+ if (!ansi && !antlr4 && flavor == ELexerFlavor::Default) {
+ return lexers.Antlr3;
+ } else if (ansi && !antlr4 && flavor == ELexerFlavor::Default) {
+ return lexers.Antlr3Ansi;
+ } else if (!ansi && antlr4 && flavor == ELexerFlavor::Default) {
+ return lexers.Antlr4;
+ } else if (ansi && antlr4 && flavor == ELexerFlavor::Default) {
+ return lexers.Antlr4Ansi;
+ } else if (!ansi && antlr4 && flavor == ELexerFlavor::Pure) {
+ return lexers.Antlr4Pure;
+ } else if (ansi && antlr4 && flavor == ELexerFlavor::Pure) {
+ return lexers.Antlr4PureAnsi;
+ } else if (!ansi && !antlr4 && flavor == ELexerFlavor::Regex) {
+ return lexers.Regex;
+ } else if (ansi && !antlr4 && flavor == ELexerFlavor::Regex) {
+ return lexers.RegexAnsi;
} else {
- return MakeDummyLexerFactory("antlr3_pure_ansi");
+ return nullptr;
}
}
+    // Builds the name used for a lexer configuration, e.g. "antlr4_pure_ansi"
+    // or "regex". Parts are joined with '_' in this order:
+    // ANTLR version (if any), flavor suffix (if any), "ansi" (if requested).
+    static TString GetLexerName(bool ansi, bool antlr4, ELexerFlavor flavor) {
+        // The element type is deliberately non-const: containers of const
+        // elements are not supported by the standard allocator.
+        TVector<TStringBuf> parts;
+
+        if (antlr4) {
+            parts.emplace_back("antlr4");
+        } else if (flavor != ELexerFlavor::Regex) {
+            // With antlr4 == false the Regex flavor gets no version prefix,
+            // which yields "regex" / "regex_ansi".
+            parts.emplace_back("antlr3");
+        }
+
+        switch (flavor) {
+            case ELexerFlavor::Default: {
+                // No flavor suffix.
+            } break;
+            case ELexerFlavor::Pure: {
+                parts.emplace_back("pure");
+            } break;
+            case ELexerFlavor::Regex: {
+                parts.emplace_back("regex");
+            } break;
+        }
+
+        if (ansi) {
+            parts.emplace_back("ansi");
+        }
+
+        return JoinSeq("_", parts);
+    }
+
+
private:
NSQLTranslation::TLexerFactoryPtr Factory;
};
} // namespace
-NSQLTranslation::ILexer::TPtr MakeLexer(const TLexers& lexers, bool ansi, bool antlr4, bool pure) {
- return NSQLTranslation::ILexer::TPtr(new TV1Lexer(lexers, ansi, antlr4, pure));
+NSQLTranslation::ILexer::TPtr MakeLexer(const TLexers& lexers, bool ansi, bool antlr4, ELexerFlavor flavor) {
+ return NSQLTranslation::ILexer::TPtr(new TV1Lexer(lexers, ansi, antlr4, flavor));
}
bool IsProbablyKeyword(const NSQLTranslation::TParsedToken& token) {
diff --git a/yql/essentials/sql/v1/lexer/lexer.h b/yql/essentials/sql/v1/lexer/lexer.h
index 1cc8566fcf6..226e8b6ed25 100644
--- a/yql/essentials/sql/v1/lexer/lexer.h
+++ b/yql/essentials/sql/v1/lexer/lexer.h
@@ -11,9 +11,18 @@ struct TLexers {
NSQLTranslation::TLexerFactoryPtr Antlr4Ansi;
NSQLTranslation::TLexerFactoryPtr Antlr4Pure;
NSQLTranslation::TLexerFactoryPtr Antlr4PureAnsi;
+ NSQLTranslation::TLexerFactoryPtr Regex;
+ NSQLTranslation::TLexerFactoryPtr RegexAnsi;
};
-NSQLTranslation::ILexer::TPtr MakeLexer(const TLexers& lexers, bool ansi, bool antlr4, bool pure = false);
+// Selects which family of lexer factories from TLexers is used by MakeLexer.
+enum class ELexerFlavor {
+ // Antlr3 / Antlr3Ansi / Antlr4 / Antlr4Ansi factories.
+ Default,
+ // Antlr4Pure / Antlr4PureAnsi factories.
+ Pure,
+ // Regex / RegexAnsi factories.
+ Regex,
+};
+
+NSQLTranslation::ILexer::TPtr MakeLexer(
+ const TLexers& lexers, bool ansi, bool antlr4, ELexerFlavor flavor = ELexerFlavor::Default);
// "Probably" because YQL keyword can be an identifier
// depending on a query context. For example
diff --git a/yql/essentials/sql/v1/lexer/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/lexer_ut.cpp
index 3ad01f631b6..53cff6ffdc7 100644
--- a/yql/essentials/sql/v1/lexer/lexer_ut.cpp
+++ b/yql/essentials/sql/v1/lexer/lexer_ut.cpp
@@ -6,6 +6,7 @@
#include <yql/essentials/sql/v1/lexer/antlr3/lexer.h>
#include <yql/essentials/sql/v1/lexer/antlr4/lexer.h>
#include <yql/essentials/sql/v1/lexer/antlr4_pure/lexer.h>
+#include <yql/essentials/sql/v1/lexer/regex/lexer.h>
#include <library/cpp/testing/unittest/registar.h>
@@ -59,6 +60,42 @@ void AssertEquivialent(const TParsedTokenList& lhs, const TParsedTokenList& rhs)
}
Y_UNIT_TEST_SUITE(SQLv1Lexer) {
+ // With no factories registered, every (ansi, antlr4, flavor) combination
+ // must report "Lexer <name> is not supported" with the expected name.
+ Y_UNIT_TEST(UnsupportedIssues) {
+ // Default-constructed: no factory is set.
+ NSQLTranslationV1::TLexers factories;
+
+ TVector<ILexer::TPtr> lexers;
+ for (auto ansi : {false, true}) {
+ for (auto antlr4 : {false, true}) {
+ for (auto flavor : {ELexerFlavor::Default, ELexerFlavor::Pure, ELexerFlavor::Regex}) {
+ lexers.emplace_back(MakeLexer(factories, ansi, antlr4, flavor));
+ }
+ }
+ }
+
+ // Only the first issue message of each lexer is compared.
+ TVector<TString> actual;
+ for (auto& lexer : lexers) {
+ auto issues = GetIssueMessages(lexer, "");
+ actual.emplace_back(std::move(issues.at(0)));
+ }
+
+ // Listed in the loop order above: ansi outermost, then antlr4, then flavor.
+ TVector<TString> expected = {
+ "<main>: Error: Lexer antlr3 is not supported",
+ "<main>: Error: Lexer antlr3_pure is not supported",
+ "<main>: Error: Lexer regex is not supported",
+ "<main>: Error: Lexer antlr4 is not supported",
+ "<main>: Error: Lexer antlr4_pure is not supported",
+ "<main>: Error: Lexer antlr4_regex is not supported",
+ "<main>: Error: Lexer antlr3_ansi is not supported",
+ "<main>: Error: Lexer antlr3_pure_ansi is not supported",
+ "<main>: Error: Lexer regex_ansi is not supported",
+ "<main>: Error: Lexer antlr4_ansi is not supported",
+ "<main>: Error: Lexer antlr4_pure_ansi is not supported",
+ "<main>: Error: Lexer antlr4_regex_ansi is not supported",
+ };
+
+ UNIT_ASSERT_VALUES_EQUAL(actual, expected);
+ }
+
+
Y_UNIT_TEST(AntlrVersionIndependent) {
const TVector<TString> queriesUtf8 = {
"",
@@ -85,7 +122,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
auto lexer3 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false);
auto lexer4 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true);
- auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, /* pure = */ true);
+ auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, ELexerFlavor::Pure);
for (const auto& query : queriesUtf8) {
auto [tokens3, issues3] = Tokenize(lexer3, query);
@@ -164,19 +201,24 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
NSQLTranslationV1::TLexers lexers;
lexers.Antlr3 = NSQLTranslationV1::MakeAntlr3LexerFactory();
lexers.Antlr4 = NSQLTranslationV1::MakeAntlr4LexerFactory();
+ lexers.Antlr4Pure = NSQLTranslationV1::MakeAntlr4PureLexerFactory();
+ lexers.Regex = NSQLTranslationV1::MakeRegexLexerFactory(/* ansi = */ false);
auto lexer3 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false);
auto lexer4 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true);
- auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, /* pure = */ true);
+ auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, ELexerFlavor::Pure);
+ auto lexerR = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false, ELexerFlavor::Regex);
for (const auto& query : InvalidQueries()) {
auto issues3 = GetIssueMessages(lexer3, query);
auto issues4 = GetIssueMessages(lexer4, query);
auto issues4p = GetIssueMessages(lexer4p, query);
+ auto issuesR = GetIssueMessages(lexerR, query);
UNIT_ASSERT(!issues3.empty());
UNIT_ASSERT(!issues4.empty());
UNIT_ASSERT(!issues4p.empty());
+ UNIT_ASSERT(!issuesR.empty());
}
}
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
new file mode 100644
index 00000000000..1c8f2104a48
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
@@ -0,0 +1,252 @@
+#include "lexer.h"
+
+#include "regex.h"
+
+#include <contrib/libs/re2/re2/re2.h>
+
+#include <yql/essentials/core/issue/yql_issue.h>
+#include <yql/essentials/sql/v1/reflect/sql_reflect.h>
+
+#include <util/generic/algorithm.h>
+#include <util/generic/string.h>
+#include <util/string/subst.h>
+
+namespace NSQLTranslationV1 {
+
+ using NSQLTranslation::TParsedToken;
+ using NSQLTranslation::TParsedTokenList;
+
+    // Regex-driven implementation of NSQLTranslation::ILexer.
+    //
+    // At every position all candidate tokens are collected (keywords and
+    // punctuation as literal prefixes, the remaining tokens and comments via
+    // RE2 regexes) and the longest candidate is emitted.
+    class TRegexLexer: public NSQLTranslation::ILexer {
+        static constexpr const char* CommentTokenName = "COMMENT";
+
+    public:
+        // ansi: enables the nested multiline comment handling in MatchComment.
+        // grammar: reflected lexer grammar (keyword and punctuation names, blocks).
+        // RegexByOtherNameMap: token name -> regex source; the entry named
+        // COMMENT is compiled separately from the rest.
+        // Throws if the map has no COMMENT entry.
+        TRegexLexer(
+            bool ansi,
+            NSQLReflect::TLexerGrammar grammar,
+            const THashMap<TString, TString>& RegexByOtherNameMap)
+            : Grammar_(std::move(grammar))
+            , Ansi_(ansi)
+        {
+            // The map is borrowed as const, so its entries can only be copied.
+            for (const auto& [token, regex] : RegexByOtherNameMap) {
+                if (token == CommentTokenName) {
+                    CommentRegex_.Reset(new RE2(regex));
+                } else {
+                    OtherRegexes_.emplace(token, regex);
+                }
+            }
+
+            // MatchComment dereferences CommentRegex_ unconditionally, so fail
+            // fast here instead of dereferencing a null holder later.
+            Y_ENSURE(CommentRegex_, "COMMENT regex is missing");
+        }
+
+        // Splits `query` into tokens and feeds them to `onNextToken`, followed
+        // by a final EOF token. A position where nothing matches is skipped by
+        // one byte and reported into `issues`; scanning stops at an unmatched
+        // position once `maxErrors` issues have been recorded.
+        // Returns true iff no issue was recorded.
+        bool Tokenize(
+            const TString& query,
+            const TString& queryName,
+            const TTokenCallback& onNextToken,
+            NYql::TIssues& issues,
+            size_t maxErrors) override {
+            size_t errors = 0;
+            for (size_t pos = 0; pos < query.size();) {
+                TParsedToken matched = Match(TStringBuf(query, pos));
+
+                if (matched.Name.empty() && maxErrors == errors) {
+                    break;
+                }
+
+                if (matched.Name.empty()) {
+                    pos += 1;
+                    errors += 1;
+                    // NOTE(review): the position passed here is the offset
+                    // taken after the increment above - confirm it is intended.
+                    issues.AddIssue(NYql::TPosition(pos, 0, queryName), "no candidates");
+                    continue;
+                }
+
+                pos += matched.Content.length();
+                onNextToken(std::move(matched));
+            }
+
+            onNextToken(TParsedToken{.Name = "EOF"});
+            return errors == 0;
+        }
+
+    private:
+        // Returns the longest candidate token at the start of `prefix`, or a
+        // default-constructed token (empty Name) when nothing matches.
+        // Ambiguities other than the explicitly allowed ones trip Y_ENSURE.
+        TParsedToken Match(const TStringBuf prefix) {
+            TParsedTokenList matches;
+
+            size_t keywordCount = MatchKeyword(prefix, matches);
+            MatchPunctuation(prefix, matches);
+            size_t otherCount = MatchRegex(prefix, matches);
+            MatchComment(prefix, matches);
+
+            auto max = MaxElementBy(matches, [](const TParsedToken& m) {
+                return m.Content.length();
+            });
+
+            if (max == std::end(matches)) {
+                return {};
+            }
+
+            auto isMatched = [&](const TStringBuf name) {
+                return std::end(matches) != FindIf(matches, [&](const auto& m) {
+                    return m.Name == name;
+                });
+            };
+
+            // At most one regex token may match, except DIGITS + INTEGER_VALUE.
+            Y_ENSURE(
+                otherCount <= 1 ||
+                (otherCount == 2 && isMatched("DIGITS") && isMatched("INTEGER_VALUE")));
+
+            // Number of other candidates tied with the longest one.
+            size_t conflicts = CountIf(matches, [&](const TParsedToken& m) {
+                return m.Content.length() == max->Content.length();
+            });
+            conflicts -= 1;
+            Y_ENSURE(
+                conflicts == 0 ||
+                (conflicts == 1 && keywordCount != 0 && isMatched("ID_PLAIN")) ||
+                (conflicts == 1 && isMatched("DIGITS") && isMatched("INTEGER_VALUE")));
+
+            Y_ENSURE(!max->Content.empty());
+            return *max;
+        }
+
+        // Appends every keyword whose name is a literal prefix of `prefix`;
+        // returns the number of appended candidates.
+        size_t MatchKeyword(const TStringBuf prefix, TParsedTokenList& matches) {
+            size_t count = 0;
+            for (const auto& keyword : Grammar_.KeywordNames) {
+                if (prefix.substr(0, keyword.length()) == keyword) {
+                    matches.emplace_back(keyword, keyword);
+                    count += 1;
+                }
+            }
+            return count;
+        }
+
+        // Appends every punctuation token whose grammar block is a literal
+        // prefix of `prefix`; returns the number of appended candidates.
+        size_t MatchPunctuation(const TStringBuf prefix, TParsedTokenList& matches) {
+            size_t count = 0;
+            for (const auto& name : Grammar_.PunctuationNames) {
+                const auto& content = Grammar_.BlockByName.at(name);
+                if (prefix.substr(0, content.length()) == content) {
+                    matches.emplace_back(name, content);
+                    count += 1;
+                }
+            }
+            return count;
+        }
+
+        // Appends every non-comment regex token that matches a non-empty
+        // prefix; returns the number of appended candidates.
+        size_t MatchRegex(const TStringBuf prefix, TParsedTokenList& matches) {
+            size_t count = 0;
+            for (const auto& [token, regex] : OtherRegexes_) {
+                if (const TStringBuf match = TryMatchRegex(prefix, regex); !match.empty()) {
+                    matches.emplace_back(token, TString(match));
+                    count += 1;
+                }
+            }
+            return count;
+        }
+
+        // Returns the part of `prefix` consumed by `regex` from its start, or
+        // an empty buffer when the regex does not match there.
+        const TStringBuf TryMatchRegex(const TStringBuf prefix, const RE2& regex) {
+            re2::StringPiece input(prefix.data(), prefix.size());
+            if (RE2::Consume(&input, regex)) {
+                // After Consume, `input` starts right past the matched text.
+                return TStringBuf(prefix.data(), input.data());
+            }
+            return "";
+        }
+
+        // Appends a COMMENT candidate if a comment starts at `prefix`; returns
+        // the number of appended candidates (0 or 1). In ANSI mode a "/*"
+        // comment is additionally run through the nested-comment matcher and
+        // its result is preferred when it finds a balanced comment.
+        size_t MatchComment(const TStringBuf prefix, TParsedTokenList& matches) {
+            const TStringBuf reContent = TryMatchRegex(prefix, *CommentRegex_);
+            if (reContent.empty()) {
+                return 0;
+            }
+
+            if (!(Ansi_ && prefix.StartsWith("/*"))) {
+                matches.emplace_back(CommentTokenName, TString(reContent));
+                return 1;
+            }
+
+            const size_t ll1Length = MatchANSIMultilineComment(prefix);
+            const TStringBuf ll1Content = prefix.SubString(0, ll1Length);
+
+            // Both candidates are prefixes of the same text, so they are
+            // compared by length: the nested match is never the shorter one.
+            Y_ENSURE(ll1Length == 0 || reContent.length() <= ll1Length);
+            if (ll1Length == 0) {
+                // No balanced nested comment: fall back to the regex match.
+                matches.emplace_back(CommentTokenName, TString(reContent));
+                return 1;
+            }
+
+            matches.emplace_back(CommentTokenName, TString(ll1Content));
+            return 1;
+        }
+
+        // Returns the length of a balanced "/* ... */" comment (nested
+        // comments allowed) at the start of `remaining`, or 0 if there is none.
+        size_t MatchANSIMultilineComment(TStringBuf remaining) {
+            if (!remaining.StartsWith("/*")) {
+                return 0;
+            }
+
+            size_t skipped = 0;
+
+            remaining.Skip(2);
+            skipped += 2;
+
+            for (;;) {
+                if (remaining.StartsWith("*/")) {
+                    remaining.Skip(2);
+                    skipped += 2;
+                    return skipped;
+                }
+
+                bool isSkipped = false;
+                if (remaining.StartsWith("/*")) {
+                    // A nested comment must close before the last "*/" of the
+                    // remaining text, which is left for an enclosing comment.
+                    size_t limit = remaining.rfind("*/");
+                    if (limit == std::string::npos) {
+                        return 0;
+                    }
+
+                    size_t len = MatchANSIMultilineComment(remaining.Head(limit));
+                    remaining.Skip(len);
+                    skipped += len;
+
+                    isSkipped = len != 0;
+                }
+
+                if (isSkipped) {
+                    continue;
+                }
+
+                if (remaining.size() == 0) {
+                    // Ran out of input without a closing "*/".
+                    return 0;
+                }
+
+                // Ordinary character (or an unbalanced "/*"): skip one byte.
+                remaining.Skip(1);
+                skipped += 1;
+            }
+        }
+
+        NSQLReflect::TLexerGrammar Grammar_;
+        THashMap<TString, RE2> OtherRegexes_;
+        THolder<RE2> CommentRegex_;
+        bool Ansi_;
+    };
+
+ namespace {
+
+ // Lexer factory that loads the lexer grammar and derives the regex map
+ // once, then hands both to every TRegexLexer it creates.
+ class TFactory final: public NSQLTranslation::ILexerFactory {
+ public:
+ // ansi: forwarded to MakeRegexByOtherNameMap and to each created lexer.
+ explicit TFactory(bool ansi)
+ : Ansi_(ansi)
+ , Grammar_(NSQLReflect::LoadLexerGrammar())
+ , RegexByOtherNameMap_(MakeRegexByOtherNameMap(Grammar_, Ansi_))
+ {
+ }
+
+ // Creates a new TRegexLexer; the grammar is passed by value (copied).
+ NSQLTranslation::ILexer::TPtr MakeLexer() const override {
+ return NSQLTranslation::ILexer::TPtr(
+ new TRegexLexer(Ansi_, Grammar_, RegexByOtherNameMap_));
+ }
+
+ private:
+ bool Ansi_;
+ NSQLReflect::TLexerGrammar Grammar_;
+ THashMap<TString, TString> RegexByOtherNameMap_;
+ };
+
+ } // namespace
+
+ // Creates a factory of regex-based lexers for the given syntax mode
+ // (ansi == true selects the ANSI mode, false the default one).
+ NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi) {
+ return NSQLTranslation::TLexerFactoryPtr(new TFactory(ansi));
+ }
+
+} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.h b/yql/essentials/sql/v1/lexer/regex/lexer.h
new file mode 100644
index 00000000000..e9968954e1f
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include <yql/essentials/parser/lexer_common/lexer.h>
+
+namespace NSQLTranslationV1 {
+
+ NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi);
+
+} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp
new file mode 100644
index 00000000000..ae0d018e42d
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp
@@ -0,0 +1,219 @@
+#include "lexer.h"
+
+#include <yql/essentials/public/issue/yql_issue.h>
+#include <yql/essentials/sql/settings/translation_settings.h>
+#include <yql/essentials/sql/v1/lexer/lexer.h>
+#include <yql/essentials/sql/v1/lexer/antlr4_pure_ansi/lexer.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/random/random.h>
+
+using namespace NSQLTranslationV1;
+using NSQLTranslation::SQL_MAX_PARSER_ERRORS;
+using NSQLTranslation::Tokenize;
+using NSQLTranslation::TParsedToken;
+using NSQLTranslation::TParsedTokenList;
+using NYql::TIssues;
+
+TLexers Lexers = {
+ .Antlr4PureAnsi = MakeAntlr4PureAnsiLexerFactory(),
+ .Regex = MakeRegexLexerFactory(/* ansi = */ false),
+ .RegexAnsi = MakeRegexLexerFactory(/* ansi = */ true),
+};
+
+auto PureAnsiLexer = MakeLexer(
+ Lexers, /* ansi = */ true, /* antlr4 = */ true, ELexerFlavor::Pure);
+
+auto DefaultLexer = MakeLexer(
+ Lexers, /* ansi = */ false, /* antlr4 = */ false, ELexerFlavor::Regex);
+
+auto AnsiLexer = MakeLexer(
+ Lexers, /* ansi = */ true, /* antlr4 = */ false, ELexerFlavor::Regex);
+
+// Renders a token as "NAME(content)"; the content part is omitted when it
+// equals the name (keywords) and for the EOF token.
+TString ToString(TParsedToken token) {
+    if (token.Name == token.Content || token.Name == "EOF") {
+        return token.Name;
+    }
+    return token.Name + "(" + token.Content + ")";
+}
+
+// Tokenizes `query` and renders the tokens as a space-separated string,
+// prefixed with "[INVALID] " when tokenization reported a failure.
+TString Tokenized(NSQLTranslation::ILexer& lexer, const TString& query) {
+    TParsedTokenList tokens;
+    TIssues issues;
+    const bool ok = Tokenize(lexer, query, "Test", tokens, issues, SQL_MAX_PARSER_ERRORS);
+
+    TString rendered = ok ? "" : "[INVALID] ";
+    for (auto& token : tokens) {
+        rendered += ToString(std::move(token));
+        rendered += " ";
+    }
+
+    // Drop the trailing separator (or the space of a lone "[INVALID] ").
+    if (!rendered.empty()) {
+        rendered.pop_back();
+    }
+    return rendered;
+}
+
+// Generates a random sequence of fewer than `maxSize` comment delimiters
+// ("/*" or "*/"), each optionally followed by a single space.
+TString RandomMultilineCommentLikeText(size_t maxSize) {
+    const size_t count = RandomNumber<size_t>(maxSize);
+    TString text;
+    for (size_t index = 0; index != count; ++index) {
+        const bool isOpen = RandomNumber<bool>();
+        text += isOpen ? "/*" : "*/";
+
+        // Zero or one space after each delimiter.
+        const size_t spaces = RandomNumber<size_t>(2);
+        text.append(spaces, ' ');
+    }
+    return text;
+}
+
+// Asserts that the regex lexer of the selected mode renders `input` as
+// `expected` (see Tokenized for the rendering format).
+void Check(TString input, TString expected, bool ansi) {
+    auto* lexer = ansi ? AnsiLexer.Get() : DefaultLexer.Get();
+    UNIT_ASSERT_VALUES_EQUAL(Tokenized(*lexer, input), expected);
+}
+
+// Asserts the same expectation in both modes: default first, then ANSI.
+void Check(TString input, TString expected) {
+    for (bool ansi : {false, true}) {
+        Check(input, expected, ansi);
+    }
+}
+
+Y_UNIT_TEST_SUITE(RegexLexerTests) {
+ Y_UNIT_TEST(Whitespace) {
+ Check("", "EOF");
+ Check(" ", "WS( ) EOF");
+ Check(" ", "WS( ) WS( ) EOF");
+ Check("\n", "WS(\n) EOF");
+ }
+
+ Y_UNIT_TEST(SinleLineComment) {
+ Check("--yql", "COMMENT(--yql) EOF");
+ Check("-- yql ", "COMMENT(-- yql ) EOF");
+ Check("-- yql\nSELECT", "COMMENT(-- yql\n) SELECT EOF");
+ Check("-- yql --", "COMMENT(-- yql --) EOF");
+ }
+
+ Y_UNIT_TEST(MultiLineComment) {
+ Check("/* yql */", "COMMENT(/* yql */) EOF");
+ Check("/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF");
+ Check("/* yql\n * yql\n */", "COMMENT(/* yql\n * yql\n */) EOF");
+ }
+
+ Y_UNIT_TEST(RecursiveMultiLineCommentDefault) {
+ Check("/* /* yql */", "COMMENT(/* /* yql */) EOF", /* ansi = */ false);
+ Check("/* /* yql */ */", "COMMENT(/* /* yql */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ false);
+ }
+
+ Y_UNIT_TEST(RecursiveMultiLineCommentAnsi) {
+ Check("/* /* yql */", "COMMENT(/* /* yql */) EOF", /* ansi = */ true);
+ Check("/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
+ Check("/* /* /* yql */ */", "COMMENT(/* /* /* yql */ */) EOF", /* ansi = */ true);
+ Check("/* /* yql */ */ */", "COMMENT(/* /* yql */ */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
+ Check("/* /* yql */ */", "COMMENT(/* /* yql */ */) EOF", /* ansi = */ true);
+ Check("/*/*/*/", "COMMENT(/*/*/) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
+ Check("/*/**/*/*/*/", "COMMENT(/*/**/*/) ASTERISK(*) SLASH(/) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
+ Check("/* /* */ a /* /* */", "COMMENT(/* /* */ a /* /* */) EOF", /* ansi = */ true);
+ }
+
+ Y_UNIT_TEST(RecursiveMultiLineCommentAnsiReferenceComparion) {
+ SetRandomSeed(100);
+ for (size_t i = 0; i < 512; ++i) {
+ auto input = RandomMultilineCommentLikeText(/* maxSize = */ 128);
+ TString actual = Tokenized(*AnsiLexer, input);
+ TString expected = Tokenized(*PureAnsiLexer, input);
+ UNIT_ASSERT_VALUES_EQUAL_C(actual, expected, "Input: " << input);
+ }
+ }
+
+ Y_UNIT_TEST(Keyword) {
+ Check("SELECT", "SELECT EOF");
+ Check("INSERT", "INSERT EOF");
+ Check("FROM", "FROM EOF");
+ }
+
+ Y_UNIT_TEST(Punctuation) {
+ Check(
+ "* / + - <|",
+ "ASTERISK(*) WS( ) SLASH(/) WS( ) "
+ "PLUS(+) WS( ) MINUS(-) WS( ) STRUCT_OPEN(<|) EOF");
+ Check("SELECT*FROM", "SELECT ASTERISK(*) FROM EOF");
+ }
+
+ Y_UNIT_TEST(IdPlain) {
+ Check("variable my_table", "ID_PLAIN(variable) WS( ) ID_PLAIN(my_table) EOF");
+ }
+
+ Y_UNIT_TEST(IdQuoted) {
+ Check("``", "ID_QUOTED(``) EOF");
+ Check("` `", "ID_QUOTED(` `) EOF");
+ Check("` `", "ID_QUOTED(` `) EOF");
+ Check("`local/table`", "ID_QUOTED(`local/table`) EOF");
+ }
+
+ Y_UNIT_TEST(SinleLineString) {
+ Check("\"\"", "STRING_VALUE(\"\") EOF");
+ Check("\' \'", "STRING_VALUE(\' \') EOF");
+ Check("\" \"", "STRING_VALUE(\" \") EOF");
+ Check("\"test\"", "STRING_VALUE(\"test\") EOF");
+
+ Check("\"\\\"\"", "STRING_VALUE(\"\\\"\") EOF", /* ansi = */ false);
+ Check("\"\\\"\"", "[INVALID] STRING_VALUE(\"\\\") EOF", /* ansi = */ true);
+
+ Check("\"\"\"\"", "STRING_VALUE(\"\") STRING_VALUE(\"\") EOF", /* ansi = */ false);
+ Check("\"\"\"\"", "STRING_VALUE(\"\"\"\") EOF", /* ansi = */ true);
+ }
+
+ Y_UNIT_TEST(MultiLineString) {
+ Check("@@@@", "STRING_VALUE(@@@@) EOF");
+ Check("@@ @@@", "STRING_VALUE(@@ @@@) EOF");
+ Check("@@test@@", "STRING_VALUE(@@test@@) EOF");
+ Check("@@line1\nline2@@", "STRING_VALUE(@@line1\nline2@@) EOF");
+ }
+
+ Y_UNIT_TEST(Query) {
+ TString query =
+ "SELECT\n"
+ " 123467,\n"
+ " \"Hello, {name}!\",\n"
+ " (1 + (5 * 1 / 0)),\n"
+ " MIN(identifier),\n"
+ " Bool(field),\n"
+ " Math::Sin(var)\n"
+ "FROM `local/test/space/table`\n"
+ "JOIN test;";
+
+ TString expected =
+ "SELECT WS(\n) "
+ "WS( ) WS( ) INTEGER_VALUE(123467) COMMA(,) WS(\n) "
+ "WS( ) WS( ) STRING_VALUE(\"Hello, {name}!\") COMMA(,) WS(\n) "
+ "WS( ) WS( ) LPAREN(() INTEGER_VALUE(1) WS( ) PLUS(+) WS( ) LPAREN(() INTEGER_VALUE(5) WS( ) "
+ "ASTERISK(*) WS( ) INTEGER_VALUE(1) WS( ) SLASH(/) WS( ) INTEGER_VALUE(0) RPAREN()) "
+ "RPAREN()) COMMA(,) WS(\n) "
+ "WS( ) WS( ) ID_PLAIN(MIN) LPAREN(() ID_PLAIN(identifier) RPAREN()) COMMA(,) WS(\n) "
+ "WS( ) WS( ) ID_PLAIN(Bool) LPAREN(() ID_PLAIN(field) RPAREN()) COMMA(,) WS(\n) "
+ "WS( ) WS( ) ID_PLAIN(Math) NAMESPACE(::) ID_PLAIN(Sin) LPAREN(() ID_PLAIN(var) RPAREN()) WS(\n) "
+ "FROM WS( ) ID_QUOTED(`local/test/space/table`) WS(\n) "
+ "JOIN WS( ) ID_PLAIN(test) SEMICOLON(;) EOF";
+
+ Check(query, expected);
+ }
+
+ Y_UNIT_TEST(Invalid) {
+ Check("\"", "[INVALID] EOF");
+ Check("\" SELECT", "[INVALID] WS( ) SELECT EOF");
+ }
+
+} // Y_UNIT_TEST_SUITE(RegexLexerTests)
diff --git a/yql/essentials/sql/v1/lexer/regex/regex.cpp b/yql/essentials/sql/v1/lexer/regex/regex.cpp
new file mode 100644
index 00000000000..a8aca8a1318
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/regex.cpp
@@ -0,0 +1,240 @@
+#include "regex.h"
+
+#include <contrib/libs/re2/re2/re2.h>
+
+#include <util/generic/vector.h>
+
+#define SUBSTITUTION(name, mode) /* {"name", name_mode}: the placeholder name paired with the value of the mode-specific macro */ \
+ {#name, name##_##mode}
+
+#define SUBSTITUTIONS(mode) /* {"mode", {...}}: the mode name paired with all placeholder substitutions of that mode */ \
+ { \
+ #mode, { \
+ SUBSTITUTION(GRAMMAR_STRING_CORE_SINGLE, mode), \
+ SUBSTITUTION(GRAMMAR_STRING_CORE_DOUBLE, mode), \
+ SUBSTITUTION(GRAMMAR_MULTILINE_COMMENT_CORE, mode), \
+ } \
+ }
+
+namespace NSQLTranslationV1 {
+
+ class TLexerGrammarToRegexTranslator { // Turns ANTLR lexer rule bodies from TLexerGrammar into regex text through ordered textual rewrites.
+ private:
+ struct TRewriteRule { // One in-place text rewrite.
+ TString Repr; // human-readable description of the rule
+ std::function<void(TString&)> Apply; // performs the rewrite on the given text
+ };
+
+ using TRewriteRules = TVector<TRewriteRule>;
+
+ public:
+ explicit TLexerGrammarToRegexTranslator(const NSQLReflect::TLexerGrammar& grammar, bool ansi) // `ansi` selects the "ANSI" substitution set, otherwise "DEFAULT" is used
+ : Grammar_(&grammar) // only a pointer is kept, so `grammar` must outlive the translator
+ , Mode_(ansi ? "ANSI" : "DEFAULT")
+ {
+ AddExternalRules(Inliners_); // @NAME@ placeholder rules are registered before the rule-name rules
+ AddFragmentRules(Inliners_);
+
+ AddLetterRules(Transformations_);
+ AddTransformationRules(Transformations_);
+
+ UnwrapQuotes_ = UnwrapQuotesRule();
+ AddSpaceCollapses(SpaceCollapses_);
+ UnwrapQuotedSpace_ = UnwrapQuotedSpaceRule();
+ }
+
+ TString ToRegex(const TStringBuf name) { // Returns the regex text for rule `name`, which must be present in BlockByName.
+ TString text = Grammar_->BlockByName.at(name);
+ Inline(text); // expand placeholders and referenced rule names
+ Transform(text); // rewrite ANTLR constructs into regex constructs
+ Finalize(text); // unquote literals and drop the spaces separating grammar elements
+ return text;
+ }
+
+ private:
+ void Inline(TString& text) { // Applies the inlining rules repeatedly until the text stops changing.
+ ApplyEachWhileChanging(text, Inliners_);
+ }
+
+ void AddExternalRules(TRewriteRules& rules) { // Adds "@NAME@" -> text rules built from the GRAMMAR_* macro values of the current mode.
+ THashMap<TString, THashMap<TString, TString>> Substitutions = {
+ SUBSTITUTIONS(DEFAULT),
+ SUBSTITUTIONS(ANSI),
+ };
+
+ // The ANSI mode MULTILINE_COMMENT definition is recursive, so the DEFAULT core is used for it instead
+ Substitutions["ANSI"]["GRAMMAR_MULTILINE_COMMENT_CORE"] =
+ Substitutions["DEFAULT"]["GRAMMAR_MULTILINE_COMMENT_CORE"];
+
+ for (const auto& [k, v] : Substitutions.at(Mode_)) {
+ rules.emplace_back(RegexRewriteRule("@" + k + "@", v)); // "@NAME@" -> mode-specific text
+ }
+ }
+
+ void AddFragmentRules(TRewriteRules& rules) { // Adds one rule per grammar block replacing its whole-word name with "(definition)".
+ const THashSet<TString> PunctuationFragments = {
+ "BACKSLASH",
+ "QUOTE_DOUBLE",
+ "QUOTE_SINGLE",
+ "BACKTICK",
+ "DOUBLE_COMMAT",
+ };
+
+ for (const auto& [name, definition] : Grammar_->BlockByName) {
+ TString def = definition;
+ if (
+ Grammar_->PunctuationNames.contains(name) ||
+ PunctuationFragments.contains(name)) {
+ def = "'" + def + "'"; // wrap in quotes again; presumably these blocks are stored without their quotes
+ }
+ def = QuoteAntlrRewrite(std::move(def)); // the definition is used as an RE2 rewrite string below
+
+ rules.emplace_back(RegexRewriteRule(
+ "(\\b" + name + "\\b)",
+ "(" + def + ")"));
+ }
+ }
+
+ void Transform(TString& text) { // Applies the transformation rules repeatedly until the text stops changing.
+ ApplyEachWhileChanging(text, Transformations_);
+ }
+
+ void AddLetterRules(TRewriteRules& rules) { // For each of A..Z: a standalone uppercase letter becomes a two-case class such as [aA].
+ for (char letter = 'A'; letter <= 'Z'; ++letter) {
+ TString lower(char(ToLower(letter)));
+ TString upper(char(ToUpper(letter)));
+ rules.emplace_back(RegexRewriteRule( // the letter must not touch a quote, a word character or a square bracket
+ "([^'\\w\\[\\]]|^)" + upper + "([^'\\w\\[\\]]|$)",
+ "\\1[" + lower + upper + "]\\2"));
+ }
+ }
+
+ void AddTransformationRules(TRewriteRules& rules) { // ANTLR-to-regex syntax rewrites, registered in application order.
+ rules.emplace_back(RegexRewriteRule( // ~('a' | 'b') -> [^ab]
+ R"(~\('(..?)' \| '(..?)'\))", R"([^\1\2])"));
+
+ rules.emplace_back(RegexRewriteRule( // ~('a') -> [^a]
+ R"(~\('(..?)'\))", R"([^\1])"));
+
+ rules.emplace_back(RegexRewriteRule( // 'a'..'z' -> ['a'-'z'], the quotes are still kept at this point
+ R"(('..?')\.\.('..?'))", R"([\1-\2])"));
+
+ rules.emplace_back(RegexRewriteRule( // (x) -> x for a single character
+ R"(\((.)\))", R"(\1)"));
+
+ rules.emplace_back(RegexRewriteRule( // ([...]) -> [...] for a class of 1 to 8 characters
+ R"(\((\[.{1,8}\])\))", R"(\1)"));
+
+ rules.emplace_back(RegexRewriteRule( // ('a') -> 'a'
+ R"(\(('..?')\))", R"(\1)"));
+
+ rules.emplace_back(RegexRewriteRule( // a dot preceded by a space -> the group (.|\n)
+ R"( \.)", R"( (.|\\n))"));
+
+ rules.emplace_back(RegexRewriteRule( // the word EOF -> $
+ R"(\bEOF\b)", R"($)"));
+
+ rules.emplace_back(RegexRewriteRule( // removes the '\u000C' alternative together with its | separator
+ R"('\\u000C' \|)", ""));
+ }
+
+ void Finalize(TString& text) { // Order matters: literals are unquoted first, then spaces are dropped, then ' ' becomes a plain space.
+ UnwrapQuotes_.Apply(text);
+ ApplyEachWhileChanging(text, SpaceCollapses_);
+ UnwrapQuotedSpace_.Apply(text);
+ }
+
+ void AddSpaceCollapses(TRewriteRules& rules) { // Removes every space except one that has a quote on both sides (the ' ' literal).
+ rules.emplace_back(RegexRewriteRule(R"(([^']|^) )", R"(\1)"));
+ rules.emplace_back(RegexRewriteRule(R"( ([^']|$))", R"(\1)"));
+ }
+
+ void ApplyEachOnce(TString& text, const TRewriteRules& rules) { // Applies every rule exactly once, in registration order.
+ for (const auto& rule : rules) {
+ rule.Apply(text);
+ }
+ }
+
+ void ApplyEachWhileChanging(TString& text, const TRewriteRules& rules) { // Repeats full passes over `rules` until a pass leaves `text` unchanged.
+ constexpr size_t Limit = 16;
+
+ TString prev;
+ for (size_t i = 0; i < Limit + 1 && prev != text; ++i) { // stops as soon as the previous pass changed nothing
+ prev = text;
+ ApplyEachOnce(text, rules);
+ Y_ENSURE(i != Limit); // fails when the text is still changing after Limit passes
+ }
+ }
+
+ TRewriteRule RegexRewriteRule(const TString& regex, TString rewrite) { // Compiles `regex`, validates `rewrite` against it and returns a rule doing RE2::GlobalReplace.
+ auto re2 = std::make_shared<RE2>(regex, RE2::Quiet); // shared so that the closure stored in std::function stays copyable
+ Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'");
+
+ TString error;
+ Y_ENSURE(
+ re2->CheckRewriteString(rewrite, &error),
+ error << " on rewrite '" << rewrite << "'");
+
+ return {
+ .Repr = regex + " -> " + rewrite,
+ .Apply = [re2, rewrite = std::move(rewrite)](TString& text) {
+ RE2::GlobalReplace(&text, *re2, rewrite);
+ },
+ };
+ }
+
+ TRewriteRule UnwrapQuotesRule() { // Replaces each quoted literal of one or two non-space characters with its RE2::QuoteMeta-escaped content.
+ const TString regex = R"('([^ ][^ ]?)')";
+ auto re2 = std::make_shared<RE2>(regex, RE2::Quiet);
+ Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'");
+
+ return {
+ .Repr = regex + " -> Quoted(\\1)",
+ .Apply = [re2](TString& text) {
+ TString content;
+ std::size_t i = 256; // iteration budget guarding against an endless loop
+ while (RE2::PartialMatch(text, *re2, &content) && --i != 0) { // picks a quoted literal that is still present
+ TString quoted = RE2::QuoteMeta(content); // up to two leading characters of a doubled backslash are trimmed below; the inner `i` shadows the budget counter
+ for (size_t i = 0; i < 2 && quoted.StartsWith(R"(\\)"); ++i) {
+ quoted.erase(std::begin(quoted));
+ }
+ SubstGlobal(text, "'" + content + "'", quoted); // replaces every occurrence of this literal at once
+ }
+ Y_ENSURE(i != 0); // fails when the budget of iterations is exhausted
+ },
+ };
+ }
+
+ TRewriteRule UnwrapQuotedSpaceRule() { // Turns the quoted space literal into a plain space.
+ return RegexRewriteRule(R"(' ')", R"( )");
+ }
+
+ TString QuoteAntlrRewrite(TString rewrite) { // Doubles backslashes in `rewrite`; the quoted backslash literal gets an additional doubling.
+ SubstGlobal(rewrite, R"(\)", R"(\\)");
+ SubstGlobal(rewrite, R"('\\')", R"('\\\\')");
+ return rewrite;
+ }
+
+ const NSQLReflect::TLexerGrammar* Grammar_; // not owned
+ const TStringBuf Mode_; // "ANSI" or "DEFAULT"
+
+ TRewriteRules Inliners_; // used by Inline
+
+ TRewriteRules Transformations_; // used by Transform
+
+ TRewriteRule UnwrapQuotes_; // the three members below are used by Finalize
+ TRewriteRules SpaceCollapses_;
+ TRewriteRule UnwrapQuotedSpace_;
+ };
+
+ THashMap<TString, TString> MakeRegexByOtherNameMap(const NSQLReflect::TLexerGrammar& grammar, bool ansi) {
+     // One translator instance serves all tokens, so its rewrite rules are built only once.
+     TLexerGrammarToRegexTranslator translator(grammar, ansi);
+
+     // Only the names listed in OtherNames get a regex; the map is keyed by token name.
+     THashMap<TString, TString> regexByName;
+     for (const TString& name : grammar.OtherNames) {
+         TString regex = translator.ToRegex(name);
+         regexByName.emplace(name, std::move(regex));
+     }
+
+     return regexByName;
+ }
+
+} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/regex.h b/yql/essentials/sql/v1/lexer/regex/regex.h
new file mode 100644
index 00000000000..9e29c3df25b
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/regex.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <yql/essentials/sql/v1/reflect/sql_reflect.h>
+
+#include <util/generic/hash.h>
+
+namespace NSQLTranslationV1 {
+
+ // Makes regexes, keyed by token name, only for the tokens listed in grammar.OtherNames,
+ // as keywords and punctuation are trivially matched. `ansi` selects the ANSI substitutions instead of the default ones.
+ THashMap<TString, TString> MakeRegexByOtherNameMap(
+ const NSQLReflect::TLexerGrammar& grammar, bool ansi);
+
+} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
new file mode 100644
index 00000000000..47a94f53ed0
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
@@ -0,0 +1,90 @@
+#include "regex.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <contrib/libs/re2/re2/re2.h>
+
+using namespace NSQLTranslationV1;
+
+namespace { // Shared fixtures and a helper for the tests below.
+ auto grammar = NSQLReflect::LoadLexerGrammar(); // NOTE(review): evaluated during static initialization; assumes the grammar resource is already available at that point - confirm
+ auto defaultRegexes = MakeRegexByOtherNameMap(grammar, /* ansi = */ false);
+ auto ansiRegexes = MakeRegexByOtherNameMap(grammar, /* ansi = */ true);
+
+ void CheckRegex(bool ansi, const TStringBuf name, const TStringBuf expected) { // Verifies that the regex generated for `name` compiles in RE2 and equals `expected` exactly.
+ const auto& regexes = ansi ? ansiRegexes : defaultRegexes;
+ const TString regex = regexes.at(name);
+
+ const RE2 re2(regex); // the generated text must be a valid RE2 pattern
+ Y_ENSURE(re2.ok(), re2.error());
+
+ UNIT_ASSERT_VALUES_EQUAL(regex, expected);
+ }
+
+} // namespace
+
+Y_UNIT_TEST_SUITE(SqlRegexTests) { // Pins the exact regex text generated for selected lexer rules.
+ Y_UNIT_TEST(StringValue) { // Default mode: quoted strings accept backslash escapes; @@...@@ strings and optional suffix letters follow.
+ CheckRegex(
+ /* ansi = */ false,
+ "STRING_VALUE",
+ R"(((((\'([^'\\]|(\\(.|\n)))*\'))|((\"([^"\\]|(\\(.|\n)))*\"))|((\@\@(.|\n)*?\@\@)+\@?))([sS]|[uU]|[yY]|[jJ]|[pP]([tT]|[bB]|[vV])?)?))");
+ }
+
+ Y_UNIT_TEST(AnsiStringValue) { // ANSI mode: a quote inside a string is written by doubling it instead of a backslash escape.
+ CheckRegex(
+ /* ansi = */ true,
+ "STRING_VALUE",
+ R"(((((\'([^']|(\'\'))*\'))|((\"([^"]|(\"\"))*\"))|((\@\@(.|\n)*?\@\@)+\@?))([sS]|[uU]|[yY]|[jJ]|[pP]([tT]|[bB]|[vV])?)?))");
+ }
+
+ Y_UNIT_TEST(IdPlain) { // A letter or underscore followed by letters, underscores and digits.
+ CheckRegex(
+ /* ansi = */ false,
+ "ID_PLAIN",
+ R"(([a-z]|[A-Z]|_)([a-z]|[A-Z]|_|[0-9])*)");
+ }
+
+ Y_UNIT_TEST(IdQuoted) { // Backtick-quoted identifier.
+ CheckRegex(
+ /* ansi = */ false,
+ "ID_QUOTED",
+ R"(\`(\\(.|\n)|\`\`|[^`\\])*\`)");
+ }
+
+ Y_UNIT_TEST(Digits) { // Decimal, 0x, 0o and 0b forms.
+ CheckRegex(
+ /* ansi = */ false,
+ "DIGITS",
+ R"(([0-9]+)|(0[xX]([0-9]|[a-f]|[A-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+))");
+ }
+
+ Y_UNIT_TEST(Real) { // Fraction and/or exponent, followed by an optional type suffix.
+ CheckRegex(
+ /* ansi = */ false,
+ "REAL",
+ R"((([0-9]+)\.[0-9]*([eE](\+|\-)?([0-9]+))?|([0-9]+)([eE](\+|\-)?([0-9]+)))([fF]|[pP]([fF](4|8)|[nN])?)?)");
+ }
+
+ Y_UNIT_TEST(Ws) { // A single whitespace character.
+ CheckRegex(
+ /* ansi = */ false,
+ "WS",
+ R"(( |\r|\t|\n))");
+ }
+
+ Y_UNIT_TEST(Comment) { // A /* ... */ block comment or a -- comment running to the end of the line or input.
+ CheckRegex(
+ /* ansi = */ false,
+ "COMMENT",
+ R"(((\/\*(.|\n)*?\*\/)|(\-\-[^\n\r]*(\r\n?|\n|$))))");
+ }
+
+ Y_UNIT_TEST(AnsiCommentSameAsDefault) {
+ // The ANSI comment definition is recursive, so both modes are expected to share one regex
+ UNIT_ASSERT_VALUES_EQUAL(
+ ansiRegexes.at("COMMENT"),
+ defaultRegexes.at("COMMENT"));
+ }
+
+} // Y_UNIT_TEST_SUITE(SqlRegexTests)
diff --git a/yql/essentials/sql/v1/lexer/regex/ut/ya.make b/yql/essentials/sql/v1/lexer/regex/ut/ya.make
new file mode 100644
index 00000000000..09eb74a3f68
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/ut/ya.make
@@ -0,0 +1,13 @@
+UNITTEST_FOR(yql/essentials/sql/v1/lexer/regex)
+
+PEERDIR(
+ yql/essentials/sql/v1/lexer
+ yql/essentials/sql/v1/lexer/antlr4_pure_ansi
+)
+
+SRCS(
+ lexer_ut.cpp
+ regex_ut.cpp
+)
+
+END()
diff --git a/yql/essentials/sql/v1/lexer/regex/ya.make b/yql/essentials/sql/v1/lexer/regex/ya.make
new file mode 100644
index 00000000000..249dfbd11df
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/ya.make
@@ -0,0 +1,39 @@
+LIBRARY()
+
+PEERDIR(
+ contrib/libs/re2
+ yql/essentials/public/issue
+ yql/essentials/parser/lexer_common
+ yql/essentials/sql/settings
+ yql/essentials/sql/v1/reflect
+)
+
+# TODO(vityaman): Extract to a single ya.make for reuse.
+
+SET(GRAMMAR_STRING_CORE_SINGLE_DEFAULT "~(QUOTE_SINGLE | BACKSLASH) | (BACKSLASH .)")
+SET(GRAMMAR_STRING_CORE_DOUBLE_DEFAULT "~(QUOTE_DOUBLE | BACKSLASH) | (BACKSLASH .)")
+SET(GRAMMAR_MULTILINE_COMMENT_CORE_DEFAULT "(.)")
+
+SET(GRAMMAR_STRING_CORE_SINGLE_ANSI "~QUOTE_SINGLE | (QUOTE_SINGLE QUOTE_SINGLE)")
+SET(GRAMMAR_STRING_CORE_DOUBLE_ANSI "~QUOTE_DOUBLE | (QUOTE_DOUBLE QUOTE_DOUBLE)")
+SET(GRAMMAR_MULTILINE_COMMENT_CORE_ANSI "MULTILINE_COMMENT | .")
+
+CFLAGS(
+ -DGRAMMAR_STRING_CORE_SINGLE_DEFAULT="\\\"${GRAMMAR_STRING_CORE_SINGLE_DEFAULT}\\\""
+ -DGRAMMAR_STRING_CORE_DOUBLE_DEFAULT="\\\"${GRAMMAR_STRING_CORE_DOUBLE_DEFAULT}\\\""
+ -DGRAMMAR_MULTILINE_COMMENT_CORE_DEFAULT="\\\"${GRAMMAR_MULTILINE_COMMENT_CORE_DEFAULT}\\\""
+ -DGRAMMAR_STRING_CORE_SINGLE_ANSI="\\\"${GRAMMAR_STRING_CORE_SINGLE_ANSI}\\\""
+ -DGRAMMAR_STRING_CORE_DOUBLE_ANSI="\\\"${GRAMMAR_STRING_CORE_DOUBLE_ANSI}\\\""
+ -DGRAMMAR_MULTILINE_COMMENT_CORE_ANSI="\\\"${GRAMMAR_MULTILINE_COMMENT_CORE_ANSI}\\\""
+)
+
+SRCS(
+ lexer.cpp
+ regex.cpp
+)
+
+END()
+
+RECURSE_FOR_TESTS(
+ ut
+)
diff --git a/yql/essentials/sql/v1/lexer/ut/ya.make b/yql/essentials/sql/v1/lexer/ut/ya.make
index c50c8cd7277..7e62fb50c85 100644
--- a/yql/essentials/sql/v1/lexer/ut/ya.make
+++ b/yql/essentials/sql/v1/lexer/ut/ya.make
@@ -6,6 +6,7 @@ PEERDIR(
yql/essentials/sql/v1/lexer/antlr3
yql/essentials/sql/v1/lexer/antlr4
yql/essentials/sql/v1/lexer/antlr4_pure
+ yql/essentials/sql/v1/lexer/regex
)
SRCS(
diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.cpp b/yql/essentials/sql/v1/reflect/sql_reflect.cpp
new file mode 100644
index 00000000000..f47f35cb9de
--- /dev/null
+++ b/yql/essentials/sql/v1/reflect/sql_reflect.cpp
@@ -0,0 +1,173 @@
+#include "sql_reflect.h"
+
+#include <library/cpp/resource/resource.h>
+
+#include <util/string/split.h>
+#include <util/string/strip.h>
+
+namespace NSQLReflect {
+
+ const TStringBuf ReflectPrefix = "//!"; // comment lines with this prefix are kept by Purify
+ const TStringBuf SectionPrefix = "//! section:"; // a line starting with this opens a new grammar section
+ const TStringBuf SectionPunctuation = "//! section:punctuation";
+ const TStringBuf SectionLetter = "//! section:letter";
+ const TStringBuf SectionKeyword = "//! section:keyword";
+ const TStringBuf SectionOther = "//! section:other";
+ const TStringBuf FragmentPrefix = "fragment "; // rule names starting with this are not registered as token names
+
+ TVector<TString> GetResourceLines(const TStringBuf key) {
+     // The resource has to exist: a failed lookup is reported through Y_ENSURE.
+     TString text;
+     Y_ENSURE(NResource::FindExact(key, &text));
+
+     // Cut the resource text at line feeds.
+     TVector<TString> result;
+     Split(text, "\n", result);
+
+     return result;
+ }
+
+ void Format(TVector<TString>& lines) { // Normalizes the grammar text in place: merges rules spanning several lines into one line and tidies the spacing.
+ for (size_t i = 0; i < lines.size(); ++i) {
+ auto& line = lines[i];
+
+ StripInPlace(line); // NOTE(review): a whitespace-only line is empty after this and also starts a merge below - confirm that is intended
+
+ if (line.StartsWith("//") || (line.Contains(':') && line.Contains(';'))) {
+ continue;
+ }
+
+ size_t j = i + 1; // the rule continues on the following lines
+ do {
+ line += lines.at(j); // at() throws when the input ends before a line with ';' is found
+ } while (!lines.at(j++).Contains(';')); // the line containing ';' is the last one appended
+
+ auto first = std::next(std::begin(lines), i + 1);
+ auto last = std::next(std::begin(lines), j);
+ lines.erase(first, last); // drops the lines that were merged into lines[i]
+ }
+
+ for (auto& line : lines) {
+ CollapseInPlace(line); // NOTE(review): presumably squeezes runs of whitespace - confirm
+ SubstGlobal(line, " ;", ";");
+ SubstGlobal(line, " :", ":");
+ SubstGlobal(line, " )", ")");
+ SubstGlobal(line, "( ", "(");
+ }
+ }
+
+ void Purify(TVector<TString>& lines) {
+     // A line is noise when it is empty or when it is a comment
+     // that does not carry the reflection prefix.
+     const auto isNoise = [](const TString& line) {
+         if (line.empty()) {
+             return true;
+         }
+         return line.StartsWith("//") && !line.StartsWith(ReflectPrefix);
+     };
+
+     const auto removed = std::ranges::remove_if(lines, isNoise);
+     lines.erase(removed.begin(), removed.end());
+ }
+
+ THashMap<TStringBuf, TVector<TString>> GroupBySection(TVector<TString>&& lines) {
+     // Section markers have to appear in exactly this order; index 0 stands
+     // for everything located before the first marker.
+     const TVector<TStringBuf> sections = {
+         "",
+         SectionPunctuation,
+         SectionLetter,
+         SectionKeyword,
+         SectionOther,
+     };
+
+     THashMap<TStringBuf, TVector<TString>> linesBySection;
+     size_t section = 0;
+     for (auto& line : lines) {
+         if (!line.StartsWith(SectionPrefix)) {
+             linesBySection[sections.at(section)].emplace_back(std::move(line));
+         } else {
+             // A marker line opens the next expected section and is not stored itself.
+             Y_ENSURE(sections.at(section + 1) == line);
+             ++section;
+         }
+     }
+
+     // The part before the first marker and the letter section are not returned.
+     linesBySection.erase("");
+     linesBySection.erase(SectionLetter);
+
+     return linesBySection;
+ }
+
+ std::tuple<TString, TString> ParseLexerRule(TString&& line) {
+     // The name ends at the first ':' and the body ends at the last ';'.
+     const size_t colonPos = line.find(':');
+     const size_t semiPos = line.rfind(';');
+
+     Y_ENSURE(
+         colonPos != TString::npos &&
+         semiPos != TString::npos &&
+         colonPos < semiPos);
+
+     // The body starts two characters after the colon (the colon and the character following it are skipped).
+     const size_t blockBegin = colonPos + 2;
+     TString block = line.substr(blockBegin, semiPos - blockBegin);
+     SubstGlobal(block, "\\\\", "\\");
+
+     // Reuse the storage of the line for the name by cutting it at the colon.
+     TString name = std::move(line);
+     name.resize(colonPos);
+
+     return std::tuple<TString, TString>(std::move(name), std::move(block));
+ }
+
+ // Registers one rule of the punctuation section in `grammar`.
+ // The rule body loses its first and last character (the quotes of the literal)
+ // and escaped single quotes are unescaped. Non-fragment rules are added to
+ // PunctuationNames; every rule, fragment or not, is stored in BlockByName
+ // under its name without the fragment prefix.
+ // Fails via Y_ENSURE when the body is too short to carry both quotes.
+ void ParsePunctuationLine(TString&& line, TLexerGrammar& grammar) {
+     auto [name, block] = ParseLexerRule(std::move(line));
+
+     // Removing the surrounding characters from a body shorter than two
+     // characters would touch an empty string, so reject such a rule explicitly.
+     Y_ENSURE(block.size() >= 2, "Malformed punctuation rule '" << name << "'");
+     block = block.substr(1, block.size() - 2);
+
+     SubstGlobal(block, "\\\'", "\'");
+
+     if (!name.StartsWith(FragmentPrefix)) {
+         grammar.PunctuationNames.emplace(name);
+     }
+
+     SubstGlobal(name, FragmentPrefix, "");
+     grammar.BlockByName.emplace(std::move(name), std::move(block));
+ }
+
+ void ParseKeywordLine(TString&& line, TLexerGrammar& grammar) {
+     auto [name, block] = ParseLexerRule(std::move(line));
+
+     // With quotes and spaces removed, the body has to spell the rule name
+     // itself; TSKIP spelling SKIP is the only accepted exception.
+     for (const TStringBuf noise : {"'", " "}) {
+         SubstGlobal(block, noise, "");
+     }
+
+     Y_ENSURE(name == block || (name == "TSKIP" && block == "SKIP"));
+     grammar.KeywordNames.emplace(std::move(name));
+ }
+
+ void ParseOtherLine(TString&& line, TLexerGrammar& grammar) {
+     auto [name, block] = ParseLexerRule(std::move(line));
+
+     // The channel directive is not part of the rule body that gets stored.
+     SubstGlobal(block, " -> channel(HIDDEN)", "");
+
+     // Only rules without the fragment prefix are registered as token names,
+     // while the body of every rule is stored under its unprefixed name.
+     const bool isFragment = name.StartsWith(FragmentPrefix);
+     if (!isFragment) {
+         grammar.OtherNames.emplace(name);
+     }
+     SubstGlobal(name, FragmentPrefix, "");
+
+     grammar.BlockByName.emplace(std::move(name), std::move(block));
+ }
+
+ // Builds a TLexerGrammar from the embedded "SQLv1Antlr4.g.in" resource.
+ // The resource lines are cleaned (Purify), merged and normalized (Format),
+ // cleaned again, grouped by section and finally every line is handed to the
+ // parser of its section. Aborts on a section that has no parser.
+ TLexerGrammar LoadLexerGrammar() {
+     TVector<TString> lines = GetResourceLines("SQLv1Antlr4.g.in");
+     Purify(lines);
+     Format(lines);
+     Purify(lines); // Format strips lines, which may turn more of them into removable ones
+
+     THashMap<TStringBuf, TVector<TString>> sections = GroupBySection(std::move(lines));
+
+     TLexerGrammar grammar;
+
+     // `sectionLines` is named differently from the moved-from `lines` above on purpose.
+     for (auto& [section, sectionLines] : sections) {
+         for (auto& line : sectionLines) {
+             if (section == SectionPunctuation) {
+                 ParsePunctuationLine(std::move(line), grammar);
+             } else if (section == SectionKeyword) {
+                 ParseKeywordLine(std::move(line), grammar);
+             } else if (section == SectionOther) {
+                 ParseOtherLine(std::move(line), grammar);
+             } else {
+                 // The section is a string view: it is neither a C string nor guaranteed to be
+                 // null-terminated, so pass pointer and length explicitly to the printf-style format.
+                 Y_ABORT("Unexpected section %.*s", static_cast<int>(section.size()), section.data());
+             }
+         }
+     }
+
+     return grammar;
+ }
+
+} // namespace NSQLReflect
diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.h b/yql/essentials/sql/v1/reflect/sql_reflect.h
new file mode 100644
index 00000000000..5225a3c996b
--- /dev/null
+++ b/yql/essentials/sql/v1/reflect/sql_reflect.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <util/generic/string.h>
+#include <util/generic/hash_set.h>
+#include <util/generic/hash.h>
+
+namespace NSQLReflect {
+
+ struct TLexerGrammar { // Lexer part of the SQL grammar as produced by LoadLexerGrammar.
+ THashSet<TString> KeywordNames; // names of the rules from the keyword section
+ THashSet<TString> PunctuationNames; // names of the non-fragment rules from the punctuation section
+ THashSet<TString> OtherNames; // names of the non-fragment rules from the "other" section
+ THashMap<TString, TString> BlockByName; // rule body by rule name for punctuation and "other" rules, fragments included; keyword rules are not stored here
+ };
+
+ TLexerGrammar LoadLexerGrammar(); // Parses the embedded SQLv1Antlr4.g.in resource into a TLexerGrammar.
+
+} // namespace NSQLReflect
diff --git a/yql/essentials/sql/v1/reflect/sql_reflect_ut.cpp b/yql/essentials/sql/v1/reflect/sql_reflect_ut.cpp
new file mode 100644
index 00000000000..7bef2879e55
--- /dev/null
+++ b/yql/essentials/sql/v1/reflect/sql_reflect_ut.cpp
@@ -0,0 +1,46 @@
+#include "sql_reflect.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+using namespace NSQLReflect;
+
+namespace {
+ TLexerGrammar grammar = LoadLexerGrammar(); // shared by all tests; NOTE(review): evaluated during static initialization, assumes the grammar resource is already available then - confirm
+} // namespace
+
+Y_UNIT_TEST_SUITE(SqlReflectTests) { // Spot checks of the grammar loaded from the embedded resource.
+ Y_UNIT_TEST(Keywords) { // A few well-known keywords must be registered.
+ UNIT_ASSERT_VALUES_EQUAL(grammar.KeywordNames.contains("SELECT"), true);
+ UNIT_ASSERT_VALUES_EQUAL(grammar.KeywordNames.contains("INSERT"), true);
+ UNIT_ASSERT_VALUES_EQUAL(grammar.KeywordNames.contains("WHERE"), true);
+ UNIT_ASSERT_VALUES_EQUAL(grammar.KeywordNames.contains("COMMIT"), true);
+ }
+
+ Y_UNIT_TEST(Punctuation) { // Punctuation bodies are stored as the bare literal, without quotes.
+ UNIT_ASSERT_VALUES_EQUAL(grammar.PunctuationNames.contains("LPAREN"), true);
+ UNIT_ASSERT_VALUES_EQUAL(grammar.BlockByName.at("LPAREN"), "(");
+
+ UNIT_ASSERT_VALUES_EQUAL(grammar.PunctuationNames.contains("MINUS"), true);
+ UNIT_ASSERT_VALUES_EQUAL(grammar.BlockByName.at("MINUS"), "-");
+
+ UNIT_ASSERT_VALUES_EQUAL(grammar.PunctuationNames.contains("NAMESPACE"), true);
+ UNIT_ASSERT_VALUES_EQUAL(grammar.BlockByName.at("NAMESPACE"), "::");
+ }
+
+ Y_UNIT_TEST(Other) { // STRING_MULTILINE is not a token name, yet its body is still available by name.
+ UNIT_ASSERT_VALUES_EQUAL(grammar.OtherNames.contains("REAL"), true);
+ UNIT_ASSERT_VALUES_EQUAL(grammar.OtherNames.contains("STRING_VALUE"), true);
+ UNIT_ASSERT_VALUES_EQUAL(grammar.OtherNames.contains("STRING_MULTILINE"), false);
+
+ UNIT_ASSERT_VALUES_EQUAL(
+ grammar.BlockByName.at("FLOAT_EXP"),
+ "E (PLUS | MINUS)? DECDIGITS");
+ UNIT_ASSERT_VALUES_EQUAL(
+ grammar.BlockByName.at("STRING_MULTILINE"),
+ "(DOUBLE_COMMAT .*? DOUBLE_COMMAT)+ COMMAT?");
+ UNIT_ASSERT_VALUES_EQUAL(
+ grammar.BlockByName.at("REAL"),
+ "(DECDIGITS DOT DIGIT* FLOAT_EXP? | DECDIGITS FLOAT_EXP) (F | P (F ('4' | '8') | N)?)?");
+ }
+
+} // Y_UNIT_TEST_SUITE(SqlReflectTests)
diff --git a/yql/essentials/sql/v1/reflect/ut/ya.make b/yql/essentials/sql/v1/reflect/ut/ya.make
new file mode 100644
index 00000000000..ee52ff0837a
--- /dev/null
+++ b/yql/essentials/sql/v1/reflect/ut/ya.make
@@ -0,0 +1,7 @@
+UNITTEST_FOR(yql/essentials/sql/v1/reflect)
+
+SRCS(
+ sql_reflect_ut.cpp
+)
+
+END()
diff --git a/yql/essentials/sql/v1/reflect/ya.make b/yql/essentials/sql/v1/reflect/ya.make
new file mode 100644
index 00000000000..5865654c86e
--- /dev/null
+++ b/yql/essentials/sql/v1/reflect/ya.make
@@ -0,0 +1,13 @@
+LIBRARY()
+
+SRCS(
+ sql_reflect.cpp
+)
+
+RESOURCE(DONT_PARSE yql/essentials/sql/v1/SQLv1Antlr4.g.in SQLv1Antlr4.g.in)
+
+END()
+
+RECURSE_FOR_TESTS(
+ ut
+)