YQL-19616 Convert YQL lexer grammar to regexes

- [x] Parse YQL grammar to extract lexer grammar into `TLexerGrammar`.
- [x] Translate `TLexerGrammar` into regexes.
- [x] Implement a lexer via regexes `TRegexLexer` to test generated regexes validity.
- [x] Test on `Default` syntax mode.
- [x] Test on `ANSI` syntax mode.

---

- Related to https://github.com/ydb-platform/ydb/issues/15129
- Requirement for https://github.com/ytsaurus/ytsaurus/pull/1112

---

Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1127
commit_hash:03ffffe81cdafe7f93a4d3fd9a3212fe67f1c72d
author: vityaman <[email protected]> 2025-03-28 18:29:24 +0300
committer: robot-piglet <[email protected]> 2025-03-28 18:50:04 +0300
commit: 60b99f11bcb2386c2a1c36ffd2e96e69d0105dac (patch)
tree: 08c15d732484c6accf16658b09ed8f07286a9338 /yql/essentials/sql
parent: 1e214be59cbf130bee433c422b42f16148e5acff (diff)
20 files changed, 1263 insertions, 64 deletions
diff --git a/yql/essentials/sql/v1/SQLv1Antlr4.g.in b/yql/essentials/sql/v1/SQLv1Antlr4.g.in
index fb92a68f9ab..5c59ab61ea4 100644
--- a/yql/essentials/sql/v1/SQLv1Antlr4.g.in
+++ b/yql/essentials/sql/v1/SQLv1Antlr4.g.in
@@ -1775,9 +1775,7 @@ bool_value: (TRUE | FALSE);
 real: REAL;
 integer: DIGITS | INTEGER_VALUE;
 
-//
-// Lexer
-//
+//! section:punctuation
 
 EQUALS:        '=';
 EQUALS2:       '==';
@@ -1823,6 +1821,8 @@ fragment QUOTE_SINGLE:  '\'';
 fragment BACKTICK:      '`';
 fragment DOUBLE_COMMAT: '@@';
 
+//! section:letter
+
 // http://www.antlr.org/wiki/pages/viewpage.action?pageId=1782
 fragment A:('a'|'A');
 fragment B:('b'|'B');
@@ -1851,6 +1851,8 @@ fragment X:('x'|'X');
 fragment Y:('y'|'Y');
 fragment Z:('z'|'Z');
 
+//! section:keyword
+
 ABORT: A B O R T;
 ACTION: A C T I O N;
 ADD: A D D;
@@ -2144,13 +2146,7 @@ WRAPPER: W R A P P E R;
 //WRITE: W R I T E;
 XOR: X O R;
 
-// YQL Default Lexer:
-// GRAMMAR_STRING_CORE_SINGLE = ~(QUOTE_SINGLE | BACKSLASH) | (BACKSLASH .)
-// GRAMMAR_STRING_CORE_DOUBLE = ~(QUOTE_DOUBLE | BACKSLASH) | (BACKSLASH .)
-
-// ANSI Lexer:
-// GRAMMAR_STRING_CORE_SINGLE = ~QUOTE_SINGLE | (QUOTE_SINGLE QUOTE_SINGLE)
-// GRAMMAR_STRING_CORE_DOUBLE = ~QUOTE_DOUBLE | (QUOTE_DOUBLE QUOTE_DOUBLE)
+//! section:other
 
 fragment STRING_CORE_SINGLE: @GRAMMAR_STRING_CORE_SINGLE@;
 fragment STRING_CORE_DOUBLE: @GRAMMAR_STRING_CORE_DOUBLE@;
@@ -2163,7 +2159,7 @@ STRING_VALUE: ((STRING_SINGLE | STRING_DOUBLE | STRING_MULTILINE) (S | U | Y | J
 
 ID_PLAIN: ('a'..'z' | 'A'..'Z' | '_') ('a'..'z' | 'A'..'Z' | '_' | DIGIT)*;
 
-fragment ID_QUOTED_CORE: '\\'. | '``' | ~('`' | '\\');
+fragment ID_QUOTED_CORE: '\\' . | '``' | ~('`' | '\\');
 ID_QUOTED: BACKTICK ID_QUOTED_CORE* BACKTICK;
 
 fragment DIGIT: '0'..'9';
@@ -2177,23 +2173,18 @@ DIGITS: DECDIGITS | HEXDIGITS | OCTDIGITS | BINDIGITS;
 // not all combinations of P/U with L/S/T/I/B/N are actually valid - this is resolved in sql.cpp
 INTEGER_VALUE: DIGITS ((P | U)? (L | S | T | I | B | N)?);
 
-fragment FLOAT_EXP : E (PLUS | MINUS)? DECDIGITS ;
+fragment FLOAT_EXP: E (PLUS | MINUS)? DECDIGITS;
 REAL:
     (
         DECDIGITS DOT DIGIT* FLOAT_EXP?
     |   DECDIGITS FLOAT_EXP
 //  |   DOT DECDIGITS FLOAT_EXP?    // Conflicts with tuple element access through DOT
-    ) (F | P (F ('4'|'8') | N)?)?
+    ) (F | P (F ('4' | '8') | N)?)?
     ;
 
 BLOB: X QUOTE_SINGLE HEXDIGIT+ QUOTE_SINGLE;
 
-// YQL Default Lexer:
-// GRAMMAR_MULTILINE_COMMENT_CORE = .
-// ANSI Lexer:
-// GRAMMAR_MULTILINE_COMMENT_CORE = MULTILINE_COMMENT | .
-
 fragment MULTILINE_COMMENT: '/*' ( @GRAMMAR_MULTILINE_COMMENT_CORE@ )*? '*/';
-fragment LINE_COMMENT: '--' ~('\n'|'\r')* ('\r' '\n'? | '\n' | EOF);
-WS: (' '|'\r'|'\t'|'\u000C'|'\n')->channel(HIDDEN);
-COMMENT: (MULTILINE_COMMENT|LINE_COMMENT)->channel(HIDDEN);
+fragment LINE_COMMENT: '--' ~('\n' | '\r')* ('\r' '\n'? | '\n' | EOF);
+WS: (' ' | '\r' | '\t' | '\u000C' | '\n') -> channel(HIDDEN);
+COMMENT: (MULTILINE_COMMENT | LINE_COMMENT) -> channel(HIDDEN);
diff --git a/yql/essentials/sql/v1/complete/sql_complete.cpp b/yql/essentials/sql/v1/complete/sql_complete.cpp
index 53cb4ada420..753d0a2835c 100644
--- a/yql/essentials/sql/v1/complete/sql_complete.cpp
+++ b/yql/essentials/sql/v1/complete/sql_complete.cpp
@@ -125,7 +125,9 @@ namespace NSQLComplete {
         INameService::TPtr names = MakeStaticNameService(MakeDefaultNameSet());
 
         return MakeSqlCompletionEngine([lexers = std::move(lexers)](bool ansi) {
-            return NSQLTranslationV1::MakeLexer(lexers, ansi, /* antlr4 = */ true, /* pure = */ true);
+            return NSQLTranslationV1::MakeLexer(
+                lexers, ansi, /* antlr4 = */ true, 
+                NSQLTranslationV1::ELexerFlavor::Pure);
         }, std::move(names));
     }
 
diff --git a/yql/essentials/sql/v1/complete/sql_complete_ut.cpp b/yql/essentials/sql/v1/complete/sql_complete_ut.cpp
index 4fb6dfea587..aa242d313cb 100644
--- a/yql/essentials/sql/v1/complete/sql_complete_ut.cpp
+++ b/yql/essentials/sql/v1/complete/sql_complete_ut.cpp
@@ -43,7 +43,9 @@ Y_UNIT_TEST_SUITE(SqlCompleteTests) {
         lexers.Antlr4Pure = NSQLTranslationV1::MakeAntlr4PureLexerFactory();
         lexers.Antlr4PureAnsi = NSQLTranslationV1::MakeAntlr4PureAnsiLexerFactory();
         return [lexers = std::move(lexers)](bool ansi) {
-            return NSQLTranslationV1::MakeLexer(lexers, ansi, /* antlr4 = */ true, /* pure = */ true);
+            return NSQLTranslationV1::MakeLexer(
+                lexers, ansi, /* antlr4 = */ true, 
+                NSQLTranslationV1::ELexerFlavor::Pure);
         };
     }
 
diff --git a/yql/essentials/sql/v1/lexer/lexer.cpp b/yql/essentials/sql/v1/lexer/lexer.cpp
index 5621cc65d7b..88ced55ccf4 100644
--- a/yql/essentials/sql/v1/lexer/lexer.cpp
+++ b/yql/essentials/sql/v1/lexer/lexer.cpp
@@ -11,6 +11,7 @@
 #include <util/string/ascii.h>
 #include <util/string/builder.h>
 #include <util/string/strip.h>
+#include <util/string/join.h>
 
 #if defined(_tsan_enabled_)
 #include <util/system/mutex.h>
@@ -29,8 +30,8 @@ using NSQLTranslation::MakeDummyLexerFactory;
 
 class TV1Lexer : public ILexer {
 public:
-    explicit TV1Lexer(const TLexers& lexers, bool ansi, bool antlr4, bool pure)
-        : Factory(GetFactory(lexers, ansi, antlr4, pure))
+    explicit TV1Lexer(const TLexers& lexers, bool ansi, bool antlr4, ELexerFlavor flavor)
+        : Factory(GetFactory(lexers, ansi, antlr4, flavor))
     {
     }
 
@@ -42,52 +43,70 @@ public:
     }
 
 private:
-    static NSQLTranslation::TLexerFactoryPtr GetFactory(const TLexers& lexers, bool ansi, bool antlr4, bool pure = false) {
-        if (!ansi && !antlr4 && !pure) {
-            if (lexers.Antlr3) {
-                return lexers.Antlr3;
-            }
-            return MakeDummyLexerFactory("antlr3");
-        } else if (ansi && !antlr4 && !pure) {
-            if (lexers.Antlr3Ansi) {
-                return lexers.Antlr3Ansi;
-            }
-            return MakeDummyLexerFactory("antlr3_ansi");
-        } else if (!ansi && antlr4 && !pure) {
-            if (lexers.Antlr4) {
-                return lexers.Antlr4;
-            }
-            return MakeDummyLexerFactory("antlr4");
-        } else if (ansi && antlr4 && !pure) {
-            if (lexers.Antlr4Ansi) {
-                return lexers.Antlr4Ansi;
-            }
-            return MakeDummyLexerFactory("antlr4_ansi");
-        } else if (!ansi && antlr4 && pure) {
-            if (lexers.Antlr4Pure) {
-                return lexers.Antlr4Pure;
-            }
-            return MakeDummyLexerFactory("antlr4_pure");
-        } else if (ansi && antlr4 && pure) {
-            if (lexers.Antlr4PureAnsi) {
-                return lexers.Antlr4PureAnsi;
-            }
-            return MakeDummyLexerFactory("antlr4_pure_ansi");
-        } else if (!ansi && !antlr4 && pure) {
-            return MakeDummyLexerFactory("antlr3_pure");
+    static NSQLTranslation::TLexerFactoryPtr GetFactory(const TLexers& lexers, bool ansi, bool antlr4, ELexerFlavor flavor) {
+        if (auto ptr = GetMaybeFactory(lexers, ansi, antlr4, flavor)) {
+            return ptr;
+        }
+        return MakeDummyLexerFactory(GetLexerName(ansi, antlr4, flavor));
+    }
+
+    static NSQLTranslation::TLexerFactoryPtr GetMaybeFactory(const TLexers& lexers, bool ansi, bool antlr4, ELexerFlavor flavor) {
+        if (!ansi && !antlr4 && flavor == ELexerFlavor::Default) {
+            return lexers.Antlr3;
+        } else if (ansi && !antlr4 && flavor == ELexerFlavor::Default) {
+            return lexers.Antlr3Ansi;
+        } else if (!ansi && antlr4 && flavor == ELexerFlavor::Default) {
+            return lexers.Antlr4;
+        } else if (ansi && antlr4 && flavor == ELexerFlavor::Default) {
+            return lexers.Antlr4Ansi;
+        } else if (!ansi && antlr4 && flavor == ELexerFlavor::Pure) {
+            return lexers.Antlr4Pure;
+        } else if (ansi && antlr4 && flavor == ELexerFlavor::Pure) {
+            return lexers.Antlr4PureAnsi;
+        } else if (!ansi && !antlr4 && flavor == ELexerFlavor::Regex) {
+            return lexers.Regex;
+        } else if (ansi && !antlr4 && flavor == ELexerFlavor::Regex) {
+            return lexers.RegexAnsi;
         } else {
-            return MakeDummyLexerFactory("antlr3_pure_ansi");
+            return nullptr;
         }
     }
 
+    // Composes the lexer name handed to the dummy factory, e.g. "antlr4_pure_ansi" or "regex".
+    static TString GetLexerName(bool ansi, bool antlr4, ELexerFlavor flavor) {
+        // Non-const element type: standard allocators do not support const-qualified values.
+        TVector<TStringBuf> parts;
+
+        // Without antlr4, the regex flavor is named without an "antlr3" prefix.
+        if (antlr4) {
+            parts.emplace_back("antlr4");
+        } else if (flavor != ELexerFlavor::Regex) {
+            parts.emplace_back("antlr3");
+        }
+
+        switch (flavor) {
+        case ELexerFlavor::Default:
+            break;
+        case ELexerFlavor::Pure:
+            parts.emplace_back("pure");
+            break;
+        case ELexerFlavor::Regex:
+            parts.emplace_back("regex");
+            break;
+        }
+        if (ansi) {
+            parts.emplace_back("ansi");
+        }
+        return JoinSeq("_", parts);
+    }
+
 private:
     NSQLTranslation::TLexerFactoryPtr Factory;
 };
 
 } // namespace
 
-NSQLTranslation::ILexer::TPtr MakeLexer(const TLexers& lexers, bool ansi, bool antlr4, bool pure) {
-    return NSQLTranslation::ILexer::TPtr(new TV1Lexer(lexers, ansi, antlr4, pure));
+NSQLTranslation::ILexer::TPtr MakeLexer(const TLexers& lexers, bool ansi, bool antlr4, ELexerFlavor flavor) {
+    return NSQLTranslation::ILexer::TPtr(new TV1Lexer(lexers, ansi, antlr4, flavor));
 }
 
 bool IsProbablyKeyword(const NSQLTranslation::TParsedToken& token) {
diff --git a/yql/essentials/sql/v1/lexer/lexer.h b/yql/essentials/sql/v1/lexer/lexer.h
index 1cc8566fcf6..226e8b6ed25 100644
--- a/yql/essentials/sql/v1/lexer/lexer.h
+++ b/yql/essentials/sql/v1/lexer/lexer.h
@@ -11,9 +11,18 @@ struct TLexers {
     NSQLTranslation::TLexerFactoryPtr Antlr4Ansi;
     NSQLTranslation::TLexerFactoryPtr Antlr4Pure;
     NSQLTranslation::TLexerFactoryPtr Antlr4PureAnsi;
+    NSQLTranslation::TLexerFactoryPtr Regex;
+    NSQLTranslation::TLexerFactoryPtr RegexAnsi;
 };
 
-NSQLTranslation::ILexer::TPtr MakeLexer(const TLexers& lexers, bool ansi, bool antlr4, bool pure = false);
+enum class ELexerFlavor {
+    Default,
+    Pure,
+    Regex,
+};
+
+NSQLTranslation::ILexer::TPtr MakeLexer(
+    const TLexers& lexers, bool ansi, bool antlr4, ELexerFlavor flavor = ELexerFlavor::Default);
 
 // "Probably" because YQL keyword can be an identifier
 // depending on a query context. For example
diff --git a/yql/essentials/sql/v1/lexer/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/lexer_ut.cpp
index 3ad01f631b6..53cff6ffdc7 100644
--- a/yql/essentials/sql/v1/lexer/lexer_ut.cpp
+++ b/yql/essentials/sql/v1/lexer/lexer_ut.cpp
@@ -6,6 +6,7 @@
 #include <yql/essentials/sql/v1/lexer/antlr3/lexer.h>
 #include <yql/essentials/sql/v1/lexer/antlr4/lexer.h>
 #include <yql/essentials/sql/v1/lexer/antlr4_pure/lexer.h>
+#include <yql/essentials/sql/v1/lexer/regex/lexer.h>
 
 #include <library/cpp/testing/unittest/registar.h>
 
@@ -59,6 +60,42 @@ void AssertEquivialent(const TParsedTokenList& lhs, const TParsedTokenList& rhs)
 }
 
 Y_UNIT_TEST_SUITE(SQLv1Lexer) {
+    Y_UNIT_TEST(UnsupportedIssues) {
+        NSQLTranslationV1::TLexers factories;
+
+        TVector<ILexer::TPtr> lexers;    
+        for (auto ansi : {false, true}) {
+            for (auto antlr4 : {false, true}) {
+                for (auto flavor : {ELexerFlavor::Default, ELexerFlavor::Pure, ELexerFlavor::Regex}) {
+                    lexers.emplace_back(MakeLexer(factories, ansi, antlr4, flavor));
+                }
+            }
+        }
+
+        TVector<TString> actual;
+        for (auto& lexer : lexers) {
+            auto issues = GetIssueMessages(lexer, "");
+            actual.emplace_back(std::move(issues.at(0)));
+        }
+
+        TVector<TString> expected = {
+            "<main>: Error: Lexer antlr3 is not supported",
+            "<main>: Error: Lexer antlr3_pure is not supported",
+            "<main>: Error: Lexer regex is not supported",
+            "<main>: Error: Lexer antlr4 is not supported",
+            "<main>: Error: Lexer antlr4_pure is not supported",
+            "<main>: Error: Lexer antlr4_regex is not supported",
+            "<main>: Error: Lexer antlr3_ansi is not supported",
+            "<main>: Error: Lexer antlr3_pure_ansi is not supported",
+            "<main>: Error: Lexer regex_ansi is not supported",
+            "<main>: Error: Lexer antlr4_ansi is not supported",
+            "<main>: Error: Lexer antlr4_pure_ansi is not supported",
+            "<main>: Error: Lexer antlr4_regex_ansi is not supported",
+        };
+
+        UNIT_ASSERT_VALUES_EQUAL(actual, expected);
+    }
+
     Y_UNIT_TEST(AntlrVersionIndependent) {
         const TVector<TString> queriesUtf8 = {
             "",
@@ -85,7 +122,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
 
         auto lexer3 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false);
         auto lexer4 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true);
-        auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, /* pure = */ true);
+        auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, ELexerFlavor::Pure);
 
         for (const auto& query : queriesUtf8) {
             auto [tokens3, issues3] = Tokenize(lexer3, query);
@@ -164,19 +201,24 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
         NSQLTranslationV1::TLexers lexers;
         lexers.Antlr3 = NSQLTranslationV1::MakeAntlr3LexerFactory();
         lexers.Antlr4 = NSQLTranslationV1::MakeAntlr4LexerFactory();
+        lexers.Antlr4Pure = NSQLTranslationV1::MakeAntlr4PureLexerFactory();
+        lexers.Regex = NSQLTranslationV1::MakeRegexLexerFactory(/* ansi = */ false);
 
         auto lexer3 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false);
         auto lexer4 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true);
-        auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, /* pure = */ true);
+        auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, ELexerFlavor::Pure);
+        auto lexerR = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false, ELexerFlavor::Regex);
 
         for (const auto& query : InvalidQueries()) {
             auto issues3 = GetIssueMessages(lexer3, query);
             auto issues4 = GetIssueMessages(lexer4, query);
             auto issues4p = GetIssueMessages(lexer4p, query);
+            auto issuesR = GetIssueMessages(lexerR, query);
 
             UNIT_ASSERT(!issues3.empty());
             UNIT_ASSERT(!issues4.empty());
             UNIT_ASSERT(!issues4p.empty());
+            UNIT_ASSERT(!issuesR.empty());
         }
     }
 
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
new file mode 100644
index 00000000000..1c8f2104a48
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
@@ -0,0 +1,252 @@
+#include "lexer.h"
+
+#include "regex.h"
+
+#include <contrib/libs/re2/re2/re2.h>
+
+#include <yql/essentials/core/issue/yql_issue.h>
+#include <yql/essentials/sql/v1/reflect/sql_reflect.h>
+
+#include <util/generic/algorithm.h>
+#include <util/generic/string.h>
+#include <util/string/subst.h>
+
+namespace NSQLTranslationV1 {
+
+    using NSQLTranslation::TParsedToken;
+    using NSQLTranslation::TParsedTokenList;
+
+    class TRegexLexer: public NSQLTranslation::ILexer {
+        static constexpr const char* CommentTokenName = "COMMENT";
+
+    public:
+        // Takes its own copy of the grammar and compiles each token regex; COMMENT is stored separately.
+        TRegexLexer(bool ansi, NSQLReflect::TLexerGrammar grammar,
+                    const THashMap<TString, TString>& RegexByOtherNameMap)
+            : Grammar_(std::move(grammar))
+            , Ansi_(ansi)
+        {
+            // The map is const, so `std::move` on its entries silently copied; copy explicitly instead.
+            for (const auto& [token, regex] : RegexByOtherNameMap) {
+                if (token == CommentTokenName) {
+                    CommentRegex_.Reset(new RE2(regex));
+                } else {
+                    OtherRegexes_.emplace(token, regex);
+                }
+            }
+        }
+
+        bool Tokenize(
+            const TString& query,
+            const TString& queryName,
+            const TTokenCallback& onNextToken,
+            NYql::TIssues& issues,
+            size_t maxErrors) override {
+            size_t errors = 0;
+            for (size_t pos = 0; pos < query.size();) {
+                TParsedToken matched = Match(TStringBuf(query, pos));
+
+                if (matched.Name.empty() && maxErrors == errors) {
+                    break;
+                }
+
+                if (matched.Name.empty()) {
+                    pos += 1;
+                    errors += 1;
+                    issues.AddIssue(NYql::TPosition(pos, 0, queryName), "no candidates");
+                    continue;
+                }
+
+                pos += matched.Content.length();
+                onNextToken(std::move(matched));
+            }
+
+            onNextToken(TParsedToken{.Name = "EOF"});
+            return errors == 0;
+        }
+
+    private:
+        TParsedToken Match(const TStringBuf prefix) {
+            TParsedTokenList matches;
+
+            size_t keywordCount = MatchKeyword(prefix, matches);
+            MatchPunctuation(prefix, matches);
+            size_t otherCount = MatchRegex(prefix, matches);
+            MatchComment(prefix, matches);
+
+            auto max = MaxElementBy(matches, [](const TParsedToken& m) {
+                return m.Content.length();
+            });
+
+            if (max == std::end(matches)) {
+                return {};
+            }
+
+            auto isMatched = [&](const TStringBuf name) {
+                return std::end(matches) != FindIf(matches, [&](const auto& m) {
+                           return m.Name == name;
+                       });
+            };
+
+            Y_ENSURE(
+                otherCount <= 1 ||
+                (otherCount == 2 && isMatched("DIGITS") && isMatched("INTEGER_VALUE")));
+
+            size_t conflicts = CountIf(matches, [&](const TParsedToken& m) {
+                return m.Content.length() == max->Content.length();
+            });
+            conflicts -= 1;
+            Y_ENSURE(
+                conflicts == 0 ||
+                (conflicts == 1 && keywordCount != 0 && isMatched("ID_PLAIN")) ||
+                (conflicts == 1 && isMatched("DIGITS") && isMatched("INTEGER_VALUE")));
+
+            Y_ENSURE(!max->Content.empty());
+            return *max;
+        }
+
+        // Appends a match for every keyword that is a literal prefix of `prefix`; returns how many were added.
+        size_t MatchKeyword(const TStringBuf prefix, TParsedTokenList& matches) {
+            const size_t sizeBefore = matches.size();
+            for (const auto& keyword : Grammar_.KeywordNames) {
+                if (prefix.StartsWith(keyword)) {
+                    matches.emplace_back(keyword, keyword);
+                }
+            }
+            return matches.size() - sizeBefore;
+        }
+
+        // Appends a match for each punctuation token whose grammar block text starts `prefix`; returns the number appended.
+        size_t MatchPunctuation(const TStringBuf prefix, TParsedTokenList& matches) {
+            const size_t sizeBefore = matches.size();
+            for (const auto& name : Grammar_.PunctuationNames) {
+                const auto& text = Grammar_.BlockByName.at(name);
+                if (prefix.StartsWith(text)) {
+                    matches.emplace_back(name, text);
+                }
+            }
+            return matches.size() - sizeBefore;
+        }
+
+        // Tries every non-comment token regex at the start of `prefix`; returns the number of matches appended.
+        size_t MatchRegex(const TStringBuf prefix, TParsedTokenList& matches) {
+            const size_t sizeBefore = matches.size();
+            for (const auto& [token, regex] : OtherRegexes_) {
+                if (const TStringBuf matched = TryMatchRegex(prefix, regex); !matched.empty()) {
+                    matches.emplace_back(token, TString(matched));
+                }
+            }
+            return matches.size() - sizeBefore;
+        }
+
+        const TStringBuf TryMatchRegex(const TStringBuf prefix, const RE2& regex) {
+            re2::StringPiece input(prefix.data(), prefix.size());
+            if (RE2::Consume(&input, regex)) {
+                return TStringBuf(prefix.data(), input.data());
+            }
+            return "";
+        }
+
+        // Matches a comment at the start of `prefix` and appends it to `matches`.
+        // Returns the number of appended matches (0 or 1).
+        size_t MatchComment(const TStringBuf prefix, TParsedTokenList& matches) {
+            const TStringBuf reContent = TryMatchRegex(prefix, *CommentRegex_);
+            if (reContent.empty()) {
+                return 0;
+            }
+
+            TStringBuf content = reContent;
+            if (Ansi_ && prefix.StartsWith("/*")) {
+                // In ANSI mode a multiline comment may contain nested comments, so the
+                // hand-written recursive matcher wins over the regex whenever it succeeds.
+                const size_t ll1Length = MatchANSIMultilineComment(prefix);
+                // Both candidates are prefixes of `prefix`, so only their lengths need comparing.
+                Y_ENSURE(ll1Length == 0 || reContent.size() <= ll1Length);
+                if (ll1Length != 0) {
+                    content = prefix.SubString(0, ll1Length);
+                }
+            }
+
+            matches.emplace_back(CommentTokenName, TString(content));
+            return 1;
+        }
+
+        size_t MatchANSIMultilineComment(TStringBuf remaining) {
+            if (!remaining.StartsWith("/*")) {
+                return 0;
+            }
+
+            size_t skipped = 0;
+
+            remaining.Skip(2);
+            skipped += 2;
+
+            for (;;) {
+                if (remaining.StartsWith("*/")) {
+                    remaining.Skip(2);
+                    skipped += 2;
+                    return skipped;
+                }
+
+                bool isSkipped = false;
+                if (remaining.StartsWith("/*")) {
+                    size_t limit = remaining.rfind("*/");
+                    if (limit == std::string::npos) {
+                        return 0;
+                    }
+
+                    size_t len = MatchANSIMultilineComment(remaining.Head(limit));
+                    remaining.Skip(len);
+                    skipped += len;
+
+                    isSkipped = len != 0;
+                }
+
+                if (isSkipped) {
+                    continue;
+                }
+
+                if (remaining.size() == 0) {
+                    return 0;
+                }
+
+                remaining.Skip(1);
+                skipped += 1;
+            }
+        }
+
+        NSQLReflect::TLexerGrammar Grammar_;
+        THashMap<TString, RE2> OtherRegexes_;
+        THolder<RE2> CommentRegex_;
+        bool Ansi_;
+    };
+
+    namespace {
+
+        class TFactory final: public NSQLTranslation::ILexerFactory {
+        public:
+            explicit TFactory(bool ansi)
+                : Ansi_(ansi)
+                , Grammar_(NSQLReflect::LoadLexerGrammar())
+                , RegexByOtherNameMap_(MakeRegexByOtherNameMap(Grammar_, Ansi_))
+            {
+            }
+
+            NSQLTranslation::ILexer::TPtr MakeLexer() const override {
+                return NSQLTranslation::ILexer::TPtr(
+                    new TRegexLexer(Ansi_, Grammar_, RegexByOtherNameMap_));
+            }
+
+        private:
+            bool Ansi_;
+            NSQLReflect::TLexerGrammar Grammar_;
+            THashMap<TString, TString> RegexByOtherNameMap_;
+        };
+
+    } // namespace
+
+    NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi) {
+        return NSQLTranslation::TLexerFactoryPtr(new TFactory(ansi));
+    }
+
+} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.h b/yql/essentials/sql/v1/lexer/regex/lexer.h
new file mode 100644
index 00000000000..e9968954e1f
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include <yql/essentials/parser/lexer_common/lexer.h>
+
+namespace NSQLTranslationV1 {
+
+    NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi);
+
+} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp
new file mode 100644
index 00000000000..ae0d018e42d
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp
@@ -0,0 +1,219 @@
+#include "lexer.h"
+
+#include <yql/essentials/public/issue/yql_issue.h>
+#include <yql/essentials/sql/settings/translation_settings.h>
+#include <yql/essentials/sql/v1/lexer/lexer.h>
+#include <yql/essentials/sql/v1/lexer/antlr4_pure_ansi/lexer.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/random/random.h>
+
+using namespace NSQLTranslationV1;
+using NSQLTranslation::SQL_MAX_PARSER_ERRORS;
+using NSQLTranslation::Tokenize;
+using NSQLTranslation::TParsedToken;
+using NSQLTranslation::TParsedTokenList;
+using NYql::TIssues;
+
+TLexers Lexers = {
+    .Antlr4PureAnsi = MakeAntlr4PureAnsiLexerFactory(),
+    .Regex = MakeRegexLexerFactory(/* ansi = */ false),
+    .RegexAnsi = MakeRegexLexerFactory(/* ansi = */ true),
+};
+
+auto PureAnsiLexer = MakeLexer(
+    Lexers, /* ansi = */ true, /* antlr4 = */ true, ELexerFlavor::Pure);
+
+auto DefaultLexer = MakeLexer(
+    Lexers, /* ansi = */ false, /* antlr4 = */ false, ELexerFlavor::Regex);
+
+auto AnsiLexer = MakeLexer(
+    Lexers, /* ansi = */ true, /* antlr4 = */ false, ELexerFlavor::Regex);
+
+TString ToString(TParsedToken token) {
+    TString& string = token.Name;
+    if (token.Name != token.Content && token.Name != "EOF") {
+        string += "(";
+        string += token.Content;
+        string += ")";
+    }
+    return string;
+}
+
+TString Tokenized(NSQLTranslation::ILexer& lexer, const TString& query) {
+    TParsedTokenList tokens;
+    TIssues issues;
+    bool ok = Tokenize(lexer, query, "Test", tokens, issues, SQL_MAX_PARSER_ERRORS);
+
+    TString out;
+    if (!ok) {
+        out = "[INVALID] ";
+    }
+
+    for (auto& token : tokens) {
+        out += ToString(std::move(token));
+        out += " ";
+    }
+    if (!out.empty()) {
+        out.pop_back();
+    }
+    return out;
+}
+
+TString RandomMultilineCommentLikeText(size_t maxSize) {
+    auto size = RandomNumber<size_t>(maxSize);
+    TString comment;
+    for (size_t i = 0; i < size; ++i) {
+        if (auto /* isOpen */ _ = RandomNumber<bool>()) {
+            comment += "/*";
+        } else {
+            comment += "*/";
+        }
+
+        for (int gap = RandomNumber<size_t>(2); gap > 0; --gap) {
+            comment += " ";
+        }
+    }
+    return comment;
+}
+
+void Check(TString input, TString expected, bool ansi) {
+    auto* lexer = DefaultLexer.Get();
+    if (ansi) {
+        lexer = AnsiLexer.Get();
+    }
+    UNIT_ASSERT_VALUES_EQUAL(Tokenized(*lexer, input), expected);
+}
+
+void Check(TString input, TString expected) {
+    Check(input, expected, /* ansi = */ false);
+    Check(input, expected, /* ansi = */ true);
+}
+
+Y_UNIT_TEST_SUITE(RegexLexerTests) {
+    Y_UNIT_TEST(Whitespace) {
+        Check("", "EOF");
+        Check(" ", "WS( ) EOF");
+        Check("  ", "WS( ) WS( ) EOF");
+        Check("\n", "WS(\n) EOF");
+    }
+
+    Y_UNIT_TEST(SinleLineComment) {
+        Check("--yql", "COMMENT(--yql) EOF");
+        Check("--  yql ", "COMMENT(--  yql ) EOF");
+        Check("-- yql\nSELECT", "COMMENT(-- yql\n) SELECT EOF");
+        Check("-- yql --", "COMMENT(-- yql --) EOF");
+    }
+
+    Y_UNIT_TEST(MultiLineComment) {
+        Check("/* yql */", "COMMENT(/* yql */) EOF");
+        Check("/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF");
+        Check("/* yql\n * yql\n */", "COMMENT(/* yql\n * yql\n */) EOF");
+    }
+
+    Y_UNIT_TEST(RecursiveMultiLineCommentDefault) {
+        Check("/* /* yql */", "COMMENT(/* /* yql */) EOF", /* ansi = */ false);
+        Check("/* /* yql */ */", "COMMENT(/* /* yql */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ false);
+    }
+
+    Y_UNIT_TEST(RecursiveMultiLineCommentAnsi) {
+        Check("/* /* yql */", "COMMENT(/* /* yql */) EOF", /* ansi = */ true);
+        Check("/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
+        Check("/* /* /* yql */ */", "COMMENT(/* /* /* yql */ */) EOF", /* ansi = */ true);
+        Check("/* /* yql */ */ */", "COMMENT(/* /* yql */ */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
+        Check("/* /* yql */ */", "COMMENT(/* /* yql */ */) EOF", /* ansi = */ true);
+        Check("/*/*/*/", "COMMENT(/*/*/) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
+        Check("/*/**/*/*/*/", "COMMENT(/*/**/*/) ASTERISK(*) SLASH(/) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
+        Check("/* /* */ a /* /* */", "COMMENT(/* /* */ a /* /* */) EOF", /* ansi = */ true);
+    }
+
+    Y_UNIT_TEST(RecursiveMultiLineCommentAnsiReferenceComparion) {
+        SetRandomSeed(100);
+        for (size_t i = 0; i < 512; ++i) {
+            auto input = RandomMultilineCommentLikeText(/* maxSize = */ 128);
+            TString actual = Tokenized(*AnsiLexer, input);
+            TString expected = Tokenized(*PureAnsiLexer, input);
+            UNIT_ASSERT_VALUES_EQUAL_C(actual, expected, "Input: " << input);
+        }
+    }
+
+    Y_UNIT_TEST(Keyword) {
+        Check("SELECT", "SELECT EOF");
+        Check("INSERT", "INSERT EOF");
+        Check("FROM", "FROM EOF");
+    }
+
+    Y_UNIT_TEST(Punctuation) {
+        Check(
+            "* / + - <|",
+            "ASTERISK(*) WS( ) SLASH(/) WS( ) "
+            "PLUS(+) WS( ) MINUS(-) WS( ) STRUCT_OPEN(<|) EOF");
+        Check("SELECT*FROM", "SELECT ASTERISK(*) FROM EOF");
+    }
+
+    Y_UNIT_TEST(IdPlain) {
+        Check("variable my_table", "ID_PLAIN(variable) WS( ) ID_PLAIN(my_table) EOF");
+    }
+
+    Y_UNIT_TEST(IdQuoted) {
+        Check("``", "ID_QUOTED(``) EOF");
+        Check("` `", "ID_QUOTED(` `) EOF");
+        Check("` `", "ID_QUOTED(` `) EOF");
+        Check("`local/table`", "ID_QUOTED(`local/table`) EOF");
+    }
+
+    Y_UNIT_TEST(SinleLineString) {
+        Check("\"\"", "STRING_VALUE(\"\") EOF");
+        Check("\' \'", "STRING_VALUE(\' \') EOF");
+        Check("\" \"", "STRING_VALUE(\" \") EOF");
+        Check("\"test\"", "STRING_VALUE(\"test\") EOF");
+
+        Check("\"\\\"\"", "STRING_VALUE(\"\\\"\") EOF", /* ansi = */ false);
+        Check("\"\\\"\"", "[INVALID] STRING_VALUE(\"\\\") EOF", /* ansi = */ true);
+
+        Check("\"\"\"\"", "STRING_VALUE(\"\") STRING_VALUE(\"\") EOF", /* ansi = */ false);
+        Check("\"\"\"\"", "STRING_VALUE(\"\"\"\") EOF", /* ansi = */ true);
+    }
+
+    Y_UNIT_TEST(MultiLineString) {
+        Check("@@@@", "STRING_VALUE(@@@@) EOF");
+        Check("@@ @@@", "STRING_VALUE(@@ @@@) EOF");
+        Check("@@test@@", "STRING_VALUE(@@test@@) EOF");
+        Check("@@line1\nline2@@", "STRING_VALUE(@@line1\nline2@@) EOF");
+    }
+
+    Y_UNIT_TEST(Query) {
+        TString query =
+            "SELECT\n"
+            "  123467,\n"
+            "  \"Hello, {name}!\",\n"
+            "  (1 + (5 * 1 / 0)),\n"
+            "  MIN(identifier),\n"
+            "  Bool(field),\n"
+            "  Math::Sin(var)\n"
+            "FROM `local/test/space/table`\n"
+            "JOIN test;";
+
+        TString expected =
+            "SELECT WS(\n) "
+            "WS( ) WS( ) INTEGER_VALUE(123467) COMMA(,) WS(\n) "
+            "WS( ) WS( ) STRING_VALUE(\"Hello, {name}!\") COMMA(,) WS(\n) "
+            "WS( ) WS( ) LPAREN(() INTEGER_VALUE(1) WS( ) PLUS(+) WS( ) LPAREN(() INTEGER_VALUE(5) WS( ) "
+            "ASTERISK(*) WS( ) INTEGER_VALUE(1) WS( ) SLASH(/) WS( ) INTEGER_VALUE(0) RPAREN()) "
+            "RPAREN()) COMMA(,) WS(\n) "
+            "WS( ) WS( ) ID_PLAIN(MIN) LPAREN(() ID_PLAIN(identifier) RPAREN()) COMMA(,) WS(\n) "
+            "WS( ) WS( ) ID_PLAIN(Bool) LPAREN(() ID_PLAIN(field) RPAREN()) COMMA(,) WS(\n) "
+            "WS( ) WS( ) ID_PLAIN(Math) NAMESPACE(::) ID_PLAIN(Sin) LPAREN(() ID_PLAIN(var) RPAREN()) WS(\n) "
+            "FROM WS( ) ID_QUOTED(`local/test/space/table`) WS(\n) "
+            "JOIN WS( ) ID_PLAIN(test) SEMICOLON(;) EOF";
+
+        Check(query, expected);
+    }
+
+    Y_UNIT_TEST(Invalid) {
+        Check("\"", "[INVALID] EOF");
+        Check("\" SELECT", "[INVALID] WS( ) SELECT EOF");
+    }
+
+} // Y_UNIT_TEST_SUITE(RegexLexerTests)
diff --git a/yql/essentials/sql/v1/lexer/regex/regex.cpp b/yql/essentials/sql/v1/lexer/regex/regex.cpp
new file mode 100644
index 00000000000..a8aca8a1318
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/regex.cpp
@@ -0,0 +1,240 @@
+#include "regex.h"
+
+#include <contrib/libs/re2/re2/re2.h>
+
+#include <util/generic/vector.h>
+
+// Expands to a `{"<name>", <name>_<mode>}` initializer pair. The value token
+// `<name>_<mode>` is expected to be a string-literal macro supplied by the
+// build (see the -DGRAMMAR_* CFLAGS in this library's ya.make).
+#define SUBSTITUTION(name, mode) \
+    {#name, name##_##mode}
+
+// Expands to a `{"<mode>", {...}}` initializer holding the three grammar
+// placeholders that differ between lexer modes.
+#define SUBSTITUTIONS(mode)                                         \
+    {                                                               \
+        #mode, {                                                    \
+            SUBSTITUTION(GRAMMAR_STRING_CORE_SINGLE, mode),         \
+                SUBSTITUTION(GRAMMAR_STRING_CORE_DOUBLE, mode),     \
+                SUBSTITUTION(GRAMMAR_MULTILINE_COMMENT_CORE, mode), \
+        }                                                           \
+    }
+
+namespace NSQLTranslationV1 {
+
+    class TLexerGrammarToRegexTranslator {
+    private:
+        // One text rewrite step. `Repr` is a human-readable description of
+        // the rule, `Apply` mutates the given text in place.
+        struct TRewriteRule {
+            TString Repr;
+            std::function<void(TString&)> Apply;
+        };
+
+        using TRewriteRules = TVector<TRewriteRule>;
+
+    public:
+        // Builds all rewrite rule sets up front for the given grammar and
+        // lexer mode ("ANSI" when `ansi` is true, "DEFAULT" otherwise).
+        // Only a pointer to `grammar` is stored, so it must outlive the
+        // translator.
+        explicit TLexerGrammarToRegexTranslator(const NSQLReflect::TLexerGrammar& grammar, bool ansi)
+            : Grammar_(&grammar)
+            , Mode_(ansi ? "ANSI" : "DEFAULT")
+        {
+            // Inlining stage: placeholder substitutions first, then rule names.
+            AddExternalRules(Inliners_);
+            AddFragmentRules(Inliners_);
+
+            // Transformation stage: ANTLR constructs -> regex constructs.
+            AddLetterRules(Transformations_);
+            AddTransformationRules(Transformations_);
+
+            // Finalization stage, applied in this order by Finalize().
+            UnwrapQuotes_ = UnwrapQuotesRule();
+            AddSpaceCollapses(SpaceCollapses_);
+            UnwrapQuotedSpace_ = UnwrapQuotedSpaceRule();
+        }
+
+        // Translates the grammar block registered under `name` into a regex by
+        // running the inline, transform and finalize stages over a copy of it.
+        // Propagates whatever `BlockByName.at` does for an unknown name.
+        TString ToRegex(const TStringBuf name) {
+            TString regex = Grammar_->BlockByName.at(name);
+
+            Inline(regex);
+            Transform(regex);
+            Finalize(regex);
+
+            return regex;
+        }
+
+    private:
+        // Repeatedly applies the inlining rules (placeholders and rule names)
+        // until the text stops changing.
+        void Inline(TString& text) {
+            ApplyEachWhileChanging(text, Inliners_);
+        }
+
+        // Registers rules that replace `@NAME@` placeholders with the grammar
+        // snippets configured for the current mode via the GRAMMAR_* defines.
+        void AddExternalRules(TRewriteRules& rules) {
+            THashMap<TString, THashMap<TString, TString>> byMode = {
+                SUBSTITUTIONS(DEFAULT),
+                SUBSTITUTIONS(ANSI),
+            };
+
+            // The ANSI MULTILINE_COMMENT core is recursive, so the DEFAULT
+            // definition is used for the ANSI mode as well.
+            byMode["ANSI"]["GRAMMAR_MULTILINE_COMMENT_CORE"] =
+                byMode["DEFAULT"]["GRAMMAR_MULTILINE_COMMENT_CORE"];
+
+            const auto& substitutions = byMode.at(Mode_);
+            for (const auto& [placeholder, replacement] : substitutions) {
+                rules.emplace_back(RegexRewriteRule("@" + placeholder + "@", replacement));
+            }
+        }
+
+        // Registers one rule per grammar block that replaces a whole-word
+        // occurrence of the block name with its parenthesized definition.
+        // Punctuation blocks are stored unquoted, so they are wrapped back
+        // into single quotes before being inlined.
+        void AddFragmentRules(TRewriteRules& rules) {
+            const THashSet<TString> quotedFragments = {
+                "BACKSLASH",
+                "QUOTE_DOUBLE",
+                "QUOTE_SINGLE",
+                "BACKTICK",
+                "DOUBLE_COMMAT",
+            };
+
+            for (const auto& [name, definition] : Grammar_->BlockByName) {
+                const bool isLiteral =
+                    Grammar_->PunctuationNames.contains(name) ||
+                    quotedFragments.contains(name);
+
+                TString body = isLiteral ? ("'" + definition + "'") : definition;
+                body = QuoteAntlrRewrite(std::move(body));
+
+                const TString pattern = "(\\b" + name + "\\b)";
+                rules.emplace_back(RegexRewriteRule(pattern, "(" + body + ")"));
+            }
+        }
+
+        // Repeatedly applies the ANTLR-to-regex transformation rules until the
+        // text stops changing.
+        void Transform(TString& text) {
+            ApplyEachWhileChanging(text, Transformations_);
+        }
+
+        // Registers, for every letter A..Z, a rule turning a standalone
+        // upper-case letter (not adjacent to a quote, word character or square
+        // bracket) into the case-insensitive class `[xX]`.
+        void AddLetterRules(TRewriteRules& rules) {
+            for (char current = 'A'; current <= 'Z'; ++current) {
+                const TString upper(char(ToUpper(current)));
+                const TString lower(char(ToLower(current)));
+
+                const TString pattern = "([^'\\w\\[\\]]|^)" + upper + "([^'\\w\\[\\]]|$)";
+                const TString rewrite = "\\1[" + lower + upper + "]\\2";
+
+                rules.emplace_back(RegexRewriteRule(pattern, rewrite));
+            }
+        }
+
+        // Registers the rules that turn ANTLR lexer syntax into regex syntax.
+        // Quoted literals are still wrapped in single quotes at this stage;
+        // the quotes are removed later by Finalize().
+        void AddTransformationRules(TRewriteRules& rules) {
+            // ~('a' | 'b') -> [^'a''b'-like negated class of both literals.
+            rules.emplace_back(RegexRewriteRule(
+                R"(~\('(..?)' \| '(..?)'\))", R"([^\1\2])"));
+
+            // ~('a') -> negated class of a single literal.
+            rules.emplace_back(RegexRewriteRule(
+                R"(~\('(..?)'\))", R"([^\1])"));
+
+            // 'a'..'z' -> ['a'-'z'] (range of two quoted literals).
+            rules.emplace_back(RegexRewriteRule(
+                R"(('..?')\.\.('..?'))", R"([\1-\2])"));
+
+            // Drop parentheses around a single character.
+            rules.emplace_back(RegexRewriteRule(
+                R"(\((.)\))", R"(\1)"));
+
+            // Drop parentheses around a short (1..8 characters) bracket class.
+            rules.emplace_back(RegexRewriteRule(
+                R"(\((\[.{1,8}\])\))", R"(\1)"));
+
+            // Drop parentheses around a single quoted literal.
+            rules.emplace_back(RegexRewriteRule(
+                R"(\(('..?')\))", R"(\1)"));
+
+            // A wildcard `.` preceded by a space becomes `(.|\n)`.
+            rules.emplace_back(RegexRewriteRule(
+                R"( \.)", R"( (.|\\n))"));
+
+            // The ANTLR EOF token becomes the end anchor.
+            rules.emplace_back(RegexRewriteRule(
+                R"(\bEOF\b)", R"($)"));
+
+            // Remove the `'\u000C' |` alternative entirely.
+            rules.emplace_back(RegexRewriteRule(
+                R"('\\u000C' \|)", ""));
+        }
+
+        // Last stage: unwrap quoted literals, remove the spaces that separated
+        // grammar tokens, and finally turn the quoted space `' '` (which the
+        // previous two steps leave untouched) into a plain space.
+        void Finalize(TString& text) {
+            UnwrapQuotes_.Apply(text);
+            ApplyEachWhileChanging(text, SpaceCollapses_);
+            UnwrapQuotedSpace_.Apply(text);
+        }
+
+        // Registers rules removing a space that follows a non-quote character
+        // (or the start of text) and a space that precedes a non-quote
+        // character (or the end of text). Spaces adjacent to a single quote
+        // on that side are kept.
+        void AddSpaceCollapses(TRewriteRules& rules) {
+            rules.emplace_back(RegexRewriteRule(R"(([^']|^) )", R"(\1)"));
+            rules.emplace_back(RegexRewriteRule(R"( ([^']|$))", R"(\1)"));
+        }
+
+        // Applies every rule exactly once, in registration order.
+        void ApplyEachOnce(TString& text, const TRewriteRules& rules) {
+            for (const auto& rule : rules) {
+                rule.Apply(text);
+            }
+        }
+
+        // Applies the whole rule set repeatedly until a pass leaves the text
+        // unchanged. At most `Limit` passes are allowed; if the text is still
+        // changing after that, the rule set is considered non-terminating and
+        // an exception is thrown.
+        void ApplyEachWhileChanging(TString& text, const TRewriteRules& rules) {
+            constexpr size_t Limit = 16;
+
+            TString prev;
+            for (size_t pass = 0; prev != text; ++pass) {
+                // Checked before the pass, so no extra work is done once the
+                // budget is exhausted.
+                Y_ENSURE(
+                    pass < Limit,
+                    "Rewrite rules did not converge in " << Limit << " passes on '" << text << "'");
+                prev = text;
+                ApplyEachOnce(text, rules);
+            }
+        }
+
+        // Builds a rule that globally replaces matches of `regex` with
+        // `rewrite` (RE2 rewrite syntax, `\1` etc. refer to capture groups).
+        // Throws if the regex does not compile or the rewrite string is not
+        // valid for it.
+        TRewriteRule RegexRewriteRule(const TString& regex, TString rewrite) {
+            // Quiet: compilation errors are reported via ok()/error() below.
+            auto re2 = std::make_shared<RE2>(regex, RE2::Quiet);
+            Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'");
+
+            TString error;
+            Y_ENSURE(
+                re2->CheckRewriteString(rewrite, &error),
+                error << " on rewrite '" << rewrite << "'");
+
+            return {
+                .Repr = regex + " -> " + rewrite,
+                // The compiled regex is shared with the closure so the rule
+                // stays usable after this function returns.
+                .Apply = [re2, rewrite = std::move(rewrite)](TString& text) {
+                    RE2::GlobalReplace(&text, *re2, rewrite);
+                },
+            };
+        }
+
+        // Builds the rule that replaces every quoted literal of one or two
+        // non-space characters (`'x'`, `'xy'`) with its regex-escaped content.
+        // The quoted space `' '` is intentionally not matched here.
+        TRewriteRule UnwrapQuotesRule() {
+            const TString regex = R"('([^ ][^ ]?)')";
+            auto re2 = std::make_shared<RE2>(regex, RE2::Quiet);
+            Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'");
+
+            return {
+                .Repr = regex + " -> Quoted(\\1)",
+                .Apply = [re2](TString& text) {
+                    TString content;
+                    // Upper bound on distinct literals, guards against a
+                    // replacement that keeps producing new matches.
+                    std::size_t budget = 256;
+                    while (RE2::PartialMatch(text, *re2, &content) && --budget != 0) {
+                        TString quoted = RE2::QuoteMeta(content);
+                        // QuoteMeta escapes a backslash that starts the literal;
+                        // drop that extra backslash (at most twice) so escapes
+                        // written in the grammar, such as `\n`, stay intact.
+                        for (size_t dropped = 0; dropped < 2 && quoted.StartsWith(R"(\\)"); ++dropped) {
+                            quoted.erase(std::begin(quoted));
+                        }
+                        SubstGlobal(text, "'" + content + "'", quoted);
+                    }
+                    Y_ENSURE(
+                        budget != 0,
+                        "Too many quoted literals to unwrap in '" << text << "'");
+                },
+            };
+        }
+
+        // Builds the rule that turns the quoted space `' '` into a plain space.
+        TRewriteRule UnwrapQuotedSpaceRule() {
+            return RegexRewriteRule(R"(' ')", R"( )");
+        }
+
+        // Prepares a grammar block for use as an RE2 rewrite string: every
+        // backslash is doubled, and a quoted backslash literal (which is
+        // `'\\'` after the first step) is doubled once more to `'\\\\'`.
+        TString QuoteAntlrRewrite(TString rewrite) {
+            SubstGlobal(rewrite, R"(\)", R"(\\)");
+            SubstGlobal(rewrite, R"('\\')", R"('\\\\')");
+            return rewrite;
+        }
+
+        const NSQLReflect::TLexerGrammar* Grammar_;
+        const TStringBuf Mode_;
+
+        TRewriteRules Inliners_;
+
+        TRewriteRules Transformations_;
+
+        TRewriteRule UnwrapQuotes_;
+        TRewriteRules SpaceCollapses_;
+        TRewriteRule UnwrapQuotedSpace_;
+    };
+
+    // Translates every token listed in `grammar.OtherNames` into a regex for
+    // the requested lexer mode and returns them keyed by token name.
+    THashMap<TString, TString> MakeRegexByOtherNameMap(const NSQLReflect::TLexerGrammar& grammar, bool ansi) {
+        TLexerGrammarToRegexTranslator translator(grammar, ansi);
+
+        THashMap<TString, TString> regexByName;
+        for (const TString& name : grammar.OtherNames) {
+            TString regex = translator.ToRegex(name);
+            regexByName.emplace(name, std::move(regex));
+        }
+
+        return regexByName;
+    }
+
+} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/regex.h b/yql/essentials/sql/v1/lexer/regex/regex.h
new file mode 100644
index 00000000000..9e29c3df25b
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/regex.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <yql/essentials/sql/v1/reflect/sql_reflect.h>
+
+#include <util/generic/hash.h>
+
+namespace NSQLTranslationV1 {
+
+    // Makes regexes only for tokens from OtherNames,
+    // as keywords and punctuation are trivially matched.
+    THashMap<TString, TString> MakeRegexByOtherNameMap(
+        const NSQLReflect::TLexerGrammar& grammar, bool ansi);
+
+} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
new file mode 100644
index 00000000000..47a94f53ed0
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
@@ -0,0 +1,90 @@
+#include "regex.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <contrib/libs/re2/re2/re2.h>
+
+using namespace NSQLTranslationV1;
+
+namespace {
+    // Shared fixtures, built once for the whole test binary.
+    // NOTE(review): these run during static initialization and
+    // LoadLexerGrammar() reads an embedded resource - confirm the resource
+    // registry is guaranteed to be populated by then.
+    auto grammar = NSQLReflect::LoadLexerGrammar();
+    auto defaultRegexes = MakeRegexByOtherNameMap(grammar, /* ansi = */ false);
+    auto ansiRegexes = MakeRegexByOtherNameMap(grammar, /* ansi = */ true);
+
+    // Looks up the generated regex for token `name` in the selected mode,
+    // checks that RE2 accepts it and that it equals `expected` verbatim.
+    void CheckRegex(bool ansi, const TStringBuf name, const TStringBuf expected) {
+        const auto& regexes = ansi ? ansiRegexes : defaultRegexes;
+        const TString regex = regexes.at(name);
+
+        const RE2 re2(regex);
+        Y_ENSURE(re2.ok(), re2.error());
+
+        UNIT_ASSERT_VALUES_EQUAL(regex, expected);
+    }
+
+} // namespace
+
+// Golden tests: each case pins the exact regex text generated for one token,
+// so any change in the grammar or in the rewrite rules shows up here.
+Y_UNIT_TEST_SUITE(SqlRegexTests) {
+    // Default mode: backslash escapes inside single/double quoted strings.
+    Y_UNIT_TEST(StringValue) {
+        CheckRegex(
+            /* ansi = */ false,
+            "STRING_VALUE",
+            R"(((((\'([^'\\]|(\\(.|\n)))*\'))|((\"([^"\\]|(\\(.|\n)))*\"))|((\@\@(.|\n)*?\@\@)+\@?))([sS]|[uU]|[yY]|[jJ]|[pP]([tT]|[bB]|[vV])?)?))");
+    }
+
+    // ANSI mode: a quote is escaped by doubling it instead of a backslash.
+    Y_UNIT_TEST(AnsiStringValue) {
+        CheckRegex(
+            /* ansi = */ true,
+            "STRING_VALUE",
+            R"(((((\'([^']|(\'\'))*\'))|((\"([^"]|(\"\"))*\"))|((\@\@(.|\n)*?\@\@)+\@?))([sS]|[uU]|[yY]|[jJ]|[pP]([tT]|[bB]|[vV])?)?))");
+    }
+
+    Y_UNIT_TEST(IdPlain) {
+        CheckRegex(
+            /* ansi = */ false,
+            "ID_PLAIN",
+            R"(([a-z]|[A-Z]|_)([a-z]|[A-Z]|_|[0-9])*)");
+    }
+
+    Y_UNIT_TEST(IdQuoted) {
+        CheckRegex(
+            /* ansi = */ false,
+            "ID_QUOTED",
+            R"(\`(\\(.|\n)|\`\`|[^`\\])*\`)");
+    }
+
+    Y_UNIT_TEST(Digits) {
+        CheckRegex(
+            /* ansi = */ false,
+            "DIGITS",
+            R"(([0-9]+)|(0[xX]([0-9]|[a-f]|[A-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+))");
+    }
+
+    Y_UNIT_TEST(Real) {
+        CheckRegex(
+            /* ansi = */ false,
+            "REAL",
+            R"((([0-9]+)\.[0-9]*([eE](\+|\-)?([0-9]+))?|([0-9]+)([eE](\+|\-)?([0-9]+)))([fF]|[pP]([fF](4|8)|[nN])?)?)");
+    }
+
+    Y_UNIT_TEST(Ws) {
+        CheckRegex(
+            /* ansi = */ false,
+            "WS",
+            R"(( |\r|\t|\n))");
+    }
+
+    Y_UNIT_TEST(Comment) {
+        CheckRegex(
+            /* ansi = */ false,
+            "COMMENT",
+            R"(((\/\*(.|\n)*?\*\/)|(\-\-[^\n\r]*(\r\n?|\n|$))))");
+    }
+
+    // The ANSI comment rule is recursive in the grammar, so the translator
+    // substitutes the default definition; both modes must therefore agree.
+    Y_UNIT_TEST(AnsiCommentSameAsDefault) {
+        // Because of recursive definition
+        UNIT_ASSERT_VALUES_EQUAL(
+            ansiRegexes.at("COMMENT"),
+            defaultRegexes.at("COMMENT"));
+    }
+
+} // Y_UNIT_TEST_SUITE(SqlRegexTests)
diff --git a/yql/essentials/sql/v1/lexer/regex/ut/ya.make b/yql/essentials/sql/v1/lexer/regex/ut/ya.make
new file mode 100644
index 00000000000..09eb74a3f68
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/ut/ya.make
@@ -0,0 +1,13 @@
+UNITTEST_FOR(yql/essentials/sql/v1/lexer/regex)
+
+PEERDIR(
+    yql/essentials/sql/v1/lexer
+    yql/essentials/sql/v1/lexer/antlr4_pure_ansi
+)
+
+SRCS(
+    lexer_ut.cpp
+    regex_ut.cpp
+)
+
+END()
diff --git a/yql/essentials/sql/v1/lexer/regex/ya.make b/yql/essentials/sql/v1/lexer/regex/ya.make
new file mode 100644
index 00000000000..249dfbd11df
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/ya.make
@@ -0,0 +1,39 @@
+LIBRARY()
+
+PEERDIR(
+    contrib/libs/re2
+    yql/essentials/public/issue
+    yql/essentials/parser/lexer_common
+    yql/essentials/sql/settings
+    yql/essentials/sql/v1/reflect
+)
+
+# TODO(vityaman): Extract to a single ya.make for reuse.
+
+SET(GRAMMAR_STRING_CORE_SINGLE_DEFAULT "~(QUOTE_SINGLE | BACKSLASH) | (BACKSLASH .)")
+SET(GRAMMAR_STRING_CORE_DOUBLE_DEFAULT "~(QUOTE_DOUBLE | BACKSLASH) | (BACKSLASH .)")
+SET(GRAMMAR_MULTILINE_COMMENT_CORE_DEFAULT "(.)")
+
+SET(GRAMMAR_STRING_CORE_SINGLE_ANSI "~QUOTE_SINGLE | (QUOTE_SINGLE QUOTE_SINGLE)")
+SET(GRAMMAR_STRING_CORE_DOUBLE_ANSI "~QUOTE_DOUBLE | (QUOTE_DOUBLE QUOTE_DOUBLE)")
+SET(GRAMMAR_MULTILINE_COMMENT_CORE_ANSI "MULTILINE_COMMENT | .")
+
+CFLAGS(
+    -DGRAMMAR_STRING_CORE_SINGLE_DEFAULT="\\\"${GRAMMAR_STRING_CORE_SINGLE_DEFAULT}\\\""
+    -DGRAMMAR_STRING_CORE_DOUBLE_DEFAULT="\\\"${GRAMMAR_STRING_CORE_DOUBLE_DEFAULT}\\\""
+    -DGRAMMAR_MULTILINE_COMMENT_CORE_DEFAULT="\\\"${GRAMMAR_MULTILINE_COMMENT_CORE_DEFAULT}\\\""
+    -DGRAMMAR_STRING_CORE_SINGLE_ANSI="\\\"${GRAMMAR_STRING_CORE_SINGLE_ANSI}\\\""
+    -DGRAMMAR_STRING_CORE_DOUBLE_ANSI="\\\"${GRAMMAR_STRING_CORE_DOUBLE_ANSI}\\\""
+    -DGRAMMAR_MULTILINE_COMMENT_CORE_ANSI="\\\"${GRAMMAR_MULTILINE_COMMENT_CORE_ANSI}\\\""
+)
+
+SRCS(
+    lexer.cpp
+    regex.cpp
+)
+
+END()
+
+RECURSE_FOR_TESTS(
+    ut
+)
diff --git a/yql/essentials/sql/v1/lexer/ut/ya.make b/yql/essentials/sql/v1/lexer/ut/ya.make
index c50c8cd7277..7e62fb50c85 100644
--- a/yql/essentials/sql/v1/lexer/ut/ya.make
+++ b/yql/essentials/sql/v1/lexer/ut/ya.make
@@ -6,6 +6,7 @@ PEERDIR(
     yql/essentials/sql/v1/lexer/antlr3
     yql/essentials/sql/v1/lexer/antlr4
     yql/essentials/sql/v1/lexer/antlr4_pure
+    yql/essentials/sql/v1/lexer/regex
 )
 
 SRCS(
diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.cpp b/yql/essentials/sql/v1/reflect/sql_reflect.cpp
new file mode 100644
index 00000000000..f47f35cb9de
--- /dev/null
+++ b/yql/essentials/sql/v1/reflect/sql_reflect.cpp
@@ -0,0 +1,173 @@
+#include "sql_reflect.h"
+
+#include <library/cpp/resource/resource.h>
+
+#include <util/string/split.h>
+#include <util/string/strip.h>
+
+namespace NSQLReflect {
+
+    // Marker comments understood by the reflection parser. A `//!` comment is
+    // kept by Purify(), and `//! section:<name>` lines split the grammar file
+    // into the sections listed below.
+    const TStringBuf ReflectPrefix = "//!";
+    const TStringBuf SectionPrefix = "//! section:";
+    const TStringBuf SectionPunctuation = "//! section:punctuation";
+    const TStringBuf SectionLetter = "//! section:letter";
+    const TStringBuf SectionKeyword = "//! section:keyword";
+    const TStringBuf SectionOther = "//! section:other";
+    // Prefix of ANTLR fragment rules; stripped from rule names when stored.
+    const TStringBuf FragmentPrefix = "fragment ";
+
+    // Loads the embedded resource registered under `key` and splits it on
+    // '\n'. Throws if no resource with exactly this key exists.
+    TVector<TString> GetResourceLines(const TStringBuf key) {
+        TString text;
+        Y_ENSURE(NResource::FindExact(key, &text));
+
+        TVector<TString> lines;
+        Split(text, "\n", lines);
+        return lines;
+    }
+
+    // Normalizes grammar lines in place so that every rule occupies a single
+    // line: a rule spread over several lines is joined into its first line,
+    // then spacing around `;`, `:`, `(` and `)` is tightened.
+    void Format(TVector<TString>& lines) {
+        for (size_t i = 0; i < lines.size(); ++i) {
+            auto& line = lines[i];
+
+            StripInPlace(line);
+
+            // Comments and rules that already contain both ':' and ';' are
+            // complete as they are.
+            if (line.StartsWith("//") || (line.Contains(':') && line.Contains(';'))) {
+                continue;
+            }
+
+            // Append the following lines up to and including the first one
+            // that contains ';'.
+            // NOTE(review): relies on `at` to fail if no such line exists
+            // before the end of input - confirm it is bounds-checked.
+            size_t j = i + 1;
+            do {
+                line += lines.at(j);
+            } while (!lines.at(j++).Contains(';'));
+
+            // Drop the lines that were merged into `line` (indices i+1 .. j-1).
+            auto first = std::next(std::begin(lines), i + 1);
+            auto last = std::next(std::begin(lines), j);
+            lines.erase(first, last);
+        }
+
+        for (auto& line : lines) {
+            // presumably squeezes runs of whitespace into one - verify
+            CollapseInPlace(line);
+            SubstGlobal(line, " ;", ";");
+            SubstGlobal(line, " :", ":");
+            SubstGlobal(line, " )", ")");
+            SubstGlobal(line, "( ", "(");
+        }
+    }
+
+    // Removes empty lines and ordinary `//` comments from `lines`; reflection
+    // comments (starting with `//!`) are kept.
+    void Purify(TVector<TString>& lines) {
+        const auto isNoise = [](const TString& line) {
+            if (line.empty()) {
+                return true;
+            }
+            return line.StartsWith("//") && !line.StartsWith(ReflectPrefix);
+        };
+
+        const auto removed = std::ranges::remove_if(lines, isNoise);
+        lines.erase(removed.begin(), removed.end());
+    }
+
+    // Distributes grammar lines over the sections they belong to. Section
+    // marker lines must appear in the fixed order punctuation, letter,
+    // keyword, other; anything else fails the check below. Lines before the
+    // first marker and the letter section are discarded from the result.
+    // The returned keys are views of the Section* constants.
+    THashMap<TStringBuf, TVector<TString>> GroupBySection(TVector<TString>&& lines) {
+        TVector<TStringBuf> sections = {
+            "",
+            SectionPunctuation,
+            SectionLetter,
+            SectionKeyword,
+            SectionOther,
+        };
+
+        // Index into `sections` of the section currently being filled.
+        size_t section = 0;
+
+        THashMap<TStringBuf, TVector<TString>> groups;
+        for (auto& line : lines) {
+            if (line.StartsWith(SectionPrefix)) {
+                // Only the next expected section marker is accepted.
+                Y_ENSURE(sections.at(section + 1) == line);
+                section += 1;
+                continue;
+            }
+
+            groups[sections.at(section)].emplace_back(std::move(line));
+        }
+
+        groups.erase("");
+        groups.erase(SectionLetter);
+
+        return groups;
+    }
+
+    // Splits a single-line lexer rule `NAME: BODY;` into (NAME, BODY).
+    // NAME is everything before the first ':', BODY is everything between that
+    // ':' and the last ';', without the single separating space if present.
+    // Doubled backslashes in BODY are collapsed into one.
+    // Throws if the line has no ':' followed (anywhere later) by ';'.
+    std::tuple<TString, TString> ParseLexerRule(TString&& line) {
+        const size_t colonPos = line.find(':');
+        const size_t semiPos = line.rfind(';');
+
+        Y_ENSURE(
+            colonPos != TString::npos &&
+            semiPos != TString::npos &&
+            colonPos < semiPos);
+
+        // Take the body right after the colon and skip the separator only when
+        // it is actually there: this keeps `NAME:BODY;` intact and cannot
+        // underflow for `NAME:;`.
+        TStringBuf body = TStringBuf(line).SubStr(colonPos + 1, semiPos - colonPos - 1);
+        if (body.StartsWith(' ')) {
+            body.Skip(1);
+        }
+
+        TString block(body);
+        SubstGlobal(block, "\\\\", "\\");
+
+        // `body` is not used past this point, so the buffer can be reused.
+        TString name = std::move(line);
+        name.resize(colonPos);
+
+        return std::make_tuple(std::move(name), std::move(block));
+    }
+
+    // Parses a rule from the punctuation section, whose body must be a single
+    // quoted literal such as `'=='`. The literal is stored unquoted in
+    // `grammar.BlockByName`; non-fragment rules are also recorded in
+    // `grammar.PunctuationNames`. Throws if the body is not a quoted literal.
+    void ParsePunctuationLine(TString&& line, TLexerGrammar& grammar) {
+        auto [name, block] = ParseLexerRule(std::move(line));
+
+        // Stripping the first and last characters below is only meaningful
+        // (and only safe) for a quoted, non-truncated body.
+        Y_ENSURE(
+            2 <= block.size() && block.front() == '\'' && block.back() == '\'',
+            "Punctuation rule '" << name << "' is not a quoted literal: " << block);
+        block.erase(std::begin(block));
+        block.pop_back();
+
+        // Unescape an embedded single quote.
+        SubstGlobal(block, "\\\'", "\'");
+
+        if (!name.StartsWith(FragmentPrefix)) {
+            grammar.PunctuationNames.emplace(name);
+        }
+
+        SubstGlobal(name, FragmentPrefix, "");
+        grammar.BlockByName.emplace(std::move(name), std::move(block));
+    }
+
+    // Parses a rule from the keyword section and records its name in
+    // `grammar.KeywordNames`. The body is only used for validation: with
+    // quotes and spaces removed it must spell the rule name (the one accepted
+    // exception is TSKIP, whose body spells SKIP).
+    void ParseKeywordLine(TString&& line, TLexerGrammar& grammar) {
+        auto [name, block] = ParseLexerRule(std::move(line));
+        SubstGlobal(block, "'", "");
+        SubstGlobal(block, " ", "");
+
+        Y_ENSURE(name == block || (name == "TSKIP" && block == "SKIP"));
+        grammar.KeywordNames.emplace(std::move(name));
+    }
+
+    // Parses a rule from the "other" section. Non-fragment rules are recorded
+    // in `grammar.OtherNames`; every rule (fragment or not) gets its body
+    // stored in `grammar.BlockByName` under the name without the fragment
+    // prefix and with the ` -> channel(HIDDEN)` action removed.
+    void ParseOtherLine(TString&& line, TLexerGrammar& grammar) {
+        auto [name, block] = ParseLexerRule(std::move(line));
+
+        if (!name.StartsWith(FragmentPrefix)) {
+            grammar.OtherNames.emplace(name);
+        }
+
+        SubstGlobal(name, FragmentPrefix, "");
+        SubstGlobal(block, " -> channel(HIDDEN)", "");
+        grammar.BlockByName.emplace(std::move(name), std::move(block));
+    }
+
+    // Loads the embedded ANTLR4 grammar and extracts its lexer part: keyword,
+    // punctuation and other token names plus the rule bodies needed to
+    // translate them. Throws if the grammar does not have the expected shape.
+    TLexerGrammar LoadLexerGrammar() {
+        TVector<TString> lines = GetResourceLines("SQLv1Antlr4.g.in");
+        // Comments and blank lines are dropped before rules are joined into
+        // single lines, and once more afterwards.
+        Purify(lines);
+        Format(lines);
+        Purify(lines);
+
+        THashMap<TStringBuf, TVector<TString>> sections = GroupBySection(std::move(lines));
+
+        TLexerGrammar grammar;
+
+        for (auto& [section, rules] : sections) {
+            for (auto& rule : rules) {
+                if (section == SectionPunctuation) {
+                    ParsePunctuationLine(std::move(rule), grammar);
+                } else if (section == SectionKeyword) {
+                    ParseKeywordLine(std::move(rule), grammar);
+                } else if (section == SectionOther) {
+                    ParseOtherLine(std::move(rule), grammar);
+                } else {
+                    // `section` is a string view, not a C string: it must be
+                    // printed with an explicit length.
+                    Y_ABORT(
+                        "Unexpected section %.*s",
+                        static_cast<int>(section.size()), section.data());
+                }
+            }
+        }
+
+        return grammar;
+    }
+
+} // namespace NSQLReflect
diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.h b/yql/essentials/sql/v1/reflect/sql_reflect.h
new file mode 100644
index 00000000000..5225a3c996b
--- /dev/null
+++ b/yql/essentials/sql/v1/reflect/sql_reflect.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <util/generic/string.h>
+#include <util/generic/hash_set.h>
+#include <util/generic/hash.h>
+
+namespace NSQLReflect {
+
+    // Lexer part of the SQL grammar as extracted by LoadLexerGrammar().
+    struct TLexerGrammar {
+        // Names of the rules from the keyword section.
+        THashSet<TString> KeywordNames;
+        // Names of the non-fragment rules from the punctuation section.
+        THashSet<TString> PunctuationNames;
+        // Names of the non-fragment rules from the "other" section.
+        THashSet<TString> OtherNames;
+        // Rule body by rule name for punctuation and "other" rules, including
+        // fragments (stored without the `fragment ` prefix). Punctuation
+        // bodies are stored unquoted. Keywords have no entry here.
+        THashMap<TString, TString> BlockByName;
+    };
+
+    TLexerGrammar LoadLexerGrammar();
+
+} // namespace NSQLReflect
diff --git a/yql/essentials/sql/v1/reflect/sql_reflect_ut.cpp b/yql/essentials/sql/v1/reflect/sql_reflect_ut.cpp
new file mode 100644
index 00000000000..7bef2879e55
--- /dev/null
+++ b/yql/essentials/sql/v1/reflect/sql_reflect_ut.cpp
@@ -0,0 +1,46 @@
+#include "sql_reflect.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+using namespace NSQLReflect;
+
+namespace {
+    TLexerGrammar grammar = LoadLexerGrammar();
+} // namespace
+
+// Spot checks of the grammar loaded from the embedded resource.
+Y_UNIT_TEST_SUITE(SqlReflectTests) {
+    // A few well-known keywords must be recognized as such.
+    Y_UNIT_TEST(Keywords) {
+        UNIT_ASSERT_VALUES_EQUAL(grammar.KeywordNames.contains("SELECT"), true);
+        UNIT_ASSERT_VALUES_EQUAL(grammar.KeywordNames.contains("INSERT"), true);
+        UNIT_ASSERT_VALUES_EQUAL(grammar.KeywordNames.contains("WHERE"), true);
+        UNIT_ASSERT_VALUES_EQUAL(grammar.KeywordNames.contains("COMMIT"), true);
+    }
+
+    // Punctuation bodies are stored without the surrounding quotes.
+    Y_UNIT_TEST(Punctuation) {
+        UNIT_ASSERT_VALUES_EQUAL(grammar.PunctuationNames.contains("LPAREN"), true);
+        UNIT_ASSERT_VALUES_EQUAL(grammar.BlockByName.at("LPAREN"), "(");
+
+        UNIT_ASSERT_VALUES_EQUAL(grammar.PunctuationNames.contains("MINUS"), true);
+        UNIT_ASSERT_VALUES_EQUAL(grammar.BlockByName.at("MINUS"), "-");
+
+        UNIT_ASSERT_VALUES_EQUAL(grammar.PunctuationNames.contains("NAMESPACE"), true);
+        UNIT_ASSERT_VALUES_EQUAL(grammar.BlockByName.at("NAMESPACE"), "::");
+    }
+
+    // STRING_MULTILINE (presumably a fragment rule) is not a token name, yet
+    // its body is still available; bodies are normalized to single spaces.
+    Y_UNIT_TEST(Other) {
+        UNIT_ASSERT_VALUES_EQUAL(grammar.OtherNames.contains("REAL"), true);
+        UNIT_ASSERT_VALUES_EQUAL(grammar.OtherNames.contains("STRING_VALUE"), true);
+        UNIT_ASSERT_VALUES_EQUAL(grammar.OtherNames.contains("STRING_MULTILINE"), false);
+
+        UNIT_ASSERT_VALUES_EQUAL(
+            grammar.BlockByName.at("FLOAT_EXP"),
+            "E (PLUS | MINUS)? DECDIGITS");
+        UNIT_ASSERT_VALUES_EQUAL(
+            grammar.BlockByName.at("STRING_MULTILINE"),
+            "(DOUBLE_COMMAT .*? DOUBLE_COMMAT)+ COMMAT?");
+        UNIT_ASSERT_VALUES_EQUAL(
+            grammar.BlockByName.at("REAL"),
+            "(DECDIGITS DOT DIGIT* FLOAT_EXP? | DECDIGITS FLOAT_EXP) (F | P (F ('4' | '8') | N)?)?");
+    }
+
+} // Y_UNIT_TEST_SUITE(SqlReflectTests)
diff --git a/yql/essentials/sql/v1/reflect/ut/ya.make b/yql/essentials/sql/v1/reflect/ut/ya.make
new file mode 100644
index 00000000000..ee52ff0837a
--- /dev/null
+++ b/yql/essentials/sql/v1/reflect/ut/ya.make
@@ -0,0 +1,7 @@
+UNITTEST_FOR(yql/essentials/sql/v1/reflect)
+
+SRCS(
+    sql_reflect_ut.cpp
+)
+
+END()
diff --git a/yql/essentials/sql/v1/reflect/ya.make b/yql/essentials/sql/v1/reflect/ya.make
new file mode 100644
index 00000000000..5865654c86e
--- /dev/null
+++ b/yql/essentials/sql/v1/reflect/ya.make
@@ -0,0 +1,13 @@
+LIBRARY()
+
+SRCS(
+    sql_reflect.cpp
+)
+
+RESOURCE(DONT_PARSE yql/essentials/sql/v1/SQLv1Antlr4.g.in SQLv1Antlr4.g.in)
+
+END()
+
+RECURSE_FOR_TESTS(
+    ut
+)
author	vityaman <[email protected]>	2025-03-28 18:29:24 +0300
committer	robot-piglet <[email protected]>	2025-03-28 18:50:04 +0300
commit	60b99f11bcb2386c2a1c36ffd2e96e69d0105dac (patch)
tree	08c15d732484c6accf16658b09ed8f07286a9338 /yql/essentials/sql
parent	1e214be59cbf130bee433c422b42f16148e5acff (diff)