summaryrefslogtreecommitdiffstats
path: root/yql/essentials
diff options
context:
space:
mode:
authorrobot-piglet <[email protected]>2025-04-01 01:12:58 +0300
committerrobot-piglet <[email protected]>2025-04-01 01:23:36 +0300
commitf93076bbe93dd6ebb8d75a930268d30839b9011a (patch)
treedbfc5b2bea8bf16b1599a69f0f721a2acdc5dac2 /yql/essentials
parent2d512f78c593c3f4573742129c281d0fc5479de0 (diff)
Intermediate changes
commit_hash:e57b3e95787cc8037f200f1b6b6073e35403b27e
Diffstat (limited to 'yql/essentials')
-rw-r--r--yql/essentials/sql/v1/lexer/lexer_ut.cpp319
-rw-r--r--yql/essentials/sql/v1/lexer/lexer_ut.h37
-rw-r--r--yql/essentials/sql/v1/lexer/regex/lexer.cpp39
-rw-r--r--yql/essentials/sql/v1/lexer/regex/regex.cpp6
-rw-r--r--yql/essentials/sql/v1/lexer/regex/regex.h2
-rw-r--r--yql/essentials/sql/v1/lexer/ut/ya.make3
-rw-r--r--yql/essentials/sql/v1/reflect/sql_reflect.cpp2
-rw-r--r--yql/essentials/sql/v1/reflect/sql_reflect.h5
8 files changed, 327 insertions, 86 deletions
diff --git a/yql/essentials/sql/v1/lexer/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/lexer_ut.cpp
index 53cff6ffdc7..1ddfd04b507 100644
--- a/yql/essentials/sql/v1/lexer/lexer_ut.cpp
+++ b/yql/essentials/sql/v1/lexer/lexer_ut.cpp
@@ -1,36 +1,60 @@
#include "lexer.h"
+#include "lexer_ut.h"
#include <yql/essentials/core/issue/yql_issue.h>
#include <yql/essentials/sql/settings/translation_settings.h>
#include <yql/essentials/sql/v1/lexer/antlr3/lexer.h>
+#include <yql/essentials/sql/v1/lexer/antlr3_ansi/lexer.h>
#include <yql/essentials/sql/v1/lexer/antlr4/lexer.h>
+#include <yql/essentials/sql/v1/lexer/antlr4_ansi/lexer.h>
#include <yql/essentials/sql/v1/lexer/antlr4_pure/lexer.h>
+#include <yql/essentials/sql/v1/lexer/antlr4_pure_ansi/lexer.h>
#include <yql/essentials/sql/v1/lexer/regex/lexer.h>
#include <library/cpp/testing/unittest/registar.h>
+#include <util/string/ascii.h>
+#include <util/random/random.h>
+
+#define UNIT_ASSERT_TOKENIZED(LEXER, QUERY, TOKENS) \
+ do { \
+ auto tokens = Tokenized((LEXER), (QUERY)); \
+ UNIT_ASSERT_VALUES_EQUAL(tokens, (TOKENS)); \
+ } while (false)
+
using namespace NSQLTranslation;
using namespace NSQLTranslationV1;
-std::pair<TParsedTokenList, NYql::TIssues> Tokenize(ILexer::TPtr& lexer, TString queryUtf8) {
+TLexers Lexers = {
+ .Antlr3 = MakeAntlr3LexerFactory(),
+ .Antlr3Ansi = MakeAntlr3AnsiLexerFactory(),
+ .Antlr4 = MakeAntlr4LexerFactory(),
+ .Antlr4Ansi = MakeAntlr4AnsiLexerFactory(),
+ .Antlr4Pure = MakeAntlr4PureLexerFactory(),
+ .Antlr4PureAnsi = MakeAntlr4PureAnsiLexerFactory(),
+ .Regex = MakeRegexLexerFactory(/* ansi = */ false),
+ .RegexAnsi = MakeRegexLexerFactory(/* ansi = */ true),
+};
+
+std::pair<TParsedTokenList, NYql::TIssues> Tokenize(ILexer::TPtr& lexer, const TString& query) {
TParsedTokenList tokens;
NYql::TIssues issues;
- Tokenize(*lexer, queryUtf8, "", tokens, issues, SQL_MAX_PARSER_ERRORS);
+ Tokenize(*lexer, query, "", tokens, issues, SQL_MAX_PARSER_ERRORS);
return {tokens, issues};
}
-TVector<TString> GetIssueMessages(ILexer::TPtr& lexer, TString queryUtf8) {
+TVector<TString> GetIssueMessages(ILexer::TPtr& lexer, const TString& query) {
TVector<TString> messages;
- for (const auto& issue : Tokenize(lexer, queryUtf8).second) {
+ for (const auto& issue : Tokenize(lexer, query).second) {
messages.emplace_back(issue.ToString(/* oneLine = */ true));
}
return messages;
}
-TVector<TString> GetTokenViews(ILexer::TPtr& lexer, TString queryUtf8) {
+TVector<TString> GetTokenViews(ILexer::TPtr& lexer, const TString& query) {
TVector<TString> names;
- for (auto& token : Tokenize(lexer, queryUtf8).first) {
+ for (auto& token : Tokenize(lexer, query).first) {
TString view = std::move(token.Name);
if (view == "ID_PLAIN" || view == "STRING_VALUE") {
view.append(" (");
@@ -42,28 +66,58 @@ TVector<TString> GetTokenViews(ILexer::TPtr& lexer, TString queryUtf8) {
return names;
}
-void AssertEquivialent(const TParsedToken& lhs, const TParsedToken& rhs) {
- if (lhs.Name == "EOF" && rhs.Name == "EOF") {
- return;
+TString ToString(TParsedToken token) {
+ TString& string = token.Name;
+ if (!AsciiEqualsIgnoreCase(token.Name, token.Content) && token.Name != "EOF") {
+ string += "(";
+ string += token.Content;
+ string += ")";
+ }
+ return string;
+}
+
+TString Tokenized(ILexer::TPtr& lexer, const TString& query) {
+ TParsedTokenList tokens;
+ NYql::TIssues issues;
+ bool ok = Tokenize(*lexer, query, "Test", tokens, issues, SQL_MAX_PARSER_ERRORS);
+
+ TString out;
+ if (!ok) {
+ out = "[INVALID] ";
}
- UNIT_ASSERT_VALUES_EQUAL(lhs.Name, rhs.Name);
- UNIT_ASSERT_VALUES_EQUAL(lhs.Content, rhs.Content);
- UNIT_ASSERT_VALUES_EQUAL(lhs.Line, rhs.Line);
+ for (auto& token : tokens) {
+ out += ToString(std::move(token));
+ out += " ";
+ }
+ if (!out.empty()) {
+ out.pop_back();
+ }
+ return out;
}
-void AssertEquivialent(const TParsedTokenList& lhs, const TParsedTokenList& rhs) {
- UNIT_ASSERT_VALUES_EQUAL(lhs.size(), rhs.size());
- for (size_t i = 0; i < lhs.size(); ++i) {
- AssertEquivialent(lhs.at(i), rhs.at(i));
+TString RandomMultilineCommentLikeText(size_t maxSize) {
+ auto size = RandomNumber<size_t>(maxSize);
+ TString comment;
+ for (size_t i = 0; i < size; ++i) {
+ if (auto /* isOpen */ _ = RandomNumber<bool>()) {
+ comment += "/*";
+ } else {
+ comment += "*/";
+ }
+
+ for (size_t gap = RandomNumber<size_t>(2); gap > 0; --gap) {
+ comment += " ";
+ }
}
+ return comment;
}
Y_UNIT_TEST_SUITE(SQLv1Lexer) {
Y_UNIT_TEST(UnsupportedIssues) {
NSQLTranslationV1::TLexers factories;
- TVector<ILexer::TPtr> lexers;
+ TVector<ILexer::TPtr> lexers;
for (auto ansi : {false, true}) {
for (auto antlr4 : {false, true}) {
for (auto flavor : {ELexerFlavor::Default, ELexerFlavor::Pure, ELexerFlavor::Regex}) {
@@ -96,8 +150,8 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
UNIT_ASSERT_VALUES_EQUAL(actual, expected);
}
- Y_UNIT_TEST(AntlrVersionIndependent) {
- const TVector<TString> queriesUtf8 = {
+ Y_UNIT_TEST_ON_EACH_LEXER(AntlrAndFlavorIndependent) {
+ static const TVector<TString> queries = {
"",
" ",
"SELECT",
@@ -115,35 +169,31 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
"\"select\"select",
};
- NSQLTranslationV1::TLexers lexers;
- lexers.Antlr3 = NSQLTranslationV1::MakeAntlr3LexerFactory();
- lexers.Antlr4 = NSQLTranslationV1::MakeAntlr4LexerFactory();
- lexers.Antlr4Pure = NSQLTranslationV1::MakeAntlr4PureLexerFactory();
-
- auto lexer3 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false);
- auto lexer4 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true);
- auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, ELexerFlavor::Pure);
-
- for (const auto& query : queriesUtf8) {
- auto [tokens3, issues3] = Tokenize(lexer3, query);
- auto [tokens4, issues4] = Tokenize(lexer4, query);
- auto [tokens4p, issues4p] = Tokenize(lexer4p, query);
- AssertEquivialent(tokens3, tokens4);
- AssertEquivialent(tokens3, tokens4p);
- UNIT_ASSERT(issues3.Empty());
- UNIT_ASSERT(issues4.Empty());
- UNIT_ASSERT(issues4p.Empty());
+ static TVector<TString> expectations(queries.size());
+
+ if (ANSI) {
+ return;
+ }
+
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+
+ for (size_t i = 0; i < queries.size(); ++i) {
+ const auto& query = queries[i];
+ auto& expected = expectations[i];
+
+ if (expected.empty()) {
+ expected = Tokenized(lexer, query);
+ continue;
+ }
+
+ UNIT_ASSERT_TOKENIZED(lexer, query, expected);
}
}
TVector<TString> InvalidQueries();
void TestInvalidTokensSkipped(bool antlr4, const TVector<TVector<TString>>& expected) {
- NSQLTranslationV1::TLexers lexers;
- lexers.Antlr3 = NSQLTranslationV1::MakeAntlr3LexerFactory();
- lexers.Antlr4 = NSQLTranslationV1::MakeAntlr4LexerFactory();
-
- auto lexer = MakeLexer(lexers, /* ansi = */ false, antlr4);
+ auto lexer = MakeLexer(Lexers, /* ansi = */ false, antlr4);
auto input = InvalidQueries();
UNIT_ASSERT_VALUES_EQUAL(input.size(), expected.size());
@@ -198,16 +248,10 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
}
Y_UNIT_TEST(IssuesCollected) {
- NSQLTranslationV1::TLexers lexers;
- lexers.Antlr3 = NSQLTranslationV1::MakeAntlr3LexerFactory();
- lexers.Antlr4 = NSQLTranslationV1::MakeAntlr4LexerFactory();
- lexers.Antlr4Pure = NSQLTranslationV1::MakeAntlr4PureLexerFactory();
- lexers.Regex = NSQLTranslationV1::MakeRegexLexerFactory(/* ansi = */ false);
-
- auto lexer3 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false);
- auto lexer4 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true);
- auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, ELexerFlavor::Pure);
- auto lexerR = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false, ELexerFlavor::Regex);
+ auto lexer3 = MakeLexer(Lexers, /* ansi = */ false, /* antlr4 = */ false);
+ auto lexer4 = MakeLexer(Lexers, /* ansi = */ false, /* antlr4 = */ true);
+ auto lexer4p = MakeLexer(Lexers, /* ansi = */ false, /* antlr4 = */ true, ELexerFlavor::Pure);
+ auto lexerR = MakeLexer(Lexers, /* ansi = */ false, /* antlr4 = */ false, ELexerFlavor::Regex);
for (const auto& query : InvalidQueries()) {
auto issues3 = GetIssueMessages(lexer3, query);
@@ -223,9 +267,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
}
Y_UNIT_TEST(IssueMessagesAntlr3) {
- NSQLTranslationV1::TLexers lexers;
- lexers.Antlr3 = NSQLTranslationV1::MakeAntlr3LexerFactory();
- auto lexer3 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false);
+ auto lexer3 = MakeLexer(Lexers, /* ansi = */ false, /* antlr4 = */ false);
auto actual = GetIssueMessages(lexer3, "\xF0\x9F\x98\x8A SELECT * FR");
@@ -240,10 +282,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
}
Y_UNIT_TEST(IssueMessagesAntlr4) {
- NSQLTranslationV1::TLexers lexers;
- lexers.Antlr4 = NSQLTranslationV1::MakeAntlr4LexerFactory();
-
- auto lexer4 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true);
+ auto lexer4 = MakeLexer(Lexers, /* ansi = */ false, /* antlr4 = */ true);
auto actual = GetIssueMessages(lexer4, "\xF0\x9F\x98\x8A SELECT * FR");
@@ -253,4 +292,164 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
UNIT_ASSERT_VALUES_EQUAL(actual, expected);
}
-}
+
+ Y_UNIT_TEST_ON_EACH_LEXER(Whitespace) {
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+ UNIT_ASSERT_TOKENIZED(lexer, "", "EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, " ", "WS( ) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, " ", "WS( ) WS( ) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "\n", "WS(\n) EOF");
+ }
+
+ Y_UNIT_TEST_ON_EACH_LEXER(Keyword) {
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+ UNIT_ASSERT_TOKENIZED(lexer, "SELECT", "SELECT EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "INSERT", "INSERT EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "FROM", "FROM EOF");
+ }
+
+ Y_UNIT_TEST_ON_EACH_LEXER(Punctuation) {
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+ UNIT_ASSERT_TOKENIZED(
+ lexer,
+ "* / + - <|",
+ "ASTERISK(*) WS( ) SLASH(/) WS( ) "
+ "PLUS(+) WS( ) MINUS(-) WS( ) STRUCT_OPEN(<|) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "SELECT*FROM", "SELECT ASTERISK(*) FROM EOF");
+ }
+
+ Y_UNIT_TEST_ON_EACH_LEXER(IdPlain) {
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+ UNIT_ASSERT_TOKENIZED(lexer, "variable my_table", "ID_PLAIN(variable) WS( ) ID_PLAIN(my_table) EOF");
+ }
+
+ Y_UNIT_TEST_ON_EACH_LEXER(IdQuoted) {
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+ UNIT_ASSERT_TOKENIZED(lexer, "``", "ID_QUOTED(``) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "` `", "ID_QUOTED(` `) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "`local/table`", "ID_QUOTED(`local/table`) EOF");
+ }
+
+ Y_UNIT_TEST_ON_EACH_LEXER(Number) {
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+ UNIT_ASSERT_TOKENIZED(lexer, "1", "DIGITS(1) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "123", "DIGITS(123) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "123u", "INTEGER_VALUE(123u) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "123ui", "INTEGER_VALUE(123ui) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "123.45", "REAL(123.45) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "123.45E10", "REAL(123.45E10) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "123.45E+10", "REAL(123.45E+10) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "1E+10", "REAL(1E+10) EOF");
+ }
+
+ Y_UNIT_TEST_ON_EACH_LEXER(SingleLineString) {
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+ UNIT_ASSERT_TOKENIZED(lexer, "\"\"", "STRING_VALUE(\"\") EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "\' \'", "STRING_VALUE(\' \') EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "\" \"", "STRING_VALUE(\" \") EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "\"test\"", "STRING_VALUE(\"test\") EOF");
+
+ if (!ANSI) {
+ UNIT_ASSERT_TOKENIZED(lexer, "\"\\\"\"", "STRING_VALUE(\"\\\"\") EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "\"\"\"\"", "STRING_VALUE(\"\") STRING_VALUE(\"\") EOF");
+ } else {
+ UNIT_ASSERT_TOKENIZED(lexer, "\"\\\"\"", "[INVALID] STRING_VALUE(\"\\\") EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "\"\"\"\"", "STRING_VALUE(\"\"\"\") EOF");
+ }
+ }
+
+ Y_UNIT_TEST_ON_EACH_LEXER(MultiLineString) {
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+ UNIT_ASSERT_TOKENIZED(lexer, "@@@@", "STRING_VALUE(@@@@) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "@@ @@@", "STRING_VALUE(@@ @@@) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "@@test@@", "STRING_VALUE(@@test@@) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "@@line1\nline2@@", "STRING_VALUE(@@line1\nline2@@) EOF");
+ }
+
+ Y_UNIT_TEST_ON_EACH_LEXER(SingleLineComment) {
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+ UNIT_ASSERT_TOKENIZED(lexer, "--yql", "COMMENT(--yql) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "-- yql ", "COMMENT(-- yql ) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "-- yql\nSELECT", "COMMENT(-- yql\n) SELECT EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "-- yql --", "COMMENT(-- yql --) EOF");
+ }
+
+ Y_UNIT_TEST_ON_EACH_LEXER(MultiLineComment) {
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+ UNIT_ASSERT_TOKENIZED(lexer, "/* yql */", "COMMENT(/* yql */) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "/* yql\n * yql\n */", "COMMENT(/* yql\n * yql\n */) EOF");
+ }
+
+ Y_UNIT_TEST_ON_EACH_LEXER(RecursiveMultiLineComment) {
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+ if (!ANSI) {
+ UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */", "COMMENT(/* /* yql */) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */ */", "COMMENT(/* /* yql */) WS( ) ASTERISK(*) SLASH(/) EOF");
+ } else {
+ UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */", "COMMENT(/* /* yql */) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "/* /* /* yql */ */", "COMMENT(/* /* /* yql */ */) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */ */ */", "COMMENT(/* /* yql */ */) WS( ) ASTERISK(*) SLASH(/) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */ */", "COMMENT(/* /* yql */ */) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "/*/*/*/", "COMMENT(/*/*/) ASTERISK(*) SLASH(/) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "/*/**/*/*/*/", "COMMENT(/*/**/*/) ASTERISK(*) SLASH(/) ASTERISK(*) SLASH(/) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "/* /* */ a /* /* */", "COMMENT(/* /* */ a /* /* */) EOF");
+ }
+ }
+
+ Y_UNIT_TEST_ON_EACH_LEXER(RandomRecursiveMultiLineComment) {
+ if (FLAVOR != ELexerFlavor::Regex && FLAVOR != ELexerFlavor::Pure) {
+ return;
+ }
+
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+ auto reference = MakeLexer(Lexers, ANSI, /* antlr4 = */ true, ELexerFlavor::Pure);
+
+ SetRandomSeed(100);
+ for (size_t i = 0; i < 512; ++i) {
+ auto input = RandomMultilineCommentLikeText(/* maxSize = */ 32);
+ TString actual = Tokenized(lexer, input);
+ TString expected = Tokenized(reference, input);
+
+ UNIT_ASSERT_VALUES_EQUAL_C(actual, expected, "Input: " << input);
+ }
+ }
+
+ Y_UNIT_TEST_ON_EACH_LEXER(SimpleQuery) {
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+ UNIT_ASSERT_TOKENIZED(lexer, "select 1", "SELECT WS( ) DIGITS(1) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "SELect 1", "SELECT WS( ) DIGITS(1) EOF");
+ }
+
+ Y_UNIT_TEST_ON_EACH_LEXER(ComplexQuery) {
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+
+ TString query =
+ "SELECT\n"
+ " 123467,\n"
+ " \"Hello, {name}!\",\n"
+ " (1 + (5U * 1 / 0)),\n"
+ " MIN(identifier),\n"
+ " Bool(field),\n"
+ " Math::Sin(var)\n"
+ "FROM `local/test/space/table`\n"
+ "JOIN test;";
+
+ TString expected =
+ "SELECT WS(\n) "
+ "WS( ) WS( ) DIGITS(123467) COMMA(,) WS(\n) "
+ "WS( ) WS( ) STRING_VALUE(\"Hello, {name}!\") COMMA(,) WS(\n) "
+ "WS( ) WS( ) LPAREN(() DIGITS(1) WS( ) PLUS(+) WS( ) LPAREN(() INTEGER_VALUE(5U) WS( ) "
+ "ASTERISK(*) WS( ) DIGITS(1) WS( ) SLASH(/) WS( ) DIGITS(0) RPAREN()) "
+ "RPAREN()) COMMA(,) WS(\n) "
+ "WS( ) WS( ) ID_PLAIN(MIN) LPAREN(() ID_PLAIN(identifier) RPAREN()) COMMA(,) WS(\n) "
+ "WS( ) WS( ) ID_PLAIN(Bool) LPAREN(() ID_PLAIN(field) RPAREN()) COMMA(,) WS(\n) "
+ "WS( ) WS( ) ID_PLAIN(Math) NAMESPACE(::) ID_PLAIN(Sin) LPAREN(() ID_PLAIN(var) RPAREN()) WS(\n) "
+ "FROM WS( ) ID_QUOTED(`local/test/space/table`) WS(\n) "
+ "JOIN WS( ) ID_PLAIN(test) SEMICOLON(;) EOF";
+
+ UNIT_ASSERT_TOKENIZED(lexer, query, expected);
+ }
+
+} // Y_UNIT_TEST_SUITE(SQLv1Lexer)
diff --git a/yql/essentials/sql/v1/lexer/lexer_ut.h b/yql/essentials/sql/v1/lexer/lexer_ut.h
new file mode 100644
index 00000000000..b4304eb7070
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/lexer_ut.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "lexer.h"
+
+#define LEXER_NAME_ANSI_false_ANTLR4_false_FLAVOR_Default "antlr3"
+#define LEXER_NAME_ANSI_false_ANTLR4_true_FLAVOR_Default "antlr4"
+#define LEXER_NAME_ANSI_true_ANTLR4_false_FLAVOR_Default "antlr3_ansi"
+#define LEXER_NAME_ANSI_true_ANTLR4_true_FLAVOR_Default "antlr4_ansi"
+#define LEXER_NAME_ANSI_false_ANTLR4_true_FLAVOR_Pure "antlr4_pure"
+#define LEXER_NAME_ANSI_true_ANTLR4_true_FLAVOR_Pure "antlr4_pure_ansi"
+#define LEXER_NAME_ANSI_false_ANTLR4_false_FLAVOR_Regex "regex"
+#define LEXER_NAME_ANSI_true_ANTLR4_false_FLAVOR_Regex "regex_ansi"
+
+#define Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, ANSI, ANTLR4, FLAVOR) \
+ TCurrentTest::AddTest( \
+ #N "::" LEXER_NAME_ANSI_##ANSI##_ANTLR4_##ANTLR4##_FLAVOR_##FLAVOR, \
+ static_cast<void (*)(NUnitTest::TTestContext&)>(&N<ANSI, ANTLR4, ELexerFlavor::FLAVOR>), \
+ /* forceFork = */ false)
+
+#define Y_UNIT_TEST_ON_EACH_LEXER(N) \
+ template <bool ANSI, bool ANTLR4, ELexerFlavor FLAVOR> \
+ void N(NUnitTest::TTestContext&); \
+ struct TTestRegistration##N { \
+ TTestRegistration##N() { \
+ Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, false, false, Default); \
+ Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, false, true, Default); \
+ Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, true, false, Default); \
+ Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, true, true, Default); \
+ Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, false, true, Pure); \
+ Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, true, true, Pure); \
+ Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, false, false, Regex); \
+ Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, true, false, Regex); \
+ } \
+ }; \
+ static TTestRegistration##N testRegistration##N; \
+ template <bool ANSI, bool ANTLR4, ELexerFlavor FLAVOR> \
+ void N(NUnitTest::TTestContext&)
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
index 1c8f2104a48..b0b5c2dad44 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
@@ -10,6 +10,7 @@
#include <util/generic/algorithm.h>
#include <util/generic/string.h>
#include <util/string/subst.h>
+#include <util/string/ascii.h>
namespace NSQLTranslationV1 {
@@ -23,15 +24,15 @@ namespace NSQLTranslationV1 {
TRegexLexer(
bool ansi,
NSQLReflect::TLexerGrammar grammar,
- const THashMap<TString, TString>& RegexByOtherNameMap)
+ const TVector<std::tuple<TString, TString>>& RegexByOtherName)
: Grammar_(std::move(grammar))
, Ansi_(ansi)
{
- for (auto& [token, regex] : RegexByOtherNameMap) {
+ for (const auto& [token, regex] : RegexByOtherName) {
if (token == CommentTokenName) {
CommentRegex_.Reset(new RE2(regex));
} else {
- OtherRegexes_.emplace(std::move(token), std::move(regex));
+ OtherRegexes_.emplace_back(token, new RE2(regex));
}
}
}
@@ -71,27 +72,27 @@ namespace NSQLTranslationV1 {
size_t keywordCount = MatchKeyword(prefix, matches);
MatchPunctuation(prefix, matches);
- size_t otherCount = MatchRegex(prefix, matches);
+ MatchRegex(prefix, matches);
MatchComment(prefix, matches);
- auto max = MaxElementBy(matches, [](const TParsedToken& m) {
- return m.Content.length();
- });
-
- if (max == std::end(matches)) {
+ if (matches.empty()) {
return {};
}
+ auto maxLength = MaxElementBy(matches, [](const TParsedToken& m) {
+ return m.Content.length();
+ })->Content.length();
+
+ auto max = FindIf(matches, [&](const TParsedToken& m) {
+ return m.Content.length() == maxLength;
+ });
+
auto isMatched = [&](const TStringBuf name) {
return std::end(matches) != FindIf(matches, [&](const auto& m) {
return m.Name == name;
});
};
- Y_ENSURE(
- otherCount <= 1 ||
- (otherCount == 2 && isMatched("DIGITS") && isMatched("INTEGER_VALUE")));
-
size_t conflicts = CountIf(matches, [&](const TParsedToken& m) {
return m.Content.length() == max->Content.length();
});
@@ -108,7 +109,7 @@ namespace NSQLTranslationV1 {
bool MatchKeyword(const TStringBuf prefix, TParsedTokenList& matches) {
size_t count = 0;
for (const auto& keyword : Grammar_.KeywordNames) {
- if (prefix.substr(0, keyword.length()) == keyword) {
+ if (AsciiEqualsIgnoreCase(prefix.substr(0, keyword.length()), keyword)) {
matches.emplace_back(keyword, keyword);
count += 1;
}
@@ -131,7 +132,7 @@ namespace NSQLTranslationV1 {
size_t MatchRegex(const TStringBuf prefix, TParsedTokenList& matches) {
size_t count = 0;
for (const auto& [token, regex] : OtherRegexes_) {
- if (const TStringBuf match = TryMatchRegex(prefix, regex); !match.empty()) {
+ if (const TStringBuf match = TryMatchRegex(prefix, *regex); !match.empty()) {
matches.emplace_back(token, TString(match));
count += 1;
}
@@ -216,7 +217,7 @@ namespace NSQLTranslationV1 {
}
NSQLReflect::TLexerGrammar Grammar_;
- THashMap<TString, RE2> OtherRegexes_;
+ TVector<std::tuple<TString, THolder<RE2>>> OtherRegexes_;
THolder<RE2> CommentRegex_;
bool Ansi_;
};
@@ -228,19 +229,19 @@ namespace NSQLTranslationV1 {
explicit TFactory(bool ansi)
: Ansi_(ansi)
, Grammar_(NSQLReflect::LoadLexerGrammar())
- , RegexByOtherNameMap_(MakeRegexByOtherNameMap(Grammar_, Ansi_))
+ , RegexByOtherName_(MakeRegexByOtherName(Grammar_, Ansi_))
{
}
NSQLTranslation::ILexer::TPtr MakeLexer() const override {
return NSQLTranslation::ILexer::TPtr(
- new TRegexLexer(Ansi_, Grammar_, RegexByOtherNameMap_));
+ new TRegexLexer(Ansi_, Grammar_, RegexByOtherName_));
}
private:
bool Ansi_;
NSQLReflect::TLexerGrammar Grammar_;
- THashMap<TString, TString> RegexByOtherNameMap_;
+ TVector<std::tuple<TString, TString>> RegexByOtherName_;
};
} // namespace
diff --git a/yql/essentials/sql/v1/lexer/regex/regex.cpp b/yql/essentials/sql/v1/lexer/regex/regex.cpp
index a8aca8a1318..937d21572fc 100644
--- a/yql/essentials/sql/v1/lexer/regex/regex.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/regex.cpp
@@ -227,12 +227,12 @@ namespace NSQLTranslationV1 {
TRewriteRule UnwrapQuotedSpace_;
};
- THashMap<TString, TString> MakeRegexByOtherNameMap(const NSQLReflect::TLexerGrammar& grammar, bool ansi) {
+ TVector<std::tuple<TString, TString>> MakeRegexByOtherName(const NSQLReflect::TLexerGrammar& grammar, bool ansi) {
TLexerGrammarToRegexTranslator translator(grammar, ansi);
- THashMap<TString, TString> regexes;
+ TVector<std::tuple<TString, TString>> regexes;
for (const auto& token : grammar.OtherNames) {
- regexes.emplace(token, translator.ToRegex(token));
+ regexes.emplace_back(token, translator.ToRegex(token));
}
return regexes;
}
diff --git a/yql/essentials/sql/v1/lexer/regex/regex.h b/yql/essentials/sql/v1/lexer/regex/regex.h
index 9e29c3df25b..1e9d92b6535 100644
--- a/yql/essentials/sql/v1/lexer/regex/regex.h
+++ b/yql/essentials/sql/v1/lexer/regex/regex.h
@@ -8,7 +8,7 @@ namespace NSQLTranslationV1 {
// Makes regexes only for tokens from OtherNames,
// as keywords and punctuation are trivially matched.
- THashMap<TString, TString> MakeRegexByOtherNameMap(
+ TVector<std::tuple<TString, TString>> MakeRegexByOtherName(
const NSQLReflect::TLexerGrammar& grammar, bool ansi);
} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/ut/ya.make b/yql/essentials/sql/v1/lexer/ut/ya.make
index 7e62fb50c85..87cb156cd93 100644
--- a/yql/essentials/sql/v1/lexer/ut/ya.make
+++ b/yql/essentials/sql/v1/lexer/ut/ya.make
@@ -4,8 +4,11 @@ PEERDIR(
yql/essentials/core/issue
yql/essentials/parser/lexer_common
yql/essentials/sql/v1/lexer/antlr3
+ yql/essentials/sql/v1/lexer/antlr3_ansi
yql/essentials/sql/v1/lexer/antlr4
+ yql/essentials/sql/v1/lexer/antlr4_ansi
yql/essentials/sql/v1/lexer/antlr4_pure
+ yql/essentials/sql/v1/lexer/antlr4_pure_ansi
yql/essentials/sql/v1/lexer/regex
)
diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.cpp b/yql/essentials/sql/v1/reflect/sql_reflect.cpp
index f47f35cb9de..c0af06e0b46 100644
--- a/yql/essentials/sql/v1/reflect/sql_reflect.cpp
+++ b/yql/essentials/sql/v1/reflect/sql_reflect.cpp
@@ -134,7 +134,7 @@ namespace NSQLReflect {
auto [name, block] = ParseLexerRule(std::move(line));
if (!name.StartsWith(FragmentPrefix)) {
- grammar.OtherNames.emplace(name);
+ grammar.OtherNames.emplace_back(name);
}
SubstGlobal(name, FragmentPrefix, "");
diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.h b/yql/essentials/sql/v1/reflect/sql_reflect.h
index 5225a3c996b..ca398706873 100644
--- a/yql/essentials/sql/v1/reflect/sql_reflect.h
+++ b/yql/essentials/sql/v1/reflect/sql_reflect.h
@@ -1,15 +1,16 @@
#pragma once
#include <util/generic/string.h>
-#include <util/generic/hash_set.h>
#include <util/generic/hash.h>
+#include <util/generic/hash_set.h>
+#include <util/generic/vector.h>
namespace NSQLReflect {
struct TLexerGrammar {
THashSet<TString> KeywordNames;
THashSet<TString> PunctuationNames;
- THashSet<TString> OtherNames;
+ TVector<TString> OtherNames;
THashMap<TString, TString> BlockByName;
};