diff options
author | robot-piglet <[email protected]> | 2025-04-01 01:12:58 +0300 |
---|---|---|
committer | robot-piglet <[email protected]> | 2025-04-01 01:23:36 +0300 |
commit | f93076bbe93dd6ebb8d75a930268d30839b9011a (patch) | |
tree | dbfc5b2bea8bf16b1599a69f0f721a2acdc5dac2 /yql/essentials | |
parent | 2d512f78c593c3f4573742129c281d0fc5479de0 (diff) |
Intermediate changes
commit_hash:e57b3e95787cc8037f200f1b6b6073e35403b27e
Diffstat (limited to 'yql/essentials')
-rw-r--r-- | yql/essentials/sql/v1/lexer/lexer_ut.cpp | 319 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/lexer_ut.h | 37 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/lexer.cpp | 39 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/regex.cpp | 6 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/regex.h | 2 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/ut/ya.make | 3 | ||||
-rw-r--r-- | yql/essentials/sql/v1/reflect/sql_reflect.cpp | 2 | ||||
-rw-r--r-- | yql/essentials/sql/v1/reflect/sql_reflect.h | 5 |
8 files changed, 327 insertions, 86 deletions
diff --git a/yql/essentials/sql/v1/lexer/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/lexer_ut.cpp index 53cff6ffdc7..1ddfd04b507 100644 --- a/yql/essentials/sql/v1/lexer/lexer_ut.cpp +++ b/yql/essentials/sql/v1/lexer/lexer_ut.cpp @@ -1,36 +1,60 @@ #include "lexer.h" +#include "lexer_ut.h" #include <yql/essentials/core/issue/yql_issue.h> #include <yql/essentials/sql/settings/translation_settings.h> #include <yql/essentials/sql/v1/lexer/antlr3/lexer.h> +#include <yql/essentials/sql/v1/lexer/antlr3_ansi/lexer.h> #include <yql/essentials/sql/v1/lexer/antlr4/lexer.h> +#include <yql/essentials/sql/v1/lexer/antlr4_ansi/lexer.h> #include <yql/essentials/sql/v1/lexer/antlr4_pure/lexer.h> +#include <yql/essentials/sql/v1/lexer/antlr4_pure_ansi/lexer.h> #include <yql/essentials/sql/v1/lexer/regex/lexer.h> #include <library/cpp/testing/unittest/registar.h> +#include <util/string/ascii.h> +#include <util/random/random.h> + +#define UNIT_ASSERT_TOKENIZED(LEXER, QUERY, TOKENS) \ + do { \ + auto tokens = Tokenized((LEXER), (QUERY)); \ + UNIT_ASSERT_VALUES_EQUAL(tokens, (TOKENS)); \ + } while (false) + using namespace NSQLTranslation; using namespace NSQLTranslationV1; -std::pair<TParsedTokenList, NYql::TIssues> Tokenize(ILexer::TPtr& lexer, TString queryUtf8) { +TLexers Lexers = { + .Antlr3 = MakeAntlr3LexerFactory(), + .Antlr3Ansi = MakeAntlr4AnsiLexerFactory(), + .Antlr4 = MakeAntlr4LexerFactory(), + .Antlr4Ansi = MakeAntlr4AnsiLexerFactory(), + .Antlr4Pure = MakeAntlr4PureLexerFactory(), + .Antlr4PureAnsi = MakeAntlr4PureAnsiLexerFactory(), + .Regex = MakeRegexLexerFactory(/* ansi = */ false), + .RegexAnsi = MakeRegexLexerFactory(/* ansi = */ true), +}; + +std::pair<TParsedTokenList, NYql::TIssues> Tokenize(ILexer::TPtr& lexer, const TString& query) { TParsedTokenList tokens; NYql::TIssues issues; - Tokenize(*lexer, queryUtf8, "", tokens, issues, SQL_MAX_PARSER_ERRORS); + Tokenize(*lexer, query, "", tokens, issues, SQL_MAX_PARSER_ERRORS); return {tokens, issues}; } -TVector<TString> GetIssueMessages(ILexer::TPtr& lexer, TString queryUtf8) { +TVector<TString> GetIssueMessages(ILexer::TPtr& lexer, const TString& query) { TVector<TString> messages; - for (const auto& issue : Tokenize(lexer, queryUtf8).second) { + for (const auto& issue : Tokenize(lexer, query).second) { messages.emplace_back(issue.ToString(/* oneLine = */ true)); } return messages; } -TVector<TString> GetTokenViews(ILexer::TPtr& lexer, TString queryUtf8) { +TVector<TString> GetTokenViews(ILexer::TPtr& lexer, const TString& query) { TVector<TString> names; - for (auto& token : Tokenize(lexer, queryUtf8).first) { + for (auto& token : Tokenize(lexer, query).first) { TString view = std::move(token.Name); if (view == "ID_PLAIN" || view == "STRING_VALUE") { view.append(" ("); @@ -42,28 +66,58 @@ TVector<TString> GetTokenViews(ILexer::TPtr& lexer, TString queryUtf8) { return names; } -void AssertEquivialent(const TParsedToken& lhs, const TParsedToken& rhs) { - if (lhs.Name == "EOF" && rhs.Name == "EOF") { - return; +TString ToString(TParsedToken token) { + TString& string = token.Name; + if (!AsciiEqualsIgnoreCase(token.Name, token.Content) && token.Name != "EOF") { + string += "("; + string += token.Content; + string += ")"; + } + return string; +} + +TString Tokenized(ILexer::TPtr& lexer, const TString& query) { + TParsedTokenList tokens; + NYql::TIssues issues; + bool ok = Tokenize(*lexer, query, "Test", tokens, issues, SQL_MAX_PARSER_ERRORS); + + TString out; + if (!ok) { + out = "[INVALID] "; } - UNIT_ASSERT_VALUES_EQUAL(lhs.Name, rhs.Name); - UNIT_ASSERT_VALUES_EQUAL(lhs.Content, rhs.Content); - UNIT_ASSERT_VALUES_EQUAL(lhs.Line, rhs.Line); + for (auto& token : tokens) { + out += ToString(std::move(token)); + out += " "; + } + if (!out.empty()) { + out.pop_back(); + } + return out; } -void AssertEquivialent(const TParsedTokenList& lhs, const TParsedTokenList& rhs) { - UNIT_ASSERT_VALUES_EQUAL(lhs.size(), rhs.size()); - for (size_t i = 0; i < lhs.size(); ++i) { - AssertEquivialent(lhs.at(i), rhs.at(i)); +TString RandomMultilineCommentLikeText(size_t maxSize) { + auto size = RandomNumber<size_t>(maxSize); + TString comment; + for (size_t i = 0; i < size; ++i) { + if (auto /* isOpen */ _ = RandomNumber<bool>()) { + comment += "/*"; + } else { + comment += "*/"; + } + + for (int gap = RandomNumber<size_t>(2); gap > 0; --gap) { + comment += " "; + } } + return comment; } Y_UNIT_TEST_SUITE(SQLv1Lexer) { Y_UNIT_TEST(UnsupportedIssues) { NSQLTranslationV1::TLexers factories; - TVector<ILexer::TPtr> lexers; + TVector<ILexer::TPtr> lexers; for (auto ansi : {false, true}) { for (auto antlr4 : {false, true}) { for (auto flavor : {ELexerFlavor::Default, ELexerFlavor::Pure, ELexerFlavor::Regex}) { @@ -96,8 +150,8 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { UNIT_ASSERT_VALUES_EQUAL(actual, expected); } - Y_UNIT_TEST(AntlrVersionIndependent) { - const TVector<TString> queriesUtf8 = { + Y_UNIT_TEST_ON_EACH_LEXER(AntlrAndFlavorIndependent) { + static const TVector<TString> queries = { "", " ", "SELECT", @@ -115,35 +169,31 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { "\"select\"select", }; - NSQLTranslationV1::TLexers lexers; - lexers.Antlr3 = NSQLTranslationV1::MakeAntlr3LexerFactory(); - lexers.Antlr4 = NSQLTranslationV1::MakeAntlr4LexerFactory(); - lexers.Antlr4Pure = NSQLTranslationV1::MakeAntlr4PureLexerFactory(); - - auto lexer3 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false); - auto lexer4 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true); - auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, ELexerFlavor::Pure); - - for (const auto& query : queriesUtf8) { - auto [tokens3, issues3] = Tokenize(lexer3, query); - auto [tokens4, issues4] = Tokenize(lexer4, query); - auto [tokens4p, issues4p] = Tokenize(lexer4p, query); - AssertEquivialent(tokens3, tokens4); - AssertEquivialent(tokens3, tokens4p); - UNIT_ASSERT(issues3.Empty()); - UNIT_ASSERT(issues4.Empty()); - UNIT_ASSERT(issues4p.Empty()); + static TVector<TString> expectations(queries.size()); + + if (ANSI) { + return; + } + + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + + for (size_t i = 0; i < queries.size(); ++i) { + const auto& query = queries[i]; + auto& expected = expectations[i]; + + if (expected.empty()) { + expected = Tokenized(lexer, query); + return; + } + + UNIT_ASSERT_TOKENIZED(lexer, query, expected); } } TVector<TString> InvalidQueries(); void TestInvalidTokensSkipped(bool antlr4, const TVector<TVector<TString>>& expected) { - NSQLTranslationV1::TLexers lexers; - lexers.Antlr3 = NSQLTranslationV1::MakeAntlr3LexerFactory(); - lexers.Antlr4 = NSQLTranslationV1::MakeAntlr4LexerFactory(); - - auto lexer = MakeLexer(lexers, /* ansi = */ false, antlr4); + auto lexer = MakeLexer(Lexers, /* ansi = */ false, antlr4); auto input = InvalidQueries(); UNIT_ASSERT_VALUES_EQUAL(input.size(), expected.size()); @@ -198,16 +248,10 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { } Y_UNIT_TEST(IssuesCollected) { - NSQLTranslationV1::TLexers lexers; - lexers.Antlr3 = NSQLTranslationV1::MakeAntlr3LexerFactory(); - lexers.Antlr4 = NSQLTranslationV1::MakeAntlr4LexerFactory(); - lexers.Antlr4Pure = NSQLTranslationV1::MakeAntlr4PureLexerFactory(); - lexers.Regex = NSQLTranslationV1::MakeRegexLexerFactory(/* ansi = */ false); - - auto lexer3 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false); - auto lexer4 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true); - auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, ELexerFlavor::Pure); - auto lexerR = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false, ELexerFlavor::Regex); + auto lexer3 = MakeLexer(Lexers, /* ansi = */ false, /* antlr4 = */ false); + auto lexer4 = MakeLexer(Lexers, /* ansi = */ false, /* antlr4 = */ true); + auto lexer4p = MakeLexer(Lexers, /* ansi = */ false, /* antlr4 = */ true, ELexerFlavor::Pure); + auto lexerR = MakeLexer(Lexers, /* ansi = */ false, /* antlr4 = */ false, ELexerFlavor::Regex); for (const auto& query : InvalidQueries()) { auto issues3 = GetIssueMessages(lexer3, query); @@ -223,9 +267,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { } Y_UNIT_TEST(IssueMessagesAntlr3) { - NSQLTranslationV1::TLexers lexers; - lexers.Antlr3 = NSQLTranslationV1::MakeAntlr3LexerFactory(); - auto lexer3 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false); + auto lexer3 = MakeLexer(Lexers, /* ansi = */ false, /* antlr4 = */ false); auto actual = GetIssueMessages(lexer3, "\xF0\x9F\x98\x8A SELECT * FR"); @@ -240,10 +282,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { } Y_UNIT_TEST(IssueMessagesAntlr4) { - NSQLTranslationV1::TLexers lexers; - lexers.Antlr4 = NSQLTranslationV1::MakeAntlr4LexerFactory(); - - auto lexer4 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true); + auto lexer4 = MakeLexer(Lexers, /* ansi = */ false, /* antlr4 = */ true); auto actual = GetIssueMessages(lexer4, "\xF0\x9F\x98\x8A SELECT * FR"); @@ -253,4 +292,164 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { UNIT_ASSERT_VALUES_EQUAL(actual, expected); } -} + + Y_UNIT_TEST_ON_EACH_LEXER(Whitespace) { + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + UNIT_ASSERT_TOKENIZED(lexer, "", "EOF"); + UNIT_ASSERT_TOKENIZED(lexer, " ", "WS( ) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, " ", "WS( ) WS( ) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "\n", "WS(\n) EOF"); + } + + Y_UNIT_TEST_ON_EACH_LEXER(Keyword) { + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + UNIT_ASSERT_TOKENIZED(lexer, "SELECT", "SELECT EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "INSERT", "INSERT EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "FROM", "FROM EOF"); + } + + Y_UNIT_TEST_ON_EACH_LEXER(Punctuation) { + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + UNIT_ASSERT_TOKENIZED( + lexer, + "* / + - <|", + "ASTERISK(*) WS( ) SLASH(/) WS( ) " + "PLUS(+) WS( ) MINUS(-) WS( ) STRUCT_OPEN(<|) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "SELECT*FROM", "SELECT ASTERISK(*) FROM EOF"); + } + + Y_UNIT_TEST_ON_EACH_LEXER(IdPlain) { + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + UNIT_ASSERT_TOKENIZED(lexer, "variable my_table", "ID_PLAIN(variable) WS( ) ID_PLAIN(my_table) EOF"); + } + + Y_UNIT_TEST_ON_EACH_LEXER(IdQuoted) { + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + UNIT_ASSERT_TOKENIZED(lexer, "``", "ID_QUOTED(``) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "` `", "ID_QUOTED(` `) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "`local/table`", "ID_QUOTED(`local/table`) EOF"); + } + + Y_UNIT_TEST_ON_EACH_LEXER(Number) { + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + UNIT_ASSERT_TOKENIZED(lexer, "1", "DIGITS(1) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "123", "DIGITS(123) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "123u", "INTEGER_VALUE(123u) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "123ui", "INTEGER_VALUE(123ui) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "123.45", "REAL(123.45) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "123.45E10", "REAL(123.45E10) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "123.45E+10", "REAL(123.45E+10) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "1E+10", "REAL(1E+10) EOF"); + } + + Y_UNIT_TEST_ON_EACH_LEXER(SingleLineString) { + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + UNIT_ASSERT_TOKENIZED(lexer, "\"\"", "STRING_VALUE(\"\") EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "\' \'", "STRING_VALUE(\' \') EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "\" \"", "STRING_VALUE(\" \") EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "\"test\"", "STRING_VALUE(\"test\") EOF"); + + if (!ANSI) { + UNIT_ASSERT_TOKENIZED(lexer, "\"\\\"\"", "STRING_VALUE(\"\\\"\") EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "\"\"\"\"", "STRING_VALUE(\"\") STRING_VALUE(\"\") EOF"); + } else { + UNIT_ASSERT_TOKENIZED(lexer, "\"\\\"\"", "[INVALID] STRING_VALUE(\"\\\") EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "\"\"\"\"", "STRING_VALUE(\"\"\"\") EOF"); + } + } + + Y_UNIT_TEST_ON_EACH_LEXER(MultiLineString) { + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + UNIT_ASSERT_TOKENIZED(lexer, "@@@@", "STRING_VALUE(@@@@) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "@@ @@@", "STRING_VALUE(@@ @@@) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "@@test@@", "STRING_VALUE(@@test@@) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "@@line1\nline2@@", "STRING_VALUE(@@line1\nline2@@) EOF"); + } + + Y_UNIT_TEST_ON_EACH_LEXER(SingleLineComment) { + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + UNIT_ASSERT_TOKENIZED(lexer, "--yql", "COMMENT(--yql) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "-- yql ", "COMMENT(-- yql ) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "-- yql\nSELECT", "COMMENT(-- yql\n) SELECT EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "-- yql --", "COMMENT(-- yql --) EOF"); + } + + Y_UNIT_TEST_ON_EACH_LEXER(MultiLineComment) { + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + UNIT_ASSERT_TOKENIZED(lexer, "/* yql */", "COMMENT(/* yql */) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "/* yql\n * yql\n */", "COMMENT(/* yql\n * yql\n */) EOF"); + } + + Y_UNIT_TEST_ON_EACH_LEXER(RecursiveMultiLineComment) { + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + if (!ANSI) { + UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */", "COMMENT(/* /* yql */) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */ */", "COMMENT(/* /* yql */) WS( ) ASTERISK(*) SLASH(/) EOF"); + } else { + UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */", "COMMENT(/* /* yql */) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "/* /* /* yql */ */", "COMMENT(/* /* /* yql */ */) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */ */ */", "COMMENT(/* /* yql */ */) WS( ) ASTERISK(*) SLASH(/) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */ */", "COMMENT(/* /* yql */ */) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "/*/*/*/", "COMMENT(/*/*/) ASTERISK(*) SLASH(/) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "/*/**/*/*/*/", "COMMENT(/*/**/*/) ASTERISK(*) SLASH(/) ASTERISK(*) SLASH(/) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "/* /* */ a /* /* */", "COMMENT(/* /* */ a /* /* */) EOF"); + } + } + + Y_UNIT_TEST_ON_EACH_LEXER(RandomRecursiveMultiLineComment) { + if (!ANTLR4 && FLAVOR != ELexerFlavor::Regex || FLAVOR != ELexerFlavor::Pure) { + return; + } + + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + auto reference = MakeLexer(Lexers, ANSI, /* antlr4 = */ true, ELexerFlavor::Pure); + + SetRandomSeed(100); + for (size_t i = 0; i < 512; ++i) { + auto input = RandomMultilineCommentLikeText(/* maxSize = */ 32); + TString actual = Tokenized(lexer, input); + TString expected = Tokenized(reference, input); + + UNIT_ASSERT_VALUES_EQUAL_C(actual, expected, "Input: " << input); + } + } + + Y_UNIT_TEST_ON_EACH_LEXER(SimpleQuery) { + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + UNIT_ASSERT_TOKENIZED(lexer, "select 1", "SELECT WS( ) DIGITS(1) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "SELect 1", "SELECT WS( ) DIGITS(1) EOF"); + } + + Y_UNIT_TEST_ON_EACH_LEXER(ComplexQuery) { + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + + TString query = + "SELECT\n" + " 123467,\n" + " \"Hello, {name}!\",\n" + " (1 + (5U * 1 / 0)),\n" + " MIN(identifier),\n" + " Bool(field),\n" + " Math::Sin(var)\n" + "FROM `local/test/space/table`\n" + "JOIN test;"; + + TString expected = + "SELECT WS(\n) " + "WS( ) WS( ) DIGITS(123467) COMMA(,) WS(\n) " + "WS( ) WS( ) STRING_VALUE(\"Hello, {name}!\") COMMA(,) WS(\n) " + "WS( ) WS( ) LPAREN(() DIGITS(1) WS( ) PLUS(+) WS( ) LPAREN(() INTEGER_VALUE(5U) WS( ) " + "ASTERISK(*) WS( ) DIGITS(1) WS( ) SLASH(/) WS( ) DIGITS(0) RPAREN()) " + "RPAREN()) COMMA(,) WS(\n) " + "WS( ) WS( ) ID_PLAIN(MIN) LPAREN(() ID_PLAIN(identifier) RPAREN()) COMMA(,) WS(\n) " + "WS( ) WS( ) ID_PLAIN(Bool) LPAREN(() ID_PLAIN(field) RPAREN()) COMMA(,) WS(\n) " + "WS( ) WS( ) ID_PLAIN(Math) NAMESPACE(::) ID_PLAIN(Sin) LPAREN(() ID_PLAIN(var) RPAREN()) WS(\n) " + "FROM WS( ) ID_QUOTED(`local/test/space/table`) WS(\n) " + "JOIN WS( ) ID_PLAIN(test) SEMICOLON(;) EOF"; + + UNIT_ASSERT_TOKENIZED(lexer, query, expected); + } + +} // Y_UNIT_TEST_SUITE(SQLv1Lexer) diff --git a/yql/essentials/sql/v1/lexer/lexer_ut.h b/yql/essentials/sql/v1/lexer/lexer_ut.h new file mode 100644 index 00000000000..b4304eb7070 --- /dev/null +++ b/yql/essentials/sql/v1/lexer/lexer_ut.h @@ -0,0 +1,37 @@ +#pragma once + +#include "lexer.h" + +#define LEXER_NAME_ANSI_false_ANTLR4_false_FLAVOR_Default "antlr3" +#define LEXER_NAME_ANSI_false_ANTLR4_true_FLAVOR_Default "antlr4" +#define LEXER_NAME_ANSI_true_ANTLR4_false_FLAVOR_Default "antlr3_ansi" +#define LEXER_NAME_ANSI_true_ANTLR4_true_FLAVOR_Default "antlr4_ansi" +#define LEXER_NAME_ANSI_false_ANTLR4_true_FLAVOR_Pure "antlr4_pure" +#define LEXER_NAME_ANSI_true_ANTLR4_true_FLAVOR_Pure "antlr4_pure_ansi" +#define LEXER_NAME_ANSI_false_ANTLR4_false_FLAVOR_Regex "regex" +#define LEXER_NAME_ANSI_true_ANTLR4_false_FLAVOR_Regex "regex_ansi" + +#define Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, ANSI, ANTLR4, FLAVOR) \ + TCurrentTest::AddTest( \ + #N "::" LEXER_NAME_ANSI_##ANSI##_ANTLR4_##ANTLR4##_FLAVOR_##FLAVOR, \ + static_cast<void (*)(NUnitTest::TTestContext&)>(&N<ANSI, ANTLR4, ELexerFlavor::FLAVOR>), \ + /* forceFork = */ false) + +#define Y_UNIT_TEST_ON_EACH_LEXER(N) \ + template <bool ANSI, bool ANTLR4, ELexerFlavor FLAVOR> \ + void N(NUnitTest::TTestContext&); \ + struct TTestRegistration##N { \ + TTestRegistration##N() { \ + Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, false, false, Default); \ + Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, false, true, Default); \ + Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, true, false, Default); \ + Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, true, true, Default); \ + Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, false, true, Pure); \ + Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, true, true, Pure); \ + Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, false, false, Regex); \ + Y_UNIT_TEST_ON_EACH_LEXER_ADD_TEST(N, true, false, Regex); \ + } \ + }; \ + static TTestRegistration##N testRegistration##N; \ + template <bool ANSI, bool ANTLR4, ELexerFlavor FLAVOR> \ + void N(NUnitTest::TTestContext&) diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp index 1c8f2104a48..b0b5c2dad44 100644 --- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp +++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp @@ -10,6 +10,7 @@ #include <util/generic/algorithm.h> #include <util/generic/string.h> #include <util/string/subst.h> +#include <util/string/ascii.h> namespace NSQLTranslationV1 { @@ -23,15 +24,15 @@ namespace NSQLTranslationV1 { TRegexLexer( bool ansi, NSQLReflect::TLexerGrammar grammar, - const THashMap<TString, TString>& RegexByOtherNameMap) + const TVector<std::tuple<TString, TString>>& RegexByOtherName) : Grammar_(std::move(grammar)) , Ansi_(ansi) { - for (auto& [token, regex] : RegexByOtherNameMap) { + for (const auto& [token, regex] : RegexByOtherName) { if (token == CommentTokenName) { CommentRegex_.Reset(new RE2(regex)); } else { - OtherRegexes_.emplace(std::move(token), std::move(regex)); + OtherRegexes_.emplace_back(token, new RE2(regex)); } } } @@ -71,27 +72,27 @@ namespace NSQLTranslationV1 { size_t keywordCount = MatchKeyword(prefix, matches); MatchPunctuation(prefix, matches); - size_t otherCount = MatchRegex(prefix, matches); + MatchRegex(prefix, matches); MatchComment(prefix, matches); - auto max = MaxElementBy(matches, [](const TParsedToken& m) { - return m.Content.length(); - }); - - if (max == std::end(matches)) { + if (matches.empty()) { return {}; } + auto maxLength = MaxElementBy(matches, [](const TParsedToken& m) { + return m.Content.length(); + })->Content.length(); + + auto max = FindIf(matches, [&](const TParsedToken& m) { + return m.Content.length() == maxLength; + }); + auto isMatched = [&](const TStringBuf name) { return std::end(matches) != FindIf(matches, [&](const auto& m) { return m.Name == name; }); }; - Y_ENSURE( - otherCount <= 1 || - (otherCount == 2 && isMatched("DIGITS") && isMatched("INTEGER_VALUE"))); - size_t conflicts = CountIf(matches, [&](const TParsedToken& m) { return m.Content.length() == max->Content.length(); }); @@ -108,7 +109,7 @@ namespace NSQLTranslationV1 { bool MatchKeyword(const TStringBuf prefix, TParsedTokenList& matches) { size_t count = 0; for (const auto& keyword : Grammar_.KeywordNames) { - if (prefix.substr(0, keyword.length()) == keyword) { + if (AsciiEqualsIgnoreCase(prefix.substr(0, keyword.length()), keyword)) { matches.emplace_back(keyword, keyword); count += 1; } @@ -131,7 +132,7 @@ namespace NSQLTranslationV1 { size_t MatchRegex(const TStringBuf prefix, TParsedTokenList& matches) { size_t count = 0; for (const auto& [token, regex] : OtherRegexes_) { - if (const TStringBuf match = TryMatchRegex(prefix, regex); !match.empty()) { + if (const TStringBuf match = TryMatchRegex(prefix, *regex); !match.empty()) { matches.emplace_back(token, TString(match)); count += 1; } @@ -216,7 +217,7 @@ namespace NSQLTranslationV1 { } NSQLReflect::TLexerGrammar Grammar_; - THashMap<TString, RE2> OtherRegexes_; + TVector<std::tuple<TString, THolder<RE2>>> OtherRegexes_; THolder<RE2> CommentRegex_; bool Ansi_; }; @@ -228,19 +229,19 @@ namespace NSQLTranslationV1 { explicit TFactory(bool ansi) : Ansi_(ansi) , Grammar_(NSQLReflect::LoadLexerGrammar()) - , RegexByOtherNameMap_(MakeRegexByOtherNameMap(Grammar_, Ansi_)) + , RegexByOtherName_(MakeRegexByOtherName(Grammar_, Ansi_)) { } NSQLTranslation::ILexer::TPtr MakeLexer() const override { return NSQLTranslation::ILexer::TPtr( - new TRegexLexer(Ansi_, Grammar_, RegexByOtherNameMap_)); + new TRegexLexer(Ansi_, Grammar_, RegexByOtherName_)); } private: bool Ansi_; NSQLReflect::TLexerGrammar Grammar_; - THashMap<TString, TString> RegexByOtherNameMap_; + TVector<std::tuple<TString, TString>> RegexByOtherName_; }; } // namespace diff --git a/yql/essentials/sql/v1/lexer/regex/regex.cpp b/yql/essentials/sql/v1/lexer/regex/regex.cpp index a8aca8a1318..937d21572fc 100644 --- a/yql/essentials/sql/v1/lexer/regex/regex.cpp +++ b/yql/essentials/sql/v1/lexer/regex/regex.cpp @@ -227,12 +227,12 @@ namespace NSQLTranslationV1 { TRewriteRule UnwrapQuotedSpace_; }; - THashMap<TString, TString> MakeRegexByOtherNameMap(const NSQLReflect::TLexerGrammar& grammar, bool ansi) { + TVector<std::tuple<TString, TString>> MakeRegexByOtherName(const NSQLReflect::TLexerGrammar& grammar, bool ansi) { TLexerGrammarToRegexTranslator translator(grammar, ansi); - THashMap<TString, TString> regexes; + TVector<std::tuple<TString, TString>> regexes; for (const auto& token : grammar.OtherNames) { - regexes.emplace(token, translator.ToRegex(token)); + regexes.emplace_back(token, translator.ToRegex(token)); } return regexes; } diff --git a/yql/essentials/sql/v1/lexer/regex/regex.h b/yql/essentials/sql/v1/lexer/regex/regex.h index 9e29c3df25b..1e9d92b6535 100644 --- a/yql/essentials/sql/v1/lexer/regex/regex.h +++ b/yql/essentials/sql/v1/lexer/regex/regex.h @@ -8,7 +8,7 @@ namespace NSQLTranslationV1 { // Makes regexes only for tokens from OtherNames, // as keywords and punctuation are trivially matched. - THashMap<TString, TString> MakeRegexByOtherNameMap( + TVector<std::tuple<TString, TString>> MakeRegexByOtherName( const NSQLReflect::TLexerGrammar& grammar, bool ansi); } // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/lexer/ut/ya.make b/yql/essentials/sql/v1/lexer/ut/ya.make index 7e62fb50c85..87cb156cd93 100644 --- a/yql/essentials/sql/v1/lexer/ut/ya.make +++ b/yql/essentials/sql/v1/lexer/ut/ya.make @@ -4,8 +4,11 @@ PEERDIR( yql/essentials/core/issue yql/essentials/parser/lexer_common yql/essentials/sql/v1/lexer/antlr3 + yql/essentials/sql/v1/lexer/antlr3_ansi yql/essentials/sql/v1/lexer/antlr4 + yql/essentials/sql/v1/lexer/antlr4_ansi yql/essentials/sql/v1/lexer/antlr4_pure + yql/essentials/sql/v1/lexer/antlr4_pure_ansi yql/essentials/sql/v1/lexer/regex ) diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.cpp b/yql/essentials/sql/v1/reflect/sql_reflect.cpp index f47f35cb9de..c0af06e0b46 100644 --- a/yql/essentials/sql/v1/reflect/sql_reflect.cpp +++ b/yql/essentials/sql/v1/reflect/sql_reflect.cpp @@ -134,7 +134,7 @@ namespace NSQLReflect { auto [name, block] = ParseLexerRule(std::move(line)); if (!name.StartsWith(FragmentPrefix)) { - grammar.OtherNames.emplace(name); + grammar.OtherNames.emplace_back(name); } SubstGlobal(name, FragmentPrefix, ""); diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.h b/yql/essentials/sql/v1/reflect/sql_reflect.h index 5225a3c996b..ca398706873 100644 --- a/yql/essentials/sql/v1/reflect/sql_reflect.h +++ b/yql/essentials/sql/v1/reflect/sql_reflect.h @@ -1,15 +1,16 @@ #pragma once #include <util/generic/string.h> -#include <util/generic/hash_set.h> #include <util/generic/hash.h> +#include <util/generic/hash_set.h> +#include <util/generic/vector.h> namespace NSQLReflect { struct TLexerGrammar { THashSet<TString> KeywordNames; THashSet<TString> PunctuationNames; - THashSet<TString> OtherNames; + TVector<TString> OtherNames; THashMap<TString, TString> BlockByName; }; |