diff options
author | vityaman <[email protected]> | 2025-04-11 17:00:21 +0300 |
---|---|---|
committer | robot-piglet <[email protected]> | 2025-04-11 17:16:43 +0300 |
commit | 35a2668d661e1293630350bf07ad4d297ae338ee (patch) | |
tree | 584795bf9334ed7cd6b16e98b102bd92c118472f | |
parent | ffa276d35af2cc7ab17b850915674b60900c8b56 (diff) |
YQL-19616 Fix lexer/regex STRING_VALUE and TSKIP recognition
- Related to https://github.com/ydb-platform/ydb/issues/15129
- Related to https://github.com/vityaman/ydb/issues/11
---
Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1201
commit_hash:53ef677a35649a6dc77d8c4269a8aceefcd15026
-rw-r--r-- | yql/essentials/sql/v1/lexer/lexer_ut.cpp | 29 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/lexer.cpp | 19 | ||||
-rw-r--r-- | yql/essentials/sql/v1/reflect/sql_reflect.cpp | 2 |
3 files changed, 40 insertions, 10 deletions
diff --git a/yql/essentials/sql/v1/lexer/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/lexer_ut.cpp index 8f5b6d69e9b..92395f5237b 100644 --- a/yql/essentials/sql/v1/lexer/lexer_ut.cpp +++ b/yql/essentials/sql/v1/lexer/lexer_ut.cpp @@ -307,10 +307,19 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { UNIT_ASSERT_TOKENIZED(lexer, "INSERT", "INSERT EOF"); UNIT_ASSERT_TOKENIZED(lexer, "FROM", "FROM EOF"); UNIT_ASSERT_TOKENIZED(lexer, "from", "FROM(from) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, " UPSERT ", "WS( ) UPSERT WS( ) EOF"); + } + + Y_UNIT_TEST_ON_EACH_LEXER(KeywordSkip) { + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); if (ANTLR4 || FLAVOR == ELexerFlavor::Regex) { UNIT_ASSERT_TOKENIZED(lexer, "sKip", "TSKIP(sKip) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "SKIP", "TSKIP(SKIP) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, " SKIP ", "WS( ) TSKIP(SKIP) WS( ) EOF"); } else { UNIT_ASSERT_TOKENIZED(lexer, "sKip", "SKIP(sKip) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "SKIP", "SKIP EOF"); + UNIT_ASSERT_TOKENIZED(lexer, " SKIP ", "WS( ) SKIP WS( ) EOF"); } } @@ -371,6 +380,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { UNIT_ASSERT_TOKENIZED(lexer, "@@ @@@", "STRING_VALUE(@@ @@@) EOF"); UNIT_ASSERT_TOKENIZED(lexer, "@@test@@", "STRING_VALUE(@@test@@) EOF"); UNIT_ASSERT_TOKENIZED(lexer, "@@line1\nline2@@", "STRING_VALUE(@@line1\nline2@@) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "@@@@ @@A@@ @@@A@@", "STRING_VALUE(@@@@) WS( ) STRING_VALUE(@@A@@) WS( ) STRING_VALUE(@@@A@@) EOF"); } Y_UNIT_TEST_ON_EACH_LEXER(SingleLineComment) { @@ -414,8 +424,8 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { auto reference = MakeLexer(Lexers, ANSI, /* antlr4 = */ true, ELexerFlavor::Pure); SetRandomSeed(100); - for (size_t i = 0; i < 512; ++i) { - auto input = RandomMultilineCommentLikeText(/* maxSize = */ 32); + for (size_t i = 0; i < 128; ++i) { + auto input = RandomMultilineCommentLikeText(/* maxSize = */ 16); TString actual = Tokenized(lexer, input); TString expected = Tokenized(reference, input); @@ -459,4 +469,19 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { UNIT_ASSERT_TOKENIZED(lexer, query, expected); } + Y_UNIT_TEST_ON_EACH_LEXER(Examples) { + auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR); + UNIT_ASSERT_TOKENIZED( + lexer, + R"( +SELECT + YQL::@@(Uint32 '100500)@@, + YQL::@@(String '[WAT])@@ +;)", + "WS(\n) " + "SELECT WS(\n) WS( ) ID_PLAIN(YQL) NAMESPACE(::) STRING_VALUE(@@(Uint32 '100500)@@) COMMA(,) WS(\n) " + "WS( ) ID_PLAIN(YQL) NAMESPACE(::) STRING_VALUE(@@(String '[WAT])@@) WS(\n) " + "SEMICOLON(;) EOF"); + } + } // Y_UNIT_TEST_SUITE(SQLv1Lexer) diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp index 9f96e444ac7..820cbebf235 100644 --- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp +++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp @@ -19,6 +19,7 @@ namespace NSQLTranslationV1 { class TRegexLexer: public NSQLTranslation::ILexer { static constexpr const char* CommentTokenName = "COMMENT"; + static constexpr const char* StringValueName = "STRING_VALUE"; public: TRegexLexer( @@ -28,14 +29,17 @@ namespace NSQLTranslationV1 { : Grammar_(std::move(grammar)) , Ansi_(ansi) { - RE2::Options custom; - custom.set_longest_match(true); - for (const auto& [token, regex] : RegexByOtherName) { + RE2::Options custom; + if (token != CommentTokenName && token != StringValueName) { + custom.set_longest_match(true); + } + + RE2* re2 = new RE2(regex, custom); if (token == CommentTokenName) { - CommentRegex_.Reset(new RE2(regex)); + CommentRegex_.Reset(re2); } else { - OtherRegexes_.emplace_back(token, new RE2(regex, custom)); + OtherRegexes_.emplace_back(token, re2); } } } @@ -112,8 +116,9 @@ namespace NSQLTranslationV1 { bool MatchKeyword(const TStringBuf prefix, TParsedTokenList& matches) { size_t count = 0; for (const auto& keyword : Grammar_.KeywordNames) { - const TStringBuf content = prefix.substr(0, keyword.length()); - if (AsciiEqualsIgnoreCase(content, NSQLReflect::TLexerGrammar::KeywordBlock(keyword))) { + const TStringBuf block = NSQLReflect::TLexerGrammar::KeywordBlock(keyword); + const TStringBuf content = prefix.substr(0, block.length()); + if (AsciiEqualsIgnoreCase(content, block)) { matches.emplace_back(keyword, TString(content)); count += 1; } diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.cpp b/yql/essentials/sql/v1/reflect/sql_reflect.cpp index 22fda0a33cd..262209cfc39 100644 --- a/yql/essentials/sql/v1/reflect/sql_reflect.cpp +++ b/yql/essentials/sql/v1/reflect/sql_reflect.cpp @@ -133,7 +133,7 @@ namespace NSQLReflect { SubstGlobal(block, "'", ""); SubstGlobal(block, " ", ""); - Y_ENSURE(name == block || (name == "TSKIP" && block == TLexerGrammar::KeywordBlock("SKIP"))); + Y_ENSURE(name == block || (name == "TSKIP" && block == TLexerGrammar::KeywordBlock("TSKIP"))); grammar.KeywordNames.emplace(std::move(name)); } |