summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: vityaman <[email protected]> 2025-04-11 17:00:21 +0300
committer: robot-piglet <[email protected]> 2025-04-11 17:16:43 +0300
commit: 35a2668d661e1293630350bf07ad4d297ae338ee (patch)
tree: 584795bf9334ed7cd6b16e98b102bd92c118472f
parent: ffa276d35af2cc7ab17b850915674b60900c8b56 (diff)
YQL-19616 Fix lexer/regex STRING_VALUE and TSKIP recognition
- Related to https://github.com/ydb-platform/ydb/issues/15129
- Related to https://github.com/vityaman/ydb/issues/11

---

Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1201
commit_hash:53ef677a35649a6dc77d8c4269a8aceefcd15026
-rw-r--r-- yql/essentials/sql/v1/lexer/lexer_ut.cpp 29
-rw-r--r-- yql/essentials/sql/v1/lexer/regex/lexer.cpp 19
-rw-r--r-- yql/essentials/sql/v1/reflect/sql_reflect.cpp 2
3 files changed, 40 insertions, 10 deletions
diff --git a/yql/essentials/sql/v1/lexer/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/lexer_ut.cpp
index 8f5b6d69e9b..92395f5237b 100644
--- a/yql/essentials/sql/v1/lexer/lexer_ut.cpp
+++ b/yql/essentials/sql/v1/lexer/lexer_ut.cpp
@@ -307,10 +307,19 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
UNIT_ASSERT_TOKENIZED(lexer, "INSERT", "INSERT EOF");
UNIT_ASSERT_TOKENIZED(lexer, "FROM", "FROM EOF");
UNIT_ASSERT_TOKENIZED(lexer, "from", "FROM(from) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, " UPSERT ", "WS( ) UPSERT WS( ) EOF");
+ }
+
+ Y_UNIT_TEST_ON_EACH_LEXER(KeywordSkip) {
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
if (ANTLR4 || FLAVOR == ELexerFlavor::Regex) {
UNIT_ASSERT_TOKENIZED(lexer, "sKip", "TSKIP(sKip) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "SKIP", "TSKIP(SKIP) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, " SKIP ", "WS( ) TSKIP(SKIP) WS( ) EOF");
} else {
UNIT_ASSERT_TOKENIZED(lexer, "sKip", "SKIP(sKip) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "SKIP", "SKIP EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, " SKIP ", "WS( ) SKIP WS( ) EOF");
}
}
@@ -371,6 +380,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
UNIT_ASSERT_TOKENIZED(lexer, "@@ @@@", "STRING_VALUE(@@ @@@) EOF");
UNIT_ASSERT_TOKENIZED(lexer, "@@test@@", "STRING_VALUE(@@test@@) EOF");
UNIT_ASSERT_TOKENIZED(lexer, "@@line1\nline2@@", "STRING_VALUE(@@line1\nline2@@) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "@@@@ @@A@@ @@@A@@", "STRING_VALUE(@@@@) WS( ) STRING_VALUE(@@A@@) WS( ) STRING_VALUE(@@@A@@) EOF");
}
Y_UNIT_TEST_ON_EACH_LEXER(SingleLineComment) {
@@ -414,8 +424,8 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
auto reference = MakeLexer(Lexers, ANSI, /* antlr4 = */ true, ELexerFlavor::Pure);
SetRandomSeed(100);
- for (size_t i = 0; i < 512; ++i) {
- auto input = RandomMultilineCommentLikeText(/* maxSize = */ 32);
+ for (size_t i = 0; i < 128; ++i) {
+ auto input = RandomMultilineCommentLikeText(/* maxSize = */ 16);
TString actual = Tokenized(lexer, input);
TString expected = Tokenized(reference, input);
@@ -459,4 +469,19 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
UNIT_ASSERT_TOKENIZED(lexer, query, expected);
}
+ Y_UNIT_TEST_ON_EACH_LEXER(Examples) {
+ auto lexer = MakeLexer(Lexers, ANSI, ANTLR4, FLAVOR);
+ UNIT_ASSERT_TOKENIZED(
+ lexer,
+ R"(
+SELECT
+ YQL::@@(Uint32 '100500)@@,
+ YQL::@@(String '[WAT])@@
+;)",
+ "WS(\n) "
+ "SELECT WS(\n) WS( ) ID_PLAIN(YQL) NAMESPACE(::) STRING_VALUE(@@(Uint32 '100500)@@) COMMA(,) WS(\n) "
+ "WS( ) ID_PLAIN(YQL) NAMESPACE(::) STRING_VALUE(@@(String '[WAT])@@) WS(\n) "
+ "SEMICOLON(;) EOF");
+ }
+
} // Y_UNIT_TEST_SUITE(SQLv1Lexer)
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
index 9f96e444ac7..820cbebf235 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
@@ -19,6 +19,7 @@ namespace NSQLTranslationV1 {
class TRegexLexer: public NSQLTranslation::ILexer {
static constexpr const char* CommentTokenName = "COMMENT";
+ static constexpr const char* StringValueName = "STRING_VALUE";
public:
TRegexLexer(
@@ -28,14 +29,17 @@ namespace NSQLTranslationV1 {
: Grammar_(std::move(grammar))
, Ansi_(ansi)
{
- RE2::Options custom;
- custom.set_longest_match(true);
-
for (const auto& [token, regex] : RegexByOtherName) {
+ RE2::Options custom;
+ if (token != CommentTokenName && token != StringValueName) {
+ custom.set_longest_match(true);
+ }
+
+ RE2* re2 = new RE2(regex, custom);
if (token == CommentTokenName) {
- CommentRegex_.Reset(new RE2(regex));
+ CommentRegex_.Reset(re2);
} else {
- OtherRegexes_.emplace_back(token, new RE2(regex, custom));
+ OtherRegexes_.emplace_back(token, re2);
}
}
}
@@ -112,8 +116,9 @@ namespace NSQLTranslationV1 {
bool MatchKeyword(const TStringBuf prefix, TParsedTokenList& matches) {
size_t count = 0;
for (const auto& keyword : Grammar_.KeywordNames) {
- const TStringBuf content = prefix.substr(0, keyword.length());
- if (AsciiEqualsIgnoreCase(content, NSQLReflect::TLexerGrammar::KeywordBlock(keyword))) {
+ const TStringBuf block = NSQLReflect::TLexerGrammar::KeywordBlock(keyword);
+ const TStringBuf content = prefix.substr(0, block.length());
+ if (AsciiEqualsIgnoreCase(content, block)) {
matches.emplace_back(keyword, TString(content));
count += 1;
}
diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.cpp b/yql/essentials/sql/v1/reflect/sql_reflect.cpp
index 22fda0a33cd..262209cfc39 100644
--- a/yql/essentials/sql/v1/reflect/sql_reflect.cpp
+++ b/yql/essentials/sql/v1/reflect/sql_reflect.cpp
@@ -133,7 +133,7 @@ namespace NSQLReflect {
SubstGlobal(block, "'", "");
SubstGlobal(block, " ", "");
- Y_ENSURE(name == block || (name == "TSKIP" && block == TLexerGrammar::KeywordBlock("SKIP")));
+ Y_ENSURE(name == block || (name == "TSKIP" && block == TLexerGrammar::KeywordBlock("TSKIP")));
grammar.KeywordNames.emplace(std::move(name));
}