diff options
author | vityaman <[email protected]> | 2025-04-09 15:56:28 +0300 |
---|---|---|
committer | robot-piglet <[email protected]> | 2025-04-09 16:23:45 +0300 |
commit | 51016b5bd58ceae9cd9e56aaa4b52a0a12174221 (patch) | |
tree | 8996776d97804c1e0f08e329abed7c638e2ce506 /yql/essentials/sql/v1 | |
parent | 8c02be7c9a260edf13714760e07bf560c803761a (diff) |
YQL-19616 Fix regex lexer
Fixed regex lexer issues:
- `TSKIP` token recognition
- `HEXDIGITS` number recognition
- `EOF` token content
---
- Related to https://github.com/ydb-platform/ydb/issues/15129
- Related to https://github.com/vityaman/ydb/issues/11
---
Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1190
commit_hash:497c39efcbbe4e387da523b5e2c8abaa6485d93b
Diffstat (limited to 'yql/essentials/sql/v1')
-rw-r--r-- | yql/essentials/sql/v1/lexer/lexer_ut.cpp | 12 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/lexer.cpp | 9 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp | 2 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/regex_ut.cpp | 2 | ||||
-rw-r--r-- | yql/essentials/sql/v1/reflect/sql_reflect.cpp | 9 | ||||
-rw-r--r-- | yql/essentials/sql/v1/reflect/sql_reflect.h | 2 |
6 files changed, 27 insertions, 9 deletions
diff --git a/yql/essentials/sql/v1/lexer/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/lexer_ut.cpp index 549dc9d8fa9..8f5b6d69e9b 100644 --- a/yql/essentials/sql/v1/lexer/lexer_ut.cpp +++ b/yql/essentials/sql/v1/lexer/lexer_ut.cpp @@ -28,7 +28,7 @@ using namespace NSQLTranslationV1; TLexers Lexers = { .Antlr3 = MakeAntlr3LexerFactory(), - .Antlr3Ansi = MakeAntlr4AnsiLexerFactory(), + .Antlr3Ansi = MakeAntlr3AnsiLexerFactory(), .Antlr4 = MakeAntlr4LexerFactory(), .Antlr4Ansi = MakeAntlr4AnsiLexerFactory(), .Antlr4Pure = MakeAntlr4PureLexerFactory(), @@ -307,6 +307,11 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { UNIT_ASSERT_TOKENIZED(lexer, "INSERT", "INSERT EOF"); UNIT_ASSERT_TOKENIZED(lexer, "FROM", "FROM EOF"); UNIT_ASSERT_TOKENIZED(lexer, "from", "FROM(from) EOF"); + if (ANTLR4 || FLAVOR == ELexerFlavor::Regex) { + UNIT_ASSERT_TOKENIZED(lexer, "sKip", "TSKIP(sKip) EOF"); + } else { + UNIT_ASSERT_TOKENIZED(lexer, "sKip", "SKIP(sKip) EOF"); + } } Y_UNIT_TEST_ON_EACH_LEXER(Punctuation) { @@ -337,6 +342,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { UNIT_ASSERT_TOKENIZED(lexer, "123", "DIGITS(123) EOF"); UNIT_ASSERT_TOKENIZED(lexer, "123u", "INTEGER_VALUE(123u) EOF"); UNIT_ASSERT_TOKENIZED(lexer, "123ui", "INTEGER_VALUE(123ui) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "0xDEADbeef", "DIGITS(0xDEADbeef) EOF"); UNIT_ASSERT_TOKENIZED(lexer, "123.45", "REAL(123.45) EOF"); UNIT_ASSERT_TOKENIZED(lexer, "123.45E10", "REAL(123.45E10) EOF"); UNIT_ASSERT_TOKENIZED(lexer, "123.45E+10", "REAL(123.45E+10) EOF"); @@ -353,7 +359,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { if (!ANSI) { UNIT_ASSERT_TOKENIZED(lexer, "\"\\\"\"", "STRING_VALUE(\"\\\"\") EOF"); UNIT_ASSERT_TOKENIZED(lexer, "\"\"\"\"", "STRING_VALUE(\"\") STRING_VALUE(\"\") EOF"); - } else { + } else if (ANTLR4 || FLAVOR == ELexerFlavor::Regex) { UNIT_ASSERT_TOKENIZED(lexer, "\"\\\"\"", "[INVALID] STRING_VALUE(\"\\\") EOF"); UNIT_ASSERT_TOKENIZED(lexer, "\"\"\"\"", "STRING_VALUE(\"\"\"\") EOF"); } @@ -387,7 +393,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { if 
(!ANSI) { UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */", "COMMENT(/* /* yql */) EOF"); UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */ */", "COMMENT(/* /* yql */) WS( ) ASTERISK(*) SLASH(/) EOF"); - } else { + } else if (ANTLR4 || FLAVOR == ELexerFlavor::Regex) { UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */", "COMMENT(/* /* yql */) EOF"); UNIT_ASSERT_TOKENIZED(lexer, "/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF"); UNIT_ASSERT_TOKENIZED(lexer, "/* /* /* yql */ */", "COMMENT(/* /* /* yql */ */) EOF"); diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp index b8ca033b0c6..9f96e444ac7 100644 --- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp +++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp @@ -28,11 +28,14 @@ namespace NSQLTranslationV1 { : Grammar_(std::move(grammar)) , Ansi_(ansi) { + RE2::Options custom; + custom.set_longest_match(true); + for (const auto& [token, regex] : RegexByOtherName) { if (token == CommentTokenName) { CommentRegex_.Reset(new RE2(regex)); } else { - OtherRegexes_.emplace_back(token, new RE2(regex)); + OtherRegexes_.emplace_back(token, new RE2(regex, custom)); } } } @@ -62,7 +65,7 @@ namespace NSQLTranslationV1 { onNextToken(std::move(matched)); } - onNextToken(TParsedToken{.Name = "EOF"}); + onNextToken(TParsedToken{.Name = "EOF", .Content = "<EOF>"}); return errors == 0; } @@ -110,7 +113,7 @@ namespace NSQLTranslationV1 { size_t count = 0; for (const auto& keyword : Grammar_.KeywordNames) { const TStringBuf content = prefix.substr(0, keyword.length()); - if (AsciiEqualsIgnoreCase(content, keyword)) { + if (AsciiEqualsIgnoreCase(content, NSQLReflect::TLexerGrammar::KeywordBlock(keyword))) { matches.emplace_back(keyword, TString(content)); count += 1; } diff --git a/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp index 03c84bcffe3..6ac25008b34 100644 --- a/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp +++ 
b/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp @@ -216,4 +216,4 @@ Y_UNIT_TEST_SUITE(RegexLexerTests) { Check("\" SELECT", "[INVALID] WS( ) SELECT EOF"); } -} // Y_UNIT_TEST_SUITE(RegexLexerTests)
\ No newline at end of file +} // Y_UNIT_TEST_SUITE(RegexLexerTests) diff --git a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp index dad0b2ebd2d..8f22bda5886 100644 --- a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp +++ b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp @@ -93,4 +93,4 @@ Y_UNIT_TEST_SUITE(SqlRegexTests) { Get(defaultRegexes, "COMMENT")); } -} // Y_UNIT_TEST_SUITE(SqlRegexTests)
\ No newline at end of file +} // Y_UNIT_TEST_SUITE(SqlRegexTests) diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.cpp b/yql/essentials/sql/v1/reflect/sql_reflect.cpp index c0af06e0b46..22fda0a33cd 100644 --- a/yql/essentials/sql/v1/reflect/sql_reflect.cpp +++ b/yql/essentials/sql/v1/reflect/sql_reflect.cpp @@ -15,6 +15,13 @@ namespace NSQLReflect { const TStringBuf SectionOther = "//! section:other"; const TStringBuf FragmentPrefix = "fragment "; + const TStringBuf TLexerGrammar::KeywordBlock(const TStringBuf name) { + if (name == "TSKIP") { + return "SKIP"; + } + return name; + } + TVector<TString> GetResourceLines(const TStringBuf key) { TString text; Y_ENSURE(NResource::FindExact(key, &text)); @@ -126,7 +133,7 @@ namespace NSQLReflect { SubstGlobal(block, "'", ""); SubstGlobal(block, " ", ""); - Y_ENSURE(name == block || (name == "TSKIP" && block == "SKIP")); + Y_ENSURE(name == block || (name == "TSKIP" && block == TLexerGrammar::KeywordBlock("SKIP"))); grammar.KeywordNames.emplace(std::move(name)); } diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.h b/yql/essentials/sql/v1/reflect/sql_reflect.h index ca398706873..1f67a2f93a3 100644 --- a/yql/essentials/sql/v1/reflect/sql_reflect.h +++ b/yql/essentials/sql/v1/reflect/sql_reflect.h @@ -12,6 +12,8 @@ namespace NSQLReflect { THashSet<TString> PunctuationNames; TVector<TString> OtherNames; THashMap<TString, TString> BlockByName; + + static const TStringBuf KeywordBlock(const TStringBuf name); }; TLexerGrammar LoadLexerGrammar(); |