author    vityaman <[email protected]>      2025-04-09 15:56:28 +0300
committer robot-piglet <[email protected]>  2025-04-09 16:23:45 +0300
commit    51016b5bd58ceae9cd9e56aaa4b52a0a12174221 (patch)
tree      8996776d97804c1e0f08e329abed7c638e2ce506 /yql/essentials/sql/v1/lexer
parent    8c02be7c9a260edf13714760e07bf560c803761a (diff)
YQL-19616 Fix regex lexer
Fixed regex lexer issues:
- `TSKIP` token recognition
- `HEXDIGITS` number recognition
- `EOF` token content

---

- Related to https://github.com/ydb-platform/ydb/issues/15129
- Related to https://github.com/vityaman/ydb/issues/11

---

Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1190
commit_hash:497c39efcbbe4e387da523b5e2c8abaa6485d93b
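For context on the set_longest_match(true) change in regex/lexer.cpp below: a minimal standalone RE2 sketch of why longest-match semantics matter for the `HEXDIGITS` fix. The DIGITS-like pattern here is a simplified, illustrative stand-in, not the lexer's actual regex.

#include <re2/re2.h>

#include <iostream>
#include <string>

int main() {
    // Simplified stand-in for a DIGITS-like rule, anchored at the start of the input.
    const char* pattern = "^([0-9]+|0x[0-9A-Fa-f]+)";

    RE2 firstMatch(pattern); // default RE2 semantics: leftmost-first

    RE2::Options options;
    options.set_longest_match(true);
    RE2 longestMatch(pattern, options); // POSIX-style leftmost-longest

    std::string token;

    RE2::PartialMatch("0xDEADbeef", firstMatch, &token);
    std::cout << token << "\n"; // "0" -- the [0-9]+ branch wins and stops at the 'x'

    RE2::PartialMatch("0xDEADbeef", longestMatch, &token);
    std::cout << token << "\n"; // "0xDEADbeef" -- the longest alternative wins

    return 0;
}

By default RE2 uses leftmost-first alternation (like PCRE), so the first viable branch wins; set_longest_match(true) switches to leftmost-longest semantics, which is what a maximal-munch tokenizer needs.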
Diffstat (limited to 'yql/essentials/sql/v1/lexer')
-rw-r--r--  yql/essentials/sql/v1/lexer/lexer_ut.cpp       | 12
-rw-r--r--  yql/essentials/sql/v1/lexer/regex/lexer.cpp    |  9
-rw-r--r--  yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp |  2
-rw-r--r--  yql/essentials/sql/v1/lexer/regex/regex_ut.cpp |  2
4 files changed, 17 insertions, 8 deletions
diff --git a/yql/essentials/sql/v1/lexer/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/lexer_ut.cpp
index 549dc9d8fa9..8f5b6d69e9b 100644
--- a/yql/essentials/sql/v1/lexer/lexer_ut.cpp
+++ b/yql/essentials/sql/v1/lexer/lexer_ut.cpp
@@ -28,7 +28,7 @@ using namespace NSQLTranslationV1;
TLexers Lexers = {
.Antlr3 = MakeAntlr3LexerFactory(),
- .Antlr3Ansi = MakeAntlr4AnsiLexerFactory(),
+ .Antlr3Ansi = MakeAntlr3AnsiLexerFactory(),
.Antlr4 = MakeAntlr4LexerFactory(),
.Antlr4Ansi = MakeAntlr4AnsiLexerFactory(),
.Antlr4Pure = MakeAntlr4PureLexerFactory(),
@@ -307,6 +307,11 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
UNIT_ASSERT_TOKENIZED(lexer, "INSERT", "INSERT EOF");
UNIT_ASSERT_TOKENIZED(lexer, "FROM", "FROM EOF");
UNIT_ASSERT_TOKENIZED(lexer, "from", "FROM(from) EOF");
+ if (ANTLR4 || FLAVOR == ELexerFlavor::Regex) {
+ UNIT_ASSERT_TOKENIZED(lexer, "sKip", "TSKIP(sKip) EOF");
+ } else {
+ UNIT_ASSERT_TOKENIZED(lexer, "sKip", "SKIP(sKip) EOF");
+ }
}
Y_UNIT_TEST_ON_EACH_LEXER(Punctuation) {
@@ -337,6 +342,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
UNIT_ASSERT_TOKENIZED(lexer, "123", "DIGITS(123) EOF");
UNIT_ASSERT_TOKENIZED(lexer, "123u", "INTEGER_VALUE(123u) EOF");
UNIT_ASSERT_TOKENIZED(lexer, "123ui", "INTEGER_VALUE(123ui) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "0xDEADbeef", "DIGITS(0xDEADbeef) EOF");
UNIT_ASSERT_TOKENIZED(lexer, "123.45", "REAL(123.45) EOF");
UNIT_ASSERT_TOKENIZED(lexer, "123.45E10", "REAL(123.45E10) EOF");
UNIT_ASSERT_TOKENIZED(lexer, "123.45E+10", "REAL(123.45E+10) EOF");
@@ -353,7 +359,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
if (!ANSI) {
UNIT_ASSERT_TOKENIZED(lexer, "\"\\\"\"", "STRING_VALUE(\"\\\"\") EOF");
UNIT_ASSERT_TOKENIZED(lexer, "\"\"\"\"", "STRING_VALUE(\"\") STRING_VALUE(\"\") EOF");
- } else {
+ } else if (ANTLR4 || FLAVOR == ELexerFlavor::Regex) {
UNIT_ASSERT_TOKENIZED(lexer, "\"\\\"\"", "[INVALID] STRING_VALUE(\"\\\") EOF");
UNIT_ASSERT_TOKENIZED(lexer, "\"\"\"\"", "STRING_VALUE(\"\"\"\") EOF");
}
@@ -387,7 +393,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
if (!ANSI) {
UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */", "COMMENT(/* /* yql */) EOF");
UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */ */", "COMMENT(/* /* yql */) WS( ) ASTERISK(*) SLASH(/) EOF");
- } else {
+ } else if (ANTLR4 || FLAVOR == ELexerFlavor::Regex) {
UNIT_ASSERT_TOKENIZED(lexer, "/* /* yql */", "COMMENT(/* /* yql */) EOF");
UNIT_ASSERT_TOKENIZED(lexer, "/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF");
UNIT_ASSERT_TOKENIZED(lexer, "/* /* /* yql */ */", "COMMENT(/* /* /* yql */ */) EOF");
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
index b8ca033b0c6..9f96e444ac7 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
@@ -28,11 +28,14 @@ namespace NSQLTranslationV1 {
: Grammar_(std::move(grammar))
, Ansi_(ansi)
{
+ RE2::Options custom;
+ custom.set_longest_match(true);
+
for (const auto& [token, regex] : RegexByOtherName) {
if (token == CommentTokenName) {
CommentRegex_.Reset(new RE2(regex));
} else {
- OtherRegexes_.emplace_back(token, new RE2(regex));
+ OtherRegexes_.emplace_back(token, new RE2(regex, custom));
}
}
}
@@ -62,7 +65,7 @@ namespace NSQLTranslationV1 {
onNextToken(std::move(matched));
}
- onNextToken(TParsedToken{.Name = "EOF"});
+ onNextToken(TParsedToken{.Name = "EOF", .Content = "<EOF>"});
return errors == 0;
}
@@ -110,7 +113,7 @@ namespace NSQLTranslationV1 {
size_t count = 0;
for (const auto& keyword : Grammar_.KeywordNames) {
const TStringBuf content = prefix.substr(0, keyword.length());
- if (AsciiEqualsIgnoreCase(content, keyword)) {
+ if (AsciiEqualsIgnoreCase(content, NSQLReflect::TLexerGrammar::KeywordBlock(keyword))) {
matches.emplace_back(keyword, TString(content));
count += 1;
}
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp
index 03c84bcffe3..6ac25008b34 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp
@@ -216,4 +216,4 @@ Y_UNIT_TEST_SUITE(RegexLexerTests) {
Check("\" SELECT", "[INVALID] WS( ) SELECT EOF");
}
-} // Y_UNIT_TEST_SUITE(RegexLexerTests) \ No newline at end of file
+} // Y_UNIT_TEST_SUITE(RegexLexerTests)
diff --git a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
index dad0b2ebd2d..8f22bda5886 100644
--- a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
@@ -93,4 +93,4 @@ Y_UNIT_TEST_SUITE(SqlRegexTests) {
Get(defaultRegexes, "COMMENT"));
}
-} // Y_UNIT_TEST_SUITE(SqlRegexTests) \ No newline at end of file
+} // Y_UNIT_TEST_SUITE(SqlRegexTests)