diff options
author | vityaman <[email protected]> | 2025-03-28 18:29:24 +0300 |
---|---|---|
committer | robot-piglet <[email protected]> | 2025-03-28 18:50:04 +0300 |
commit | 60b99f11bcb2386c2a1c36ffd2e96e69d0105dac (patch) | |
tree | 08c15d732484c6accf16658b09ed8f07286a9338 /yql/essentials/sql | |
parent | 1e214be59cbf130bee433c422b42f16148e5acff (diff) |
YQL-19616 Convert YQL lexer grammar to regexes
- [x] Parse YQL grammar to extract lexer grammar into `TLexerGrammar`.
- [x] Translate `TLexerGrammar` into regexes.
- [x] Implement a regex-based lexer, `TRegexLexer`, to test the validity of the generated regexes.
- [x] Test on `Default` syntax mode.
- [x] Test on `ANSI` syntax mode.
---
- Related to https://github.com/ydb-platform/ydb/issues/15129
- Requirement for https://github.com/ytsaurus/ytsaurus/pull/1112
---
Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1127
commit_hash:03ffffe81cdafe7f93a4d3fd9a3212fe67f1c72d
Diffstat (limited to 'yql/essentials/sql')
20 files changed, 1263 insertions, 64 deletions
diff --git a/yql/essentials/sql/v1/SQLv1Antlr4.g.in b/yql/essentials/sql/v1/SQLv1Antlr4.g.in index fb92a68f9ab..5c59ab61ea4 100644 --- a/yql/essentials/sql/v1/SQLv1Antlr4.g.in +++ b/yql/essentials/sql/v1/SQLv1Antlr4.g.in @@ -1775,9 +1775,7 @@ bool_value: (TRUE | FALSE); real: REAL; integer: DIGITS | INTEGER_VALUE; -// -// Lexer -// +//! section:punctuation EQUALS: '='; EQUALS2: '=='; @@ -1823,6 +1821,8 @@ fragment QUOTE_SINGLE: '\''; fragment BACKTICK: '`'; fragment DOUBLE_COMMAT: '@@'; +//! section:letter + // http://www.antlr.org/wiki/pages/viewpage.action?pageId=1782 fragment A:('a'|'A'); fragment B:('b'|'B'); @@ -1851,6 +1851,8 @@ fragment X:('x'|'X'); fragment Y:('y'|'Y'); fragment Z:('z'|'Z'); +//! section:keyword + ABORT: A B O R T; ACTION: A C T I O N; ADD: A D D; @@ -2144,13 +2146,7 @@ WRAPPER: W R A P P E R; //WRITE: W R I T E; XOR: X O R; -// YQL Default Lexer: -// GRAMMAR_STRING_CORE_SINGLE = ~(QUOTE_SINGLE | BACKSLASH) | (BACKSLASH .) -// GRAMMAR_STRING_CORE_DOUBLE = ~(QUOTE_DOUBLE | BACKSLASH) | (BACKSLASH .) - -// ANSI Lexer: -// GRAMMAR_STRING_CORE_SINGLE = ~QUOTE_SINGLE | (QUOTE_SINGLE QUOTE_SINGLE) -// GRAMMAR_STRING_CORE_DOUBLE = ~QUOTE_DOUBLE | (QUOTE_DOUBLE QUOTE_DOUBLE) +//! section:other fragment STRING_CORE_SINGLE: @GRAMMAR_STRING_CORE_SINGLE@; fragment STRING_CORE_DOUBLE: @GRAMMAR_STRING_CORE_DOUBLE@; @@ -2163,7 +2159,7 @@ STRING_VALUE: ((STRING_SINGLE | STRING_DOUBLE | STRING_MULTILINE) (S | U | Y | J ID_PLAIN: ('a'..'z' | 'A'..'Z' | '_') ('a'..'z' | 'A'..'Z' | '_' | DIGIT)*; -fragment ID_QUOTED_CORE: '\\'. | '``' | ~('`' | '\\'); +fragment ID_QUOTED_CORE: '\\' . | '``' | ~('`' | '\\'); ID_QUOTED: BACKTICK ID_QUOTED_CORE* BACKTICK; fragment DIGIT: '0'..'9'; @@ -2177,23 +2173,18 @@ DIGITS: DECDIGITS | HEXDIGITS | OCTDIGITS | BINDIGITS; // not all combinations of P/U with L/S/T/I/B/N are actually valid - this is resolved in sql.cpp INTEGER_VALUE: DIGITS ((P | U)? (L | S | T | I | B | N)?); -fragment FLOAT_EXP : E (PLUS | MINUS)? 
DECDIGITS ; +fragment FLOAT_EXP: E (PLUS | MINUS)? DECDIGITS; REAL: ( DECDIGITS DOT DIGIT* FLOAT_EXP? | DECDIGITS FLOAT_EXP // | DOT DECDIGITS FLOAT_EXP? // Conflicts with tuple element access through DOT - ) (F | P (F ('4'|'8') | N)?)? + ) (F | P (F ('4' | '8') | N)?)? ; BLOB: X QUOTE_SINGLE HEXDIGIT+ QUOTE_SINGLE; -// YQL Default Lexer: -// GRAMMAR_MULTILINE_COMMENT_CORE = . -// ANSI Lexer: -// GRAMMAR_MULTILINE_COMMENT_CORE = MULTILINE_COMMENT | . - fragment MULTILINE_COMMENT: '/*' ( @GRAMMAR_MULTILINE_COMMENT_CORE@ )*? '*/'; -fragment LINE_COMMENT: '--' ~('\n'|'\r')* ('\r' '\n'? | '\n' | EOF); -WS: (' '|'\r'|'\t'|'\u000C'|'\n')->channel(HIDDEN); -COMMENT: (MULTILINE_COMMENT|LINE_COMMENT)->channel(HIDDEN); +fragment LINE_COMMENT: '--' ~('\n' | '\r')* ('\r' '\n'? | '\n' | EOF); +WS: (' ' | '\r' | '\t' | '\u000C' | '\n') -> channel(HIDDEN); +COMMENT: (MULTILINE_COMMENT | LINE_COMMENT) -> channel(HIDDEN); diff --git a/yql/essentials/sql/v1/complete/sql_complete.cpp b/yql/essentials/sql/v1/complete/sql_complete.cpp index 53cb4ada420..753d0a2835c 100644 --- a/yql/essentials/sql/v1/complete/sql_complete.cpp +++ b/yql/essentials/sql/v1/complete/sql_complete.cpp @@ -125,7 +125,9 @@ namespace NSQLComplete { INameService::TPtr names = MakeStaticNameService(MakeDefaultNameSet()); return MakeSqlCompletionEngine([lexers = std::move(lexers)](bool ansi) { - return NSQLTranslationV1::MakeLexer(lexers, ansi, /* antlr4 = */ true, /* pure = */ true); + return NSQLTranslationV1::MakeLexer( + lexers, ansi, /* antlr4 = */ true, + NSQLTranslationV1::ELexerFlavor::Pure); }, std::move(names)); } diff --git a/yql/essentials/sql/v1/complete/sql_complete_ut.cpp b/yql/essentials/sql/v1/complete/sql_complete_ut.cpp index 4fb6dfea587..aa242d313cb 100644 --- a/yql/essentials/sql/v1/complete/sql_complete_ut.cpp +++ b/yql/essentials/sql/v1/complete/sql_complete_ut.cpp @@ -43,7 +43,9 @@ Y_UNIT_TEST_SUITE(SqlCompleteTests) { lexers.Antlr4Pure = NSQLTranslationV1::MakeAntlr4PureLexerFactory(); 
lexers.Antlr4PureAnsi = NSQLTranslationV1::MakeAntlr4PureAnsiLexerFactory(); return [lexers = std::move(lexers)](bool ansi) { - return NSQLTranslationV1::MakeLexer(lexers, ansi, /* antlr4 = */ true, /* pure = */ true); + return NSQLTranslationV1::MakeLexer( + lexers, ansi, /* antlr4 = */ true, + NSQLTranslationV1::ELexerFlavor::Pure); }; } diff --git a/yql/essentials/sql/v1/lexer/lexer.cpp b/yql/essentials/sql/v1/lexer/lexer.cpp index 5621cc65d7b..88ced55ccf4 100644 --- a/yql/essentials/sql/v1/lexer/lexer.cpp +++ b/yql/essentials/sql/v1/lexer/lexer.cpp @@ -11,6 +11,7 @@ #include <util/string/ascii.h> #include <util/string/builder.h> #include <util/string/strip.h> +#include <util/string/join.h> #if defined(_tsan_enabled_) #include <util/system/mutex.h> @@ -29,8 +30,8 @@ using NSQLTranslation::MakeDummyLexerFactory; class TV1Lexer : public ILexer { public: - explicit TV1Lexer(const TLexers& lexers, bool ansi, bool antlr4, bool pure) - : Factory(GetFactory(lexers, ansi, antlr4, pure)) + explicit TV1Lexer(const TLexers& lexers, bool ansi, bool antlr4, ELexerFlavor flavor) + : Factory(GetFactory(lexers, ansi, antlr4, flavor)) { } @@ -42,52 +43,70 @@ public: } private: - static NSQLTranslation::TLexerFactoryPtr GetFactory(const TLexers& lexers, bool ansi, bool antlr4, bool pure = false) { - if (!ansi && !antlr4 && !pure) { - if (lexers.Antlr3) { - return lexers.Antlr3; - } - return MakeDummyLexerFactory("antlr3"); - } else if (ansi && !antlr4 && !pure) { - if (lexers.Antlr3Ansi) { - return lexers.Antlr3Ansi; - } - return MakeDummyLexerFactory("antlr3_ansi"); - } else if (!ansi && antlr4 && !pure) { - if (lexers.Antlr4) { - return lexers.Antlr4; - } - return MakeDummyLexerFactory("antlr4"); - } else if (ansi && antlr4 && !pure) { - if (lexers.Antlr4Ansi) { - return lexers.Antlr4Ansi; - } - return MakeDummyLexerFactory("antlr4_ansi"); - } else if (!ansi && antlr4 && pure) { - if (lexers.Antlr4Pure) { - return lexers.Antlr4Pure; - } - return 
MakeDummyLexerFactory("antlr4_pure"); - } else if (ansi && antlr4 && pure) { - if (lexers.Antlr4PureAnsi) { - return lexers.Antlr4PureAnsi; - } - return MakeDummyLexerFactory("antlr4_pure_ansi"); - } else if (!ansi && !antlr4 && pure) { - return MakeDummyLexerFactory("antlr3_pure"); + static NSQLTranslation::TLexerFactoryPtr GetFactory(const TLexers& lexers, bool ansi, bool antlr4, ELexerFlavor flavor) { + if (auto ptr = GetMaybeFactory(lexers, ansi, antlr4, flavor)) { + return ptr; + } + return MakeDummyLexerFactory(GetLexerName(ansi, antlr4, flavor)); + } + + static NSQLTranslation::TLexerFactoryPtr GetMaybeFactory(const TLexers& lexers, bool ansi, bool antlr4, ELexerFlavor flavor) { + if (!ansi && !antlr4 && flavor == ELexerFlavor::Default) { + return lexers.Antlr3; + } else if (ansi && !antlr4 && flavor == ELexerFlavor::Default) { + return lexers.Antlr3Ansi; + } else if (!ansi && antlr4 && flavor == ELexerFlavor::Default) { + return lexers.Antlr4; + } else if (ansi && antlr4 && flavor == ELexerFlavor::Default) { + return lexers.Antlr4Ansi; + } else if (!ansi && antlr4 && flavor == ELexerFlavor::Pure) { + return lexers.Antlr4Pure; + } else if (ansi && antlr4 && flavor == ELexerFlavor::Pure) { + return lexers.Antlr4PureAnsi; + } else if (!ansi && !antlr4 && flavor == ELexerFlavor::Regex) { + return lexers.Regex; + } else if (ansi && !antlr4 && flavor == ELexerFlavor::Regex) { + return lexers.RegexAnsi; } else { - return MakeDummyLexerFactory("antlr3_pure_ansi"); + return nullptr; } } + static TString GetLexerName(bool ansi, bool antlr4, ELexerFlavor flavor) { + TVector<const TStringBuf> parts; + + if (antlr4) { + parts.emplace_back("antlr4"); + } else if (!antlr4 && flavor != ELexerFlavor::Regex) { + parts.emplace_back("antlr3"); + } + + switch (flavor) { + case ELexerFlavor::Default: { + } break; + case ELexerFlavor::Pure: { + parts.emplace_back("pure"); + } break; + case ELexerFlavor::Regex: { + parts.emplace_back("regex"); + } break; + } + + if (ansi) { + 
parts.emplace_back("ansi"); + } + + return JoinSeq("_", parts); + } + private: NSQLTranslation::TLexerFactoryPtr Factory; }; } // namespace -NSQLTranslation::ILexer::TPtr MakeLexer(const TLexers& lexers, bool ansi, bool antlr4, bool pure) { - return NSQLTranslation::ILexer::TPtr(new TV1Lexer(lexers, ansi, antlr4, pure)); +NSQLTranslation::ILexer::TPtr MakeLexer(const TLexers& lexers, bool ansi, bool antlr4, ELexerFlavor flavor) { + return NSQLTranslation::ILexer::TPtr(new TV1Lexer(lexers, ansi, antlr4, flavor)); } bool IsProbablyKeyword(const NSQLTranslation::TParsedToken& token) { diff --git a/yql/essentials/sql/v1/lexer/lexer.h b/yql/essentials/sql/v1/lexer/lexer.h index 1cc8566fcf6..226e8b6ed25 100644 --- a/yql/essentials/sql/v1/lexer/lexer.h +++ b/yql/essentials/sql/v1/lexer/lexer.h @@ -11,9 +11,18 @@ struct TLexers { NSQLTranslation::TLexerFactoryPtr Antlr4Ansi; NSQLTranslation::TLexerFactoryPtr Antlr4Pure; NSQLTranslation::TLexerFactoryPtr Antlr4PureAnsi; + NSQLTranslation::TLexerFactoryPtr Regex; + NSQLTranslation::TLexerFactoryPtr RegexAnsi; }; -NSQLTranslation::ILexer::TPtr MakeLexer(const TLexers& lexers, bool ansi, bool antlr4, bool pure = false); +enum class ELexerFlavor { + Default, + Pure, + Regex, +}; + +NSQLTranslation::ILexer::TPtr MakeLexer( + const TLexers& lexers, bool ansi, bool antlr4, ELexerFlavor flavor = ELexerFlavor::Default); // "Probably" because YQL keyword can be an identifier // depending on a query context. 
For example diff --git a/yql/essentials/sql/v1/lexer/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/lexer_ut.cpp index 3ad01f631b6..53cff6ffdc7 100644 --- a/yql/essentials/sql/v1/lexer/lexer_ut.cpp +++ b/yql/essentials/sql/v1/lexer/lexer_ut.cpp @@ -6,6 +6,7 @@ #include <yql/essentials/sql/v1/lexer/antlr3/lexer.h> #include <yql/essentials/sql/v1/lexer/antlr4/lexer.h> #include <yql/essentials/sql/v1/lexer/antlr4_pure/lexer.h> +#include <yql/essentials/sql/v1/lexer/regex/lexer.h> #include <library/cpp/testing/unittest/registar.h> @@ -59,6 +60,42 @@ void AssertEquivialent(const TParsedTokenList& lhs, const TParsedTokenList& rhs) } Y_UNIT_TEST_SUITE(SQLv1Lexer) { + Y_UNIT_TEST(UnsupportedIssues) { + NSQLTranslationV1::TLexers factories; + + TVector<ILexer::TPtr> lexers; + for (auto ansi : {false, true}) { + for (auto antlr4 : {false, true}) { + for (auto flavor : {ELexerFlavor::Default, ELexerFlavor::Pure, ELexerFlavor::Regex}) { + lexers.emplace_back(MakeLexer(factories, ansi, antlr4, flavor)); + } + } + } + + TVector<TString> actual; + for (auto& lexer : lexers) { + auto issues = GetIssueMessages(lexer, ""); + actual.emplace_back(std::move(issues.at(0))); + } + + TVector<TString> expected = { + "<main>: Error: Lexer antlr3 is not supported", + "<main>: Error: Lexer antlr3_pure is not supported", + "<main>: Error: Lexer regex is not supported", + "<main>: Error: Lexer antlr4 is not supported", + "<main>: Error: Lexer antlr4_pure is not supported", + "<main>: Error: Lexer antlr4_regex is not supported", + "<main>: Error: Lexer antlr3_ansi is not supported", + "<main>: Error: Lexer antlr3_pure_ansi is not supported", + "<main>: Error: Lexer regex_ansi is not supported", + "<main>: Error: Lexer antlr4_ansi is not supported", + "<main>: Error: Lexer antlr4_pure_ansi is not supported", + "<main>: Error: Lexer antlr4_regex_ansi is not supported", + }; + + UNIT_ASSERT_VALUES_EQUAL(actual, expected); + } + Y_UNIT_TEST(AntlrVersionIndependent) { const TVector<TString> queriesUtf8 = 
{ "", @@ -85,7 +122,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { auto lexer3 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false); auto lexer4 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true); - auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, /* pure = */ true); + auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, ELexerFlavor::Pure); for (const auto& query : queriesUtf8) { auto [tokens3, issues3] = Tokenize(lexer3, query); @@ -164,19 +201,24 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { NSQLTranslationV1::TLexers lexers; lexers.Antlr3 = NSQLTranslationV1::MakeAntlr3LexerFactory(); lexers.Antlr4 = NSQLTranslationV1::MakeAntlr4LexerFactory(); + lexers.Antlr4Pure = NSQLTranslationV1::MakeAntlr4PureLexerFactory(); + lexers.Regex = NSQLTranslationV1::MakeRegexLexerFactory(/* ansi = */ false); auto lexer3 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false); auto lexer4 = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true); - auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, /* pure = */ true); + auto lexer4p = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ true, ELexerFlavor::Pure); + auto lexerR = MakeLexer(lexers, /* ansi = */ false, /* antlr4 = */ false, ELexerFlavor::Regex); for (const auto& query : InvalidQueries()) { auto issues3 = GetIssueMessages(lexer3, query); auto issues4 = GetIssueMessages(lexer4, query); auto issues4p = GetIssueMessages(lexer4p, query); + auto issuesR = GetIssueMessages(lexerR, query); UNIT_ASSERT(!issues3.empty()); UNIT_ASSERT(!issues4.empty()); UNIT_ASSERT(!issues4p.empty()); + UNIT_ASSERT(!issuesR.empty()); } } diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp new file mode 100644 index 00000000000..1c8f2104a48 --- /dev/null +++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp @@ -0,0 +1,252 @@ +#include "lexer.h" + +#include "regex.h" + +#include <contrib/libs/re2/re2/re2.h> + +#include 
<yql/essentials/core/issue/yql_issue.h> +#include <yql/essentials/sql/v1/reflect/sql_reflect.h> + +#include <util/generic/algorithm.h> +#include <util/generic/string.h> +#include <util/string/subst.h> + +namespace NSQLTranslationV1 { + + using NSQLTranslation::TParsedToken; + using NSQLTranslation::TParsedTokenList; + + class TRegexLexer: public NSQLTranslation::ILexer { + static constexpr const char* CommentTokenName = "COMMENT"; + + public: + TRegexLexer( + bool ansi, + NSQLReflect::TLexerGrammar grammar, + const THashMap<TString, TString>& RegexByOtherNameMap) + : Grammar_(std::move(grammar)) + , Ansi_(ansi) + { + for (auto& [token, regex] : RegexByOtherNameMap) { + if (token == CommentTokenName) { + CommentRegex_.Reset(new RE2(regex)); + } else { + OtherRegexes_.emplace(std::move(token), std::move(regex)); + } + } + } + + bool Tokenize( + const TString& query, + const TString& queryName, + const TTokenCallback& onNextToken, + NYql::TIssues& issues, + size_t maxErrors) override { + size_t errors = 0; + for (size_t pos = 0; pos < query.size();) { + TParsedToken matched = Match(TStringBuf(query, pos)); + + if (matched.Name.empty() && maxErrors == errors) { + break; + } + + if (matched.Name.empty()) { + pos += 1; + errors += 1; + issues.AddIssue(NYql::TPosition(pos, 0, queryName), "no candidates"); + continue; + } + + pos += matched.Content.length(); + onNextToken(std::move(matched)); + } + + onNextToken(TParsedToken{.Name = "EOF"}); + return errors == 0; + } + + private: + TParsedToken Match(const TStringBuf prefix) { + TParsedTokenList matches; + + size_t keywordCount = MatchKeyword(prefix, matches); + MatchPunctuation(prefix, matches); + size_t otherCount = MatchRegex(prefix, matches); + MatchComment(prefix, matches); + + auto max = MaxElementBy(matches, [](const TParsedToken& m) { + return m.Content.length(); + }); + + if (max == std::end(matches)) { + return {}; + } + + auto isMatched = [&](const TStringBuf name) { + return std::end(matches) != FindIf(matches, 
[&](const auto& m) { + return m.Name == name; + }); + }; + + Y_ENSURE( + otherCount <= 1 || + (otherCount == 2 && isMatched("DIGITS") && isMatched("INTEGER_VALUE"))); + + size_t conflicts = CountIf(matches, [&](const TParsedToken& m) { + return m.Content.length() == max->Content.length(); + }); + conflicts -= 1; + Y_ENSURE( + conflicts == 0 || + (conflicts == 1 && keywordCount != 0 && isMatched("ID_PLAIN")) || + (conflicts == 1 && isMatched("DIGITS") && isMatched("INTEGER_VALUE"))); + + Y_ENSURE(!max->Content.empty()); + return *max; + } + + bool MatchKeyword(const TStringBuf prefix, TParsedTokenList& matches) { + size_t count = 0; + for (const auto& keyword : Grammar_.KeywordNames) { + if (prefix.substr(0, keyword.length()) == keyword) { + matches.emplace_back(keyword, keyword); + count += 1; + } + } + return count; + } + + size_t MatchPunctuation(const TStringBuf prefix, TParsedTokenList& matches) { + size_t count = 0; + for (const auto& name : Grammar_.PunctuationNames) { + const auto& content = Grammar_.BlockByName.at(name); + if (prefix.substr(0, content.length()) == content) { + matches.emplace_back(name, content); + count += 1; + } + } + return count; + } + + size_t MatchRegex(const TStringBuf prefix, TParsedTokenList& matches) { + size_t count = 0; + for (const auto& [token, regex] : OtherRegexes_) { + if (const TStringBuf match = TryMatchRegex(prefix, regex); !match.empty()) { + matches.emplace_back(token, TString(match)); + count += 1; + } + } + return count; + } + + const TStringBuf TryMatchRegex(const TStringBuf prefix, const RE2& regex) { + re2::StringPiece input(prefix.data(), prefix.size()); + if (RE2::Consume(&input, regex)) { + return TStringBuf(prefix.data(), input.data()); + } + return ""; + } + + size_t MatchComment(const TStringBuf prefix, TParsedTokenList& matches) { + const TStringBuf reContent = TryMatchRegex(prefix, *CommentRegex_); + if (reContent.empty()) { + return 0; + } + + if (!(Ansi_ && prefix.StartsWith("/*"))) { + 
matches.emplace_back(CommentTokenName, TString(reContent)); + return 1; + } + + size_t ll1Length = MatchANSIMultilineComment(prefix); + const TStringBuf ll1Content = prefix.SubString(0, ll1Length); + + Y_ENSURE(ll1Content == 0 || reContent <= ll1Content); + if (ll1Content == 0) { + matches.emplace_back(CommentTokenName, TString(reContent)); + return 1; + } + + matches.emplace_back(CommentTokenName, TString(ll1Content)); + return 1; + } + + size_t MatchANSIMultilineComment(TStringBuf remaining) { + if (!remaining.StartsWith("/*")) { + return 0; + } + + size_t skipped = 0; + + remaining.Skip(2); + skipped += 2; + + for (;;) { + if (remaining.StartsWith("*/")) { + remaining.Skip(2); + skipped += 2; + return skipped; + } + + bool isSkipped = false; + if (remaining.StartsWith("/*")) { + size_t limit = remaining.rfind("*/"); + if (limit == std::string::npos) { + return 0; + } + + size_t len = MatchANSIMultilineComment(remaining.Head(limit)); + remaining.Skip(len); + skipped += len; + + isSkipped = len != 0; + } + + if (isSkipped) { + continue; + } + + if (remaining.size() == 0) { + return 0; + } + + remaining.Skip(1); + skipped += 1; + } + } + + NSQLReflect::TLexerGrammar Grammar_; + THashMap<TString, RE2> OtherRegexes_; + THolder<RE2> CommentRegex_; + bool Ansi_; + }; + + namespace { + + class TFactory final: public NSQLTranslation::ILexerFactory { + public: + explicit TFactory(bool ansi) + : Ansi_(ansi) + , Grammar_(NSQLReflect::LoadLexerGrammar()) + , RegexByOtherNameMap_(MakeRegexByOtherNameMap(Grammar_, Ansi_)) + { + } + + NSQLTranslation::ILexer::TPtr MakeLexer() const override { + return NSQLTranslation::ILexer::TPtr( + new TRegexLexer(Ansi_, Grammar_, RegexByOtherNameMap_)); + } + + private: + bool Ansi_; + NSQLReflect::TLexerGrammar Grammar_; + THashMap<TString, TString> RegexByOtherNameMap_; + }; + + } // namespace + + NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi) { + return NSQLTranslation::TLexerFactoryPtr(new TFactory(ansi)); + } + +} // 
namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.h b/yql/essentials/sql/v1/lexer/regex/lexer.h new file mode 100644 index 00000000000..e9968954e1f --- /dev/null +++ b/yql/essentials/sql/v1/lexer/regex/lexer.h @@ -0,0 +1,9 @@ +#pragma once + +#include <yql/essentials/parser/lexer_common/lexer.h> + +namespace NSQLTranslationV1 { + + NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi); + +} // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp new file mode 100644 index 00000000000..ae0d018e42d --- /dev/null +++ b/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp @@ -0,0 +1,219 @@ +#include "lexer.h" + +#include <yql/essentials/public/issue/yql_issue.h> +#include <yql/essentials/sql/settings/translation_settings.h> +#include <yql/essentials/sql/v1/lexer/lexer.h> +#include <yql/essentials/sql/v1/lexer/antlr4_pure_ansi/lexer.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/random/random.h> + +using namespace NSQLTranslationV1; +using NSQLTranslation::SQL_MAX_PARSER_ERRORS; +using NSQLTranslation::Tokenize; +using NSQLTranslation::TParsedToken; +using NSQLTranslation::TParsedTokenList; +using NYql::TIssues; + +TLexers Lexers = { + .Antlr4PureAnsi = MakeAntlr4PureAnsiLexerFactory(), + .Regex = MakeRegexLexerFactory(/* ansi = */ false), + .RegexAnsi = MakeRegexLexerFactory(/* ansi = */ true), +}; + +auto PureAnsiLexer = MakeLexer( + Lexers, /* ansi = */ true, /* antlr4 = */ true, ELexerFlavor::Pure); + +auto DefaultLexer = MakeLexer( + Lexers, /* ansi = */ false, /* antlr4 = */ false, ELexerFlavor::Regex); + +auto AnsiLexer = MakeLexer( + Lexers, /* ansi = */ true, /* antlr4 = */ false, ELexerFlavor::Regex); + +TString ToString(TParsedToken token) { + TString& string = token.Name; + if (token.Name != token.Content && token.Name != "EOF") { + string += "("; + string += token.Content; + string += ")"; + } + return string; 
+} + +TString Tokenized(NSQLTranslation::ILexer& lexer, const TString& query) { + TParsedTokenList tokens; + TIssues issues; + bool ok = Tokenize(lexer, query, "Test", tokens, issues, SQL_MAX_PARSER_ERRORS); + + TString out; + if (!ok) { + out = "[INVALID] "; + } + + for (auto& token : tokens) { + out += ToString(std::move(token)); + out += " "; + } + if (!out.empty()) { + out.pop_back(); + } + return out; +} + +TString RandomMultilineCommentLikeText(size_t maxSize) { + auto size = RandomNumber<size_t>(maxSize); + TString comment; + for (size_t i = 0; i < size; ++i) { + if (auto /* isOpen */ _ = RandomNumber<bool>()) { + comment += "/*"; + } else { + comment += "*/"; + } + + for (int gap = RandomNumber<size_t>(2); gap > 0; --gap) { + comment += " "; + } + } + return comment; +} + +void Check(TString input, TString expected, bool ansi) { + auto* lexer = DefaultLexer.Get(); + if (ansi) { + lexer = AnsiLexer.Get(); + } + UNIT_ASSERT_VALUES_EQUAL(Tokenized(*lexer, input), expected); +} + +void Check(TString input, TString expected) { + Check(input, expected, /* ansi = */ false); + Check(input, expected, /* ansi = */ true); +} + +Y_UNIT_TEST_SUITE(RegexLexerTests) { + Y_UNIT_TEST(Whitespace) { + Check("", "EOF"); + Check(" ", "WS( ) EOF"); + Check(" ", "WS( ) WS( ) EOF"); + Check("\n", "WS(\n) EOF"); + } + + Y_UNIT_TEST(SinleLineComment) { + Check("--yql", "COMMENT(--yql) EOF"); + Check("-- yql ", "COMMENT(-- yql ) EOF"); + Check("-- yql\nSELECT", "COMMENT(-- yql\n) SELECT EOF"); + Check("-- yql --", "COMMENT(-- yql --) EOF"); + } + + Y_UNIT_TEST(MultiLineComment) { + Check("/* yql */", "COMMENT(/* yql */) EOF"); + Check("/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF"); + Check("/* yql\n * yql\n */", "COMMENT(/* yql\n * yql\n */) EOF"); + } + + Y_UNIT_TEST(RecursiveMultiLineCommentDefault) { + Check("/* /* yql */", "COMMENT(/* /* yql */) EOF", /* ansi = */ false); + Check("/* /* yql */ */", "COMMENT(/* /* yql */) WS( ) ASTERISK(*) SLASH(/) EOF", /* 
ansi = */ false); + } + + Y_UNIT_TEST(RecursiveMultiLineCommentAnsi) { + Check("/* /* yql */", "COMMENT(/* /* yql */) EOF", /* ansi = */ true); + Check("/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true); + Check("/* /* /* yql */ */", "COMMENT(/* /* /* yql */ */) EOF", /* ansi = */ true); + Check("/* /* yql */ */ */", "COMMENT(/* /* yql */ */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true); + Check("/* /* yql */ */", "COMMENT(/* /* yql */ */) EOF", /* ansi = */ true); + Check("/*/*/*/", "COMMENT(/*/*/) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true); + Check("/*/**/*/*/*/", "COMMENT(/*/**/*/) ASTERISK(*) SLASH(/) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true); + Check("/* /* */ a /* /* */", "COMMENT(/* /* */ a /* /* */) EOF", /* ansi = */ true); + } + + Y_UNIT_TEST(RecursiveMultiLineCommentAnsiReferenceComparion) { + SetRandomSeed(100); + for (size_t i = 0; i < 512; ++i) { + auto input = RandomMultilineCommentLikeText(/* maxSize = */ 128); + TString actual = Tokenized(*AnsiLexer, input); + TString expected = Tokenized(*PureAnsiLexer, input); + UNIT_ASSERT_VALUES_EQUAL_C(actual, expected, "Input: " << input); + } + } + + Y_UNIT_TEST(Keyword) { + Check("SELECT", "SELECT EOF"); + Check("INSERT", "INSERT EOF"); + Check("FROM", "FROM EOF"); + } + + Y_UNIT_TEST(Punctuation) { + Check( + "* / + - <|", + "ASTERISK(*) WS( ) SLASH(/) WS( ) " + "PLUS(+) WS( ) MINUS(-) WS( ) STRUCT_OPEN(<|) EOF"); + Check("SELECT*FROM", "SELECT ASTERISK(*) FROM EOF"); + } + + Y_UNIT_TEST(IdPlain) { + Check("variable my_table", "ID_PLAIN(variable) WS( ) ID_PLAIN(my_table) EOF"); + } + + Y_UNIT_TEST(IdQuoted) { + Check("``", "ID_QUOTED(``) EOF"); + Check("` `", "ID_QUOTED(` `) EOF"); + Check("` `", "ID_QUOTED(` `) EOF"); + Check("`local/table`", "ID_QUOTED(`local/table`) EOF"); + } + + Y_UNIT_TEST(SinleLineString) { + Check("\"\"", "STRING_VALUE(\"\") EOF"); + Check("\' \'", "STRING_VALUE(\' \') EOF"); + Check("\" \"", "STRING_VALUE(\" \") EOF"); + 
Check("\"test\"", "STRING_VALUE(\"test\") EOF"); + + Check("\"\\\"\"", "STRING_VALUE(\"\\\"\") EOF", /* ansi = */ false); + Check("\"\\\"\"", "[INVALID] STRING_VALUE(\"\\\") EOF", /* ansi = */ true); + + Check("\"\"\"\"", "STRING_VALUE(\"\") STRING_VALUE(\"\") EOF", /* ansi = */ false); + Check("\"\"\"\"", "STRING_VALUE(\"\"\"\") EOF", /* ansi = */ true); + } + + Y_UNIT_TEST(MultiLineString) { + Check("@@@@", "STRING_VALUE(@@@@) EOF"); + Check("@@ @@@", "STRING_VALUE(@@ @@@) EOF"); + Check("@@test@@", "STRING_VALUE(@@test@@) EOF"); + Check("@@line1\nline2@@", "STRING_VALUE(@@line1\nline2@@) EOF"); + } + + Y_UNIT_TEST(Query) { + TString query = + "SELECT\n" + " 123467,\n" + " \"Hello, {name}!\",\n" + " (1 + (5 * 1 / 0)),\n" + " MIN(identifier),\n" + " Bool(field),\n" + " Math::Sin(var)\n" + "FROM `local/test/space/table`\n" + "JOIN test;"; + + TString expected = + "SELECT WS(\n) " + "WS( ) WS( ) INTEGER_VALUE(123467) COMMA(,) WS(\n) " + "WS( ) WS( ) STRING_VALUE(\"Hello, {name}!\") COMMA(,) WS(\n) " + "WS( ) WS( ) LPAREN(() INTEGER_VALUE(1) WS( ) PLUS(+) WS( ) LPAREN(() INTEGER_VALUE(5) WS( ) " + "ASTERISK(*) WS( ) INTEGER_VALUE(1) WS( ) SLASH(/) WS( ) INTEGER_VALUE(0) RPAREN()) " + "RPAREN()) COMMA(,) WS(\n) " + "WS( ) WS( ) ID_PLAIN(MIN) LPAREN(() ID_PLAIN(identifier) RPAREN()) COMMA(,) WS(\n) " + "WS( ) WS( ) ID_PLAIN(Bool) LPAREN(() ID_PLAIN(field) RPAREN()) COMMA(,) WS(\n) " + "WS( ) WS( ) ID_PLAIN(Math) NAMESPACE(::) ID_PLAIN(Sin) LPAREN(() ID_PLAIN(var) RPAREN()) WS(\n) " + "FROM WS( ) ID_QUOTED(`local/test/space/table`) WS(\n) " + "JOIN WS( ) ID_PLAIN(test) SEMICOLON(;) EOF"; + + Check(query, expected); + } + + Y_UNIT_TEST(Invalid) { + Check("\"", "[INVALID] EOF"); + Check("\" SELECT", "[INVALID] WS( ) SELECT EOF"); + } + +} // Y_UNIT_TEST_SUITE(RegexLexerTests) diff --git a/yql/essentials/sql/v1/lexer/regex/regex.cpp b/yql/essentials/sql/v1/lexer/regex/regex.cpp new file mode 100644 index 00000000000..a8aca8a1318 --- /dev/null +++ 
b/yql/essentials/sql/v1/lexer/regex/regex.cpp @@ -0,0 +1,240 @@ +#include "regex.h" + +#include <contrib/libs/re2/re2/re2.h> + +#include <util/generic/vector.h> + +#define SUBSTITUTION(name, mode) \ + {#name, name##_##mode} + +#define SUBSTITUTIONS(mode) \ + { \ + #mode, { \ + SUBSTITUTION(GRAMMAR_STRING_CORE_SINGLE, mode), \ + SUBSTITUTION(GRAMMAR_STRING_CORE_DOUBLE, mode), \ + SUBSTITUTION(GRAMMAR_MULTILINE_COMMENT_CORE, mode), \ + } \ + } + +namespace NSQLTranslationV1 { + + class TLexerGrammarToRegexTranslator { + private: + struct TRewriteRule { + TString Repr; + std::function<void(TString&)> Apply; + }; + + using TRewriteRules = TVector<TRewriteRule>; + + public: + explicit TLexerGrammarToRegexTranslator(const NSQLReflect::TLexerGrammar& grammar, bool ansi) + : Grammar_(&grammar) + , Mode_(ansi ? "ANSI" : "DEFAULT") + { + AddExternalRules(Inliners_); + AddFragmentRules(Inliners_); + + AddLetterRules(Transformations_); + AddTransformationRules(Transformations_); + + UnwrapQuotes_ = UnwrapQuotesRule(); + AddSpaceCollapses(SpaceCollapses_); + UnwrapQuotedSpace_ = UnwrapQuotedSpaceRule(); + } + + TString ToRegex(const TStringBuf name) { + TString text = Grammar_->BlockByName.at(name); + Inline(text); + Transform(text); + Finalize(text); + return text; + } + + private: + void Inline(TString& text) { + ApplyEachWhileChanging(text, Inliners_); + } + + void AddExternalRules(TRewriteRules& rules) { + THashMap<TString, THashMap<TString, TString>> Substitutions = { + SUBSTITUTIONS(DEFAULT), + SUBSTITUTIONS(ANSI), + }; + + // ANSI mode MULTILINE_COMMENT is recursive + Substitutions["ANSI"]["GRAMMAR_MULTILINE_COMMENT_CORE"] = + Substitutions["DEFAULT"]["GRAMMAR_MULTILINE_COMMENT_CORE"]; + + for (const auto& [k, v] : Substitutions.at(Mode_)) { + rules.emplace_back(RegexRewriteRule("@" + k + "@", v)); + } + } + + void AddFragmentRules(TRewriteRules& rules) { + const THashSet<TString> PunctuationFragments = { + "BACKSLASH", + "QUOTE_DOUBLE", + "QUOTE_SINGLE", + "BACKTICK", + 
"DOUBLE_COMMAT", + }; + + for (const auto& [name, definition] : Grammar_->BlockByName) { + TString def = definition; + if ( + Grammar_->PunctuationNames.contains(name) || + PunctuationFragments.contains(name)) { + def = "'" + def + "'"; + } + def = QuoteAntlrRewrite(std::move(def)); + + rules.emplace_back(RegexRewriteRule( + "(\\b" + name + "\\b)", + "(" + def + ")")); + } + } + + void Transform(TString& text) { + ApplyEachWhileChanging(text, Transformations_); + } + + void AddLetterRules(TRewriteRules& rules) { + for (char letter = 'A'; letter <= 'Z'; ++letter) { + TString lower(char(ToLower(letter))); + TString upper(char(ToUpper(letter))); + rules.emplace_back(RegexRewriteRule( + "([^'\\w\\[\\]]|^)" + upper + "([^'\\w\\[\\]]|$)", + "\\1[" + lower + upper + "]\\2")); + } + } + + void AddTransformationRules(TRewriteRules& rules) { + rules.emplace_back(RegexRewriteRule( + R"(~\('(..?)' \| '(..?)'\))", R"([^\1\2])")); + + rules.emplace_back(RegexRewriteRule( + R"(~\('(..?)'\))", R"([^\1])")); + + rules.emplace_back(RegexRewriteRule( + R"(('..?')\.\.('..?'))", R"([\1-\2])")); + + rules.emplace_back(RegexRewriteRule( + R"(\((.)\))", R"(\1)")); + + rules.emplace_back(RegexRewriteRule( + R"(\((\[.{1,8}\])\))", R"(\1)")); + + rules.emplace_back(RegexRewriteRule( + R"(\(('..?')\))", R"(\1)")); + + rules.emplace_back(RegexRewriteRule( + R"( \.)", R"( (.|\\n))")); + + rules.emplace_back(RegexRewriteRule( + R"(\bEOF\b)", R"($)")); + + rules.emplace_back(RegexRewriteRule( + R"('\\u000C' \|)", "")); + } + + void Finalize(TString& text) { + UnwrapQuotes_.Apply(text); + ApplyEachWhileChanging(text, SpaceCollapses_); + UnwrapQuotedSpace_.Apply(text); + } + + void AddSpaceCollapses(TRewriteRules& rules) { + rules.emplace_back(RegexRewriteRule(R"(([^']|^) )", R"(\1)")); + rules.emplace_back(RegexRewriteRule(R"( ([^']|$))", R"(\1)")); + } + + void ApplyEachOnce(TString& text, const TRewriteRules& rules) { + for (const auto& rule : rules) { + rule.Apply(text); + } + } + + void 
ApplyEachWhileChanging(TString& text, const TRewriteRules& rules) { + constexpr size_t Limit = 16; + + TString prev; + for (size_t i = 0; i < Limit + 1 && prev != text; ++i) { + prev = text; + ApplyEachOnce(text, rules); + Y_ENSURE(i != Limit); + } + } + + TRewriteRule RegexRewriteRule(const TString& regex, TString rewrite) { + auto re2 = std::make_shared<RE2>(regex, RE2::Quiet); + Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'"); + + TString error; + Y_ENSURE( + re2->CheckRewriteString(rewrite, &error), + error << " on rewrite '" << rewrite << "'"); + + return { + .Repr = regex + " -> " + rewrite, + .Apply = [re2, rewrite = std::move(rewrite)](TString& text) { + RE2::GlobalReplace(&text, *re2, rewrite); + }, + }; + } + + TRewriteRule UnwrapQuotesRule() { + const TString regex = R"('([^ ][^ ]?)')"; + auto re2 = std::make_shared<RE2>(regex, RE2::Quiet); + Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'"); + + return { + .Repr = regex + " -> Quoted(\\1)", + .Apply = [re2](TString& text) { + TString content; + std::size_t i = 256; + while (RE2::PartialMatch(text, *re2, &content) && --i != 0) { + TString quoted = RE2::QuoteMeta(content); + for (size_t i = 0; i < 2 && quoted.StartsWith(R"(\\)"); ++i) { + quoted.erase(std::begin(quoted)); + } + SubstGlobal(text, "'" + content + "'", quoted); + } + Y_ENSURE(i != 0); + }, + }; + } + + TRewriteRule UnwrapQuotedSpaceRule() { + return RegexRewriteRule(R"(' ')", R"( )"); + } + + TString QuoteAntlrRewrite(TString rewrite) { + SubstGlobal(rewrite, R"(\)", R"(\\)"); + SubstGlobal(rewrite, R"('\\')", R"('\\\\')"); + return rewrite; + } + + const NSQLReflect::TLexerGrammar* Grammar_; + const TStringBuf Mode_; + + TRewriteRules Inliners_; + + TRewriteRules Transformations_; + + TRewriteRule UnwrapQuotes_; + TRewriteRules SpaceCollapses_; + TRewriteRule UnwrapQuotedSpace_; + }; + + THashMap<TString, TString> MakeRegexByOtherNameMap(const NSQLReflect::TLexerGrammar& grammar, bool ansi) { + 
TLexerGrammarToRegexTranslator translator(grammar, ansi); + + THashMap<TString, TString> regexes; + for (const auto& token : grammar.OtherNames) { + regexes.emplace(token, translator.ToRegex(token)); + } + return regexes; + } + +} // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/lexer/regex/regex.h b/yql/essentials/sql/v1/lexer/regex/regex.h new file mode 100644 index 00000000000..9e29c3df25b --- /dev/null +++ b/yql/essentials/sql/v1/lexer/regex/regex.h @@ -0,0 +1,14 @@ +#pragma once + +#include <yql/essentials/sql/v1/reflect/sql_reflect.h> + +#include <util/generic/hash.h> + +namespace NSQLTranslationV1 { + + // Makes regexes only for tokens from OtherNames, + // as keywords and punctuation are trivially matched. + THashMap<TString, TString> MakeRegexByOtherNameMap( + const NSQLReflect::TLexerGrammar& grammar, bool ansi); + +} // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp new file mode 100644 index 00000000000..47a94f53ed0 --- /dev/null +++ b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp @@ -0,0 +1,90 @@ +#include "regex.h" + +#include <library/cpp/testing/unittest/registar.h> + +#include <contrib/libs/re2/re2/re2.h> + +using namespace NSQLTranslationV1; + +namespace { + auto grammar = NSQLReflect::LoadLexerGrammar(); + auto defaultRegexes = MakeRegexByOtherNameMap(grammar, /* ansi = */ false); + auto ansiRegexes = MakeRegexByOtherNameMap(grammar, /* ansi = */ true); + + void CheckRegex(bool ansi, const TStringBuf name, const TStringBuf expected) { + const auto& regexes = ansi ? 
ansiRegexes : defaultRegexes; + const TString regex = regexes.at(name); + + const RE2 re2(regex); + Y_ENSURE(re2.ok(), re2.error()); + + UNIT_ASSERT_VALUES_EQUAL(regex, expected); + } + +} // namespace + +Y_UNIT_TEST_SUITE(SqlRegexTests) { + Y_UNIT_TEST(StringValue) { + CheckRegex( + /* ansi = */ false, + "STRING_VALUE", + R"(((((\'([^'\\]|(\\(.|\n)))*\'))|((\"([^"\\]|(\\(.|\n)))*\"))|((\@\@(.|\n)*?\@\@)+\@?))([sS]|[uU]|[yY]|[jJ]|[pP]([tT]|[bB]|[vV])?)?))"); + } + + Y_UNIT_TEST(AnsiStringValue) { + CheckRegex( + /* ansi = */ true, + "STRING_VALUE", + R"(((((\'([^']|(\'\'))*\'))|((\"([^"]|(\"\"))*\"))|((\@\@(.|\n)*?\@\@)+\@?))([sS]|[uU]|[yY]|[jJ]|[pP]([tT]|[bB]|[vV])?)?))"); + } + + Y_UNIT_TEST(IdPlain) { + CheckRegex( + /* ansi = */ false, + "ID_PLAIN", + R"(([a-z]|[A-Z]|_)([a-z]|[A-Z]|_|[0-9])*)"); + } + + Y_UNIT_TEST(IdQuoted) { + CheckRegex( + /* ansi = */ false, + "ID_QUOTED", + R"(\`(\\(.|\n)|\`\`|[^`\\])*\`)"); + } + + Y_UNIT_TEST(Digits) { + CheckRegex( + /* ansi = */ false, + "DIGITS", + R"(([0-9]+)|(0[xX]([0-9]|[a-f]|[A-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+))"); + } + + Y_UNIT_TEST(Real) { + CheckRegex( + /* ansi = */ false, + "REAL", + R"((([0-9]+)\.[0-9]*([eE](\+|\-)?([0-9]+))?|([0-9]+)([eE](\+|\-)?([0-9]+)))([fF]|[pP]([fF](4|8)|[nN])?)?)"); + } + + Y_UNIT_TEST(Ws) { + CheckRegex( + /* ansi = */ false, + "WS", + R"(( |\r|\t|\n))"); + } + + Y_UNIT_TEST(Comment) { + CheckRegex( + /* ansi = */ false, + "COMMENT", + R"(((\/\*(.|\n)*?\*\/)|(\-\-[^\n\r]*(\r\n?|\n|$))))"); + } + + Y_UNIT_TEST(AnsiCommentSameAsDefault) { + // Because of recursive definition + UNIT_ASSERT_VALUES_EQUAL( + ansiRegexes.at("COMMENT"), + defaultRegexes.at("COMMENT")); + } + +} // Y_UNIT_TEST_SUITE(SqlRegexTests) diff --git a/yql/essentials/sql/v1/lexer/regex/ut/ya.make b/yql/essentials/sql/v1/lexer/regex/ut/ya.make new file mode 100644 index 00000000000..09eb74a3f68 --- /dev/null +++ b/yql/essentials/sql/v1/lexer/regex/ut/ya.make @@ -0,0 +1,13 @@ 
+UNITTEST_FOR(yql/essentials/sql/v1/lexer/regex) + +PEERDIR( + yql/essentials/sql/v1/lexer + yql/essentials/sql/v1/lexer/antlr4_pure_ansi +) + +SRCS( + lexer_ut.cpp + regex_ut.cpp +) + +END() diff --git a/yql/essentials/sql/v1/lexer/regex/ya.make b/yql/essentials/sql/v1/lexer/regex/ya.make new file mode 100644 index 00000000000..249dfbd11df --- /dev/null +++ b/yql/essentials/sql/v1/lexer/regex/ya.make @@ -0,0 +1,39 @@ +LIBRARY() + +PEERDIR( + contrib/libs/re2 + yql/essentials/public/issue + yql/essentials/parser/lexer_common + yql/essentials/sql/settings + yql/essentials/sql/v1/reflect +) + +# TODO(vityaman): Extract to a single ya.make for reusage. + +SET(GRAMMAR_STRING_CORE_SINGLE_DEFAULT "~(QUOTE_SINGLE | BACKSLASH) | (BACKSLASH .)") +SET(GRAMMAR_STRING_CORE_DOUBLE_DEFAULT "~(QUOTE_DOUBLE | BACKSLASH) | (BACKSLASH .)") +SET(GRAMMAR_MULTILINE_COMMENT_CORE_DEFAULT "(.)") + +SET(GRAMMAR_STRING_CORE_SINGLE_ANSI "~QUOTE_SINGLE | (QUOTE_SINGLE QUOTE_SINGLE)") +SET(GRAMMAR_STRING_CORE_DOUBLE_ANSI "~QUOTE_DOUBLE | (QUOTE_DOUBLE QUOTE_DOUBLE)") +SET(GRAMMAR_MULTILINE_COMMENT_CORE_ANSI "MULTILINE_COMMENT | .") + +CFLAGS( + -DGRAMMAR_STRING_CORE_SINGLE_DEFAULT="\\\"${GRAMMAR_STRING_CORE_SINGLE_DEFAULT}\\\"" + -DGRAMMAR_STRING_CORE_DOUBLE_DEFAULT="\\\"${GRAMMAR_STRING_CORE_DOUBLE_DEFAULT}\\\"" + -DGRAMMAR_MULTILINE_COMMENT_CORE_DEFAULT="\\\"${GRAMMAR_MULTILINE_COMMENT_CORE_DEFAULT}\\\"" + -DGRAMMAR_STRING_CORE_SINGLE_ANSI="\\\"${GRAMMAR_STRING_CORE_SINGLE_ANSI}\\\"" + -DGRAMMAR_STRING_CORE_DOUBLE_ANSI="\\\"${GRAMMAR_STRING_CORE_DOUBLE_ANSI}\\\"" + -DGRAMMAR_MULTILINE_COMMENT_CORE_ANSI="\\\"${GRAMMAR_MULTILINE_COMMENT_CORE_ANSI}\\\"" +) + +SRCS( + lexer.cpp + regex.cpp +) + +END() + +RECURSE_FOR_TESTS( + ut +) diff --git a/yql/essentials/sql/v1/lexer/ut/ya.make b/yql/essentials/sql/v1/lexer/ut/ya.make index c50c8cd7277..7e62fb50c85 100644 --- a/yql/essentials/sql/v1/lexer/ut/ya.make +++ b/yql/essentials/sql/v1/lexer/ut/ya.make @@ -6,6 +6,7 @@ PEERDIR( 
yql/essentials/sql/v1/lexer/antlr3 yql/essentials/sql/v1/lexer/antlr4 yql/essentials/sql/v1/lexer/antlr4_pure + yql/essentials/sql/v1/lexer/regex ) SRCS( diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.cpp b/yql/essentials/sql/v1/reflect/sql_reflect.cpp new file mode 100644 index 00000000000..f47f35cb9de --- /dev/null +++ b/yql/essentials/sql/v1/reflect/sql_reflect.cpp @@ -0,0 +1,173 @@ +#include "sql_reflect.h" + +#include <library/cpp/resource/resource.h> + +#include <util/string/split.h> +#include <util/string/strip.h> + +namespace NSQLReflect { + + const TStringBuf ReflectPrefix = "//!"; + const TStringBuf SectionPrefix = "//! section:"; + const TStringBuf SectionPunctuation = "//! section:punctuation"; + const TStringBuf SectionLetter = "//! section:letter"; + const TStringBuf SectionKeyword = "//! section:keyword"; + const TStringBuf SectionOther = "//! section:other"; + const TStringBuf FragmentPrefix = "fragment "; + + TVector<TString> GetResourceLines(const TStringBuf key) { + TString text; + Y_ENSURE(NResource::FindExact(key, &text)); + + TVector<TString> lines; + Split(text, "\n", lines); + return lines; + } + + void Format(TVector<TString>& lines) { + for (size_t i = 0; i < lines.size(); ++i) { + auto& line = lines[i]; + + StripInPlace(line); + + if (line.StartsWith("//") || (line.Contains(':') && line.Contains(';'))) { + continue; + } + + size_t j = i + 1; + do { + line += lines.at(j); + } while (!lines.at(j++).Contains(';')); + + auto first = std::next(std::begin(lines), i + 1); + auto last = std::next(std::begin(lines), j); + lines.erase(first, last); + } + + for (auto& line : lines) { + CollapseInPlace(line); + SubstGlobal(line, " ;", ";"); + SubstGlobal(line, " :", ":"); + SubstGlobal(line, " )", ")"); + SubstGlobal(line, "( ", "("); + } + } + + void Purify(TVector<TString>& lines) { + const auto [first, last] = std::ranges::remove_if(lines, [](const TString& line) { + return (line.StartsWith("//") && !line.StartsWith(ReflectPrefix)) || 
line.empty(); + }); + lines.erase(first, last); + } + + THashMap<TStringBuf, TVector<TString>> GroupBySection(TVector<TString>&& lines) { + TVector<TStringBuf> sections = { + "", + SectionPunctuation, + SectionLetter, + SectionKeyword, + SectionOther, + }; + + size_t section = 0; + + THashMap<TStringBuf, TVector<TString>> groups; + for (auto& line : lines) { + if (line.StartsWith(SectionPrefix)) { + Y_ENSURE(sections.at(section + 1) == line); + section += 1; + continue; + } + + groups[sections.at(section)].emplace_back(std::move(line)); + } + + groups.erase(""); + groups.erase(SectionLetter); + + return groups; + } + + std::tuple<TString, TString> ParseLexerRule(TString&& line) { + size_t colonPos = line.find(':'); + size_t semiPos = line.rfind(';'); + + Y_ENSURE( + colonPos != TString::npos && + semiPos != TString::npos && + colonPos < semiPos); + + TString block = line.substr(colonPos + 2, semiPos - colonPos - 2); + SubstGlobal(block, "\\\\", "\\"); + + TString name = std::move(line); + name.resize(colonPos); + + return std::make_tuple(std::move(name), std::move(block)); + } + + void ParsePunctuationLine(TString&& line, TLexerGrammar& grammar) { + auto [name, block] = ParseLexerRule(std::move(line)); + block = block.erase(std::begin(block)); + block.pop_back(); + + SubstGlobal(block, "\\\'", "\'"); + + if (!name.StartsWith(FragmentPrefix)) { + grammar.PunctuationNames.emplace(name); + } + + SubstGlobal(name, FragmentPrefix, ""); + grammar.BlockByName.emplace(std::move(name), std::move(block)); + } + + void ParseKeywordLine(TString&& line, TLexerGrammar& grammar) { + auto [name, block] = ParseLexerRule(std::move(line)); + SubstGlobal(block, "'", ""); + SubstGlobal(block, " ", ""); + + Y_ENSURE(name == block || (name == "TSKIP" && block == "SKIP")); + grammar.KeywordNames.emplace(std::move(name)); + } + + void ParseOtherLine(TString&& line, TLexerGrammar& grammar) { + auto [name, block] = ParseLexerRule(std::move(line)); + + if (!name.StartsWith(FragmentPrefix)) { 
+ grammar.OtherNames.emplace(name); + } + + SubstGlobal(name, FragmentPrefix, ""); + SubstGlobal(block, " -> channel(HIDDEN)", ""); + grammar.BlockByName.emplace(std::move(name), std::move(block)); + } + + TLexerGrammar LoadLexerGrammar() { + TVector<TString> lines = GetResourceLines("SQLv1Antlr4.g.in"); + Purify(lines); + Format(lines); + Purify(lines); + + THashMap<TStringBuf, TVector<TString>> sections; + sections = GroupBySection(std::move(lines)); + + TLexerGrammar grammar; + + for (auto& [section, lines] : sections) { + for (auto& line : lines) { + if (section == SectionPunctuation) { + ParsePunctuationLine(std::move(line), grammar); + } else if (section == SectionKeyword) { + ParseKeywordLine(std::move(line), grammar); + } else if (section == SectionOther) { + ParseOtherLine(std::move(line), grammar); + } else { + Y_ABORT("Unexpected section %s", section); + } + } + } + + return grammar; + } + +} // namespace NSQLReflect diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.h b/yql/essentials/sql/v1/reflect/sql_reflect.h new file mode 100644 index 00000000000..5225a3c996b --- /dev/null +++ b/yql/essentials/sql/v1/reflect/sql_reflect.h @@ -0,0 +1,18 @@ +#pragma once + +#include <util/generic/string.h> +#include <util/generic/hash_set.h> +#include <util/generic/hash.h> + +namespace NSQLReflect { + + struct TLexerGrammar { + THashSet<TString> KeywordNames; + THashSet<TString> PunctuationNames; + THashSet<TString> OtherNames; + THashMap<TString, TString> BlockByName; + }; + + TLexerGrammar LoadLexerGrammar(); + +} // namespace NSQLReflect diff --git a/yql/essentials/sql/v1/reflect/sql_reflect_ut.cpp b/yql/essentials/sql/v1/reflect/sql_reflect_ut.cpp new file mode 100644 index 00000000000..7bef2879e55 --- /dev/null +++ b/yql/essentials/sql/v1/reflect/sql_reflect_ut.cpp @@ -0,0 +1,46 @@ +#include "sql_reflect.h" + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NSQLReflect; + +namespace { + TLexerGrammar grammar = LoadLexerGrammar(); +} // 
namespace + +Y_UNIT_TEST_SUITE(SqlReflectTests) { + Y_UNIT_TEST(Keywords) { + UNIT_ASSERT_VALUES_EQUAL(grammar.KeywordNames.contains("SELECT"), true); + UNIT_ASSERT_VALUES_EQUAL(grammar.KeywordNames.contains("INSERT"), true); + UNIT_ASSERT_VALUES_EQUAL(grammar.KeywordNames.contains("WHERE"), true); + UNIT_ASSERT_VALUES_EQUAL(grammar.KeywordNames.contains("COMMIT"), true); + } + + Y_UNIT_TEST(Punctuation) { + UNIT_ASSERT_VALUES_EQUAL(grammar.PunctuationNames.contains("LPAREN"), true); + UNIT_ASSERT_VALUES_EQUAL(grammar.BlockByName.at("LPAREN"), "("); + + UNIT_ASSERT_VALUES_EQUAL(grammar.PunctuationNames.contains("MINUS"), true); + UNIT_ASSERT_VALUES_EQUAL(grammar.BlockByName.at("MINUS"), "-"); + + UNIT_ASSERT_VALUES_EQUAL(grammar.PunctuationNames.contains("NAMESPACE"), true); + UNIT_ASSERT_VALUES_EQUAL(grammar.BlockByName.at("NAMESPACE"), "::"); + } + + Y_UNIT_TEST(Other) { + UNIT_ASSERT_VALUES_EQUAL(grammar.OtherNames.contains("REAL"), true); + UNIT_ASSERT_VALUES_EQUAL(grammar.OtherNames.contains("STRING_VALUE"), true); + UNIT_ASSERT_VALUES_EQUAL(grammar.OtherNames.contains("STRING_MULTILINE"), false); + + UNIT_ASSERT_VALUES_EQUAL( + grammar.BlockByName.at("FLOAT_EXP"), + "E (PLUS | MINUS)? DECDIGITS"); + UNIT_ASSERT_VALUES_EQUAL( + grammar.BlockByName.at("STRING_MULTILINE"), + "(DOUBLE_COMMAT .*? DOUBLE_COMMAT)+ COMMAT?"); + UNIT_ASSERT_VALUES_EQUAL( + grammar.BlockByName.at("REAL"), + "(DECDIGITS DOT DIGIT* FLOAT_EXP? 
| DECDIGITS FLOAT_EXP) (F | P (F ('4' | '8') | N)?)?"); + } + +} // Y_UNIT_TEST_SUITE(SqlReflectTests) diff --git a/yql/essentials/sql/v1/reflect/ut/ya.make b/yql/essentials/sql/v1/reflect/ut/ya.make new file mode 100644 index 00000000000..ee52ff0837a --- /dev/null +++ b/yql/essentials/sql/v1/reflect/ut/ya.make @@ -0,0 +1,7 @@ +UNITTEST_FOR(yql/essentials/sql/v1/reflect) + +SRCS( + sql_reflect_ut.cpp +) + +END() diff --git a/yql/essentials/sql/v1/reflect/ya.make b/yql/essentials/sql/v1/reflect/ya.make new file mode 100644 index 00000000000..5865654c86e --- /dev/null +++ b/yql/essentials/sql/v1/reflect/ya.make @@ -0,0 +1,13 @@ +LIBRARY() + +SRCS( + sql_reflect.cpp +) + +RESOURCE(DONT_PARSE yql/essentials/sql/v1/SQLv1Antlr4.g.in SQLv1Antlr4.g.in) + +END() + +RECURSE_FOR_TESTS( + ut +) |