#include "lexer.h" #include "regex.h" #include <contrib/libs/re2/re2/re2.h> #include <yql/essentials/core/issue/yql_issue.h> #include <yql/essentials/sql/v1/reflect/sql_reflect.h> #include <util/generic/algorithm.h> #include <util/generic/string.h> #include <util/string/subst.h> #include <util/string/ascii.h> namespace NSQLTranslationV1 { using NSQLTranslation::TParsedToken; using NSQLTranslation::TParsedTokenList; class TRegexLexer: public NSQLTranslation::ILexer { static constexpr const char* CommentTokenName = "COMMENT"; public: TRegexLexer( bool ansi, NSQLReflect::TLexerGrammar grammar, const TVector<std::tuple<TString, TString>>& RegexByOtherName) : Grammar_(std::move(grammar)) , Ansi_(ansi) { for (const auto& [token, regex] : RegexByOtherName) { if (token == CommentTokenName) { CommentRegex_.Reset(new RE2(regex)); } else { OtherRegexes_.emplace_back(token, new RE2(regex)); } } } bool Tokenize( const TString& query, const TString& queryName, const TTokenCallback& onNextToken, NYql::TIssues& issues, size_t maxErrors) override { size_t errors = 0; for (size_t pos = 0; pos < query.size();) { TParsedToken matched = Match(TStringBuf(query, pos)); if (matched.Name.empty() && maxErrors == errors) { break; } if (matched.Name.empty()) { pos += 1; errors += 1; issues.AddIssue(NYql::TPosition(pos, 0, queryName), "no candidates"); continue; } pos += matched.Content.length(); onNextToken(std::move(matched)); } onNextToken(TParsedToken{.Name = "EOF"}); return errors == 0; } private: TParsedToken Match(const TStringBuf prefix) { TParsedTokenList matches; size_t keywordCount = MatchKeyword(prefix, matches); MatchPunctuation(prefix, matches); MatchRegex(prefix, matches); MatchComment(prefix, matches); if (matches.empty()) { return {}; } auto maxLength = MaxElementBy(matches, [](const TParsedToken& m) { return m.Content.length(); })->Content.length(); auto max = FindIf(matches, [&](const TParsedToken& m) { return m.Content.length() == maxLength; }); auto isMatched = [&](const TStringBuf name) { return std::end(matches) != FindIf(matches, [&](const auto& m) { return m.Name == name; }); }; size_t conflicts = CountIf(matches, [&](const TParsedToken& m) { return m.Content.length() == max->Content.length(); }); conflicts -= 1; Y_ENSURE( conflicts == 0 || (conflicts == 1 && keywordCount != 0 && isMatched("ID_PLAIN")) || (conflicts == 1 && isMatched("DIGITS") && isMatched("INTEGER_VALUE"))); Y_ENSURE(!max->Content.empty()); return *max; } bool MatchKeyword(const TStringBuf prefix, TParsedTokenList& matches) { size_t count = 0; for (const auto& keyword : Grammar_.KeywordNames) { const TStringBuf content = prefix.substr(0, keyword.length()); if (AsciiEqualsIgnoreCase(content, keyword)) { matches.emplace_back(keyword, TString(content)); count += 1; } } return count; } size_t MatchPunctuation(const TStringBuf prefix, TParsedTokenList& matches) { size_t count = 0; for (const auto& name : Grammar_.PunctuationNames) { const auto& content = Grammar_.BlockByName.at(name); if (prefix.substr(0, content.length()) == content) { matches.emplace_back(name, content); count += 1; } } return count; } size_t MatchRegex(const TStringBuf prefix, TParsedTokenList& matches) { size_t count = 0; for (const auto& [token, regex] : OtherRegexes_) { if (const TStringBuf match = TryMatchRegex(prefix, *regex); !match.empty()) { matches.emplace_back(token, TString(match)); count += 1; } } return count; } const TStringBuf TryMatchRegex(const TStringBuf prefix, const RE2& regex) { re2::StringPiece input(prefix.data(), prefix.size()); if 
(RE2::Consume(&input, regex)) { return TStringBuf(prefix.data(), input.data()); } return ""; } size_t MatchComment(const TStringBuf prefix, TParsedTokenList& matches) { const TStringBuf reContent = TryMatchRegex(prefix, *CommentRegex_); if (reContent.empty()) { return 0; } if (!(Ansi_ && prefix.StartsWith("/*"))) { matches.emplace_back(CommentTokenName, TString(reContent)); return 1; } size_t ll1Length = MatchANSIMultilineComment(prefix); const TStringBuf ll1Content = prefix.SubString(0, ll1Length); Y_ENSURE(ll1Content == 0 || reContent <= ll1Content); if (ll1Content == 0) { matches.emplace_back(CommentTokenName, TString(reContent)); return 1; } matches.emplace_back(CommentTokenName, TString(ll1Content)); return 1; } size_t MatchANSIMultilineComment(TStringBuf remaining) { if (!remaining.StartsWith("/*")) { return 0; } size_t skipped = 0; remaining.Skip(2); skipped += 2; for (;;) { if (remaining.StartsWith("*/")) { remaining.Skip(2); skipped += 2; return skipped; } bool isSkipped = false; if (remaining.StartsWith("/*")) { size_t limit = remaining.rfind("*/"); if (limit == std::string::npos) { return 0; } size_t len = MatchANSIMultilineComment(remaining.Head(limit)); remaining.Skip(len); skipped += len; isSkipped = len != 0; } if (isSkipped) { continue; } if (remaining.size() == 0) { return 0; } remaining.Skip(1); skipped += 1; } } NSQLReflect::TLexerGrammar Grammar_; TVector<std::tuple<TString, THolder<RE2>>> OtherRegexes_; THolder<RE2> CommentRegex_; bool Ansi_; }; namespace { class TFactory final: public NSQLTranslation::ILexerFactory { public: explicit TFactory(bool ansi) : Ansi_(ansi) , Grammar_(NSQLReflect::LoadLexerGrammar()) , RegexByOtherName_(MakeRegexByOtherName(Grammar_, Ansi_)) { } NSQLTranslation::ILexer::TPtr MakeLexer() const override { return NSQLTranslation::ILexer::TPtr( new TRegexLexer(Ansi_, Grammar_, RegexByOtherName_)); } private: bool Ansi_; NSQLReflect::TLexerGrammar Grammar_; TVector<std::tuple<TString, TString>> RegexByOtherName_; }; } // namespace NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi) { return NSQLTranslation::TLexerFactoryPtr(new TFactory(ansi)); } } // namespace NSQLTranslationV1