| author | robot-piglet <[email protected]> | 2025-05-12 13:53:24 +0300 |
|---|---|---|
| committer | robot-piglet <[email protected]> | 2025-05-12 14:05:50 +0300 |
| commit | 7a941ebd252fd7442b4d1d34d31d72e971ad20bf (patch) | |
| tree | 70c132d1b611697ad23b90cf35215b035f247ec0 /yql/essentials/sql/v1/lexer/regex/lexer.cpp | |
| parent | bf1279129bcf6c1b1001e39c39a13d80737898d3 (diff) | |
Intermediate changes
commit_hash:3a624a323006078de71f50747f7b2e8cadba7ccd
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex/lexer.cpp')
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/lexer.cpp | 304 |
1 file changed, 113 insertions, 191 deletions
```diff
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
index a1d96253bf7..58c98edfd31 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
@@ -1,5 +1,6 @@
 #include "lexer.h"
 
+#include "generic.h"
 #include "regex.h"
 
 #include <contrib/libs/re2/re2/re2.h>
@@ -9,256 +10,177 @@
 #include <util/generic/algorithm.h>
 #include <util/generic/string.h>
+#include <util/generic/maybe.h>
 #include <util/string/subst.h>
 #include <util/string/ascii.h>
 
 namespace NSQLTranslationV1 {
 
+    using NSQLReflect::TLexerGrammar;
     using NSQLTranslation::TParsedToken;
     using NSQLTranslation::TParsedTokenList;
 
-    class TRegexLexer: public NSQLTranslation::ILexer {
-        static constexpr const char* CommentTokenName = "COMMENT";
-        static constexpr const char* StringValueName = "STRING_VALUE";
-
-        static constexpr const TStringBuf Utf8BOM = "\xEF\xBB\xBF";
-
-    public:
-        TRegexLexer(
-            bool ansi,
-            NSQLReflect::TLexerGrammar grammar,
-            const TVector<std::tuple<TString, TString>>& RegexByOtherName)
-            : Grammar_(std::move(grammar))
-            , Ansi_(ansi)
-        {
-            for (const auto& [token, regex] : RegexByOtherName) {
-                RE2::Options custom;
-                if (token != CommentTokenName && token != StringValueName) {
-                    custom.set_longest_match(true);
-                }
+    size_t MatchANSIMultilineComment(TStringBuf remaining);
 
-                RE2* re2 = new RE2(regex, custom);
-                if (token == CommentTokenName) {
-                    CommentRegex_.Reset(re2);
-                } else {
-                    OtherRegexes_.emplace_back(token, re2);
-                }
+    TTokenMatcher ANSICommentMatcher(TTokenMatcher defaultComment) {
+        return [defaultComment](TStringBuf prefix) -> TMaybe<TStringBuf> {
+            const auto basic = defaultComment(prefix);
+            if (basic.Empty()) {
+                return Nothing();
             }
-        }
-
-        bool Tokenize(
-            const TString& query,
-            const TString& queryName,
-            const TTokenCallback& onNextToken,
-            NYql::TIssues& issues,
-            size_t maxErrors) override {
-            size_t errors = 0;
-            size_t pos = 0;
 
-            if (query.StartsWith(Utf8BOM)) {
-                pos += Utf8BOM.size();
+            if (!prefix.StartsWith("/*")) {
+                return basic;
             }
 
-            while (pos < query.size()) {
-                TParsedToken matched = Match(TStringBuf(query, pos));
-
-                if (matched.Name.empty() && maxErrors == errors) {
-                    break;
-                }
-
-                if (matched.Name.empty()) {
-                    pos += 1;
-                    errors += 1;
-                    issues.AddIssue(NYql::TPosition(pos, 0, queryName), "no candidates");
-                    continue;
-                }
+            size_t ll1Length = MatchANSIMultilineComment(prefix);
+            TStringBuf ll1Content = prefix.SubString(0, ll1Length);
 
-                pos += matched.Content.length();
-                onNextToken(std::move(matched));
+            Y_ENSURE(ll1Content == 0 || basic <= ll1Content);
+            if (ll1Content == 0) {
+                return basic;
             }
 
-            onNextToken(TParsedToken{.Name = "EOF", .Content = "<EOF>"});
-            return errors == 0;
+            return ll1Content;
+        };
+    }
+
+    size_t MatchANSIMultilineComment(TStringBuf prefix) {
+        if (!prefix.StartsWith("/*")) {
+            return 0;
         }
 
-    private:
-        TParsedToken Match(const TStringBuf prefix) {
-            TParsedTokenList matches;
+        size_t skipped = 0;
 
-            size_t keywordCount = MatchKeyword(prefix, matches);
-            MatchPunctuation(prefix, matches);
-            MatchRegex(prefix, matches);
-            MatchComment(prefix, matches);
+        prefix.Skip(2);
+        skipped += 2;
 
-            if (matches.empty()) {
-                return {};
+        for (;;) {
+            if (prefix.StartsWith("*/")) {
+                prefix.Skip(2);
+                skipped += 2;
+                return skipped;
             }
 
-            auto maxLength = MaxElementBy(matches, [](const TParsedToken& m) {
-                return m.Content.length();
-            })->Content.length();
-
-            auto max = FindIf(matches, [&](const TParsedToken& m) {
-                return m.Content.length() == maxLength;
-            });
-
-            auto isMatched = [&](const TStringBuf name) {
-                return std::end(matches) != FindIf(matches, [&](const auto& m) {
-                    return m.Name == name;
-                });
-            };
-
-            size_t conflicts = CountIf(matches, [&](const TParsedToken& m) {
-                return m.Content.length() == max->Content.length();
-            });
-            conflicts -= 1;
-
-            Y_ENSURE(
-                conflicts == 0 ||
-                (conflicts == 1 && keywordCount != 0 && isMatched("ID_PLAIN")) ||
-                (conflicts == 1 && isMatched("DIGITS") && isMatched("INTEGER_VALUE")));
-
-            Y_ENSURE(!max->Content.empty());
-            return *max;
-        }
-
-        bool MatchKeyword(const TStringBuf prefix, TParsedTokenList& matches) {
-            size_t count = 0;
-            for (const auto& keyword : Grammar_.KeywordNames) {
-                const TStringBuf block = NSQLReflect::TLexerGrammar::KeywordBlock(keyword);
-                const TStringBuf content = prefix.substr(0, block.length());
-                if (AsciiEqualsIgnoreCase(content, block)) {
-                    matches.emplace_back(keyword, TString(content));
-                    count += 1;
+            bool isSkipped = false;
+            if (prefix.StartsWith("/*")) {
+                size_t limit = prefix.rfind("*/");
+                if (limit == std::string::npos) {
+                    return 0;
                 }
-            }
-            return count;
-        }
 
-        size_t MatchPunctuation(const TStringBuf prefix, TParsedTokenList& matches) {
-            size_t count = 0;
-            for (const auto& name : Grammar_.PunctuationNames) {
-                const auto& content = Grammar_.BlockByName.at(name);
-                if (prefix.substr(0, content.length()) == content) {
-                    matches.emplace_back(name, content);
-                    count += 1;
-                }
-            }
-            return count;
-        }
+                size_t len = MatchANSIMultilineComment(prefix.Head(limit));
+                prefix.Skip(len);
+                skipped += len;
 
-        size_t MatchRegex(const TStringBuf prefix, TParsedTokenList& matches) {
-            size_t count = 0;
-            for (const auto& [token, regex] : OtherRegexes_) {
-                if (const TStringBuf match = TryMatchRegex(prefix, *regex); !match.empty()) {
-                    matches.emplace_back(token, TString(match));
-                    count += 1;
-                }
+                isSkipped = len != 0;
             }
-            return count;
-        }
 
-        const TStringBuf TryMatchRegex(const TStringBuf prefix, const RE2& regex) {
-            re2::StringPiece input(prefix.data(), prefix.size());
-            if (RE2::Consume(&input, regex)) {
-                return TStringBuf(prefix.data(), input.data());
+            if (isSkipped) {
+                continue;
             }
-            return "";
-        }
 
-        size_t MatchComment(const TStringBuf prefix, TParsedTokenList& matches) {
-            const TStringBuf reContent = TryMatchRegex(prefix, *CommentRegex_);
-            if (reContent.empty()) {
+            if (prefix.size() == 0) {
                 return 0;
             }
 
-            if (!(Ansi_ && prefix.StartsWith("/*"))) {
-                matches.emplace_back(CommentTokenName, TString(reContent));
-                return 1;
-            }
-
-            size_t ll1Length = MatchANSIMultilineComment(prefix);
-            const TStringBuf ll1Content = prefix.SubString(0, ll1Length);
-
-            Y_ENSURE(ll1Content == 0 || reContent <= ll1Content);
-            if (ll1Content == 0) {
-                matches.emplace_back(CommentTokenName, TString(reContent));
-                return 1;
-            }
-
-            matches.emplace_back(CommentTokenName, TString(ll1Content));
-            return 1;
+            prefix.Skip(1);
+            skipped += 1;
         }
+    }
 
-        size_t MatchANSIMultilineComment(TStringBuf remaining) {
-            if (!remaining.StartsWith("/*")) {
-                return 0;
-            }
+    TGenericLexerGrammar MakeGenericLexerGrammar(
+        bool ansi,
+        const TLexerGrammar& grammar,
+        const TVector<std::tuple<TString, TString>>& regexByOtherName) {
+        TGenericLexerGrammar generic;
 
-            size_t skipped = 0;
+        for (const auto& name : grammar.KeywordNames) {
+            auto matcher = Compile({
+                .Body = TString(TLexerGrammar::KeywordBlock(name)),
+                .IsCaseInsensitive = true,
+            });
+            generic.emplace_back(name, std::move(matcher));
+        }
 
-            remaining.Skip(2);
-            skipped += 2;
+        for (const auto& name : grammar.PunctuationNames) {
+            generic.emplace_back(
+                name,
+                Compile({RE2::QuoteMeta(grammar.BlockByName.at(name))}));
+        }
 
-            for (;;) {
-                if (remaining.StartsWith("*/")) {
-                    remaining.Skip(2);
-                    skipped += 2;
-                    return skipped;
-                }
+        for (const auto& [name, regex] : regexByOtherName) {
+            auto matcher = Compile({
+                .Body = regex,
+            });
+            generic.emplace_back(name, std::move(matcher));
+        }
 
-                bool isSkipped = false;
-                if (remaining.StartsWith("/*")) {
-                    size_t limit = remaining.rfind("*/");
-                    if (limit == std::string::npos) {
-                        return 0;
-                    }
+        if (ansi) {
+            auto it = FindIf(generic, [](const auto& m) {
+                return m.TokenName == "COMMENT";
+            });
+            Y_ENSURE(it != std::end(generic));
+            it->Match = ANSICommentMatcher(it->Match);
+        }
 
-                    size_t len = MatchANSIMultilineComment(remaining.Head(limit));
-                    remaining.Skip(len);
-                    skipped += len;
+        return generic;
+    }
 
-                    isSkipped = len != 0;
-                }
+    class TRegexLexer: public NSQLTranslation::ILexer {
+    public:
+        TRegexLexer(IGenericLexer::TPtr lexer)
+            : Lexer_(std::move(lexer))
+        {
+        }
 
-                if (isSkipped) {
-                    continue;
+        bool Tokenize(
+            const TString& query,
+            const TString& queryName,
+            const TTokenCallback& onNextToken,
+            NYql::TIssues& issues,
+            size_t maxErrors) override {
+            bool isFailed = false;
+
+            const auto onNext = [&](TGenericToken&& token) {
+                if (token.Name == TGenericToken::Error) {
+                    NYql::TPosition pos(token.Begin, 0, queryName);
+                    TString message = TString("no candidates, skipping ") + token.Content;
+                    issues.AddIssue(std::move(pos), std::move(message));
+                    isFailed = true;
+                    return;
                 }
 
-                if (remaining.size() == 0) {
-                    return 0;
-                }
+                onNextToken({
+                    .Name = TString(token.Name),
+                    .Content = TString(token.Content),
+                });
+            };
 
-                remaining.Skip(1);
-                skipped += 1;
-            }
+            Lexer_->Tokenize(query, onNext, maxErrors);
+            return !isFailed;
         }
 
-        NSQLReflect::TLexerGrammar Grammar_;
-        TVector<std::tuple<TString, THolder<RE2>>> OtherRegexes_;
-        THolder<RE2> CommentRegex_;
-        bool Ansi_;
+    private:
+        IGenericLexer::TPtr Lexer_;
     };
 
     namespace {
 
         class TFactory final: public NSQLTranslation::ILexerFactory {
        public:
-            explicit TFactory(bool ansi)
-                : Ansi_(ansi)
-                , Grammar_(NSQLReflect::LoadLexerGrammar())
-                , RegexByOtherName_(MakeRegexByOtherName(Grammar_, Ansi_))
-            {
+            explicit TFactory(bool ansi) {
+                auto grammar = NSQLReflect::LoadLexerGrammar();
+                auto regexes = MakeRegexByOtherName(grammar, ansi);
+                Lexer_ = MakeGenericLexer(MakeGenericLexerGrammar(ansi, grammar, regexes));
             }
 
             NSQLTranslation::ILexer::TPtr MakeLexer() const override {
                 return NSQLTranslation::ILexer::TPtr(
-                    new TRegexLexer(Ansi_, Grammar_, RegexByOtherName_));
+                    new TRegexLexer(Lexer_));
             }
 
         private:
-            bool Ansi_;
-            NSQLReflect::TLexerGrammar Grammar_;
-            TVector<std::tuple<TString, TString>> RegexByOtherName_;
+            IGenericLexer::TPtr Lexer_;
         };
 
     } // namespace
```
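The core idea of the ANSI handling above is a decorator: the plain regex-based `COMMENT` matcher is wrapped so that, when the input starts with `/*`, a nesting-aware scan may extend the match. Below is a minimal standalone sketch of that pattern using only the C++ standard library; it is not the project's API. `TMatcher`, `AnsiCommentMatcher`, and `MatchNestedComment` are simplified stand-ins for `TTokenMatcher`, `ANSICommentMatcher`, and `MatchANSIMultilineComment` (which additionally handle unterminated nested comments via `rfind`), and `std::optional<std::string_view>` stands in for `TMaybe<TStringBuf>` from `generic.h`.

```cpp
// Standalone sketch (assumed names, not the YQL API): decorating a default
// COMMENT matcher with a nesting-aware ANSI multiline-comment scan.
#include <cstddef>
#include <functional>
#include <iostream>
#include <optional>
#include <string_view>

// A matcher returns the matched prefix of the input, or nothing.
using TMatcher = std::function<std::optional<std::string_view>(std::string_view)>;

// Length of a nested /* ... */ comment starting at `input`, or 0 if unterminated.
std::size_t MatchNestedComment(std::string_view input) {
    if (input.substr(0, 2) != "/*") {
        return 0;
    }
    std::size_t pos = 2;
    std::size_t depth = 1;
    while (pos < input.size() && depth > 0) {
        if (input.substr(pos, 2) == "*/") {
            --depth;
            pos += 2;
        } else if (input.substr(pos, 2) == "/*") {
            ++depth;
            pos += 2;
        } else {
            ++pos;
        }
    }
    return depth == 0 ? pos : 0;
}

// Decorator: for "/*" prefixes, prefer the nesting-aware match when it
// succeeds, otherwise fall back to the default matcher's result.
TMatcher AnsiCommentMatcher(TMatcher defaultComment) {
    return [defaultComment](std::string_view prefix) -> std::optional<std::string_view> {
        auto basic = defaultComment(prefix);
        if (!basic || prefix.substr(0, 2) != "/*") {
            return basic;
        }
        std::size_t len = MatchNestedComment(prefix);
        return len == 0 ? basic : std::optional(prefix.substr(0, len));
    };
}

int main() {
    // Non-nesting default: match up to the first "*/".
    TMatcher basic = [](std::string_view s) -> std::optional<std::string_view> {
        if (s.substr(0, 2) != "/*") {
            return std::nullopt;
        }
        auto end = s.find("*/", 2);
        return end == std::string_view::npos
            ? std::nullopt
            : std::optional(s.substr(0, end + 2));
    };

    TMatcher ansi = AnsiCommentMatcher(basic);
    std::string_view query = "/* outer /* inner */ still comment */ SELECT 1;";
    std::cout << "basic: " << basic(query)->size() << " chars\n"; // stops at the first */
    std::cout << "ansi:  " << ansi(query)->size() << " chars\n";  // spans the whole nested comment
    return 0;
}
```

With this structure, `MakeGenericLexerGrammar` can keep every token as a plain compiled matcher and only rewrap the `COMMENT` entry when ANSI mode is requested, which is what the `if (ansi)` branch in the diff does.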