diff options
| author | vitya-smirnov <[email protected]> | 2025-10-07 09:34:39 +0300 |
|---|---|---|
| committer | vitya-smirnov <[email protected]> | 2025-10-07 09:52:14 +0300 |
| commit | babe7533f18c11be1f8a195ed2324d2d9a89436a (patch) | |
| tree | 45b7627141bf5a52b45a3d61fd1fbdd564bb8dd9 /yql/essentials/sql/v1/lexer/regex/lexer.cpp | |
| parent | 8fe7cfe254fde2772477a8933a163b5f303716b4 (diff) | |
YQL-20086 sql/v1
commit_hash:55bc611cdaa0d8a0fc3c4c7708ed9f17cc4976cf
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex/lexer.cpp')
| -rw-r--r-- | yql/essentials/sql/v1/lexer/regex/lexer.cpp | 344 |
1 files changed, 172 insertions, 172 deletions
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp index 5d48c092716..7b9f2ba6e33 100644 --- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp +++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp @@ -17,219 +17,219 @@ namespace NSQLTranslationV1 { - using NSQLReflect::TLexerGrammar; - using NSQLTranslation::TParsedToken; - using NSQLTranslation::TParsedTokenList; +using NSQLReflect::TLexerGrammar; +using NSQLTranslation::TParsedToken; +using NSQLTranslation::TParsedTokenList; - size_t MatchANSIMultilineComment(TStringBuf remaining); +size_t MatchANSIMultilineComment(TStringBuf remaining); - TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment) { - return [defaultComment, name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> { - const auto basic = defaultComment(prefix); - if (basic.Empty()) { - return Nothing(); - } - - if (!prefix.StartsWith("/*")) { - return basic; - } - - size_t ll1Length = MatchANSIMultilineComment(prefix); - TStringBuf ll1Content = prefix.SubString(0, ll1Length); - - Y_ENSURE(ll1Content == 0 || basic->Content <= ll1Content); - if (ll1Content == 0) { - return basic; - } - - return TGenericToken{ - .Name = name, - .Content = ll1Content, - }; - }; - } +TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment) { + return [defaultComment, name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> { + const auto basic = defaultComment(prefix); + if (basic.Empty()) { + return Nothing(); + } - size_t MatchANSIMultilineComment(TStringBuf prefix) { if (!prefix.StartsWith("/*")) { - return 0; + return basic; } - size_t skipped = 0; + size_t ll1Length = MatchANSIMultilineComment(prefix); + TStringBuf ll1Content = prefix.SubString(0, ll1Length); - prefix.Skip(2); - skipped += 2; + Y_ENSURE(ll1Content == 0 || basic->Content <= ll1Content); + if (ll1Content == 0) { + return basic; + } - for (;;) { - if (prefix.StartsWith("*/")) { - prefix.Skip(2); - skipped += 2; - return skipped; - } + return TGenericToken{ + .Name = name, + .Content = ll1Content, + }; + }; +} - bool isSkipped = false; - if (prefix.StartsWith("/*")) { - size_t limit = prefix.rfind("*/"); - if (limit == std::string::npos) { - return 0; - } +size_t MatchANSIMultilineComment(TStringBuf prefix) { + if (!prefix.StartsWith("/*")) { + return 0; + } - size_t len = MatchANSIMultilineComment(prefix.Head(limit)); - prefix.Skip(len); - skipped += len; + size_t skipped = 0; - isSkipped = len != 0; - } + prefix.Skip(2); + skipped += 2; - if (isSkipped) { - continue; - } + for (;;) { + if (prefix.StartsWith("*/")) { + prefix.Skip(2); + skipped += 2; + return skipped; + } - if (prefix.size() == 0) { + bool isSkipped = false; + if (prefix.StartsWith("/*")) { + size_t limit = prefix.rfind("*/"); + if (limit == std::string::npos) { return 0; } - prefix.Skip(1); - skipped += 1; - } - } + size_t len = MatchANSIMultilineComment(prefix.Head(limit)); + prefix.Skip(len); + skipped += len; - TTokenMatcher KeywordMatcher(const NSQLReflect::TLexerGrammar& grammar) { - auto keyword = Compile("Keyword", KeywordPattern(grammar)); - return [keyword = std::move(keyword)](TStringBuf content) -> TMaybe<TGenericToken> { - if (auto token = keyword(content)) { - return TGenericToken{ - .Name = TLexerGrammar::KeywordNameByBlock(token->Content), - .Content = token->Content, - }; - } - return Nothing(); - }; - } - - TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar) { - TVector<TRegexPattern> patterns; - patterns.reserve(grammar.KeywordNames.size()); - for (const auto& keyword : grammar.KeywordNames) { - const TStringBuf content = TLexerGrammar::KeywordBlockByName(keyword); - patterns.push_back({ - .Body = TString(content), - .IsCaseInsensitive = true, - }); + isSkipped = len != 0; } - return Merged(std::move(patterns)); - } - TTokenMatcher PuntuationMatcher(const NSQLReflect::TLexerGrammar& grammar) { - THashMap<TString, TString> nameByBlock; - nameByBlock.reserve(grammar.PunctuationNames.size()); - for (const auto& name : grammar.PunctuationNames) { - const auto& block = grammar.BlockByName.at(name); - nameByBlock[block] = name; + if (isSkipped) { + continue; } - auto punct = Compile("Punctuation", PuntuationPattern(grammar)); + if (prefix.size() == 0) { + return 0; + } - return [nameByBlock = std::move(nameByBlock), - punct = std::move(punct)](TStringBuf content) -> TMaybe<TGenericToken> { - if (auto token = punct(content)) { - return TGenericToken{ - .Name = nameByBlock.at(token->Content), - .Content = token->Content, - }; - } - return Nothing(); - }; + prefix.Skip(1); + skipped += 1; } +} - TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar) { - TVector<TRegexPattern> patterns; - patterns.reserve(grammar.PunctuationNames.size()); - for (const auto& name : grammar.PunctuationNames) { - patterns.push_back({RE2::QuoteMeta(grammar.BlockByName.at(name))}); +TTokenMatcher KeywordMatcher(const NSQLReflect::TLexerGrammar& grammar) { + auto keyword = Compile("Keyword", KeywordPattern(grammar)); + return [keyword = std::move(keyword)](TStringBuf content) -> TMaybe<TGenericToken> { + if (auto token = keyword(content)) { + return TGenericToken{ + .Name = TLexerGrammar::KeywordNameByBlock(token->Content), + .Content = token->Content, + }; } - return Merged(std::move(patterns)); + return Nothing(); + }; +} + +TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar) { + TVector<TRegexPattern> patterns; + patterns.reserve(grammar.KeywordNames.size()); + for (const auto& keyword : grammar.KeywordNames) { + const TStringBuf content = TLexerGrammar::KeywordBlockByName(keyword); + patterns.push_back({ + .Body = TString(content), + .IsCaseInsensitive = true, + }); + } + return Merged(std::move(patterns)); +} + +TTokenMatcher PuntuationMatcher(const NSQLReflect::TLexerGrammar& grammar) { + THashMap<TString, TString> nameByBlock; + nameByBlock.reserve(grammar.PunctuationNames.size()); + for (const auto& name : grammar.PunctuationNames) { + const auto& block = grammar.BlockByName.at(name); + nameByBlock[block] = name; } - TGenericLexerGrammar MakeGenericLexerGrammar( - bool ansi, - const TLexerGrammar& grammar, - const TVector<std::tuple<TString, TString>>& regexByOtherName) { - TGenericLexerGrammar generic; - - generic.emplace_back(KeywordMatcher(grammar)); - generic.emplace_back(PuntuationMatcher(grammar)); + auto punct = Compile("Punctuation", PuntuationPattern(grammar)); - for (const auto& [name, regex] : regexByOtherName) { - generic.emplace_back(Compile(name, {regex})); - if (name == "COMMENT" && ansi) { - generic.back() = ANSICommentMatcher(name, std::move(generic.back())); - } + return [nameByBlock = std::move(nameByBlock), + punct = std::move(punct)](TStringBuf content) -> TMaybe<TGenericToken> { + if (auto token = punct(content)) { + return TGenericToken{ + .Name = nameByBlock.at(token->Content), + .Content = token->Content, + }; } + return Nothing(); + }; +} - return generic; +TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar) { + TVector<TRegexPattern> patterns; + patterns.reserve(grammar.PunctuationNames.size()); + for (const auto& name : grammar.PunctuationNames) { + patterns.push_back({RE2::QuoteMeta(grammar.BlockByName.at(name))}); } - - class TRegexLexer: public NSQLTranslation::ILexer { - public: - TRegexLexer(IGenericLexer::TPtr lexer) - : Lexer_(std::move(lexer)) - { + return Merged(std::move(patterns)); +} + +TGenericLexerGrammar MakeGenericLexerGrammar( + bool ansi, + const TLexerGrammar& grammar, + const TVector<std::tuple<TString, TString>>& regexByOtherName) { + TGenericLexerGrammar generic; + + generic.emplace_back(KeywordMatcher(grammar)); + generic.emplace_back(PuntuationMatcher(grammar)); + + for (const auto& [name, regex] : regexByOtherName) { + generic.emplace_back(Compile(name, {regex})); + if (name == "COMMENT" && ansi) { + generic.back() = ANSICommentMatcher(name, std::move(generic.back())); } + } - bool Tokenize( - const TString& query, - const TString& queryName, - const TTokenCallback& onNextToken, - NYql::TIssues& issues, - size_t maxErrors) override { - bool isFailed = false; - - const auto onNext = [&](TGenericToken&& token) { - if (token.Name == TGenericToken::Error) { - NYql::TPosition pos(token.Begin, 0, queryName); - TString message = TString("no candidates, skipping ") + token.Content; - issues.AddIssue(std::move(pos), std::move(message)); - isFailed = true; - return; - } - - onNextToken({ - .Name = TString(token.Name), - .Content = TString(token.Content), - }); - }; + return generic; +} - Lexer_->Tokenize(query, onNext, maxErrors); - return !isFailed; - } +class TRegexLexer: public NSQLTranslation::ILexer { +public: + TRegexLexer(IGenericLexer::TPtr lexer) + : Lexer_(std::move(lexer)) + { + } - private: - IGenericLexer::TPtr Lexer_; - }; + bool Tokenize( + const TString& query, + const TString& queryName, + const TTokenCallback& onNextToken, + NYql::TIssues& issues, + size_t maxErrors) override { + bool isFailed = false; + + const auto onNext = [&](TGenericToken&& token) { + if (token.Name == TGenericToken::Error) { + NYql::TPosition pos(token.Begin, 0, queryName); + TString message = TString("no candidates, skipping ") + token.Content; + issues.AddIssue(std::move(pos), std::move(message)); + isFailed = true; + return; + } - namespace { + onNextToken({ + .Name = TString(token.Name), + .Content = TString(token.Content), + }); + }; - class TFactory final: public NSQLTranslation::ILexerFactory { - public: - explicit TFactory(bool ansi) { - auto grammar = NSQLReflect::LoadLexerGrammar(); - auto regexes = MakeRegexByOtherName(grammar, ansi); - Lexer_ = MakeGenericLexer(MakeGenericLexerGrammar(ansi, grammar, regexes)); - } + Lexer_->Tokenize(query, onNext, maxErrors); + return !isFailed; + } - NSQLTranslation::ILexer::TPtr MakeLexer() const override { - return NSQLTranslation::ILexer::TPtr( - new TRegexLexer(Lexer_)); - } +private: + IGenericLexer::TPtr Lexer_; +}; - private: - IGenericLexer::TPtr Lexer_; - }; +namespace { - } // namespace +class TFactory final: public NSQLTranslation::ILexerFactory { +public: + explicit TFactory(bool ansi) { + auto grammar = NSQLReflect::LoadLexerGrammar(); + auto regexes = MakeRegexByOtherName(grammar, ansi); + Lexer_ = MakeGenericLexer(MakeGenericLexerGrammar(ansi, grammar, regexes)); + } - NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi) { - return NSQLTranslation::TLexerFactoryPtr(new TFactory(ansi)); + NSQLTranslation::ILexer::TPtr MakeLexer() const override { + return NSQLTranslation::ILexer::TPtr( + new TRegexLexer(Lexer_)); } +private: + IGenericLexer::TPtr Lexer_; +}; + +} // namespace + +NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi) { + return NSQLTranslation::TLexerFactoryPtr(new TFactory(ansi)); +} + } // namespace NSQLTranslationV1 |
