| author | robot-piglet <[email protected]> | 2025-05-12 13:53:24 +0300 |
|---|---|---|
| committer | robot-piglet <[email protected]> | 2025-05-12 14:05:50 +0300 |
| commit | 7a941ebd252fd7442b4d1d34d31d72e971ad20bf (patch) | |
| tree | 70c132d1b611697ad23b90cf35215b035f247ec0 /yql/essentials/sql/v1/lexer/regex/lexer.cpp | |
| parent | bf1279129bcf6c1b1001e39c39a13d80737898d3 (diff) | |
Intermediate changes
commit_hash:3a624a323006078de71f50747f7b2e8cadba7ccd
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex/lexer.cpp')
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/lexer.cpp | 304 |
1 file changed, 113 insertions, 191 deletions
```diff
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
index a1d96253bf7..58c98edfd31 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
@@ -1,5 +1,6 @@
 #include "lexer.h"
 
+#include "generic.h"
 #include "regex.h"
 
 #include <contrib/libs/re2/re2/re2.h>
@@ -9,256 +10,177 @@
 #include <util/generic/algorithm.h>
 #include <util/generic/string.h>
+#include <util/generic/maybe.h>
 #include <util/string/subst.h>
 #include <util/string/ascii.h>
 
 namespace NSQLTranslationV1 {
 
+    using NSQLReflect::TLexerGrammar;
     using NSQLTranslation::TParsedToken;
     using NSQLTranslation::TParsedTokenList;
 
-    class TRegexLexer: public NSQLTranslation::ILexer {
-        static constexpr const char* CommentTokenName = "COMMENT";
-        static constexpr const char* StringValueName = "STRING_VALUE";
-
-        static constexpr const TStringBuf Utf8BOM = "\xEF\xBB\xBF";
-
-    public:
-        TRegexLexer(
-            bool ansi,
-            NSQLReflect::TLexerGrammar grammar,
-            const TVector<std::tuple<TString, TString>>& RegexByOtherName)
-            : Grammar_(std::move(grammar))
-            , Ansi_(ansi)
-        {
-            for (const auto& [token, regex] : RegexByOtherName) {
-                RE2::Options custom;
-                if (token != CommentTokenName && token != StringValueName) {
-                    custom.set_longest_match(true);
-                }
+    size_t MatchANSIMultilineComment(TStringBuf remaining);
 
-                RE2* re2 = new RE2(regex, custom);
-                if (token == CommentTokenName) {
-                    CommentRegex_.Reset(re2);
-                } else {
-                    OtherRegexes_.emplace_back(token, re2);
-                }
+    TTokenMatcher ANSICommentMatcher(TTokenMatcher defaultComment) {
+        return [defaultComment](TStringBuf prefix) -> TMaybe<TStringBuf> {
+            const auto basic = defaultComment(prefix);
+            if (basic.Empty()) {
+                return Nothing();
             }
-        }
-
-        bool Tokenize(
-            const TString& query,
-            const TString& queryName,
-            const TTokenCallback& onNextToken,
-            NYql::TIssues& issues,
-            size_t maxErrors) override {
-            size_t errors = 0;
-            size_t pos = 0;
 
-            if (query.StartsWith(Utf8BOM)) {
-                pos += Utf8BOM.size();
+            if (!prefix.StartsWith("/*")) {
+                return basic;
             }
 
-            while (pos < query.size()) {
-                TParsedToken matched = Match(TStringBuf(query, pos));
-
-                if (matched.Name.empty() && maxErrors == errors) {
-                    break;
-                }
-
-                if (matched.Name.empty()) {
-                    pos += 1;
-                    errors += 1;
-                    issues.AddIssue(NYql::TPosition(pos, 0, queryName), "no candidates");
-                    continue;
-                }
+            size_t ll1Length = MatchANSIMultilineComment(prefix);
+            TStringBuf ll1Content = prefix.SubString(0, ll1Length);
 
-                pos += matched.Content.length();
-                onNextToken(std::move(matched));
+            Y_ENSURE(ll1Content == 0 || basic <= ll1Content);
+            if (ll1Content == 0) {
+                return basic;
             }
 
-            onNextToken(TParsedToken{.Name = "EOF", .Content = "<EOF>"});
-            return errors == 0;
+            return ll1Content;
+        };
+    }
+
+    size_t MatchANSIMultilineComment(TStringBuf prefix) {
+        if (!prefix.StartsWith("/*")) {
+            return 0;
         }
 
-    private:
-        TParsedToken Match(const TStringBuf prefix) {
-            TParsedTokenList matches;
+        size_t skipped = 0;
 
-            size_t keywordCount = MatchKeyword(prefix, matches);
-            MatchPunctuation(prefix, matches);
-            MatchRegex(prefix, matches);
-            MatchComment(prefix, matches);
+        prefix.Skip(2);
+        skipped += 2;
 
-            if (matches.empty()) {
-                return {};
+        for (;;) {
+            if (prefix.StartsWith("*/")) {
+                prefix.Skip(2);
+                skipped += 2;
+                return skipped;
             }
 
-            auto maxLength = MaxElementBy(matches, [](const TParsedToken& m) {
-                return m.Content.length();
-            })->Content.length();
-
-            auto max = FindIf(matches, [&](const TParsedToken& m) {
-                return m.Content.length() == maxLength;
-            });
-
-            auto isMatched = [&](const TStringBuf name) {
-                return std::end(matches) != FindIf(matches, [&](const auto& m) {
-                    return m.Name == name;
-                });
-            };
-
-            size_t conflicts = CountIf(matches, [&](const TParsedToken& m) {
-                return m.Content.length() == max->Content.length();
-            });
-            conflicts -= 1;
-
-            Y_ENSURE(
-                conflicts == 0 ||
-                (conflicts == 1 && keywordCount != 0 && isMatched("ID_PLAIN")) ||
-                (conflicts == 1 && isMatched("DIGITS") && isMatched("INTEGER_VALUE")));
-
-            Y_ENSURE(!max->Content.empty());
-            return *max;
-        }
-
-        bool MatchKeyword(const TStringBuf prefix, TParsedTokenList& matches) {
-            size_t count = 0;
-            for (const auto& keyword : Grammar_.KeywordNames) {
-                const TStringBuf block = NSQLReflect::TLexerGrammar::KeywordBlock(keyword);
-                const TStringBuf content = prefix.substr(0, block.length());
-                if (AsciiEqualsIgnoreCase(content, block)) {
-                    matches.emplace_back(keyword, TString(content));
-                    count += 1;
+            bool isSkipped = false;
+            if (prefix.StartsWith("/*")) {
+                size_t limit = prefix.rfind("*/");
+                if (limit == std::string::npos) {
+                    return 0;
                 }
-            }
-            return count;
-        }
 
-        size_t MatchPunctuation(const TStringBuf prefix, TParsedTokenList& matches) {
-            size_t count = 0;
-            for (const auto& name : Grammar_.PunctuationNames) {
-                const auto& content = Grammar_.BlockByName.at(name);
-                if (prefix.substr(0, content.length()) == content) {
-                    matches.emplace_back(name, content);
-                    count += 1;
-                }
-            }
-            return count;
-        }
+                size_t len = MatchANSIMultilineComment(prefix.Head(limit));
+                prefix.Skip(len);
+                skipped += len;
 
-        size_t MatchRegex(const TStringBuf prefix, TParsedTokenList& matches) {
-            size_t count = 0;
-            for (const auto& [token, regex] : OtherRegexes_) {
-                if (const TStringBuf match = TryMatchRegex(prefix, *regex); !match.empty()) {
-                    matches.emplace_back(token, TString(match));
-                    count += 1;
-                }
+                isSkipped = len != 0;
             }
-            return count;
-        }
 
-        const TStringBuf TryMatchRegex(const TStringBuf prefix, const RE2& regex) {
-            re2::StringPiece input(prefix.data(), prefix.size());
-            if (RE2::Consume(&input, regex)) {
-                return TStringBuf(prefix.data(), input.data());
+            if (isSkipped) {
+                continue;
             }
-            return "";
-        }
 
-        size_t MatchComment(const TStringBuf prefix, TParsedTokenList& matches) {
-            const TStringBuf reContent = TryMatchRegex(prefix, *CommentRegex_);
-            if (reContent.empty()) {
+            if (prefix.size() == 0) {
                 return 0;
             }
 
-            if (!(Ansi_ && prefix.StartsWith("/*"))) {
-                matches.emplace_back(CommentTokenName, TString(reContent));
-                return 1;
-            }
-
-            size_t ll1Length = MatchANSIMultilineComment(prefix);
-            const TStringBuf ll1Content = prefix.SubString(0, ll1Length);
-
-            Y_ENSURE(ll1Content == 0 || reContent <= ll1Content);
-            if (ll1Content == 0) {
-                matches.emplace_back(CommentTokenName, TString(reContent));
-                return 1;
-            }
-
-            matches.emplace_back(CommentTokenName, TString(ll1Content));
-            return 1;
+            prefix.Skip(1);
+            skipped += 1;
         }
+    }
 
-        size_t MatchANSIMultilineComment(TStringBuf remaining) {
-            if (!remaining.StartsWith("/*")) {
-                return 0;
-            }
+    TGenericLexerGrammar MakeGenericLexerGrammar(
+        bool ansi,
+        const TLexerGrammar& grammar,
+        const TVector<std::tuple<TString, TString>>& regexByOtherName) {
+        TGenericLexerGrammar generic;
 
-            size_t skipped = 0;
+        for (const auto& name : grammar.KeywordNames) {
+            auto matcher = Compile({
+                .Body = TString(TLexerGrammar::KeywordBlock(name)),
+                .IsCaseInsensitive = true,
+            });
+            generic.emplace_back(name, std::move(matcher));
+        }
 
-            remaining.Skip(2);
-            skipped += 2;
+        for (const auto& name : grammar.PunctuationNames) {
+            generic.emplace_back(
+                name,
+                Compile({RE2::QuoteMeta(grammar.BlockByName.at(name))}));
+        }
 
-            for (;;) {
-                if (remaining.StartsWith("*/")) {
-                    remaining.Skip(2);
-                    skipped += 2;
-                    return skipped;
-                }
+        for (const auto& [name, regex] : regexByOtherName) {
+            auto matcher = Compile({
+                .Body = regex,
+            });
+            generic.emplace_back(name, std::move(matcher));
+        }
 
-                bool isSkipped = false;
-                if (remaining.StartsWith("/*")) {
-                    size_t limit = remaining.rfind("*/");
-                    if (limit == std::string::npos) {
-                        return 0;
-                    }
+        if (ansi) {
+            auto it = FindIf(generic, [](const auto& m) {
+                return m.TokenName == "COMMENT";
+            });
+            Y_ENSURE(it != std::end(generic));
+            it->Match = ANSICommentMatcher(it->Match);
+        }
 
-                    size_t len = MatchANSIMultilineComment(remaining.Head(limit));
-                    remaining.Skip(len);
-                    skipped += len;
+        return generic;
+    }
 
-                    isSkipped = len != 0;
-                }
+    class TRegexLexer: public NSQLTranslation::ILexer {
+    public:
+        TRegexLexer(IGenericLexer::TPtr lexer)
+            : Lexer_(std::move(lexer))
+        {
+        }
 
-                if (isSkipped) {
-                    continue;
+        bool Tokenize(
+            const TString& query,
+            const TString& queryName,
+            const TTokenCallback& onNextToken,
+            NYql::TIssues& issues,
+            size_t maxErrors) override {
+            bool isFailed = false;
+
+            const auto onNext = [&](TGenericToken&& token) {
+                if (token.Name == TGenericToken::Error) {
+                    NYql::TPosition pos(token.Begin, 0, queryName);
+                    TString message = TString("no candidates, skipping ") + token.Content;
+                    issues.AddIssue(std::move(pos), std::move(message));
+                    isFailed = true;
+                    return;
                 }
 
-                if (remaining.size() == 0) {
-                    return 0;
-                }
+                onNextToken({
+                    .Name = TString(token.Name),
+                    .Content = TString(token.Content),
+                });
+            };
 
-                remaining.Skip(1);
-                skipped += 1;
-            }
+            Lexer_->Tokenize(query, onNext, maxErrors);
+            return !isFailed;
         }
 
-        NSQLReflect::TLexerGrammar Grammar_;
-        TVector<std::tuple<TString, THolder<RE2>>> OtherRegexes_;
-        THolder<RE2> CommentRegex_;
-        bool Ansi_;
+    private:
+        IGenericLexer::TPtr Lexer_;
     };
 
     namespace {
 
         class TFactory final: public NSQLTranslation::ILexerFactory {
        public:
-            explicit TFactory(bool ansi)
-                : Ansi_(ansi)
-                , Grammar_(NSQLReflect::LoadLexerGrammar())
-                , RegexByOtherName_(MakeRegexByOtherName(Grammar_, Ansi_))
-            {
+            explicit TFactory(bool ansi) {
+                auto grammar = NSQLReflect::LoadLexerGrammar();
+                auto regexes = MakeRegexByOtherName(grammar, ansi);
+                Lexer_ = MakeGenericLexer(MakeGenericLexerGrammar(ansi, grammar, regexes));
             }
 
             NSQLTranslation::ILexer::TPtr MakeLexer() const override {
                 return NSQLTranslation::ILexer::TPtr(
-                    new TRegexLexer(Ansi_, Grammar_, RegexByOtherName_));
+                    new TRegexLexer(Lexer_));
             }
 
         private:
-            bool Ansi_;
-            NSQLReflect::TLexerGrammar Grammar_;
-            TVector<std::tuple<TString, TString>> RegexByOtherName_;
+            IGenericLexer::TPtr Lexer_;
         };
 
     } // namespace
```
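The core idea of the ANSI handling above is a decorator: the plain regex-based `COMMENT` matcher is wrapped so that, when the input starts with `/*`, a nesting-aware scan may extend the match. Below is a minimal standalone sketch of that pattern using only the C++ standard library; it is not the project's API. `TMatcher`, `AnsiCommentMatcher`, and `MatchNestedComment` are simplified stand-ins for `TTokenMatcher`, `ANSICommentMatcher`, and `MatchANSIMultilineComment` (which additionally handle unterminated nested comments via `rfind`), and `std::optional<std::string_view>` stands in for `TMaybe<TStringBuf>` from `generic.h`.

```cpp
// Standalone sketch (assumed names, not the YQL API): decorating a default
// COMMENT matcher with a nesting-aware ANSI multiline-comment scan.
#include <cstddef>
#include <functional>
#include <iostream>
#include <optional>
#include <string_view>

// A matcher returns the matched prefix of the input, or nothing.
using TMatcher = std::function<std::optional<std::string_view>(std::string_view)>;

// Length of a nested /* ... */ comment starting at `input`, or 0 if unterminated.
std::size_t MatchNestedComment(std::string_view input) {
    if (input.substr(0, 2) != "/*") {
        return 0;
    }
    std::size_t pos = 2;
    std::size_t depth = 1;
    while (pos < input.size() && depth > 0) {
        if (input.substr(pos, 2) == "*/") {
            --depth;
            pos += 2;
        } else if (input.substr(pos, 2) == "/*") {
            ++depth;
            pos += 2;
        } else {
            ++pos;
        }
    }
    return depth == 0 ? pos : 0;
}

// Decorator: for "/*" prefixes, prefer the nesting-aware match when it
// succeeds, otherwise fall back to the default matcher's result.
TMatcher AnsiCommentMatcher(TMatcher defaultComment) {
    return [defaultComment](std::string_view prefix) -> std::optional<std::string_view> {
        auto basic = defaultComment(prefix);
        if (!basic || prefix.substr(0, 2) != "/*") {
            return basic;
        }
        std::size_t len = MatchNestedComment(prefix);
        return len == 0 ? basic : std::optional(prefix.substr(0, len));
    };
}

int main() {
    // Non-nesting default: match up to the first "*/".
    TMatcher basic = [](std::string_view s) -> std::optional<std::string_view> {
        if (s.substr(0, 2) != "/*") {
            return std::nullopt;
        }
        auto end = s.find("*/", 2);
        return end == std::string_view::npos
            ? std::nullopt
            : std::optional(s.substr(0, end + 2));
    };

    TMatcher ansi = AnsiCommentMatcher(basic);
    std::string_view query = "/* outer /* inner */ still comment */ SELECT 1;";
    std::cout << "basic: " << basic(query)->size() << " chars\n"; // stops at the first */
    std::cout << "ansi:  " << ansi(query)->size() << " chars\n";  // spans the whole nested comment
    return 0;
}
```

With this structure, `MakeGenericLexerGrammar` can keep every token as a plain compiled matcher and only rewrap the `COMMENT` entry when ANSI mode is requested, which is what the `if (ansi)` branch in the diff does.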