author     robot-piglet <[email protected]>  2025-05-12 13:53:24 +0300
committer  robot-piglet <[email protected]>  2025-05-12 14:05:50 +0300
commit     7a941ebd252fd7442b4d1d34d31d72e971ad20bf (patch)
tree       70c132d1b611697ad23b90cf35215b035f247ec0 /yql/essentials/sql/v1/lexer/regex
parent     bf1279129bcf6c1b1001e39c39a13d80737898d3 (diff)
Intermediate changes
commit_hash:3a624a323006078de71f50747f7b2e8cadba7ccd
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex')
-rw-r--r--  yql/essentials/sql/v1/lexer/regex/generic.cpp  | 127
-rw-r--r--  yql/essentials/sql/v1/lexer/regex/generic.h    |  56
-rw-r--r--  yql/essentials/sql/v1/lexer/regex/lexer.cpp    | 304
-rw-r--r--  yql/essentials/sql/v1/lexer/regex/lexer.h      |   4
-rw-r--r--  yql/essentials/sql/v1/lexer/regex/regex.cpp    |  16
-rw-r--r--  yql/essentials/sql/v1/lexer/regex/regex_ut.cpp |   9
-rw-r--r--  yql/essentials/sql/v1/lexer/regex/ya.make      |   1
7 files changed, 325 insertions(+), 192 deletions(-)
diff --git a/yql/essentials/sql/v1/lexer/regex/generic.cpp b/yql/essentials/sql/v1/lexer/regex/generic.cpp
new file mode 100644
index 00000000000..2a451b4ef5c
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/generic.cpp
@@ -0,0 +1,127 @@
+#include "generic.h"
+
+#include <contrib/libs/re2/re2/re2.h>
+
+namespace NSQLTranslationV1 {
+
+    namespace {
+
+        TMaybe<TStringBuf> Match(TStringBuf prefix, const RE2& regex) {
+            re2::StringPiece input(prefix.data(), prefix.size());
+            if (RE2::Consume(&input, regex)) {
+                return TStringBuf(prefix.data(), input.data());
+            }
+            return Nothing();
+        }
+
+    } // namespace
+
+    class TGenericLexer: public IGenericLexer {
+    private:
+        static constexpr TStringBuf Utf8BOM = "\xEF\xBB\xBF";
+
+    public:
+        explicit TGenericLexer(TGenericLexerGrammar grammar)
+            : Grammar_(std::move(grammar))
+        {
+        }
+
+        virtual bool Tokenize(
+            TStringBuf text,
+            const TTokenCallback& onNext,
+            size_t maxErrors) const override {
+            Y_ENSURE(0 < maxErrors);
+            size_t errors = 0;
+
+            size_t pos = 0;
+            if (text.StartsWith(Utf8BOM)) {
+                pos += Utf8BOM.size();
+            }
+
+            while (pos < text.size() && errors < maxErrors) {
+                TGenericToken matched = Match(TStringBuf(text, pos));
+                matched.Begin = pos;
+
+                pos += matched.Content.size();
+
+                if (matched.Name == TGenericToken::Error) {
+                    errors += 1;
+                }
+
+                onNext(std::move(matched));
+            }
+
+            if (errors == maxErrors) {
+                return false;
+            }
+
+            onNext(TGenericToken{
+                .Name = "EOF",
+                .Content = "<EOF>",
+                .Begin = pos,
+            });
+
+            return errors == 0;
+        }
+
+    private:
+        TGenericToken Match(TStringBuf prefix) const {
+            TMaybe<TGenericToken> max;
+            Match(prefix, [&](TGenericToken&& token) {
+                if (max.Empty() || max->Content.size() < token.Content.size()) {
+                    max = std::move(token);
+                }
+            });
+
+            if (max) {
+                return *max;
+            }
+
+            return {
+                .Name = TGenericToken::Error,
+                .Content = prefix.substr(0, 1),
+            };
+        }
+
+        void Match(TStringBuf prefix, auto onMatch) const {
+            for (const auto& token : Grammar_) {
+                if (auto content = token.Match(prefix)) {
+                    onMatch(TGenericToken{
+                        .Name = token.TokenName,
+                        .Content = *content,
+                    });
+                }
+            }
+        }
+
+        TGenericLexerGrammar Grammar_;
+    };
+
+    TTokenMatcher Compile(const TRegexPattern& regex) {
+        RE2::Options options;
+        options.set_case_sensitive(!regex.IsCaseInsensitive);
+
+        return [bodyRe = MakeAtomicShared<RE2>(regex.Body, options),
+                afterRe = MakeAtomicShared<RE2>(regex.After, options)](TStringBuf prefix) -> TMaybe<TStringBuf> {
+            TMaybe<TStringBuf> body, after;
+            if ((body = Match(prefix, *bodyRe)) &&
+                (after = Match(prefix.Tail(body->size()), *afterRe))) {
+                return body;
+            }
+            return Nothing();
+        };
+    }
+
+    IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) {
+        return IGenericLexer::TPtr(new TGenericLexer(std::move(grammar)));
+    }
+
+    TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text) {
+        TVector<TGenericToken> tokens;
+        lexer->Tokenize(text, [&](TGenericToken&& token) {
+            tokens.emplace_back(std::move(token));
+        });
+        return tokens;
+    }
+
+} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/generic.h b/yql/essentials/sql/v1/lexer/regex/generic.h
new file mode 100644
index 00000000000..cde028cc599
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/generic.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <util/generic/ptr.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/generic/maybe.h>
+#include <util/generic/ylimits.h>
+
+#include <functional>
+
+namespace NSQLTranslationV1 {
+
+    struct TGenericToken {
+        static constexpr const char* Error = "<ERROR>";
+
+        TStringBuf Name;
+        TStringBuf Content;
+        size_t Begin = 0; // In bytes
+    };
+
+    class IGenericLexer: public TThrRefBase {
+    public:
+        using TPtr = TIntrusivePtr<IGenericLexer>;
+        using TTokenCallback = std::function<void(TGenericToken&& token)>;
+
+        static constexpr size_t MaxErrorsLimit = Max<size_t>();
+
+        virtual ~IGenericLexer() = default;
+        virtual bool Tokenize(
+            TStringBuf text,
+            const TTokenCallback& onNext,
+            size_t maxErrors = IGenericLexer::MaxErrorsLimit) const = 0;
+    };
+
+    using TTokenMatcher = std::function<TMaybe<TStringBuf>(TStringBuf prefix)>;
+
+    struct TTokenRule {
+        TString TokenName;
+        TTokenMatcher Match;
+    };
+
+    using TGenericLexerGrammar = TVector<TTokenRule>;
+
+    struct TRegexPattern {
+        TString Body;
+        TString After = "";
+        bool IsCaseInsensitive = false;
+    };
+
+    TTokenMatcher Compile(const TRegexPattern& regex);
+
+    IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar);
+
+    TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text);
+
+} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
index a1d96253bf7..58c98edfd31 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
@@ -1,5 +1,6 @@
 #include "lexer.h"
 
+#include "generic.h"
 #include "regex.h"
 
 #include <contrib/libs/re2/re2/re2.h>
@@ -9,256 +10,177 @@
 #include <util/generic/algorithm.h>
 #include <util/generic/string.h>
+#include <util/generic/maybe.h>
 #include <util/string/subst.h>
 #include <util/string/ascii.h>
 
 namespace NSQLTranslationV1 {
 
+    using NSQLReflect::TLexerGrammar;
     using NSQLTranslation::TParsedToken;
     using NSQLTranslation::TParsedTokenList;
 
-    class TRegexLexer: public NSQLTranslation::ILexer {
-        static constexpr const char* CommentTokenName = "COMMENT";
-        static constexpr const char* StringValueName = "STRING_VALUE";
-
-        static constexpr const TStringBuf Utf8BOM = "\xEF\xBB\xBF";
-
-    public:
-        TRegexLexer(
-            bool ansi,
-            NSQLReflect::TLexerGrammar grammar,
-            const TVector<std::tuple<TString, TString>>& RegexByOtherName)
-            : Grammar_(std::move(grammar))
-            , Ansi_(ansi)
-        {
-            for (const auto& [token, regex] : RegexByOtherName) {
-                RE2::Options custom;
-                if (token != CommentTokenName && token != StringValueName) {
-                    custom.set_longest_match(true);
-                }
+    size_t MatchANSIMultilineComment(TStringBuf remaining);
 
-                RE2* re2 = new RE2(regex, custom);
-                if (token == CommentTokenName) {
-                    CommentRegex_.Reset(re2);
-                } else {
-                    OtherRegexes_.emplace_back(token, re2);
-                }
+    TTokenMatcher ANSICommentMatcher(TTokenMatcher defaultComment) {
+        return [defaultComment](TStringBuf prefix) -> TMaybe<TStringBuf> {
+            const auto basic = defaultComment(prefix);
+            if (basic.Empty()) {
+                return Nothing();
             }
-        }
-
-        bool Tokenize(
-            const TString& query,
-            const TString& queryName,
-            const TTokenCallback& onNextToken,
-            NYql::TIssues& issues,
-            size_t maxErrors) override {
-            size_t errors = 0;
-            size_t pos = 0;
-            if (query.StartsWith(Utf8BOM)) {
-                pos += Utf8BOM.size();
+            if (!prefix.StartsWith("/*")) {
+                return basic;
             }
-            while (pos < query.size()) {
-                TParsedToken matched = Match(TStringBuf(query, pos));
-
-                if (matched.Name.empty() && maxErrors == errors) {
-                    break;
-                }
-
-                if (matched.Name.empty()) {
-                    pos += 1;
-                    errors += 1;
-                    issues.AddIssue(NYql::TPosition(pos, 0, queryName), "no candidates");
-                    continue;
-                }
+            size_t ll1Length = MatchANSIMultilineComment(prefix);
+            TStringBuf ll1Content = prefix.SubString(0, ll1Length);
 
-                pos += matched.Content.length();
-                onNextToken(std::move(matched));
+            Y_ENSURE(ll1Content == 0 || basic <= ll1Content);
+            if (ll1Content == 0) {
+                return basic;
             }
-            onNextToken(TParsedToken{.Name = "EOF", .Content = "<EOF>"});
-            return errors == 0;
+            return ll1Content;
+        };
+    }
+
+    size_t MatchANSIMultilineComment(TStringBuf prefix) {
+        if (!prefix.StartsWith("/*")) {
+            return 0;
         }
 
-    private:
-        TParsedToken Match(const TStringBuf prefix) {
-            TParsedTokenList matches;
+        size_t skipped = 0;
 
-            size_t keywordCount = MatchKeyword(prefix, matches);
-            MatchPunctuation(prefix, matches);
-            MatchRegex(prefix, matches);
-            MatchComment(prefix, matches);
+        prefix.Skip(2);
+        skipped += 2;
 
-            if (matches.empty()) {
-                return {};
+        for (;;) {
+            if (prefix.StartsWith("*/")) {
+                prefix.Skip(2);
+                skipped += 2;
+                return skipped;
             }
 
-            auto maxLength = MaxElementBy(matches, [](const TParsedToken& m) {
-                return m.Content.length();
-            })->Content.length();
-
-            auto max = FindIf(matches, [&](const TParsedToken& m) {
-                return m.Content.length() == maxLength;
-            });
-
-            auto isMatched = [&](const TStringBuf name) {
-                return std::end(matches) != FindIf(matches, [&](const auto& m) {
-                    return m.Name == name;
-                });
-            };
-
-            size_t conflicts = CountIf(matches, [&](const TParsedToken& m) {
-                return m.Content.length() == max->Content.length();
-            });
-            conflicts -= 1;
-            Y_ENSURE(
-                conflicts == 0 ||
-                (conflicts == 1 && keywordCount != 0 && isMatched("ID_PLAIN")) ||
-                (conflicts == 1 && isMatched("DIGITS") && isMatched("INTEGER_VALUE")));
-
-            Y_ENSURE(!max->Content.empty());
-            return *max;
-        }
-
-        bool MatchKeyword(const TStringBuf prefix, TParsedTokenList& matches) {
-            size_t count = 0;
-            for (const auto& keyword : Grammar_.KeywordNames) {
-                const TStringBuf block = NSQLReflect::TLexerGrammar::KeywordBlock(keyword);
-                const TStringBuf content = prefix.substr(0, block.length());
-                if (AsciiEqualsIgnoreCase(content, block)) {
-                    matches.emplace_back(keyword, TString(content));
-                    count += 1;
+            bool isSkipped = false;
+            if (prefix.StartsWith("/*")) {
+                size_t limit = prefix.rfind("*/");
+                if (limit == std::string::npos) {
+                    return 0;
                 }
-            }
-            return count;
-        }
 
-        size_t MatchPunctuation(const TStringBuf prefix, TParsedTokenList& matches) {
-            size_t count = 0;
-            for (const auto& name : Grammar_.PunctuationNames) {
-                const auto& content = Grammar_.BlockByName.at(name);
-                if (prefix.substr(0, content.length()) == content) {
-                    matches.emplace_back(name, content);
-                    count += 1;
-                }
-            }
-            return count;
-        }
+                size_t len = MatchANSIMultilineComment(prefix.Head(limit));
+                prefix.Skip(len);
+                skipped += len;
 
-        size_t MatchRegex(const TStringBuf prefix, TParsedTokenList& matches) {
-            size_t count = 0;
-            for (const auto& [token, regex] : OtherRegexes_) {
-                if (const TStringBuf match = TryMatchRegex(prefix, *regex); !match.empty()) {
-                    matches.emplace_back(token, TString(match));
-                    count += 1;
-                }
+                isSkipped = len != 0;
             }
-            return count;
-        }
 
-        const TStringBuf TryMatchRegex(const TStringBuf prefix, const RE2& regex) {
-            re2::StringPiece input(prefix.data(), prefix.size());
-            if (RE2::Consume(&input, regex)) {
-                return TStringBuf(prefix.data(), input.data());
+            if (isSkipped) {
+                continue;
             }
-            return "";
-        }
 
-        size_t MatchComment(const TStringBuf prefix, TParsedTokenList& matches) {
-            const TStringBuf reContent = TryMatchRegex(prefix, *CommentRegex_);
-            if (reContent.empty()) {
+            if (prefix.size() == 0) {
                 return 0;
             }
 
-            if (!(Ansi_ && prefix.StartsWith("/*"))) {
-                matches.emplace_back(CommentTokenName, TString(reContent));
-                return 1;
-            }
-
-            size_t ll1Length = MatchANSIMultilineComment(prefix);
-            const TStringBuf ll1Content = prefix.SubString(0, ll1Length);
-
-            Y_ENSURE(ll1Content == 0 || reContent <= ll1Content);
-            if (ll1Content == 0) {
-                matches.emplace_back(CommentTokenName, TString(reContent));
-                return 1;
-            }
-
-            matches.emplace_back(CommentTokenName, TString(ll1Content));
-            return 1;
+            prefix.Skip(1);
+            skipped += 1;
         }
+    }
 
-        size_t MatchANSIMultilineComment(TStringBuf remaining) {
-            if (!remaining.StartsWith("/*")) {
-                return 0;
-            }
+    TGenericLexerGrammar MakeGenericLexerGrammar(
+        bool ansi,
+        const TLexerGrammar& grammar,
+        const TVector<std::tuple<TString, TString>>& regexByOtherName) {
+        TGenericLexerGrammar generic;
 
-            size_t skipped = 0;
+        for (const auto& name : grammar.KeywordNames) {
+            auto matcher = Compile({
+                .Body = TString(TLexerGrammar::KeywordBlock(name)),
+                .IsCaseInsensitive = true,
+            });
+            generic.emplace_back(name, std::move(matcher));
+        }
 
-            remaining.Skip(2);
-            skipped += 2;
+        for (const auto& name : grammar.PunctuationNames) {
+            generic.emplace_back(
+                name, Compile({RE2::QuoteMeta(grammar.BlockByName.at(name))}));
+        }
 
-            for (;;) {
-                if (remaining.StartsWith("*/")) {
-                    remaining.Skip(2);
-                    skipped += 2;
-                    return skipped;
-                }
+        for (const auto& [name, regex] : regexByOtherName) {
+            auto matcher = Compile({
+                .Body = regex,
+            });
+            generic.emplace_back(name, std::move(matcher));
+        }
 
-                bool isSkipped = false;
-                if (remaining.StartsWith("/*")) {
-                    size_t limit = remaining.rfind("*/");
-                    if (limit == std::string::npos) {
-                        return 0;
-                    }
+        if (ansi) {
+            auto it = FindIf(generic, [](const auto& m) {
+                return m.TokenName == "COMMENT";
+            });
+            Y_ENSURE(it != std::end(generic));
+            it->Match = ANSICommentMatcher(it->Match);
+        }
 
-                    size_t len = MatchANSIMultilineComment(remaining.Head(limit));
-                    remaining.Skip(len);
-                    skipped += len;
+        return generic;
+    }
 
-                    isSkipped = len != 0;
-                }
+    class TRegexLexer: public NSQLTranslation::ILexer {
+    public:
+        TRegexLexer(IGenericLexer::TPtr lexer)
+            : Lexer_(std::move(lexer))
+        {
+        }
 
-                if (isSkipped) {
-                    continue;
+        bool Tokenize(
+            const TString& query,
+            const TString& queryName,
+            const TTokenCallback& onNextToken,
+            NYql::TIssues& issues,
+            size_t maxErrors) override {
+            bool isFailed = false;
+
+            const auto onNext = [&](TGenericToken&& token) {
+                if (token.Name == TGenericToken::Error) {
+                    NYql::TPosition pos(token.Begin, 0, queryName);
+                    TString message = TString("no candidates, skipping ") + token.Content;
+                    issues.AddIssue(std::move(pos), std::move(message));
+                    isFailed = true;
+                    return;
                 }
 
-                if (remaining.size() == 0) {
-                    return 0;
-                }
+                onNextToken({
+                    .Name = TString(token.Name),
+                    .Content = TString(token.Content),
+                });
+            };
 
-                remaining.Skip(1);
-                skipped += 1;
-            }
+            Lexer_->Tokenize(query, onNext, maxErrors);
+            return !isFailed;
         }
 
-        NSQLReflect::TLexerGrammar Grammar_;
-        TVector<std::tuple<TString, THolder<RE2>>> OtherRegexes_;
-        THolder<RE2> CommentRegex_;
-        bool Ansi_;
+    private:
+        IGenericLexer::TPtr Lexer_;
     };
 
     namespace {
 
         class TFactory final: public NSQLTranslation::ILexerFactory {
        public:
-            explicit TFactory(bool ansi)
-                : Ansi_(ansi)
-                , Grammar_(NSQLReflect::LoadLexerGrammar())
-                , RegexByOtherName_(MakeRegexByOtherName(Grammar_, Ansi_))
-            {
+            explicit TFactory(bool ansi) {
+                auto grammar = NSQLReflect::LoadLexerGrammar();
+                auto regexes = MakeRegexByOtherName(grammar, ansi);
+                Lexer_ = MakeGenericLexer(MakeGenericLexerGrammar(ansi, grammar, regexes));
            }
 
            NSQLTranslation::ILexer::TPtr MakeLexer() const override {
                return NSQLTranslation::ILexer::TPtr(
-                    new TRegexLexer(Ansi_, Grammar_, RegexByOtherName_));
+                    new TRegexLexer(Lexer_));
            }
 
        private:
-            bool Ansi_;
-            NSQLReflect::TLexerGrammar Grammar_;
-            TVector<std::tuple<TString, TString>> RegexByOtherName_;
+            IGenericLexer::TPtr Lexer_;
        };
 
    } // namespace
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.h b/yql/essentials/sql/v1/lexer/regex/lexer.h
index e9968954e1f..42d99a0a530 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.h
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.h
@@ -1,9 +1,13 @@
 #pragma once
 
+#include "generic.h"
+
 #include <yql/essentials/parser/lexer_common/lexer.h>
 
 namespace NSQLTranslationV1 {
 
+    TTokenMatcher ANSICommentMatcher(TTokenMatcher defaultComment);
+
     NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi);
 
 } // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/regex.cpp b/yql/essentials/sql/v1/lexer/regex/regex.cpp
index e634ff009a7..3f8af88eb4c 100644
--- a/yql/essentials/sql/v1/lexer/regex/regex.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/regex.cpp
@@ -45,6 +45,7 @@ namespace NSQLTranslationV1 {
 
         TString ToRegex(const TStringBuf name) {
             TString text = Grammar_->BlockByName.at(name);
+            Preprocess(text);
             Inline(text);
             Transform(text);
             Finalize(text);
@@ -52,6 +53,10 @@
         }
 
     private:
+        void Preprocess(TString& text) {
+            text = ChangedDigitsPrecendence(std::move(text));
+        }
+
         void Inline(TString& text) {
             ApplyEachWhileChanging(text, Inliners_);
         }
@@ -86,6 +91,8 @@
                 Grammar_->PunctuationNames.contains(name) ||
                 PunctuationFragments.contains(name)) {
                 def = "'" + def + "'";
+            } else if (name == "DIGITS") {
+                def = ChangedDigitsPrecendence(std::move(def));
             }
 
             def = QuoteAntlrRewrite(std::move(def));
@@ -95,6 +102,15 @@
             }
         }
 
+        // Regex engine matches the first matched alternative,
+        // even if it is not the longest one, while ANTLR is more gready.
+        TString ChangedDigitsPrecendence(TString body) {
+            if (SubstGlobal(body, "DECDIGITS | ", "") != 0) {
+                SubstGlobal(body, "BINDIGITS", "BINDIGITS | DECDIGITS");
+            }
+            return body;
+        }
+
         void Transform(TString& text) {
             ApplyEachWhileChanging(text, Transformations_);
         }
diff --git a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
index e62bb0e609f..8c7688aadcd 100644
--- a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
@@ -62,7 +62,14 @@ Y_UNIT_TEST_SUITE(SqlRegexTests) {
         CheckRegex(
             /* ansi = */ false,
             "DIGITS",
-            R"(([0-9]+)|(0[xX]([0-9]|[a-f]|[A-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+))");
+            R"((0[xX]([0-9]|[a-f]|[A-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+)|([0-9]+))");
     }
 
+    Y_UNIT_TEST(IntegerValue) {
+        CheckRegex(
+            /* ansi = */ false,
+            "INTEGER_VALUE",
+            R"(((0[xX]([0-9]|[a-f]|[A-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+)|([0-9]+))(([pP]|[uU])?([lL]|[sS]|[tT]|[iI]|[bB]|[nN])?))");
+    }
+
     Y_UNIT_TEST(Real) {
diff --git a/yql/essentials/sql/v1/lexer/regex/ya.make b/yql/essentials/sql/v1/lexer/regex/ya.make
index 249dfbd11df..3a7fe19b94c 100644
--- a/yql/essentials/sql/v1/lexer/regex/ya.make
+++ b/yql/essentials/sql/v1/lexer/regex/ya.make
@@ -28,6 +28,7 @@ CFLAGS(
 )
 
 SRCS(
+    generic.cpp
     lexer.cpp
     regex.cpp
 )