path: root/yql/essentials/sql/v1/lexer/regex/lexer.cpp
author    robot-piglet <[email protected]>    2025-05-12 13:53:24 +0300
committer robot-piglet <[email protected]>    2025-05-12 14:05:50 +0300
commit    7a941ebd252fd7442b4d1d34d31d72e971ad20bf (patch)
tree      70c132d1b611697ad23b90cf35215b035f247ec0 /yql/essentials/sql/v1/lexer/regex/lexer.cpp
parent    bf1279129bcf6c1b1001e39c39a13d80737898d3 (diff)
Intermediate changes
commit_hash:3a624a323006078de71f50747f7b2e8cadba7ccd
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex/lexer.cpp')
-rw-r--r--  yql/essentials/sql/v1/lexer/regex/lexer.cpp | 304
1 file changed, 113 insertions(+), 191 deletions(-)
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
index a1d96253bf7..58c98edfd31 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
@@ -1,5 +1,6 @@
#include "lexer.h"
+#include "generic.h"
#include "regex.h"
#include <contrib/libs/re2/re2/re2.h>
@@ -9,256 +10,177 @@
#include <util/generic/algorithm.h>
#include <util/generic/string.h>
+#include <util/generic/maybe.h>
#include <util/string/subst.h>
#include <util/string/ascii.h>
namespace NSQLTranslationV1 {
+ using NSQLReflect::TLexerGrammar;
using NSQLTranslation::TParsedToken;
using NSQLTranslation::TParsedTokenList;
- class TRegexLexer: public NSQLTranslation::ILexer {
- static constexpr const char* CommentTokenName = "COMMENT";
- static constexpr const char* StringValueName = "STRING_VALUE";
-
- static constexpr const TStringBuf Utf8BOM = "\xEF\xBB\xBF";
-
- public:
- TRegexLexer(
- bool ansi,
- NSQLReflect::TLexerGrammar grammar,
- const TVector<std::tuple<TString, TString>>& RegexByOtherName)
- : Grammar_(std::move(grammar))
- , Ansi_(ansi)
- {
- for (const auto& [token, regex] : RegexByOtherName) {
- RE2::Options custom;
- if (token != CommentTokenName && token != StringValueName) {
- custom.set_longest_match(true);
- }
+ size_t MatchANSIMultilineComment(TStringBuf remaining);
- RE2* re2 = new RE2(regex, custom);
- if (token == CommentTokenName) {
- CommentRegex_.Reset(re2);
- } else {
- OtherRegexes_.emplace_back(token, re2);
- }
+ TTokenMatcher ANSICommentMatcher(TTokenMatcher defaultComment) {
+ return [defaultComment](TStringBuf prefix) -> TMaybe<TStringBuf> {
+ const auto basic = defaultComment(prefix);
+ if (basic.Empty()) {
+ return Nothing();
}
- }
-
- bool Tokenize(
- const TString& query,
- const TString& queryName,
- const TTokenCallback& onNextToken,
- NYql::TIssues& issues,
- size_t maxErrors) override {
- size_t errors = 0;
- size_t pos = 0;
- if (query.StartsWith(Utf8BOM)) {
- pos += Utf8BOM.size();
+ if (!prefix.StartsWith("/*")) {
+ return basic;
}
- while (pos < query.size()) {
- TParsedToken matched = Match(TStringBuf(query, pos));
-
- if (matched.Name.empty() && maxErrors == errors) {
- break;
- }
-
- if (matched.Name.empty()) {
- pos += 1;
- errors += 1;
- issues.AddIssue(NYql::TPosition(pos, 0, queryName), "no candidates");
- continue;
- }
+ size_t ll1Length = MatchANSIMultilineComment(prefix);
+ TStringBuf ll1Content = prefix.SubString(0, ll1Length);
- pos += matched.Content.length();
- onNextToken(std::move(matched));
+ Y_ENSURE(ll1Content == 0 || basic <= ll1Content);
+ if (ll1Content == 0) {
+ return basic;
}
- onNextToken(TParsedToken{.Name = "EOF", .Content = "<EOF>"});
- return errors == 0;
+ return ll1Content;
+ };
+ }
+
+ size_t MatchANSIMultilineComment(TStringBuf prefix) {
+ if (!prefix.StartsWith("/*")) {
+ return 0;
}
- private:
- TParsedToken Match(const TStringBuf prefix) {
- TParsedTokenList matches;
+ size_t skipped = 0;
- size_t keywordCount = MatchKeyword(prefix, matches);
- MatchPunctuation(prefix, matches);
- MatchRegex(prefix, matches);
- MatchComment(prefix, matches);
+ prefix.Skip(2);
+ skipped += 2;
- if (matches.empty()) {
- return {};
+ for (;;) {
+ if (prefix.StartsWith("*/")) {
+ prefix.Skip(2);
+ skipped += 2;
+ return skipped;
}
- auto maxLength = MaxElementBy(matches, [](const TParsedToken& m) {
- return m.Content.length();
- })->Content.length();
-
- auto max = FindIf(matches, [&](const TParsedToken& m) {
- return m.Content.length() == maxLength;
- });
-
- auto isMatched = [&](const TStringBuf name) {
- return std::end(matches) != FindIf(matches, [&](const auto& m) {
- return m.Name == name;
- });
- };
-
- size_t conflicts = CountIf(matches, [&](const TParsedToken& m) {
- return m.Content.length() == max->Content.length();
- });
- conflicts -= 1;
- Y_ENSURE(
- conflicts == 0 ||
- (conflicts == 1 && keywordCount != 0 && isMatched("ID_PLAIN")) ||
- (conflicts == 1 && isMatched("DIGITS") && isMatched("INTEGER_VALUE")));
-
- Y_ENSURE(!max->Content.empty());
- return *max;
- }
-
- bool MatchKeyword(const TStringBuf prefix, TParsedTokenList& matches) {
- size_t count = 0;
- for (const auto& keyword : Grammar_.KeywordNames) {
- const TStringBuf block = NSQLReflect::TLexerGrammar::KeywordBlock(keyword);
- const TStringBuf content = prefix.substr(0, block.length());
- if (AsciiEqualsIgnoreCase(content, block)) {
- matches.emplace_back(keyword, TString(content));
- count += 1;
+ bool isSkipped = false;
+ if (prefix.StartsWith("/*")) {
+ size_t limit = prefix.rfind("*/");
+ if (limit == std::string::npos) {
+ return 0;
}
- }
- return count;
- }
- size_t MatchPunctuation(const TStringBuf prefix, TParsedTokenList& matches) {
- size_t count = 0;
- for (const auto& name : Grammar_.PunctuationNames) {
- const auto& content = Grammar_.BlockByName.at(name);
- if (prefix.substr(0, content.length()) == content) {
- matches.emplace_back(name, content);
- count += 1;
- }
- }
- return count;
- }
+ size_t len = MatchANSIMultilineComment(prefix.Head(limit));
+ prefix.Skip(len);
+ skipped += len;
- size_t MatchRegex(const TStringBuf prefix, TParsedTokenList& matches) {
- size_t count = 0;
- for (const auto& [token, regex] : OtherRegexes_) {
- if (const TStringBuf match = TryMatchRegex(prefix, *regex); !match.empty()) {
- matches.emplace_back(token, TString(match));
- count += 1;
- }
+ isSkipped = len != 0;
}
- return count;
- }
- const TStringBuf TryMatchRegex(const TStringBuf prefix, const RE2& regex) {
- re2::StringPiece input(prefix.data(), prefix.size());
- if (RE2::Consume(&input, regex)) {
- return TStringBuf(prefix.data(), input.data());
+ if (isSkipped) {
+ continue;
}
- return "";
- }
- size_t MatchComment(const TStringBuf prefix, TParsedTokenList& matches) {
- const TStringBuf reContent = TryMatchRegex(prefix, *CommentRegex_);
- if (reContent.empty()) {
+ if (prefix.size() == 0) {
return 0;
}
- if (!(Ansi_ && prefix.StartsWith("/*"))) {
- matches.emplace_back(CommentTokenName, TString(reContent));
- return 1;
- }
-
- size_t ll1Length = MatchANSIMultilineComment(prefix);
- const TStringBuf ll1Content = prefix.SubString(0, ll1Length);
-
- Y_ENSURE(ll1Content == 0 || reContent <= ll1Content);
- if (ll1Content == 0) {
- matches.emplace_back(CommentTokenName, TString(reContent));
- return 1;
- }
-
- matches.emplace_back(CommentTokenName, TString(ll1Content));
- return 1;
+ prefix.Skip(1);
+ skipped += 1;
}
+ }
- size_t MatchANSIMultilineComment(TStringBuf remaining) {
- if (!remaining.StartsWith("/*")) {
- return 0;
- }
+ TGenericLexerGrammar MakeGenericLexerGrammar(
+ bool ansi,
+ const TLexerGrammar& grammar,
+ const TVector<std::tuple<TString, TString>>& regexByOtherName) {
+ TGenericLexerGrammar generic;
- size_t skipped = 0;
+ for (const auto& name : grammar.KeywordNames) {
+ auto matcher = Compile({
+ .Body = TString(TLexerGrammar::KeywordBlock(name)),
+ .IsCaseInsensitive = true,
+ });
+ generic.emplace_back(name, std::move(matcher));
+ }
- remaining.Skip(2);
- skipped += 2;
+ for (const auto& name : grammar.PunctuationNames) {
+ generic.emplace_back(
+ name, Compile({RE2::QuoteMeta(grammar.BlockByName.at(name))}));
+ }
- for (;;) {
- if (remaining.StartsWith("*/")) {
- remaining.Skip(2);
- skipped += 2;
- return skipped;
- }
+ for (const auto& [name, regex] : regexByOtherName) {
+ auto matcher = Compile({
+ .Body = regex,
+ });
+ generic.emplace_back(name, std::move(matcher));
+ }
- bool isSkipped = false;
- if (remaining.StartsWith("/*")) {
- size_t limit = remaining.rfind("*/");
- if (limit == std::string::npos) {
- return 0;
- }
+ if (ansi) {
+ auto it = FindIf(generic, [](const auto& m) {
+ return m.TokenName == "COMMENT";
+ });
+ Y_ENSURE(it != std::end(generic));
+ it->Match = ANSICommentMatcher(it->Match);
+ }
- size_t len = MatchANSIMultilineComment(remaining.Head(limit));
- remaining.Skip(len);
- skipped += len;
+ return generic;
+ }
- isSkipped = len != 0;
- }
+ class TRegexLexer: public NSQLTranslation::ILexer {
+ public:
+ TRegexLexer(IGenericLexer::TPtr lexer)
+ : Lexer_(std::move(lexer))
+ {
+ }
- if (isSkipped) {
- continue;
+ bool Tokenize(
+ const TString& query,
+ const TString& queryName,
+ const TTokenCallback& onNextToken,
+ NYql::TIssues& issues,
+ size_t maxErrors) override {
+ bool isFailed = false;
+
+ const auto onNext = [&](TGenericToken&& token) {
+ if (token.Name == TGenericToken::Error) {
+ NYql::TPosition pos(token.Begin, 0, queryName);
+ TString message = TString("no candidates, skipping ") + token.Content;
+ issues.AddIssue(std::move(pos), std::move(message));
+ isFailed = true;
+ return;
}
- if (remaining.size() == 0) {
- return 0;
- }
+ onNextToken({
+ .Name = TString(token.Name),
+ .Content = TString(token.Content),
+ });
+ };
- remaining.Skip(1);
- skipped += 1;
- }
+ Lexer_->Tokenize(query, onNext, maxErrors);
+ return !isFailed;
}
- NSQLReflect::TLexerGrammar Grammar_;
- TVector<std::tuple<TString, THolder<RE2>>> OtherRegexes_;
- THolder<RE2> CommentRegex_;
- bool Ansi_;
+ private:
+ IGenericLexer::TPtr Lexer_;
};
namespace {
class TFactory final: public NSQLTranslation::ILexerFactory {
public:
- explicit TFactory(bool ansi)
- : Ansi_(ansi)
- , Grammar_(NSQLReflect::LoadLexerGrammar())
- , RegexByOtherName_(MakeRegexByOtherName(Grammar_, Ansi_))
- {
+ explicit TFactory(bool ansi) {
+ auto grammar = NSQLReflect::LoadLexerGrammar();
+ auto regexes = MakeRegexByOtherName(grammar, ansi);
+ Lexer_ = MakeGenericLexer(MakeGenericLexerGrammar(ansi, grammar, regexes));
}
NSQLTranslation::ILexer::TPtr MakeLexer() const override {
return NSQLTranslation::ILexer::TPtr(
- new TRegexLexer(Ansi_, Grammar_, RegexByOtherName_));
+ new TRegexLexer(Lexer_));
}
private:
- bool Ansi_;
- NSQLReflect::TLexerGrammar Grammar_;
- TVector<std::tuple<TString, TString>> RegexByOtherName_;
+ IGenericLexer::TPtr Lexer_;
};
} // namespace
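
Note (editor): for readers unfamiliar with the matcher-combinator style the new code relies on, the sketch below is a minimal standalone analogue of ANSICommentMatcher and MatchANSIMultilineComment. It uses std::function, std::optional, and std::string_view instead of the Arcadia types (TTokenMatcher, TMaybe, TStringBuf) declared in generic.h, and the nesting logic is a simplified depth-counting variant rather than a line-for-line port, so all names and signatures here are illustrative assumptions, not the library API.

    #include <cstddef>
    #include <functional>
    #include <optional>
    #include <string_view>

    // Illustrative stand-in for TTokenMatcher: given the remaining input,
    // return the matched prefix or nothing.
    using TokenMatcher =
        std::function<std::optional<std::string_view>(std::string_view)>;

    // Length of a nested ANSI "/* ... */" comment at the start of `prefix`,
    // or 0 if there is no complete comment there (simplified analogue of
    // MatchANSIMultilineComment).
    size_t MatchNestedComment(std::string_view prefix) {
        if (prefix.substr(0, 2) != "/*") {
            return 0;
        }
        size_t pos = 2;
        size_t depth = 1;
        while (pos < prefix.size() && depth > 0) {
            if (prefix.compare(pos, 2, "*/") == 0) {
                --depth;
                pos += 2;
            } else if (prefix.compare(pos, 2, "/*") == 0) {
                ++depth;
                pos += 2;
            } else {
                ++pos;
            }
        }
        return depth == 0 ? pos : 0;  // unterminated comment -> no match
    }

    // Wrap an existing COMMENT matcher so that a nested block comment extends
    // the match in ANSI mode (analogue of ANSICommentMatcher).
    TokenMatcher AnsiCommentMatcher(TokenMatcher defaultComment) {
        return [defaultComment](std::string_view prefix)
                   -> std::optional<std::string_view> {
            auto basic = defaultComment(prefix);
            if (!basic || prefix.substr(0, 2) != "/*") {
                return basic;  // not a block comment: keep default behaviour
            }
            size_t nested = MatchNestedComment(prefix);
            if (nested == 0) {
                return basic;  // unterminated nesting: fall back to the regex match
            }
            return prefix.substr(0, nested);
        };
    }

A matcher built this way can be swapped in for the plain COMMENT matcher of the grammar, which is the same wiring MakeGenericLexerGrammar performs for the ANSI dialect in the diff above.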