summary refs log tree commit diff stats
path: root/yql/essentials/sql/v1/lexer/regex/lexer.cpp
diff options
context:
space:
mode:
author    vitya-smirnov <[email protected]>  2025-10-07 09:34:39 +0300
committer vitya-smirnov <[email protected]>  2025-10-07 09:52:14 +0300
commit    babe7533f18c11be1f8a195ed2324d2d9a89436a (patch)
tree      45b7627141bf5a52b45a3d61fd1fbdd564bb8dd9 /yql/essentials/sql/v1/lexer/regex/lexer.cpp
parent    8fe7cfe254fde2772477a8933a163b5f303716b4 (diff)
YQL-20086 sql/v1
commit_hash:55bc611cdaa0d8a0fc3c4c7708ed9f17cc4976cf
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex/lexer.cpp')
-rw-r--r--  yql/essentials/sql/v1/lexer/regex/lexer.cpp | 344
1 files changed, 172 insertions, 172 deletions
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
index 5d48c092716..7b9f2ba6e33 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
@@ -17,219 +17,219 @@
namespace NSQLTranslationV1 {
- using NSQLReflect::TLexerGrammar;
- using NSQLTranslation::TParsedToken;
- using NSQLTranslation::TParsedTokenList;
+using NSQLReflect::TLexerGrammar;
+using NSQLTranslation::TParsedToken;
+using NSQLTranslation::TParsedTokenList;
- size_t MatchANSIMultilineComment(TStringBuf remaining);
+size_t MatchANSIMultilineComment(TStringBuf remaining);
- TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment) {
- return [defaultComment, name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> {
- const auto basic = defaultComment(prefix);
- if (basic.Empty()) {
- return Nothing();
- }
-
- if (!prefix.StartsWith("/*")) {
- return basic;
- }
-
- size_t ll1Length = MatchANSIMultilineComment(prefix);
- TStringBuf ll1Content = prefix.SubString(0, ll1Length);
-
- Y_ENSURE(ll1Content == 0 || basic->Content <= ll1Content);
- if (ll1Content == 0) {
- return basic;
- }
-
- return TGenericToken{
- .Name = name,
- .Content = ll1Content,
- };
- };
- }
+TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment) {
+ return [defaultComment, name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> {
+ const auto basic = defaultComment(prefix);
+ if (basic.Empty()) {
+ return Nothing();
+ }
- size_t MatchANSIMultilineComment(TStringBuf prefix) {
if (!prefix.StartsWith("/*")) {
- return 0;
+ return basic;
}
- size_t skipped = 0;
+ size_t ll1Length = MatchANSIMultilineComment(prefix);
+ TStringBuf ll1Content = prefix.SubString(0, ll1Length);
- prefix.Skip(2);
- skipped += 2;
+ Y_ENSURE(ll1Content == 0 || basic->Content <= ll1Content);
+ if (ll1Content == 0) {
+ return basic;
+ }
- for (;;) {
- if (prefix.StartsWith("*/")) {
- prefix.Skip(2);
- skipped += 2;
- return skipped;
- }
+ return TGenericToken{
+ .Name = name,
+ .Content = ll1Content,
+ };
+ };
+}
- bool isSkipped = false;
- if (prefix.StartsWith("/*")) {
- size_t limit = prefix.rfind("*/");
- if (limit == std::string::npos) {
- return 0;
- }
+size_t MatchANSIMultilineComment(TStringBuf prefix) {
+ if (!prefix.StartsWith("/*")) {
+ return 0;
+ }
- size_t len = MatchANSIMultilineComment(prefix.Head(limit));
- prefix.Skip(len);
- skipped += len;
+ size_t skipped = 0;
- isSkipped = len != 0;
- }
+ prefix.Skip(2);
+ skipped += 2;
- if (isSkipped) {
- continue;
- }
+ for (;;) {
+ if (prefix.StartsWith("*/")) {
+ prefix.Skip(2);
+ skipped += 2;
+ return skipped;
+ }
- if (prefix.size() == 0) {
+ bool isSkipped = false;
+ if (prefix.StartsWith("/*")) {
+ size_t limit = prefix.rfind("*/");
+ if (limit == std::string::npos) {
return 0;
}
- prefix.Skip(1);
- skipped += 1;
- }
- }
+ size_t len = MatchANSIMultilineComment(prefix.Head(limit));
+ prefix.Skip(len);
+ skipped += len;
- TTokenMatcher KeywordMatcher(const NSQLReflect::TLexerGrammar& grammar) {
- auto keyword = Compile("Keyword", KeywordPattern(grammar));
- return [keyword = std::move(keyword)](TStringBuf content) -> TMaybe<TGenericToken> {
- if (auto token = keyword(content)) {
- return TGenericToken{
- .Name = TLexerGrammar::KeywordNameByBlock(token->Content),
- .Content = token->Content,
- };
- }
- return Nothing();
- };
- }
-
- TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar) {
- TVector<TRegexPattern> patterns;
- patterns.reserve(grammar.KeywordNames.size());
- for (const auto& keyword : grammar.KeywordNames) {
- const TStringBuf content = TLexerGrammar::KeywordBlockByName(keyword);
- patterns.push_back({
- .Body = TString(content),
- .IsCaseInsensitive = true,
- });
+ isSkipped = len != 0;
}
- return Merged(std::move(patterns));
- }
- TTokenMatcher PuntuationMatcher(const NSQLReflect::TLexerGrammar& grammar) {
- THashMap<TString, TString> nameByBlock;
- nameByBlock.reserve(grammar.PunctuationNames.size());
- for (const auto& name : grammar.PunctuationNames) {
- const auto& block = grammar.BlockByName.at(name);
- nameByBlock[block] = name;
+ if (isSkipped) {
+ continue;
}
- auto punct = Compile("Punctuation", PuntuationPattern(grammar));
+ if (prefix.size() == 0) {
+ return 0;
+ }
- return [nameByBlock = std::move(nameByBlock),
- punct = std::move(punct)](TStringBuf content) -> TMaybe<TGenericToken> {
- if (auto token = punct(content)) {
- return TGenericToken{
- .Name = nameByBlock.at(token->Content),
- .Content = token->Content,
- };
- }
- return Nothing();
- };
+ prefix.Skip(1);
+ skipped += 1;
}
+}
- TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar) {
- TVector<TRegexPattern> patterns;
- patterns.reserve(grammar.PunctuationNames.size());
- for (const auto& name : grammar.PunctuationNames) {
- patterns.push_back({RE2::QuoteMeta(grammar.BlockByName.at(name))});
+TTokenMatcher KeywordMatcher(const NSQLReflect::TLexerGrammar& grammar) {
+ auto keyword = Compile("Keyword", KeywordPattern(grammar));
+ return [keyword = std::move(keyword)](TStringBuf content) -> TMaybe<TGenericToken> {
+ if (auto token = keyword(content)) {
+ return TGenericToken{
+ .Name = TLexerGrammar::KeywordNameByBlock(token->Content),
+ .Content = token->Content,
+ };
}
- return Merged(std::move(patterns));
+ return Nothing();
+ };
+}
+
+TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar) {
+ TVector<TRegexPattern> patterns;
+ patterns.reserve(grammar.KeywordNames.size());
+ for (const auto& keyword : grammar.KeywordNames) {
+ const TStringBuf content = TLexerGrammar::KeywordBlockByName(keyword);
+ patterns.push_back({
+ .Body = TString(content),
+ .IsCaseInsensitive = true,
+ });
+ }
+ return Merged(std::move(patterns));
+}
+
+TTokenMatcher PuntuationMatcher(const NSQLReflect::TLexerGrammar& grammar) {
+ THashMap<TString, TString> nameByBlock;
+ nameByBlock.reserve(grammar.PunctuationNames.size());
+ for (const auto& name : grammar.PunctuationNames) {
+ const auto& block = grammar.BlockByName.at(name);
+ nameByBlock[block] = name;
}
- TGenericLexerGrammar MakeGenericLexerGrammar(
- bool ansi,
- const TLexerGrammar& grammar,
- const TVector<std::tuple<TString, TString>>& regexByOtherName) {
- TGenericLexerGrammar generic;
-
- generic.emplace_back(KeywordMatcher(grammar));
- generic.emplace_back(PuntuationMatcher(grammar));
+ auto punct = Compile("Punctuation", PuntuationPattern(grammar));
- for (const auto& [name, regex] : regexByOtherName) {
- generic.emplace_back(Compile(name, {regex}));
- if (name == "COMMENT" && ansi) {
- generic.back() = ANSICommentMatcher(name, std::move(generic.back()));
- }
+ return [nameByBlock = std::move(nameByBlock),
+ punct = std::move(punct)](TStringBuf content) -> TMaybe<TGenericToken> {
+ if (auto token = punct(content)) {
+ return TGenericToken{
+ .Name = nameByBlock.at(token->Content),
+ .Content = token->Content,
+ };
}
+ return Nothing();
+ };
+}
- return generic;
+TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar) {
+ TVector<TRegexPattern> patterns;
+ patterns.reserve(grammar.PunctuationNames.size());
+ for (const auto& name : grammar.PunctuationNames) {
+ patterns.push_back({RE2::QuoteMeta(grammar.BlockByName.at(name))});
}
-
- class TRegexLexer: public NSQLTranslation::ILexer {
- public:
- TRegexLexer(IGenericLexer::TPtr lexer)
- : Lexer_(std::move(lexer))
- {
+ return Merged(std::move(patterns));
+}
+
+TGenericLexerGrammar MakeGenericLexerGrammar(
+ bool ansi,
+ const TLexerGrammar& grammar,
+ const TVector<std::tuple<TString, TString>>& regexByOtherName) {
+ TGenericLexerGrammar generic;
+
+ generic.emplace_back(KeywordMatcher(grammar));
+ generic.emplace_back(PuntuationMatcher(grammar));
+
+ for (const auto& [name, regex] : regexByOtherName) {
+ generic.emplace_back(Compile(name, {regex}));
+ if (name == "COMMENT" && ansi) {
+ generic.back() = ANSICommentMatcher(name, std::move(generic.back()));
}
+ }
- bool Tokenize(
- const TString& query,
- const TString& queryName,
- const TTokenCallback& onNextToken,
- NYql::TIssues& issues,
- size_t maxErrors) override {
- bool isFailed = false;
-
- const auto onNext = [&](TGenericToken&& token) {
- if (token.Name == TGenericToken::Error) {
- NYql::TPosition pos(token.Begin, 0, queryName);
- TString message = TString("no candidates, skipping ") + token.Content;
- issues.AddIssue(std::move(pos), std::move(message));
- isFailed = true;
- return;
- }
-
- onNextToken({
- .Name = TString(token.Name),
- .Content = TString(token.Content),
- });
- };
+ return generic;
+}
- Lexer_->Tokenize(query, onNext, maxErrors);
- return !isFailed;
- }
+class TRegexLexer: public NSQLTranslation::ILexer {
+public:
+ TRegexLexer(IGenericLexer::TPtr lexer)
+ : Lexer_(std::move(lexer))
+ {
+ }
- private:
- IGenericLexer::TPtr Lexer_;
- };
+ bool Tokenize(
+ const TString& query,
+ const TString& queryName,
+ const TTokenCallback& onNextToken,
+ NYql::TIssues& issues,
+ size_t maxErrors) override {
+ bool isFailed = false;
+
+ const auto onNext = [&](TGenericToken&& token) {
+ if (token.Name == TGenericToken::Error) {
+ NYql::TPosition pos(token.Begin, 0, queryName);
+ TString message = TString("no candidates, skipping ") + token.Content;
+ issues.AddIssue(std::move(pos), std::move(message));
+ isFailed = true;
+ return;
+ }
- namespace {
+ onNextToken({
+ .Name = TString(token.Name),
+ .Content = TString(token.Content),
+ });
+ };
- class TFactory final: public NSQLTranslation::ILexerFactory {
- public:
- explicit TFactory(bool ansi) {
- auto grammar = NSQLReflect::LoadLexerGrammar();
- auto regexes = MakeRegexByOtherName(grammar, ansi);
- Lexer_ = MakeGenericLexer(MakeGenericLexerGrammar(ansi, grammar, regexes));
- }
+ Lexer_->Tokenize(query, onNext, maxErrors);
+ return !isFailed;
+ }
- NSQLTranslation::ILexer::TPtr MakeLexer() const override {
- return NSQLTranslation::ILexer::TPtr(
- new TRegexLexer(Lexer_));
- }
+private:
+ IGenericLexer::TPtr Lexer_;
+};
- private:
- IGenericLexer::TPtr Lexer_;
- };
+namespace {
- } // namespace
+class TFactory final: public NSQLTranslation::ILexerFactory {
+public:
+ explicit TFactory(bool ansi) {
+ auto grammar = NSQLReflect::LoadLexerGrammar();
+ auto regexes = MakeRegexByOtherName(grammar, ansi);
+ Lexer_ = MakeGenericLexer(MakeGenericLexerGrammar(ansi, grammar, regexes));
+ }
- NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi) {
- return NSQLTranslation::TLexerFactoryPtr(new TFactory(ansi));
+ NSQLTranslation::ILexer::TPtr MakeLexer() const override {
+ return NSQLTranslation::ILexer::TPtr(
+ new TRegexLexer(Lexer_));
}
+private:
+ IGenericLexer::TPtr Lexer_;
+};
+
+} // namespace
+
+NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi) {
+ return NSQLTranslation::TLexerFactoryPtr(new TFactory(ansi));
+}
+
} // namespace NSQLTranslationV1