summaryrefslogtreecommitdiffstats
path: root/yql/essentials/sql/v1/lexer/regex
diff options
context:
space:
mode:
authorvitya-smirnov <[email protected]>2025-10-07 09:34:39 +0300
committervitya-smirnov <[email protected]>2025-10-07 09:52:14 +0300
commitbabe7533f18c11be1f8a195ed2324d2d9a89436a (patch)
tree45b7627141bf5a52b45a3d61fd1fbdd564bb8dd9 /yql/essentials/sql/v1/lexer/regex
parent8fe7cfe254fde2772477a8933a163b5f303716b4 (diff)
YQL-20086 sql/v1
commit_hash:55bc611cdaa0d8a0fc3c4c7708ed9f17cc4976cf
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex')
-rw-r--r--yql/essentials/sql/v1/lexer/regex/generic.cpp282
-rw-r--r--yql/essentials/sql/v1/lexer/regex/generic.h58
-rw-r--r--yql/essentials/sql/v1/lexer/regex/lexer.cpp344
-rw-r--r--yql/essentials/sql/v1/lexer/regex/lexer.h8
-rw-r--r--yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp214
-rw-r--r--yql/essentials/sql/v1/lexer/regex/regex.cpp398
-rw-r--r--yql/essentials/sql/v1/lexer/regex/regex.h8
-rw-r--r--yql/essentials/sql/v1/lexer/regex/regex_ut.cpp56
8 files changed, 684 insertions, 684 deletions
diff --git a/yql/essentials/sql/v1/lexer/regex/generic.cpp b/yql/essentials/sql/v1/lexer/regex/generic.cpp
index 50b2d78cf77..c27eec99b28 100644
--- a/yql/essentials/sql/v1/lexer/regex/generic.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/generic.cpp
@@ -6,176 +6,176 @@
namespace NSQLTranslationV1 {
- namespace {
+namespace {
- TMaybe<TStringBuf> Match(TStringBuf prefix, const RE2& regex) {
- re2::StringPiece input(prefix.data(), prefix.size());
- if (RE2::Consume(&input, regex)) {
- return TStringBuf(prefix.data(), input.data());
- }
- return Nothing();
- }
-
- } // namespace
-
- class TGenericLexer: public IGenericLexer {
- private:
- static constexpr TStringBuf Utf8BOM = "\xEF\xBB\xBF";
-
- public:
- explicit TGenericLexer(TGenericLexerGrammar grammar)
- : Grammar_(std::move(grammar))
- {
- }
-
- virtual bool Tokenize(
- TStringBuf text,
- const TTokenCallback& onNext,
- size_t maxErrors) const override {
- Y_ENSURE(0 < maxErrors);
- size_t errors = 0;
+TMaybe<TStringBuf> Match(TStringBuf prefix, const RE2& regex) {
+ re2::StringPiece input(prefix.data(), prefix.size());
+ if (RE2::Consume(&input, regex)) {
+ return TStringBuf(prefix.data(), input.data());
+ }
+ return Nothing();
+}
- size_t pos = 0;
- if (text.StartsWith(Utf8BOM)) {
- pos += Utf8BOM.size();
- }
+} // namespace
- while (pos < text.size() && errors < maxErrors) {
- TMaybe<TGenericToken> prev;
- TGenericToken next = Match(TStringBuf(text, pos));
+class TGenericLexer: public IGenericLexer {
+private:
+ static constexpr TStringBuf Utf8BOM = "\xEF\xBB\xBF";
- size_t skipped = next.Begin;
- next.Begin = skipped + pos;
+public:
+ explicit TGenericLexer(TGenericLexerGrammar grammar)
+ : Grammar_(std::move(grammar))
+ {
+ }
- if (skipped != 0) {
- prev = Match(TStringBuf(text, pos, skipped));
- prev->Begin = pos;
- }
+ virtual bool Tokenize(
+ TStringBuf text,
+ const TTokenCallback& onNext,
+ size_t maxErrors) const override {
+ Y_ENSURE(0 < maxErrors);
+ size_t errors = 0;
- pos += skipped + next.Content.size();
+ size_t pos = 0;
+ if (text.StartsWith(Utf8BOM)) {
+ pos += Utf8BOM.size();
+ }
- if (next.Name == TGenericToken::Error) {
- errors += 1;
- }
+ while (pos < text.size() && errors < maxErrors) {
+ TMaybe<TGenericToken> prev;
+ TGenericToken next = Match(TStringBuf(text, pos));
- if (prev) {
- onNext(std::move(*prev));
- }
- onNext(std::move(next));
- }
+ size_t skipped = next.Begin;
+ next.Begin = skipped + pos;
- if (errors == maxErrors) {
- return false;
+ if (skipped != 0) {
+ prev = Match(TStringBuf(text, pos, skipped));
+ prev->Begin = pos;
}
- onNext(TGenericToken{
- .Name = "EOF",
- .Content = "<EOF>",
- .Begin = pos,
- });
-
- return errors == 0;
- }
+ pos += skipped + next.Content.size();
- private:
- TGenericToken Match(TStringBuf prefix) const {
- TMaybe<TGenericToken> max;
- Match(prefix, [&](TGenericToken&& token) {
- if (max.Empty() || max->Content.size() < token.Content.size()) {
- max = std::move(token);
- }
- });
-
- if (max) {
- return *max;
+ if (next.Name == TGenericToken::Error) {
+ errors += 1;
}
- return {
- .Name = TGenericToken::Error,
- .Content = prefix.substr(0, 1),
- };
+ if (prev) {
+ onNext(std::move(*prev));
+ }
+ onNext(std::move(next));
}
- void Match(TStringBuf prefix, auto onMatch) const {
- for (const auto& matcher : Grammar_) {
- if (auto token = matcher(prefix)) {
- onMatch(std::move(*token));
- }
- }
+ if (errors == maxErrors) {
+ return false;
}
- TGenericLexerGrammar Grammar_;
- };
+ onNext(TGenericToken{
+ .Name = "EOF",
+ .Content = "<EOF>",
+ .Begin = pos,
+ });
- TTokenMatcher Compile(TString name, const TRegexPattern& regex) {
- RE2::Options options;
- options.set_case_sensitive(!regex.IsCaseInsensitive);
-
- return [beforeRe = MakeAtomicShared<RE2>(regex.Before, options),
- bodyRe = MakeAtomicShared<RE2>(regex.Body, options),
- afterRe = MakeAtomicShared<RE2>(regex.After, options),
- name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> {
- TMaybe<TStringBuf> before, body, after;
- if ((before = Match(prefix, *beforeRe)) &&
- (body = Match(prefix.Tail(before->size()), *bodyRe)) &&
- (after = Match(prefix.Tail(before->size() + body->size()), *afterRe))) {
- return TGenericToken{
- .Name = name,
- .Content = *body,
- .Begin = before->size(),
- };
- }
- return Nothing();
- };
+ return errors == 0;
}
- TRegexPattern Merged(TVector<TRegexPattern> patterns) {
- Y_ENSURE(!patterns.empty());
-
- const TRegexPattern& sample = patterns.back();
- Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) {
- return std::tie(pattern.After, pattern.Before, pattern.IsCaseInsensitive) ==
- std::tie(sample.After, sample.Before, sample.IsCaseInsensitive);
- }));
-
- Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) {
- const auto lhs_length = lhs.Body.length();
- const auto rhs_length = rhs.Body.length();
-
- // Note: do not compare After and Before here as they are equal.
- return std::tie(lhs_length, lhs.Body) > std::tie(rhs_length, rhs.Body);
+private:
+ TGenericToken Match(TStringBuf prefix) const {
+ TMaybe<TGenericToken> max;
+ Match(prefix, [&](TGenericToken&& token) {
+ if (max.Empty() || max->Content.size() < token.Content.size()) {
+ max = std::move(token);
+ }
});
- TStringBuilder body;
- for (const auto& pattern : patterns) {
- TString regex = pattern.Body;
- if (pattern.Body.Contains('|')) {
- regex.prepend('(');
- regex.append(')');
- }
- body << regex << "|";
+ if (max) {
+ return *max;
}
- Y_ENSURE(body.back() == '|');
- body.pop_back();
-
- return TRegexPattern{
- .Body = std::move(body),
- .After = sample.After,
- .Before = sample.Before,
- .IsCaseInsensitive = sample.IsCaseInsensitive,
+
+ return {
+ .Name = TGenericToken::Error,
+ .Content = prefix.substr(0, 1),
};
}
- IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) {
- return IGenericLexer::TPtr(new TGenericLexer(std::move(grammar)));
+ void Match(TStringBuf prefix, auto onMatch) const {
+ for (const auto& matcher : Grammar_) {
+ if (auto token = matcher(prefix)) {
+ onMatch(std::move(*token));
+ }
+ }
}
- TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text) {
- TVector<TGenericToken> tokens;
- lexer->Tokenize(text, [&](TGenericToken&& token) {
- tokens.emplace_back(std::move(token));
- });
- return tokens;
+ TGenericLexerGrammar Grammar_;
+};
+
+TTokenMatcher Compile(TString name, const TRegexPattern& regex) {
+ RE2::Options options;
+ options.set_case_sensitive(!regex.IsCaseInsensitive);
+
+ return [beforeRe = MakeAtomicShared<RE2>(regex.Before, options),
+ bodyRe = MakeAtomicShared<RE2>(regex.Body, options),
+ afterRe = MakeAtomicShared<RE2>(regex.After, options),
+ name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> {
+ TMaybe<TStringBuf> before, body, after;
+ if ((before = Match(prefix, *beforeRe)) &&
+ (body = Match(prefix.Tail(before->size()), *bodyRe)) &&
+ (after = Match(prefix.Tail(before->size() + body->size()), *afterRe))) {
+ return TGenericToken{
+ .Name = name,
+ .Content = *body,
+ .Begin = before->size(),
+ };
+ }
+ return Nothing();
+ };
+}
+
+TRegexPattern Merged(TVector<TRegexPattern> patterns) {
+ Y_ENSURE(!patterns.empty());
+
+ const TRegexPattern& sample = patterns.back();
+ Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) {
+ return std::tie(pattern.After, pattern.Before, pattern.IsCaseInsensitive) ==
+ std::tie(sample.After, sample.Before, sample.IsCaseInsensitive);
+ }));
+
+ Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) {
+ const auto lhs_length = lhs.Body.length();
+ const auto rhs_length = rhs.Body.length();
+
+ // Note: do not compare After and Before here as they are equal.
+ return std::tie(lhs_length, lhs.Body) > std::tie(rhs_length, rhs.Body);
+ });
+
+ TStringBuilder body;
+ for (const auto& pattern : patterns) {
+ TString regex = pattern.Body;
+ if (pattern.Body.Contains('|')) {
+ regex.prepend('(');
+ regex.append(')');
+ }
+ body << regex << "|";
}
+ Y_ENSURE(body.back() == '|');
+ body.pop_back();
+
+ return TRegexPattern{
+ .Body = std::move(body),
+ .After = sample.After,
+ .Before = sample.Before,
+ .IsCaseInsensitive = sample.IsCaseInsensitive,
+ };
+}
+
+IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) {
+ return IGenericLexer::TPtr(new TGenericLexer(std::move(grammar)));
+}
+
+TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text) {
+ TVector<TGenericToken> tokens;
+ lexer->Tokenize(text, [&](TGenericToken&& token) {
+ tokens.emplace_back(std::move(token));
+ });
+ return tokens;
+}
} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/generic.h b/yql/essentials/sql/v1/lexer/regex/generic.h
index 60c2a53207c..52d1498106c 100644
--- a/yql/essentials/sql/v1/lexer/regex/generic.h
+++ b/yql/essentials/sql/v1/lexer/regex/generic.h
@@ -10,44 +10,44 @@
namespace NSQLTranslationV1 {
- struct TGenericToken {
- static constexpr const char* Error = "<ERROR>";
+struct TGenericToken {
+ static constexpr const char* Error = "<ERROR>";
- TString Name;
- TStringBuf Content;
- size_t Begin = 0; // In bytes
- };
+ TString Name;
+ TStringBuf Content;
+ size_t Begin = 0; // In bytes
+};
- class IGenericLexer: public TThrRefBase {
- public:
- using TPtr = TIntrusivePtr<IGenericLexer>;
- using TTokenCallback = std::function<void(TGenericToken&& token)>;
+class IGenericLexer: public TThrRefBase {
+public:
+ using TPtr = TIntrusivePtr<IGenericLexer>;
+ using TTokenCallback = std::function<void(TGenericToken&& token)>;
- static constexpr size_t MaxErrorsLimit = Max<size_t>();
+ static constexpr size_t MaxErrorsLimit = Max<size_t>();
- virtual ~IGenericLexer() = default;
- virtual bool Tokenize(
- TStringBuf text,
- const TTokenCallback& onNext,
- size_t maxErrors = IGenericLexer::MaxErrorsLimit) const = 0;
- };
+ virtual ~IGenericLexer() = default;
+ virtual bool Tokenize(
+ TStringBuf text,
+ const TTokenCallback& onNext,
+ size_t maxErrors = IGenericLexer::MaxErrorsLimit) const = 0;
+};
- using TTokenMatcher = std::function<TMaybe<TGenericToken>(TStringBuf prefix)>;
+using TTokenMatcher = std::function<TMaybe<TGenericToken>(TStringBuf prefix)>;
- using TGenericLexerGrammar = TVector<TTokenMatcher>;
+using TGenericLexerGrammar = TVector<TTokenMatcher>;
- struct TRegexPattern {
- TString Body;
- TString After = "";
- TString Before = "";
- bool IsCaseInsensitive = false;
- };
+struct TRegexPattern {
+ TString Body;
+ TString After = "";
+ TString Before = "";
+ bool IsCaseInsensitive = false;
+};
- TTokenMatcher Compile(TString name, const TRegexPattern& regex);
- TRegexPattern Merged(TVector<TRegexPattern> patterns);
+TTokenMatcher Compile(TString name, const TRegexPattern& regex);
+TRegexPattern Merged(TVector<TRegexPattern> patterns);
- IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar);
+IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar);
- TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text);
+TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text);
} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
index 5d48c092716..7b9f2ba6e33 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
@@ -17,219 +17,219 @@
namespace NSQLTranslationV1 {
- using NSQLReflect::TLexerGrammar;
- using NSQLTranslation::TParsedToken;
- using NSQLTranslation::TParsedTokenList;
+using NSQLReflect::TLexerGrammar;
+using NSQLTranslation::TParsedToken;
+using NSQLTranslation::TParsedTokenList;
- size_t MatchANSIMultilineComment(TStringBuf remaining);
+size_t MatchANSIMultilineComment(TStringBuf remaining);
- TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment) {
- return [defaultComment, name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> {
- const auto basic = defaultComment(prefix);
- if (basic.Empty()) {
- return Nothing();
- }
-
- if (!prefix.StartsWith("/*")) {
- return basic;
- }
-
- size_t ll1Length = MatchANSIMultilineComment(prefix);
- TStringBuf ll1Content = prefix.SubString(0, ll1Length);
-
- Y_ENSURE(ll1Content == 0 || basic->Content <= ll1Content);
- if (ll1Content == 0) {
- return basic;
- }
-
- return TGenericToken{
- .Name = name,
- .Content = ll1Content,
- };
- };
- }
+TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment) {
+ return [defaultComment, name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> {
+ const auto basic = defaultComment(prefix);
+ if (basic.Empty()) {
+ return Nothing();
+ }
- size_t MatchANSIMultilineComment(TStringBuf prefix) {
if (!prefix.StartsWith("/*")) {
- return 0;
+ return basic;
}
- size_t skipped = 0;
+ size_t ll1Length = MatchANSIMultilineComment(prefix);
+ TStringBuf ll1Content = prefix.SubString(0, ll1Length);
- prefix.Skip(2);
- skipped += 2;
+ Y_ENSURE(ll1Content == 0 || basic->Content <= ll1Content);
+ if (ll1Content == 0) {
+ return basic;
+ }
- for (;;) {
- if (prefix.StartsWith("*/")) {
- prefix.Skip(2);
- skipped += 2;
- return skipped;
- }
+ return TGenericToken{
+ .Name = name,
+ .Content = ll1Content,
+ };
+ };
+}
- bool isSkipped = false;
- if (prefix.StartsWith("/*")) {
- size_t limit = prefix.rfind("*/");
- if (limit == std::string::npos) {
- return 0;
- }
+size_t MatchANSIMultilineComment(TStringBuf prefix) {
+ if (!prefix.StartsWith("/*")) {
+ return 0;
+ }
- size_t len = MatchANSIMultilineComment(prefix.Head(limit));
- prefix.Skip(len);
- skipped += len;
+ size_t skipped = 0;
- isSkipped = len != 0;
- }
+ prefix.Skip(2);
+ skipped += 2;
- if (isSkipped) {
- continue;
- }
+ for (;;) {
+ if (prefix.StartsWith("*/")) {
+ prefix.Skip(2);
+ skipped += 2;
+ return skipped;
+ }
- if (prefix.size() == 0) {
+ bool isSkipped = false;
+ if (prefix.StartsWith("/*")) {
+ size_t limit = prefix.rfind("*/");
+ if (limit == std::string::npos) {
return 0;
}
- prefix.Skip(1);
- skipped += 1;
- }
- }
+ size_t len = MatchANSIMultilineComment(prefix.Head(limit));
+ prefix.Skip(len);
+ skipped += len;
- TTokenMatcher KeywordMatcher(const NSQLReflect::TLexerGrammar& grammar) {
- auto keyword = Compile("Keyword", KeywordPattern(grammar));
- return [keyword = std::move(keyword)](TStringBuf content) -> TMaybe<TGenericToken> {
- if (auto token = keyword(content)) {
- return TGenericToken{
- .Name = TLexerGrammar::KeywordNameByBlock(token->Content),
- .Content = token->Content,
- };
- }
- return Nothing();
- };
- }
-
- TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar) {
- TVector<TRegexPattern> patterns;
- patterns.reserve(grammar.KeywordNames.size());
- for (const auto& keyword : grammar.KeywordNames) {
- const TStringBuf content = TLexerGrammar::KeywordBlockByName(keyword);
- patterns.push_back({
- .Body = TString(content),
- .IsCaseInsensitive = true,
- });
+ isSkipped = len != 0;
}
- return Merged(std::move(patterns));
- }
- TTokenMatcher PuntuationMatcher(const NSQLReflect::TLexerGrammar& grammar) {
- THashMap<TString, TString> nameByBlock;
- nameByBlock.reserve(grammar.PunctuationNames.size());
- for (const auto& name : grammar.PunctuationNames) {
- const auto& block = grammar.BlockByName.at(name);
- nameByBlock[block] = name;
+ if (isSkipped) {
+ continue;
}
- auto punct = Compile("Punctuation", PuntuationPattern(grammar));
+ if (prefix.size() == 0) {
+ return 0;
+ }
- return [nameByBlock = std::move(nameByBlock),
- punct = std::move(punct)](TStringBuf content) -> TMaybe<TGenericToken> {
- if (auto token = punct(content)) {
- return TGenericToken{
- .Name = nameByBlock.at(token->Content),
- .Content = token->Content,
- };
- }
- return Nothing();
- };
+ prefix.Skip(1);
+ skipped += 1;
}
+}
- TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar) {
- TVector<TRegexPattern> patterns;
- patterns.reserve(grammar.PunctuationNames.size());
- for (const auto& name : grammar.PunctuationNames) {
- patterns.push_back({RE2::QuoteMeta(grammar.BlockByName.at(name))});
+TTokenMatcher KeywordMatcher(const NSQLReflect::TLexerGrammar& grammar) {
+ auto keyword = Compile("Keyword", KeywordPattern(grammar));
+ return [keyword = std::move(keyword)](TStringBuf content) -> TMaybe<TGenericToken> {
+ if (auto token = keyword(content)) {
+ return TGenericToken{
+ .Name = TLexerGrammar::KeywordNameByBlock(token->Content),
+ .Content = token->Content,
+ };
}
- return Merged(std::move(patterns));
+ return Nothing();
+ };
+}
+
+TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar) {
+ TVector<TRegexPattern> patterns;
+ patterns.reserve(grammar.KeywordNames.size());
+ for (const auto& keyword : grammar.KeywordNames) {
+ const TStringBuf content = TLexerGrammar::KeywordBlockByName(keyword);
+ patterns.push_back({
+ .Body = TString(content),
+ .IsCaseInsensitive = true,
+ });
+ }
+ return Merged(std::move(patterns));
+}
+
+TTokenMatcher PuntuationMatcher(const NSQLReflect::TLexerGrammar& grammar) {
+ THashMap<TString, TString> nameByBlock;
+ nameByBlock.reserve(grammar.PunctuationNames.size());
+ for (const auto& name : grammar.PunctuationNames) {
+ const auto& block = grammar.BlockByName.at(name);
+ nameByBlock[block] = name;
}
- TGenericLexerGrammar MakeGenericLexerGrammar(
- bool ansi,
- const TLexerGrammar& grammar,
- const TVector<std::tuple<TString, TString>>& regexByOtherName) {
- TGenericLexerGrammar generic;
-
- generic.emplace_back(KeywordMatcher(grammar));
- generic.emplace_back(PuntuationMatcher(grammar));
+ auto punct = Compile("Punctuation", PuntuationPattern(grammar));
- for (const auto& [name, regex] : regexByOtherName) {
- generic.emplace_back(Compile(name, {regex}));
- if (name == "COMMENT" && ansi) {
- generic.back() = ANSICommentMatcher(name, std::move(generic.back()));
- }
+ return [nameByBlock = std::move(nameByBlock),
+ punct = std::move(punct)](TStringBuf content) -> TMaybe<TGenericToken> {
+ if (auto token = punct(content)) {
+ return TGenericToken{
+ .Name = nameByBlock.at(token->Content),
+ .Content = token->Content,
+ };
}
+ return Nothing();
+ };
+}
- return generic;
+TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar) {
+ TVector<TRegexPattern> patterns;
+ patterns.reserve(grammar.PunctuationNames.size());
+ for (const auto& name : grammar.PunctuationNames) {
+ patterns.push_back({RE2::QuoteMeta(grammar.BlockByName.at(name))});
}
-
- class TRegexLexer: public NSQLTranslation::ILexer {
- public:
- TRegexLexer(IGenericLexer::TPtr lexer)
- : Lexer_(std::move(lexer))
- {
+ return Merged(std::move(patterns));
+}
+
+TGenericLexerGrammar MakeGenericLexerGrammar(
+ bool ansi,
+ const TLexerGrammar& grammar,
+ const TVector<std::tuple<TString, TString>>& regexByOtherName) {
+ TGenericLexerGrammar generic;
+
+ generic.emplace_back(KeywordMatcher(grammar));
+ generic.emplace_back(PuntuationMatcher(grammar));
+
+ for (const auto& [name, regex] : regexByOtherName) {
+ generic.emplace_back(Compile(name, {regex}));
+ if (name == "COMMENT" && ansi) {
+ generic.back() = ANSICommentMatcher(name, std::move(generic.back()));
}
+ }
- bool Tokenize(
- const TString& query,
- const TString& queryName,
- const TTokenCallback& onNextToken,
- NYql::TIssues& issues,
- size_t maxErrors) override {
- bool isFailed = false;
-
- const auto onNext = [&](TGenericToken&& token) {
- if (token.Name == TGenericToken::Error) {
- NYql::TPosition pos(token.Begin, 0, queryName);
- TString message = TString("no candidates, skipping ") + token.Content;
- issues.AddIssue(std::move(pos), std::move(message));
- isFailed = true;
- return;
- }
-
- onNextToken({
- .Name = TString(token.Name),
- .Content = TString(token.Content),
- });
- };
+ return generic;
+}
- Lexer_->Tokenize(query, onNext, maxErrors);
- return !isFailed;
- }
+class TRegexLexer: public NSQLTranslation::ILexer {
+public:
+ TRegexLexer(IGenericLexer::TPtr lexer)
+ : Lexer_(std::move(lexer))
+ {
+ }
- private:
- IGenericLexer::TPtr Lexer_;
- };
+ bool Tokenize(
+ const TString& query,
+ const TString& queryName,
+ const TTokenCallback& onNextToken,
+ NYql::TIssues& issues,
+ size_t maxErrors) override {
+ bool isFailed = false;
+
+ const auto onNext = [&](TGenericToken&& token) {
+ if (token.Name == TGenericToken::Error) {
+ NYql::TPosition pos(token.Begin, 0, queryName);
+ TString message = TString("no candidates, skipping ") + token.Content;
+ issues.AddIssue(std::move(pos), std::move(message));
+ isFailed = true;
+ return;
+ }
- namespace {
+ onNextToken({
+ .Name = TString(token.Name),
+ .Content = TString(token.Content),
+ });
+ };
- class TFactory final: public NSQLTranslation::ILexerFactory {
- public:
- explicit TFactory(bool ansi) {
- auto grammar = NSQLReflect::LoadLexerGrammar();
- auto regexes = MakeRegexByOtherName(grammar, ansi);
- Lexer_ = MakeGenericLexer(MakeGenericLexerGrammar(ansi, grammar, regexes));
- }
+ Lexer_->Tokenize(query, onNext, maxErrors);
+ return !isFailed;
+ }
- NSQLTranslation::ILexer::TPtr MakeLexer() const override {
- return NSQLTranslation::ILexer::TPtr(
- new TRegexLexer(Lexer_));
- }
+private:
+ IGenericLexer::TPtr Lexer_;
+};
- private:
- IGenericLexer::TPtr Lexer_;
- };
+namespace {
- } // namespace
+class TFactory final: public NSQLTranslation::ILexerFactory {
+public:
+ explicit TFactory(bool ansi) {
+ auto grammar = NSQLReflect::LoadLexerGrammar();
+ auto regexes = MakeRegexByOtherName(grammar, ansi);
+ Lexer_ = MakeGenericLexer(MakeGenericLexerGrammar(ansi, grammar, regexes));
+ }
- NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi) {
- return NSQLTranslation::TLexerFactoryPtr(new TFactory(ansi));
+ NSQLTranslation::ILexer::TPtr MakeLexer() const override {
+ return NSQLTranslation::ILexer::TPtr(
+ new TRegexLexer(Lexer_));
}
+private:
+ IGenericLexer::TPtr Lexer_;
+};
+
+} // namespace
+
+NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi) {
+ return NSQLTranslation::TLexerFactoryPtr(new TFactory(ansi));
+}
+
} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.h b/yql/essentials/sql/v1/lexer/regex/lexer.h
index 32c145c6484..462d749f1ae 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.h
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.h
@@ -7,12 +7,12 @@
namespace NSQLTranslationV1 {
- TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment);
+TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment);
- TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar);
+TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar);
- TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar);
+TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar);
- NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi);
+NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi);
} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp
index 6ac25008b34..cbff9085514 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp
@@ -92,128 +92,128 @@ void Check(TString input, TString expected) {
}
Y_UNIT_TEST_SUITE(RegexLexerTests) {
- Y_UNIT_TEST(Whitespace) {
- Check("", "EOF");
- Check(" ", "WS( ) EOF");
- Check(" ", "WS( ) WS( ) EOF");
- Check("\n", "WS(\n) EOF");
- }
+Y_UNIT_TEST(Whitespace) {
+ Check("", "EOF");
+ Check(" ", "WS( ) EOF");
+ Check(" ", "WS( ) WS( ) EOF");
+ Check("\n", "WS(\n) EOF");
+}
- Y_UNIT_TEST(SinleLineComment) {
- Check("--yql", "COMMENT(--yql) EOF");
- Check("-- yql ", "COMMENT(-- yql ) EOF");
- Check("-- yql\nSELECT", "COMMENT(-- yql\n) SELECT EOF");
- Check("-- yql --", "COMMENT(-- yql --) EOF");
- }
+Y_UNIT_TEST(SinleLineComment) {
+ Check("--yql", "COMMENT(--yql) EOF");
+ Check("-- yql ", "COMMENT(-- yql ) EOF");
+ Check("-- yql\nSELECT", "COMMENT(-- yql\n) SELECT EOF");
+ Check("-- yql --", "COMMENT(-- yql --) EOF");
+}
- Y_UNIT_TEST(MultiLineComment) {
- Check("/* yql */", "COMMENT(/* yql */) EOF");
- Check("/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF");
- Check("/* yql\n * yql\n */", "COMMENT(/* yql\n * yql\n */) EOF");
- }
+Y_UNIT_TEST(MultiLineComment) {
+ Check("/* yql */", "COMMENT(/* yql */) EOF");
+ Check("/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF");
+ Check("/* yql\n * yql\n */", "COMMENT(/* yql\n * yql\n */) EOF");
+}
- Y_UNIT_TEST(RecursiveMultiLineCommentDefault) {
- Check("/* /* yql */", "COMMENT(/* /* yql */) EOF", /* ansi = */ false);
- Check("/* /* yql */ */", "COMMENT(/* /* yql */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ false);
- }
+Y_UNIT_TEST(RecursiveMultiLineCommentDefault) {
+ Check("/* /* yql */", "COMMENT(/* /* yql */) EOF", /* ansi = */ false);
+ Check("/* /* yql */ */", "COMMENT(/* /* yql */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ false);
+}
- Y_UNIT_TEST(RecursiveMultiLineCommentAnsi) {
- Check("/* /* yql */", "COMMENT(/* /* yql */) EOF", /* ansi = */ true);
- Check("/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
- Check("/* /* /* yql */ */", "COMMENT(/* /* /* yql */ */) EOF", /* ansi = */ true);
- Check("/* /* yql */ */ */", "COMMENT(/* /* yql */ */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
- Check("/* /* yql */ */", "COMMENT(/* /* yql */ */) EOF", /* ansi = */ true);
- Check("/*/*/*/", "COMMENT(/*/*/) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
- Check("/*/**/*/*/*/", "COMMENT(/*/**/*/) ASTERISK(*) SLASH(/) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
- Check("/* /* */ a /* /* */", "COMMENT(/* /* */ a /* /* */) EOF", /* ansi = */ true);
- }
+Y_UNIT_TEST(RecursiveMultiLineCommentAnsi) {
+ Check("/* /* yql */", "COMMENT(/* /* yql */) EOF", /* ansi = */ true);
+ Check("/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
+ Check("/* /* /* yql */ */", "COMMENT(/* /* /* yql */ */) EOF", /* ansi = */ true);
+ Check("/* /* yql */ */ */", "COMMENT(/* /* yql */ */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
+ Check("/* /* yql */ */", "COMMENT(/* /* yql */ */) EOF", /* ansi = */ true);
+ Check("/*/*/*/", "COMMENT(/*/*/) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
+ Check("/*/**/*/*/*/", "COMMENT(/*/**/*/) ASTERISK(*) SLASH(/) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true);
+ Check("/* /* */ a /* /* */", "COMMENT(/* /* */ a /* /* */) EOF", /* ansi = */ true);
+}
- Y_UNIT_TEST(RecursiveMultiLineCommentAnsiReferenceComparion) {
- SetRandomSeed(100);
- for (size_t i = 0; i < 512; ++i) {
- auto input = RandomMultilineCommentLikeText(/* maxSize = */ 128);
- TString actual = Tokenized(*AnsiLexer, input);
- TString expected = Tokenized(*PureAnsiLexer, input);
- UNIT_ASSERT_VALUES_EQUAL_C(actual, expected, "Input: " << input);
- }
+Y_UNIT_TEST(RecursiveMultiLineCommentAnsiReferenceComparion) {
+ SetRandomSeed(100);
+ for (size_t i = 0; i < 512; ++i) {
+ auto input = RandomMultilineCommentLikeText(/* maxSize = */ 128);
+ TString actual = Tokenized(*AnsiLexer, input);
+ TString expected = Tokenized(*PureAnsiLexer, input);
+ UNIT_ASSERT_VALUES_EQUAL_C(actual, expected, "Input: " << input);
}
+}
- Y_UNIT_TEST(Keyword) {
- Check("SELECT", "SELECT EOF");
- Check("INSERT", "INSERT EOF");
- Check("FROM", "FROM EOF");
- }
+Y_UNIT_TEST(Keyword) {
+ Check("SELECT", "SELECT EOF");
+ Check("INSERT", "INSERT EOF");
+ Check("FROM", "FROM EOF");
+}
- Y_UNIT_TEST(Punctuation) {
- Check(
- "* / + - <|",
- "ASTERISK(*) WS( ) SLASH(/) WS( ) "
- "PLUS(+) WS( ) MINUS(-) WS( ) STRUCT_OPEN(<|) EOF");
- Check("SELECT*FROM", "SELECT ASTERISK(*) FROM EOF");
- }
+Y_UNIT_TEST(Punctuation) {
+ Check(
+ "* / + - <|",
+ "ASTERISK(*) WS( ) SLASH(/) WS( ) "
+ "PLUS(+) WS( ) MINUS(-) WS( ) STRUCT_OPEN(<|) EOF");
+ Check("SELECT*FROM", "SELECT ASTERISK(*) FROM EOF");
+}
- Y_UNIT_TEST(IdPlain) {
- Check("variable my_table", "ID_PLAIN(variable) WS( ) ID_PLAIN(my_table) EOF");
- }
+Y_UNIT_TEST(IdPlain) {
+ Check("variable my_table", "ID_PLAIN(variable) WS( ) ID_PLAIN(my_table) EOF");
+}
- Y_UNIT_TEST(IdQuoted) {
- Check("``", "ID_QUOTED(``) EOF");
- Check("` `", "ID_QUOTED(` `) EOF");
- Check("` `", "ID_QUOTED(` `) EOF");
- Check("`local/table`", "ID_QUOTED(`local/table`) EOF");
- }
+Y_UNIT_TEST(IdQuoted) {
+ Check("``", "ID_QUOTED(``) EOF");
+ Check("` `", "ID_QUOTED(` `) EOF");
+ Check("` `", "ID_QUOTED(` `) EOF");
+ Check("`local/table`", "ID_QUOTED(`local/table`) EOF");
+}
- Y_UNIT_TEST(SinleLineString) {
- Check("\"\"", "STRING_VALUE(\"\") EOF");
- Check("\' \'", "STRING_VALUE(\' \') EOF");
- Check("\" \"", "STRING_VALUE(\" \") EOF");
- Check("\"test\"", "STRING_VALUE(\"test\") EOF");
+Y_UNIT_TEST(SinleLineString) {
+ Check("\"\"", "STRING_VALUE(\"\") EOF");
+ Check("\' \'", "STRING_VALUE(\' \') EOF");
+ Check("\" \"", "STRING_VALUE(\" \") EOF");
+ Check("\"test\"", "STRING_VALUE(\"test\") EOF");
- Check("\"\\\"\"", "STRING_VALUE(\"\\\"\") EOF", /* ansi = */ false);
- Check("\"\\\"\"", "[INVALID] STRING_VALUE(\"\\\") EOF", /* ansi = */ true);
+ Check("\"\\\"\"", "STRING_VALUE(\"\\\"\") EOF", /* ansi = */ false);
+ Check("\"\\\"\"", "[INVALID] STRING_VALUE(\"\\\") EOF", /* ansi = */ true);
- Check("\"\"\"\"", "STRING_VALUE(\"\") STRING_VALUE(\"\") EOF", /* ansi = */ false);
- Check("\"\"\"\"", "STRING_VALUE(\"\"\"\") EOF", /* ansi = */ true);
- }
+ Check("\"\"\"\"", "STRING_VALUE(\"\") STRING_VALUE(\"\") EOF", /* ansi = */ false);
+ Check("\"\"\"\"", "STRING_VALUE(\"\"\"\") EOF", /* ansi = */ true);
+}
- Y_UNIT_TEST(MultiLineString) {
- Check("@@@@", "STRING_VALUE(@@@@) EOF");
- Check("@@ @@@", "STRING_VALUE(@@ @@@) EOF");
- Check("@@test@@", "STRING_VALUE(@@test@@) EOF");
- Check("@@line1\nline2@@", "STRING_VALUE(@@line1\nline2@@) EOF");
- }
+Y_UNIT_TEST(MultiLineString) {
+ Check("@@@@", "STRING_VALUE(@@@@) EOF");
+ Check("@@ @@@", "STRING_VALUE(@@ @@@) EOF");
+ Check("@@test@@", "STRING_VALUE(@@test@@) EOF");
+ Check("@@line1\nline2@@", "STRING_VALUE(@@line1\nline2@@) EOF");
+}
- Y_UNIT_TEST(Query) {
- TString query =
- "SELECT\n"
- " 123467,\n"
- " \"Hello, {name}!\",\n"
- " (1 + (5 * 1 / 0)),\n"
- " MIN(identifier),\n"
- " Bool(field),\n"
- " Math::Sin(var)\n"
- "FROM `local/test/space/table`\n"
- "JOIN test;";
-
- TString expected =
- "SELECT WS(\n) "
- "WS( ) WS( ) DIGITS(123467) COMMA(,) WS(\n) "
- "WS( ) WS( ) STRING_VALUE(\"Hello, {name}!\") COMMA(,) WS(\n) "
- "WS( ) WS( ) LPAREN(() DIGITS(1) WS( ) PLUS(+) WS( ) LPAREN(() DIGITS(5) WS( ) "
- "ASTERISK(*) WS( ) DIGITS(1) WS( ) SLASH(/) WS( ) DIGITS(0) RPAREN()) "
- "RPAREN()) COMMA(,) WS(\n) "
- "WS( ) WS( ) ID_PLAIN(MIN) LPAREN(() ID_PLAIN(identifier) RPAREN()) COMMA(,) WS(\n) "
- "WS( ) WS( ) ID_PLAIN(Bool) LPAREN(() ID_PLAIN(field) RPAREN()) COMMA(,) WS(\n) "
- "WS( ) WS( ) ID_PLAIN(Math) NAMESPACE(::) ID_PLAIN(Sin) LPAREN(() ID_PLAIN(var) RPAREN()) WS(\n) "
- "FROM WS( ) ID_QUOTED(`local/test/space/table`) WS(\n) "
- "JOIN WS( ) ID_PLAIN(test) SEMICOLON(;) EOF";
-
- Check(query, expected);
- }
+Y_UNIT_TEST(Query) {
+ TString query =
+ "SELECT\n"
+ " 123467,\n"
+ " \"Hello, {name}!\",\n"
+ " (1 + (5 * 1 / 0)),\n"
+ " MIN(identifier),\n"
+ " Bool(field),\n"
+ " Math::Sin(var)\n"
+ "FROM `local/test/space/table`\n"
+ "JOIN test;";
+
+ TString expected =
+ "SELECT WS(\n) "
+ "WS( ) WS( ) DIGITS(123467) COMMA(,) WS(\n) "
+ "WS( ) WS( ) STRING_VALUE(\"Hello, {name}!\") COMMA(,) WS(\n) "
+ "WS( ) WS( ) LPAREN(() DIGITS(1) WS( ) PLUS(+) WS( ) LPAREN(() DIGITS(5) WS( ) "
+ "ASTERISK(*) WS( ) DIGITS(1) WS( ) SLASH(/) WS( ) DIGITS(0) RPAREN()) "
+ "RPAREN()) COMMA(,) WS(\n) "
+ "WS( ) WS( ) ID_PLAIN(MIN) LPAREN(() ID_PLAIN(identifier) RPAREN()) COMMA(,) WS(\n) "
+ "WS( ) WS( ) ID_PLAIN(Bool) LPAREN(() ID_PLAIN(field) RPAREN()) COMMA(,) WS(\n) "
+ "WS( ) WS( ) ID_PLAIN(Math) NAMESPACE(::) ID_PLAIN(Sin) LPAREN(() ID_PLAIN(var) RPAREN()) WS(\n) "
+ "FROM WS( ) ID_QUOTED(`local/test/space/table`) WS(\n) "
+ "JOIN WS( ) ID_PLAIN(test) SEMICOLON(;) EOF";
+
+ Check(query, expected);
+}
- Y_UNIT_TEST(Invalid) {
- Check("\"", "[INVALID] EOF");
- Check("\" SELECT", "[INVALID] WS( ) SELECT EOF");
- }
+Y_UNIT_TEST(Invalid) {
+ Check("\"", "[INVALID] EOF");
+ Check("\" SELECT", "[INVALID] WS( ) SELECT EOF");
+}
} // Y_UNIT_TEST_SUITE(RegexLexerTests)
diff --git a/yql/essentials/sql/v1/lexer/regex/regex.cpp b/yql/essentials/sql/v1/lexer/regex/regex.cpp
index 4d50d26e046..05e62b65423 100644
--- a/yql/essentials/sql/v1/lexer/regex/regex.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/regex.cpp
@@ -18,254 +18,254 @@
namespace NSQLTranslationV1 {
- class TLexerGrammarToRegexTranslator {
- private:
- struct TRewriteRule {
- TString Repr;
- std::function<void(TString&)> Apply;
- };
+class TLexerGrammarToRegexTranslator {
+private:
+ struct TRewriteRule {
+ TString Repr;
+ std::function<void(TString&)> Apply;
+ };
- using TRewriteRules = TVector<TRewriteRule>;
+ using TRewriteRules = TVector<TRewriteRule>;
- public:
- explicit TLexerGrammarToRegexTranslator(const NSQLReflect::TLexerGrammar& grammar, bool ansi)
- : Grammar_(&grammar)
- , Mode_(ansi ? "ANSI" : "DEFAULT")
- {
- AddExternalRules(Inliners_);
- AddFragmentRules(Inliners_);
+public:
+ explicit TLexerGrammarToRegexTranslator(const NSQLReflect::TLexerGrammar& grammar, bool ansi)
+ : Grammar_(&grammar)
+ , Mode_(ansi ? "ANSI" : "DEFAULT")
+ {
+ AddExternalRules(Inliners_);
+ AddFragmentRules(Inliners_);
- AddLetterRules(Transformations_);
- AddTransformationRules(Transformations_);
+ AddLetterRules(Transformations_);
+ AddTransformationRules(Transformations_);
- UnwrapQuotes_ = UnwrapQuotesRule();
- AddSpaceCollapses(SpaceCollapses_);
- UnwrapQuotedSpace_ = UnwrapQuotedSpaceRule();
- AddRegexOptimizations(RegexOptimizations_);
- }
+ UnwrapQuotes_ = UnwrapQuotesRule();
+ AddSpaceCollapses(SpaceCollapses_);
+ UnwrapQuotedSpace_ = UnwrapQuotedSpaceRule();
+ AddRegexOptimizations(RegexOptimizations_);
+ }
- TString ToRegex(const TStringBuf name) {
- TString text = Grammar_->BlockByName.at(name);
- Preprocess(text);
- Inline(text);
- Transform(text);
- Finalize(text);
- return text;
- }
+ TString ToRegex(const TStringBuf name) {
+ TString text = Grammar_->BlockByName.at(name);
+ Preprocess(text);
+ Inline(text);
+ Transform(text);
+ Finalize(text);
+ return text;
+ }
- private:
- void Preprocess(TString& text) {
- text = ChangedDigitsPrecendence(std::move(text));
- }
+private:
+ void Preprocess(TString& text) {
+ text = ChangedDigitsPrecendence(std::move(text));
+ }
- void Inline(TString& text) {
- ApplyEachWhileChanging(text, Inliners_);
- }
+ void Inline(TString& text) {
+ ApplyEachWhileChanging(text, Inliners_);
+ }
- void AddExternalRules(TRewriteRules& rules) {
- THashMap<TString, THashMap<TString, TString>> Substitutions = {
- SUBSTITUTIONS(DEFAULT),
- SUBSTITUTIONS(ANSI),
- };
+ void AddExternalRules(TRewriteRules& rules) {
+ THashMap<TString, THashMap<TString, TString>> Substitutions = {
+ SUBSTITUTIONS(DEFAULT),
+ SUBSTITUTIONS(ANSI),
+ };
- // ANSI mode MULTILINE_COMMENT is recursive
- Substitutions["ANSI"]["GRAMMAR_MULTILINE_COMMENT_CORE"] =
- Substitutions["DEFAULT"]["GRAMMAR_MULTILINE_COMMENT_CORE"];
+ // ANSI mode MULTILINE_COMMENT is recursive
+ Substitutions["ANSI"]["GRAMMAR_MULTILINE_COMMENT_CORE"] =
+ Substitutions["DEFAULT"]["GRAMMAR_MULTILINE_COMMENT_CORE"];
- for (const auto& [k, v] : Substitutions.at(Mode_)) {
- rules.emplace_back(RegexRewriteRule("@" + k + "@", v));
- }
+ for (const auto& [k, v] : Substitutions.at(Mode_)) {
+ rules.emplace_back(RegexRewriteRule("@" + k + "@", v));
}
+ }
- void AddFragmentRules(TRewriteRules& rules) {
- const THashSet<TString> PunctuationFragments = {
- "BACKSLASH",
- "QUOTE_DOUBLE",
- "QUOTE_SINGLE",
- "BACKTICK",
- "DOUBLE_COMMAT",
- };
-
- for (const auto& [name, definition] : Grammar_->BlockByName) {
- TString def = definition;
- if (
- Grammar_->PunctuationNames.contains(name) ||
- PunctuationFragments.contains(name)) {
- def = "'" + def + "'";
- } else if (name == "DIGITS") {
- def = ChangedDigitsPrecendence(std::move(def));
- }
- def = QuoteAntlrRewrite(std::move(def));
+ void AddFragmentRules(TRewriteRules& rules) {
+ const THashSet<TString> PunctuationFragments = {
+ "BACKSLASH",
+ "QUOTE_DOUBLE",
+ "QUOTE_SINGLE",
+ "BACKTICK",
+ "DOUBLE_COMMAT",
+ };
- rules.emplace_back(RegexRewriteRule(
- "(\\b" + name + "\\b)",
- "(" + def + ")"));
+ for (const auto& [name, definition] : Grammar_->BlockByName) {
+ TString def = definition;
+ if (
+ Grammar_->PunctuationNames.contains(name) ||
+ PunctuationFragments.contains(name)) {
+ def = "'" + def + "'";
+ } else if (name == "DIGITS") {
+ def = ChangedDigitsPrecendence(std::move(def));
}
- }
+ def = QuoteAntlrRewrite(std::move(def));
- // Regex engine matches the first matched alternative,
- // even if it is not the longest one, while ANTLR is more gready.
- TString ChangedDigitsPrecendence(TString body) {
- if (SubstGlobal(body, "DECDIGITS | ", "") != 0) {
- SubstGlobal(body, "BINDIGITS", "BINDIGITS | DECDIGITS");
- }
- return body;
+ rules.emplace_back(RegexRewriteRule(
+ "(\\b" + name + "\\b)",
+ "(" + def + ")"));
}
+ }
- void Transform(TString& text) {
- ApplyEachWhileChanging(text, Transformations_);
+ // Regex engine matches the first matched alternative,
+ // even if it is not the longest one, while ANTLR is more gready.
+ TString ChangedDigitsPrecendence(TString body) {
+ if (SubstGlobal(body, "DECDIGITS | ", "") != 0) {
+ SubstGlobal(body, "BINDIGITS", "BINDIGITS | DECDIGITS");
}
+ return body;
+ }
- void AddLetterRules(TRewriteRules& rules) {
- for (char letter = 'A'; letter <= 'Z'; ++letter) {
- TString lower(char(ToLower(letter)));
- TString upper(char(ToUpper(letter)));
- rules.emplace_back(RegexRewriteRule(
- "([^'\\w\\[\\]]|^)" + upper + "([^'\\w\\[\\]]|$)",
- "\\1[" + lower + upper + "]\\2"));
- }
- }
+ void Transform(TString& text) {
+ ApplyEachWhileChanging(text, Transformations_);
+ }
- void AddTransformationRules(TRewriteRules& rules) {
+ void AddLetterRules(TRewriteRules& rules) {
+ for (char letter = 'A'; letter <= 'Z'; ++letter) {
+ TString lower(char(ToLower(letter)));
+ TString upper(char(ToUpper(letter)));
rules.emplace_back(RegexRewriteRule(
- R"(~\('(..?)' \| '(..?)'\))", R"([^\1\2])"));
+ "([^'\\w\\[\\]]|^)" + upper + "([^'\\w\\[\\]]|$)",
+ "\\1[" + lower + upper + "]\\2"));
+ }
+ }
- rules.emplace_back(RegexRewriteRule(
- R"(~\('(..?)'\))", R"([^\1])"));
+ void AddTransformationRules(TRewriteRules& rules) {
+ rules.emplace_back(RegexRewriteRule(
+ R"(~\('(..?)' \| '(..?)'\))", R"([^\1\2])"));
- rules.emplace_back(RegexRewriteRule(
- R"(('..?')\.\.('..?'))", R"([\1-\2])"));
+ rules.emplace_back(RegexRewriteRule(
+ R"(~\('(..?)'\))", R"([^\1])"));
- rules.emplace_back(RegexRewriteRule(
- R"(\((.)\))", R"(\1)"));
+ rules.emplace_back(RegexRewriteRule(
+ R"(('..?')\.\.('..?'))", R"([\1-\2])"));
- rules.emplace_back(RegexRewriteRule(
- R"(\((\[.{1,8}\])\))", R"(\1)"));
+ rules.emplace_back(RegexRewriteRule(
+ R"(\((.)\))", R"(\1)"));
- rules.emplace_back(RegexRewriteRule(
- R"(\(('..?')\))", R"(\1)"));
+ rules.emplace_back(RegexRewriteRule(
+ R"(\((\[.{1,8}\])\))", R"(\1)"));
- rules.emplace_back(RegexRewriteRule(
- R"( \.)", R"( (.|\\n))"));
+ rules.emplace_back(RegexRewriteRule(
+ R"(\(('..?')\))", R"(\1)"));
- rules.emplace_back(RegexRewriteRule(
- R"(\bEOF\b)", R"($)"));
+ rules.emplace_back(RegexRewriteRule(
+ R"( \.)", R"( (.|\\n))"));
- rules.emplace_back(RegexRewriteRule(
- R"('\\u000C' \|)", R"('\\f' |)"));
- }
+ rules.emplace_back(RegexRewriteRule(
+ R"(\bEOF\b)", R"($)"));
- void Finalize(TString& text) {
- UnwrapQuotes_.Apply(text);
- ApplyEachWhileChanging(text, SpaceCollapses_);
- UnwrapQuotedSpace_.Apply(text);
- ApplyEachWhileChanging(text, RegexOptimizations_);
- }
+ rules.emplace_back(RegexRewriteRule(
+ R"('\\u000C' \|)", R"('\\f' |)"));
+ }
- void AddSpaceCollapses(TRewriteRules& rules) {
- rules.emplace_back(RegexRewriteRule(R"(([^']|^) )", R"(\1)"));
- rules.emplace_back(RegexRewriteRule(R"( ([^']|$))", R"(\1)"));
- }
+ void Finalize(TString& text) {
+ UnwrapQuotes_.Apply(text);
+ ApplyEachWhileChanging(text, SpaceCollapses_);
+ UnwrapQuotedSpace_.Apply(text);
+ ApplyEachWhileChanging(text, RegexOptimizations_);
+ }
- void AddRegexOptimizations(TRewriteRules& rules) {
- // ([a-z]|_) -> ([a-z_])
- rules.emplace_back(RegexRewriteRule(
- R"re(\[([^\^\[\]]+)\]\|(.)([\)\|]))re",
- R"re([\1\2]\3)re"));
+ void AddSpaceCollapses(TRewriteRules& rules) {
+ rules.emplace_back(RegexRewriteRule(R"(([^']|^) )", R"(\1)"));
+ rules.emplace_back(RegexRewriteRule(R"( ([^']|$))", R"(\1)"));
+ }
- // ([a-z]|[A-Z]) -> ([a-zA-Z])
- rules.emplace_back(RegexRewriteRule(
- R"re(\[([^\^\[\]]+)\]\|\[([^\^\[\]]+)\]([\)\|]))re",
- R"re([\1\2]\3)re"));
- }
+ void AddRegexOptimizations(TRewriteRules& rules) {
+ // ([a-z]|_) -> ([a-z_])
+ rules.emplace_back(RegexRewriteRule(
+ R"re(\[([^\^\[\]]+)\]\|(.)([\)\|]))re",
+ R"re([\1\2]\3)re"));
- void ApplyEachOnce(TString& text, const TRewriteRules& rules) {
- for (const auto& rule : rules) {
- rule.Apply(text);
- }
+ // ([a-z]|[A-Z]) -> ([a-zA-Z])
+ rules.emplace_back(RegexRewriteRule(
+ R"re(\[([^\^\[\]]+)\]\|\[([^\^\[\]]+)\]([\)\|]))re",
+ R"re([\1\2]\3)re"));
+ }
+
+ void ApplyEachOnce(TString& text, const TRewriteRules& rules) {
+ for (const auto& rule : rules) {
+ rule.Apply(text);
}
+ }
- void ApplyEachWhileChanging(TString& text, const TRewriteRules& rules) {
- constexpr size_t Limit = 16;
+ void ApplyEachWhileChanging(TString& text, const TRewriteRules& rules) {
+ constexpr size_t Limit = 16;
- TString prev;
- for (size_t i = 0; i < Limit + 1 && prev != text; ++i) {
- prev = text;
- ApplyEachOnce(text, rules);
- Y_ENSURE(i != Limit);
- }
+ TString prev;
+ for (size_t i = 0; i < Limit + 1 && prev != text; ++i) {
+ prev = text;
+ ApplyEachOnce(text, rules);
+ Y_ENSURE(i != Limit);
}
+ }
- TRewriteRule RegexRewriteRule(const TString& regex, TString rewrite) {
- auto re2 = std::make_shared<RE2>(regex, RE2::Quiet);
- Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'");
-
- TString error;
- Y_ENSURE(
- re2->CheckRewriteString(rewrite, &error),
- error << " on rewrite '" << rewrite << "'");
-
- return {
- .Repr = regex + " -> " + rewrite,
- .Apply = [re2, rewrite = std::move(rewrite)](TString& text) {
- RE2::GlobalReplace(&text, *re2, rewrite);
- },
- };
- }
+ TRewriteRule RegexRewriteRule(const TString& regex, TString rewrite) {
+ auto re2 = std::make_shared<RE2>(regex, RE2::Quiet);
+ Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'");
+
+ TString error;
+ Y_ENSURE(
+ re2->CheckRewriteString(rewrite, &error),
+ error << " on rewrite '" << rewrite << "'");
+
+ return {
+ .Repr = regex + " -> " + rewrite,
+ .Apply = [re2, rewrite = std::move(rewrite)](TString& text) {
+ RE2::GlobalReplace(&text, *re2, rewrite);
+ },
+ };
+ }
- TRewriteRule UnwrapQuotesRule() {
- const TString regex = R"('([^ ][^ ]?)')";
- auto re2 = std::make_shared<RE2>(regex, RE2::Quiet);
- Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'");
-
- return {
- .Repr = regex + " -> Quoted(\\1)",
- .Apply = [re2](TString& text) {
- TString content;
- std::size_t i = 256;
- while (RE2::PartialMatch(text, *re2, &content) && --i != 0) {
- TString quoted = RE2::QuoteMeta(content);
- for (size_t i = 0; i < 2 && quoted.StartsWith(R"(\\)"); ++i) {
- quoted.erase(std::begin(quoted));
- }
- SubstGlobal(text, "'" + content + "'", quoted);
+ TRewriteRule UnwrapQuotesRule() {
+ const TString regex = R"('([^ ][^ ]?)')";
+ auto re2 = std::make_shared<RE2>(regex, RE2::Quiet);
+ Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'");
+
+ return {
+ .Repr = regex + " -> Quoted(\\1)",
+ .Apply = [re2](TString& text) {
+ TString content;
+ std::size_t i = 256;
+ while (RE2::PartialMatch(text, *re2, &content) && --i != 0) {
+ TString quoted = RE2::QuoteMeta(content);
+ for (size_t i = 0; i < 2 && quoted.StartsWith(R"(\\)"); ++i) {
+ quoted.erase(std::begin(quoted));
}
- Y_ENSURE(i != 0);
- },
- };
- }
+ SubstGlobal(text, "'" + content + "'", quoted);
+ }
+ Y_ENSURE(i != 0);
+ },
+ };
+ }
- TRewriteRule UnwrapQuotedSpaceRule() {
- return RegexRewriteRule(R"(' ')", R"( )");
- }
+ TRewriteRule UnwrapQuotedSpaceRule() {
+ return RegexRewriteRule(R"(' ')", R"( )");
+ }
- TString QuoteAntlrRewrite(TString rewrite) {
- SubstGlobal(rewrite, R"(\)", R"(\\)");
- SubstGlobal(rewrite, R"('\\')", R"('\\\\')");
- return rewrite;
- }
+ TString QuoteAntlrRewrite(TString rewrite) {
+ SubstGlobal(rewrite, R"(\)", R"(\\)");
+ SubstGlobal(rewrite, R"('\\')", R"('\\\\')");
+ return rewrite;
+ }
- const NSQLReflect::TLexerGrammar* Grammar_;
- const TStringBuf Mode_;
+ const NSQLReflect::TLexerGrammar* Grammar_;
+ const TStringBuf Mode_;
- TRewriteRules Inliners_;
+ TRewriteRules Inliners_;
- TRewriteRules Transformations_;
+ TRewriteRules Transformations_;
- TRewriteRule UnwrapQuotes_;
- TRewriteRules SpaceCollapses_;
- TRewriteRule UnwrapQuotedSpace_;
- TRewriteRules RegexOptimizations_;
- };
+ TRewriteRule UnwrapQuotes_;
+ TRewriteRules SpaceCollapses_;
+ TRewriteRule UnwrapQuotedSpace_;
+ TRewriteRules RegexOptimizations_;
+};
- TVector<std::tuple<TString, TString>> MakeRegexByOtherName(const NSQLReflect::TLexerGrammar& grammar, bool ansi) {
- TLexerGrammarToRegexTranslator translator(grammar, ansi);
+TVector<std::tuple<TString, TString>> MakeRegexByOtherName(const NSQLReflect::TLexerGrammar& grammar, bool ansi) {
+ TLexerGrammarToRegexTranslator translator(grammar, ansi);
- TVector<std::tuple<TString, TString>> regexes;
- for (const auto& token : grammar.OtherNames) {
- regexes.emplace_back(token, translator.ToRegex(token));
- }
- return regexes;
+ TVector<std::tuple<TString, TString>> regexes;
+ for (const auto& token : grammar.OtherNames) {
+ regexes.emplace_back(token, translator.ToRegex(token));
}
+ return regexes;
+}
} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/regex.h b/yql/essentials/sql/v1/lexer/regex/regex.h
index 1e9d92b6535..943f8c73e42 100644
--- a/yql/essentials/sql/v1/lexer/regex/regex.h
+++ b/yql/essentials/sql/v1/lexer/regex/regex.h
@@ -6,9 +6,9 @@
namespace NSQLTranslationV1 {
- // Makes regexes only for tokens from OtherNames,
- // as keywords and punctuation are trivially matched.
- TVector<std::tuple<TString, TString>> MakeRegexByOtherName(
- const NSQLReflect::TLexerGrammar& grammar, bool ansi);
+// Makes regexes only for tokens from OtherNames,
+// as keywords and punctuation are trivially matched.
+TVector<std::tuple<TString, TString>> MakeRegexByOtherName(
+ const NSQLReflect::TLexerGrammar& grammar, bool ansi);
} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
index b8d78799dda..2f05d02d776 100644
--- a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
@@ -11,80 +11,80 @@ auto grammar = NSQLReflect::LoadLexerGrammar();
auto defaultRegexes = MakeRegexByOtherName(grammar, /* ansi = */ false);
auto ansiRegexes = MakeRegexByOtherName(grammar, /* ansi = */ true);
-TString Get(const TVector<std::tuple<TString, TString>> &regexes,
+TString Get(const TVector<std::tuple<TString, TString>>& regexes,
const TStringBuf name) {
- return std::get<1>(*FindIf(
- regexes, [&](const auto &pair) { return std::get<0>(pair) == name; }));
+ return std::get<1>(*FindIf(
+ regexes, [&](const auto& pair) { return std::get<0>(pair) == name; }));
}
void CheckRegex(bool ansi, const TStringBuf name, const TStringBuf expected) {
- const auto &regexes = ansi ? ansiRegexes : defaultRegexes;
- const TString regex = Get(regexes, name);
+ const auto& regexes = ansi ? ansiRegexes : defaultRegexes;
+ const TString regex = Get(regexes, name);
- const RE2 re2(regex);
- Y_ENSURE(re2.ok(), re2.error());
+ const RE2 re2(regex);
+ Y_ENSURE(re2.ok(), re2.error());
- UNIT_ASSERT_VALUES_EQUAL(regex, expected);
+ UNIT_ASSERT_VALUES_EQUAL(regex, expected);
}
} // namespace
Y_UNIT_TEST_SUITE(SqlRegexTests) {
- Y_UNIT_TEST(StringValue) {
+Y_UNIT_TEST(StringValue) {
CheckRegex(
/* ansi = */ false, "STRING_VALUE",
R"(((((\'([^'\\]|(\\(.|\n)))*\'))|((\"([^"\\]|(\\(.|\n)))*\"))|((\@\@(.|\n)*?\@\@)+\@?))([sSuUyYjJ]|[pP]([tTbBvV])?)?))");
- }
+}
- Y_UNIT_TEST(AnsiStringValue) {
+Y_UNIT_TEST(AnsiStringValue) {
CheckRegex(
/* ansi = */ true, "STRING_VALUE",
R"(((((\'([^']|(\'\'))*\'))|((\"([^"]|(\"\"))*\"))|((\@\@(.|\n)*?\@\@)+\@?))([sSuUyYjJ]|[pP]([tTbBvV])?)?))");
- }
+}
- Y_UNIT_TEST(IdPlain) {
+Y_UNIT_TEST(IdPlain) {
CheckRegex(
/* ansi = */ false, "ID_PLAIN", R"(([a-zA-Z_])([a-zA-Z_0-9])*)");
- }
+}
- Y_UNIT_TEST(IdQuoted) {
+Y_UNIT_TEST(IdQuoted) {
CheckRegex(
/* ansi = */ false, "ID_QUOTED", R"(\`(\\(.|\n)|\`\`|[^`\\])*\`)");
- }
+}
- Y_UNIT_TEST(Digits) {
+Y_UNIT_TEST(Digits) {
CheckRegex(
/* ansi = */ false, "DIGITS",
R"((0[xX]([0-9a-fA-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+)|([0-9]+))");
- }
+}
- Y_UNIT_TEST(IntegerValue) {
+Y_UNIT_TEST(IntegerValue) {
CheckRegex(
/* ansi = */ false, "INTEGER_VALUE",
R"(((0[xX]([0-9a-fA-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+)|([0-9]+))(([pPuU])?([lLsStTiIbBnN])?))");
- }
+}
- Y_UNIT_TEST(Real) {
+Y_UNIT_TEST(Real) {
CheckRegex(
/* ansi = */ false, "REAL",
R"((([0-9]+)\.[0-9]*([eE](\+|\-)?([0-9]+))?|([0-9]+)([eE](\+|\-)?([0-9]+)))([fF]|[pP]([fF](4|8)|[nN])?)?)");
- }
+}
- Y_UNIT_TEST(Ws) {
+Y_UNIT_TEST(Ws) {
CheckRegex(
/* ansi = */ false, "WS", R"(( |\r|\t|\f|\n))");
- }
+}
- Y_UNIT_TEST(Comment) {
+Y_UNIT_TEST(Comment) {
CheckRegex(
/* ansi = */ false, "COMMENT",
R"(((\/\*(.|\n)*?\*\/)|(\-\-[^\n\r]*(\r\n?|\n|$))))");
- }
+}
- Y_UNIT_TEST(AnsiCommentSameAsDefault) {
+Y_UNIT_TEST(AnsiCommentSameAsDefault) {
// Because of recursive definition
UNIT_ASSERT_VALUES_EQUAL(Get(ansiRegexes, "COMMENT"),
Get(defaultRegexes, "COMMENT"));
- }
+}
} // Y_UNIT_TEST_SUITE(SqlRegexTests)