diff options
author | vitya-smirnov <[email protected]> | 2025-10-07 09:34:39 +0300 |
---|---|---|
committer | vitya-smirnov <[email protected]> | 2025-10-07 09:52:14 +0300 |
commit | babe7533f18c11be1f8a195ed2324d2d9a89436a (patch) | |
tree | 45b7627141bf5a52b45a3d61fd1fbdd564bb8dd9 /yql/essentials/sql/v1/lexer/regex | |
parent | 8fe7cfe254fde2772477a8933a163b5f303716b4 (diff) |
YQL-20086 sql/v1
commit_hash:55bc611cdaa0d8a0fc3c4c7708ed9f17cc4976cf
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex')
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/generic.cpp | 282 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/generic.h | 58 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/lexer.cpp | 344 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/lexer.h | 8 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp | 214 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/regex.cpp | 398 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/regex.h | 8 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/regex_ut.cpp | 56 |
8 files changed, 684 insertions, 684 deletions
diff --git a/yql/essentials/sql/v1/lexer/regex/generic.cpp b/yql/essentials/sql/v1/lexer/regex/generic.cpp index 50b2d78cf77..c27eec99b28 100644 --- a/yql/essentials/sql/v1/lexer/regex/generic.cpp +++ b/yql/essentials/sql/v1/lexer/regex/generic.cpp @@ -6,176 +6,176 @@ namespace NSQLTranslationV1 { - namespace { +namespace { - TMaybe<TStringBuf> Match(TStringBuf prefix, const RE2& regex) { - re2::StringPiece input(prefix.data(), prefix.size()); - if (RE2::Consume(&input, regex)) { - return TStringBuf(prefix.data(), input.data()); - } - return Nothing(); - } - - } // namespace - - class TGenericLexer: public IGenericLexer { - private: - static constexpr TStringBuf Utf8BOM = "\xEF\xBB\xBF"; - - public: - explicit TGenericLexer(TGenericLexerGrammar grammar) - : Grammar_(std::move(grammar)) - { - } - - virtual bool Tokenize( - TStringBuf text, - const TTokenCallback& onNext, - size_t maxErrors) const override { - Y_ENSURE(0 < maxErrors); - size_t errors = 0; +TMaybe<TStringBuf> Match(TStringBuf prefix, const RE2& regex) { + re2::StringPiece input(prefix.data(), prefix.size()); + if (RE2::Consume(&input, regex)) { + return TStringBuf(prefix.data(), input.data()); + } + return Nothing(); +} - size_t pos = 0; - if (text.StartsWith(Utf8BOM)) { - pos += Utf8BOM.size(); - } +} // namespace - while (pos < text.size() && errors < maxErrors) { - TMaybe<TGenericToken> prev; - TGenericToken next = Match(TStringBuf(text, pos)); +class TGenericLexer: public IGenericLexer { +private: + static constexpr TStringBuf Utf8BOM = "\xEF\xBB\xBF"; - size_t skipped = next.Begin; - next.Begin = skipped + pos; +public: + explicit TGenericLexer(TGenericLexerGrammar grammar) + : Grammar_(std::move(grammar)) + { + } - if (skipped != 0) { - prev = Match(TStringBuf(text, pos, skipped)); - prev->Begin = pos; - } + virtual bool Tokenize( + TStringBuf text, + const TTokenCallback& onNext, + size_t maxErrors) const override { + Y_ENSURE(0 < maxErrors); + size_t errors = 0; - pos += skipped + 
next.Content.size(); + size_t pos = 0; + if (text.StartsWith(Utf8BOM)) { + pos += Utf8BOM.size(); + } - if (next.Name == TGenericToken::Error) { - errors += 1; - } + while (pos < text.size() && errors < maxErrors) { + TMaybe<TGenericToken> prev; + TGenericToken next = Match(TStringBuf(text, pos)); - if (prev) { - onNext(std::move(*prev)); - } - onNext(std::move(next)); - } + size_t skipped = next.Begin; + next.Begin = skipped + pos; - if (errors == maxErrors) { - return false; + if (skipped != 0) { + prev = Match(TStringBuf(text, pos, skipped)); + prev->Begin = pos; } - onNext(TGenericToken{ - .Name = "EOF", - .Content = "<EOF>", - .Begin = pos, - }); - - return errors == 0; - } + pos += skipped + next.Content.size(); - private: - TGenericToken Match(TStringBuf prefix) const { - TMaybe<TGenericToken> max; - Match(prefix, [&](TGenericToken&& token) { - if (max.Empty() || max->Content.size() < token.Content.size()) { - max = std::move(token); - } - }); - - if (max) { - return *max; + if (next.Name == TGenericToken::Error) { + errors += 1; } - return { - .Name = TGenericToken::Error, - .Content = prefix.substr(0, 1), - }; + if (prev) { + onNext(std::move(*prev)); + } + onNext(std::move(next)); } - void Match(TStringBuf prefix, auto onMatch) const { - for (const auto& matcher : Grammar_) { - if (auto token = matcher(prefix)) { - onMatch(std::move(*token)); - } - } + if (errors == maxErrors) { + return false; } - TGenericLexerGrammar Grammar_; - }; + onNext(TGenericToken{ + .Name = "EOF", + .Content = "<EOF>", + .Begin = pos, + }); - TTokenMatcher Compile(TString name, const TRegexPattern& regex) { - RE2::Options options; - options.set_case_sensitive(!regex.IsCaseInsensitive); - - return [beforeRe = MakeAtomicShared<RE2>(regex.Before, options), - bodyRe = MakeAtomicShared<RE2>(regex.Body, options), - afterRe = MakeAtomicShared<RE2>(regex.After, options), - name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> { - TMaybe<TStringBuf> before, body, after; - 
if ((before = Match(prefix, *beforeRe)) && - (body = Match(prefix.Tail(before->size()), *bodyRe)) && - (after = Match(prefix.Tail(before->size() + body->size()), *afterRe))) { - return TGenericToken{ - .Name = name, - .Content = *body, - .Begin = before->size(), - }; - } - return Nothing(); - }; + return errors == 0; } - TRegexPattern Merged(TVector<TRegexPattern> patterns) { - Y_ENSURE(!patterns.empty()); - - const TRegexPattern& sample = patterns.back(); - Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) { - return std::tie(pattern.After, pattern.Before, pattern.IsCaseInsensitive) == - std::tie(sample.After, sample.Before, sample.IsCaseInsensitive); - })); - - Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) { - const auto lhs_length = lhs.Body.length(); - const auto rhs_length = rhs.Body.length(); - - // Note: do not compare After and Before here as they are equal. - return std::tie(lhs_length, lhs.Body) > std::tie(rhs_length, rhs.Body); +private: + TGenericToken Match(TStringBuf prefix) const { + TMaybe<TGenericToken> max; + Match(prefix, [&](TGenericToken&& token) { + if (max.Empty() || max->Content.size() < token.Content.size()) { + max = std::move(token); + } }); - TStringBuilder body; - for (const auto& pattern : patterns) { - TString regex = pattern.Body; - if (pattern.Body.Contains('|')) { - regex.prepend('('); - regex.append(')'); - } - body << regex << "|"; + if (max) { + return *max; } - Y_ENSURE(body.back() == '|'); - body.pop_back(); - - return TRegexPattern{ - .Body = std::move(body), - .After = sample.After, - .Before = sample.Before, - .IsCaseInsensitive = sample.IsCaseInsensitive, + + return { + .Name = TGenericToken::Error, + .Content = prefix.substr(0, 1), }; } - IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) { - return IGenericLexer::TPtr(new TGenericLexer(std::move(grammar))); + void Match(TStringBuf prefix, auto onMatch) const { + for (const auto& matcher : Grammar_) { + if (auto token = 
matcher(prefix)) { + onMatch(std::move(*token)); + } + } } - TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text) { - TVector<TGenericToken> tokens; - lexer->Tokenize(text, [&](TGenericToken&& token) { - tokens.emplace_back(std::move(token)); - }); - return tokens; + TGenericLexerGrammar Grammar_; +}; + +TTokenMatcher Compile(TString name, const TRegexPattern& regex) { + RE2::Options options; + options.set_case_sensitive(!regex.IsCaseInsensitive); + + return [beforeRe = MakeAtomicShared<RE2>(regex.Before, options), + bodyRe = MakeAtomicShared<RE2>(regex.Body, options), + afterRe = MakeAtomicShared<RE2>(regex.After, options), + name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> { + TMaybe<TStringBuf> before, body, after; + if ((before = Match(prefix, *beforeRe)) && + (body = Match(prefix.Tail(before->size()), *bodyRe)) && + (after = Match(prefix.Tail(before->size() + body->size()), *afterRe))) { + return TGenericToken{ + .Name = name, + .Content = *body, + .Begin = before->size(), + }; + } + return Nothing(); + }; +} + +TRegexPattern Merged(TVector<TRegexPattern> patterns) { + Y_ENSURE(!patterns.empty()); + + const TRegexPattern& sample = patterns.back(); + Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) { + return std::tie(pattern.After, pattern.Before, pattern.IsCaseInsensitive) == + std::tie(sample.After, sample.Before, sample.IsCaseInsensitive); + })); + + Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) { + const auto lhs_length = lhs.Body.length(); + const auto rhs_length = rhs.Body.length(); + + // Note: do not compare After and Before here as they are equal. 
+ return std::tie(lhs_length, lhs.Body) > std::tie(rhs_length, rhs.Body); + }); + + TStringBuilder body; + for (const auto& pattern : patterns) { + TString regex = pattern.Body; + if (pattern.Body.Contains('|')) { + regex.prepend('('); + regex.append(')'); + } + body << regex << "|"; } + Y_ENSURE(body.back() == '|'); + body.pop_back(); + + return TRegexPattern{ + .Body = std::move(body), + .After = sample.After, + .Before = sample.Before, + .IsCaseInsensitive = sample.IsCaseInsensitive, + }; +} + +IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) { + return IGenericLexer::TPtr(new TGenericLexer(std::move(grammar))); +} + +TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text) { + TVector<TGenericToken> tokens; + lexer->Tokenize(text, [&](TGenericToken&& token) { + tokens.emplace_back(std::move(token)); + }); + return tokens; +} } // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/lexer/regex/generic.h b/yql/essentials/sql/v1/lexer/regex/generic.h index 60c2a53207c..52d1498106c 100644 --- a/yql/essentials/sql/v1/lexer/regex/generic.h +++ b/yql/essentials/sql/v1/lexer/regex/generic.h @@ -10,44 +10,44 @@ namespace NSQLTranslationV1 { - struct TGenericToken { - static constexpr const char* Error = "<ERROR>"; +struct TGenericToken { + static constexpr const char* Error = "<ERROR>"; - TString Name; - TStringBuf Content; - size_t Begin = 0; // In bytes - }; + TString Name; + TStringBuf Content; + size_t Begin = 0; // In bytes +}; - class IGenericLexer: public TThrRefBase { - public: - using TPtr = TIntrusivePtr<IGenericLexer>; - using TTokenCallback = std::function<void(TGenericToken&& token)>; +class IGenericLexer: public TThrRefBase { +public: + using TPtr = TIntrusivePtr<IGenericLexer>; + using TTokenCallback = std::function<void(TGenericToken&& token)>; - static constexpr size_t MaxErrorsLimit = Max<size_t>(); + static constexpr size_t MaxErrorsLimit = Max<size_t>(); - virtual ~IGenericLexer() = default; - virtual 
bool Tokenize( - TStringBuf text, - const TTokenCallback& onNext, - size_t maxErrors = IGenericLexer::MaxErrorsLimit) const = 0; - }; + virtual ~IGenericLexer() = default; + virtual bool Tokenize( + TStringBuf text, + const TTokenCallback& onNext, + size_t maxErrors = IGenericLexer::MaxErrorsLimit) const = 0; +}; - using TTokenMatcher = std::function<TMaybe<TGenericToken>(TStringBuf prefix)>; +using TTokenMatcher = std::function<TMaybe<TGenericToken>(TStringBuf prefix)>; - using TGenericLexerGrammar = TVector<TTokenMatcher>; +using TGenericLexerGrammar = TVector<TTokenMatcher>; - struct TRegexPattern { - TString Body; - TString After = ""; - TString Before = ""; - bool IsCaseInsensitive = false; - }; +struct TRegexPattern { + TString Body; + TString After = ""; + TString Before = ""; + bool IsCaseInsensitive = false; +}; - TTokenMatcher Compile(TString name, const TRegexPattern& regex); - TRegexPattern Merged(TVector<TRegexPattern> patterns); +TTokenMatcher Compile(TString name, const TRegexPattern& regex); +TRegexPattern Merged(TVector<TRegexPattern> patterns); - IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar); +IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar); - TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text); +TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text); } // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp index 5d48c092716..7b9f2ba6e33 100644 --- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp +++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp @@ -17,219 +17,219 @@ namespace NSQLTranslationV1 { - using NSQLReflect::TLexerGrammar; - using NSQLTranslation::TParsedToken; - using NSQLTranslation::TParsedTokenList; +using NSQLReflect::TLexerGrammar; +using NSQLTranslation::TParsedToken; +using NSQLTranslation::TParsedTokenList; - size_t MatchANSIMultilineComment(TStringBuf remaining); 
+size_t MatchANSIMultilineComment(TStringBuf remaining); - TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment) { - return [defaultComment, name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> { - const auto basic = defaultComment(prefix); - if (basic.Empty()) { - return Nothing(); - } - - if (!prefix.StartsWith("/*")) { - return basic; - } - - size_t ll1Length = MatchANSIMultilineComment(prefix); - TStringBuf ll1Content = prefix.SubString(0, ll1Length); - - Y_ENSURE(ll1Content == 0 || basic->Content <= ll1Content); - if (ll1Content == 0) { - return basic; - } - - return TGenericToken{ - .Name = name, - .Content = ll1Content, - }; - }; - } +TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment) { + return [defaultComment, name = std::move(name)](TStringBuf prefix) -> TMaybe<TGenericToken> { + const auto basic = defaultComment(prefix); + if (basic.Empty()) { + return Nothing(); + } - size_t MatchANSIMultilineComment(TStringBuf prefix) { if (!prefix.StartsWith("/*")) { - return 0; + return basic; } - size_t skipped = 0; + size_t ll1Length = MatchANSIMultilineComment(prefix); + TStringBuf ll1Content = prefix.SubString(0, ll1Length); - prefix.Skip(2); - skipped += 2; + Y_ENSURE(ll1Content == 0 || basic->Content <= ll1Content); + if (ll1Content == 0) { + return basic; + } - for (;;) { - if (prefix.StartsWith("*/")) { - prefix.Skip(2); - skipped += 2; - return skipped; - } + return TGenericToken{ + .Name = name, + .Content = ll1Content, + }; + }; +} - bool isSkipped = false; - if (prefix.StartsWith("/*")) { - size_t limit = prefix.rfind("*/"); - if (limit == std::string::npos) { - return 0; - } +size_t MatchANSIMultilineComment(TStringBuf prefix) { + if (!prefix.StartsWith("/*")) { + return 0; + } - size_t len = MatchANSIMultilineComment(prefix.Head(limit)); - prefix.Skip(len); - skipped += len; + size_t skipped = 0; - isSkipped = len != 0; - } + prefix.Skip(2); + skipped += 2; - if (isSkipped) { - continue; - 
} + for (;;) { + if (prefix.StartsWith("*/")) { + prefix.Skip(2); + skipped += 2; + return skipped; + } - if (prefix.size() == 0) { + bool isSkipped = false; + if (prefix.StartsWith("/*")) { + size_t limit = prefix.rfind("*/"); + if (limit == std::string::npos) { return 0; } - prefix.Skip(1); - skipped += 1; - } - } + size_t len = MatchANSIMultilineComment(prefix.Head(limit)); + prefix.Skip(len); + skipped += len; - TTokenMatcher KeywordMatcher(const NSQLReflect::TLexerGrammar& grammar) { - auto keyword = Compile("Keyword", KeywordPattern(grammar)); - return [keyword = std::move(keyword)](TStringBuf content) -> TMaybe<TGenericToken> { - if (auto token = keyword(content)) { - return TGenericToken{ - .Name = TLexerGrammar::KeywordNameByBlock(token->Content), - .Content = token->Content, - }; - } - return Nothing(); - }; - } - - TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar) { - TVector<TRegexPattern> patterns; - patterns.reserve(grammar.KeywordNames.size()); - for (const auto& keyword : grammar.KeywordNames) { - const TStringBuf content = TLexerGrammar::KeywordBlockByName(keyword); - patterns.push_back({ - .Body = TString(content), - .IsCaseInsensitive = true, - }); + isSkipped = len != 0; } - return Merged(std::move(patterns)); - } - TTokenMatcher PuntuationMatcher(const NSQLReflect::TLexerGrammar& grammar) { - THashMap<TString, TString> nameByBlock; - nameByBlock.reserve(grammar.PunctuationNames.size()); - for (const auto& name : grammar.PunctuationNames) { - const auto& block = grammar.BlockByName.at(name); - nameByBlock[block] = name; + if (isSkipped) { + continue; } - auto punct = Compile("Punctuation", PuntuationPattern(grammar)); + if (prefix.size() == 0) { + return 0; + } - return [nameByBlock = std::move(nameByBlock), - punct = std::move(punct)](TStringBuf content) -> TMaybe<TGenericToken> { - if (auto token = punct(content)) { - return TGenericToken{ - .Name = nameByBlock.at(token->Content), - .Content = token->Content, - }; - } - 
return Nothing(); - }; + prefix.Skip(1); + skipped += 1; } +} - TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar) { - TVector<TRegexPattern> patterns; - patterns.reserve(grammar.PunctuationNames.size()); - for (const auto& name : grammar.PunctuationNames) { - patterns.push_back({RE2::QuoteMeta(grammar.BlockByName.at(name))}); +TTokenMatcher KeywordMatcher(const NSQLReflect::TLexerGrammar& grammar) { + auto keyword = Compile("Keyword", KeywordPattern(grammar)); + return [keyword = std::move(keyword)](TStringBuf content) -> TMaybe<TGenericToken> { + if (auto token = keyword(content)) { + return TGenericToken{ + .Name = TLexerGrammar::KeywordNameByBlock(token->Content), + .Content = token->Content, + }; } - return Merged(std::move(patterns)); + return Nothing(); + }; +} + +TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar) { + TVector<TRegexPattern> patterns; + patterns.reserve(grammar.KeywordNames.size()); + for (const auto& keyword : grammar.KeywordNames) { + const TStringBuf content = TLexerGrammar::KeywordBlockByName(keyword); + patterns.push_back({ + .Body = TString(content), + .IsCaseInsensitive = true, + }); + } + return Merged(std::move(patterns)); +} + +TTokenMatcher PuntuationMatcher(const NSQLReflect::TLexerGrammar& grammar) { + THashMap<TString, TString> nameByBlock; + nameByBlock.reserve(grammar.PunctuationNames.size()); + for (const auto& name : grammar.PunctuationNames) { + const auto& block = grammar.BlockByName.at(name); + nameByBlock[block] = name; } - TGenericLexerGrammar MakeGenericLexerGrammar( - bool ansi, - const TLexerGrammar& grammar, - const TVector<std::tuple<TString, TString>>& regexByOtherName) { - TGenericLexerGrammar generic; - - generic.emplace_back(KeywordMatcher(grammar)); - generic.emplace_back(PuntuationMatcher(grammar)); + auto punct = Compile("Punctuation", PuntuationPattern(grammar)); - for (const auto& [name, regex] : regexByOtherName) { - generic.emplace_back(Compile(name, 
{regex})); - if (name == "COMMENT" && ansi) { - generic.back() = ANSICommentMatcher(name, std::move(generic.back())); - } + return [nameByBlock = std::move(nameByBlock), + punct = std::move(punct)](TStringBuf content) -> TMaybe<TGenericToken> { + if (auto token = punct(content)) { + return TGenericToken{ + .Name = nameByBlock.at(token->Content), + .Content = token->Content, + }; } + return Nothing(); + }; +} - return generic; +TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar) { + TVector<TRegexPattern> patterns; + patterns.reserve(grammar.PunctuationNames.size()); + for (const auto& name : grammar.PunctuationNames) { + patterns.push_back({RE2::QuoteMeta(grammar.BlockByName.at(name))}); } - - class TRegexLexer: public NSQLTranslation::ILexer { - public: - TRegexLexer(IGenericLexer::TPtr lexer) - : Lexer_(std::move(lexer)) - { + return Merged(std::move(patterns)); +} + +TGenericLexerGrammar MakeGenericLexerGrammar( + bool ansi, + const TLexerGrammar& grammar, + const TVector<std::tuple<TString, TString>>& regexByOtherName) { + TGenericLexerGrammar generic; + + generic.emplace_back(KeywordMatcher(grammar)); + generic.emplace_back(PuntuationMatcher(grammar)); + + for (const auto& [name, regex] : regexByOtherName) { + generic.emplace_back(Compile(name, {regex})); + if (name == "COMMENT" && ansi) { + generic.back() = ANSICommentMatcher(name, std::move(generic.back())); } + } - bool Tokenize( - const TString& query, - const TString& queryName, - const TTokenCallback& onNextToken, - NYql::TIssues& issues, - size_t maxErrors) override { - bool isFailed = false; - - const auto onNext = [&](TGenericToken&& token) { - if (token.Name == TGenericToken::Error) { - NYql::TPosition pos(token.Begin, 0, queryName); - TString message = TString("no candidates, skipping ") + token.Content; - issues.AddIssue(std::move(pos), std::move(message)); - isFailed = true; - return; - } - - onNextToken({ - .Name = TString(token.Name), - .Content = TString(token.Content), - 
}); - }; + return generic; +} - Lexer_->Tokenize(query, onNext, maxErrors); - return !isFailed; - } +class TRegexLexer: public NSQLTranslation::ILexer { +public: + TRegexLexer(IGenericLexer::TPtr lexer) + : Lexer_(std::move(lexer)) + { + } - private: - IGenericLexer::TPtr Lexer_; - }; + bool Tokenize( + const TString& query, + const TString& queryName, + const TTokenCallback& onNextToken, + NYql::TIssues& issues, + size_t maxErrors) override { + bool isFailed = false; + + const auto onNext = [&](TGenericToken&& token) { + if (token.Name == TGenericToken::Error) { + NYql::TPosition pos(token.Begin, 0, queryName); + TString message = TString("no candidates, skipping ") + token.Content; + issues.AddIssue(std::move(pos), std::move(message)); + isFailed = true; + return; + } - namespace { + onNextToken({ + .Name = TString(token.Name), + .Content = TString(token.Content), + }); + }; - class TFactory final: public NSQLTranslation::ILexerFactory { - public: - explicit TFactory(bool ansi) { - auto grammar = NSQLReflect::LoadLexerGrammar(); - auto regexes = MakeRegexByOtherName(grammar, ansi); - Lexer_ = MakeGenericLexer(MakeGenericLexerGrammar(ansi, grammar, regexes)); - } + Lexer_->Tokenize(query, onNext, maxErrors); + return !isFailed; + } - NSQLTranslation::ILexer::TPtr MakeLexer() const override { - return NSQLTranslation::ILexer::TPtr( - new TRegexLexer(Lexer_)); - } +private: + IGenericLexer::TPtr Lexer_; +}; - private: - IGenericLexer::TPtr Lexer_; - }; +namespace { - } // namespace +class TFactory final: public NSQLTranslation::ILexerFactory { +public: + explicit TFactory(bool ansi) { + auto grammar = NSQLReflect::LoadLexerGrammar(); + auto regexes = MakeRegexByOtherName(grammar, ansi); + Lexer_ = MakeGenericLexer(MakeGenericLexerGrammar(ansi, grammar, regexes)); + } - NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi) { - return NSQLTranslation::TLexerFactoryPtr(new TFactory(ansi)); + NSQLTranslation::ILexer::TPtr MakeLexer() const override { + 
return NSQLTranslation::ILexer::TPtr( + new TRegexLexer(Lexer_)); } +private: + IGenericLexer::TPtr Lexer_; +}; + +} // namespace + +NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi) { + return NSQLTranslation::TLexerFactoryPtr(new TFactory(ansi)); +} + } // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.h b/yql/essentials/sql/v1/lexer/regex/lexer.h index 32c145c6484..462d749f1ae 100644 --- a/yql/essentials/sql/v1/lexer/regex/lexer.h +++ b/yql/essentials/sql/v1/lexer/regex/lexer.h @@ -7,12 +7,12 @@ namespace NSQLTranslationV1 { - TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment); +TTokenMatcher ANSICommentMatcher(TString name, TTokenMatcher defaultComment); - TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar); +TRegexPattern KeywordPattern(const NSQLReflect::TLexerGrammar& grammar); - TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar); +TRegexPattern PuntuationPattern(const NSQLReflect::TLexerGrammar& grammar); - NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi); +NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi); } // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp index 6ac25008b34..cbff9085514 100644 --- a/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp +++ b/yql/essentials/sql/v1/lexer/regex/lexer_ut.cpp @@ -92,128 +92,128 @@ void Check(TString input, TString expected) { } Y_UNIT_TEST_SUITE(RegexLexerTests) { - Y_UNIT_TEST(Whitespace) { - Check("", "EOF"); - Check(" ", "WS( ) EOF"); - Check(" ", "WS( ) WS( ) EOF"); - Check("\n", "WS(\n) EOF"); - } +Y_UNIT_TEST(Whitespace) { + Check("", "EOF"); + Check(" ", "WS( ) EOF"); + Check(" ", "WS( ) WS( ) EOF"); + Check("\n", "WS(\n) EOF"); +} - Y_UNIT_TEST(SinleLineComment) { - Check("--yql", "COMMENT(--yql) EOF"); - Check("-- yql ", "COMMENT(-- yql ) EOF"); - Check("-- 
yql\nSELECT", "COMMENT(-- yql\n) SELECT EOF"); - Check("-- yql --", "COMMENT(-- yql --) EOF"); - } +Y_UNIT_TEST(SinleLineComment) { + Check("--yql", "COMMENT(--yql) EOF"); + Check("-- yql ", "COMMENT(-- yql ) EOF"); + Check("-- yql\nSELECT", "COMMENT(-- yql\n) SELECT EOF"); + Check("-- yql --", "COMMENT(-- yql --) EOF"); +} - Y_UNIT_TEST(MultiLineComment) { - Check("/* yql */", "COMMENT(/* yql */) EOF"); - Check("/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF"); - Check("/* yql\n * yql\n */", "COMMENT(/* yql\n * yql\n */) EOF"); - } +Y_UNIT_TEST(MultiLineComment) { + Check("/* yql */", "COMMENT(/* yql */) EOF"); + Check("/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF"); + Check("/* yql\n * yql\n */", "COMMENT(/* yql\n * yql\n */) EOF"); +} - Y_UNIT_TEST(RecursiveMultiLineCommentDefault) { - Check("/* /* yql */", "COMMENT(/* /* yql */) EOF", /* ansi = */ false); - Check("/* /* yql */ */", "COMMENT(/* /* yql */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ false); - } +Y_UNIT_TEST(RecursiveMultiLineCommentDefault) { + Check("/* /* yql */", "COMMENT(/* /* yql */) EOF", /* ansi = */ false); + Check("/* /* yql */ */", "COMMENT(/* /* yql */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ false); +} - Y_UNIT_TEST(RecursiveMultiLineCommentAnsi) { - Check("/* /* yql */", "COMMENT(/* /* yql */) EOF", /* ansi = */ true); - Check("/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true); - Check("/* /* /* yql */ */", "COMMENT(/* /* /* yql */ */) EOF", /* ansi = */ true); - Check("/* /* yql */ */ */", "COMMENT(/* /* yql */ */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true); - Check("/* /* yql */ */", "COMMENT(/* /* yql */ */) EOF", /* ansi = */ true); - Check("/*/*/*/", "COMMENT(/*/*/) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true); - Check("/*/**/*/*/*/", "COMMENT(/*/**/*/) ASTERISK(*) SLASH(/) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true); - Check("/* /* */ a /* /* */", "COMMENT(/* /* */ a /* /* */) EOF", 
/* ansi = */ true); - } +Y_UNIT_TEST(RecursiveMultiLineCommentAnsi) { + Check("/* /* yql */", "COMMENT(/* /* yql */) EOF", /* ansi = */ true); + Check("/* yql */ */", "COMMENT(/* yql */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true); + Check("/* /* /* yql */ */", "COMMENT(/* /* /* yql */ */) EOF", /* ansi = */ true); + Check("/* /* yql */ */ */", "COMMENT(/* /* yql */ */) WS( ) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true); + Check("/* /* yql */ */", "COMMENT(/* /* yql */ */) EOF", /* ansi = */ true); + Check("/*/*/*/", "COMMENT(/*/*/) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true); + Check("/*/**/*/*/*/", "COMMENT(/*/**/*/) ASTERISK(*) SLASH(/) ASTERISK(*) SLASH(/) EOF", /* ansi = */ true); + Check("/* /* */ a /* /* */", "COMMENT(/* /* */ a /* /* */) EOF", /* ansi = */ true); +} - Y_UNIT_TEST(RecursiveMultiLineCommentAnsiReferenceComparion) { - SetRandomSeed(100); - for (size_t i = 0; i < 512; ++i) { - auto input = RandomMultilineCommentLikeText(/* maxSize = */ 128); - TString actual = Tokenized(*AnsiLexer, input); - TString expected = Tokenized(*PureAnsiLexer, input); - UNIT_ASSERT_VALUES_EQUAL_C(actual, expected, "Input: " << input); - } +Y_UNIT_TEST(RecursiveMultiLineCommentAnsiReferenceComparion) { + SetRandomSeed(100); + for (size_t i = 0; i < 512; ++i) { + auto input = RandomMultilineCommentLikeText(/* maxSize = */ 128); + TString actual = Tokenized(*AnsiLexer, input); + TString expected = Tokenized(*PureAnsiLexer, input); + UNIT_ASSERT_VALUES_EQUAL_C(actual, expected, "Input: " << input); } +} - Y_UNIT_TEST(Keyword) { - Check("SELECT", "SELECT EOF"); - Check("INSERT", "INSERT EOF"); - Check("FROM", "FROM EOF"); - } +Y_UNIT_TEST(Keyword) { + Check("SELECT", "SELECT EOF"); + Check("INSERT", "INSERT EOF"); + Check("FROM", "FROM EOF"); +} - Y_UNIT_TEST(Punctuation) { - Check( - "* / + - <|", - "ASTERISK(*) WS( ) SLASH(/) WS( ) " - "PLUS(+) WS( ) MINUS(-) WS( ) STRUCT_OPEN(<|) EOF"); - Check("SELECT*FROM", "SELECT ASTERISK(*) FROM EOF"); - } 
+Y_UNIT_TEST(Punctuation) { + Check( + "* / + - <|", + "ASTERISK(*) WS( ) SLASH(/) WS( ) " + "PLUS(+) WS( ) MINUS(-) WS( ) STRUCT_OPEN(<|) EOF"); + Check("SELECT*FROM", "SELECT ASTERISK(*) FROM EOF"); +} - Y_UNIT_TEST(IdPlain) { - Check("variable my_table", "ID_PLAIN(variable) WS( ) ID_PLAIN(my_table) EOF"); - } +Y_UNIT_TEST(IdPlain) { + Check("variable my_table", "ID_PLAIN(variable) WS( ) ID_PLAIN(my_table) EOF"); +} - Y_UNIT_TEST(IdQuoted) { - Check("``", "ID_QUOTED(``) EOF"); - Check("` `", "ID_QUOTED(` `) EOF"); - Check("` `", "ID_QUOTED(` `) EOF"); - Check("`local/table`", "ID_QUOTED(`local/table`) EOF"); - } +Y_UNIT_TEST(IdQuoted) { + Check("``", "ID_QUOTED(``) EOF"); + Check("` `", "ID_QUOTED(` `) EOF"); + Check("` `", "ID_QUOTED(` `) EOF"); + Check("`local/table`", "ID_QUOTED(`local/table`) EOF"); +} - Y_UNIT_TEST(SinleLineString) { - Check("\"\"", "STRING_VALUE(\"\") EOF"); - Check("\' \'", "STRING_VALUE(\' \') EOF"); - Check("\" \"", "STRING_VALUE(\" \") EOF"); - Check("\"test\"", "STRING_VALUE(\"test\") EOF"); +Y_UNIT_TEST(SinleLineString) { + Check("\"\"", "STRING_VALUE(\"\") EOF"); + Check("\' \'", "STRING_VALUE(\' \') EOF"); + Check("\" \"", "STRING_VALUE(\" \") EOF"); + Check("\"test\"", "STRING_VALUE(\"test\") EOF"); - Check("\"\\\"\"", "STRING_VALUE(\"\\\"\") EOF", /* ansi = */ false); - Check("\"\\\"\"", "[INVALID] STRING_VALUE(\"\\\") EOF", /* ansi = */ true); + Check("\"\\\"\"", "STRING_VALUE(\"\\\"\") EOF", /* ansi = */ false); + Check("\"\\\"\"", "[INVALID] STRING_VALUE(\"\\\") EOF", /* ansi = */ true); - Check("\"\"\"\"", "STRING_VALUE(\"\") STRING_VALUE(\"\") EOF", /* ansi = */ false); - Check("\"\"\"\"", "STRING_VALUE(\"\"\"\") EOF", /* ansi = */ true); - } + Check("\"\"\"\"", "STRING_VALUE(\"\") STRING_VALUE(\"\") EOF", /* ansi = */ false); + Check("\"\"\"\"", "STRING_VALUE(\"\"\"\") EOF", /* ansi = */ true); +} - Y_UNIT_TEST(MultiLineString) { - Check("@@@@", "STRING_VALUE(@@@@) EOF"); - Check("@@ @@@", "STRING_VALUE(@@ @@@) EOF"); - 
Check("@@test@@", "STRING_VALUE(@@test@@) EOF"); - Check("@@line1\nline2@@", "STRING_VALUE(@@line1\nline2@@) EOF"); - } +Y_UNIT_TEST(MultiLineString) { + Check("@@@@", "STRING_VALUE(@@@@) EOF"); + Check("@@ @@@", "STRING_VALUE(@@ @@@) EOF"); + Check("@@test@@", "STRING_VALUE(@@test@@) EOF"); + Check("@@line1\nline2@@", "STRING_VALUE(@@line1\nline2@@) EOF"); +} - Y_UNIT_TEST(Query) { - TString query = - "SELECT\n" - " 123467,\n" - " \"Hello, {name}!\",\n" - " (1 + (5 * 1 / 0)),\n" - " MIN(identifier),\n" - " Bool(field),\n" - " Math::Sin(var)\n" - "FROM `local/test/space/table`\n" - "JOIN test;"; - - TString expected = - "SELECT WS(\n) " - "WS( ) WS( ) DIGITS(123467) COMMA(,) WS(\n) " - "WS( ) WS( ) STRING_VALUE(\"Hello, {name}!\") COMMA(,) WS(\n) " - "WS( ) WS( ) LPAREN(() DIGITS(1) WS( ) PLUS(+) WS( ) LPAREN(() DIGITS(5) WS( ) " - "ASTERISK(*) WS( ) DIGITS(1) WS( ) SLASH(/) WS( ) DIGITS(0) RPAREN()) " - "RPAREN()) COMMA(,) WS(\n) " - "WS( ) WS( ) ID_PLAIN(MIN) LPAREN(() ID_PLAIN(identifier) RPAREN()) COMMA(,) WS(\n) " - "WS( ) WS( ) ID_PLAIN(Bool) LPAREN(() ID_PLAIN(field) RPAREN()) COMMA(,) WS(\n) " - "WS( ) WS( ) ID_PLAIN(Math) NAMESPACE(::) ID_PLAIN(Sin) LPAREN(() ID_PLAIN(var) RPAREN()) WS(\n) " - "FROM WS( ) ID_QUOTED(`local/test/space/table`) WS(\n) " - "JOIN WS( ) ID_PLAIN(test) SEMICOLON(;) EOF"; - - Check(query, expected); - } +Y_UNIT_TEST(Query) { + TString query = + "SELECT\n" + " 123467,\n" + " \"Hello, {name}!\",\n" + " (1 + (5 * 1 / 0)),\n" + " MIN(identifier),\n" + " Bool(field),\n" + " Math::Sin(var)\n" + "FROM `local/test/space/table`\n" + "JOIN test;"; + + TString expected = + "SELECT WS(\n) " + "WS( ) WS( ) DIGITS(123467) COMMA(,) WS(\n) " + "WS( ) WS( ) STRING_VALUE(\"Hello, {name}!\") COMMA(,) WS(\n) " + "WS( ) WS( ) LPAREN(() DIGITS(1) WS( ) PLUS(+) WS( ) LPAREN(() DIGITS(5) WS( ) " + "ASTERISK(*) WS( ) DIGITS(1) WS( ) SLASH(/) WS( ) DIGITS(0) RPAREN()) " + "RPAREN()) COMMA(,) WS(\n) " + "WS( ) WS( ) ID_PLAIN(MIN) LPAREN(() 
ID_PLAIN(identifier) RPAREN()) COMMA(,) WS(\n) " + "WS( ) WS( ) ID_PLAIN(Bool) LPAREN(() ID_PLAIN(field) RPAREN()) COMMA(,) WS(\n) " + "WS( ) WS( ) ID_PLAIN(Math) NAMESPACE(::) ID_PLAIN(Sin) LPAREN(() ID_PLAIN(var) RPAREN()) WS(\n) " + "FROM WS( ) ID_QUOTED(`local/test/space/table`) WS(\n) " + "JOIN WS( ) ID_PLAIN(test) SEMICOLON(;) EOF"; + + Check(query, expected); +} - Y_UNIT_TEST(Invalid) { - Check("\"", "[INVALID] EOF"); - Check("\" SELECT", "[INVALID] WS( ) SELECT EOF"); - } +Y_UNIT_TEST(Invalid) { + Check("\"", "[INVALID] EOF"); + Check("\" SELECT", "[INVALID] WS( ) SELECT EOF"); +} } // Y_UNIT_TEST_SUITE(RegexLexerTests) diff --git a/yql/essentials/sql/v1/lexer/regex/regex.cpp b/yql/essentials/sql/v1/lexer/regex/regex.cpp index 4d50d26e046..05e62b65423 100644 --- a/yql/essentials/sql/v1/lexer/regex/regex.cpp +++ b/yql/essentials/sql/v1/lexer/regex/regex.cpp @@ -18,254 +18,254 @@ namespace NSQLTranslationV1 { - class TLexerGrammarToRegexTranslator { - private: - struct TRewriteRule { - TString Repr; - std::function<void(TString&)> Apply; - }; +class TLexerGrammarToRegexTranslator { +private: + struct TRewriteRule { + TString Repr; + std::function<void(TString&)> Apply; + }; - using TRewriteRules = TVector<TRewriteRule>; + using TRewriteRules = TVector<TRewriteRule>; - public: - explicit TLexerGrammarToRegexTranslator(const NSQLReflect::TLexerGrammar& grammar, bool ansi) - : Grammar_(&grammar) - , Mode_(ansi ? "ANSI" : "DEFAULT") - { - AddExternalRules(Inliners_); - AddFragmentRules(Inliners_); +public: + explicit TLexerGrammarToRegexTranslator(const NSQLReflect::TLexerGrammar& grammar, bool ansi) + : Grammar_(&grammar) + , Mode_(ansi ? 
"ANSI" : "DEFAULT") + { + AddExternalRules(Inliners_); + AddFragmentRules(Inliners_); - AddLetterRules(Transformations_); - AddTransformationRules(Transformations_); + AddLetterRules(Transformations_); + AddTransformationRules(Transformations_); - UnwrapQuotes_ = UnwrapQuotesRule(); - AddSpaceCollapses(SpaceCollapses_); - UnwrapQuotedSpace_ = UnwrapQuotedSpaceRule(); - AddRegexOptimizations(RegexOptimizations_); - } + UnwrapQuotes_ = UnwrapQuotesRule(); + AddSpaceCollapses(SpaceCollapses_); + UnwrapQuotedSpace_ = UnwrapQuotedSpaceRule(); + AddRegexOptimizations(RegexOptimizations_); + } - TString ToRegex(const TStringBuf name) { - TString text = Grammar_->BlockByName.at(name); - Preprocess(text); - Inline(text); - Transform(text); - Finalize(text); - return text; - } + TString ToRegex(const TStringBuf name) { + TString text = Grammar_->BlockByName.at(name); + Preprocess(text); + Inline(text); + Transform(text); + Finalize(text); + return text; + } - private: - void Preprocess(TString& text) { - text = ChangedDigitsPrecendence(std::move(text)); - } +private: + void Preprocess(TString& text) { + text = ChangedDigitsPrecendence(std::move(text)); + } - void Inline(TString& text) { - ApplyEachWhileChanging(text, Inliners_); - } + void Inline(TString& text) { + ApplyEachWhileChanging(text, Inliners_); + } - void AddExternalRules(TRewriteRules& rules) { - THashMap<TString, THashMap<TString, TString>> Substitutions = { - SUBSTITUTIONS(DEFAULT), - SUBSTITUTIONS(ANSI), - }; + void AddExternalRules(TRewriteRules& rules) { + THashMap<TString, THashMap<TString, TString>> Substitutions = { + SUBSTITUTIONS(DEFAULT), + SUBSTITUTIONS(ANSI), + }; - // ANSI mode MULTILINE_COMMENT is recursive - Substitutions["ANSI"]["GRAMMAR_MULTILINE_COMMENT_CORE"] = - Substitutions["DEFAULT"]["GRAMMAR_MULTILINE_COMMENT_CORE"]; + // ANSI mode MULTILINE_COMMENT is recursive + Substitutions["ANSI"]["GRAMMAR_MULTILINE_COMMENT_CORE"] = + Substitutions["DEFAULT"]["GRAMMAR_MULTILINE_COMMENT_CORE"]; - for 
(const auto& [k, v] : Substitutions.at(Mode_)) { - rules.emplace_back(RegexRewriteRule("@" + k + "@", v)); - } + for (const auto& [k, v] : Substitutions.at(Mode_)) { + rules.emplace_back(RegexRewriteRule("@" + k + "@", v)); } + } - void AddFragmentRules(TRewriteRules& rules) { - const THashSet<TString> PunctuationFragments = { - "BACKSLASH", - "QUOTE_DOUBLE", - "QUOTE_SINGLE", - "BACKTICK", - "DOUBLE_COMMAT", - }; - - for (const auto& [name, definition] : Grammar_->BlockByName) { - TString def = definition; - if ( - Grammar_->PunctuationNames.contains(name) || - PunctuationFragments.contains(name)) { - def = "'" + def + "'"; - } else if (name == "DIGITS") { - def = ChangedDigitsPrecendence(std::move(def)); - } - def = QuoteAntlrRewrite(std::move(def)); + void AddFragmentRules(TRewriteRules& rules) { + const THashSet<TString> PunctuationFragments = { + "BACKSLASH", + "QUOTE_DOUBLE", + "QUOTE_SINGLE", + "BACKTICK", + "DOUBLE_COMMAT", + }; - rules.emplace_back(RegexRewriteRule( - "(\\b" + name + "\\b)", - "(" + def + ")")); + for (const auto& [name, definition] : Grammar_->BlockByName) { + TString def = definition; + if ( + Grammar_->PunctuationNames.contains(name) || + PunctuationFragments.contains(name)) { + def = "'" + def + "'"; + } else if (name == "DIGITS") { + def = ChangedDigitsPrecendence(std::move(def)); } - } + def = QuoteAntlrRewrite(std::move(def)); - // Regex engine matches the first matched alternative, - // even if it is not the longest one, while ANTLR is more gready. - TString ChangedDigitsPrecendence(TString body) { - if (SubstGlobal(body, "DECDIGITS | ", "") != 0) { - SubstGlobal(body, "BINDIGITS", "BINDIGITS | DECDIGITS"); - } - return body; + rules.emplace_back(RegexRewriteRule( + "(\\b" + name + "\\b)", + "(" + def + ")")); } + } - void Transform(TString& text) { - ApplyEachWhileChanging(text, Transformations_); + // Regex engine matches the first matched alternative, + // even if it is not the longest one, while ANTLR is more gready. 
+ TString ChangedDigitsPrecendence(TString body) { + if (SubstGlobal(body, "DECDIGITS | ", "") != 0) { + SubstGlobal(body, "BINDIGITS", "BINDIGITS | DECDIGITS"); } + return body; + } - void AddLetterRules(TRewriteRules& rules) { - for (char letter = 'A'; letter <= 'Z'; ++letter) { - TString lower(char(ToLower(letter))); - TString upper(char(ToUpper(letter))); - rules.emplace_back(RegexRewriteRule( - "([^'\\w\\[\\]]|^)" + upper + "([^'\\w\\[\\]]|$)", - "\\1[" + lower + upper + "]\\2")); - } - } + void Transform(TString& text) { + ApplyEachWhileChanging(text, Transformations_); + } - void AddTransformationRules(TRewriteRules& rules) { + void AddLetterRules(TRewriteRules& rules) { + for (char letter = 'A'; letter <= 'Z'; ++letter) { + TString lower(char(ToLower(letter))); + TString upper(char(ToUpper(letter))); rules.emplace_back(RegexRewriteRule( - R"(~\('(..?)' \| '(..?)'\))", R"([^\1\2])")); + "([^'\\w\\[\\]]|^)" + upper + "([^'\\w\\[\\]]|$)", + "\\1[" + lower + upper + "]\\2")); + } + } - rules.emplace_back(RegexRewriteRule( - R"(~\('(..?)'\))", R"([^\1])")); + void AddTransformationRules(TRewriteRules& rules) { + rules.emplace_back(RegexRewriteRule( + R"(~\('(..?)' \| '(..?)'\))", R"([^\1\2])")); - rules.emplace_back(RegexRewriteRule( - R"(('..?')\.\.('..?'))", R"([\1-\2])")); + rules.emplace_back(RegexRewriteRule( + R"(~\('(..?)'\))", R"([^\1])")); - rules.emplace_back(RegexRewriteRule( - R"(\((.)\))", R"(\1)")); + rules.emplace_back(RegexRewriteRule( + R"(('..?')\.\.('..?'))", R"([\1-\2])")); - rules.emplace_back(RegexRewriteRule( - R"(\((\[.{1,8}\])\))", R"(\1)")); + rules.emplace_back(RegexRewriteRule( + R"(\((.)\))", R"(\1)")); - rules.emplace_back(RegexRewriteRule( - R"(\(('..?')\))", R"(\1)")); + rules.emplace_back(RegexRewriteRule( + R"(\((\[.{1,8}\])\))", R"(\1)")); - rules.emplace_back(RegexRewriteRule( - R"( \.)", R"( (.|\\n))")); + rules.emplace_back(RegexRewriteRule( + R"(\(('..?')\))", R"(\1)")); - rules.emplace_back(RegexRewriteRule( - 
R"(\bEOF\b)", R"($)")); + rules.emplace_back(RegexRewriteRule( + R"( \.)", R"( (.|\\n))")); - rules.emplace_back(RegexRewriteRule( - R"('\\u000C' \|)", R"('\\f' |)")); - } + rules.emplace_back(RegexRewriteRule( + R"(\bEOF\b)", R"($)")); - void Finalize(TString& text) { - UnwrapQuotes_.Apply(text); - ApplyEachWhileChanging(text, SpaceCollapses_); - UnwrapQuotedSpace_.Apply(text); - ApplyEachWhileChanging(text, RegexOptimizations_); - } + rules.emplace_back(RegexRewriteRule( + R"('\\u000C' \|)", R"('\\f' |)")); + } - void AddSpaceCollapses(TRewriteRules& rules) { - rules.emplace_back(RegexRewriteRule(R"(([^']|^) )", R"(\1)")); - rules.emplace_back(RegexRewriteRule(R"( ([^']|$))", R"(\1)")); - } + void Finalize(TString& text) { + UnwrapQuotes_.Apply(text); + ApplyEachWhileChanging(text, SpaceCollapses_); + UnwrapQuotedSpace_.Apply(text); + ApplyEachWhileChanging(text, RegexOptimizations_); + } - void AddRegexOptimizations(TRewriteRules& rules) { - // ([a-z]|_) -> ([a-z_]) - rules.emplace_back(RegexRewriteRule( - R"re(\[([^\^\[\]]+)\]\|(.)([\)\|]))re", - R"re([\1\2]\3)re")); + void AddSpaceCollapses(TRewriteRules& rules) { + rules.emplace_back(RegexRewriteRule(R"(([^']|^) )", R"(\1)")); + rules.emplace_back(RegexRewriteRule(R"( ([^']|$))", R"(\1)")); + } - // ([a-z]|[A-Z]) -> ([a-zA-Z]) - rules.emplace_back(RegexRewriteRule( - R"re(\[([^\^\[\]]+)\]\|\[([^\^\[\]]+)\]([\)\|]))re", - R"re([\1\2]\3)re")); - } + void AddRegexOptimizations(TRewriteRules& rules) { + // ([a-z]|_) -> ([a-z_]) + rules.emplace_back(RegexRewriteRule( + R"re(\[([^\^\[\]]+)\]\|(.)([\)\|]))re", + R"re([\1\2]\3)re")); - void ApplyEachOnce(TString& text, const TRewriteRules& rules) { - for (const auto& rule : rules) { - rule.Apply(text); - } + // ([a-z]|[A-Z]) -> ([a-zA-Z]) + rules.emplace_back(RegexRewriteRule( + R"re(\[([^\^\[\]]+)\]\|\[([^\^\[\]]+)\]([\)\|]))re", + R"re([\1\2]\3)re")); + } + + void ApplyEachOnce(TString& text, const TRewriteRules& rules) { + for (const auto& rule : rules) { + 
rule.Apply(text); } + } - void ApplyEachWhileChanging(TString& text, const TRewriteRules& rules) { - constexpr size_t Limit = 16; + void ApplyEachWhileChanging(TString& text, const TRewriteRules& rules) { + constexpr size_t Limit = 16; - TString prev; - for (size_t i = 0; i < Limit + 1 && prev != text; ++i) { - prev = text; - ApplyEachOnce(text, rules); - Y_ENSURE(i != Limit); - } + TString prev; + for (size_t i = 0; i < Limit + 1 && prev != text; ++i) { + prev = text; + ApplyEachOnce(text, rules); + Y_ENSURE(i != Limit); } + } - TRewriteRule RegexRewriteRule(const TString& regex, TString rewrite) { - auto re2 = std::make_shared<RE2>(regex, RE2::Quiet); - Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'"); - - TString error; - Y_ENSURE( - re2->CheckRewriteString(rewrite, &error), - error << " on rewrite '" << rewrite << "'"); - - return { - .Repr = regex + " -> " + rewrite, - .Apply = [re2, rewrite = std::move(rewrite)](TString& text) { - RE2::GlobalReplace(&text, *re2, rewrite); - }, - }; - } + TRewriteRule RegexRewriteRule(const TString& regex, TString rewrite) { + auto re2 = std::make_shared<RE2>(regex, RE2::Quiet); + Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'"); + + TString error; + Y_ENSURE( + re2->CheckRewriteString(rewrite, &error), + error << " on rewrite '" << rewrite << "'"); + + return { + .Repr = regex + " -> " + rewrite, + .Apply = [re2, rewrite = std::move(rewrite)](TString& text) { + RE2::GlobalReplace(&text, *re2, rewrite); + }, + }; + } - TRewriteRule UnwrapQuotesRule() { - const TString regex = R"('([^ ][^ ]?)')"; - auto re2 = std::make_shared<RE2>(regex, RE2::Quiet); - Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'"); - - return { - .Repr = regex + " -> Quoted(\\1)", - .Apply = [re2](TString& text) { - TString content; - std::size_t i = 256; - while (RE2::PartialMatch(text, *re2, &content) && --i != 0) { - TString quoted = RE2::QuoteMeta(content); - for (size_t i = 0; i < 2 && 
quoted.StartsWith(R"(\\)"); ++i) { - quoted.erase(std::begin(quoted)); - } - SubstGlobal(text, "'" + content + "'", quoted); + TRewriteRule UnwrapQuotesRule() { + const TString regex = R"('([^ ][^ ]?)')"; + auto re2 = std::make_shared<RE2>(regex, RE2::Quiet); + Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'"); + + return { + .Repr = regex + " -> Quoted(\\1)", + .Apply = [re2](TString& text) { + TString content; + std::size_t i = 256; + while (RE2::PartialMatch(text, *re2, &content) && --i != 0) { + TString quoted = RE2::QuoteMeta(content); + for (size_t i = 0; i < 2 && quoted.StartsWith(R"(\\)"); ++i) { + quoted.erase(std::begin(quoted)); } - Y_ENSURE(i != 0); - }, - }; - } + SubstGlobal(text, "'" + content + "'", quoted); + } + Y_ENSURE(i != 0); + }, + }; + } - TRewriteRule UnwrapQuotedSpaceRule() { - return RegexRewriteRule(R"(' ')", R"( )"); - } + TRewriteRule UnwrapQuotedSpaceRule() { + return RegexRewriteRule(R"(' ')", R"( )"); + } - TString QuoteAntlrRewrite(TString rewrite) { - SubstGlobal(rewrite, R"(\)", R"(\\)"); - SubstGlobal(rewrite, R"('\\')", R"('\\\\')"); - return rewrite; - } + TString QuoteAntlrRewrite(TString rewrite) { + SubstGlobal(rewrite, R"(\)", R"(\\)"); + SubstGlobal(rewrite, R"('\\')", R"('\\\\')"); + return rewrite; + } - const NSQLReflect::TLexerGrammar* Grammar_; - const TStringBuf Mode_; + const NSQLReflect::TLexerGrammar* Grammar_; + const TStringBuf Mode_; - TRewriteRules Inliners_; + TRewriteRules Inliners_; - TRewriteRules Transformations_; + TRewriteRules Transformations_; - TRewriteRule UnwrapQuotes_; - TRewriteRules SpaceCollapses_; - TRewriteRule UnwrapQuotedSpace_; - TRewriteRules RegexOptimizations_; - }; + TRewriteRule UnwrapQuotes_; + TRewriteRules SpaceCollapses_; + TRewriteRule UnwrapQuotedSpace_; + TRewriteRules RegexOptimizations_; +}; - TVector<std::tuple<TString, TString>> MakeRegexByOtherName(const NSQLReflect::TLexerGrammar& grammar, bool ansi) { - TLexerGrammarToRegexTranslator translator(grammar, 
ansi); +TVector<std::tuple<TString, TString>> MakeRegexByOtherName(const NSQLReflect::TLexerGrammar& grammar, bool ansi) { + TLexerGrammarToRegexTranslator translator(grammar, ansi); - TVector<std::tuple<TString, TString>> regexes; - for (const auto& token : grammar.OtherNames) { - regexes.emplace_back(token, translator.ToRegex(token)); - } - return regexes; + TVector<std::tuple<TString, TString>> regexes; + for (const auto& token : grammar.OtherNames) { + regexes.emplace_back(token, translator.ToRegex(token)); } + return regexes; +} } // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/lexer/regex/regex.h b/yql/essentials/sql/v1/lexer/regex/regex.h index 1e9d92b6535..943f8c73e42 100644 --- a/yql/essentials/sql/v1/lexer/regex/regex.h +++ b/yql/essentials/sql/v1/lexer/regex/regex.h @@ -6,9 +6,9 @@ namespace NSQLTranslationV1 { - // Makes regexes only for tokens from OtherNames, - // as keywords and punctuation are trivially matched. - TVector<std::tuple<TString, TString>> MakeRegexByOtherName( - const NSQLReflect::TLexerGrammar& grammar, bool ansi); +// Makes regexes only for tokens from OtherNames, +// as keywords and punctuation are trivially matched. 
+TVector<std::tuple<TString, TString>> MakeRegexByOtherName( + const NSQLReflect::TLexerGrammar& grammar, bool ansi); } // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp index b8d78799dda..2f05d02d776 100644 --- a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp +++ b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp @@ -11,80 +11,80 @@ auto grammar = NSQLReflect::LoadLexerGrammar(); auto defaultRegexes = MakeRegexByOtherName(grammar, /* ansi = */ false); auto ansiRegexes = MakeRegexByOtherName(grammar, /* ansi = */ true); -TString Get(const TVector<std::tuple<TString, TString>> &regexes, +TString Get(const TVector<std::tuple<TString, TString>>& regexes, const TStringBuf name) { - return std::get<1>(*FindIf( - regexes, [&](const auto &pair) { return std::get<0>(pair) == name; })); + return std::get<1>(*FindIf( + regexes, [&](const auto& pair) { return std::get<0>(pair) == name; })); } void CheckRegex(bool ansi, const TStringBuf name, const TStringBuf expected) { - const auto &regexes = ansi ? ansiRegexes : defaultRegexes; - const TString regex = Get(regexes, name); + const auto& regexes = ansi ? 
ansiRegexes : defaultRegexes; + const TString regex = Get(regexes, name); - const RE2 re2(regex); - Y_ENSURE(re2.ok(), re2.error()); + const RE2 re2(regex); + Y_ENSURE(re2.ok(), re2.error()); - UNIT_ASSERT_VALUES_EQUAL(regex, expected); + UNIT_ASSERT_VALUES_EQUAL(regex, expected); } } // namespace Y_UNIT_TEST_SUITE(SqlRegexTests) { - Y_UNIT_TEST(StringValue) { +Y_UNIT_TEST(StringValue) { CheckRegex( /* ansi = */ false, "STRING_VALUE", R"(((((\'([^'\\]|(\\(.|\n)))*\'))|((\"([^"\\]|(\\(.|\n)))*\"))|((\@\@(.|\n)*?\@\@)+\@?))([sSuUyYjJ]|[pP]([tTbBvV])?)?))"); - } +} - Y_UNIT_TEST(AnsiStringValue) { +Y_UNIT_TEST(AnsiStringValue) { CheckRegex( /* ansi = */ true, "STRING_VALUE", R"(((((\'([^']|(\'\'))*\'))|((\"([^"]|(\"\"))*\"))|((\@\@(.|\n)*?\@\@)+\@?))([sSuUyYjJ]|[pP]([tTbBvV])?)?))"); - } +} - Y_UNIT_TEST(IdPlain) { +Y_UNIT_TEST(IdPlain) { CheckRegex( /* ansi = */ false, "ID_PLAIN", R"(([a-zA-Z_])([a-zA-Z_0-9])*)"); - } +} - Y_UNIT_TEST(IdQuoted) { +Y_UNIT_TEST(IdQuoted) { CheckRegex( /* ansi = */ false, "ID_QUOTED", R"(\`(\\(.|\n)|\`\`|[^`\\])*\`)"); - } +} - Y_UNIT_TEST(Digits) { +Y_UNIT_TEST(Digits) { CheckRegex( /* ansi = */ false, "DIGITS", R"((0[xX]([0-9a-fA-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+)|([0-9]+))"); - } +} - Y_UNIT_TEST(IntegerValue) { +Y_UNIT_TEST(IntegerValue) { CheckRegex( /* ansi = */ false, "INTEGER_VALUE", R"(((0[xX]([0-9a-fA-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+)|([0-9]+))(([pPuU])?([lLsStTiIbBnN])?))"); - } +} - Y_UNIT_TEST(Real) { +Y_UNIT_TEST(Real) { CheckRegex( /* ansi = */ false, "REAL", R"((([0-9]+)\.[0-9]*([eE](\+|\-)?([0-9]+))?|([0-9]+)([eE](\+|\-)?([0-9]+)))([fF]|[pP]([fF](4|8)|[nN])?)?)"); - } +} - Y_UNIT_TEST(Ws) { +Y_UNIT_TEST(Ws) { CheckRegex( /* ansi = */ false, "WS", R"(( |\r|\t|\f|\n))"); - } +} - Y_UNIT_TEST(Comment) { +Y_UNIT_TEST(Comment) { CheckRegex( /* ansi = */ false, "COMMENT", R"(((\/\*(.|\n)*?\*\/)|(\-\-[^\n\r]*(\r\n?|\n|$))))"); - } +} - Y_UNIT_TEST(AnsiCommentSameAsDefault) { +Y_UNIT_TEST(AnsiCommentSameAsDefault) { // Because 
of recursive definition UNIT_ASSERT_VALUES_EQUAL(Get(ansiRegexes, "COMMENT"), Get(defaultRegexes, "COMMENT")); - } +} } // Y_UNIT_TEST_SUITE(SqlRegexTests) |