diff options
author | vityaman <vityaman.dev@yandex.ru> | 2025-03-28 18:29:24 +0300 |
---|---|---|
committer | robot-piglet <robot-piglet@yandex-team.com> | 2025-03-28 18:50:04 +0300 |
commit | 60b99f11bcb2386c2a1c36ffd2e96e69d0105dac (patch) | |
tree | 08c15d732484c6accf16658b09ed8f07286a9338 /yql/essentials/sql/v1/lexer/regex/regex.cpp | |
parent | 1e214be59cbf130bee433c422b42f16148e5acff (diff) | |
download | ydb-60b99f11bcb2386c2a1c36ffd2e96e69d0105dac.tar.gz |
YQL-19616 Convert YQL lexer grammar to regexes
- [x] Parse YQL grammar to extract lexer grammar into `TLexerGrammar`.
- [x] Translate `TLexerGrammar` into regexes.
- [x] Implement a lexer via regexes `TRegexLexer` to test generated regexes validity.
- [x] Test on `Default` syntax mode.
- [x] Test on `ANSI` syntax mode.
---
- Related to https://github.com/ydb-platform/ydb/issues/15129
- Requirement for https://github.com/ytsaurus/ytsaurus/pull/1112
---
Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1127
commit_hash:03ffffe81cdafe7f93a4d3fd9a3212fe67f1c72d
Diffstat (limited to 'yql/essentials/sql/v1/lexer/regex/regex.cpp')
-rw-r--r-- | yql/essentials/sql/v1/lexer/regex/regex.cpp | 240 |
1 files changed, 240 insertions, 0 deletions
diff --git a/yql/essentials/sql/v1/lexer/regex/regex.cpp b/yql/essentials/sql/v1/lexer/regex/regex.cpp new file mode 100644 index 00000000000..a8aca8a1318 --- /dev/null +++ b/yql/essentials/sql/v1/lexer/regex/regex.cpp @@ -0,0 +1,240 @@ +#include "regex.h" + +#include <contrib/libs/re2/re2/re2.h> + +#include <util/generic/vector.h> + +#define SUBSTITUTION(name, mode) \ + {#name, name##_##mode} + +#define SUBSTITUTIONS(mode) \ + { \ + #mode, { \ + SUBSTITUTION(GRAMMAR_STRING_CORE_SINGLE, mode), \ + SUBSTITUTION(GRAMMAR_STRING_CORE_DOUBLE, mode), \ + SUBSTITUTION(GRAMMAR_MULTILINE_COMMENT_CORE, mode), \ + } \ + } + +namespace NSQLTranslationV1 { + + class TLexerGrammarToRegexTranslator { + private: + struct TRewriteRule { + TString Repr; + std::function<void(TString&)> Apply; + }; + + using TRewriteRules = TVector<TRewriteRule>; + + public: + explicit TLexerGrammarToRegexTranslator(const NSQLReflect::TLexerGrammar& grammar, bool ansi) + : Grammar_(&grammar) + , Mode_(ansi ? "ANSI" : "DEFAULT") + { + AddExternalRules(Inliners_); + AddFragmentRules(Inliners_); + + AddLetterRules(Transformations_); + AddTransformationRules(Transformations_); + + UnwrapQuotes_ = UnwrapQuotesRule(); + AddSpaceCollapses(SpaceCollapses_); + UnwrapQuotedSpace_ = UnwrapQuotedSpaceRule(); + } + + TString ToRegex(const TStringBuf name) { + TString text = Grammar_->BlockByName.at(name); + Inline(text); + Transform(text); + Finalize(text); + return text; + } + + private: + void Inline(TString& text) { + ApplyEachWhileChanging(text, Inliners_); + } + + void AddExternalRules(TRewriteRules& rules) { + THashMap<TString, THashMap<TString, TString>> Substitutions = { + SUBSTITUTIONS(DEFAULT), + SUBSTITUTIONS(ANSI), + }; + + // ANSI mode MULTILINE_COMMENT is recursive + Substitutions["ANSI"]["GRAMMAR_MULTILINE_COMMENT_CORE"] = + Substitutions["DEFAULT"]["GRAMMAR_MULTILINE_COMMENT_CORE"]; + + for (const auto& [k, v] : Substitutions.at(Mode_)) { + rules.emplace_back(RegexRewriteRule("@" + k + "@", v)); + } + } + + void AddFragmentRules(TRewriteRules& rules) { + const THashSet<TString> PunctuationFragments = { + "BACKSLASH", + "QUOTE_DOUBLE", + "QUOTE_SINGLE", + "BACKTICK", + "DOUBLE_COMMAT", + }; + + for (const auto& [name, definition] : Grammar_->BlockByName) { + TString def = definition; + if ( + Grammar_->PunctuationNames.contains(name) || + PunctuationFragments.contains(name)) { + def = "'" + def + "'"; + } + def = QuoteAntlrRewrite(std::move(def)); + + rules.emplace_back(RegexRewriteRule( + "(\\b" + name + "\\b)", + "(" + def + ")")); + } + } + + void Transform(TString& text) { + ApplyEachWhileChanging(text, Transformations_); + } + + void AddLetterRules(TRewriteRules& rules) { + for (char letter = 'A'; letter <= 'Z'; ++letter) { + TString lower(char(ToLower(letter))); + TString upper(char(ToUpper(letter))); + rules.emplace_back(RegexRewriteRule( + "([^'\\w\\[\\]]|^)" + upper + "([^'\\w\\[\\]]|$)", + "\\1[" + lower + upper + "]\\2")); + } + } + + void AddTransformationRules(TRewriteRules& rules) { + rules.emplace_back(RegexRewriteRule( + R"(~\('(..?)' \| '(..?)'\))", R"([^\1\2])")); + + rules.emplace_back(RegexRewriteRule( + R"(~\('(..?)'\))", R"([^\1])")); + + rules.emplace_back(RegexRewriteRule( + R"(('..?')\.\.('..?'))", R"([\1-\2])")); + + rules.emplace_back(RegexRewriteRule( + R"(\((.)\))", R"(\1)")); + + rules.emplace_back(RegexRewriteRule( + R"(\((\[.{1,8}\])\))", R"(\1)")); + + rules.emplace_back(RegexRewriteRule( + R"(\(('..?')\))", R"(\1)")); + + rules.emplace_back(RegexRewriteRule( + R"( \.)", R"( (.|\\n))")); + + rules.emplace_back(RegexRewriteRule( + R"(\bEOF\b)", R"($)")); + + rules.emplace_back(RegexRewriteRule( + R"('\\u000C' \|)", "")); + } + + void Finalize(TString& text) { + UnwrapQuotes_.Apply(text); + ApplyEachWhileChanging(text, SpaceCollapses_); + UnwrapQuotedSpace_.Apply(text); + } + + void AddSpaceCollapses(TRewriteRules& rules) { + rules.emplace_back(RegexRewriteRule(R"(([^']|^) )", R"(\1)")); + rules.emplace_back(RegexRewriteRule(R"( ([^']|$))", R"(\1)")); + } + + void ApplyEachOnce(TString& text, const TRewriteRules& rules) { + for (const auto& rule : rules) { + rule.Apply(text); + } + } + + void ApplyEachWhileChanging(TString& text, const TRewriteRules& rules) { + constexpr size_t Limit = 16; + + TString prev; + for (size_t i = 0; i < Limit + 1 && prev != text; ++i) { + prev = text; + ApplyEachOnce(text, rules); + Y_ENSURE(i != Limit); + } + } + + TRewriteRule RegexRewriteRule(const TString& regex, TString rewrite) { + auto re2 = std::make_shared<RE2>(regex, RE2::Quiet); + Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'"); + + TString error; + Y_ENSURE( + re2->CheckRewriteString(rewrite, &error), + error << " on rewrite '" << rewrite << "'"); + + return { + .Repr = regex + " -> " + rewrite, + .Apply = [re2, rewrite = std::move(rewrite)](TString& text) { + RE2::GlobalReplace(&text, *re2, rewrite); + }, + }; + } + + TRewriteRule UnwrapQuotesRule() { + const TString regex = R"('([^ ][^ ]?)')"; + auto re2 = std::make_shared<RE2>(regex, RE2::Quiet); + Y_ENSURE(re2->ok(), re2->error() << " on regex '" << regex << "'"); + + return { + .Repr = regex + " -> Quoted(\\1)", + .Apply = [re2](TString& text) { + TString content; + std::size_t i = 256; + while (RE2::PartialMatch(text, *re2, &content) && --i != 0) { + TString quoted = RE2::QuoteMeta(content); + for (size_t i = 0; i < 2 && quoted.StartsWith(R"(\\)"); ++i) { + quoted.erase(std::begin(quoted)); + } + SubstGlobal(text, "'" + content + "'", quoted); + } + Y_ENSURE(i != 0); + }, + }; + } + + TRewriteRule UnwrapQuotedSpaceRule() { + return RegexRewriteRule(R"(' ')", R"( )"); + } + + TString QuoteAntlrRewrite(TString rewrite) { + SubstGlobal(rewrite, R"(\)", R"(\\)"); + SubstGlobal(rewrite, R"('\\')", R"('\\\\')"); + return rewrite; + } + + const NSQLReflect::TLexerGrammar* Grammar_; + const TStringBuf Mode_; + + TRewriteRules Inliners_; + + TRewriteRules Transformations_; + + TRewriteRule UnwrapQuotes_; + TRewriteRules SpaceCollapses_; + TRewriteRule UnwrapQuotedSpace_; + }; + + THashMap<TString, TString> MakeRegexByOtherNameMap(const NSQLReflect::TLexerGrammar& grammar, bool ansi) { + TLexerGrammarToRegexTranslator translator(grammar, ansi); + + THashMap<TString, TString> regexes; + for (const auto& token : grammar.OtherNames) { + regexes.emplace(token, translator.ToRegex(token)); + } + return regexes; + } + +} // namespace NSQLTranslationV1 |