diff options
author | vityaman <[email protected]> | 2025-03-28 18:29:24 +0300 |
---|---|---|
committer | robot-piglet <[email protected]> | 2025-03-28 18:50:04 +0300 |
commit | 60b99f11bcb2386c2a1c36ffd2e96e69d0105dac (patch) | |
tree | 08c15d732484c6accf16658b09ed8f07286a9338 /yql/essentials/sql/v1/reflect/sql_reflect.cpp | |
parent | 1e214be59cbf130bee433c422b42f16148e5acff (diff) |
YQL-19616 Convert YQL lexer grammar to regexes
- [x] Parse YQL grammar to extract lexer grammar into `TLexerGrammar`.
- [x] Translate `TLexerGrammar` into regexes.
- [x] Implement a lexer via regexes `TRegexLexer` to test generated regexes validity.
- [x] Test on `Default` syntax mode.
- [x] Test on `ANSI` syntax mode.
---
- Related to https://github.com/ydb-platform/ydb/issues/15129
- Requirement for https://github.com/ytsaurus/ytsaurus/pull/1112
---
Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1127
commit_hash:03ffffe81cdafe7f93a4d3fd9a3212fe67f1c72d
Diffstat (limited to 'yql/essentials/sql/v1/reflect/sql_reflect.cpp')
-rw-r--r-- | yql/essentials/sql/v1/reflect/sql_reflect.cpp | 173 |
1 files changed, 173 insertions, 0 deletions
diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.cpp b/yql/essentials/sql/v1/reflect/sql_reflect.cpp new file mode 100644 index 00000000000..f47f35cb9de --- /dev/null +++ b/yql/essentials/sql/v1/reflect/sql_reflect.cpp @@ -0,0 +1,173 @@ +#include "sql_reflect.h" + +#include <library/cpp/resource/resource.h> + +#include <util/string/split.h> +#include <util/string/strip.h> + +namespace NSQLReflect { + + const TStringBuf ReflectPrefix = "//!"; + const TStringBuf SectionPrefix = "//! section:"; + const TStringBuf SectionPunctuation = "//! section:punctuation"; + const TStringBuf SectionLetter = "//! section:letter"; + const TStringBuf SectionKeyword = "//! section:keyword"; + const TStringBuf SectionOther = "//! section:other"; + const TStringBuf FragmentPrefix = "fragment "; + + TVector<TString> GetResourceLines(const TStringBuf key) { + TString text; + Y_ENSURE(NResource::FindExact(key, &text)); + + TVector<TString> lines; + Split(text, "\n", lines); + return lines; + } + + void Format(TVector<TString>& lines) { + for (size_t i = 0; i < lines.size(); ++i) { + auto& line = lines[i]; + + StripInPlace(line); + + if (line.StartsWith("//") || (line.Contains(':') && line.Contains(';'))) { + continue; + } + + size_t j = i + 1; + do { + line += lines.at(j); + } while (!lines.at(j++).Contains(';')); + + auto first = std::next(std::begin(lines), i + 1); + auto last = std::next(std::begin(lines), j); + lines.erase(first, last); + } + + for (auto& line : lines) { + CollapseInPlace(line); + SubstGlobal(line, " ;", ";"); + SubstGlobal(line, " :", ":"); + SubstGlobal(line, " )", ")"); + SubstGlobal(line, "( ", "("); + } + } + + void Purify(TVector<TString>& lines) { + const auto [first, last] = std::ranges::remove_if(lines, [](const TString& line) { + return (line.StartsWith("//") && !line.StartsWith(ReflectPrefix)) || line.empty(); + }); + lines.erase(first, last); + } + + THashMap<TStringBuf, TVector<TString>> GroupBySection(TVector<TString>&& lines) { + TVector<TStringBuf> sections = { + "", + SectionPunctuation, + SectionLetter, + SectionKeyword, + SectionOther, + }; + + size_t section = 0; + + THashMap<TStringBuf, TVector<TString>> groups; + for (auto& line : lines) { + if (line.StartsWith(SectionPrefix)) { + Y_ENSURE(sections.at(section + 1) == line); + section += 1; + continue; + } + + groups[sections.at(section)].emplace_back(std::move(line)); + } + + groups.erase(""); + groups.erase(SectionLetter); + + return groups; + } + + std::tuple<TString, TString> ParseLexerRule(TString&& line) { + size_t colonPos = line.find(':'); + size_t semiPos = line.rfind(';'); + + Y_ENSURE( + colonPos != TString::npos && + semiPos != TString::npos && + colonPos < semiPos); + + TString block = line.substr(colonPos + 2, semiPos - colonPos - 2); + SubstGlobal(block, "\\\\", "\\"); + + TString name = std::move(line); + name.resize(colonPos); + + return std::make_tuple(std::move(name), std::move(block)); + } + + void ParsePunctuationLine(TString&& line, TLexerGrammar& grammar) { + auto [name, block] = ParseLexerRule(std::move(line)); + block = block.erase(std::begin(block)); + block.pop_back(); + + SubstGlobal(block, "\\\'", "\'"); + + if (!name.StartsWith(FragmentPrefix)) { + grammar.PunctuationNames.emplace(name); + } + + SubstGlobal(name, FragmentPrefix, ""); + grammar.BlockByName.emplace(std::move(name), std::move(block)); + } + + void ParseKeywordLine(TString&& line, TLexerGrammar& grammar) { + auto [name, block] = ParseLexerRule(std::move(line)); + SubstGlobal(block, "'", ""); + SubstGlobal(block, " ", ""); + + Y_ENSURE(name == block || (name == "TSKIP" && block == "SKIP")); + grammar.KeywordNames.emplace(std::move(name)); + } + + void ParseOtherLine(TString&& line, TLexerGrammar& grammar) { + auto [name, block] = ParseLexerRule(std::move(line)); + + if (!name.StartsWith(FragmentPrefix)) { + grammar.OtherNames.emplace(name); + } + + SubstGlobal(name, FragmentPrefix, ""); + SubstGlobal(block, " -> channel(HIDDEN)", ""); + grammar.BlockByName.emplace(std::move(name), std::move(block)); + } + + TLexerGrammar LoadLexerGrammar() { + TVector<TString> lines = GetResourceLines("SQLv1Antlr4.g.in"); + Purify(lines); + Format(lines); + Purify(lines); + + THashMap<TStringBuf, TVector<TString>> sections; + sections = GroupBySection(std::move(lines)); + + TLexerGrammar grammar; + + for (auto& [section, lines] : sections) { + for (auto& line : lines) { + if (section == SectionPunctuation) { + ParsePunctuationLine(std::move(line), grammar); + } else if (section == SectionKeyword) { + ParseKeywordLine(std::move(line), grammar); + } else if (section == SectionOther) { + ParseOtherLine(std::move(line), grammar); + } else { + Y_ABORT("Unexpected section %s", section); + } + } + } + + return grammar; + } + +} // namespace NSQLReflect |