summaryrefslogtreecommitdiffstats
path: root/yql/essentials/sql/v1/reflect/sql_reflect.cpp
diff options
context:
space:
mode:
authorvityaman <[email protected]>2025-03-28 18:29:24 +0300
committerrobot-piglet <[email protected]>2025-03-28 18:50:04 +0300
commit60b99f11bcb2386c2a1c36ffd2e96e69d0105dac (patch)
tree08c15d732484c6accf16658b09ed8f07286a9338 /yql/essentials/sql/v1/reflect/sql_reflect.cpp
parent1e214be59cbf130bee433c422b42f16148e5acff (diff)
YQL-19616 Convert YQL lexer grammar to regexes
- [x] Parse YQL grammar to extract lexer grammar into `TLexerGrammar`.
- [x] Translate `TLexerGrammar` into regexes.
- [x] Implement a lexer via regexes `TRegexLexer` to test generated regexes validity.
- [x] Test on `Default` syntax mode.
- [x] Test on `ANSI` syntax mode.
---
- Related to https://github.com/ydb-platform/ydb/issues/15129
- Requirement for https://github.com/ytsaurus/ytsaurus/pull/1112
---
Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1127
commit_hash:03ffffe81cdafe7f93a4d3fd9a3212fe67f1c72d
Diffstat (limited to 'yql/essentials/sql/v1/reflect/sql_reflect.cpp')
-rw-r--r--yql/essentials/sql/v1/reflect/sql_reflect.cpp173
1 files changed, 173 insertions, 0 deletions
diff --git a/yql/essentials/sql/v1/reflect/sql_reflect.cpp b/yql/essentials/sql/v1/reflect/sql_reflect.cpp
new file mode 100644
index 00000000000..f47f35cb9de
--- /dev/null
+++ b/yql/essentials/sql/v1/reflect/sql_reflect.cpp
@@ -0,0 +1,173 @@
+#include "sql_reflect.h"
+
+#include <library/cpp/resource/resource.h>
+
+#include <util/string/split.h>
+#include <util/string/strip.h>
+
+namespace NSQLReflect {
+
+ // Comment lines starting with this marker are kept by Purify; other `//` comments are dropped.
+ const TStringBuf ReflectPrefix = "//!";
+
+ // A line starting with this prefix switches GroupBySection to the next expected section.
+ const TStringBuf SectionPrefix = "//! section:";
+
+ // Full marker lines, listed in the order GroupBySection requires them to appear.
+ const TStringBuf SectionPunctuation = "//! section:punctuation";
+ const TStringBuf SectionLetter = "//! section:letter";
+ const TStringBuf SectionKeyword = "//! section:keyword";
+ const TStringBuf SectionOther = "//! section:other";
+
+ // Rules whose name starts with this prefix are not registered in the name sets;
+ // the prefix is removed from the name before the rule body is stored.
+ const TStringBuf FragmentPrefix = "fragment ";
+
+ // Fetches the embedded resource registered under exactly `key` and returns
+ // the pieces produced by splitting its text on "\n".
+ // Fails through Y_ENSURE when NResource::FindExact reports that the key is absent.
+ TVector<TString> GetResourceLines(const TStringBuf key) {
+     TString text;
+     Y_ENSURE(NResource::FindExact(key, &text));
+
+     TVector<TString> pieces;
+     Split(text, "\n", pieces);
+     return pieces;
+ }
+
+ // Normalizes grammar lines in place so that every rule occupies a single line.
+ //
+ // Pass 1: each line is stripped. A line that is neither a `//` comment nor already
+ // contains both ':' and ';' absorbs the following lines (appended verbatim) up to and
+ // including the first one that contains ';'; the absorbed lines are then removed.
+ // Running past the end of `lines` while looking for ';' makes `at()` throw.
+ //
+ // Pass 2: whitespace is collapsed via CollapseInPlace and the blanks before
+ // ';', ':' and ')' as well as the blank after '(' are removed.
+ void Format(TVector<TString>& lines) {
+     for (size_t head = 0; head < lines.size(); ++head) {
+         TString& current = lines[head];
+         StripInPlace(current);
+
+         const bool isComment = current.StartsWith("//");
+         const bool isWholeRule = current.Contains(':') && current.Contains(';');
+         if (isComment || isWholeRule) {
+             continue;
+         }
+
+         // `tail` ends up one past the last line that was glued onto `current`.
+         size_t tail = head + 1;
+         for (;;) {
+             const TString& next = lines.at(tail);
+             ++tail;
+             current += next;
+             if (next.Contains(';')) {
+                 break;
+             }
+         }
+
+         const auto absorbedBegin = std::next(lines.begin(), head + 1);
+         const auto absorbedEnd = std::next(lines.begin(), tail);
+         lines.erase(absorbedBegin, absorbedEnd);
+     }
+
+     for (TString& rule : lines) {
+         CollapseInPlace(rule);
+         SubstGlobal(rule, " ;", ";");
+         SubstGlobal(rule, " :", ":");
+         SubstGlobal(rule, " )", ")");
+         SubstGlobal(rule, "( ", "(");
+     }
+ }
+
+ // Removes, in place, every empty line and every line that starts with "//"
+ // unless it also starts with ReflectPrefix.
+ void Purify(TVector<TString>& lines) {
+     const auto isNoise = [](const TString& candidate) {
+         if (candidate.empty()) {
+             return true;
+         }
+         if (!candidate.StartsWith("//")) {
+             return false;
+         }
+         return !candidate.StartsWith(ReflectPrefix);
+     };
+
+     lines.erase(std::remove_if(lines.begin(), lines.end(), isNoise), lines.end());
+ }
+
+ // Distributes `lines` into buckets keyed by the section marker they follow.
+ //
+ // Section marker lines must appear in the fixed order punctuation, letter, keyword,
+ // other; a marker that does not equal the next expected one fails Y_ENSURE, and a
+ // marker beyond the last expected one makes `at()` throw. Lines preceding the first
+ // marker and the lines of the letter section are discarded from the result.
+ THashMap<TStringBuf, TVector<TString>> GroupBySection(TVector<TString>&& lines) {
+     // Index 0 ("") is the bucket for everything that precedes the first marker.
+     const TVector<TStringBuf> sections = {
+         "",
+         SectionPunctuation,
+         SectionLetter,
+         SectionKeyword,
+         SectionOther,
+     };
+
+     THashMap<TStringBuf, TVector<TString>> groups;
+     size_t section = 0;
+
+     for (auto& line : lines) {
+         if (!line.StartsWith(SectionPrefix)) {
+             groups[sections.at(section)].emplace_back(std::move(line));
+             continue;
+         }
+
+         // A marker line: it has to be exactly the next section in the expected order.
+         Y_ENSURE(sections.at(section + 1) == line);
+         ++section;
+     }
+
+     groups.erase("");
+     groups.erase(SectionLetter);
+     return groups;
+ }
+
+ // Splits a single-line lexer rule of the form `NAME: BODY;` into (NAME, BODY).
+ //
+ // NAME is everything before the first ':'; BODY is the text between that colon and
+ // the last ';'. One blank directly after the colon, if present, is not part of BODY.
+ // Every `\\` in BODY is replaced with a single `\`.
+ // Fails through Y_ENSURE when ':' or ';' is missing or the last ';' precedes the first ':'.
+ std::tuple<TString, TString> ParseLexerRule(TString&& line) {
+     size_t colonPos = line.find(':');
+     size_t semiPos = line.rfind(';');
+
+     Y_ENSURE(
+         colonPos != TString::npos &&
+         semiPos != TString::npos &&
+         colonPos < semiPos);
+
+     // Skip the separator after the colon only when it is really there. An unconditional
+     // `colonPos + 2` would drop the first body character of `NAME:'x';` and would make
+     // the length computation wrap around for `NAME:;`.
+     const TString& source = line;
+     size_t blockPos = colonPos + 1;
+     if (blockPos < semiPos && (source[blockPos] == ' ' || source[blockPos] == '\t')) {
+         ++blockPos;
+     }
+
+     TString block = line.substr(blockPos, semiPos - blockPos);
+     SubstGlobal(block, "\\\\", "\\");
+
+     // Reuse the storage of `line` for the name: keep only the part before the colon.
+     TString name = std::move(line);
+     name.resize(colonPos);
+
+     return std::make_tuple(std::move(name), std::move(block));
+ }
+
+ // Parses one rule of the punctuation section and records it in `grammar`.
+ //
+ // The first and last characters of the rule body are dropped (presumably the quotes
+ // of a literal - the code does not check which characters they are), and every `\'`
+ // is replaced with `'`. The name is added to `PunctuationNames` unless it starts with
+ // FragmentPrefix; the body is stored in `BlockByName` under the name with
+ // FragmentPrefix removed.
+ // Fails through Y_ENSURE when the body is shorter than two characters.
+ void ParsePunctuationLine(TString&& line, TLexerGrammar& grammar) {
+     auto [name, block] = ParseLexerRule(std::move(line));
+
+     // Erasing the first character and popping the last one is only defined
+     // when both of them exist.
+     Y_ENSURE(block.size() >= 2, "Punctuation rule body is too short: " << block);
+     block.erase(0, 1);
+     block.pop_back();
+
+     SubstGlobal(block, "\\\'", "\'");
+
+     if (!name.StartsWith(FragmentPrefix)) {
+         grammar.PunctuationNames.emplace(name);
+     }
+
+     SubstGlobal(name, FragmentPrefix, "");
+     grammar.BlockByName.emplace(std::move(name), std::move(block));
+ }
+
+ // Parses one rule of the keyword section and records its name in `grammar.KeywordNames`.
+ //
+ // After single quotes and spaces are removed from the rule body, the body must equal
+ // the rule name; the only accepted exception is the name "TSKIP" with the body "SKIP".
+ // Any other mismatch fails Y_ENSURE.
+ void ParseKeywordLine(TString&& line, TLexerGrammar& grammar) {
+     auto [name, block] = ParseLexerRule(std::move(line));
+
+     // Both removals target single characters, so their relative order does not matter.
+     SubstGlobal(block, " ", "");
+     SubstGlobal(block, "'", "");
+
+     Y_ENSURE(name == block || (name == "TSKIP" && block == "SKIP"));
+
+     grammar.KeywordNames.emplace(std::move(name));
+ }
+
+ // Parses one rule of the "other" section and records it in `grammar`.
+ //
+ // The name is added to `OtherNames` unless it starts with FragmentPrefix. The body,
+ // with every " -> channel(HIDDEN)" removed, is stored in `BlockByName` under the
+ // name with FragmentPrefix removed.
+ void ParseOtherLine(TString&& line, TLexerGrammar& grammar) {
+     auto [name, block] = ParseLexerRule(std::move(line));
+     SubstGlobal(block, " -> channel(HIDDEN)", "");
+
+     const bool isFragment = name.StartsWith(FragmentPrefix);
+     if (!isFragment) {
+         // Registered before the prefix removal below, i.e. under the name as parsed.
+         grammar.OtherNames.emplace(name);
+     }
+
+     SubstGlobal(name, FragmentPrefix, "");
+     grammar.BlockByName.emplace(std::move(name), std::move(block));
+ }
+
+ // Builds a TLexerGrammar from the embedded resource "SQLv1Antlr4.g.in".
+ //
+ // The resource lines are purified, formatted into one rule per line, purified again
+ // (Format strips lines, so comments that were indented are only recognized on the
+ // second pass), grouped by section and handed to the parser of their section.
+ // Aborts the process if a group with an unknown section key is encountered.
+ TLexerGrammar LoadLexerGrammar() {
+     TVector<TString> lines = GetResourceLines("SQLv1Antlr4.g.in");
+     Purify(lines);
+     Format(lines);
+     Purify(lines);
+
+     THashMap<TStringBuf, TVector<TString>> sections = GroupBySection(std::move(lines));
+
+     TLexerGrammar grammar;
+
+     // The binding is called `rules` so it does not shadow the moved-from `lines` above.
+     for (auto& [section, rules] : sections) {
+         for (auto& rule : rules) {
+             if (section == SectionPunctuation) {
+                 ParsePunctuationLine(std::move(rule), grammar);
+             } else if (section == SectionKeyword) {
+                 ParseKeywordLine(std::move(rule), grammar);
+             } else if (section == SectionOther) {
+                 ParseOtherLine(std::move(rule), grammar);
+             } else {
+                 // `%s` needs a NUL-terminated `const char*`; a TStringBuf must not be
+                 // passed through printf-style varargs, so materialize a TString first.
+                 Y_ABORT("Unexpected section %s", TString(section).c_str());
+             }
+         }
+     }
+
+     return grammar;
+ }
+
+} // namespace NSQLReflect