summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorrobot-piglet <[email protected]>2025-05-12 13:53:24 +0300
committerrobot-piglet <[email protected]>2025-05-12 14:05:50 +0300
commit7a941ebd252fd7442b4d1d34d31d72e971ad20bf (patch)
tree70c132d1b611697ad23b90cf35215b035f247ec0
parentbf1279129bcf6c1b1001e39c39a13d80737898d3 (diff)
Intermediate changes
commit_hash:3a624a323006078de71f50747f7b2e8cadba7ccd
-rw-r--r--yql/essentials/sql/v1/highlight/README.md217
-rw-r--r--yql/essentials/sql/v1/highlight/sql_highlight.cpp313
-rw-r--r--yql/essentials/sql/v1/highlight/sql_highlight.h41
-rw-r--r--yql/essentials/sql/v1/highlight/sql_highlight_json.cpp67
-rw-r--r--yql/essentials/sql/v1/highlight/sql_highlight_json.h11
-rw-r--r--yql/essentials/sql/v1/highlight/sql_highlight_json_ut.cpp14
-rw-r--r--yql/essentials/sql/v1/highlight/sql_highlighter.cpp138
-rw-r--r--yql/essentials/sql/v1/highlight/sql_highlighter.h34
-rw-r--r--yql/essentials/sql/v1/highlight/sql_highlighter_ut.cpp106
-rw-r--r--yql/essentials/sql/v1/highlight/ut/suite.json95
-rw-r--r--yql/essentials/sql/v1/highlight/ut/ya.make8
-rw-r--r--yql/essentials/sql/v1/highlight/ya.make20
-rw-r--r--yql/essentials/sql/v1/lexer/lexer_ut.cpp7
-rw-r--r--yql/essentials/sql/v1/lexer/regex/generic.cpp127
-rw-r--r--yql/essentials/sql/v1/lexer/regex/generic.h56
-rw-r--r--yql/essentials/sql/v1/lexer/regex/lexer.cpp304
-rw-r--r--yql/essentials/sql/v1/lexer/regex/lexer.h4
-rw-r--r--yql/essentials/sql/v1/lexer/regex/regex.cpp16
-rw-r--r--yql/essentials/sql/v1/lexer/regex/regex_ut.cpp9
-rw-r--r--yql/essentials/sql/v1/lexer/regex/ya.make1
-rw-r--r--yql/essentials/sql/v1/ya.make1
-rw-r--r--yql/essentials/tools/yql_highlight/ya.make17
-rw-r--r--yql/essentials/tools/yql_highlight/yql_highlight1
-rw-r--r--yql/essentials/tools/yql_highlight/yql_highlight.cpp79
24 files changed, 1494 insertions, 192 deletions
diff --git a/yql/essentials/sql/v1/highlight/README.md b/yql/essentials/sql/v1/highlight/README.md
new file mode 100644
index 00000000000..214e4d41573
--- /dev/null
+++ b/yql/essentials/sql/v1/highlight/README.md
@@ -0,0 +1,217 @@
+# YQL SQL Syntax Highlighting Specification
+
+## Overview
+
+This document specifies the syntax highlighting system for the YQL. The system defines how to identify and categorize different syntactic elements in queries for highlighting purposes.
+
+## Terms
+
+- `Highlighting` is a _list_ of `Highlighting Unit`s that define how to recognize different parts of SQL syntax.
+
+- `Highlighting Unit` is a language construction to be highlighted (e.g., keywords, identifiers, literals).
+
+- `Highlighting Token` is a text fragment matched with a `Highlighting Unit`.
+
+- `Highlighter` is a function parameterized by a `Highlighting` transforming a text into a stream of `Highlighting Token`s.
+
+- `Theme` is a mapping from `Highlighting Unit` to a `Color`.
+
+## Highlighting Unit
+
+Here are examples of `Highlighting Unit`s. They will evolve and should be taken from the JSON `Highlighting` programmatically. Only comments are always present, as they may require special processing.
+
+- `keyword`: SQL reserved words (e.g., `SELECT`, `INSERT`, `FROM`).
+
+- `punctuation`: Syntactic symbols (e.g., `.`, `;`, `(`, `)`).
+
+- `identifier`: Unquoted names (e.g., table or column names).
+
+- `quoted-identifier`: Backtick-quoted names (e.g., ``` `table` ```).
+
+- `bind-parameter-identifier`: Parameter references (e.g., `$param`).
+
+- `type-identifier`: Type names (e.g., `Int32`, `String`).
+
+- `function-identifier`: Function names (e.g., `MIN`, `Math::Sin`).
+
+- `literal`: Numeric constants (e.g., `123`, `1.23`).
+
+- `string-literal`: Quoted strings (e.g., `"example"`).
+
+- `comment`: Single-line (`--`) or multi-line (`/* */`) comments.
+
+- `ws`: Spaces, tabs, newlines.
+
+- `error`: Unrecognized syntax.
+
+Each `Highlighting Unit` contains one or more `Patterns` that define how to recognize the unit in text.
+
+## Pattern Matching
+
+A `Pattern` consists of:
+
+- `body`: The main regex pattern to match.
+
+- `after`: A lookahead pattern.
+
+- `is-case-insensitive`: Whether matching should be case-insensitive.
+
+The matching behavior is equivalent to the regex: `body(?=after)`.
+
+## Highlighter Algorithm
+
+The highlighter algorithm can be described with the following pseudocode.
+
+```python
+# Consume matched tokens until empty.
+# For each iteration:
+# 1. Find the next token match (or error)
+# 2. Emit the token
+#   3. Continue with the remaining text
+highlight(text) =
+ if text is not empty do
+ token = match(text)
+ emit token
+ highlight(text[token.length:])
+
+# Select the longest match from all possible
+# patterns. Leftmost is chosen. If no match,
+# emits a 1-character error token as a
+# recovery.
+match(text) =
+ max of matches(text) by length
+ or error token with length = 1
+
+# For each highlighting unit and its patterns,
+# attempt to match.
+matches(text) = do
+ unit <- highlighting.units
+ pattern <- unit.patterns
+ content <- match(text, pattern)
+ yield token with unit, content
+
+# Match both the pattern body and lookahead
+# (after) portion with case sensitivity settings.
+match(text, pattern) = do
+ body <- (
+ regex pattern.body
+ matches text prefix
+        with pattern.case_sensitivity)
+    after <- (
+        regex pattern.after
+        matches text[body.length:] prefix
+        with pattern.case_sensitivity)
+ yield body + after
+
+# Special ANSI Comment handling.
+# Recursively process nested multiline comments.
+match(text, Comment if ANSI) =
+ if text not starts with "/*" do
+ return match(text, Comment if Default)
+
+ text = text after "/*"
+ loop do
+ if text starts with "*/" do
+            return text after "*/"
+
+ if text starts with "/*" do
+ budget = text before last "*/"
+ match = match(budget, Comment if ANSI)
+ text = text after match
+
+ if match:
+ continue
+
+ if text is empty:
+ return Nothing
+
+ text = text[1:]
+```
+
+## Highlighting JSON Example
+
+The highlighting can be generated using the `yql_highlight` tool in JSON format.
+
+```json
+{
+ "units": [
+ ...
+ {
+ "kind":"type-identifier",
+ "patterns": [
+ {
+ "body":"([a-z]|[A-Z]|_)([a-z]|[A-Z]|_|[0-9])*",
+ "after":"\\<"
+ },
+ {
+ "body":"Int32|Int16|Utf8|...",
+ "is-case-insensitive":true
+ }
+ ]
+ },
+ ...
+ ]
+}
+```
+
+## Test Suite
+
+The reference implementation includes a comprehensive test suite that verifies correct highlighting behavior. The test suite is defined in JSON format with the following structure:
+
+```json
+{
+ "SQL": [
+ ["SELECT id, alias from users", "KKKKKK#_#II#P#_#IIIII#_#KKKK#_#IIIII"],
+ ],
+ "TypeIdentifier": [
+ ["Bool(value)", "TTTT#P#IIIII#P"]
+ ]
+}
+```
+
+Where the first element is the SQL text to highlight and the second one is a string where each character represents the highlighting unit kind for each character in the input.
+
+Here's the table representation of the unit kind to character mapping:
+
+| Unit Kind | Character |
+| ------------------------- | --------- |
+| keyword | K |
+| punctuation | P |
+| identifier | I |
+| quoted-identifier | Q |
+| bind-parameter-identifier | B |
+| type-identifier | T |
+| function-identifier | F |
+| literal | L |
+| string-literal | S |
+| comment | C |
+| ws | _ |
+| error | E |
+
+Note: The `#` is used to make tokens visually distinct from each other.
+
+The test driver pseudocode:
+
+```cpp
+run_test_suite =
+ let
+ highlighting = load_highlighting()
+ highlighter = make_highlighter(highlighting)
+        suite = load_test_suite()
+ in do
+ scenario <- suite
+ test <- scenario
+ (input, expected) = test
+
+ tokens = highlighter.highlight(input)
+ actual = to_pattern(tokens)
+ assert actual == expected
+```
+
+## Implementation Guidelines
+
+- The module `yql/essentials/sql/v1/highlight` is a reference implementation of the `YQL` highlighting. The module includes a comprehensive test suite to check an implementation's compliance with the specification. Also this module contains this specification document.
+
+- The module `yql/essentials/tools/yql_highlight` contains a tool to play with the reference highlighting implementation and to generate various representations of the highlighting (e.g. in JSON).
+
+- The test suite data can be found at `yql/essentials/sql/v1/highlight/ut/suite.json`.
diff --git a/yql/essentials/sql/v1/highlight/sql_highlight.cpp b/yql/essentials/sql/v1/highlight/sql_highlight.cpp
new file mode 100644
index 00000000000..a477ba542f8
--- /dev/null
+++ b/yql/essentials/sql/v1/highlight/sql_highlight.cpp
@@ -0,0 +1,313 @@
+#include "sql_highlight.h"
+
+#include <yql/essentials/sql/v1/lexer/regex/regex.h>
+
+#include <contrib/libs/re2/re2/re2.h>
+
+#include <util/generic/algorithm.h>
+#include <util/generic/hash.h>
+#include <util/generic/hash_set.h>
+#include <util/string/builder.h>
+#include <util/string/join.h>
+
+namespace NSQLHighlight {
+
+ using NSQLTranslationV1::TRegexPattern;
+
+ TRegexPattern Merged(TVector<TRegexPattern> patterns) {
+ Y_ENSURE(!patterns.empty());
+
+ const TRegexPattern& sample = patterns.back();
+ Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) {
+ return std::tie(pattern.After, pattern.IsCaseInsensitive) ==
+ std::tie(sample.After, sample.IsCaseInsensitive);
+ }));
+
+ Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) {
+ return lhs.Body.length() > rhs.Body.length();
+ });
+
+ TStringBuilder body;
+ for (const auto& pattern : patterns) {
+ body << "(" << pattern.Body << ")|";
+ }
+ Y_ENSURE(body.back() == '|');
+ body.pop_back();
+
+ return TRegexPattern{
+ .Body = std::move(body),
+ .After = sample.After,
+ .IsCaseInsensitive = sample.IsCaseInsensitive,
+ };
+ }
+
+ struct Syntax {
+ const NSQLReflect::TLexerGrammar* Grammar;
+ THashMap<TString, TString> RegexesDefault;
+ THashMap<TString, TString> RegexesANSI;
+
+ TString Concat(const TVector<TStringBuf>& names) {
+ TString concat;
+ for (const auto& name : names) {
+ concat += Get(name);
+ }
+ return concat;
+ }
+
+ TString Get(const TStringBuf name, bool ansi = false) const {
+ if (Grammar->PunctuationNames.contains(name)) {
+ return RE2::QuoteMeta(Grammar->BlockByName.at(name));
+ }
+ if (ansi) {
+ return RegexesANSI.at(name);
+ }
+ return RegexesDefault.at(name);
+ }
+ };
+
+ NSQLTranslationV1::TRegexPattern CaseInsensitive(TStringBuf text) {
+ return {
+ .Body = TString(text),
+ .IsCaseInsensitive = true,
+ };
+ }
+
+ template <EUnitKind K>
+ TUnit MakeUnit(Syntax& syntax);
+
+ template <>
+ TUnit MakeUnit<EUnitKind::Keyword>(Syntax& s) {
+ using NSQLReflect::TLexerGrammar;
+
+ TUnit unit = {.Kind = EUnitKind::Keyword};
+ for (const auto& keyword : s.Grammar->KeywordNames) {
+ const TStringBuf content = TLexerGrammar::KeywordBlock(keyword);
+ unit.Patterns.push_back(CaseInsensitive(content));
+ }
+
+ unit.Patterns = {Merged(std::move(unit.Patterns))};
+ return unit;
+ }
+
+ template <>
+ TUnit MakeUnit<EUnitKind::Punctuation>(Syntax& s) {
+ TUnit unit = {.Kind = EUnitKind::Punctuation};
+ for (const auto& name : s.Grammar->PunctuationNames) {
+ const TString content = s.Get(name);
+ unit.Patterns.push_back({content});
+ }
+
+ unit.Patterns = {Merged(std::move(unit.Patterns))};
+ return unit;
+ }
+
+ template <>
+ TUnit MakeUnit<EUnitKind::QuotedIdentifier>(Syntax& s) {
+ return {
+ .Kind = EUnitKind::QuotedIdentifier,
+ .Patterns = {
+ {s.Get("ID_QUOTED")},
+ },
+ };
+ }
+
+ template <>
+ TUnit MakeUnit<EUnitKind::BindParamterIdentifier>(Syntax& s) {
+ return {
+ .Kind = EUnitKind::BindParamterIdentifier,
+ .Patterns = {
+ {s.Concat({"DOLLAR", "ID_PLAIN"})},
+ },
+ };
+ }
+
+ template <>
+ TUnit MakeUnit<EUnitKind::TypeIdentifier>(Syntax& s) {
+ return {
+ .Kind = EUnitKind::TypeIdentifier,
+ .Patterns = {
+ {s.Get("ID_PLAIN"), s.Get("LESS")},
+ {Merged({
+ CaseInsensitive("Decimal"),
+ CaseInsensitive("Bool"),
+ CaseInsensitive("Int8"),
+ CaseInsensitive("Int16"),
+ CaseInsensitive("Int32"),
+ CaseInsensitive("Int64"),
+ CaseInsensitive("Uint8"),
+ CaseInsensitive("Uint16"),
+ CaseInsensitive("Uint32"),
+ CaseInsensitive("Uint64"),
+ CaseInsensitive("Float"),
+ CaseInsensitive("Double"),
+ CaseInsensitive("DyNumber"),
+ CaseInsensitive("String"),
+ CaseInsensitive("Utf8"),
+ CaseInsensitive("Json"),
+ CaseInsensitive("JsonDocument"),
+ CaseInsensitive("Yson"),
+ CaseInsensitive("Uuid"),
+ CaseInsensitive("Date"),
+ CaseInsensitive("Datetime"),
+ CaseInsensitive("Timestamp"),
+ CaseInsensitive("Interval"),
+ CaseInsensitive("TzDate"),
+ CaseInsensitive("TzDateTime"),
+ CaseInsensitive("TzTimestamp"),
+ CaseInsensitive("Callable"),
+ CaseInsensitive("Resource"),
+ CaseInsensitive("Tagged"),
+ CaseInsensitive("Generic"),
+ CaseInsensitive("Unit"),
+ CaseInsensitive("Null"),
+ CaseInsensitive("Void"),
+ CaseInsensitive("EmptyList"),
+ CaseInsensitive("EmptyDict"),
+ })},
+ },
+ };
+ }
+
+ template <>
+ TUnit MakeUnit<EUnitKind::FunctionIdentifier>(Syntax& s) {
+ return {
+ .Kind = EUnitKind::FunctionIdentifier,
+ .Patterns = {
+ {s.Concat({"ID_PLAIN", "NAMESPACE", "ID_PLAIN"})},
+ {s.Get("ID_PLAIN"), s.Get("LPAREN")},
+ },
+ };
+ }
+
+ template <>
+ TUnit MakeUnit<EUnitKind::Identifier>(Syntax& s) {
+ return {
+ .Kind = EUnitKind::Identifier,
+ .Patterns = {
+ {s.Get("ID_PLAIN")},
+ },
+ };
+ }
+
+ template <>
+ TUnit MakeUnit<EUnitKind::Literal>(Syntax& s) {
+ return {
+ .Kind = EUnitKind::Literal,
+ .Patterns = {
+ {s.Get("DIGITS")},
+ {s.Get("INTEGER_VALUE")},
+ {s.Get("REAL")},
+ },
+ };
+ }
+
+ template <>
+ TUnit MakeUnit<EUnitKind::StringLiteral>(Syntax& s) {
+ return {
+ .Kind = EUnitKind::StringLiteral,
+ .Patterns = {{s.Get("STRING_VALUE")}},
+ .PatternsANSI = TVector<TRegexPattern>{
+ TRegexPattern{s.Get("STRING_VALUE", /* ansi = */ true)},
+ },
+ };
+ }
+
+ template <>
+ TUnit MakeUnit<EUnitKind::Comment>(Syntax& s) {
+ return {
+ .Kind = EUnitKind::Comment,
+ .Patterns = {{s.Get("COMMENT")}},
+ .PatternsANSI = Nothing(),
+ };
+ }
+
+ template <>
+ TUnit MakeUnit<EUnitKind::Whitespace>(Syntax& s) {
+ return {
+ .Kind = EUnitKind::Whitespace,
+ .Patterns = {
+ {s.Get("WS")},
+ },
+ };
+ }
+
+ Syntax MakeSyntax(const NSQLReflect::TLexerGrammar& grammar) {
+ using NSQLTranslationV1::MakeRegexByOtherName;
+
+ Syntax syntax;
+ syntax.Grammar = &grammar;
+ for (auto& [k, v] : MakeRegexByOtherName(*syntax.Grammar, /* ansi = */ false)) {
+ syntax.RegexesDefault.emplace(std::move(k), std::move(v));
+ }
+ for (auto& [k, v] : MakeRegexByOtherName(*syntax.Grammar, /* ansi = */ true)) {
+ syntax.RegexesANSI.emplace(std::move(k), std::move(v));
+ }
+ return syntax;
+ }
+
+ THighlighting MakeHighlighting() {
+ return MakeHighlighting(NSQLReflect::LoadLexerGrammar());
+ }
+
+ THighlighting MakeHighlighting(const NSQLReflect::TLexerGrammar& grammar) {
+ Syntax s = MakeSyntax(grammar);
+
+ THighlighting h;
+ h.Units.emplace_back(MakeUnit<EUnitKind::Keyword>(s));
+ h.Units.emplace_back(MakeUnit<EUnitKind::Punctuation>(s));
+ h.Units.emplace_back(MakeUnit<EUnitKind::QuotedIdentifier>(s));
+ h.Units.emplace_back(MakeUnit<EUnitKind::BindParamterIdentifier>(s));
+ h.Units.emplace_back(MakeUnit<EUnitKind::TypeIdentifier>(s));
+ h.Units.emplace_back(MakeUnit<EUnitKind::FunctionIdentifier>(s));
+ h.Units.emplace_back(MakeUnit<EUnitKind::Identifier>(s));
+ h.Units.emplace_back(MakeUnit<EUnitKind::Literal>(s));
+ h.Units.emplace_back(MakeUnit<EUnitKind::StringLiteral>(s));
+ h.Units.emplace_back(MakeUnit<EUnitKind::Comment>(s));
+ h.Units.emplace_back(MakeUnit<EUnitKind::Whitespace>(s));
+
+ return h;
+ }
+
+} // namespace NSQLHighlight
+
+template <>
+void Out<NSQLHighlight::EUnitKind>(IOutputStream& out, NSQLHighlight::EUnitKind kind) {
+ switch (kind) {
+ case NSQLHighlight::EUnitKind::Keyword:
+ out << "keyword";
+ break;
+ case NSQLHighlight::EUnitKind::Punctuation:
+ out << "punctuation";
+ break;
+ case NSQLHighlight::EUnitKind::QuotedIdentifier:
+ out << "quoted-identifier";
+ break;
+ case NSQLHighlight::EUnitKind::BindParamterIdentifier:
+ out << "bind-paramter-identifier";
+ break;
+ case NSQLHighlight::EUnitKind::TypeIdentifier:
+ out << "type-identifier";
+ break;
+ case NSQLHighlight::EUnitKind::FunctionIdentifier:
+ out << "function-identifier";
+ break;
+ case NSQLHighlight::EUnitKind::Identifier:
+ out << "identifier";
+ break;
+ case NSQLHighlight::EUnitKind::Literal:
+ out << "literal";
+ break;
+ case NSQLHighlight::EUnitKind::StringLiteral:
+ out << "string-literal";
+ break;
+ case NSQLHighlight::EUnitKind::Comment:
+ out << "comment";
+ break;
+ case NSQLHighlight::EUnitKind::Whitespace:
+ out << "ws";
+ break;
+ case NSQLHighlight::EUnitKind::Error:
+ out << "error";
+ break;
+ }
+}
diff --git a/yql/essentials/sql/v1/highlight/sql_highlight.h b/yql/essentials/sql/v1/highlight/sql_highlight.h
new file mode 100644
index 00000000000..f6ecc375836
--- /dev/null
+++ b/yql/essentials/sql/v1/highlight/sql_highlight.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <yql/essentials/sql/v1/lexer/regex/generic.h>
+#include <yql/essentials/sql/v1/reflect/sql_reflect.h>
+
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/generic/map.h>
+
+namespace NSQLHighlight {
+
+ enum class EUnitKind {
+ Keyword,
+ Punctuation,
+ QuotedIdentifier,
+ BindParamterIdentifier,
+ TypeIdentifier,
+ FunctionIdentifier,
+ Identifier,
+ Literal,
+ StringLiteral,
+ Comment,
+ Whitespace,
+ Error,
+ };
+
+ struct TUnit {
+ EUnitKind Kind;
+ TVector<NSQLTranslationV1::TRegexPattern> Patterns;
+ TMaybe<TVector<NSQLTranslationV1::TRegexPattern>> PatternsANSI;
+ };
+
+ struct THighlighting {
+ TVector<TUnit> Units;
+ };
+
+ THighlighting MakeHighlighting();
+
+ THighlighting MakeHighlighting(const NSQLReflect::TLexerGrammar& grammar);
+
+} // namespace NSQLHighlight
diff --git a/yql/essentials/sql/v1/highlight/sql_highlight_json.cpp b/yql/essentials/sql/v1/highlight/sql_highlight_json.cpp
new file mode 100644
index 00000000000..e4af680cdb9
--- /dev/null
+++ b/yql/essentials/sql/v1/highlight/sql_highlight_json.cpp
@@ -0,0 +1,67 @@
+#include "sql_highlight_json.h"
+
+#include <util/string/cast.h>
+
+namespace NSQLHighlight {
+
+ struct {
+ const char* Units = "units";
+ struct {
+ const char* Kind = "kind";
+ const char* Patterns = "patterns";
+ const char* PatternsANSI = "patterns-ansi";
+ } Unit;
+ struct {
+ const char* Body = "body";
+ const char* After = "after";
+ const char* IsCaseInsensitive = "is-case-insensitive";
+ } Pattern;
+ } JsonKey;
+
+ NJson::TJsonValue ToJson(const NSQLTranslationV1::TRegexPattern& pattern) {
+ NJson::TJsonMap map;
+ map[JsonKey.Pattern.Body] = pattern.Body;
+ if (!pattern.After.empty()) {
+ map[JsonKey.Pattern.After] = pattern.After;
+ }
+ if (pattern.IsCaseInsensitive) {
+ map[JsonKey.Pattern.IsCaseInsensitive] = pattern.IsCaseInsensitive;
+ }
+ return map;
+ }
+
+ NJson::TJsonValue ToJson(const TVector<NSQLTranslationV1::TRegexPattern>& patterns) {
+ NJson::TJsonArray array;
+ for (const auto& pattern : patterns) {
+ array.AppendValue(ToJson(pattern));
+ }
+ return array;
+ }
+
+ NJson::TJsonValue ToJson(const TUnit& unit) {
+ NJson::TJsonMap map;
+ map[JsonKey.Unit.Kind] = ToString(unit.Kind);
+ if (!unit.Patterns.empty()) {
+ map[JsonKey.Unit.Patterns] = ToJson(unit.Patterns);
+ }
+ if (!unit.PatternsANSI.Empty()) {
+ map[JsonKey.Unit.PatternsANSI] = ToJson(*unit.PatternsANSI);
+ }
+ return map;
+ }
+
+ NJson::TJsonValue ToJson(const TVector<TUnit>& units) {
+ NJson::TJsonArray array;
+ for (const auto& unit : units) {
+ array.AppendValue(ToJson(unit));
+ }
+ return array;
+ }
+
+ NJson::TJsonValue ToJson(const THighlighting& highlighting) {
+ NJson::TJsonMap map;
+ map[JsonKey.Units] = ToJson(highlighting.Units);
+ return map;
+ }
+
+} // namespace NSQLHighlight
diff --git a/yql/essentials/sql/v1/highlight/sql_highlight_json.h b/yql/essentials/sql/v1/highlight/sql_highlight_json.h
new file mode 100644
index 00000000000..96df7c15a78
--- /dev/null
+++ b/yql/essentials/sql/v1/highlight/sql_highlight_json.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "sql_highlight.h"
+
+#include <library/cpp/json/json_value.h>
+
+namespace NSQLHighlight {
+
+ NJson::TJsonValue ToJson(const THighlighting& highlighting);
+
+} // namespace NSQLHighlight
diff --git a/yql/essentials/sql/v1/highlight/sql_highlight_json_ut.cpp b/yql/essentials/sql/v1/highlight/sql_highlight_json_ut.cpp
new file mode 100644
index 00000000000..0329ad32320
--- /dev/null
+++ b/yql/essentials/sql/v1/highlight/sql_highlight_json_ut.cpp
@@ -0,0 +1,14 @@
+#include "sql_highlight_json.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+using namespace NSQLHighlight;
+
+Y_UNIT_TEST_SUITE(SqlHighlightJsonTests) {
+
+ Y_UNIT_TEST(Smoke) {
+ NJson::TJsonValue json = ToJson(MakeHighlighting());
+ UNIT_ASSERT(json.Has("units"));
+ }
+
+} // Y_UNIT_TEST_SUITE(SqlHighlightJsonTests)
diff --git a/yql/essentials/sql/v1/highlight/sql_highlighter.cpp b/yql/essentials/sql/v1/highlight/sql_highlighter.cpp
new file mode 100644
index 00000000000..23d17277e49
--- /dev/null
+++ b/yql/essentials/sql/v1/highlight/sql_highlighter.cpp
@@ -0,0 +1,138 @@
+#include "sql_highlighter.h"
+
+#include <yql/essentials/sql/v1/lexer/regex/lexer.h>
+
+#include <contrib/libs/re2/re2/re2.h>
+
+#include <util/generic/deque.h>
+#include <util/generic/maybe.h>
+
+namespace NSQLHighlight {
+
+ using NSQLTranslationV1::Compile;
+ using NSQLTranslationV1::IGenericLexer;
+ using NSQLTranslationV1::TGenericLexerGrammar;
+ using NSQLTranslationV1::TGenericToken;
+ using NSQLTranslationV1::TTokenRule;
+
+ THashMap<EUnitKind, TString> NamesByUnitKind = [] {
+ THashMap<EUnitKind, TString> names;
+ names[EUnitKind::Keyword] = "K";
+ names[EUnitKind::Punctuation] = "P";
+ names[EUnitKind::QuotedIdentifier] = "Q";
+ names[EUnitKind::BindParamterIdentifier] = "B";
+ names[EUnitKind::TypeIdentifier] = "T";
+ names[EUnitKind::FunctionIdentifier] = "F";
+ names[EUnitKind::Identifier] = "I";
+ names[EUnitKind::Literal] = "L";
+ names[EUnitKind::StringLiteral] = "S";
+ names[EUnitKind::Comment] = "C";
+ names[EUnitKind::Whitespace] = "W";
+ names[EUnitKind::Error] = TGenericToken::Error;
+ return names;
+ }();
+
+ THashMap<TString, EUnitKind> UnitKindsByName = [] {
+ THashMap<TString, EUnitKind> kinds;
+ for (const auto& [kind, name] : NamesByUnitKind) {
+ Y_ENSURE(!kinds.contains(name));
+ kinds[name] = kind;
+ }
+ return kinds;
+ }();
+
+ TGenericLexerGrammar ToGenericLexerGrammar(const THighlighting& highlighting, bool ansi) {
+ using NSQLTranslationV1::ANSICommentMatcher;
+
+ TGenericLexerGrammar grammar;
+ for (const auto& unit : highlighting.Units) {
+ const auto* patterns = &unit.Patterns;
+ if (!unit.PatternsANSI.Empty() && ansi) {
+ patterns = unit.PatternsANSI.Get();
+ }
+
+ if (unit.Kind == EUnitKind::Comment && ansi) {
+ Y_ENSURE(unit.Patterns.size() == 1);
+ const auto& pattern = unit.Patterns[0];
+ grammar.emplace_back(TTokenRule{
+ .TokenName = NamesByUnitKind.at(unit.Kind),
+ .Match = ANSICommentMatcher(Compile(pattern)),
+ });
+ }
+
+ for (const auto& pattern : *patterns) {
+ grammar.emplace_back(TTokenRule{
+ .TokenName = NamesByUnitKind.at(unit.Kind),
+ .Match = Compile(pattern),
+ });
+ }
+ }
+ return grammar;
+ }
+
+ class THighlighter: public IHighlighter {
+ public:
+ explicit THighlighter(NSQLTranslationV1::IGenericLexer::TPtr lexer)
+ : Lexer_(std::move(lexer))
+ {
+ }
+
+ bool Tokenize(TStringBuf text, const TTokenCallback& onNext, size_t maxErrors) const override {
+ const auto onNextToken = [&](NSQLTranslationV1::TGenericToken&& token) {
+ if (token.Name == "EOF") {
+ return;
+ }
+
+ onNext({
+ .Kind = UnitKindsByName.at(token.Name),
+ .Begin = token.Begin,
+ .Length = token.Content.size(),
+ });
+ };
+
+ return Lexer_->Tokenize(text, onNextToken, maxErrors);
+ }
+
+ private:
+ NSQLTranslationV1::IGenericLexer::TPtr Lexer_;
+ };
+
+ class TCombinedHighlighter: public IHighlighter {
+ public:
+ explicit TCombinedHighlighter(const THighlighting& highlighting)
+ : LexerDefault_(NSQLTranslationV1::MakeGenericLexer(
+ ToGenericLexerGrammar(highlighting, /* ansi = */ false)))
+ , LexerANSI_(NSQLTranslationV1::MakeGenericLexer(
+ ToGenericLexerGrammar(highlighting, /* ansi = */ true)))
+ {
+ }
+
+ bool Tokenize(TStringBuf text, const TTokenCallback& onNext, size_t maxErrors) const override {
+ return Alt(text).Tokenize(text, onNext, maxErrors);
+ }
+
+ private:
+ const IHighlighter& Alt(TStringBuf text) const {
+ if (text.After('-').StartsWith("-!ansi_lexer")) {
+ return LexerANSI_;
+ }
+ return LexerDefault_;
+ }
+
+ THighlighter LexerDefault_;
+ THighlighter LexerANSI_;
+ };
+
+ TVector<TToken> Tokenize(IHighlighter& highlighter, TStringBuf text) {
+ TVector<TToken> tokens;
+ highlighter.Tokenize(text, [&](TToken&& token) {
+ tokens.emplace_back(std::move(token));
+ });
+ return tokens;
+ }
+
+ IHighlighter::TPtr MakeHighlighter(const THighlighting& highlighting) {
+ return IHighlighter::TPtr(new TCombinedHighlighter(highlighting));
+ }
+
+} // namespace NSQLHighlight
diff --git a/yql/essentials/sql/v1/highlight/sql_highlighter.h b/yql/essentials/sql/v1/highlight/sql_highlighter.h
new file mode 100644
index 00000000000..39b5e93242d
--- /dev/null
+++ b/yql/essentials/sql/v1/highlight/sql_highlighter.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include "sql_highlight.h"
+
+#include <util/generic/ptr.h>
+#include <util/generic/ylimits.h>
+
+#include <functional>
+
+namespace NSQLHighlight {
+
+ struct TToken {
+ EUnitKind Kind;
+ size_t Begin; // In bytes
+ size_t Length; // In bytes
+ };
+
+ class IHighlighter: public TThrRefBase {
+ public:
+ using TPtr = TIntrusivePtr<IHighlighter>;
+ using TTokenCallback = std::function<void(TToken&& token)>;
+
+ virtual ~IHighlighter() = default;
+ virtual bool Tokenize(
+ TStringBuf text,
+ const TTokenCallback& onNext,
+ size_t maxErrors = Max<size_t>()) const = 0;
+ };
+
+ TVector<TToken> Tokenize(IHighlighter& highlighter, TStringBuf text);
+
+ IHighlighter::TPtr MakeHighlighter(const THighlighting& highlighting);
+
+} // namespace NSQLHighlight
diff --git a/yql/essentials/sql/v1/highlight/sql_highlighter_ut.cpp b/yql/essentials/sql/v1/highlight/sql_highlighter_ut.cpp
new file mode 100644
index 00000000000..5fcab7937fc
--- /dev/null
+++ b/yql/essentials/sql/v1/highlight/sql_highlighter_ut.cpp
@@ -0,0 +1,106 @@
+#include "sql_highlighter.h"
+
+#include <library/cpp/json/json_reader.h>
+#include <library/cpp/resource/resource.h>
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/generic/string.h>
+#include <util/string/join.h>
+#include <util/string/escape.h>
+
+using namespace NSQLHighlight;
+
+struct TTest {
+ struct TCase {
+ TString Input;
+ TString Expected;
+ };
+
+ TString Name;
+ TVector<TCase> Cases;
+};
+
+TVector<TTest> LoadTestSuite() {
+ TString text;
+ Y_ENSURE(NResource::FindExact("suite.json", &text));
+ auto json = NJson::ReadJsonFastTree(text).GetMapSafe();
+
+ TVector<TTest> tests;
+ for (auto& [k, v] : json) {
+ TVector<TTest::TCase> cases;
+ for (auto& c : v.GetArraySafe()) {
+ cases.emplace_back(
+ std::move(c[0].GetStringSafe()),
+ std::move(c[1].GetStringSafe()));
+ }
+ tests.emplace_back(std::move(k), std::move(cases));
+ }
+ return tests;
+}
+
+char ToChar(EUnitKind kind) {
+ switch (kind) {
+ case EUnitKind::Keyword:
+ return 'K';
+ case EUnitKind::Punctuation:
+ return 'P';
+ case EUnitKind::Identifier:
+ return 'I';
+ case EUnitKind::QuotedIdentifier:
+ return 'Q';
+ case EUnitKind::BindParamterIdentifier:
+ return 'B';
+ case EUnitKind::TypeIdentifier:
+ return 'T';
+ case EUnitKind::FunctionIdentifier:
+ return 'F';
+ case EUnitKind::Literal:
+ return 'L';
+ case EUnitKind::StringLiteral:
+ return 'S';
+ case EUnitKind::Comment:
+ return 'C';
+ case EUnitKind::Whitespace:
+ return '_';
+ case EUnitKind::Error:
+ return 'E';
+ }
+}
+
+TString ToMask(const TVector<TToken>& tokens) {
+ TVector<TString> s;
+ for (const auto& t : tokens) {
+ s.emplace_back(TString(t.Length, ToChar(t.Kind)));
+ }
+ return JoinSeq("#", s);
+}
+
+TString Mask(IHighlighter::TPtr& h, TStringBuf text) {
+ return ToMask(Tokenize(*h, text));
+}
+
+Y_UNIT_TEST_SUITE(SqlHighlighterTests) {
+
+ Y_UNIT_TEST(Suite) {
+ auto h = MakeHighlighter(MakeHighlighting());
+ size_t count = 0;
+ Cerr << "{" << Endl;
+ for (const auto& test : LoadTestSuite()) {
+ Cerr << " \"" << test.Name << "\": [" << Endl;
+ for (size_t i = 0; i < test.Cases.size(); ++i) {
+ const auto& check = test.Cases[i];
+ const auto actual = Mask(h, check.Input);
+ Cerr << " [\"" << EscapeC(check.Input) << "\", \"" << actual << "\"]," << Endl;
+ UNIT_ASSERT_VALUES_EQUAL_C(
+ actual,
+ check.Expected,
+ test.Name << " #" << i << ": Input = '" << check.Input << "'");
+ count += 1;
+ }
+ Cerr << " ]," << Endl;
+ }
+ Cerr << "}" << Endl;
+ Cerr << "Test Cases Executed: " << count << Endl;
+ }
+
+} // Y_UNIT_TEST_SUITE(SqlHighlighterTests)
diff --git a/yql/essentials/sql/v1/highlight/ut/suite.json b/yql/essentials/sql/v1/highlight/ut/suite.json
new file mode 100644
index 00000000000..04e961a6f91
--- /dev/null
+++ b/yql/essentials/sql/v1/highlight/ut/suite.json
@@ -0,0 +1,95 @@
+{
+ "Invalid": [
+ ["!", "E"],
+ ["й", "E#E"],
+ ["编", "E#E#E"],
+ ["\uD83D\uDE00", "E#E#E#E"],
+ ["!select", "E#KKKKKK"],
+ ["!SSelect", "E#IIIIIII"]
+ ],
+ "Operation": [
+ ["(1 + 21 / 4)", "P#L#_#P#_#LL#_#P#_#L#P"],
+ ["(1+21/4)", "P#L#P#LL#P#L#P"]
+ ],
+ "ANSI": [
+ ["--_ansi_lexer\n\n/*/**/*/", "CCCCCCCCCCCCCC#_#CCCCCC#P#P"],
+ ["--!ansi_lexer\n\n/*/**/*/", "CCCCCCCCCCCCCC#_#CCCCCCCC"],
+ ["--_ansi_lexer\n\n\"\\\"\"", "CCCCCCCCCCCCCC#_#SSSS"],
+ ["--!ansi_lexer\n\n\"\\\"\"", "CCCCCCCCCCCCCC#_#SSS#E"],
+ ["\n --!ansi_lexer\n\n/*/**/*/", "_#_#CCCCCCCCCCCCCC#_#CCCCCCCC"]
+ ],
+ "Number": [
+ ["1234", "LLLL"],
+ ["-123", "P#LLL"],
+ ["SELECT 123l AS `Int64`, 0b01u AS `Uint32`, 0xFFul AS `Uint64`, 0o7ut AS `Uint8`, 456s AS `Int16`, 1.2345f AS `Float`;", "KKKKKK#_#LLLL#_#KK#_#QQQQQQQ#P#_#LLLLL#_#KK#_#QQQQQQQQ#P#_#LLLLLL#_#KK#_#QQQQQQQQ#P#_#LLLLL#_#KK#_#QQQQQQQ#P#_#LLLL#_#KK#_#QQQQQQQ#P#_#LLLLLLL#_#KK#_#QQQQQQQ#P"]
+ ],
+ "Comment": [
+ ["- select", "P#_#KKKKKK"],
+ ["select -- select", "KKKKKK#_#CCCCCCCCC"],
+ ["-- select\nselect", "CCCCCCCCCC#KKKKKK"],
+ ["/* select */", "CCCCCCCCCCCC"],
+ ["select /* select */ select", "KKKKKK#_#CCCCCCCCCCCC#_#KKKKKK"],
+ ["/**/ --", "CCCC#_#CC"],
+ ["/*/**/*/", "CCCCCC#P#P"]
+ ],
+ "FunctionIdentifier": [
+ ["MIN", "III"],
+ ["min", "III"],
+ ["MIN(123, 65)", "FFF#P#LLL#P#_#LL#P"],
+ ["minimum", "IIIIIII"],
+ ["MINimum", "IIIIIII"],
+ ["Math::Sin", "FFFFFFFFF"],
+ ["Math", "IIII"],
+ ["Math::", "IIII#PP"],
+ ["::Sin", "PP#III"]
+ ],
+ "SQL": [
+ ["SELECT id, alias from users", "KKKKKK#_#II#P#_#IIIII#_#KKKK#_#IIIII"],
+ ["INSERT INTO users (id, alias) VALUES (12, \"tester\")", "KKKKKK#_#KKKK#_#IIIII#_#P#II#P#_#IIIII#P#_#KKKKKK#_#P#LL#P#_#SSSSSSSS#P"],
+ ["SELECT 123467, \"HeLLo, {name}!\", (1 + (5 * 1 / 0)), MIN(identifier) FROM `local/test/space/table` JOIN test;", "KKKKKK#_#LLLLLL#P#_#SSSSSSSSSSSSSSSS#P#_#P#L#_#P#_#P#L#_#P#_#L#_#P#_#L#P#P#P#_#FFF#P#IIIIIIIIII#P#_#KKKK#_#QQQQQQQQQQQQQQQQQQQQQQQQ#_#KKKK#_#IIII#P"],
+ ["SELECT Bool(phone) FROM customer", "KKKKKK#_#TTTT#P#IIIII#P#_#KKKK#_#IIIIIIII"]
+ ],
+ "TypeIdentifier": [
+ ["Bool", "TTTT"],
+ ["Bool(value)", "TTTT#P#IIIII#P"]
+ ],
+ "Identifier": [
+ ["test", "IIII"]
+ ],
+ "Keyword": [
+ ["SELECT", "KKKKKK"],
+ ["select", "KKKKKK"],
+ ["ALTER", "KKKKK"],
+ ["GROUP BY", "KKKKK#_#KK"],
+ ["INSERT", "KKKKKK"]
+ ],
+ "String": [
+ ["\"\"", "SS"],
+ ["\"test\"", "SSSSSS"],
+ ["\"", "E"],
+ ["\"\"\"", "SS#E"],
+ ["\"\\\"", "E#E#E"],
+ ["\"test select from", "E#IIII#_#KKKKKK#_#KKKK"],
+ ["\"\\\"\"", "SSSS"],
+ ["\"select\"select", "SSSSSSSSS#IIIII"],
+ ["\"select\"group", "SSSSSSSS#KKKKK"],
+ ["SELECT \"\uD83D\uDE00\" FROM test", "KKKKKK#_#SSSSSS#_#KKKK#_#IIII"]
+ ],
+ "Blank": [
+ ["", ""],
+ [" ", "_"],
+ [" ", "_#_#_"],
+ ["\n", "_"],
+ ["\n\n", "_#_"],
+ ["\r\n", "_#_"],
+ ["\r", "_"],
+ ["\r\n\n", "_#_#_"],
+ ["\r\n\r\n", "_#_#_#_"]
+ ],
+ "QuotedIdentifier": [
+ ["`/cluster/database`", "QQQQQQQQQQQQQQQQQQQ"],
+ ["`test`select", "QQQQQQ#KKKKKK"],
+ ["`/cluster", "E#P#IIIIIII"],
+ ["`\uD83D\uDE00`", "QQQQQQ"]
+ ]
+}
diff --git a/yql/essentials/sql/v1/highlight/ut/ya.make b/yql/essentials/sql/v1/highlight/ut/ya.make
new file mode 100644
index 00000000000..81ddde8b8b0
--- /dev/null
+++ b/yql/essentials/sql/v1/highlight/ut/ya.make
@@ -0,0 +1,8 @@
+UNITTEST_FOR(yql/essentials/sql/v1/highlight)
+
+SRCS(
+ sql_highlight_json_ut.cpp
+ sql_highlighter_ut.cpp
+)
+
+END()
diff --git a/yql/essentials/sql/v1/highlight/ya.make b/yql/essentials/sql/v1/highlight/ya.make
new file mode 100644
index 00000000000..a5e4ef7d6b6
--- /dev/null
+++ b/yql/essentials/sql/v1/highlight/ya.make
@@ -0,0 +1,20 @@
+LIBRARY()
+
+SRCS(
+ sql_highlight_json.cpp
+ sql_highlight.cpp
+ sql_highlighter.cpp
+)
+
+PEERDIR(
+ yql/essentials/sql/v1/lexer/regex
+ yql/essentials/sql/v1/reflect
+)
+
+RESOURCE(yql/essentials/sql/v1/highlight/ut/suite.json suite.json)
+
+END()
+
+RECURSE_FOR_TESTS(
+ ut
+)
diff --git a/yql/essentials/sql/v1/lexer/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/lexer_ut.cpp
index c59089efd63..196ca68a8f1 100644
--- a/yql/essentials/sql/v1/lexer/lexer_ut.cpp
+++ b/yql/essentials/sql/v1/lexer/lexer_ut.cpp
@@ -308,6 +308,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
UNIT_ASSERT_TOKENIZED(lexer, "FROM", "FROM EOF");
UNIT_ASSERT_TOKENIZED(lexer, "from", "FROM(from) EOF");
UNIT_ASSERT_TOKENIZED(lexer, " UPSERT ", "WS( ) UPSERT WS( ) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "ERROR", "ERROR EOF");
}
Y_UNIT_TEST_ON_EACH_LEXER(KeywordSkip) {
@@ -356,6 +357,12 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) {
UNIT_ASSERT_TOKENIZED(lexer, "123.45E10", "REAL(123.45E10) EOF");
UNIT_ASSERT_TOKENIZED(lexer, "123.45E+10", "REAL(123.45E+10) EOF");
UNIT_ASSERT_TOKENIZED(lexer, "1E+10", "REAL(1E+10) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "123l", "INTEGER_VALUE(123l) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "0b01u", "INTEGER_VALUE(0b01u) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "0xfful", "INTEGER_VALUE(0xfful) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "0o7ut", "INTEGER_VALUE(0o7ut) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "456s", "INTEGER_VALUE(456s) EOF");
+ UNIT_ASSERT_TOKENIZED(lexer, "1.2345f", "REAL(1.2345f) EOF");
}
Y_UNIT_TEST_ON_EACH_LEXER(SingleLineString) {
diff --git a/yql/essentials/sql/v1/lexer/regex/generic.cpp b/yql/essentials/sql/v1/lexer/regex/generic.cpp
new file mode 100644
index 00000000000..2a451b4ef5c
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/generic.cpp
@@ -0,0 +1,127 @@
+#include "generic.h"
+
+#include <contrib/libs/re2/re2/re2.h>
+
+namespace NSQLTranslationV1 {
+
+ namespace {
+
+ TMaybe<TStringBuf> Match(TStringBuf prefix, const RE2& regex) {
+ re2::StringPiece input(prefix.data(), prefix.size());
+ if (RE2::Consume(&input, regex)) {
+ return TStringBuf(prefix.data(), input.data());
+ }
+ return Nothing();
+ }
+
+ } // namespace
+
+ class TGenericLexer: public IGenericLexer {
+ private:
+ static constexpr TStringBuf Utf8BOM = "\xEF\xBB\xBF";
+
+ public:
+ explicit TGenericLexer(TGenericLexerGrammar grammar)
+ : Grammar_(std::move(grammar))
+ {
+ }
+
+ virtual bool Tokenize(
+ TStringBuf text,
+ const TTokenCallback& onNext,
+ size_t maxErrors) const override {
+ Y_ENSURE(0 < maxErrors);
+ size_t errors = 0;
+
+ size_t pos = 0;
+ if (text.StartsWith(Utf8BOM)) {
+ pos += Utf8BOM.size();
+ }
+
+ while (pos < text.size() && errors < maxErrors) {
+ TGenericToken matched = Match(TStringBuf(text, pos));
+ matched.Begin = pos;
+
+ pos += matched.Content.size();
+
+ if (matched.Name == TGenericToken::Error) {
+ errors += 1;
+ }
+
+ onNext(std::move(matched));
+ }
+
+ if (errors == maxErrors) {
+ return false;
+ }
+
+ onNext(TGenericToken{
+ .Name = "EOF",
+ .Content = "<EOF>",
+ .Begin = pos,
+ });
+
+ return errors == 0;
+ }
+
+ private:
+ TGenericToken Match(TStringBuf prefix) const {
+ TMaybe<TGenericToken> max;
+ Match(prefix, [&](TGenericToken&& token) {
+ if (max.Empty() || max->Content.size() < token.Content.size()) {
+ max = std::move(token);
+ }
+ });
+
+ if (max) {
+ return *max;
+ }
+
+ return {
+ .Name = TGenericToken::Error,
+ .Content = prefix.substr(0, 1),
+ };
+ }
+
+ void Match(TStringBuf prefix, auto onMatch) const {
+ for (const auto& token : Grammar_) {
+ if (auto content = token.Match(prefix)) {
+ onMatch(TGenericToken{
+ .Name = token.TokenName,
+ .Content = *content,
+ });
+ }
+ }
+ }
+
+ TGenericLexerGrammar Grammar_;
+ };
+
+ TTokenMatcher Compile(const TRegexPattern& regex) {
+ RE2::Options options;
+ options.set_case_sensitive(!regex.IsCaseInsensitive);
+
+ return [bodyRe = MakeAtomicShared<RE2>(regex.Body, options),
+ afterRe = MakeAtomicShared<RE2>(regex.After, options)](TStringBuf prefix) -> TMaybe<TStringBuf> {
+ TMaybe<TStringBuf> body, after;
+ if ((body = Match(prefix, *bodyRe)) &&
+ (after = Match(prefix.Tail(body->size()), *afterRe))) {
+ return body;
+ }
+ return Nothing();
+ };
+ }
+
+ IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) {
+ return IGenericLexer::TPtr(new TGenericLexer(std::move(grammar)));
+ }
+
+ TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text) {
+ TVector<TGenericToken> tokens;
+ lexer->Tokenize(text, [&](TGenericToken&& token) {
+ tokens.emplace_back(std::move(token));
+ });
+ return tokens;
+ }
+
+} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/generic.h b/yql/essentials/sql/v1/lexer/regex/generic.h
new file mode 100644
index 00000000000..cde028cc599
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/regex/generic.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <util/generic/ptr.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/generic/maybe.h>
+#include <util/generic/ylimits.h>
+
+#include <functional>
+
+namespace NSQLTranslationV1 {
+
+ struct TGenericToken {
+ static constexpr const char* Error = "<ERROR>";
+
+ TStringBuf Name;
+ TStringBuf Content;
+ size_t Begin = 0; // In bytes
+ };
+
+ class IGenericLexer: public TThrRefBase {
+ public:
+ using TPtr = TIntrusivePtr<IGenericLexer>;
+ using TTokenCallback = std::function<void(TGenericToken&& token)>;
+
+ static constexpr size_t MaxErrorsLimit = Max<size_t>();
+
+ virtual ~IGenericLexer() = default;
+ virtual bool Tokenize(
+ TStringBuf text,
+ const TTokenCallback& onNext,
+ size_t maxErrors = IGenericLexer::MaxErrorsLimit) const = 0;
+ };
+
+ using TTokenMatcher = std::function<TMaybe<TStringBuf>(TStringBuf prefix)>;
+
+ struct TTokenRule {
+ TString TokenName;
+ TTokenMatcher Match;
+ };
+
+ using TGenericLexerGrammar = TVector<TTokenRule>;
+
+ struct TRegexPattern {
+ TString Body;
+ TString After = "";
+ bool IsCaseInsensitive = false;
+ };
+
+ TTokenMatcher Compile(const TRegexPattern& regex);
+
+ IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar);
+
+ TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text);
+
+} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
index a1d96253bf7..58c98edfd31 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp
@@ -1,5 +1,6 @@
#include "lexer.h"
+#include "generic.h"
#include "regex.h"
#include <contrib/libs/re2/re2/re2.h>
@@ -9,256 +10,177 @@
#include <util/generic/algorithm.h>
#include <util/generic/string.h>
+#include <util/generic/maybe.h>
#include <util/string/subst.h>
#include <util/string/ascii.h>
namespace NSQLTranslationV1 {
+ using NSQLReflect::TLexerGrammar;
using NSQLTranslation::TParsedToken;
using NSQLTranslation::TParsedTokenList;
- class TRegexLexer: public NSQLTranslation::ILexer {
- static constexpr const char* CommentTokenName = "COMMENT";
- static constexpr const char* StringValueName = "STRING_VALUE";
-
- static constexpr const TStringBuf Utf8BOM = "\xEF\xBB\xBF";
-
- public:
- TRegexLexer(
- bool ansi,
- NSQLReflect::TLexerGrammar grammar,
- const TVector<std::tuple<TString, TString>>& RegexByOtherName)
- : Grammar_(std::move(grammar))
- , Ansi_(ansi)
- {
- for (const auto& [token, regex] : RegexByOtherName) {
- RE2::Options custom;
- if (token != CommentTokenName && token != StringValueName) {
- custom.set_longest_match(true);
- }
+ size_t MatchANSIMultilineComment(TStringBuf remaining);
- RE2* re2 = new RE2(regex, custom);
- if (token == CommentTokenName) {
- CommentRegex_.Reset(re2);
- } else {
- OtherRegexes_.emplace_back(token, re2);
- }
+ TTokenMatcher ANSICommentMatcher(TTokenMatcher defaultComment) {
+ return [defaultComment](TStringBuf prefix) -> TMaybe<TStringBuf> {
+ const auto basic = defaultComment(prefix);
+ if (basic.Empty()) {
+ return Nothing();
}
- }
-
- bool Tokenize(
- const TString& query,
- const TString& queryName,
- const TTokenCallback& onNextToken,
- NYql::TIssues& issues,
- size_t maxErrors) override {
- size_t errors = 0;
- size_t pos = 0;
- if (query.StartsWith(Utf8BOM)) {
- pos += Utf8BOM.size();
+ if (!prefix.StartsWith("/*")) {
+ return basic;
}
- while (pos < query.size()) {
- TParsedToken matched = Match(TStringBuf(query, pos));
-
- if (matched.Name.empty() && maxErrors == errors) {
- break;
- }
-
- if (matched.Name.empty()) {
- pos += 1;
- errors += 1;
- issues.AddIssue(NYql::TPosition(pos, 0, queryName), "no candidates");
- continue;
- }
+ size_t ll1Length = MatchANSIMultilineComment(prefix);
+ TStringBuf ll1Content = prefix.SubString(0, ll1Length);
- pos += matched.Content.length();
- onNextToken(std::move(matched));
+ Y_ENSURE(ll1Content == 0 || basic <= ll1Content);
+ if (ll1Content == 0) {
+ return basic;
}
- onNextToken(TParsedToken{.Name = "EOF", .Content = "<EOF>"});
- return errors == 0;
+ return ll1Content;
+ };
+ }
+
+ size_t MatchANSIMultilineComment(TStringBuf prefix) {
+ if (!prefix.StartsWith("/*")) {
+ return 0;
}
- private:
- TParsedToken Match(const TStringBuf prefix) {
- TParsedTokenList matches;
+ size_t skipped = 0;
- size_t keywordCount = MatchKeyword(prefix, matches);
- MatchPunctuation(prefix, matches);
- MatchRegex(prefix, matches);
- MatchComment(prefix, matches);
+ prefix.Skip(2);
+ skipped += 2;
- if (matches.empty()) {
- return {};
+ for (;;) {
+ if (prefix.StartsWith("*/")) {
+ prefix.Skip(2);
+ skipped += 2;
+ return skipped;
}
- auto maxLength = MaxElementBy(matches, [](const TParsedToken& m) {
- return m.Content.length();
- })->Content.length();
-
- auto max = FindIf(matches, [&](const TParsedToken& m) {
- return m.Content.length() == maxLength;
- });
-
- auto isMatched = [&](const TStringBuf name) {
- return std::end(matches) != FindIf(matches, [&](const auto& m) {
- return m.Name == name;
- });
- };
-
- size_t conflicts = CountIf(matches, [&](const TParsedToken& m) {
- return m.Content.length() == max->Content.length();
- });
- conflicts -= 1;
- Y_ENSURE(
- conflicts == 0 ||
- (conflicts == 1 && keywordCount != 0 && isMatched("ID_PLAIN")) ||
- (conflicts == 1 && isMatched("DIGITS") && isMatched("INTEGER_VALUE")));
-
- Y_ENSURE(!max->Content.empty());
- return *max;
- }
-
- bool MatchKeyword(const TStringBuf prefix, TParsedTokenList& matches) {
- size_t count = 0;
- for (const auto& keyword : Grammar_.KeywordNames) {
- const TStringBuf block = NSQLReflect::TLexerGrammar::KeywordBlock(keyword);
- const TStringBuf content = prefix.substr(0, block.length());
- if (AsciiEqualsIgnoreCase(content, block)) {
- matches.emplace_back(keyword, TString(content));
- count += 1;
+ bool isSkipped = false;
+ if (prefix.StartsWith("/*")) {
+ size_t limit = prefix.rfind("*/");
+ if (limit == std::string::npos) {
+ return 0;
}
- }
- return count;
- }
- size_t MatchPunctuation(const TStringBuf prefix, TParsedTokenList& matches) {
- size_t count = 0;
- for (const auto& name : Grammar_.PunctuationNames) {
- const auto& content = Grammar_.BlockByName.at(name);
- if (prefix.substr(0, content.length()) == content) {
- matches.emplace_back(name, content);
- count += 1;
- }
- }
- return count;
- }
+ size_t len = MatchANSIMultilineComment(prefix.Head(limit));
+ prefix.Skip(len);
+ skipped += len;
- size_t MatchRegex(const TStringBuf prefix, TParsedTokenList& matches) {
- size_t count = 0;
- for (const auto& [token, regex] : OtherRegexes_) {
- if (const TStringBuf match = TryMatchRegex(prefix, *regex); !match.empty()) {
- matches.emplace_back(token, TString(match));
- count += 1;
- }
+ isSkipped = len != 0;
}
- return count;
- }
- const TStringBuf TryMatchRegex(const TStringBuf prefix, const RE2& regex) {
- re2::StringPiece input(prefix.data(), prefix.size());
- if (RE2::Consume(&input, regex)) {
- return TStringBuf(prefix.data(), input.data());
+ if (isSkipped) {
+ continue;
}
- return "";
- }
- size_t MatchComment(const TStringBuf prefix, TParsedTokenList& matches) {
- const TStringBuf reContent = TryMatchRegex(prefix, *CommentRegex_);
- if (reContent.empty()) {
+ if (prefix.size() == 0) {
return 0;
}
- if (!(Ansi_ && prefix.StartsWith("/*"))) {
- matches.emplace_back(CommentTokenName, TString(reContent));
- return 1;
- }
-
- size_t ll1Length = MatchANSIMultilineComment(prefix);
- const TStringBuf ll1Content = prefix.SubString(0, ll1Length);
-
- Y_ENSURE(ll1Content == 0 || reContent <= ll1Content);
- if (ll1Content == 0) {
- matches.emplace_back(CommentTokenName, TString(reContent));
- return 1;
- }
-
- matches.emplace_back(CommentTokenName, TString(ll1Content));
- return 1;
+ prefix.Skip(1);
+ skipped += 1;
}
+ }
- size_t MatchANSIMultilineComment(TStringBuf remaining) {
- if (!remaining.StartsWith("/*")) {
- return 0;
- }
+ TGenericLexerGrammar MakeGenericLexerGrammar(
+ bool ansi,
+ const TLexerGrammar& grammar,
+ const TVector<std::tuple<TString, TString>>& regexByOtherName) {
+ TGenericLexerGrammar generic;
- size_t skipped = 0;
+ for (const auto& name : grammar.KeywordNames) {
+ auto matcher = Compile({
+ .Body = TString(TLexerGrammar::KeywordBlock(name)),
+ .IsCaseInsensitive = true,
+ });
+ generic.emplace_back(name, std::move(matcher));
+ }
- remaining.Skip(2);
- skipped += 2;
+ for (const auto& name : grammar.PunctuationNames) {
+ generic.emplace_back(
+ name, Compile({RE2::QuoteMeta(grammar.BlockByName.at(name))}));
+ }
- for (;;) {
- if (remaining.StartsWith("*/")) {
- remaining.Skip(2);
- skipped += 2;
- return skipped;
- }
+ for (const auto& [name, regex] : regexByOtherName) {
+ auto matcher = Compile({
+ .Body = regex,
+ });
+ generic.emplace_back(name, std::move(matcher));
+ }
- bool isSkipped = false;
- if (remaining.StartsWith("/*")) {
- size_t limit = remaining.rfind("*/");
- if (limit == std::string::npos) {
- return 0;
- }
+ if (ansi) {
+ auto it = FindIf(generic, [](const auto& m) {
+ return m.TokenName == "COMMENT";
+ });
+ Y_ENSURE(it != std::end(generic));
+ it->Match = ANSICommentMatcher(it->Match);
+ }
- size_t len = MatchANSIMultilineComment(remaining.Head(limit));
- remaining.Skip(len);
- skipped += len;
+ return generic;
+ }
- isSkipped = len != 0;
- }
+ class TRegexLexer: public NSQLTranslation::ILexer {
+ public:
+ TRegexLexer(IGenericLexer::TPtr lexer)
+ : Lexer_(std::move(lexer))
+ {
+ }
- if (isSkipped) {
- continue;
+ bool Tokenize(
+ const TString& query,
+ const TString& queryName,
+ const TTokenCallback& onNextToken,
+ NYql::TIssues& issues,
+ size_t maxErrors) override {
+ bool isFailed = false;
+
+ const auto onNext = [&](TGenericToken&& token) {
+ if (token.Name == TGenericToken::Error) {
+ NYql::TPosition pos(token.Begin, 0, queryName);
+ TString message = TString("no candidates, skipping ") + token.Content;
+ issues.AddIssue(std::move(pos), std::move(message));
+ isFailed = true;
+ return;
}
- if (remaining.size() == 0) {
- return 0;
- }
+ onNextToken({
+ .Name = TString(token.Name),
+ .Content = TString(token.Content),
+ });
+ };
- remaining.Skip(1);
- skipped += 1;
- }
+ Lexer_->Tokenize(query, onNext, maxErrors);
+ return !isFailed;
}
- NSQLReflect::TLexerGrammar Grammar_;
- TVector<std::tuple<TString, THolder<RE2>>> OtherRegexes_;
- THolder<RE2> CommentRegex_;
- bool Ansi_;
+ private:
+ IGenericLexer::TPtr Lexer_;
};
namespace {
class TFactory final: public NSQLTranslation::ILexerFactory {
public:
- explicit TFactory(bool ansi)
- : Ansi_(ansi)
- , Grammar_(NSQLReflect::LoadLexerGrammar())
- , RegexByOtherName_(MakeRegexByOtherName(Grammar_, Ansi_))
- {
+ explicit TFactory(bool ansi) {
+ auto grammar = NSQLReflect::LoadLexerGrammar();
+ auto regexes = MakeRegexByOtherName(grammar, ansi);
+ Lexer_ = MakeGenericLexer(MakeGenericLexerGrammar(ansi, grammar, regexes));
}
NSQLTranslation::ILexer::TPtr MakeLexer() const override {
return NSQLTranslation::ILexer::TPtr(
- new TRegexLexer(Ansi_, Grammar_, RegexByOtherName_));
+ new TRegexLexer(Lexer_));
}
private:
- bool Ansi_;
- NSQLReflect::TLexerGrammar Grammar_;
- TVector<std::tuple<TString, TString>> RegexByOtherName_;
+ IGenericLexer::TPtr Lexer_;
};
} // namespace
diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.h b/yql/essentials/sql/v1/lexer/regex/lexer.h
index e9968954e1f..42d99a0a530 100644
--- a/yql/essentials/sql/v1/lexer/regex/lexer.h
+++ b/yql/essentials/sql/v1/lexer/regex/lexer.h
@@ -1,9 +1,13 @@
#pragma once
+#include "generic.h"
+
#include <yql/essentials/parser/lexer_common/lexer.h>
namespace NSQLTranslationV1 {
+ TTokenMatcher ANSICommentMatcher(TTokenMatcher defaultComment);
+
NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi);
} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/regex/regex.cpp b/yql/essentials/sql/v1/lexer/regex/regex.cpp
index e634ff009a7..3f8af88eb4c 100644
--- a/yql/essentials/sql/v1/lexer/regex/regex.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/regex.cpp
@@ -45,6 +45,7 @@ namespace NSQLTranslationV1 {
TString ToRegex(const TStringBuf name) {
TString text = Grammar_->BlockByName.at(name);
+ Preprocess(text);
Inline(text);
Transform(text);
Finalize(text);
@@ -52,6 +53,10 @@ namespace NSQLTranslationV1 {
}
private:
+ void Preprocess(TString& text) {
+ text = ChangedDigitsPrecendence(std::move(text));
+ }
+
void Inline(TString& text) {
ApplyEachWhileChanging(text, Inliners_);
}
@@ -86,6 +91,8 @@ namespace NSQLTranslationV1 {
Grammar_->PunctuationNames.contains(name) ||
PunctuationFragments.contains(name)) {
def = "'" + def + "'";
+ } else if (name == "DIGITS") {
+ def = ChangedDigitsPrecendence(std::move(def));
}
def = QuoteAntlrRewrite(std::move(def));
@@ -95,6 +102,15 @@ namespace NSQLTranslationV1 {
}
}
+ // Regex engine matches the first matched alternative,
+ // even if it is not the longest one, while ANTLR is more greedy.
+ TString ChangedDigitsPrecendence(TString body) {
+ if (SubstGlobal(body, "DECDIGITS | ", "") != 0) {
+ SubstGlobal(body, "BINDIGITS", "BINDIGITS | DECDIGITS");
+ }
+ return body;
+ }
+
void Transform(TString& text) {
ApplyEachWhileChanging(text, Transformations_);
}
diff --git a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
index e62bb0e609f..8c7688aadcd 100644
--- a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
+++ b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp
@@ -62,7 +62,14 @@ Y_UNIT_TEST_SUITE(SqlRegexTests) {
CheckRegex(
/* ansi = */ false,
"DIGITS",
- R"(([0-9]+)|(0[xX]([0-9]|[a-f]|[A-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+))");
+ R"((0[xX]([0-9]|[a-f]|[A-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+)|([0-9]+))");
+ }
+
+ Y_UNIT_TEST(IntegerValue) {
+ CheckRegex(
+ /* ansi = */ false,
+ "INTEGER_VALUE",
+ R"(((0[xX]([0-9]|[a-f]|[A-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+)|([0-9]+))(([pP]|[uU])?([lL]|[sS]|[tT]|[iI]|[bB]|[nN])?))");
}
Y_UNIT_TEST(Real) {
diff --git a/yql/essentials/sql/v1/lexer/regex/ya.make b/yql/essentials/sql/v1/lexer/regex/ya.make
index 249dfbd11df..3a7fe19b94c 100644
--- a/yql/essentials/sql/v1/lexer/regex/ya.make
+++ b/yql/essentials/sql/v1/lexer/regex/ya.make
@@ -28,6 +28,7 @@ CFLAGS(
)
SRCS(
+ generic.cpp
lexer.cpp
regex.cpp
)
diff --git a/yql/essentials/sql/v1/ya.make b/yql/essentials/sql/v1/ya.make
index 9407914f976..0e89d150bf8 100644
--- a/yql/essentials/sql/v1/ya.make
+++ b/yql/essentials/sql/v1/ya.make
@@ -58,6 +58,7 @@ END()
RECURSE(
complete
format
+ highlight
lexer
perf
proto_parser
diff --git a/yql/essentials/tools/yql_highlight/ya.make b/yql/essentials/tools/yql_highlight/ya.make
new file mode 100644
index 00000000000..63c2e9eea5e
--- /dev/null
+++ b/yql/essentials/tools/yql_highlight/ya.make
@@ -0,0 +1,17 @@
+IF (NOT EXPORT_CMAKE OR NOT OPENSOURCE OR OPENSOURCE_PROJECT != "yt")
+
+PROGRAM()
+
+PEERDIR(
+ library/cpp/getopt
+ yql/essentials/sql/v1/highlight
+)
+
+SRCS(
+ yql_highlight.cpp
+)
+
+END()
+
+ENDIF()
+
diff --git a/yql/essentials/tools/yql_highlight/yql_highlight b/yql/essentials/tools/yql_highlight/yql_highlight
new file mode 100644
index 00000000000..b47885386e5
--- /dev/null
+++ b/yql/essentials/tools/yql_highlight/yql_highlight
@@ -0,0 +1 @@
+/home/vityaman/.ya/build/symres/1a48d14807d3c5183ba6864b369f928e/yql_highlight \ No newline at end of file
diff --git a/yql/essentials/tools/yql_highlight/yql_highlight.cpp b/yql/essentials/tools/yql_highlight/yql_highlight.cpp
new file mode 100644
index 00000000000..01dd9efe343
--- /dev/null
+++ b/yql/essentials/tools/yql_highlight/yql_highlight.cpp
@@ -0,0 +1,79 @@
+#include <yql/essentials/sql/v1/highlight/sql_highlight_json.h>
+#include <yql/essentials/sql/v1/highlight/sql_highlight.h>
+#include <yql/essentials/sql/v1/highlight/sql_highlighter.h>
+
+#include <library/cpp/getopt/last_getopt.h>
+#include <library/cpp/colorizer/colors.h>
+#include <library/cpp/json/json_writer.h>
+
+#include <util/stream/input.h>
+
+using namespace NSQLHighlight;
+
+int RunGenerateJSON() {
+ THighlighting highlighting = MakeHighlighting();
+ NJson::TJsonValue json = ToJson(highlighting);
+ NJson::WriteJson(&Cout, &json, /* formatOutput = */ true);
+ return 0;
+}
+
+int RunHighlighter() {
+ THashMap<EUnitKind, NColorizer::EAnsiCode> ColorByKind = {
+ {EUnitKind::Keyword, NColorizer::BLUE},
+ {EUnitKind::Punctuation, NColorizer::DARK_WHITE},
+ {EUnitKind::QuotedIdentifier, NColorizer::DARK_CYAN},
+ {EUnitKind::BindParamterIdentifier, NColorizer::YELLOW},
+ {EUnitKind::TypeIdentifier, NColorizer::GREEN},
+ {EUnitKind::FunctionIdentifier, NColorizer::MAGENTA},
+ {EUnitKind::Identifier, NColorizer::DEFAULT},
+ {EUnitKind::Literal, NColorizer::LIGHT_GREEN},
+ {EUnitKind::StringLiteral, NColorizer::DARK_RED},
+ {EUnitKind::Comment, NColorizer::DARK_GREEN},
+ {EUnitKind::Whitespace, NColorizer::DEFAULT},
+ {EUnitKind::Error, NColorizer::RED},
+ };
+
+ TString query = Cin.ReadAll();
+
+ THighlighting highlighting = MakeHighlighting();
+ IHighlighter::TPtr highlighter = MakeHighlighter(highlighting);
+ TVector<TToken> tokens = Tokenize(*highlighter, query);
+
+ for (auto& token : tokens) {
+ TStringBuf content = TStringBuf(query).SubString(token.Begin, token.Length);
+ Cout << ColorByKind[token.Kind] << content << NColorizer::RESET;
+ }
+
+ return 0;
+}
+
+int Run(int argc, char* argv[]) {
+ TString target;
+
+ NLastGetopt::TOpts opts = NLastGetopt::TOpts::Default();
+ opts.AddLongOption('g', "generate", "generate a highlighting configuration")
+ .RequiredArgument("target")
+ .Choices({"json"})
+ .StoreResult(&target);
+ opts.SetFreeArgsNum(0);
+ opts.AddHelpOption();
+
+ NLastGetopt::TOptsParseResult res(&opts, argc, argv);
+ if (res.Has("generate")) {
+ if (target == "json") {
+ return RunGenerateJSON();
+ }
+ Y_ABORT();
+ }
+ return RunHighlighter();
+}
+
+int main(int argc, char* argv[]) try {
+ return Run(argc, argv);
+} catch (const yexception& e) {
+ Cerr << "Caught exception:" << e.what() << Endl;
+ return 1;
+} catch (...) {
+ Cerr << CurrentExceptionMessage() << Endl;
+ return 1;
+}