author     robot-piglet <[email protected]>   2025-05-12 13:53:24 +0300
committer  robot-piglet <[email protected]>   2025-05-12 14:05:50 +0300
commit     7a941ebd252fd7442b4d1d34d31d72e971ad20bf (patch)
tree       70c132d1b611697ad23b90cf35215b035f247ec0
parent     bf1279129bcf6c1b1001e39c39a13d80737898d3 (diff)
Intermediate changes
commit_hash:3a624a323006078de71f50747f7b2e8cadba7ccd
24 files changed, 1494 insertions, 192 deletions
diff --git a/yql/essentials/sql/v1/highlight/README.md b/yql/essentials/sql/v1/highlight/README.md
new file mode 100644
index 00000000000..214e4d41573
--- /dev/null
+++ b/yql/essentials/sql/v1/highlight/README.md
@@ -0,0 +1,217 @@
+# YQL SQL Syntax Highlighting Specification
+
+## Overview
+
+This document specifies the syntax highlighting system for YQL. It defines how to identify and categorize the different syntactic elements of a query for highlighting purposes.
+
+## Terms
+
+- `Highlighting` is a _list_ of `Highlighting Unit`s that define how to recognize different parts of SQL syntax.
+
+- `Highlighting Unit` is a language construct to be highlighted (e.g., keywords, identifiers, literals).
+
+- `Highlighting Token` is a text fragment matched to a `Highlighting Unit`.
+
+- `Highlighter` is a function, parametrized by a `Highlighting`, that transforms text into a stream of `Highlighting Token`s.
+
+- `Theme` is a mapping from a `Highlighting Unit` to a `Color`.
+
+## Highlighting Unit
+
+Below are examples of `Highlighting Unit`s. The set will evolve, so units should be taken programmatically from the JSON `Highlighting`. Only comments are guaranteed to always be present, as they may require special processing.
+
+- `keyword`: SQL reserved words (e.g., `SELECT`, `INSERT`, `FROM`).
+
+- `punctuation`: Syntactic symbols (e.g., `.`, `;`, `(`, `)`).
+
+- `identifier`: Unquoted names (e.g., table or column names).
+
+- `quoted-identifier`: Backtick-quoted names (e.g., ``` `table` ```).
+
+- `bind-parameter-identifier`: Parameter references (e.g., `$param`).
+
+- `type-identifier`: Type names (e.g., `Int32`, `String`).
+
+- `function-identifier`: Function names (e.g., `MIN`, `Math::Sin`).
+
+- `literal`: Numeric constants (e.g., `123`, `1.23`).
+
+- `string-literal`: Quoted strings (e.g., `"example"`).
+
+- `comment`: Single-line (`--`) or multi-line (`/* */`) comments.
+
+- `ws`: Spaces, tabs, newlines.
+
+- `error`: Unrecognized syntax.
+
+Each `Highlighting Unit` contains one or more `Pattern`s that define how to recognize the unit in text.
+
+## Pattern Matching
+
+A `Pattern` consists of:
+
+- `body`: The main regex pattern to match.
+
+- `after`: A lookahead pattern.
+
+- `is-case-insensitive`: Whether matching should be case-insensitive.
+
+The matching behavior is equivalent to the regex `body(?=after)`.
+
+## Highlighter Algorithm
+
+The highlighter algorithm can be described with the following pseudocode.
+
+```python
+# Consume matched tokens until the text is empty.
+# On each iteration:
+#   1. Find the next token match (or error)
+#   2. Emit the token
+#   3. Continue with the remaining text
+highlight(text) =
+    if text is not empty do
+        token = match(text)
+        emit token
+        highlight(text[token.length:])
+
+# Select the longest match among all possible
+# patterns; on a tie, the leftmost one is chosen.
+# If there is no match, emit a 1-character error
+# token as a recovery.
+match(text) =
+    max of matches(text) by length
+    or error token with length = 1
+
+# For each highlighting unit and each of its
+# patterns, attempt a match.
+matches(text) = do
+    unit <- highlighting.units
+    pattern <- unit.patterns
+    content <- match(text, pattern)
+    yield token with unit, content
+
+# Match the pattern body, then the lookahead
+# (after) part, honoring the case sensitivity
+# setting. Only the body is consumed.
+match(text, pattern) = do
+    body <- (
+        regex pattern.body
+        matches text prefix
+        with pattern.case_sensitivity)
+    after <- (
+        regex pattern.after
+        matches text[body.length:] prefix
+        with pattern.case_sensitivity)
+    yield body
+
+# Special ANSI comment handling.
+# Recursively process nested multiline comments.
+match(text, Comment if ANSI) =
+    if text not starts with "/*" do
+        return match(text, Comment if Default)
+
+    text = text after "/*"
+    loop do
+        if text starts with "*/" do
+            return text after "*/"
+
+        if text starts with "/*" do
+            budget = text before last "*/"
+            match = match(budget, Comment if ANSI)
+            text = text after match
+
+            if match do
+                continue
+
+        if text is empty do
+            return Nothing
+
+        text = text[1:]
+```
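For concreteness, the selection rule can be exercised with a small stand-alone program. The sketch below is illustrative only: it uses `std::regex` for prefix matching rather than the RE2-based matchers this commit adds, and the unit names and patterns are hypothetical.

```cpp
#include <iostream>
#include <optional>
#include <regex>
#include <string>
#include <vector>

struct Pattern {
    std::string unit;  // highlighting unit kind, e.g. "keyword"
    std::regex body;   // compiled `body` regex (the `after` lookahead is omitted)
};

struct Token {
    std::string unit;
    size_t begin = 0;
    size_t length = 0;
};

// Select the longest prefix match among all patterns; on equal lengths the
// pattern listed first wins. Fall back to a 1-character "error" token.
Token Match(const std::vector<Pattern>& patterns, const std::string& text, size_t pos) {
    std::optional<Token> best;
    for (const auto& p : patterns) {
        std::smatch m;
        if (std::regex_search(text.cbegin() + pos, text.cend(), m, p.body,
                              std::regex_constants::match_continuous)) {
            const size_t length = static_cast<size_t>(m.length(0));
            if (length > 0 && (!best || best->length < length)) {
                best = Token{p.unit, pos, length};
            }
        }
    }
    return best.value_or(Token{"error", pos, 1});
}

int main() {
    const std::vector<Pattern> patterns = {
        {"keyword", std::regex("SELECT|FROM", std::regex::icase)},
        {"identifier", std::regex("[A-Za-z_][A-Za-z_0-9]*")},
        {"ws", std::regex("[ \\t\\r\\n]+")},
    };

    const std::string text = "SELECT id FROM users";
    for (size_t pos = 0; pos < text.size();) {
        const Token token = Match(patterns, text, pos);
        std::cout << token.unit << ": '"
                  << text.substr(token.begin, token.length) << "'\n";
        pos += token.length;  // continue with the remaining text
    }
    return 0;
}
```

Note how the tie between `keyword` and `identifier` on `SELECT` is resolved in favor of the earlier-listed unit, matching the leftmost rule above.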
+
+## Highlighting JSON Example
+
+A highlighting configuration can be generated in JSON format using the `yql_highlight` tool.
+
+```json
+{
+    "units": [
+        ...
+        {
+            "kind": "type-identifier",
+            "patterns": [
+                {
+                    "body": "([a-z]|[A-Z]|_)([a-z]|[A-Z]|_|[0-9])*",
+                    "after": "\\<"
+                },
+                {
+                    "body": "Int32|Int16|Utf8|...",
+                    "is-case-insensitive": true
+                }
+            ]
+        },
+        ...
+    ]
+}
+```
+
+## Test Suite
+
+The reference implementation includes a comprehensive test suite that verifies correct highlighting behavior. The test suite is defined in JSON format with the following structure:
+
+```json
+{
+    "SQL": [
+        ["SELECT id, alias from users", "KKKKKK#_#II#P#_#IIIII#_#KKKK#_#IIIII"]
+    ],
+    "TypeIdentifier": [
+        ["Bool(value)", "TTTT#P#IIIII#P"]
+    ]
+}
+```
+
+The first element of each pair is the SQL text to highlight; the second is a mask in which each character encodes the highlighting unit kind of the corresponding character in the input.
+
+The unit kind to character mapping is:
+
+| Unit Kind                 | Character |
+| ------------------------- | --------- |
+| keyword                   | K         |
+| punctuation               | P         |
+| identifier                | I         |
+| quoted-identifier         | Q         |
+| bind-parameter-identifier | B         |
+| type-identifier           | T         |
+| function-identifier       | F         |
+| literal                   | L         |
+| string-literal            | S         |
+| comment                   | C         |
+| ws                        | _         |
+| error                     | E         |
+
+Note: the `#` separator is inserted between tokens to make their boundaries visually distinct from each other.
+
+The test driver pseudocode:
+
+```
+run_test_suite =
+    let
+        highlighting = load_highlighting()
+        highlighter = make_highlighter(highlighting)
+        suite = load_test_suite()
+    in do
+        scenario <- suite
+        test <- scenario
+        (input, expected) = test
+
+        tokens = highlighter.highlight(input)
+        actual = to_pattern(tokens)
+        assert actual == expected
+```
+
+## Implementation Guidelines
+
+- The module `yql/essentials/sql/v1/highlight` is the reference implementation of YQL highlighting. It includes a comprehensive test suite for checking an implementation's compliance with this specification, and it also contains this specification document.
+
+- The module `yql/essentials/tools/yql_highlight` contains a tool to play with the reference highlighting implementation and to generate various representations of the highlighting (e.g., in JSON).
+
+- The test suite data can be found at `yql/essentials/sql/v1/highlight/ut/suite.json`.
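For reference, here is a minimal sketch of driving the reference implementation, assuming the interfaces declared in `sql_highlight.h` and `sql_highlighter.h` from this commit; the `main` wiring itself is illustrative:

```cpp
#include <yql/essentials/sql/v1/highlight/sql_highlight.h>
#include <yql/essentials/sql/v1/highlight/sql_highlighter.h>

#include <util/stream/output.h>

using namespace NSQLHighlight;

int main() {
    // Build the highlighting description from the lexer grammar
    // and compile it into a highlighter.
    THighlighting highlighting = MakeHighlighting();
    IHighlighter::TPtr highlighter = MakeHighlighter(highlighting);

    // Tokenize a query and print each token's unit kind and content.
    TStringBuf query = "SELECT id FROM users";
    for (const TToken& token : Tokenize(*highlighter, query)) {
        Cout << token.Kind << ": '"
             << query.SubString(token.Begin, token.Length) << "'" << Endl;
    }
    return 0;
}
```

Here `Cout << token.Kind` relies on the `Out<EUnitKind>` specialization defined in `sql_highlight.cpp`.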
diff --git a/yql/essentials/sql/v1/highlight/sql_highlight.cpp b/yql/essentials/sql/v1/highlight/sql_highlight.cpp new file mode 100644 index 00000000000..a477ba542f8 --- /dev/null +++ b/yql/essentials/sql/v1/highlight/sql_highlight.cpp @@ -0,0 +1,313 @@ +#include "sql_highlight.h" + +#include <yql/essentials/sql/v1/lexer/regex/regex.h> + +#include <contrib/libs/re2/re2/re2.h> + +#include <util/generic/algorithm.h> +#include <util/generic/hash.h> +#include <util/generic/hash_set.h> +#include <util/string/builder.h> +#include <util/string/join.h> + +namespace NSQLHighlight { + + using NSQLTranslationV1::TRegexPattern; + + TRegexPattern Merged(TVector<TRegexPattern> patterns) { + Y_ENSURE(!patterns.empty()); + + const TRegexPattern& sample = patterns.back(); + Y_ENSURE(AllOf(patterns, [&](const TRegexPattern& pattern) { + return std::tie(pattern.After, pattern.IsCaseInsensitive) == + std::tie(sample.After, sample.IsCaseInsensitive); + })); + + Sort(patterns, [](const TRegexPattern& lhs, const TRegexPattern& rhs) { + return lhs.Body.length() > rhs.Body.length(); + }); + + TStringBuilder body; + for (const auto& pattern : patterns) { + body << "(" << pattern.Body << ")|"; + } + Y_ENSURE(body.back() == '|'); + body.pop_back(); + + return TRegexPattern{ + .Body = std::move(body), + .After = sample.After, + .IsCaseInsensitive = sample.IsCaseInsensitive, + }; + } + + struct Syntax { + const NSQLReflect::TLexerGrammar* Grammar; + THashMap<TString, TString> RegexesDefault; + THashMap<TString, TString> RegexesANSI; + + TString Concat(const TVector<TStringBuf>& names) { + TString concat; + for (const auto& name : names) { + concat += Get(name); + } + return concat; + } + + TString Get(const TStringBuf name, bool ansi = false) const { + if (Grammar->PunctuationNames.contains(name)) { + return RE2::QuoteMeta(Grammar->BlockByName.at(name)); + } + if (ansi) { + return RegexesANSI.at(name); + } + return RegexesDefault.at(name); + } + }; + + NSQLTranslationV1::TRegexPattern CaseInsensitive(TStringBuf text) { + return { + .Body = TString(text), + .IsCaseInsensitive = true, + }; + } + + template <EUnitKind K> + TUnit MakeUnit(Syntax& syntax); + + template <> + TUnit MakeUnit<EUnitKind::Keyword>(Syntax& s) { + using NSQLReflect::TLexerGrammar; + + TUnit unit = {.Kind = EUnitKind::Keyword}; + for (const auto& keyword : s.Grammar->KeywordNames) { + const TStringBuf content = TLexerGrammar::KeywordBlock(keyword); + unit.Patterns.push_back(CaseInsensitive(content)); + } + + unit.Patterns = {Merged(std::move(unit.Patterns))}; + return unit; + } + + template <> + TUnit MakeUnit<EUnitKind::Punctuation>(Syntax& s) { + TUnit unit = {.Kind = EUnitKind::Punctuation}; + for (const auto& name : s.Grammar->PunctuationNames) { + const TString content = s.Get(name); + unit.Patterns.push_back({content}); + } + + unit.Patterns = {Merged(std::move(unit.Patterns))}; + return unit; + } + + template <> + TUnit MakeUnit<EUnitKind::QuotedIdentifier>(Syntax& s) { + return { + .Kind = EUnitKind::QuotedIdentifier, + .Patterns = { + {s.Get("ID_QUOTED")}, + }, + }; + } + + template <> + TUnit MakeUnit<EUnitKind::BindParamterIdentifier>(Syntax& s) { + return { + .Kind = EUnitKind::BindParamterIdentifier, + .Patterns = { + {s.Concat({"DOLLAR", "ID_PLAIN"})}, + }, + }; + } + + template <> + TUnit MakeUnit<EUnitKind::TypeIdentifier>(Syntax& s) { + return { + .Kind = EUnitKind::TypeIdentifier, + .Patterns = { + {s.Get("ID_PLAIN"), s.Get("LESS")}, + {Merged({ + CaseInsensitive("Decimal"), + CaseInsensitive("Bool"), + CaseInsensitive("Int8"), + 
CaseInsensitive("Int16"), + CaseInsensitive("Int32"), + CaseInsensitive("Int64"), + CaseInsensitive("Uint8"), + CaseInsensitive("Uint16"), + CaseInsensitive("Uint32"), + CaseInsensitive("Uint64"), + CaseInsensitive("Float"), + CaseInsensitive("Double"), + CaseInsensitive("DyNumber"), + CaseInsensitive("String"), + CaseInsensitive("Utf8"), + CaseInsensitive("Json"), + CaseInsensitive("JsonDocument"), + CaseInsensitive("Yson"), + CaseInsensitive("Uuid"), + CaseInsensitive("Date"), + CaseInsensitive("Datetime"), + CaseInsensitive("Timestamp"), + CaseInsensitive("Interval"), + CaseInsensitive("TzDate"), + CaseInsensitive("TzDateTime"), + CaseInsensitive("TzTimestamp"), + CaseInsensitive("Callable"), + CaseInsensitive("Resource"), + CaseInsensitive("Tagged"), + CaseInsensitive("Generic"), + CaseInsensitive("Unit"), + CaseInsensitive("Null"), + CaseInsensitive("Void"), + CaseInsensitive("EmptyList"), + CaseInsensitive("EmptyDict"), + })}, + }, + }; + } + + template <> + TUnit MakeUnit<EUnitKind::FunctionIdentifier>(Syntax& s) { + return { + .Kind = EUnitKind::FunctionIdentifier, + .Patterns = { + {s.Concat({"ID_PLAIN", "NAMESPACE", "ID_PLAIN"})}, + {s.Get("ID_PLAIN"), s.Get("LPAREN")}, + }, + }; + } + + template <> + TUnit MakeUnit<EUnitKind::Identifier>(Syntax& s) { + return { + .Kind = EUnitKind::Identifier, + .Patterns = { + {s.Get("ID_PLAIN")}, + }, + }; + } + + template <> + TUnit MakeUnit<EUnitKind::Literal>(Syntax& s) { + return { + .Kind = EUnitKind::Literal, + .Patterns = { + {s.Get("DIGITS")}, + {s.Get("INTEGER_VALUE")}, + {s.Get("REAL")}, + }, + }; + } + + template <> + TUnit MakeUnit<EUnitKind::StringLiteral>(Syntax& s) { + return { + .Kind = EUnitKind::StringLiteral, + .Patterns = {{s.Get("STRING_VALUE")}}, + .PatternsANSI = TVector<TRegexPattern>{ + TRegexPattern{s.Get("STRING_VALUE", /* ansi = */ true)}, + }, + }; + } + + template <> + TUnit MakeUnit<EUnitKind::Comment>(Syntax& s) { + return { + .Kind = EUnitKind::Comment, + .Patterns = {{s.Get("COMMENT")}}, + .PatternsANSI = Nothing(), + }; + } + + template <> + TUnit MakeUnit<EUnitKind::Whitespace>(Syntax& s) { + return { + .Kind = EUnitKind::Whitespace, + .Patterns = { + {s.Get("WS")}, + }, + }; + } + + Syntax MakeSyntax(const NSQLReflect::TLexerGrammar& grammar) { + using NSQLTranslationV1::MakeRegexByOtherName; + + Syntax syntax; + syntax.Grammar = &grammar; + for (auto& [k, v] : MakeRegexByOtherName(*syntax.Grammar, /* ansi = */ false)) { + syntax.RegexesDefault.emplace(std::move(k), std::move(v)); + } + for (auto& [k, v] : MakeRegexByOtherName(*syntax.Grammar, /* ansi = */ true)) { + syntax.RegexesANSI.emplace(std::move(k), std::move(v)); + } + return syntax; + } + + THighlighting MakeHighlighting() { + return MakeHighlighting(NSQLReflect::LoadLexerGrammar()); + } + + THighlighting MakeHighlighting(const NSQLReflect::TLexerGrammar& grammar) { + Syntax s = MakeSyntax(grammar); + + THighlighting h; + h.Units.emplace_back(MakeUnit<EUnitKind::Keyword>(s)); + h.Units.emplace_back(MakeUnit<EUnitKind::Punctuation>(s)); + h.Units.emplace_back(MakeUnit<EUnitKind::QuotedIdentifier>(s)); + h.Units.emplace_back(MakeUnit<EUnitKind::BindParamterIdentifier>(s)); + h.Units.emplace_back(MakeUnit<EUnitKind::TypeIdentifier>(s)); + h.Units.emplace_back(MakeUnit<EUnitKind::FunctionIdentifier>(s)); + h.Units.emplace_back(MakeUnit<EUnitKind::Identifier>(s)); + h.Units.emplace_back(MakeUnit<EUnitKind::Literal>(s)); + h.Units.emplace_back(MakeUnit<EUnitKind::StringLiteral>(s)); + h.Units.emplace_back(MakeUnit<EUnitKind::Comment>(s)); + 
h.Units.emplace_back(MakeUnit<EUnitKind::Whitespace>(s)); + + return h; + } + +} // namespace NSQLHighlight + +template <> +void Out<NSQLHighlight::EUnitKind>(IOutputStream& out, NSQLHighlight::EUnitKind kind) { + switch (kind) { + case NSQLHighlight::EUnitKind::Keyword: + out << "keyword"; + break; + case NSQLHighlight::EUnitKind::Punctuation: + out << "punctuation"; + break; + case NSQLHighlight::EUnitKind::QuotedIdentifier: + out << "quoted-identifier"; + break; + case NSQLHighlight::EUnitKind::BindParamterIdentifier: + out << "bind-paramter-identifier"; + break; + case NSQLHighlight::EUnitKind::TypeIdentifier: + out << "type-identifier"; + break; + case NSQLHighlight::EUnitKind::FunctionIdentifier: + out << "function-identifier"; + break; + case NSQLHighlight::EUnitKind::Identifier: + out << "identifier"; + break; + case NSQLHighlight::EUnitKind::Literal: + out << "literal"; + break; + case NSQLHighlight::EUnitKind::StringLiteral: + out << "string-literal"; + break; + case NSQLHighlight::EUnitKind::Comment: + out << "comment"; + break; + case NSQLHighlight::EUnitKind::Whitespace: + out << "ws"; + break; + case NSQLHighlight::EUnitKind::Error: + out << "error"; + break; + } +} diff --git a/yql/essentials/sql/v1/highlight/sql_highlight.h b/yql/essentials/sql/v1/highlight/sql_highlight.h new file mode 100644 index 00000000000..f6ecc375836 --- /dev/null +++ b/yql/essentials/sql/v1/highlight/sql_highlight.h @@ -0,0 +1,41 @@ +#pragma once + +#include <yql/essentials/sql/v1/lexer/regex/generic.h> +#include <yql/essentials/sql/v1/reflect/sql_reflect.h> + +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/generic/map.h> + +namespace NSQLHighlight { + + enum class EUnitKind { + Keyword, + Punctuation, + QuotedIdentifier, + BindParamterIdentifier, + TypeIdentifier, + FunctionIdentifier, + Identifier, + Literal, + StringLiteral, + Comment, + Whitespace, + Error, + }; + + struct TUnit { + EUnitKind Kind; + TVector<NSQLTranslationV1::TRegexPattern> Patterns; + TMaybe<TVector<NSQLTranslationV1::TRegexPattern>> PatternsANSI; + }; + + struct THighlighting { + TVector<TUnit> Units; + }; + + THighlighting MakeHighlighting(); + + THighlighting MakeHighlighting(const NSQLReflect::TLexerGrammar& grammar); + +} // namespace NSQLHighlight diff --git a/yql/essentials/sql/v1/highlight/sql_highlight_json.cpp b/yql/essentials/sql/v1/highlight/sql_highlight_json.cpp new file mode 100644 index 00000000000..e4af680cdb9 --- /dev/null +++ b/yql/essentials/sql/v1/highlight/sql_highlight_json.cpp @@ -0,0 +1,67 @@ +#include "sql_highlight_json.h" + +#include <util/string/cast.h> + +namespace NSQLHighlight { + + struct { + const char* Units = "units"; + struct { + const char* Kind = "kind"; + const char* Patterns = "patterns"; + const char* PatternsANSI = "patterns-ansi"; + } Unit; + struct { + const char* Body = "body"; + const char* After = "after"; + const char* IsCaseInsensitive = "is-case-insensitive"; + } Pattern; + } JsonKey; + + NJson::TJsonValue ToJson(const NSQLTranslationV1::TRegexPattern& pattern) { + NJson::TJsonMap map; + map[JsonKey.Pattern.Body] = pattern.Body; + if (!pattern.After.empty()) { + map[JsonKey.Pattern.After] = pattern.After; + } + if (pattern.IsCaseInsensitive) { + map[JsonKey.Pattern.IsCaseInsensitive] = pattern.IsCaseInsensitive; + } + return map; + } + + NJson::TJsonValue ToJson(const TVector<NSQLTranslationV1::TRegexPattern>& patterns) { + NJson::TJsonArray array; + for (const auto& pattern : patterns) { + array.AppendValue(ToJson(pattern)); + } + return 
array; + } + + NJson::TJsonValue ToJson(const TUnit& unit) { + NJson::TJsonMap map; + map[JsonKey.Unit.Kind] = ToString(unit.Kind); + if (!unit.Patterns.empty()) { + map[JsonKey.Unit.Patterns] = ToJson(unit.Patterns); + } + if (!unit.PatternsANSI.Empty()) { + map[JsonKey.Unit.PatternsANSI] = ToJson(*unit.PatternsANSI); + } + return map; + } + + NJson::TJsonValue ToJson(const TVector<TUnit>& units) { + NJson::TJsonArray array; + for (const auto& unit : units) { + array.AppendValue(ToJson(unit)); + } + return array; + } + + NJson::TJsonValue ToJson(const THighlighting& highlighting) { + NJson::TJsonMap map; + map[JsonKey.Units] = ToJson(highlighting.Units); + return map; + } + +} // namespace NSQLHighlight diff --git a/yql/essentials/sql/v1/highlight/sql_highlight_json.h b/yql/essentials/sql/v1/highlight/sql_highlight_json.h new file mode 100644 index 00000000000..96df7c15a78 --- /dev/null +++ b/yql/essentials/sql/v1/highlight/sql_highlight_json.h @@ -0,0 +1,11 @@ +#pragma once + +#include "sql_highlight.h" + +#include <library/cpp/json/json_value.h> + +namespace NSQLHighlight { + + NJson::TJsonValue ToJson(const THighlighting& highlighting); + +} // namespace NSQLHighlight diff --git a/yql/essentials/sql/v1/highlight/sql_highlight_json_ut.cpp b/yql/essentials/sql/v1/highlight/sql_highlight_json_ut.cpp new file mode 100644 index 00000000000..0329ad32320 --- /dev/null +++ b/yql/essentials/sql/v1/highlight/sql_highlight_json_ut.cpp @@ -0,0 +1,14 @@ +#include "sql_highlight_json.h" + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NSQLHighlight; + +Y_UNIT_TEST_SUITE(SqlHighlightJsonTests) { + + Y_UNIT_TEST(Smoke) { + NJson::TJsonValue json = ToJson(MakeHighlighting()); + UNIT_ASSERT(json.Has("units")); + } + +} // Y_UNIT_TEST_SUITE(SqlHighlightJsonTests) diff --git a/yql/essentials/sql/v1/highlight/sql_highlighter.cpp b/yql/essentials/sql/v1/highlight/sql_highlighter.cpp new file mode 100644 index 00000000000..23d17277e49 --- /dev/null +++ b/yql/essentials/sql/v1/highlight/sql_highlighter.cpp @@ -0,0 +1,138 @@ +#include "sql_highlighter.h" + +#include <yql/essentials/sql/v1/lexer/regex/lexer.h> + +#include <contrib/libs/re2/re2/re2.h> + +#include <util/generic/deque.h> +#include <util/generic/maybe.h> + +namespace NSQLHighlight { + + using NSQLTranslationV1::Compile; + using NSQLTranslationV1::IGenericLexer; + using NSQLTranslationV1::TGenericLexerGrammar; + using NSQLTranslationV1::TGenericToken; + using NSQLTranslationV1::TTokenRule; + + THashMap<EUnitKind, TString> NamesByUnitKind = [] { + THashMap<EUnitKind, TString> names; + names[EUnitKind::Keyword] = "K"; + names[EUnitKind::Punctuation] = "P"; + names[EUnitKind::QuotedIdentifier] = "Q"; + names[EUnitKind::BindParamterIdentifier] = "B"; + names[EUnitKind::TypeIdentifier] = "T"; + names[EUnitKind::FunctionIdentifier] = "F"; + names[EUnitKind::Identifier] = "I"; + names[EUnitKind::Literal] = "L"; + names[EUnitKind::StringLiteral] = "S"; + names[EUnitKind::Comment] = "C"; + names[EUnitKind::Whitespace] = "W"; + names[EUnitKind::Error] = TGenericToken::Error; + return names; + }(); + + THashMap<TString, EUnitKind> UnitKindsByName = [] { + THashMap<TString, EUnitKind> kinds; + for (const auto& [kind, name] : NamesByUnitKind) { + Y_ENSURE(!kinds.contains(name)); + kinds[name] = kind; + } + return kinds; + }(); + + TGenericLexerGrammar ToGenericLexerGrammar(const THighlighting& highlighting, bool ansi) { + using NSQLTranslationV1::ANSICommentMatcher; + + TGenericLexerGrammar grammar; + for (const auto& unit : 
highlighting.Units) { + const auto* patterns = &unit.Patterns; + if (!unit.PatternsANSI.Empty() && ansi) { + patterns = unit.PatternsANSI.Get(); + } + + if (unit.Kind == EUnitKind::Comment && ansi) { + Y_ENSURE(unit.Patterns.size() == 1); + const auto& pattern = unit.Patterns[0]; + grammar.emplace_back(TTokenRule{ + .TokenName = NamesByUnitKind.at(unit.Kind), + .Match = ANSICommentMatcher(Compile(pattern)), + }); + } + + for (const auto& pattern : *patterns) { + grammar.emplace_back(TTokenRule{ + .TokenName = NamesByUnitKind.at(unit.Kind), + .Match = Compile(pattern), + }); + } + } + return grammar; + } + + class THighlighter: public IHighlighter { + public: + explicit THighlighter(NSQLTranslationV1::IGenericLexer::TPtr lexer) + : Lexer_(std::move(lexer)) + { + } + + bool Tokenize(TStringBuf text, const TTokenCallback& onNext, size_t maxErrors) const override { + const auto onNextToken = [&](NSQLTranslationV1::TGenericToken&& token) { + if (token.Name == "EOF") { + return; + } + + onNext({ + .Kind = UnitKindsByName.at(token.Name), + .Begin = token.Begin, + .Length = token.Content.size(), + }); + }; + + return Lexer_->Tokenize(text, onNextToken, maxErrors); + } + + private: + NSQLTranslationV1::IGenericLexer::TPtr Lexer_; + }; + + class TCombinedHighlighter: public IHighlighter { + public: + explicit TCombinedHighlighter(const THighlighting& highlighting) + : LexerDefault_(NSQLTranslationV1::MakeGenericLexer( + ToGenericLexerGrammar(highlighting, /* ansi = */ false))) + , LexerANSI_(NSQLTranslationV1::MakeGenericLexer( + ToGenericLexerGrammar(highlighting, /* ansi = */ true))) + { + } + + bool Tokenize(TStringBuf text, const TTokenCallback& onNext, size_t maxErrors) const override { + return Alt(text).Tokenize(text, onNext, maxErrors); + } + + private: + const IHighlighter& Alt(TStringBuf text) const { + if (text.After('-').StartsWith("-!ansi_lexer")) { + return LexerANSI_; + } + return LexerDefault_; + } + + THighlighter LexerDefault_; + THighlighter LexerANSI_; + }; + + TVector<TToken> Tokenize(IHighlighter& highlighter, TStringBuf text) { + TVector<TToken> tokens; + highlighter.Tokenize(text, [&](TToken&& token) { + tokens.emplace_back(std::move(token)); + }); + return tokens; + } + + IHighlighter::TPtr MakeHighlighter(const THighlighting& highlighting) { + return IHighlighter::TPtr(new TCombinedHighlighter(highlighting)); + } + +} // namespace NSQLHighlight diff --git a/yql/essentials/sql/v1/highlight/sql_highlighter.h b/yql/essentials/sql/v1/highlight/sql_highlighter.h new file mode 100644 index 00000000000..39b5e93242d --- /dev/null +++ b/yql/essentials/sql/v1/highlight/sql_highlighter.h @@ -0,0 +1,34 @@ +#pragma once + +#include "sql_highlight.h" + +#include <util/generic/ptr.h> +#include <util/generic/ylimits.h> + +#include <functional> + +namespace NSQLHighlight { + + struct TToken { + EUnitKind Kind; + size_t Begin; // In bytes + size_t Length; // In bytes + }; + + class IHighlighter: public TThrRefBase { + public: + using TPtr = TIntrusivePtr<IHighlighter>; + using TTokenCallback = std::function<void(TToken&& token)>; + + virtual ~IHighlighter() = default; + virtual bool Tokenize( + TStringBuf text, + const TTokenCallback& onNext, + size_t maxErrors = Max<size_t>()) const = 0; + }; + + TVector<TToken> Tokenize(IHighlighter& highlighter, TStringBuf text); + + IHighlighter::TPtr MakeHighlighter(const THighlighting& highlighting); + +} // namespace NSQLHighlight diff --git a/yql/essentials/sql/v1/highlight/sql_highlighter_ut.cpp 
b/yql/essentials/sql/v1/highlight/sql_highlighter_ut.cpp new file mode 100644 index 00000000000..5fcab7937fc --- /dev/null +++ b/yql/essentials/sql/v1/highlight/sql_highlighter_ut.cpp @@ -0,0 +1,106 @@ +#include "sql_highlighter.h" + +#include <library/cpp/json/json_reader.h> +#include <library/cpp/resource/resource.h> +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/string.h> +#include <util/string/join.h> +#include <util/string/escape.h> + +using namespace NSQLHighlight; + +struct TTest { + struct TCase { + TString Input; + TString Expected; + }; + + TString Name; + TVector<TCase> Cases; +}; + +TVector<TTest> LoadTestSuite() { + TString text; + Y_ENSURE(NResource::FindExact("suite.json", &text)); + auto json = NJson::ReadJsonFastTree(text).GetMapSafe(); + + TVector<TTest> tests; + for (auto& [k, v] : json) { + TVector<TTest::TCase> cases; + for (auto& c : v.GetArraySafe()) { + cases.emplace_back( + std::move(c[0].GetStringSafe()), + std::move(c[1].GetStringSafe())); + } + tests.emplace_back(std::move(k), std::move(cases)); + } + return tests; +} + +char ToChar(EUnitKind kind) { + switch (kind) { + case EUnitKind::Keyword: + return 'K'; + case EUnitKind::Punctuation: + return 'P'; + case EUnitKind::Identifier: + return 'I'; + case EUnitKind::QuotedIdentifier: + return 'Q'; + case EUnitKind::BindParamterIdentifier: + return 'B'; + case EUnitKind::TypeIdentifier: + return 'T'; + case EUnitKind::FunctionIdentifier: + return 'F'; + case EUnitKind::Literal: + return 'L'; + case EUnitKind::StringLiteral: + return 'S'; + case EUnitKind::Comment: + return 'C'; + case EUnitKind::Whitespace: + return '_'; + case EUnitKind::Error: + return 'E'; + } +} + +TString ToMask(const TVector<TToken>& tokens) { + TVector<TString> s; + for (const auto& t : tokens) { + s.emplace_back(TString(t.Length, ToChar(t.Kind))); + } + return JoinSeq("#", s); +} + +TString Mask(IHighlighter::TPtr& h, TStringBuf text) { + return ToMask(Tokenize(*h, text)); +} + +Y_UNIT_TEST_SUITE(SqlHighlighterTests) { + + Y_UNIT_TEST(Suite) { + auto h = MakeHighlighter(MakeHighlighting()); + size_t count = 0; + Cerr << "{" << Endl; + for (const auto& test : LoadTestSuite()) { + Cerr << " \"" << test.Name << "\": [" << Endl; + for (size_t i = 0; i < test.Cases.size(); ++i) { + const auto& check = test.Cases[i]; + const auto actual = Mask(h, check.Input); + Cerr << " [\"" << EscapeC(check.Input) << "\", \"" << actual << "\"]," << Endl; + UNIT_ASSERT_VALUES_EQUAL_C( + actual, + check.Expected, + test.Name << " #" << i << ": Input = '" << check.Input << "'"); + count += 1; + } + Cerr << " ]," << Endl; + } + Cerr << "}" << Endl; + Cerr << "Test Cases Executed: " << count << Endl; + } + +} // Y_UNIT_TEST_SUITE(SqlHighlighterTests) diff --git a/yql/essentials/sql/v1/highlight/ut/suite.json b/yql/essentials/sql/v1/highlight/ut/suite.json new file mode 100644 index 00000000000..04e961a6f91 --- /dev/null +++ b/yql/essentials/sql/v1/highlight/ut/suite.json @@ -0,0 +1,95 @@ +{ + "Invalid": [ + ["!", "E"], + ["й", "E#E"], + ["编", "E#E#E"], + ["\uD83D\uDE00", "E#E#E#E"], + ["!select", "E#KKKKKK"], + ["!SSelect", "E#IIIIIII"] + ], + "Operation": [ + ["(1 + 21 / 4)", "P#L#_#P#_#LL#_#P#_#L#P"], + ["(1+21/4)", "P#L#P#LL#P#L#P"] + ], + "ANSI": [ + ["--_ansi_lexer\n\n/*/**/*/", "CCCCCCCCCCCCCC#_#CCCCCC#P#P"], + ["--!ansi_lexer\n\n/*/**/*/", "CCCCCCCCCCCCCC#_#CCCCCCCC"], + ["--_ansi_lexer\n\n\"\\\"\"", "CCCCCCCCCCCCCC#_#SSSS"], + ["--!ansi_lexer\n\n\"\\\"\"", "CCCCCCCCCCCCCC#_#SSS#E"], + ["\n --!ansi_lexer\n\n/*/**/*/", 
"_#_#CCCCCCCCCCCCCC#_#CCCCCCCC"] + ], + "Number": [ + ["1234", "LLLL"], + ["-123", "P#LLL"], + ["SELECT 123l AS `Int64`, 0b01u AS `Uint32`, 0xFFul AS `Uint64`, 0o7ut AS `Uint8`, 456s AS `Int16`, 1.2345f AS `Float`;", "KKKKKK#_#LLLL#_#KK#_#QQQQQQQ#P#_#LLLLL#_#KK#_#QQQQQQQQ#P#_#LLLLLL#_#KK#_#QQQQQQQQ#P#_#LLLLL#_#KK#_#QQQQQQQ#P#_#LLLL#_#KK#_#QQQQQQQ#P#_#LLLLLLL#_#KK#_#QQQQQQQ#P"] + ], + "Comment": [ + ["- select", "P#_#KKKKKK"], + ["select -- select", "KKKKKK#_#CCCCCCCCC"], + ["-- select\nselect", "CCCCCCCCCC#KKKKKK"], + ["/* select */", "CCCCCCCCCCCC"], + ["select /* select */ select", "KKKKKK#_#CCCCCCCCCCCC#_#KKKKKK"], + ["/**/ --", "CCCC#_#CC"], + ["/*/**/*/", "CCCCCC#P#P"] + ], + "FunctionIdentifier": [ + ["MIN", "III"], + ["min", "III"], + ["MIN(123, 65)", "FFF#P#LLL#P#_#LL#P"], + ["minimum", "IIIIIII"], + ["MINimum", "IIIIIII"], + ["Math::Sin", "FFFFFFFFF"], + ["Math", "IIII"], + ["Math::", "IIII#PP"], + ["::Sin", "PP#III"] + ], + "SQL": [ + ["SELECT id, alias from users", "KKKKKK#_#II#P#_#IIIII#_#KKKK#_#IIIII"], + ["INSERT INTO users (id, alias) VALUES (12, \"tester\")", "KKKKKK#_#KKKK#_#IIIII#_#P#II#P#_#IIIII#P#_#KKKKKK#_#P#LL#P#_#SSSSSSSS#P"], + ["SELECT 123467, \"HeLLo, {name}!\", (1 + (5 * 1 / 0)), MIN(identifier) FROM `local/test/space/table` JOIN test;", "KKKKKK#_#LLLLLL#P#_#SSSSSSSSSSSSSSSS#P#_#P#L#_#P#_#P#L#_#P#_#L#_#P#_#L#P#P#P#_#FFF#P#IIIIIIIIII#P#_#KKKK#_#QQQQQQQQQQQQQQQQQQQQQQQQ#_#KKKK#_#IIII#P"], + ["SELECT Bool(phone) FROM customer", "KKKKKK#_#TTTT#P#IIIII#P#_#KKKK#_#IIIIIIII"] + ], + "TypeIdentifier": [ + ["Bool", "TTTT"], + ["Bool(value)", "TTTT#P#IIIII#P"] + ], + "Identifier": [ + ["test", "IIII"] + ], + "Keyword": [ + ["SELECT", "KKKKKK"], + ["select", "KKKKKK"], + ["ALTER", "KKKKK"], + ["GROUP BY", "KKKKK#_#KK"], + ["INSERT", "KKKKKK"] + ], + "String": [ + ["\"\"", "SS"], + ["\"test\"", "SSSSSS"], + ["\"", "E"], + ["\"\"\"", "SS#E"], + ["\"\\\"", "E#E#E"], + ["\"test select from", "E#IIII#_#KKKKKK#_#KKKK"], + ["\"\\\"\"", "SSSS"], + ["\"select\"select", "SSSSSSSSS#IIIII"], + ["\"select\"group", "SSSSSSSS#KKKKK"], + ["SELECT \"\uD83D\uDE00\" FROM test", "KKKKKK#_#SSSSSS#_#KKKK#_#IIII"] + ], + "Blank": [ + ["", ""], + [" ", "_"], + [" ", "_#_#_"], + ["\n", "_"], + ["\n\n", "_#_"], + ["\r\n", "_#_"], + ["\r", "_"], + ["\r\n\n", "_#_#_"], + ["\r\n\r\n", "_#_#_#_"] + ], + "QuotedIdentifier": [ + ["`/cluster/database`", "QQQQQQQQQQQQQQQQQQQ"], + ["`test`select", "QQQQQQ#KKKKKK"], + ["`/cluster", "E#P#IIIIIII"], + ["`\uD83D\uDE00`", "QQQQQQ"] + ] +} diff --git a/yql/essentials/sql/v1/highlight/ut/ya.make b/yql/essentials/sql/v1/highlight/ut/ya.make new file mode 100644 index 00000000000..81ddde8b8b0 --- /dev/null +++ b/yql/essentials/sql/v1/highlight/ut/ya.make @@ -0,0 +1,8 @@ +UNITTEST_FOR(yql/essentials/sql/v1/highlight) + +SRCS( + sql_highlight_json_ut.cpp + sql_highlighter_ut.cpp +) + +END() diff --git a/yql/essentials/sql/v1/highlight/ya.make b/yql/essentials/sql/v1/highlight/ya.make new file mode 100644 index 00000000000..a5e4ef7d6b6 --- /dev/null +++ b/yql/essentials/sql/v1/highlight/ya.make @@ -0,0 +1,20 @@ +LIBRARY() + +SRCS( + sql_highlight_json.cpp + sql_highlight.cpp + sql_highlighter.cpp +) + +PEERDIR( + yql/essentials/sql/v1/lexer/regex + yql/essentials/sql/v1/reflect +) + +RESOURCE(yql/essentials/sql/v1/highlight/ut/suite.json suite.json) + +END() + +RECURSE_FOR_TESTS( + ut +) diff --git a/yql/essentials/sql/v1/lexer/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/lexer_ut.cpp index c59089efd63..196ca68a8f1 100644 --- a/yql/essentials/sql/v1/lexer/lexer_ut.cpp +++ 
b/yql/essentials/sql/v1/lexer/lexer_ut.cpp @@ -308,6 +308,7 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { UNIT_ASSERT_TOKENIZED(lexer, "FROM", "FROM EOF"); UNIT_ASSERT_TOKENIZED(lexer, "from", "FROM(from) EOF"); UNIT_ASSERT_TOKENIZED(lexer, " UPSERT ", "WS( ) UPSERT WS( ) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "ERROR", "ERROR EOF"); } Y_UNIT_TEST_ON_EACH_LEXER(KeywordSkip) { @@ -356,6 +357,12 @@ Y_UNIT_TEST_SUITE(SQLv1Lexer) { UNIT_ASSERT_TOKENIZED(lexer, "123.45E10", "REAL(123.45E10) EOF"); UNIT_ASSERT_TOKENIZED(lexer, "123.45E+10", "REAL(123.45E+10) EOF"); UNIT_ASSERT_TOKENIZED(lexer, "1E+10", "REAL(1E+10) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "123l", "INTEGER_VALUE(123l) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "0b01u", "INTEGER_VALUE(0b01u) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "0xfful", "INTEGER_VALUE(0xfful) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "0o7ut", "INTEGER_VALUE(0o7ut) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "456s", "INTEGER_VALUE(456s) EOF"); + UNIT_ASSERT_TOKENIZED(lexer, "1.2345f", "REAL(1.2345f) EOF"); } Y_UNIT_TEST_ON_EACH_LEXER(SingleLineString) { diff --git a/yql/essentials/sql/v1/lexer/regex/generic.cpp b/yql/essentials/sql/v1/lexer/regex/generic.cpp new file mode 100644 index 00000000000..2a451b4ef5c --- /dev/null +++ b/yql/essentials/sql/v1/lexer/regex/generic.cpp @@ -0,0 +1,127 @@ +#include "generic.h" + +#include <contrib/libs/re2/re2/re2.h> + +namespace NSQLTranslationV1 { + + namespace { + + TMaybe<TStringBuf> Match(TStringBuf prefix, const RE2& regex) { + re2::StringPiece input(prefix.data(), prefix.size()); + if (RE2::Consume(&input, regex)) { + return TStringBuf(prefix.data(), input.data()); + } + return Nothing(); + } + + } // namespace + + class TGenericLexer: public IGenericLexer { + private: + static constexpr TStringBuf Utf8BOM = "\xEF\xBB\xBF"; + + public: + explicit TGenericLexer(TGenericLexerGrammar grammar) + : Grammar_(std::move(grammar)) + { + } + + virtual bool Tokenize( + TStringBuf text, + const TTokenCallback& onNext, + size_t maxErrors) const override { + Y_ENSURE(0 < maxErrors); + size_t errors = 0; + + size_t pos = 0; + if (text.StartsWith(Utf8BOM)) { + pos += Utf8BOM.size(); + } + + while (pos < text.size() && errors < maxErrors) { + TGenericToken matched = Match(TStringBuf(text, pos)); + matched.Begin = pos; + + pos += matched.Content.size(); + + if (matched.Name == TGenericToken::Error) { + errors += 1; + } + + onNext(std::move(matched)); + } + + if (errors == maxErrors) { + return false; + } + + onNext(TGenericToken{ + .Name = "EOF", + .Content = "<EOF>", + .Begin = pos, + }); + + return errors == 0; + } + + private: + TGenericToken Match(TStringBuf prefix) const { + TMaybe<TGenericToken> max; + Match(prefix, [&](TGenericToken&& token) { + if (max.Empty() || max->Content.size() < token.Content.size()) { + max = std::move(token); + } + }); + + if (max) { + return *max; + } + + return { + .Name = TGenericToken::Error, + .Content = prefix.substr(0, 1), + }; + } + + void Match(TStringBuf prefix, auto onMatch) const { + for (const auto& token : Grammar_) { + if (auto content = token.Match(prefix)) { + onMatch(TGenericToken{ + .Name = token.TokenName, + .Content = *content, + }); + } + } + } + + TGenericLexerGrammar Grammar_; + }; + + TTokenMatcher Compile(const TRegexPattern& regex) { + RE2::Options options; + options.set_case_sensitive(!regex.IsCaseInsensitive); + + return [bodyRe = MakeAtomicShared<RE2>(regex.Body, options), + afterRe = MakeAtomicShared<RE2>(regex.After, options)](TStringBuf prefix) -> TMaybe<TStringBuf> { + TMaybe<TStringBuf> body, after; + 
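+            // Emulates `body(?=after)`: first consume the body from the
+            // prefix, then require `after` to match right behind it; only
+            // the body is returned as the token content.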
if ((body = Match(prefix, *bodyRe)) && + (after = Match(prefix.Tail(body->size()), *afterRe))) { + return body; + } + return Nothing(); + }; + } + + IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar) { + return IGenericLexer::TPtr(new TGenericLexer(std::move(grammar))); + } + + TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text) { + TVector<TGenericToken> tokens; + lexer->Tokenize(text, [&](TGenericToken&& token) { + tokens.emplace_back(std::move(token)); + }); + return tokens; + } + +} // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/lexer/regex/generic.h b/yql/essentials/sql/v1/lexer/regex/generic.h new file mode 100644 index 00000000000..cde028cc599 --- /dev/null +++ b/yql/essentials/sql/v1/lexer/regex/generic.h @@ -0,0 +1,56 @@ +#pragma once + +#include <util/generic/ptr.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/generic/maybe.h> +#include <util/generic/ylimits.h> + +#include <functional> + +namespace NSQLTranslationV1 { + + struct TGenericToken { + static constexpr const char* Error = "<ERROR>"; + + TStringBuf Name; + TStringBuf Content; + size_t Begin = 0; // In bytes + }; + + class IGenericLexer: public TThrRefBase { + public: + using TPtr = TIntrusivePtr<IGenericLexer>; + using TTokenCallback = std::function<void(TGenericToken&& token)>; + + static constexpr size_t MaxErrorsLimit = Max<size_t>(); + + virtual ~IGenericLexer() = default; + virtual bool Tokenize( + TStringBuf text, + const TTokenCallback& onNext, + size_t maxErrors = IGenericLexer::MaxErrorsLimit) const = 0; + }; + + using TTokenMatcher = std::function<TMaybe<TStringBuf>(TStringBuf prefix)>; + + struct TTokenRule { + TString TokenName; + TTokenMatcher Match; + }; + + using TGenericLexerGrammar = TVector<TTokenRule>; + + struct TRegexPattern { + TString Body; + TString After = ""; + bool IsCaseInsensitive = false; + }; + + TTokenMatcher Compile(const TRegexPattern& regex); + + IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar); + + TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text); + +} // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.cpp b/yql/essentials/sql/v1/lexer/regex/lexer.cpp index a1d96253bf7..58c98edfd31 100644 --- a/yql/essentials/sql/v1/lexer/regex/lexer.cpp +++ b/yql/essentials/sql/v1/lexer/regex/lexer.cpp @@ -1,5 +1,6 @@ #include "lexer.h" +#include "generic.h" #include "regex.h" #include <contrib/libs/re2/re2/re2.h> @@ -9,256 +10,177 @@ #include <util/generic/algorithm.h> #include <util/generic/string.h> +#include <util/generic/maybe.h> #include <util/string/subst.h> #include <util/string/ascii.h> namespace NSQLTranslationV1 { + using NSQLReflect::TLexerGrammar; using NSQLTranslation::TParsedToken; using NSQLTranslation::TParsedTokenList; - class TRegexLexer: public NSQLTranslation::ILexer { - static constexpr const char* CommentTokenName = "COMMENT"; - static constexpr const char* StringValueName = "STRING_VALUE"; - - static constexpr const TStringBuf Utf8BOM = "\xEF\xBB\xBF"; - - public: - TRegexLexer( - bool ansi, - NSQLReflect::TLexerGrammar grammar, - const TVector<std::tuple<TString, TString>>& RegexByOtherName) - : Grammar_(std::move(grammar)) - , Ansi_(ansi) - { - for (const auto& [token, regex] : RegexByOtherName) { - RE2::Options custom; - if (token != CommentTokenName && token != StringValueName) { - custom.set_longest_match(true); - } + size_t MatchANSIMultilineComment(TStringBuf remaining); - RE2* re2 = new 
RE2(regex, custom); - if (token == CommentTokenName) { - CommentRegex_.Reset(re2); - } else { - OtherRegexes_.emplace_back(token, re2); - } + TTokenMatcher ANSICommentMatcher(TTokenMatcher defaultComment) { + return [defaultComment](TStringBuf prefix) -> TMaybe<TStringBuf> { + const auto basic = defaultComment(prefix); + if (basic.Empty()) { + return Nothing(); } - } - - bool Tokenize( - const TString& query, - const TString& queryName, - const TTokenCallback& onNextToken, - NYql::TIssues& issues, - size_t maxErrors) override { - size_t errors = 0; - size_t pos = 0; - if (query.StartsWith(Utf8BOM)) { - pos += Utf8BOM.size(); + if (!prefix.StartsWith("/*")) { + return basic; } - while (pos < query.size()) { - TParsedToken matched = Match(TStringBuf(query, pos)); - - if (matched.Name.empty() && maxErrors == errors) { - break; - } - - if (matched.Name.empty()) { - pos += 1; - errors += 1; - issues.AddIssue(NYql::TPosition(pos, 0, queryName), "no candidates"); - continue; - } + size_t ll1Length = MatchANSIMultilineComment(prefix); + TStringBuf ll1Content = prefix.SubString(0, ll1Length); - pos += matched.Content.length(); - onNextToken(std::move(matched)); + Y_ENSURE(ll1Content == 0 || basic <= ll1Content); + if (ll1Content == 0) { + return basic; } - onNextToken(TParsedToken{.Name = "EOF", .Content = "<EOF>"}); - return errors == 0; + return ll1Content; + }; + } + + size_t MatchANSIMultilineComment(TStringBuf prefix) { + if (!prefix.StartsWith("/*")) { + return 0; } - private: - TParsedToken Match(const TStringBuf prefix) { - TParsedTokenList matches; + size_t skipped = 0; - size_t keywordCount = MatchKeyword(prefix, matches); - MatchPunctuation(prefix, matches); - MatchRegex(prefix, matches); - MatchComment(prefix, matches); + prefix.Skip(2); + skipped += 2; - if (matches.empty()) { - return {}; + for (;;) { + if (prefix.StartsWith("*/")) { + prefix.Skip(2); + skipped += 2; + return skipped; } - auto maxLength = MaxElementBy(matches, [](const TParsedToken& m) { - return m.Content.length(); - })->Content.length(); - - auto max = FindIf(matches, [&](const TParsedToken& m) { - return m.Content.length() == maxLength; - }); - - auto isMatched = [&](const TStringBuf name) { - return std::end(matches) != FindIf(matches, [&](const auto& m) { - return m.Name == name; - }); - }; - - size_t conflicts = CountIf(matches, [&](const TParsedToken& m) { - return m.Content.length() == max->Content.length(); - }); - conflicts -= 1; - Y_ENSURE( - conflicts == 0 || - (conflicts == 1 && keywordCount != 0 && isMatched("ID_PLAIN")) || - (conflicts == 1 && isMatched("DIGITS") && isMatched("INTEGER_VALUE"))); - - Y_ENSURE(!max->Content.empty()); - return *max; - } - - bool MatchKeyword(const TStringBuf prefix, TParsedTokenList& matches) { - size_t count = 0; - for (const auto& keyword : Grammar_.KeywordNames) { - const TStringBuf block = NSQLReflect::TLexerGrammar::KeywordBlock(keyword); - const TStringBuf content = prefix.substr(0, block.length()); - if (AsciiEqualsIgnoreCase(content, block)) { - matches.emplace_back(keyword, TString(content)); - count += 1; + bool isSkipped = false; + if (prefix.StartsWith("/*")) { + size_t limit = prefix.rfind("*/"); + if (limit == std::string::npos) { + return 0; } - } - return count; - } - size_t MatchPunctuation(const TStringBuf prefix, TParsedTokenList& matches) { - size_t count = 0; - for (const auto& name : Grammar_.PunctuationNames) { - const auto& content = Grammar_.BlockByName.at(name); - if (prefix.substr(0, content.length()) == content) { - matches.emplace_back(name, 
content); - count += 1; - } - } - return count; - } + size_t len = MatchANSIMultilineComment(prefix.Head(limit)); + prefix.Skip(len); + skipped += len; - size_t MatchRegex(const TStringBuf prefix, TParsedTokenList& matches) { - size_t count = 0; - for (const auto& [token, regex] : OtherRegexes_) { - if (const TStringBuf match = TryMatchRegex(prefix, *regex); !match.empty()) { - matches.emplace_back(token, TString(match)); - count += 1; - } + isSkipped = len != 0; } - return count; - } - const TStringBuf TryMatchRegex(const TStringBuf prefix, const RE2& regex) { - re2::StringPiece input(prefix.data(), prefix.size()); - if (RE2::Consume(&input, regex)) { - return TStringBuf(prefix.data(), input.data()); + if (isSkipped) { + continue; } - return ""; - } - size_t MatchComment(const TStringBuf prefix, TParsedTokenList& matches) { - const TStringBuf reContent = TryMatchRegex(prefix, *CommentRegex_); - if (reContent.empty()) { + if (prefix.size() == 0) { return 0; } - if (!(Ansi_ && prefix.StartsWith("/*"))) { - matches.emplace_back(CommentTokenName, TString(reContent)); - return 1; - } - - size_t ll1Length = MatchANSIMultilineComment(prefix); - const TStringBuf ll1Content = prefix.SubString(0, ll1Length); - - Y_ENSURE(ll1Content == 0 || reContent <= ll1Content); - if (ll1Content == 0) { - matches.emplace_back(CommentTokenName, TString(reContent)); - return 1; - } - - matches.emplace_back(CommentTokenName, TString(ll1Content)); - return 1; + prefix.Skip(1); + skipped += 1; } + } - size_t MatchANSIMultilineComment(TStringBuf remaining) { - if (!remaining.StartsWith("/*")) { - return 0; - } + TGenericLexerGrammar MakeGenericLexerGrammar( + bool ansi, + const TLexerGrammar& grammar, + const TVector<std::tuple<TString, TString>>& regexByOtherName) { + TGenericLexerGrammar generic; - size_t skipped = 0; + for (const auto& name : grammar.KeywordNames) { + auto matcher = Compile({ + .Body = TString(TLexerGrammar::KeywordBlock(name)), + .IsCaseInsensitive = true, + }); + generic.emplace_back(name, std::move(matcher)); + } - remaining.Skip(2); - skipped += 2; + for (const auto& name : grammar.PunctuationNames) { + generic.emplace_back( + name, Compile({RE2::QuoteMeta(grammar.BlockByName.at(name))})); + } - for (;;) { - if (remaining.StartsWith("*/")) { - remaining.Skip(2); - skipped += 2; - return skipped; - } + for (const auto& [name, regex] : regexByOtherName) { + auto matcher = Compile({ + .Body = regex, + }); + generic.emplace_back(name, std::move(matcher)); + } - bool isSkipped = false; - if (remaining.StartsWith("/*")) { - size_t limit = remaining.rfind("*/"); - if (limit == std::string::npos) { - return 0; - } + if (ansi) { + auto it = FindIf(generic, [](const auto& m) { + return m.TokenName == "COMMENT"; + }); + Y_ENSURE(it != std::end(generic)); + it->Match = ANSICommentMatcher(it->Match); + } - size_t len = MatchANSIMultilineComment(remaining.Head(limit)); - remaining.Skip(len); - skipped += len; + return generic; + } - isSkipped = len != 0; - } + class TRegexLexer: public NSQLTranslation::ILexer { + public: + TRegexLexer(IGenericLexer::TPtr lexer) + : Lexer_(std::move(lexer)) + { + } - if (isSkipped) { - continue; + bool Tokenize( + const TString& query, + const TString& queryName, + const TTokenCallback& onNextToken, + NYql::TIssues& issues, + size_t maxErrors) override { + bool isFailed = false; + + const auto onNext = [&](TGenericToken&& token) { + if (token.Name == TGenericToken::Error) { + NYql::TPosition pos(token.Begin, 0, queryName); + TString message = TString("no candidates, skipping 
") + token.Content; + issues.AddIssue(std::move(pos), std::move(message)); + isFailed = true; + return; } - if (remaining.size() == 0) { - return 0; - } + onNextToken({ + .Name = TString(token.Name), + .Content = TString(token.Content), + }); + }; - remaining.Skip(1); - skipped += 1; - } + Lexer_->Tokenize(query, onNext, maxErrors); + return !isFailed; } - NSQLReflect::TLexerGrammar Grammar_; - TVector<std::tuple<TString, THolder<RE2>>> OtherRegexes_; - THolder<RE2> CommentRegex_; - bool Ansi_; + private: + IGenericLexer::TPtr Lexer_; }; namespace { class TFactory final: public NSQLTranslation::ILexerFactory { public: - explicit TFactory(bool ansi) - : Ansi_(ansi) - , Grammar_(NSQLReflect::LoadLexerGrammar()) - , RegexByOtherName_(MakeRegexByOtherName(Grammar_, Ansi_)) - { + explicit TFactory(bool ansi) { + auto grammar = NSQLReflect::LoadLexerGrammar(); + auto regexes = MakeRegexByOtherName(grammar, ansi); + Lexer_ = MakeGenericLexer(MakeGenericLexerGrammar(ansi, grammar, regexes)); } NSQLTranslation::ILexer::TPtr MakeLexer() const override { return NSQLTranslation::ILexer::TPtr( - new TRegexLexer(Ansi_, Grammar_, RegexByOtherName_)); + new TRegexLexer(Lexer_)); } private: - bool Ansi_; - NSQLReflect::TLexerGrammar Grammar_; - TVector<std::tuple<TString, TString>> RegexByOtherName_; + IGenericLexer::TPtr Lexer_; }; } // namespace diff --git a/yql/essentials/sql/v1/lexer/regex/lexer.h b/yql/essentials/sql/v1/lexer/regex/lexer.h index e9968954e1f..42d99a0a530 100644 --- a/yql/essentials/sql/v1/lexer/regex/lexer.h +++ b/yql/essentials/sql/v1/lexer/regex/lexer.h @@ -1,9 +1,13 @@ #pragma once +#include "generic.h" + #include <yql/essentials/parser/lexer_common/lexer.h> namespace NSQLTranslationV1 { + TTokenMatcher ANSICommentMatcher(TTokenMatcher defaultComment); + NSQLTranslation::TLexerFactoryPtr MakeRegexLexerFactory(bool ansi); } // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/lexer/regex/regex.cpp b/yql/essentials/sql/v1/lexer/regex/regex.cpp index e634ff009a7..3f8af88eb4c 100644 --- a/yql/essentials/sql/v1/lexer/regex/regex.cpp +++ b/yql/essentials/sql/v1/lexer/regex/regex.cpp @@ -45,6 +45,7 @@ namespace NSQLTranslationV1 { TString ToRegex(const TStringBuf name) { TString text = Grammar_->BlockByName.at(name); + Preprocess(text); Inline(text); Transform(text); Finalize(text); @@ -52,6 +53,10 @@ namespace NSQLTranslationV1 { } private: + void Preprocess(TString& text) { + text = ChangedDigitsPrecendence(std::move(text)); + } + void Inline(TString& text) { ApplyEachWhileChanging(text, Inliners_); } @@ -86,6 +91,8 @@ namespace NSQLTranslationV1 { Grammar_->PunctuationNames.contains(name) || PunctuationFragments.contains(name)) { def = "'" + def + "'"; + } else if (name == "DIGITS") { + def = ChangedDigitsPrecendence(std::move(def)); } def = QuoteAntlrRewrite(std::move(def)); @@ -95,6 +102,15 @@ namespace NSQLTranslationV1 { } } + // Regex engine matches the first matched alternative, + // even if it is not the longest one, while ANTLR is more gready. 
+ TString ChangedDigitsPrecendence(TString body) { + if (SubstGlobal(body, "DECDIGITS | ", "") != 0) { + SubstGlobal(body, "BINDIGITS", "BINDIGITS | DECDIGITS"); + } + return body; + } + void Transform(TString& text) { ApplyEachWhileChanging(text, Transformations_); } diff --git a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp index e62bb0e609f..8c7688aadcd 100644 --- a/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp +++ b/yql/essentials/sql/v1/lexer/regex/regex_ut.cpp @@ -62,7 +62,14 @@ Y_UNIT_TEST_SUITE(SqlRegexTests) { CheckRegex( /* ansi = */ false, "DIGITS", - R"(([0-9]+)|(0[xX]([0-9]|[a-f]|[A-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+))"); + R"((0[xX]([0-9]|[a-f]|[A-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+)|([0-9]+))"); + } + + Y_UNIT_TEST(IntegerValue) { + CheckRegex( + /* ansi = */ false, + "INTEGER_VALUE", + R"(((0[xX]([0-9]|[a-f]|[A-F])+)|(0[oO][0-8]+)|(0[bB](0|1)+)|([0-9]+))(([pP]|[uU])?([lL]|[sS]|[tT]|[iI]|[bB]|[nN])?))"); } Y_UNIT_TEST(Real) { diff --git a/yql/essentials/sql/v1/lexer/regex/ya.make b/yql/essentials/sql/v1/lexer/regex/ya.make index 249dfbd11df..3a7fe19b94c 100644 --- a/yql/essentials/sql/v1/lexer/regex/ya.make +++ b/yql/essentials/sql/v1/lexer/regex/ya.make @@ -28,6 +28,7 @@ CFLAGS( ) SRCS( + generic.cpp lexer.cpp regex.cpp ) diff --git a/yql/essentials/sql/v1/ya.make b/yql/essentials/sql/v1/ya.make index 9407914f976..0e89d150bf8 100644 --- a/yql/essentials/sql/v1/ya.make +++ b/yql/essentials/sql/v1/ya.make @@ -58,6 +58,7 @@ END() RECURSE( complete format + highlight lexer perf proto_parser diff --git a/yql/essentials/tools/yql_highlight/ya.make b/yql/essentials/tools/yql_highlight/ya.make new file mode 100644 index 00000000000..63c2e9eea5e --- /dev/null +++ b/yql/essentials/tools/yql_highlight/ya.make @@ -0,0 +1,17 @@ +IF (NOT EXPORT_CMAKE OR NOT OPENSOURCE OR OPENSOURCE_PROJECT != "yt") + +PROGRAM() + +PEERDIR( + library/cpp/getopt + yql/essentials/sql/v1/highlight +) + +SRCS( + yql_highlight.cpp +) + +END() + +ENDIF() + diff --git a/yql/essentials/tools/yql_highlight/yql_highlight b/yql/essentials/tools/yql_highlight/yql_highlight new file mode 100644 index 00000000000..b47885386e5 --- /dev/null +++ b/yql/essentials/tools/yql_highlight/yql_highlight @@ -0,0 +1 @@ +/home/vityaman/.ya/build/symres/1a48d14807d3c5183ba6864b369f928e/yql_highlight
\ No newline at end of file diff --git a/yql/essentials/tools/yql_highlight/yql_highlight.cpp b/yql/essentials/tools/yql_highlight/yql_highlight.cpp new file mode 100644 index 00000000000..01dd9efe343 --- /dev/null +++ b/yql/essentials/tools/yql_highlight/yql_highlight.cpp @@ -0,0 +1,79 @@ +#include <yql/essentials/sql/v1/highlight/sql_highlight_json.h> +#include <yql/essentials/sql/v1/highlight/sql_highlight.h> +#include <yql/essentials/sql/v1/highlight/sql_highlighter.h> + +#include <library/cpp/getopt/last_getopt.h> +#include <library/cpp/colorizer/colors.h> +#include <library/cpp/json/json_writer.h> + +#include <util/stream/input.h> + +using namespace NSQLHighlight; + +int RunGenerateJSON() { + THighlighting highlighting = MakeHighlighting(); + NJson::TJsonValue json = ToJson(highlighting); + NJson::WriteJson(&Cout, &json, /* formatOutput = */ true); + return 0; +} + +int RunHighlighter() { + THashMap<EUnitKind, NColorizer::EAnsiCode> ColorByKind = { + {EUnitKind::Keyword, NColorizer::BLUE}, + {EUnitKind::Punctuation, NColorizer::DARK_WHITE}, + {EUnitKind::QuotedIdentifier, NColorizer::DARK_CYAN}, + {EUnitKind::BindParamterIdentifier, NColorizer::YELLOW}, + {EUnitKind::TypeIdentifier, NColorizer::GREEN}, + {EUnitKind::FunctionIdentifier, NColorizer::MAGENTA}, + {EUnitKind::Identifier, NColorizer::DEFAULT}, + {EUnitKind::Literal, NColorizer::LIGHT_GREEN}, + {EUnitKind::StringLiteral, NColorizer::DARK_RED}, + {EUnitKind::Comment, NColorizer::DARK_GREEN}, + {EUnitKind::Whitespace, NColorizer::DEFAULT}, + {EUnitKind::Error, NColorizer::RED}, + }; + + TString query = Cin.ReadAll(); + + THighlighting highlighting = MakeHighlighting(); + IHighlighter::TPtr highlighter = MakeHighlighter(highlighting); + TVector<TToken> tokens = Tokenize(*highlighter, query); + + for (auto& token : tokens) { + TStringBuf content = TStringBuf(query).SubString(token.Begin, token.Length); + Cout << ColorByKind[token.Kind] << content << NColorizer::RESET; + } + + return 0; +} + +int Run(int argc, char* argv[]) { + TString target; + + NLastGetopt::TOpts opts = NLastGetopt::TOpts::Default(); + opts.AddLongOption('g', "generate", "generate a highlighting configuration") + .RequiredArgument("target") + .Choices({"json"}) + .StoreResult(&target); + opts.SetFreeArgsNum(0); + opts.AddHelpOption(); + + NLastGetopt::TOptsParseResult res(&opts, argc, argv); + if (res.Has("generate")) { + if (target == "json") { + return RunGenerateJSON(); + } + Y_ABORT(); + } + return RunHighlighter(); +} + +int main(int argc, char* argv[]) try { + return Run(argc, argv); +} catch (const yexception& e) { + Cerr << "Caught exception:" << e.what() << Endl; + return 1; +} catch (...) { + Cerr << CurrentExceptionMessage() << Endl; + return 1; +} |