diff options
author | vityaman <vityaman.dev@yandex.ru> | 2025-04-22 19:01:16 +0300 |
---|---|---|
committer | robot-piglet <robot-piglet@yandex-team.com> | 2025-04-22 19:51:08 +0300 |
commit | b81087bb12f4cea90b4830dfae7f1808cc75b61f (patch) | |
tree | 2bdee67b6abaddfc0fb3f6554f37b77ae8063058 | |
parent | 60665d3830d1f5e3252c2b7f5bb4db32cadadfc1 (diff) | |
download | ydb-b81087bb12f4cea90b4830dfae7f1808cc75b61f.tar.gz |
YQL-19747 Normalize names for ranking and filtering
I did not bother to search for the most frequently used name among those equivalent under the relation `(a ~ b) iff (NormalizeName(a) = NormalizeName(b))`, because the names we receive from the JSON files appear to be canonicalized and are therefore already in the style preferred by the YQL language designers. However, because of duplicates in `statements_opensource.json` we have, for example, both `IGNORETYPEV3` and `IGNORE_TYPE_V3` in the candidate list. I think we should simply remove `IGNORETYPEV3` from the JSON.
---
- Related to https://github.com/ydb-platform/ydb/issues/9056
- Related to https://github.com/vityaman/ydb/issues/21
---
Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1229
commit_hash:fe73374ae27df1fcacb0adccda930ec98ed1d7a6
10 files changed, 183 insertions, 22 deletions
diff --git a/yql/essentials/sql/v1/complete/name/static/frequency.cpp b/yql/essentials/sql/v1/complete/name/static/frequency.cpp index 62997ccff7a..456a9feebcd 100644 --- a/yql/essentials/sql/v1/complete/name/static/frequency.cpp +++ b/yql/essentials/sql/v1/complete/name/static/frequency.cpp @@ -1,5 +1,7 @@ #include "frequency.h" +#include "name_index.h" + #include <library/cpp/json/json_reader.h> #include <library/cpp/resource/resource.h> @@ -54,7 +56,7 @@ namespace NSQLComplete { } }; - TFrequencyData Convert(TVector<TFrequencyItem> items) { + TFrequencyData Convert(TVector<TFrequencyItem> items, auto normalize) { TFrequencyData data; for (auto& item : items) { if (item.Parent == Json.Parent.Pragma || @@ -65,7 +67,7 @@ namespace NSQLComplete { item.Parent == Json.Parent.Module || item.Parent == Json.Parent.ReadHint || item.Parent == Json.Parent.InsertHint) { - item.Rule = ToLowerUTF8(item.Rule); + item.Rule = normalize(item.Rule); } if (item.Parent == Json.Parent.Pragma) { @@ -89,14 +91,24 @@ namespace NSQLComplete { return data; } + TFrequencyData ParseJsonFrequencyData(const TStringBuf text, auto normalize) { + return Convert(TFrequencyItem::ParseListFromJsonText(text), normalize); + } + TFrequencyData ParseJsonFrequencyData(const TStringBuf text) { - return Convert(TFrequencyItem::ParseListFromJsonText(text)); + return ParseJsonFrequencyData(text, NormalizeName); } TFrequencyData LoadFrequencyData() { TString text; Y_ENSURE(NResource::FindExact("rules_corr_basic.json", &text)); - return ParseJsonFrequencyData(text); + return ParseJsonFrequencyData(text, NormalizeName); + } + + TFrequencyData LoadFrequencyDataForPrunning() { + TString text; + Y_ENSURE(NResource::FindExact("rules_corr_basic.json", &text)); + return ParseJsonFrequencyData(text, UnchangedName); } } // namespace NSQLComplete diff --git a/yql/essentials/sql/v1/complete/name/static/frequency.h b/yql/essentials/sql/v1/complete/name/static/frequency.h index 6925c99fa5e..0a5dc36dfbd 100644 --- 
a/yql/essentials/sql/v1/complete/name/static/frequency.h +++ b/yql/essentials/sql/v1/complete/name/static/frequency.h @@ -17,4 +17,6 @@ namespace NSQLComplete { TFrequencyData LoadFrequencyData(); + TFrequencyData LoadFrequencyDataForPrunning(); + } // namespace NSQLComplete diff --git a/yql/essentials/sql/v1/complete/name/static/frequency_ut.cpp b/yql/essentials/sql/v1/complete/name/static/frequency_ut.cpp index 8f7eafed2ea..c630f0ca987 100644 --- a/yql/essentials/sql/v1/complete/name/static/frequency_ut.cpp +++ b/yql/essentials/sql/v1/complete/name/static/frequency_ut.cpp @@ -31,7 +31,7 @@ Y_UNIT_TEST_SUITE(FrequencyTests) { }, .Hints = { {"columns", 826110}, - {"column_groups", 225}, + {"columngroups", 225}, }, }; diff --git a/yql/essentials/sql/v1/complete/name/static/json_name_set.cpp b/yql/essentials/sql/v1/complete/name/static/json_name_set.cpp index bc522fd674c..a30f62aac20 100644 --- a/yql/essentials/sql/v1/complete/name/static/json_name_set.cpp +++ b/yql/essentials/sql/v1/complete/name/static/json_name_set.cpp @@ -1,5 +1,8 @@ #include "name_service.h" +#include "frequency.h" +#include "name_index.h" + #include <library/cpp/json/json_reader.h> #include <library/cpp/resource/resource.h> @@ -78,15 +81,52 @@ namespace NSQLComplete { return hints; } + TVector<TString> Pruned(TVector<TString> names, const THashMap<TString, size_t>& frequency) { + THashMap<TString, TVector<std::tuple<TString, size_t>>> groups; + + for (auto& [normalized, original] : BuildNameIndex(std::move(names), NormalizeName)) { + size_t freq = 0; + if (const size_t* it = frequency.FindPtr(original)) { + freq = *it; + } + groups[normalized].emplace_back(std::move(original), freq); + } + + for (auto& [_, group] : groups) { + Sort(group, [](const auto& lhs, const auto& rhs) { + return std::get<1>(lhs) < std::get<1>(rhs); + }); + } + + names = TVector<TString>(); + names.reserve(groups.size()); + for (auto& [_, group] : groups) { + Y_ASSERT(!group.empty()); + 
names.emplace_back(std::move(std::get<0>(group.back()))); + } + return names; + } + + NameSet Pruned(NameSet names) { + auto frequency = LoadFrequencyDataForPrunning(); + names.Pragmas = Pruned(std::move(names.Pragmas), frequency.Pragmas); + names.Types = Pruned(std::move(names.Types), frequency.Types); + names.Functions = Pruned(std::move(names.Functions), frequency.Functions); + for (auto& [k, h] : names.Hints) { + h = Pruned(h, frequency.Hints); + } + return names; + } + NameSet MakeDefaultNameSet() { - return { + return Pruned({ .Pragmas = ParsePragmas(LoadJsonResource("pragmas_opensource.json")), .Types = ParseTypes(LoadJsonResource("types.json")), .Functions = Merge( ParseFunctions(LoadJsonResource("sql_functions.json")), ParseUdfs(LoadJsonResource("udfs_basic.json"))), .Hints = ParseHints(LoadJsonResource("statements_opensource.json")), - }; + }); } } // namespace NSQLComplete diff --git a/yql/essentials/sql/v1/complete/name/static/name_index.cpp b/yql/essentials/sql/v1/complete/name/static/name_index.cpp new file mode 100644 index 00000000000..bfbf6af7fb4 --- /dev/null +++ b/yql/essentials/sql/v1/complete/name/static/name_index.cpp @@ -0,0 +1,21 @@ +#include "name_index.h" + +#include <yql/essentials/core/sql_types/normalize_name.h> + +#include <util/charset/utf8.h> + +namespace NSQLComplete { + + TString NormalizeName(const TString& name) { + return NYql::NormalizeName(name); + } + + TString LowerizeName(const TString& name) { + return ToLowerUTF8(name); + } + + TString UnchangedName(const TString& name) { + return name; + } + +} // namespace NSQLComplete diff --git a/yql/essentials/sql/v1/complete/name/static/name_index.h b/yql/essentials/sql/v1/complete/name/static/name_index.h new file mode 100644 index 00000000000..77b50238846 --- /dev/null +++ b/yql/essentials/sql/v1/complete/name/static/name_index.h @@ -0,0 +1,48 @@ +#pragma once + +#include <yql/essentials/sql/v1/complete/text/case.h> + +#include <util/generic/string.h> +#include 
<util/generic/vector.h> +#include <util/generic/algorithm.h> + +namespace NSQLComplete { + + struct TNameIndexEntry { + TString Normalized; + TString Original; + }; + + using TNameIndex = TVector<TNameIndexEntry>; + + inline bool NameIndexCompare(const TNameIndexEntry& lhs, const TNameIndexEntry& rhs) { + return NoCaseCompare(lhs.Normalized, rhs.Normalized); + } + + inline auto NameIndexCompareLimit(size_t limit) { + return [cmp = NoCaseCompareLimit(limit)](const TNameIndexEntry& lhs, const TNameIndexEntry& rhs) { + return cmp(lhs.Normalized, rhs.Normalized); + }; + } + + TNameIndex BuildNameIndex(TVector<TString> originals, auto normalize) { + TNameIndex index; + for (auto& original : originals) { + TNameIndexEntry entry = { + .Normalized = normalize(original), + .Original = std::move(original), + }; + index.emplace_back(std::move(entry)); + } + + Sort(index, NameIndexCompare); + return index; + } + + TString NormalizeName(const TString& name); + + TString LowerizeName(const TString& name); + + TString UnchangedName(const TString& name); + +} // namespace NSQLComplete diff --git a/yql/essentials/sql/v1/complete/name/static/name_service.cpp b/yql/essentials/sql/v1/complete/name/static/name_service.cpp index 3fd33102d61..201f096bd9e 100644 --- a/yql/essentials/sql/v1/complete/name/static/name_service.cpp +++ b/yql/essentials/sql/v1/complete/name/static/name_service.cpp @@ -1,11 +1,29 @@ #include "name_service.h" +#include "name_index.h" #include "ranking.h" #include <yql/essentials/sql/v1/complete/text/case.h> namespace NSQLComplete { + const TVector<TStringBuf> FilteredByPrefix(const TString& prefix, const TNameIndex& index Y_LIFETIME_BOUND) { + TNameIndexEntry normalized = { + .Normalized = NormalizeName(prefix), + .Original = "", + }; + + auto range = std::ranges::equal_range( + std::begin(index), std::end(index), + normalized, NameIndexCompareLimit(normalized.Normalized.size())); + + TVector<TStringBuf> filtered; + for (const TNameIndexEntry& entry : range) { + 
filtered.emplace_back(TStringBuf(entry.Original)); + } + return filtered; + } + const TVector<TStringBuf> FilteredByPrefix( const TString& prefix, const TVector<TString>& sorted Y_LIFETIME_BOUND) { @@ -55,15 +73,18 @@ namespace NSQLComplete { class TStaticNameService: public INameService { public: explicit TStaticNameService(NameSet names, IRanking::TPtr ranking) - : NameSet_(std::move(names)) + : Pragmas_(BuildNameIndex(std::move(names.Pragmas), NormalizeName)) + , Types_(BuildNameIndex(std::move(names.Types), NormalizeName)) + , Functions_(BuildNameIndex(std::move(names.Functions), NormalizeName)) + , Hints_([hints = std::move(names.Hints)] { + THashMap<EStatementKind, TNameIndex> index; + for (auto& [k, hints] : hints) { + index.emplace(k, BuildNameIndex(std::move(hints), NormalizeName)); + } + return index; + }()) , Ranking_(std::move(ranking)) { - Sort(NameSet_.Pragmas, NoCaseCompare); - Sort(NameSet_.Types, NoCaseCompare); - Sort(NameSet_.Functions, NoCaseCompare); - for (auto& [_, hints] : NameSet_.Hints) { - Sort(hints, NoCaseCompare); - } } TFuture<TNameResponse> Lookup(TNameRequest request) override { @@ -76,19 +97,19 @@ namespace NSQLComplete { if (request.Constraints.Pragma) { auto prefix = Prefixed(request.Prefix, ".", *request.Constraints.Pragma); - auto names = FilteredByPrefix(prefix, NameSet_.Pragmas); + auto names = FilteredByPrefix(prefix, Pragmas_); AppendAs<TPragmaName>(response.RankedNames, names); } if (request.Constraints.Type) { AppendAs<TTypeName>( response.RankedNames, - FilteredByPrefix(request.Prefix, NameSet_.Types)); + FilteredByPrefix(request.Prefix, Types_)); } if (request.Constraints.Function) { auto prefix = Prefixed(request.Prefix, "::", *request.Constraints.Function); - auto names = FilteredByPrefix(prefix, NameSet_.Functions); + auto names = FilteredByPrefix(prefix, Functions_); AppendAs<TFunctionName>(response.RankedNames, names); } @@ -96,7 +117,7 @@ namespace NSQLComplete { const auto stmt = 
request.Constraints.Hint->Statement; AppendAs<THintName>( response.RankedNames, - FilteredByPrefix(request.Prefix, NameSet_.Hints[stmt])); + FilteredByPrefix(request.Prefix, Hints_[stmt])); } Ranking_->CropToSortedPrefix(response.RankedNames, request.Limit); @@ -109,7 +130,10 @@ namespace NSQLComplete { } private: - NameSet NameSet_; + TNameIndex Pragmas_; + TNameIndex Types_; + TNameIndex Functions_; + THashMap<EStatementKind, TNameIndex> Hints_; IRanking::TPtr Ranking_; }; diff --git a/yql/essentials/sql/v1/complete/name/static/ranking.cpp b/yql/essentials/sql/v1/complete/name/static/ranking.cpp index ee1cbef08f5..aa08bd7a639 100644 --- a/yql/essentials/sql/v1/complete/name/static/ranking.cpp +++ b/yql/essentials/sql/v1/complete/name/static/ranking.cpp @@ -4,6 +4,8 @@ #include <yql/essentials/sql/v1/complete/name/name_service.h> +#include <yql/essentials/core/sql_types/normalize_name.h> + #include <util/charset/utf8.h> namespace NSQLComplete { @@ -57,7 +59,7 @@ namespace NSQLComplete { return std::visit([this](const auto& name) -> size_t { using T = std::decay_t<decltype(name)>; - auto content = ToLowerUTF8(ContentView(name)); + auto content = NYql::NormalizeName(ContentView(name)); if constexpr (std::is_same_v<T, TKeyword>) { if (auto weight = Frequency_.Keywords.FindPtr(content)) { diff --git a/yql/essentials/sql/v1/complete/name/static/ya.make b/yql/essentials/sql/v1/complete/name/static/ya.make index 155c0926399..1315d7475da 100644 --- a/yql/essentials/sql/v1/complete/name/static/ya.make +++ b/yql/essentials/sql/v1/complete/name/static/ya.make @@ -3,11 +3,13 @@ LIBRARY() SRCS( frequency.cpp json_name_set.cpp + name_index.cpp name_service.cpp ranking.cpp ) PEERDIR( + yql/essentials/core/sql_types yql/essentials/sql/v1/complete/name yql/essentials/sql/v1/complete/text ) diff --git a/yql/essentials/sql/v1/complete/sql_complete_ut.cpp b/yql/essentials/sql/v1/complete/sql_complete_ut.cpp index a0681b1888f..d14d7b85442 100644 --- 
a/yql/essentials/sql/v1/complete/sql_complete_ut.cpp +++ b/yql/essentials/sql/v1/complete/sql_complete_ut.cpp @@ -605,7 +605,6 @@ Y_UNIT_TEST_SUITE(SqlCompleteTests) { { TVector<TCandidate> expected = { {HintName, "IGNORE_TYPE_V3"}, - {HintName, "IGNORETYPEV3"}, }; UNIT_ASSERT_VALUES_EQUAL(Complete(engine, {"REDUCE a WITH ig"}), expected); } @@ -642,6 +641,17 @@ Y_UNIT_TEST_SUITE(SqlCompleteTests) { UNIT_ASSERT_GE(Complete(engine, {"SELECT "}).size(), 55); } + Y_UNIT_TEST(NameNormalization) { + auto set = MakeDefaultNameSet(); + auto service = MakeStaticNameService(std::move(set), MakeDefaultRanking()); + auto engine = MakeSqlCompletionEngine(MakePureLexerSupplier(), std::move(service)); + + TVector<TCandidate> expected = { + {HintName, "IGNORE_TYPE_V3"}, + }; + UNIT_ASSERT_VALUES_EQUAL(Complete(engine, {"REDUCE a WITH ignoret"}), expected); + } + Y_UNIT_TEST(Ranking) { TFrequencyData frequency = { .Keywords = { @@ -715,7 +725,7 @@ Y_UNIT_TEST_SUITE(SqlCompleteTests) { {HintName, "XLOCK"}, {HintName, "UNORDERED"}, {Keyword, "COLUMNS"}, - {HintName, "FORCEINFERSCHEMA"}, + {HintName, "FORCE_INFER_SCHEMA"}, }; UNIT_ASSERT_VALUES_EQUAL(CompleteTop(expected.size(), engine, {"SELECT * FROM a WITH "}), expected); } |