aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorvityaman <vityaman.dev@yandex.ru>2025-04-22 19:01:16 +0300
committerrobot-piglet <robot-piglet@yandex-team.com>2025-04-22 19:51:08 +0300
commitb81087bb12f4cea90b4830dfae7f1808cc75b61f (patch)
tree2bdee67b6abaddfc0fb3f6554f37b77ae8063058
parent60665d3830d1f5e3252c2b7f5bb4db32cadadfc1 (diff)
downloadydb-b81087bb12f4cea90b4830dfae7f1808cc75b61f.tar.gz
YQL-19747 Normalize names for ranking and filtering
I was too lazy to search for the most frequently used name among those equivalent under the relation `(a ~ b) iff (NormalizeName(a) = NormalizeName(b))`, because it seems that the names we receive from the JSONs are canonized and therefore already in the style preferred by the YQL language designers. However, because of duplicates in `statements_opensource.json`, we have, for example, both `IGNORETYPEV3` and `IGNORE_TYPE_V3` in the candidates list. I think that we should just remove `IGNORETYPEV3` from the JSON. --- - Related to https://github.com/ydb-platform/ydb/issues/9056 - Related to https://github.com/vityaman/ydb/issues/21 --- Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1229 commit_hash:fe73374ae27df1fcacb0adccda930ec98ed1d7a6
-rw-r--r--yql/essentials/sql/v1/complete/name/static/frequency.cpp20
-rw-r--r--yql/essentials/sql/v1/complete/name/static/frequency.h2
-rw-r--r--yql/essentials/sql/v1/complete/name/static/frequency_ut.cpp2
-rw-r--r--yql/essentials/sql/v1/complete/name/static/json_name_set.cpp44
-rw-r--r--yql/essentials/sql/v1/complete/name/static/name_index.cpp21
-rw-r--r--yql/essentials/sql/v1/complete/name/static/name_index.h48
-rw-r--r--yql/essentials/sql/v1/complete/name/static/name_service.cpp48
-rw-r--r--yql/essentials/sql/v1/complete/name/static/ranking.cpp4
-rw-r--r--yql/essentials/sql/v1/complete/name/static/ya.make2
-rw-r--r--yql/essentials/sql/v1/complete/sql_complete_ut.cpp14
10 files changed, 183 insertions, 22 deletions
diff --git a/yql/essentials/sql/v1/complete/name/static/frequency.cpp b/yql/essentials/sql/v1/complete/name/static/frequency.cpp
index 62997ccff7a..456a9feebcd 100644
--- a/yql/essentials/sql/v1/complete/name/static/frequency.cpp
+++ b/yql/essentials/sql/v1/complete/name/static/frequency.cpp
@@ -1,5 +1,7 @@
#include "frequency.h"
+#include "name_index.h"
+
#include <library/cpp/json/json_reader.h>
#include <library/cpp/resource/resource.h>
@@ -54,7 +56,7 @@ namespace NSQLComplete {
}
};
- TFrequencyData Convert(TVector<TFrequencyItem> items) {
+ TFrequencyData Convert(TVector<TFrequencyItem> items, auto normalize) {
TFrequencyData data;
for (auto& item : items) {
if (item.Parent == Json.Parent.Pragma ||
@@ -65,7 +67,7 @@ namespace NSQLComplete {
item.Parent == Json.Parent.Module ||
item.Parent == Json.Parent.ReadHint ||
item.Parent == Json.Parent.InsertHint) {
- item.Rule = ToLowerUTF8(item.Rule);
+ item.Rule = normalize(item.Rule);
}
if (item.Parent == Json.Parent.Pragma) {
@@ -89,14 +91,24 @@ namespace NSQLComplete {
return data;
}
+ // Parses frequency data from JSON text and hands `normalize` on to Convert,
+ // which applies it to the rule names of the item kinds it rewrites.
+ // `normalize` is any callable taking a rule name and returning the name to store.
+ TFrequencyData ParseJsonFrequencyData(const TStringBuf text, auto normalize) {
+ return Convert(TFrequencyItem::ParseListFromJsonText(text), normalize);
+ }
+
TFrequencyData ParseJsonFrequencyData(const TStringBuf text) {
- return Convert(TFrequencyItem::ParseListFromJsonText(text));
+ return ParseJsonFrequencyData(text, NormalizeName);
}
TFrequencyData LoadFrequencyData() {
TString text;
Y_ENSURE(NResource::FindExact("rules_corr_basic.json", &text));
- return ParseJsonFrequencyData(text);
+ return ParseJsonFrequencyData(text, NormalizeName);
+ }
+
+ // Loads the "rules_corr_basic.json" resource and parses it with UnchangedName,
+ // so rule names keep the spelling found in the JSON. LoadFrequencyData reads the
+ // same resource but parses it with NormalizeName instead.
+ // Y_ENSURE fails if the resource cannot be found.
+ // NOTE(review): "Prunning" is a misspelling of "Pruning"; the name is part of the
+ // public header, so renaming it has to be done together with all callers.
+ TFrequencyData LoadFrequencyDataForPrunning() {
+ TString text;
+ Y_ENSURE(NResource::FindExact("rules_corr_basic.json", &text));
+ return ParseJsonFrequencyData(text, UnchangedName);
}
} // namespace NSQLComplete
diff --git a/yql/essentials/sql/v1/complete/name/static/frequency.h b/yql/essentials/sql/v1/complete/name/static/frequency.h
index 6925c99fa5e..0a5dc36dfbd 100644
--- a/yql/essentials/sql/v1/complete/name/static/frequency.h
+++ b/yql/essentials/sql/v1/complete/name/static/frequency.h
@@ -17,4 +17,6 @@ namespace NSQLComplete {
TFrequencyData LoadFrequencyData();
+ TFrequencyData LoadFrequencyDataForPrunning();
+
} // namespace NSQLComplete
diff --git a/yql/essentials/sql/v1/complete/name/static/frequency_ut.cpp b/yql/essentials/sql/v1/complete/name/static/frequency_ut.cpp
index 8f7eafed2ea..c630f0ca987 100644
--- a/yql/essentials/sql/v1/complete/name/static/frequency_ut.cpp
+++ b/yql/essentials/sql/v1/complete/name/static/frequency_ut.cpp
@@ -31,7 +31,7 @@ Y_UNIT_TEST_SUITE(FrequencyTests) {
},
.Hints = {
{"columns", 826110},
- {"column_groups", 225},
+ {"columngroups", 225},
},
};
diff --git a/yql/essentials/sql/v1/complete/name/static/json_name_set.cpp b/yql/essentials/sql/v1/complete/name/static/json_name_set.cpp
index bc522fd674c..a30f62aac20 100644
--- a/yql/essentials/sql/v1/complete/name/static/json_name_set.cpp
+++ b/yql/essentials/sql/v1/complete/name/static/json_name_set.cpp
@@ -1,5 +1,8 @@
#include "name_service.h"
+#include "frequency.h"
+#include "name_index.h"
+
#include <library/cpp/json/json_reader.h>
#include <library/cpp/resource/resource.h>
@@ -78,15 +81,52 @@ namespace NSQLComplete {
return hints;
}
+ // Collapses names that are equal after NormalizeName into one representative.
+ //
+ // For every group of equivalent names, the spelling with the highest value in
+ // `frequency` is kept. `frequency` is looked up by the original, non-normalized
+ // spelling; a name that is absent from it counts as 0. When several spellings
+ // share the highest frequency, the one that comes last in the name index wins.
+ //
+ // The returned names follow the iteration order of a hash map, so callers must
+ // not rely on their order.
+ TVector<TString> Pruned(TVector<TString> names, const THashMap<TString, size_t>& frequency) {
+     struct TBestSpelling {
+         TString Name;
+         size_t Frequency = 0;
+     };
+
+     // One slot per normalized name. A single pass keeps the best spelling, so
+     // there is no need to collect each group and sort it just to read its maximum.
+     THashMap<TString, TBestSpelling> best;
+     for (auto& [normalized, original] : BuildNameIndex(std::move(names), NormalizeName)) {
+         const size_t* weight = frequency.FindPtr(original);
+         const size_t freq = (weight != nullptr) ? *weight : 0;
+
+         auto slot = best.find(normalized);
+         if (slot == best.end()) {
+             best.emplace(std::move(normalized), TBestSpelling{std::move(original), freq});
+         } else if (slot->second.Frequency <= freq) {
+             // `<=` lets a later spelling replace an equally frequent earlier one.
+             slot->second = TBestSpelling{std::move(original), freq};
+         }
+     }
+
+     TVector<TString> pruned;
+     pruned.reserve(best.size());
+     for (auto& [_, spelling] : best) {
+         pruned.emplace_back(std::move(spelling.Name));
+     }
+     return pruned;
+ }
+
+ // Prunes every category of `names` (pragmas, types, functions and the hint
+ // list of each statement) with the per-list Pruned overload, using the
+ // frequencies returned by LoadFrequencyDataForPrunning.
+ NameSet Pruned(NameSet names) {
+     const auto frequency = LoadFrequencyDataForPrunning();
+     names.Pragmas = Pruned(std::move(names.Pragmas), frequency.Pragmas);
+     names.Types = Pruned(std::move(names.Types), frequency.Types);
+     names.Functions = Pruned(std::move(names.Functions), frequency.Functions);
+     for (auto& [_, hints] : names.Hints) {
+         // Move the list in: the callee takes it by value, so passing the
+         // lvalue would copy every hint name before replacing the list.
+         hints = Pruned(std::move(hints), frequency.Hints);
+     }
+     return names;
+ }
+
NameSet MakeDefaultNameSet() {
- return {
+ return Pruned({
.Pragmas = ParsePragmas(LoadJsonResource("pragmas_opensource.json")),
.Types = ParseTypes(LoadJsonResource("types.json")),
.Functions = Merge(
ParseFunctions(LoadJsonResource("sql_functions.json")),
ParseUdfs(LoadJsonResource("udfs_basic.json"))),
.Hints = ParseHints(LoadJsonResource("statements_opensource.json")),
- };
+ });
}
} // namespace NSQLComplete
diff --git a/yql/essentials/sql/v1/complete/name/static/name_index.cpp b/yql/essentials/sql/v1/complete/name/static/name_index.cpp
new file mode 100644
index 00000000000..bfbf6af7fb4
--- /dev/null
+++ b/yql/essentials/sql/v1/complete/name/static/name_index.cpp
@@ -0,0 +1,21 @@
+#include "name_index.h"
+
+#include <yql/essentials/core/sql_types/normalize_name.h>
+
+#include <util/charset/utf8.h>
+
+namespace NSQLComplete {
+
+ // Forwards to NYql::NormalizeName.
+ // NOTE(review): the exact rules live in yql/essentials/core/sql_types; judging by
+ // the updated test expectation ("column_groups" -> "columngroups") underscores
+ // are dropped - confirm against that implementation.
+ TString NormalizeName(const TString& name) {
+ return NYql::NormalizeName(name);
+ }
+
+ // Returns `name` converted with ToLowerUTF8.
+ TString LowerizeName(const TString& name) {
+ return ToLowerUTF8(name);
+ }
+
+ // Returns `name` as is; usable wherever a normalizer callable is required
+ // but the spelling must be kept.
+ TString UnchangedName(const TString& name) {
+ return name;
+ }
+
+} // namespace NSQLComplete
diff --git a/yql/essentials/sql/v1/complete/name/static/name_index.h b/yql/essentials/sql/v1/complete/name/static/name_index.h
new file mode 100644
index 00000000000..77b50238846
--- /dev/null
+++ b/yql/essentials/sql/v1/complete/name/static/name_index.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include <yql/essentials/sql/v1/complete/text/case.h>
+
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/generic/algorithm.h>
+
+namespace NSQLComplete {
+
+ // One entry of a name index: a name together with its normalized form.
+ struct TNameIndexEntry {
+ // Result of the normalizer passed to BuildNameIndex; the index is ordered by this field.
+ TString Normalized;
+ // The name exactly as it was given to BuildNameIndex.
+ TString Original;
+ };
+
+ using TNameIndex = TVector<TNameIndexEntry>;
+
+ // Ordering predicate for index entries: applies NoCaseCompare to the
+ // Normalized fields and ignores Original.
+ inline bool NameIndexCompare(const TNameIndexEntry& lhs, const TNameIndexEntry& rhs) {
+ return NoCaseCompare(lhs.Normalized, rhs.Normalized);
+ }
+
+ // Returns a predicate over index entries that applies the comparator made by
+ // NoCaseCompareLimit(limit) to the Normalized fields.
+ // NOTE(review): NoCaseCompareLimit is declared in text/case.h; presumably it
+ // compares only the first `limit` characters - confirm there.
+ inline auto NameIndexCompareLimit(size_t limit) {
+ return [cmp = NoCaseCompareLimit(limit)](const TNameIndexEntry& lhs, const TNameIndexEntry& rhs) {
+ return cmp(lhs.Normalized, rhs.Normalized);
+ };
+ }
+
+ // Builds a name index from `originals`.
+ //
+ // Each name is paired with `normalize(name)`, and the entries are then sorted
+ // with NameIndexCompare, i.e. by their normalized form.
+ // `normalize` is any callable that takes a TString and returns the TString to
+ // store as the normalized form.
+ TNameIndex BuildNameIndex(TVector<TString> originals, auto normalize) {
+     TNameIndex index;
+     // The final size is known up front, so allocate once.
+     index.reserve(originals.size());
+     for (auto& original : originals) {
+         // Initializers run in declaration order, so `original` is
+         // normalized before it is moved into the entry.
+         index.push_back(TNameIndexEntry{
+             .Normalized = normalize(original),
+             .Original = std::move(original),
+         });
+     }
+
+     Sort(index, NameIndexCompare);
+     return index;
+ }
+
+ TString NormalizeName(const TString& name);
+
+ TString LowerizeName(const TString& name);
+
+ TString UnchangedName(const TString& name);
+
+} // namespace NSQLComplete
diff --git a/yql/essentials/sql/v1/complete/name/static/name_service.cpp b/yql/essentials/sql/v1/complete/name/static/name_service.cpp
index 3fd33102d61..201f096bd9e 100644
--- a/yql/essentials/sql/v1/complete/name/static/name_service.cpp
+++ b/yql/essentials/sql/v1/complete/name/static/name_service.cpp
@@ -1,11 +1,29 @@
#include "name_service.h"
+#include "name_index.h"
#include "ranking.h"
#include <yql/essentials/sql/v1/complete/text/case.h>
namespace NSQLComplete {
+ // Returns views of the original spellings of all index entries whose
+ // normalized name matches the normalized `prefix`, as decided by
+ // NameIndexCompareLimit over the length of the normalized prefix.
+ // The views point into `index`, which therefore has to outlive the result.
+ const TVector<TStringBuf> FilteredByPrefix(const TString& prefix, const TNameIndex& index Y_LIFETIME_BOUND) {
+     // Probe entry: only Normalized takes part in the comparison.
+     TNameIndexEntry probe = {
+         .Normalized = NormalizeName(prefix),
+         .Original = "",
+     };
+     auto byPrefix = NameIndexCompareLimit(probe.Normalized.size());
+
+     // `index` is sorted by normalized name, so the matches are contiguous.
+     auto matching = std::ranges::equal_range(index, probe, byPrefix);
+
+     TVector<TStringBuf> originals;
+     for (const auto& [_, original] : matching) {
+         originals.emplace_back(TStringBuf(original));
+     }
+     return originals;
+ }
+
const TVector<TStringBuf> FilteredByPrefix(
const TString& prefix,
const TVector<TString>& sorted Y_LIFETIME_BOUND) {
@@ -55,15 +73,18 @@ namespace NSQLComplete {
class TStaticNameService: public INameService {
public:
+ // Builds a NormalizeName-based index for every name category of `names`
+ // and stores `ranking` for later use.
explicit TStaticNameService(NameSet names, IRanking::TPtr ranking)
- : NameSet_(std::move(names))
+ : Pragmas_(BuildNameIndex(std::move(names.Pragmas), NormalizeName))
+ , Types_(BuildNameIndex(std::move(names.Types), NormalizeName))
+ , Functions_(BuildNameIndex(std::move(names.Functions), NormalizeName))
+ , Hints_([&names] {
+     // `names` is captured by reference and the lambda is invoked right
+     // away. An init-capture in a non-mutable lambda would be const, and
+     // std::move on it would silently copy every hint list.
+     THashMap<EStatementKind, TNameIndex> index;
+     for (auto& [kind, list] : names.Hints) {
+         index.emplace(kind, BuildNameIndex(std::move(list), NormalizeName));
+     }
+     return index;
+ }())
, Ranking_(std::move(ranking))
{
- Sort(NameSet_.Pragmas, NoCaseCompare);
- Sort(NameSet_.Types, NoCaseCompare);
- Sort(NameSet_.Functions, NoCaseCompare);
- for (auto& [_, hints] : NameSet_.Hints) {
- Sort(hints, NoCaseCompare);
- }
}
TFuture<TNameResponse> Lookup(TNameRequest request) override {
@@ -76,19 +97,19 @@ namespace NSQLComplete {
if (request.Constraints.Pragma) {
auto prefix = Prefixed(request.Prefix, ".", *request.Constraints.Pragma);
- auto names = FilteredByPrefix(prefix, NameSet_.Pragmas);
+ auto names = FilteredByPrefix(prefix, Pragmas_);
AppendAs<TPragmaName>(response.RankedNames, names);
}
if (request.Constraints.Type) {
AppendAs<TTypeName>(
response.RankedNames,
- FilteredByPrefix(request.Prefix, NameSet_.Types));
+ FilteredByPrefix(request.Prefix, Types_));
}
if (request.Constraints.Function) {
auto prefix = Prefixed(request.Prefix, "::", *request.Constraints.Function);
- auto names = FilteredByPrefix(prefix, NameSet_.Functions);
+ auto names = FilteredByPrefix(prefix, Functions_);
AppendAs<TFunctionName>(response.RankedNames, names);
}
@@ -96,7 +117,7 @@ namespace NSQLComplete {
const auto stmt = request.Constraints.Hint->Statement;
AppendAs<THintName>(
response.RankedNames,
- FilteredByPrefix(request.Prefix, NameSet_.Hints[stmt]));
+ FilteredByPrefix(request.Prefix, Hints_[stmt]));
}
Ranking_->CropToSortedPrefix(response.RankedNames, request.Limit);
@@ -109,7 +130,10 @@ namespace NSQLComplete {
}
private:
- NameSet NameSet_;
+ TNameIndex Pragmas_;
+ TNameIndex Types_;
+ TNameIndex Functions_;
+ THashMap<EStatementKind, TNameIndex> Hints_;
IRanking::TPtr Ranking_;
};
diff --git a/yql/essentials/sql/v1/complete/name/static/ranking.cpp b/yql/essentials/sql/v1/complete/name/static/ranking.cpp
index ee1cbef08f5..aa08bd7a639 100644
--- a/yql/essentials/sql/v1/complete/name/static/ranking.cpp
+++ b/yql/essentials/sql/v1/complete/name/static/ranking.cpp
@@ -4,6 +4,8 @@
#include <yql/essentials/sql/v1/complete/name/name_service.h>
+#include <yql/essentials/core/sql_types/normalize_name.h>
+
#include <util/charset/utf8.h>
namespace NSQLComplete {
@@ -57,7 +59,7 @@ namespace NSQLComplete {
return std::visit([this](const auto& name) -> size_t {
using T = std::decay_t<decltype(name)>;
- auto content = ToLowerUTF8(ContentView(name));
+ auto content = NYql::NormalizeName(ContentView(name));
if constexpr (std::is_same_v<T, TKeyword>) {
if (auto weight = Frequency_.Keywords.FindPtr(content)) {
diff --git a/yql/essentials/sql/v1/complete/name/static/ya.make b/yql/essentials/sql/v1/complete/name/static/ya.make
index 155c0926399..1315d7475da 100644
--- a/yql/essentials/sql/v1/complete/name/static/ya.make
+++ b/yql/essentials/sql/v1/complete/name/static/ya.make
@@ -3,11 +3,13 @@ LIBRARY()
SRCS(
frequency.cpp
json_name_set.cpp
+ name_index.cpp
name_service.cpp
ranking.cpp
)
PEERDIR(
+ yql/essentials/core/sql_types
yql/essentials/sql/v1/complete/name
yql/essentials/sql/v1/complete/text
)
diff --git a/yql/essentials/sql/v1/complete/sql_complete_ut.cpp b/yql/essentials/sql/v1/complete/sql_complete_ut.cpp
index a0681b1888f..d14d7b85442 100644
--- a/yql/essentials/sql/v1/complete/sql_complete_ut.cpp
+++ b/yql/essentials/sql/v1/complete/sql_complete_ut.cpp
@@ -605,7 +605,6 @@ Y_UNIT_TEST_SUITE(SqlCompleteTests) {
{
TVector<TCandidate> expected = {
{HintName, "IGNORE_TYPE_V3"},
- {HintName, "IGNORETYPEV3"},
};
UNIT_ASSERT_VALUES_EQUAL(Complete(engine, {"REDUCE a WITH ig"}), expected);
}
@@ -642,6 +641,17 @@ Y_UNIT_TEST_SUITE(SqlCompleteTests) {
UNIT_ASSERT_GE(Complete(engine, {"SELECT "}).size(), 55);
}
+ // Checks that a prefix typed without underscores ("ignoret") still completes
+ // to the hint spelled with underscores ("IGNORE_TYPE_V3"), and that it is the
+ // only candidate returned for the default name set.
+ Y_UNIT_TEST(NameNormalization) {
+ auto set = MakeDefaultNameSet();
+ auto service = MakeStaticNameService(std::move(set), MakeDefaultRanking());
+ auto engine = MakeSqlCompletionEngine(MakePureLexerSupplier(), std::move(service));
+
+ TVector<TCandidate> expected = {
+ {HintName, "IGNORE_TYPE_V3"},
+ };
+ UNIT_ASSERT_VALUES_EQUAL(Complete(engine, {"REDUCE a WITH ignoret"}), expected);
+ }
+
Y_UNIT_TEST(Ranking) {
TFrequencyData frequency = {
.Keywords = {
@@ -715,7 +725,7 @@ Y_UNIT_TEST_SUITE(SqlCompleteTests) {
{HintName, "XLOCK"},
{HintName, "UNORDERED"},
{Keyword, "COLUMNS"},
- {HintName, "FORCEINFERSCHEMA"},
+ {HintName, "FORCE_INFER_SCHEMA"},
};
UNIT_ASSERT_VALUES_EQUAL(CompleteTop(expected.size(), engine, {"SELECT * FROM a WITH "}), expected);
}