diff options
| author | vityaman <[email protected]> | 2025-04-22 19:01:16 +0300 | 
|---|---|---|
| committer | robot-piglet <[email protected]> | 2025-04-22 19:51:08 +0300 | 
| commit | b81087bb12f4cea90b4830dfae7f1808cc75b61f (patch) | |
| tree | 2bdee67b6abaddfc0fb3f6554f37b77ae8063058 /yql/essentials/sql | |
| parent | 60665d3830d1f5e3252c2b7f5bb4db32cadadfc1 (diff) | |
YQL-19747 Normalize names for ranking and filtering
I was too lazy to search for the most frequently used name among the names that are equivalent under the relation `(a ~ b) iff (NormalizeName(a) = NormalizeName(b))`, because the names we receive from the JSON files appear to be canonicalized and are therefore already in the style preferred by the YQL language designers. However, because of duplicates in `statements_opensource.json`, we have, for example, both `IGNORETYPEV3` and `IGNORE_TYPE_V3` in the candidate list. I think that we should just remove `IGNORETYPEV3` from the JSON.
---
- Related to https://github.com/ydb-platform/ydb/issues/9056
- Related to https://github.com/vityaman/ydb/issues/21
---
Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/1229
commit_hash:fe73374ae27df1fcacb0adccda930ec98ed1d7a6
Diffstat (limited to 'yql/essentials/sql')
10 files changed, 183 insertions, 22 deletions
| diff --git a/yql/essentials/sql/v1/complete/name/static/frequency.cpp b/yql/essentials/sql/v1/complete/name/static/frequency.cpp index 62997ccff7a..456a9feebcd 100644 --- a/yql/essentials/sql/v1/complete/name/static/frequency.cpp +++ b/yql/essentials/sql/v1/complete/name/static/frequency.cpp @@ -1,5 +1,7 @@  #include "frequency.h" +#include "name_index.h" +  #include <library/cpp/json/json_reader.h>  #include <library/cpp/resource/resource.h> @@ -54,7 +56,7 @@ namespace NSQLComplete {          }      }; -    TFrequencyData Convert(TVector<TFrequencyItem> items) { +    TFrequencyData Convert(TVector<TFrequencyItem> items, auto normalize) {          TFrequencyData data;          for (auto& item : items) {              if (item.Parent == Json.Parent.Pragma || @@ -65,7 +67,7 @@ namespace NSQLComplete {                  item.Parent == Json.Parent.Module ||                  item.Parent == Json.Parent.ReadHint ||                  item.Parent == Json.Parent.InsertHint) { -                item.Rule = ToLowerUTF8(item.Rule); +                item.Rule = normalize(item.Rule);              }              if (item.Parent == Json.Parent.Pragma) { @@ -89,14 +91,24 @@ namespace NSQLComplete {          return data;      } +    TFrequencyData ParseJsonFrequencyData(const TStringBuf text, auto normalize) { +        return Convert(TFrequencyItem::ParseListFromJsonText(text), normalize); +    } +      TFrequencyData ParseJsonFrequencyData(const TStringBuf text) { -        return Convert(TFrequencyItem::ParseListFromJsonText(text)); +        return ParseJsonFrequencyData(text, NormalizeName);      }      TFrequencyData LoadFrequencyData() {          TString text;          Y_ENSURE(NResource::FindExact("rules_corr_basic.json", &text)); -        return ParseJsonFrequencyData(text); +        return ParseJsonFrequencyData(text, NormalizeName); +    } + +    TFrequencyData LoadFrequencyDataForPrunning() { +        TString text; +        
Y_ENSURE(NResource::FindExact("rules_corr_basic.json", &text)); +        return ParseJsonFrequencyData(text, UnchangedName);      }  } // namespace NSQLComplete diff --git a/yql/essentials/sql/v1/complete/name/static/frequency.h b/yql/essentials/sql/v1/complete/name/static/frequency.h index 6925c99fa5e..0a5dc36dfbd 100644 --- a/yql/essentials/sql/v1/complete/name/static/frequency.h +++ b/yql/essentials/sql/v1/complete/name/static/frequency.h @@ -17,4 +17,6 @@ namespace NSQLComplete {      TFrequencyData LoadFrequencyData(); +    TFrequencyData LoadFrequencyDataForPrunning(); +  } // namespace NSQLComplete diff --git a/yql/essentials/sql/v1/complete/name/static/frequency_ut.cpp b/yql/essentials/sql/v1/complete/name/static/frequency_ut.cpp index 8f7eafed2ea..c630f0ca987 100644 --- a/yql/essentials/sql/v1/complete/name/static/frequency_ut.cpp +++ b/yql/essentials/sql/v1/complete/name/static/frequency_ut.cpp @@ -31,7 +31,7 @@ Y_UNIT_TEST_SUITE(FrequencyTests) {              },              .Hints = {                  {"columns", 826110}, -                {"column_groups", 225}, +                {"columngroups", 225},              },          }; diff --git a/yql/essentials/sql/v1/complete/name/static/json_name_set.cpp b/yql/essentials/sql/v1/complete/name/static/json_name_set.cpp index bc522fd674c..a30f62aac20 100644 --- a/yql/essentials/sql/v1/complete/name/static/json_name_set.cpp +++ b/yql/essentials/sql/v1/complete/name/static/json_name_set.cpp @@ -1,5 +1,8 @@  #include "name_service.h" +#include "frequency.h" +#include "name_index.h" +  #include <library/cpp/json/json_reader.h>  #include <library/cpp/resource/resource.h> @@ -78,15 +81,52 @@ namespace NSQLComplete {          return hints;      } +    TVector<TString> Pruned(TVector<TString> names, const THashMap<TString, size_t>& frequency) { +        THashMap<TString, TVector<std::tuple<TString, size_t>>> groups; + +        for (auto& [normalized, original] : BuildNameIndex(std::move(names), NormalizeName)) { +     
       size_t freq = 0; +            if (const size_t* it = frequency.FindPtr(original)) { +                freq = *it; +            } +            groups[normalized].emplace_back(std::move(original), freq); +        } + +        for (auto& [_, group] : groups) { +            Sort(group, [](const auto& lhs, const auto& rhs) { +                return std::get<1>(lhs) < std::get<1>(rhs); +            }); +        } + +        names = TVector<TString>(); +        names.reserve(groups.size()); +        for (auto& [_, group] : groups) { +            Y_ASSERT(!group.empty()); +            names.emplace_back(std::move(std::get<0>(group.back()))); +        } +        return names; +    } + +    NameSet Pruned(NameSet names) { +        auto frequency = LoadFrequencyDataForPrunning(); +        names.Pragmas = Pruned(std::move(names.Pragmas), frequency.Pragmas); +        names.Types = Pruned(std::move(names.Types), frequency.Types); +        names.Functions = Pruned(std::move(names.Functions), frequency.Functions); +        for (auto& [k, h] : names.Hints) { +            h = Pruned(h, frequency.Hints); +        } +        return names; +    } +      NameSet MakeDefaultNameSet() { -        return { +        return Pruned({              .Pragmas = ParsePragmas(LoadJsonResource("pragmas_opensource.json")),              .Types = ParseTypes(LoadJsonResource("types.json")),              .Functions = Merge(                  ParseFunctions(LoadJsonResource("sql_functions.json")),                  ParseUdfs(LoadJsonResource("udfs_basic.json"))),              .Hints = ParseHints(LoadJsonResource("statements_opensource.json")), -        }; +        });      }  } // namespace NSQLComplete diff --git a/yql/essentials/sql/v1/complete/name/static/name_index.cpp b/yql/essentials/sql/v1/complete/name/static/name_index.cpp new file mode 100644 index 00000000000..bfbf6af7fb4 --- /dev/null +++ b/yql/essentials/sql/v1/complete/name/static/name_index.cpp @@ -0,0 +1,21 @@ +#include "name_index.h" + 
+#include <yql/essentials/core/sql_types/normalize_name.h> + +#include <util/charset/utf8.h> + +namespace NSQLComplete { + +    TString NormalizeName(const TString& name) { +        return NYql::NormalizeName(name); +    } + +    TString LowerizeName(const TString& name) { +        return ToLowerUTF8(name); +    } + +    TString UnchangedName(const TString& name) { +        return name; +    } + +} // namespace NSQLComplete diff --git a/yql/essentials/sql/v1/complete/name/static/name_index.h b/yql/essentials/sql/v1/complete/name/static/name_index.h new file mode 100644 index 00000000000..77b50238846 --- /dev/null +++ b/yql/essentials/sql/v1/complete/name/static/name_index.h @@ -0,0 +1,48 @@ +#pragma once + +#include <yql/essentials/sql/v1/complete/text/case.h> + +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/generic/algorithm.h> + +namespace NSQLComplete { + +    struct TNameIndexEntry { +        TString Normalized; +        TString Original; +    }; + +    using TNameIndex = TVector<TNameIndexEntry>; + +    inline bool NameIndexCompare(const TNameIndexEntry& lhs, const TNameIndexEntry& rhs) { +        return NoCaseCompare(lhs.Normalized, rhs.Normalized); +    } + +    inline auto NameIndexCompareLimit(size_t limit) { +        return [cmp = NoCaseCompareLimit(limit)](const TNameIndexEntry& lhs, const TNameIndexEntry& rhs) { +            return cmp(lhs.Normalized, rhs.Normalized); +        }; +    } + +    TNameIndex BuildNameIndex(TVector<TString> originals, auto normalize) { +        TNameIndex index; +        for (auto& original : originals) { +            TNameIndexEntry entry = { +                .Normalized = normalize(original), +                .Original = std::move(original), +            }; +            index.emplace_back(std::move(entry)); +        } + +        Sort(index, NameIndexCompare); +        return index; +    } + +    TString NormalizeName(const TString& name); + +    TString LowerizeName(const TString& 
name); + +    TString UnchangedName(const TString& name); + +} // namespace NSQLComplete diff --git a/yql/essentials/sql/v1/complete/name/static/name_service.cpp b/yql/essentials/sql/v1/complete/name/static/name_service.cpp index 3fd33102d61..201f096bd9e 100644 --- a/yql/essentials/sql/v1/complete/name/static/name_service.cpp +++ b/yql/essentials/sql/v1/complete/name/static/name_service.cpp @@ -1,11 +1,29 @@  #include "name_service.h" +#include "name_index.h"  #include "ranking.h"  #include <yql/essentials/sql/v1/complete/text/case.h>  namespace NSQLComplete { +    const TVector<TStringBuf> FilteredByPrefix(const TString& prefix, const TNameIndex& index Y_LIFETIME_BOUND) { +        TNameIndexEntry normalized = { +            .Normalized = NormalizeName(prefix), +            .Original = "", +        }; + +        auto range = std::ranges::equal_range( +            std::begin(index), std::end(index), +            normalized, NameIndexCompareLimit(normalized.Normalized.size())); + +        TVector<TStringBuf> filtered; +        for (const TNameIndexEntry& entry : range) { +            filtered.emplace_back(TStringBuf(entry.Original)); +        } +        return filtered; +    } +      const TVector<TStringBuf> FilteredByPrefix(          const TString& prefix,          const TVector<TString>& sorted Y_LIFETIME_BOUND) { @@ -55,15 +73,18 @@ namespace NSQLComplete {      class TStaticNameService: public INameService {      public:          explicit TStaticNameService(NameSet names, IRanking::TPtr ranking) -            : NameSet_(std::move(names)) +            : Pragmas_(BuildNameIndex(std::move(names.Pragmas), NormalizeName)) +            , Types_(BuildNameIndex(std::move(names.Types), NormalizeName)) +            , Functions_(BuildNameIndex(std::move(names.Functions), NormalizeName)) +            , Hints_([hints = std::move(names.Hints)] { +                THashMap<EStatementKind, TNameIndex> index; +                for (auto& [k, hints] : hints) { +                    
index.emplace(k, BuildNameIndex(std::move(hints), NormalizeName)); +                } +                return index; +            }())              , Ranking_(std::move(ranking))          { -            Sort(NameSet_.Pragmas, NoCaseCompare); -            Sort(NameSet_.Types, NoCaseCompare); -            Sort(NameSet_.Functions, NoCaseCompare); -            for (auto& [_, hints] : NameSet_.Hints) { -                Sort(hints, NoCaseCompare); -            }          }          TFuture<TNameResponse> Lookup(TNameRequest request) override { @@ -76,19 +97,19 @@ namespace NSQLComplete {              if (request.Constraints.Pragma) {                  auto prefix = Prefixed(request.Prefix, ".", *request.Constraints.Pragma); -                auto names = FilteredByPrefix(prefix, NameSet_.Pragmas); +                auto names = FilteredByPrefix(prefix, Pragmas_);                  AppendAs<TPragmaName>(response.RankedNames, names);              }              if (request.Constraints.Type) {                  AppendAs<TTypeName>(                      response.RankedNames, -                    FilteredByPrefix(request.Prefix, NameSet_.Types)); +                    FilteredByPrefix(request.Prefix, Types_));              }              if (request.Constraints.Function) {                  auto prefix = Prefixed(request.Prefix, "::", *request.Constraints.Function); -                auto names = FilteredByPrefix(prefix, NameSet_.Functions); +                auto names = FilteredByPrefix(prefix, Functions_);                  AppendAs<TFunctionName>(response.RankedNames, names);              } @@ -96,7 +117,7 @@ namespace NSQLComplete {                  const auto stmt = request.Constraints.Hint->Statement;                  AppendAs<THintName>(                      response.RankedNames, -                    FilteredByPrefix(request.Prefix, NameSet_.Hints[stmt])); +                    FilteredByPrefix(request.Prefix, Hints_[stmt]));              }              
Ranking_->CropToSortedPrefix(response.RankedNames, request.Limit); @@ -109,7 +130,10 @@ namespace NSQLComplete {          }      private: -        NameSet NameSet_; +        TNameIndex Pragmas_; +        TNameIndex Types_; +        TNameIndex Functions_; +        THashMap<EStatementKind, TNameIndex> Hints_;          IRanking::TPtr Ranking_;      }; diff --git a/yql/essentials/sql/v1/complete/name/static/ranking.cpp b/yql/essentials/sql/v1/complete/name/static/ranking.cpp index ee1cbef08f5..aa08bd7a639 100644 --- a/yql/essentials/sql/v1/complete/name/static/ranking.cpp +++ b/yql/essentials/sql/v1/complete/name/static/ranking.cpp @@ -4,6 +4,8 @@  #include <yql/essentials/sql/v1/complete/name/name_service.h> +#include <yql/essentials/core/sql_types/normalize_name.h> +  #include <util/charset/utf8.h>  namespace NSQLComplete { @@ -57,7 +59,7 @@ namespace NSQLComplete {              return std::visit([this](const auto& name) -> size_t {                  using T = std::decay_t<decltype(name)>; -                auto content = ToLowerUTF8(ContentView(name)); +                auto content = NYql::NormalizeName(ContentView(name));                  if constexpr (std::is_same_v<T, TKeyword>) {                      if (auto weight = Frequency_.Keywords.FindPtr(content)) { diff --git a/yql/essentials/sql/v1/complete/name/static/ya.make b/yql/essentials/sql/v1/complete/name/static/ya.make index 155c0926399..1315d7475da 100644 --- a/yql/essentials/sql/v1/complete/name/static/ya.make +++ b/yql/essentials/sql/v1/complete/name/static/ya.make @@ -3,11 +3,13 @@ LIBRARY()  SRCS(      frequency.cpp      json_name_set.cpp +    name_index.cpp      name_service.cpp      ranking.cpp  )  PEERDIR( +    yql/essentials/core/sql_types      yql/essentials/sql/v1/complete/name      yql/essentials/sql/v1/complete/text  ) diff --git a/yql/essentials/sql/v1/complete/sql_complete_ut.cpp b/yql/essentials/sql/v1/complete/sql_complete_ut.cpp index a0681b1888f..d14d7b85442 100644 --- 
a/yql/essentials/sql/v1/complete/sql_complete_ut.cpp +++ b/yql/essentials/sql/v1/complete/sql_complete_ut.cpp @@ -605,7 +605,6 @@ Y_UNIT_TEST_SUITE(SqlCompleteTests) {          {              TVector<TCandidate> expected = {                  {HintName, "IGNORE_TYPE_V3"}, -                {HintName, "IGNORETYPEV3"},              };              UNIT_ASSERT_VALUES_EQUAL(Complete(engine, {"REDUCE a WITH ig"}), expected);          } @@ -642,6 +641,17 @@ Y_UNIT_TEST_SUITE(SqlCompleteTests) {          UNIT_ASSERT_GE(Complete(engine, {"SELECT "}).size(), 55);      } +    Y_UNIT_TEST(NameNormalization) { +        auto set = MakeDefaultNameSet(); +        auto service = MakeStaticNameService(std::move(set), MakeDefaultRanking()); +        auto engine = MakeSqlCompletionEngine(MakePureLexerSupplier(), std::move(service)); + +        TVector<TCandidate> expected = { +            {HintName, "IGNORE_TYPE_V3"}, +        }; +        UNIT_ASSERT_VALUES_EQUAL(Complete(engine, {"REDUCE a WITH ignoret"}), expected); +    } +      Y_UNIT_TEST(Ranking) {          TFrequencyData frequency = {              .Keywords = { @@ -715,7 +725,7 @@ Y_UNIT_TEST_SUITE(SqlCompleteTests) {                  {HintName, "XLOCK"},                  {HintName, "UNORDERED"},                  {Keyword, "COLUMNS"}, -                {HintName, "FORCEINFERSCHEMA"}, +                {HintName, "FORCE_INFER_SCHEMA"},              };              UNIT_ASSERT_VALUES_EQUAL(CompleteTop(expected.size(), engine, {"SELECT * FROM a WITH "}), expected);          } | 
