diff options
author | Maxim Yurchuk <[email protected]> | 2025-09-23 14:45:24 +0000 |
---|---|---|
committer | GitHub <[email protected]> | 2025-09-23 14:45:24 +0000 |
commit | ccb159ff5e6a49d325f2342fea08f07e9bd30a83 (patch) | |
tree | 28d4e1ce6ae781ea1f5f0f07d1f67210e2952e6f | |
parent | 387ee17f1f389c956cfea14f239211a8a4027064 (diff) | |
parent | 9c1b4170d6c376675a5384e54d06d3cc74eaabe4 (diff) |
46 files changed, 2380 insertions, 178 deletions
diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp
new file mode 100644
index 00000000000..242484e72af
--- /dev/null
+++ b/ydb/core/base/fulltext.cpp
@@ -0,0 +1,257 @@
+#include "fulltext.h"
+#include <regex>
+
+namespace NKikimr::NFulltext {
+
+namespace {
+    // Maps the "layout" setting text to the proto enum; anything but "flat" sets `error`.
+    Ydb::Table::FulltextIndexSettings::Layout ParseLayout(const TString& layout, TString& error) {
+        if (layout == "flat")
+            return Ydb::Table::FulltextIndexSettings::FLAT;
+        else {
+            error = TStringBuilder() << "Invalid layout: " << layout;
+            return Ydb::Table::FulltextIndexSettings::LAYOUT_UNSPECIFIED;
+        }
+    }
+    // Maps the "tokenizer" setting text ("whitespace" / "standard" / "keyword") to the proto enum; anything else sets `error`.
+    Ydb::Table::FulltextIndexSettings::Tokenizer ParseTokenizer(const TString& tokenizer, TString& error) {
+        if (tokenizer == "whitespace")
+            return Ydb::Table::FulltextIndexSettings::WHITESPACE;
+        else if (tokenizer == "standard")
+            return Ydb::Table::FulltextIndexSettings::STANDARD;
+        else if (tokenizer == "keyword")
+            return Ydb::Table::FulltextIndexSettings::KEYWORD;
+        else {
+            error = TStringBuilder() << "Invalid tokenizer: " << tokenizer;
+            return Ydb::Table::FulltextIndexSettings::TOKENIZER_UNSPECIFIED;
+        }
+    }
+    // Parses setting `name` as a non-negative i32; sets `error` when `value` is not an integer or is negative.
+    i32 ParseInt32(const TString& name, const TString& value, TString& error) {
+        i32 result = 0;
+        if (!TryFromString(value, result) || result < 0) { // proto int32 fields with [(Ydb.value) = ">= 0"] annotation
+            error = TStringBuilder() << "Invalid " << name << ": " << value;
+        }
+        return result;
+    }
+    // Parses setting `name` as a bool; sets `error` when `value` cannot be converted.
+    bool ParseBool(const TString& name, const TString& value, TString& error) {
+        bool result = false;
+        if (!TryFromString(value, result)) {
+            error = TStringBuilder() << "Invalid " << name << ": " << value;
+        }
+        return result;
+    }
+    // Splits `text` into tokens: whitespace-separated chunks, \w+ words, or the whole text as one token.
+    // Note: written by llm, can be optimized a lot later
+    TVector<TString> Tokenize(const TString& text, const Ydb::Table::FulltextIndexSettings::Tokenizer& tokenizer) {
+        TVector<TString> tokens;
+        switch (tokenizer) {
+            case Ydb::Table::FulltextIndexSettings::WHITESPACE: {
+                std::istringstream stream(text);
+                TString token;
+                while (stream >> token) {
+                    tokens.push_back(token);
+                }
+                break;
+            }
+            case Ydb::Table::FulltextIndexSettings::STANDARD: {
+                static const std::regex word_regex(R"(\b\w+\b)"); // match alphanumeric words; compiled once instead of on every call
+                std::sregex_iterator it(text.begin(), text.end(), word_regex);
+                std::sregex_iterator end;
+                while (it != end) {
+                    tokens.push_back(it->str());
+                    ++it;
+                }
+                break;
+            }
+            case Ydb::Table::FulltextIndexSettings::KEYWORD:
+                tokens.push_back(text);
+                break;
+            default:
+                Y_ENSURE(false, TStringBuilder() << "Invalid tokenizer: " << static_cast<int>(tokenizer)); // 1st argument is the condition, the message goes 2nd
+        }
+
+        return tokens;
+    }
+    // Validates per-column analyzers: a tokenizer is required, and every option checked below is rejected as unsupported.
+    bool ValidateSettings(const Ydb::Table::FulltextIndexSettings::Analyzers& settings, TString& error) {
+        if (!settings.has_tokenizer() || settings.tokenizer() == Ydb::Table::FulltextIndexSettings::TOKENIZER_UNSPECIFIED) {
+            error = "tokenizer should be set";
+            return false;
+        }
+
+        if (settings.has_language()) {
+            error = "Unsupported language setting";
+            return false;
+        }
+
+        if (settings.use_filter_stopwords()) {
+            error = "Unsupported use_filter_stopwords setting";
+            return false;
+        }
+
+        if (settings.use_filter_ngram()) {
+            error = "Unsupported use_filter_ngram setting";
+            return false;
+        }
+        if (settings.use_filter_edge_ngram()) {
+            error = "Unsupported use_filter_edge_ngram setting";
+            return false;
+        }
+        if (settings.has_filter_ngram_min_length()) {
+            error = "Unsupported filter_ngram_min_length setting";
+            return false;
+        }
+        if (settings.has_filter_ngram_max_length()) {
+            error = "Unsupported filter_ngram_max_length setting";
+            return false;
+        }
+
+        if (settings.use_filter_length()) {
+            error = "Unsupported use_filter_length setting";
+            return false;
+        }
+        if (settings.has_filter_length_min()) {
+            error = "Unsupported filter_length_min setting";
+            return false;
+        }
+        if (settings.has_filter_length_max()) {
+            error = "Unsupported filter_length_max setting";
+            return false;
+        }
+
+        return true;
+    }
+}
+// Tokenizes `text` with the configured tokenizer and lowercases the tokens when use_filter_lowercase is set.
+TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings) {
+    TVector<TString> tokens = Tokenize(text, settings.tokenizer());
+
+    if (settings.use_filter_lowercase()) {
+        for (auto& token : tokens) {
+            token.to_lower();
+        }
+    }
+
+    return tokens;
+}
+// Convenience overload: copies the repeated field into a TVector and delegates to the overload below.
+bool ValidateColumnsMatches(const NProtoBuf::RepeatedPtrField<TString>& columns, const Ydb::Table::FulltextIndexSettings& settings, TString& error) {
+    return ValidateColumnsMatches(TVector<TString>{columns.begin(), columns.end()}, settings, error);
+}
+// Checks that the column names in `settings` equal `columns`; clears `error` on success.
+bool ValidateColumnsMatches(const TVector<TString>& columns, const Ydb::Table::FulltextIndexSettings& settings, TString& error) {
+    TVector<TString> settingsColumns(::Reserve(settings.columns().size()));
+    for (const auto& column : settings.columns()) {
+        settingsColumns.push_back(column.column());
+    }
+
+    // element-wise vector comparison, so the order of the columns matters
+    if (columns != settingsColumns) {
+        error = TStringBuilder() << "columns " << settingsColumns << " should be " << columns;
+        return false;
+    }
+
+    error = "";
+    return true;
+}
+// Validates index-level settings: layout set, exactly one named column with valid analyzers; clears `error` on success.
+bool ValidateSettings(const Ydb::Table::FulltextIndexSettings& settings, TString& error) {
+    if (!settings.has_layout() || settings.layout() == Ydb::Table::FulltextIndexSettings::LAYOUT_UNSPECIFIED) {
+        error = "layout should be set";
+        return false;
+    }
+
+    if (settings.columns().empty()) {
+        error = "columns should be set";
+        return false;
+    }
+
+    // current implementation limitation:
+    if (settings.columns().size() != 1) {
+        error = "columns should have a single value";
+        return false;
+    }
+
+    for (const auto& column : settings.columns()) {
+        if (!column.has_column()) {
+            error = "column name should be set";
+            return false;
+        }
+
+        // current implementation limitation: (checked on the column being iterated, not on columns().at(0))
+        if (!column.has_analyzers()) {
+            error = "column analyzers should be set";
+            return false;
+        }
+        if (!ValidateSettings(column.analyzers(), error)) {
+            return false;
+        }
+    }
+
+    error = "";
+    return true;
+}
+// Builds settings for the single `keyColumn` from (name, value) pairs; stops at the first bad setting with `error` set, otherwise `error` holds the ValidateSettings verdict.
+Ydb::Table::FulltextIndexSettings FillSettings(const TString& keyColumn, const TVector<std::pair<TString, TString>>& settings, TString& error) {
+    Ydb::Table::FulltextIndexSettings result;
+    Ydb::Table::FulltextIndexSettings::Analyzers resultAnalyzers;
+
+    for (const auto& [name, value] : settings) {
+        if (name == "layout") {
+            result.set_layout(ParseLayout(value, error));
+        } else if (name == "tokenizer") {
+            resultAnalyzers.set_tokenizer(ParseTokenizer(value, error));
+        } else if (name == "language") {
+            resultAnalyzers.set_language(value);
+        } else if (name == "use_filter_lowercase") {
+            resultAnalyzers.set_use_filter_lowercase(ParseBool(name, value, error));
+        } else if (name == "use_filter_stopwords") {
+            resultAnalyzers.set_use_filter_stopwords(ParseBool(name, value, error));
+        } else if (name == "use_filter_ngram") {
+            resultAnalyzers.set_use_filter_ngram(ParseBool(name, value, error));
+        } else if (name == "use_filter_edge_ngram") {
+            resultAnalyzers.set_use_filter_edge_ngram(ParseBool(name, value, error));
+        } else if (name == "filter_ngram_min_length") {
+            resultAnalyzers.set_filter_ngram_min_length(ParseInt32(name, value, error));
+        } else if (name == "filter_ngram_max_length") {
+            resultAnalyzers.set_filter_ngram_max_length(ParseInt32(name, value, error));
+        } else if (name == "use_filter_length") {
+            resultAnalyzers.set_use_filter_length(ParseBool(name, value, error));
+        } else if (name == "filter_length_min") {
+            resultAnalyzers.set_filter_length_min(ParseInt32(name, value, error));
+        } else if (name == "filter_length_max") {
+            resultAnalyzers.set_filter_length_max(ParseInt32(name, value, error));
+        } else {
+            error = TStringBuilder() << "Unknown index setting: " << name;
+            return result;
+        }
+
+        if (error) {
+            return result;
+        }
+    }
+
+    {
+        // only single-columned index is supported for now
+        auto columnAnalyzers = result.add_columns();
+        columnAnalyzers->set_column(keyColumn);
+        columnAnalyzers->mutable_analyzers()->CopyFrom(resultAnalyzers);
+    }
+
+    ValidateSettings(result, error);
+
+    return result;
+}
+
+
+}
+// Prints a vector of strings as "[ a b ]"; used by the error message in ValidateColumnsMatches.
+template<> inline
+void Out<TVector<TString>>(IOutputStream& o, const TVector<TString> &vec) {
+    o << "[ ";
+    for (const auto &x : vec)
+        o << x << ' ';
+    o << "]";
+}
diff --git a/ydb/core/base/fulltext.h b/ydb/core/base/fulltext.h
new file mode 100644
index 00000000000..b7303613ae2
--- /dev/null
+++ b/ydb/core/base/fulltext.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "defs.h"
+
+#include <ydb/public/api/protos/ydb_table.pb.h>
+
+namespace NKikimr::NFulltext {
+
+TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings);
+
+bool ValidateColumnsMatches(const NProtoBuf::RepeatedPtrField<TString>& columns, const Ydb::Table::FulltextIndexSettings& settings, TString& error);
+bool ValidateColumnsMatches(const TVector<TString>& columns, const Ydb::Table::FulltextIndexSettings& settings, TString& error);
+
+bool ValidateSettings(const Ydb::Table::FulltextIndexSettings& settings, TString& error);
+Ydb::Table::FulltextIndexSettings FillSettings(const TString& keyColumn, const TVector<std::pair<TString, TString>>& values, TString& error);
+
+}
diff --git a/ydb/core/base/kmeans_clusters.cpp b/ydb/core/base/kmeans_clusters.cpp
index 65fb9874d76..9b938196539 100644
--- a/ydb/core/base/kmeans_clusters.cpp
+++ b/ydb/core/base/kmeans_clusters.cpp
@@ -493,6 +493,7 @@ bool ValidateSettings(const Ydb::Table::KMeansTreeSettings& settings, TString& e
         return false;
     }
 
+    error = "";
     return true;
 }
 
@@ -525,6 +526,7 @@ bool ValidateSettings(const Ydb::Table::VectorIndexSettings& settings, TString&
         return false;
     }
 
+    error = "";
     return true;
 }
 
diff --git a/ydb/core/base/table_index.cpp b/ydb/core/base/table_index.cpp
index b24e0b607d5..d82aec792dd 100644
--- a/ydb/core/base/table_index.cpp
+++ b/ydb/core/base/table_index.cpp
@@ -1,5 +1,6 @@
 #include "table_index.h"
 
+#include <ydb/library/yverify_stream/yverify_stream.h>
 #include <ydb/core/protos/tx_datashard.pb.h>
 
 namespace NKikimr::NTableIndex {
@@ -61,13 +62,40 @@ constexpr std::string_view PrefixedGlobalKMeansTreeImplTables[] = {
 };
static_assert(std::is_sorted(std::begin(PrefixedGlobalKMeansTreeImplTables), std::end(PrefixedGlobalKMeansTreeImplTables))); +constexpr std::string_view GlobalFulltextImplTables[] = { + ImplTable, +}; +static_assert(std::is_sorted(std::begin(GlobalFulltextImplTables), std::end(GlobalFulltextImplTables))); + +bool IsSecondaryIndex(NKikimrSchemeOp::EIndexType indexType) { + switch (indexType) { + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + case NKikimrSchemeOp::EIndexTypeGlobalUnique: + return true; + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: + return false; + default: + Y_ENSURE(false, InvalidIndexType(indexType)); + } } -TTableColumns CalcTableImplDescription(NKikimrSchemeOp::EIndexType type, const TTableColumns& table, const TIndexColumns& index) { +} + +TTableColumns CalcTableImplDescription(NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index) { TTableColumns result; - const bool isSecondaryIndex = type != NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree; - std::for_each(index.KeyColumns.begin(), index.KeyColumns.end() - (isSecondaryIndex ? 
0 : 1), [&] (const auto& ik) { + const bool isSecondaryIndex = IsSecondaryIndex(indexType); + + auto takeKeyColumns = index.KeyColumns.size(); + if (!isSecondaryIndex) { // vector and fulltext indexes have special embedding and text key columns + Y_ASSERT(indexType == NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree + || indexType == NKikimrSchemeOp::EIndexTypeGlobalFulltext); + takeKeyColumns--; + } + + std::for_each(index.KeyColumns.begin(), index.KeyColumns.begin() + takeKeyColumns, [&] (const auto& ik) { result.Keys.push_back(ik); result.Columns.emplace(ik); }); @@ -85,6 +113,18 @@ TTableColumns CalcTableImplDescription(NKikimrSchemeOp::EIndexType type, const T return result; } +NKikimrSchemeOp::EIndexType GetIndexType(NKikimrSchemeOp::TIndexCreationConfig indexCreation) { + // TODO: always provide EIndexTypeGlobal value instead of null + // TODO: do not cast unknown index types to EIndexTypeGlobal (proto2 specific) + return indexCreation.HasType() + ? indexCreation.GetType() + : NKikimrSchemeOp::EIndexTypeGlobal; +} + +TString InvalidIndexType(NKikimrSchemeOp::EIndexType indexType) { + return TStringBuilder() << "Invalid index type " << static_cast<int>(indexType); +} + bool IsCompatibleIndex(NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index, TString& explain) { if (const auto* broken = IsContains(table.Keys, table.Columns)) { explain = TStringBuilder() @@ -127,7 +167,7 @@ bool IsCompatibleIndex(NKikimrSchemeOp::EIndexType indexType, const TTableColumn return false; } - const bool isSecondaryIndex = indexType != NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree; + const bool isSecondaryIndex = IsSecondaryIndex(indexType); if (index.KeyColumns.size() < 1) { explain = "should be at least single index key column"; @@ -157,7 +197,9 @@ bool IsCompatibleIndex(NKikimrSchemeOp::EIndexType indexType, const TTableColumn if (isSecondaryIndex) { tmp.insert(index.KeyColumns.begin(), index.KeyColumns.end()); } else { 
- // Vector indexes allow to add all columns both to index & data + // Vector and fulltext indexes allow to add all columns both to index & data + Y_ASSERT(indexType == NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree + || indexType == NKikimrSchemeOp::EIndexTypeGlobalFulltext); } if (const auto* broken = IsContains(index.DataColumns, tmp, true)) { explain = TStringBuilder() @@ -167,15 +209,37 @@ bool IsCompatibleIndex(NKikimrSchemeOp::EIndexType indexType, const TTableColumn return true; } +bool DoesIndexSupportTTL(NKikimrSchemeOp::EIndexType indexType) { + switch (indexType) { + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalUnique: + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + return true; + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: + return false; + default: + Y_DEBUG_ABORT_S(InvalidIndexType(indexType)); + return false; + } +} + std::span<const std::string_view> GetImplTables(NKikimrSchemeOp::EIndexType indexType, std::span<const TString> indexKeys) { - if (indexType == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { - if (indexKeys.size() == 1) { - return GlobalKMeansTreeImplTables; - } else { - return PrefixedGlobalKMeansTreeImplTables; - } - } else { - return GlobalSecondaryImplTables; + switch (indexType) { + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + case NKikimrSchemeOp::EIndexTypeGlobalUnique: + return GlobalSecondaryImplTables; + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: + if (indexKeys.size() == 1) { + return GlobalKMeansTreeImplTables; + } else { + return PrefixedGlobalKMeansTreeImplTables; + } + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: + return GlobalFulltextImplTables; + default: + Y_ENSURE(false, InvalidIndexType(indexType)); } } diff --git a/ydb/core/base/table_index.h b/ydb/core/base/table_index.h index ffe3681f0ae..a251122d0b5 100644 --- 
a/ydb/core/base/table_index.h +++ b/ydb/core/base/table_index.h @@ -38,8 +38,13 @@ struct TIndexColumns { inline constexpr const char* ImplTable = "indexImplTable"; -bool IsCompatibleIndex(NKikimrSchemeOp::EIndexType type, const TTableColumns& table, const TIndexColumns& index, TString& explain); -TTableColumns CalcTableImplDescription(NKikimrSchemeOp::EIndexType type, const TTableColumns& table, const TIndexColumns& index); +bool IsCompatibleIndex(NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index, TString& explain); +TTableColumns CalcTableImplDescription(NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index); + +bool DoesIndexSupportTTL(NKikimrSchemeOp::EIndexType indexType); + +NKikimrSchemeOp::EIndexType GetIndexType(NKikimrSchemeOp::TIndexCreationConfig indexCreation); +TString InvalidIndexType(NKikimrSchemeOp::EIndexType indexType); std::span<const std::string_view> GetImplTables(NKikimrSchemeOp::EIndexType indexType, std::span<const TString> indexKeys); bool IsImplTable(std::string_view tableName); @@ -76,6 +81,14 @@ TClusterId SetPostingParentFlag(TClusterId parent); } +namespace NFulltext { + // TODO: support utf-8 in fulltext index + inline constexpr auto TokenType = Ydb::Type::STRING; + inline constexpr const char* TokenTypeName = "String"; + + inline constexpr const char* TokenColumn = "__ydb_token"; +} + TString ToShortDebugString(const NKikimrTxDataShard::TEvReshuffleKMeansRequest& record); TString ToShortDebugString(const NKikimrTxDataShard::TEvRecomputeKMeansRequest& record); TString ToShortDebugString(const NKikimrTxDataShard::TEvRecomputeKMeansResponse& record); diff --git a/ydb/core/base/ut/fulltext_ut.cpp b/ydb/core/base/ut/fulltext_ut.cpp new file mode 100644 index 00000000000..113c1821153 --- /dev/null +++ b/ydb/core/base/ut/fulltext_ut.cpp @@ -0,0 +1,127 @@ +#include "fulltext.h" + +#include <library/cpp/testing/unittest/registar.h> + +namespace 
NKikimr::NFulltext { + +Y_UNIT_TEST_SUITE(NFulltext) { + + Y_UNIT_TEST(ValidateColumnsMatches) { + TString error; + + Ydb::Table::FulltextIndexSettings settings; + settings.add_columns()->set_column("column1"); + settings.add_columns()->set_column("column2"); + + UNIT_ASSERT(!ValidateColumnsMatches(TVector<TString>{"column2"}, settings, error)); + UNIT_ASSERT_VALUES_EQUAL(error, "columns [ column1 column2 ] should be [ column2 ]"); + + UNIT_ASSERT(!ValidateColumnsMatches(TVector<TString>{"column2", "column1"}, settings, error)); + UNIT_ASSERT_VALUES_EQUAL(error, "columns [ column1 column2 ] should be [ column2 column1 ]"); + + UNIT_ASSERT(ValidateColumnsMatches(TVector<TString>{"column1", "column2"}, settings, error)); + UNIT_ASSERT_VALUES_EQUAL(error, ""); + } + + Y_UNIT_TEST(ValidateSettings) { + Ydb::Table::FulltextIndexSettings settings; + TString error; + + UNIT_ASSERT(!ValidateSettings(settings, error)); + UNIT_ASSERT_VALUES_EQUAL(error, "layout should be set"); + settings.set_layout(Ydb::Table::FulltextIndexSettings::FLAT); + + UNIT_ASSERT(!ValidateSettings(settings, error)); + UNIT_ASSERT_VALUES_EQUAL(error, "columns should be set"); + auto columnSettings = settings.add_columns(); + + UNIT_ASSERT(!ValidateSettings(settings, error)); + UNIT_ASSERT_VALUES_EQUAL(error, "column name should be set"); + columnSettings->set_column("text"); + + UNIT_ASSERT(!ValidateSettings(settings, error)); + UNIT_ASSERT_VALUES_EQUAL(error, "column analyzers should be set"); + auto columnAnalyzers = columnSettings->mutable_analyzers(); + + UNIT_ASSERT(!ValidateSettings(settings, error)); + UNIT_ASSERT_VALUES_EQUAL(error, "tokenizer should be set"); + columnAnalyzers->set_tokenizer(Ydb::Table::FulltextIndexSettings::STANDARD); + + UNIT_ASSERT_C(ValidateSettings(settings, error), error); + UNIT_ASSERT_VALUES_EQUAL(error, ""); + + columnSettings = settings.add_columns(); + columnSettings->set_column("text2"); + UNIT_ASSERT_C(!ValidateSettings(settings, error), error); + 
UNIT_ASSERT_VALUES_EQUAL(error, "columns should have a single value"); + } + + Y_UNIT_TEST(FillSettings) { + TVector<std::pair<TString, TString>> list{ + {"layout", "flat"}, + {"tokenizer", "standard"}, + {"use_filter_lowercase", "true"} + }; + + TString error; + auto settings = FillSettings("text", list, error); + UNIT_ASSERT_VALUES_EQUAL(error, ""); + + UNIT_ASSERT_EQUAL(settings.layout(), Ydb::Table::FulltextIndexSettings::FLAT); + UNIT_ASSERT_VALUES_EQUAL(settings.columns().size(), 1); + UNIT_ASSERT_VALUES_EQUAL(settings.columns().at(0).column(), "text"); + UNIT_ASSERT_EQUAL(settings.columns().at(0).analyzers().tokenizer(), Ydb::Table::FulltextIndexSettings::STANDARD); + UNIT_ASSERT_VALUES_EQUAL(settings.columns().at(0).analyzers().use_filter_lowercase(), true); + } + + Y_UNIT_TEST(FillSettingsInvalid) { + { + TVector<std::pair<TString, TString>> list{ + {"asdf", "qwer"} + }; + TString error; + auto settings = FillSettings("text", list, error); + UNIT_ASSERT_VALUES_EQUAL(error, "Unknown index setting: asdf"); + } + + { + TVector<std::pair<TString, TString>> list{ + {"layout", "flat"}, + {"tokenizer", "standard"}, + {"use_filter_lowercase", "asdf"} + }; + TString error; + auto settings = FillSettings("text", list, error); + UNIT_ASSERT_VALUES_EQUAL(error, "Invalid use_filter_lowercase: asdf"); + } + + { + TVector<std::pair<TString, TString>> list{ + {"layout", "flat"}, + }; + TString error; + auto settings = FillSettings("text", list, error); + UNIT_ASSERT_VALUES_EQUAL(error, "tokenizer should be set"); + } + } + + Y_UNIT_TEST(Analyze) { + Ydb::Table::FulltextIndexSettings::Analyzers analyzers; + TString text = "apple WaLLet spaced-dog"; + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"apple", "WaLLet", "spaced-dog"})); + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::STANDARD); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), 
(TVector<TString>{"apple", "WaLLet", "spaced", "dog"})); + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::KEYWORD); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{text})); + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE); + analyzers.set_use_filter_lowercase(true); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"apple", "wallet", "spaced-dog"})); + } +} + +} diff --git a/ydb/core/base/ut/ya.make b/ydb/core/base/ut/ya.make index 82b340fde09..cc1fd67d984 100644 --- a/ydb/core/base/ut/ya.make +++ b/ydb/core/base/ut/ya.make @@ -9,13 +9,14 @@ PEERDIR( ) SRCS( - path_ut.cpp blobstorage_grouptype_ut.cpp + fulltext_ut.cpp localdb_ut.cpp logoblob_ut.cpp memory_stats_ut.cpp - statestorage_ut.cpp + path_ut.cpp statestorage_guardian_impl_ut.cpp + statestorage_ut.cpp table_index_ut.cpp ) diff --git a/ydb/core/base/ya.make b/ydb/core/base/ya.make index 78234ead0f8..285913b9bf5 100644 --- a/ydb/core/base/ya.make +++ b/ydb/core/base/ya.make @@ -28,6 +28,8 @@ SRCS( feature_flags.h feature_flags_service.cpp feature_flags_service.h + fulltext.cpp + fulltext.h group_stat.cpp group_stat.h hive.h diff --git a/ydb/core/protos/flat_scheme_op.proto b/ydb/core/protos/flat_scheme_op.proto index 1b78127d665..c02dc4eca53 100644 --- a/ydb/core/protos/flat_scheme_op.proto +++ b/ydb/core/protos/flat_scheme_op.proto @@ -1158,6 +1158,7 @@ enum EIndexType { EIndexTypeGlobalAsync = 2; EIndexTypeGlobalUnique = 3; EIndexTypeGlobalVectorKmeansTree = 4; + EIndexTypeGlobalFulltext = 5; } enum EIndexState { @@ -1171,6 +1172,10 @@ message TVectorIndexKmeansTreeDescription { optional Ydb.Table.KMeansTreeSettings Settings = 1; } +message TFulltextIndexDescription { + optional Ydb.Table.FulltextIndexSettings Settings = 1; +} + message TIndexDescription { optional string Name = 1; optional uint64 LocalPathId = 2; @@ -1192,6 +1197,7 @@ message TIndexDescription { oneof SpecializedIndexDescription { 
TVectorIndexKmeansTreeDescription VectorIndexKmeansTreeDescription = 11; + TFulltextIndexDescription FulltextIndexDescription = 12; } } @@ -1204,6 +1210,7 @@ message TIndexCreationConfig { repeated string DataColumnNames = 6; //columns to be denormalized to read data just from index oneof SpecializedIndexDescription { TVectorIndexKmeansTreeDescription VectorIndexKmeansTreeDescription = 7; + TFulltextIndexDescription FulltextIndexDescription = 8; } } @@ -1990,6 +1997,7 @@ enum EPathSubType { EPathSubTypeAsyncIndexImplTable = 2; EPathSubTypeStreamImpl = 3; EPathSubTypeVectorKmeansTreeIndexImplTable = 4; + EPathSubTypeFulltextIndexImplTable = 5; } enum EPathState { diff --git a/ydb/core/protos/tx_datashard.proto b/ydb/core/protos/tx_datashard.proto index 89cbbdef90d..1f34bbdc935 100644 --- a/ydb/core/protos/tx_datashard.proto +++ b/ydb/core/protos/tx_datashard.proto @@ -1787,6 +1787,41 @@ message TEvPrefixKMeansResponse { optional NKikimrIndexBuilder.TMeteringStats MeteringStats = 12; } +message TEvBuildFulltextIndexRequest { + optional uint64 Id = 1; + + optional uint64 TabletId = 2; + optional NKikimrProto.TPathID PathId = 3; + + optional uint64 SnapshotTxId = 4; + optional uint64 SnapshotStep = 5; + + optional uint64 SeqNoGeneration = 6; + optional uint64 SeqNoRound = 7; + + optional string IndexName = 8; + + optional Ydb.Table.FulltextIndexSettings Settings = 9; // also has key columns + repeated string DataColumns = 10; + + optional NKikimrIndexBuilder.TIndexBuildScanSettings ScanSettings = 11; +} + +message TEvBuildFulltextIndexResponse { + optional uint64 Id = 1; + + optional uint64 TabletId = 2; + optional NKikimrProto.TPathID PathId = 3; + + optional uint64 RequestSeqNoGeneration = 4; + optional uint64 RequestSeqNoRound = 5; + + optional NKikimrIndexBuilder.EBuildStatus Status = 6; + repeated Ydb.Issue.IssueMessage Issues = 7; + + optional NKikimrIndexBuilder.TMeteringStats MeteringStats = 8; +} + message TEvCdcStreamScanRequest { message TLimits { optional 
uint32 BatchMaxBytes = 1 [default = 512000]; diff --git a/ydb/core/sys_view/show_create/create_table_formatter.cpp b/ydb/core/sys_view/show_create/create_table_formatter.cpp index 923644ceac3..151fd8a0f02 100644 --- a/ydb/core/sys_view/show_create/create_table_formatter.cpp +++ b/ydb/core/sys_view/show_create/create_table_formatter.cpp @@ -514,6 +514,7 @@ void TCreateTableFormatter::Format(const TableIndex& index) { Stream << "\tINDEX "; EscapeName(index.name(), Stream); std::optional<KMeansTreeSettings> kMeansTreeSettings; + std::optional<FulltextIndexSettings> fulltextIndexSettings; switch (index.type_case()) { case TableIndex::kGlobalIndex: { Stream << " GLOBAL SYNC ON "; @@ -532,6 +533,11 @@ void TCreateTableFormatter::Format(const TableIndex& index) { kMeansTreeSettings = index.global_vector_kmeans_tree_index().vector_settings(); break; } + case Ydb::Table::TableIndex::kGlobalFulltextIndex: { + Stream << " GLOBAL USING fulltext ON "; + fulltextIndexSettings = index.global_fulltext_index().fulltext_settings(); + break; + } case Ydb::Table::TableIndex::TYPE_NOT_SET: ythrow TFormatFail(Ydb::StatusIds::INTERNAL_ERROR, "Unexpected Ydb::Table::TableIndex::TYPE_NOT_SET"); } @@ -622,6 +628,10 @@ void TCreateTableFormatter::Format(const TableIndex& index) { Stream << ")"; } + + if (fulltextIndexSettings) { + Y_ENSURE("todo not implemented"); + } } bool TCreateTableFormatter::Format(const TFamilyDescription& familyDesc) { diff --git a/ydb/core/tx/datashard/build_index/fulltext.cpp b/ydb/core/tx/datashard/build_index/fulltext.cpp new file mode 100644 index 00000000000..61aa3112c71 --- /dev/null +++ b/ydb/core/tx/datashard/build_index/fulltext.cpp @@ -0,0 +1,413 @@ +#include "common_helper.h" +#include "../datashard_impl.h" +#include "../scan_common.h" +#include "../upload_stats.h" +#include "../buffer_data.h" + +#include <ydb/core/base/appdata.h> +#include <ydb/core/base/counters.h> +#include <ydb/core/base/fulltext.h> +#include <ydb/core/kqp/common/kqp_types.h> 
+#include <ydb/core/scheme/scheme_tablecell.h> + +#include <ydb/core/tx/tx_proxy/proxy.h> +#include <ydb/core/tx/tx_proxy/upload_rows.h> + +#include <ydb/core/ydb_convert/table_description.h> +#include <ydb/core/ydb_convert/ydb_convert.h> +#include <yql/essentials/public/issue/yql_issue_message.h> + +#include <util/generic/algorithm.h> +#include <util/string/builder.h> + +namespace NKikimr::NDataShard { +using namespace NTableIndex::NFulltext; +using namespace NKikimr::NFulltext; + +class TBuildFulltextIndexScan: public TActor<TBuildFulltextIndexScan>, public IActorExceptionHandler, public NTable::IScan { + IDriver* Driver = nullptr; + + ui64 TabletId = 0; + ui64 BuildId = 0; + + ui64 ReadRows = 0; + ui64 ReadBytes = 0; + + TTags ScanTags; + TString TextColumn; + Ydb::Table::FulltextIndexSettings::Analyzers TextAnalyzers; + + TBatchRowsUploader Uploader; + TBufferData* UploadBuf = nullptr; + + const NKikimrTxDataShard::TEvBuildFulltextIndexRequest Request; + const TActorId ResponseActorId; + const TAutoPtr<TEvDataShard::TEvBuildFulltextIndexResponse> Response; + +public: + static constexpr NKikimrServices::TActivity::EType ActorActivityType() + { + return NKikimrServices::TActivity::BUILD_FULLTEXT_INDEX; + } + + TBuildFulltextIndexScan(ui64 tabletId, const TUserTable& table, NKikimrTxDataShard::TEvBuildFulltextIndexRequest request, + const TActorId& responseActorId, TAutoPtr<TEvDataShard::TEvBuildFulltextIndexResponse>&& response) + : TActor{&TThis::StateWork} + , TabletId(tabletId) + , BuildId{request.GetId()} + , Uploader(request.GetScanSettings()) + , Request(std::move(request)) + , ResponseActorId{responseActorId} + , Response{std::move(response)} + { + LOG_I("Create " << Debug()); + + Y_ENSURE(Request.settings().columns().size() == 1); + TextColumn = Request.settings().columns().at(0).column(); + TextAnalyzers = Request.settings().columns().at(0).analyzers(); + + auto tags = GetAllTags(table); + auto types = GetAllTypes(table); + + { + 
ScanTags.push_back(tags.at(TextColumn)); + + for (auto dataColumn : Request.GetDataColumns()) { + if (dataColumn != TextColumn) { + ScanTags.push_back(tags.at(dataColumn)); + } + } + } + + { + auto uploadTypes = std::make_shared<NTxProxy::TUploadTypes>(); + auto addType = [&](const auto& column) { + auto it = types.find(column); + if (it != types.end()) { + Ydb::Type type; + NScheme::ProtoFromTypeInfo(it->second, type); + uploadTypes->emplace_back(it->first, type); + types.erase(it); + } + }; + { + Ydb::Type type; + type.set_type_id(TokenType); + uploadTypes->emplace_back(TokenColumn, type); + } + for (const auto& column : table.KeyColumnIds) { + addType(table.Columns.at(column).Name); + } + for (auto dataColumn : Request.GetDataColumns()) { + addType(dataColumn); + } + UploadBuf = Uploader.AddDestination(Request.GetIndexName(), std::move(uploadTypes)); + } + } + + TInitialState Prepare(IDriver* driver, TIntrusiveConstPtr<TScheme>) final + { + TActivationContext::AsActorContext().RegisterWithSameMailbox(this); + LOG_I("Prepare " << Debug()); + + Driver = driver; + Uploader.SetOwner(SelfId()); + + return {EScan::Feed, {}}; + } + + EScan Seek(TLead& lead, ui64 seq) final + { + LOG_T("Seek " << seq << " " << Debug()); + + if (seq) { + return Uploader.CanFinish() + ? 
EScan::Final + : EScan::Sleep; + } + + lead.To(ScanTags, {}, NTable::ESeek::Lower); + + return EScan::Feed; + } + + EScan Feed(TArrayRef<const TCell> key, const TRow& row) final + { + // LOG_T("Feed " << Debug()); + + ++ReadRows; + ReadBytes += CountRowCellBytes(key, *row); + + TVector<TCell> uploadKey(::Reserve(key.size() + 1)); + TVector<TCell> uploadValue(::Reserve(Request.GetDataColumns().size())); + + TString text((*row).at(0).AsBuf()); + auto tokens = Analyze(text, TextAnalyzers); + for (const auto& token : tokens) { + uploadKey.clear(); + uploadKey.push_back(TCell(token)); + uploadKey.insert(uploadKey.end(), key.begin(), key.end()); + + uploadValue.clear(); + size_t index = 1; // skip text column + for (auto dataColumn : Request.GetDataColumns()) { + if (dataColumn != TextColumn) { + uploadValue.push_back(row.Get(index++)); + } else { + uploadValue.push_back(TCell(text)); + } + } + + UploadBuf->AddRow(uploadKey, uploadValue); + } + + return Uploader.ShouldWaitUpload() ? EScan::Sleep : EScan::Feed; + } + + EScan PageFault() final + { + LOG_T("PageFault " << Debug()); + return EScan::Feed; + } + + EScan Exhausted() final + { + LOG_T("Exhausted " << Debug()); + + // call Seek to wait uploads + return EScan::Reset; + } + + TAutoPtr<IDestructable> Finish(const std::exception& exc) final + { + Uploader.AddIssue(exc); + return Finish(EStatus::Exception); + } + + TAutoPtr<IDestructable> Finish(EStatus status) final + { + auto& record = Response->Record; + record.MutableMeteringStats()->SetReadRows(ReadRows); + record.MutableMeteringStats()->SetReadBytes(ReadBytes); + record.MutableMeteringStats()->SetCpuTimeUs(Driver->GetTotalCpuTimeUs()); + + Uploader.Finish(record, status); + + if (Response->Record.GetStatus() == NKikimrIndexBuilder::DONE) { + LOG_N("Done " << Debug() << " " << Response->Record.ShortDebugString()); + } else { + LOG_E("Failed " << Debug() << " " << Response->Record.ShortDebugString()); + } + Send(ResponseActorId, Response.Release()); + + Driver = 
nullptr; + this->PassAway(); + return nullptr; + } + + bool OnUnhandledException(const std::exception& exc) final + { + if (!Driver) { + return false; + } + Driver->Throw(exc); + return true; + } + + void Describe(IOutputStream& out) const final + { + out << Debug(); + } + +protected: + STFUNC(StateWork) + { + switch (ev->GetTypeRewrite()) { + HFunc(TEvTxUserProxy::TEvUploadRowsResponse, Handle); + CFunc(TEvents::TSystem::Wakeup, HandleWakeup); + default: + LOG_E("StateWork unexpected event type: " << ev->GetTypeRewrite() + << " event: " << ev->ToString() << " " << Debug()); + } + } + + void HandleWakeup(const NActors::TActorContext& /*ctx*/) + { + LOG_D("Retry upload " << Debug()); + + Uploader.RetryUpload(); + } + + void Handle(TEvTxUserProxy::TEvUploadRowsResponse::TPtr& ev, const TActorContext& ctx) + { + LOG_D("Handle TEvUploadRowsResponse " << Debug() + << " ev->Sender: " << ev->Sender.ToString()); + + if (!Driver) { + return; + } + + Uploader.Handle(ev); + + if (Uploader.GetUploadStatus().IsSuccess()) { + Driver->Touch(EScan::Feed); + return; + } + + if (auto retryAfter = Uploader.GetRetryAfter(); retryAfter) { + LOG_N("Got retriable error, " << Debug() << " " << Uploader.GetUploadStatus().ToString()); + ctx.Schedule(*retryAfter, new TEvents::TEvWakeup()); + return; + } + + LOG_N("Got error, abort scan, " << Debug() << " " << Uploader.GetUploadStatus().ToString()); + + Driver->Touch(EScan::Final); + } + + TString Debug() const + { + return TStringBuilder() << "TBuildFulltextIndexScan TabletId: " << TabletId << " Id: " << BuildId + << " " << Uploader.Debug(); + } +}; + +class TDataShard::TTxHandleSafeBuildFulltextIndexScan final: public NTabletFlatExecutor::TTransactionBase<TDataShard> { +public: + TTxHandleSafeBuildFulltextIndexScan(TDataShard* self, TEvDataShard::TEvBuildFulltextIndexRequest::TPtr&& ev) + : TTransactionBase(self) + , Ev(std::move(ev)) + { + } + + bool Execute(TTransactionContext&, const TActorContext& ctx) final + { + Self->HandleSafe(Ev, 
ctx); + return true; + } + + void Complete(const TActorContext&) final + { + } + +private: + TEvDataShard::TEvBuildFulltextIndexRequest::TPtr Ev; +}; + +void TDataShard::Handle(TEvDataShard::TEvBuildFulltextIndexRequest::TPtr& ev, const TActorContext&) +{ + Execute(new TTxHandleSafeBuildFulltextIndexScan(this, std::move(ev))); +} + +void TDataShard::HandleSafe(TEvDataShard::TEvBuildFulltextIndexRequest::TPtr& ev, const TActorContext& ctx) +{ + auto& request = ev->Get()->Record; + const ui64 id = request.GetId(); + TRowVersion rowVersion(request.GetSnapshotStep(), request.GetSnapshotTxId()); + TScanRecord::TSeqNo seqNo = {request.GetSeqNoGeneration(), request.GetSeqNoRound()}; + + try { + auto response = MakeHolder<TEvDataShard::TEvBuildFulltextIndexResponse>(); + FillScanResponseCommonFields(*response, id, TabletID(), seqNo); + + LOG_N("Starting TBuildFulltextIndexScan TabletId: " << TabletID() + << " " << request.ShortDebugString() + << " row version " << rowVersion); + + // Note: it's very unlikely that we have volatile txs before this snapshot + if (VolatileTxManager.HasVolatileTxsAtSnapshot(rowVersion)) { + VolatileTxManager.AttachWaitingSnapshotEvent(rowVersion, std::unique_ptr<IEventHandle>(ev.Release())); + return; + } + + auto badRequest = [&](const TString& error) { + response->Record.SetStatus(NKikimrIndexBuilder::EBuildStatus::BAD_REQUEST); + auto issue = response->Record.AddIssues(); + issue->set_severity(NYql::TSeverityIds::S_ERROR); + issue->set_message(error); + }; + auto trySendBadRequest = [&] { + if (response->Record.GetStatus() == NKikimrIndexBuilder::EBuildStatus::BAD_REQUEST) { + LOG_E("Rejecting TBuildFulltextIndexScan bad request TabletId: " << TabletID() + << " " << request.ShortDebugString() + << " with response " << response->Record.ShortDebugString()); + ctx.Send(ev->Sender, std::move(response)); + return true; + } else { + return false; + } + }; + + // 1. 
Validating table and path existence + if (request.GetTabletId() != TabletID()) { + badRequest(TStringBuilder() << "Wrong shard " << request.GetTabletId() << " this is " << TabletID()); + } + if (!IsStateActive()) { + badRequest(TStringBuilder() << "Shard " << TabletID() << " is " << State << " and not ready for requests"); + } + const auto pathId = TPathId::FromProto(request.GetPathId()); + const auto* userTableIt = GetUserTables().FindPtr(pathId.LocalPathId); + if (!userTableIt) { + badRequest(TStringBuilder() << "Unknown table id: " << pathId.LocalPathId); + } + if (trySendBadRequest()) { + return; + } + const auto& userTable = **userTableIt; + + // 2. Validating request fields + if (!request.HasSnapshotStep() || !request.HasSnapshotTxId()) { + badRequest(TStringBuilder() << "Missing snapshot"); + } else { + const TSnapshotKey snapshotKey(pathId, rowVersion.Step, rowVersion.TxId); + if (!SnapshotManager.FindAvailable(snapshotKey)) { + badRequest(TStringBuilder() << "Unknown snapshot for path id " << pathId.OwnerId << ":" << pathId.LocalPathId + << ", snapshot step is " << snapshotKey.Step << ", snapshot tx is " << snapshotKey.TxId); + } + } + + if (!request.GetIndexName()) { + badRequest(TStringBuilder() << "Empty index table name"); + } + + auto tags = GetAllTags(userTable); + for (auto column : request.GetSettings().columns()) { + if (!tags.contains(column.column())) { + badRequest(TStringBuilder() << "Unknown key column: " << column.column()); + } + } + for (auto dataColumn : request.GetDataColumns()) { + if (!tags.contains(dataColumn)) { + badRequest(TStringBuilder() << "Unknown data column: " << dataColumn); + } + } + + if (trySendBadRequest()) { + return; + } + + // 3. 
Validating fulltext index settings + if (!request.HasSettings()) { + badRequest(TStringBuilder() << "Missing fulltext index settings"); + } else { + TString error; + if (!NKikimr::NFulltext::ValidateSettings(request.GetSettings(), error)) { + badRequest(error); + } + } + + if (trySendBadRequest()) { + return; + } + + // 4. Creating scan + TAutoPtr<NTable::IScan> scan = new TBuildFulltextIndexScan(TabletID(), userTable, + request, ev->Sender, std::move(response)); + + StartScan(this, std::move(scan), id, seqNo, rowVersion, userTable.LocalTid); + } catch (const std::exception& exc) { + FailScan<TEvDataShard::TEvBuildFulltextIndexResponse>(id, TabletID(), ev->Sender, seqNo, exc, "TBuildFulltextIndexScan"); + } +} + +} diff --git a/ydb/core/tx/datashard/build_index/secondary_index.cpp b/ydb/core/tx/datashard/build_index/secondary_index.cpp index 20e68c03a5b..40342b06156 100644 --- a/ydb/core/tx/datashard/build_index/secondary_index.cpp +++ b/ydb/core/tx/datashard/build_index/secondary_index.cpp @@ -577,12 +577,13 @@ void TDataShard::HandleSafe(TEvDataShard::TEvBuildIndexCreateRequest::TPtr& ev, // 2. 
Validating request fields if (!request.HasSnapshotStep() || !request.HasSnapshotTxId()) { - badRequest(TStringBuilder() << "Empty snapshot"); - } - const TSnapshotKey snapshotKey(tableId.PathId, rowVersion.Step, rowVersion.TxId); - if (!SnapshotManager.FindAvailable(snapshotKey)) { - badRequest(TStringBuilder() << "Unknown snapshot for path id " << tableId.PathId.OwnerId << ":" << tableId.PathId.LocalPathId - << ", snapshot step is " << snapshotKey.Step << ", snapshot tx is " << snapshotKey.TxId); + badRequest(TStringBuilder() << "Missing snapshot"); + } else { + const TSnapshotKey snapshotKey(tableId.PathId, rowVersion.Step, rowVersion.TxId); + if (!SnapshotManager.FindAvailable(snapshotKey)) { + badRequest(TStringBuilder() << "Unknown snapshot for path id " << tableId.PathId.OwnerId << ":" << tableId.PathId.LocalPathId + << ", snapshot step is " << snapshotKey.Step << ", snapshot tx is " << snapshotKey.TxId); + } } TSerializedTableRange requestedRange; diff --git a/ydb/core/tx/datashard/build_index/ut/ut_fulltext.cpp b/ydb/core/tx/datashard/build_index/ut/ut_fulltext.cpp new file mode 100644 index 00000000000..a0f6884f7bf --- /dev/null +++ b/ydb/core/tx/datashard/build_index/ut/ut_fulltext.cpp @@ -0,0 +1,357 @@ +#include "ut_helpers.h" + +#include <ydb/core/base/table_index.h> +#include <ydb/core/protos/index_builder.pb.h> +#include <ydb/core/testlib/test_client.h> +#include <ydb/core/tx/datashard/ut_common/datashard_ut_common.h> +#include <ydb/core/tx/schemeshard/schemeshard.h> +#include <ydb/core/tx/tx_proxy/proxy.h> +#include <ydb/core/tx/tx_proxy/upload_rows.h> + +#include <yql/essentials/public/issue/yql_issue_message.h> + +#include <library/cpp/testing/unittest/registar.h> + +namespace NKikimr { +using namespace Tests; +using Ydb::Table::FulltextIndexSettings; +using namespace NTableIndex::NFulltext; + +static std::atomic<ui64> sId = 1; +static const TString kMainTable = "/Root/table-main"; +static const TString kIndexTable = "/Root/table-index"; + 
+Y_UNIT_TEST_SUITE(TTxDataShardBuildFulltextIndexScan) { + + ui64 FillRequest(Tests::TServer::TPtr server, TActorId sender, + NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request, + std::function<void(NKikimrTxDataShard::TEvBuildFulltextIndexRequest&)> setupRequest) + { + auto id = sId.fetch_add(1, std::memory_order_relaxed); + + auto snapshot = CreateVolatileSnapshot(server, {kMainTable}); + auto datashards = GetTableShards(server, sender, kMainTable); + TTableId tableId = ResolveTableId(server, sender, kMainTable); + + UNIT_ASSERT(datashards.size() == 1); + + request.SetId(1); + request.SetSeqNoGeneration(id); + request.SetSeqNoRound(1); + + request.SetTabletId(datashards[0]); + tableId.PathId.ToProto(request.MutablePathId()); + + request.SetSnapshotTxId(snapshot.TxId); + request.SetSnapshotStep(snapshot.Step); + + FulltextIndexSettings settings; + settings.set_layout(FulltextIndexSettings::FLAT); + auto column = settings.add_columns(); + column->set_column("text"); + column->mutable_analyzers()->set_tokenizer(FulltextIndexSettings::WHITESPACE); + *request.MutableSettings() = settings; + + request.SetIndexName(kIndexTable); + + setupRequest(request); + + return datashards[0]; + } + + void DoBadRequest(Tests::TServer::TPtr server, TActorId sender, + std::function<void(NKikimrTxDataShard::TEvBuildFulltextIndexRequest&)> setupRequest, + const TString& expectedError, bool expectedErrorSubstring = false, NKikimrIndexBuilder::EBuildStatus expectedStatus = NKikimrIndexBuilder::EBuildStatus::BAD_REQUEST) + { + auto ev = std::make_unique<TEvDataShard::TEvBuildFulltextIndexRequest>(); + + auto tabletId = FillRequest(server, sender, ev->Record, setupRequest); + + NKikimr::DoBadRequest<TEvDataShard::TEvBuildFulltextIndexResponse>(server, sender, std::move(ev), tabletId, expectedError, expectedErrorSubstring, expectedStatus); + } + + TString DoBuild(Tests::TServer::TPtr server, TActorId sender, std::function<void(NKikimrTxDataShard::TEvBuildFulltextIndexRequest&)> 
setupRequest) { + auto ev1 = std::make_unique<TEvDataShard::TEvBuildFulltextIndexRequest>(); + auto tabletId = FillRequest(server, sender, ev1->Record, setupRequest); + + auto ev2 = std::make_unique<TEvDataShard::TEvBuildFulltextIndexRequest>(); + ev2->Record.CopyFrom(ev1->Record); + + auto& runtime = *server->GetRuntime(); + runtime.SendToPipe(tabletId, sender, ev1.release(), 0, GetPipeConfigWithRetries()); + runtime.SendToPipe(tabletId, sender, ev2.release(), 0, GetPipeConfigWithRetries()); + + TAutoPtr<IEventHandle> handle; + auto reply = runtime.GrabEdgeEventRethrow<TEvDataShard::TEvBuildFulltextIndexResponse>(handle); + + UNIT_ASSERT_EQUAL_C(reply->Record.GetStatus(), NKikimrIndexBuilder::EBuildStatus::DONE, reply->Record.ShortDebugString()); + + auto index = ReadShardedTable(server, kIndexTable); + Cerr << "Index:" << Endl; + Cerr << index << Endl; + return std::move(index); + } + + void CreateMainTable(Tests::TServer::TPtr server, TActorId sender) { + TShardedTableOptions options; + options.EnableOutOfOrder(true); + options.Shards(1); + options.AllowSystemColumnNames(false); + options.Columns({ + {"key", "Uint32", true, true}, + {"text", "String", false, false}, + {"data", "String", false, false}, + }); + CreateShardedTable(server, sender, "/Root", "table-main", options); + } + + void FillMainTable(Tests::TServer::TPtr server, TActorId sender) { + ExecSQL(server, sender, R"( + UPSERT INTO `/Root/table-main` (key, text, data) VALUES + (1, "green apple", "one"), + (2, "red apple", "two"), + (3, "yellow apple", "three"), + (4, "red car", "four") + )"); + } + + void CreateIndexTable(Tests::TServer::TPtr server, TActorId sender) { + TShardedTableOptions options; + options.EnableOutOfOrder(true); + options.Shards(1); + options.AllowSystemColumnNames(true); + options.Columns({ + {TokenColumn, NTableIndex::NFulltext::TokenTypeName, true, true}, + {"key", "Uint32", true, true}, + {"data", "String", false, false}, + }); + CreateShardedTable(server, sender, "/Root", 
"table-index", options); + } + + void Setup(Tests::TServer::TPtr server, TActorId sender) { + server->GetRuntime()->SetLogPriority(NKikimrServices::TX_DATASHARD, NLog::PRI_DEBUG); + server->GetRuntime()->SetLogPriority(NKikimrServices::BUILD_INDEX, NLog::PRI_TRACE); + + InitRoot(server, sender); + + CreateMainTable(server, sender); + FillMainTable(server, sender); + CreateIndexTable(server, sender); + } + + Y_UNIT_TEST(BadRequest) { + TPortManager pm; + TServerSettings serverSettings(pm.GetPort(2134)); + serverSettings.SetDomainName("Root"); + + Tests::TServer::TPtr server = new TServer(serverSettings); + auto sender = server->GetRuntime()->AllocateEdgeActor(); + + Setup(server, sender); + + DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) { + request.SetTabletId(0); + }, TStringBuilder() << "{ <main>: Error: Wrong shard 0 this is " << GetTableShards(server, sender, kMainTable)[0] << " }"); + DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) { + TPathId(0, 0).ToProto(request.MutablePathId()); + }, "{ <main>: Error: Unknown table id: 0 }"); + + DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) { + request.SetSnapshotStep(request.GetSnapshotStep() + 1); + }, "Error: Unknown snapshot", true); + DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) { + request.ClearSnapshotStep(); + }, "{ <main>: Error: Missing snapshot }"); + DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) { + request.SetSnapshotTxId(request.GetSnapshotTxId() + 1); + }, "Error: Unknown snapshot", true); + DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) { + request.ClearSnapshotTxId(); + }, "{ <main>: Error: Missing snapshot }"); + + DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) { + request.clear_settings(); + }, "{ 
<main>: Error: Missing fulltext index settings }"); + DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) { + request.MutableSettings()->clear_columns(); + }, "{ <main>: Error: columns should be set }"); + DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) { + request.MutableSettings()->mutable_columns()->at(0).mutable_analyzers()->clear_tokenizer(); + }, "{ <main>: Error: tokenizer should be set }"); + + DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) { + request.ClearIndexName(); + }, "{ <main>: Error: Empty index table name }"); + + DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) { + request.MutableSettings()->mutable_columns()->at(0).set_column("some"); + }, "{ <main>: Error: Unknown key column: some }"); + DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) { + request.AddDataColumns("some"); + }, "{ <main>: Error: Unknown data column: some }"); + + // test multiple issues: + DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) { + request.ClearIndexName(); + request.AddDataColumns("some"); + }, "[ { <main>: Error: Empty index table name } { <main>: Error: Unknown data column: some } ]"); + } + + Y_UNIT_TEST(Build) { + TPortManager pm; + TServerSettings serverSettings(pm.GetPort(2134)); + serverSettings.SetDomainName("Root"); + + Tests::TServer::TPtr server = new TServer(serverSettings); + auto sender = server->GetRuntime()->AllocateEdgeActor(); + + Setup(server, sender); + + auto result = DoBuild(server, sender, [](auto&){}); + + UNIT_ASSERT_VALUES_EQUAL(result, R"(__ydb_token = apple, key = 1, data = (empty maybe) +__ydb_token = apple, key = 2, data = (empty maybe) +__ydb_token = apple, key = 3, data = (empty maybe) +__ydb_token = car, key = 4, data = (empty maybe) +__ydb_token = green, key = 1, data = (empty maybe) 
+__ydb_token = red, key = 2, data = (empty maybe) +__ydb_token = red, key = 4, data = (empty maybe) +__ydb_token = yellow, key = 3, data = (empty maybe) +)"); + } + + Y_UNIT_TEST(BuildWithData) { + TPortManager pm; + TServerSettings serverSettings(pm.GetPort(2134)); + serverSettings.SetDomainName("Root"); + + Tests::TServer::TPtr server = new TServer(serverSettings); + auto sender = server->GetRuntime()->AllocateEdgeActor(); + + Setup(server, sender); + + auto result = DoBuild(server, sender, [](auto& request) { + request.AddDataColumns("data"); + }); + + UNIT_ASSERT_VALUES_EQUAL(result, R"(__ydb_token = apple, key = 1, data = one +__ydb_token = apple, key = 2, data = two +__ydb_token = apple, key = 3, data = three +__ydb_token = car, key = 4, data = four +__ydb_token = green, key = 1, data = one +__ydb_token = red, key = 2, data = two +__ydb_token = red, key = 4, data = four +__ydb_token = yellow, key = 3, data = three +)"); + } + + Y_UNIT_TEST(BuildWithTextData) { + TPortManager pm; + TServerSettings serverSettings(pm.GetPort(2134)); + serverSettings.SetDomainName("Root"); + + Tests::TServer::TPtr server = new TServer(serverSettings); + auto sender = server->GetRuntime()->AllocateEdgeActor(); + + InitRoot(server, sender); + + CreateMainTable(server, sender); + FillMainTable(server, sender); + + { // CreateIndexTable with text column + TShardedTableOptions options; + options.EnableOutOfOrder(true); + options.Shards(1); + options.AllowSystemColumnNames(true); + options.Columns({ + {TokenColumn, NTableIndex::NFulltext::TokenTypeName, true, true}, + {"key", "Uint32", true, true}, + {"text", "String", false, false}, + {"data", "String", false, false}, + }); + CreateShardedTable(server, sender, "/Root", "table-index", options); + } + + auto result = DoBuild(server, sender, [](auto& request) { + request.AddDataColumns("text"); + request.AddDataColumns("data"); + }); + + UNIT_ASSERT_VALUES_EQUAL(result, R"(__ydb_token = apple, key = 1, text = green apple, data = one 
+__ydb_token = apple, key = 2, text = red apple, data = two +__ydb_token = apple, key = 3, text = yellow apple, data = three +__ydb_token = car, key = 4, text = red car, data = four +__ydb_token = green, key = 1, text = green apple, data = one +__ydb_token = red, key = 2, text = red apple, data = two +__ydb_token = red, key = 4, text = red car, data = four +__ydb_token = yellow, key = 3, text = yellow apple, data = three +)"); + } + + Y_UNIT_TEST(BuildWithTextFromKey) { + TPortManager pm; + TServerSettings serverSettings(pm.GetPort(2134)); + serverSettings.SetDomainName("Root"); + + Tests::TServer::TPtr server = new TServer(serverSettings); + auto sender = server->GetRuntime()->AllocateEdgeActor(); + + server->GetRuntime()->SetLogPriority(NKikimrServices::TX_DATASHARD, NLog::PRI_DEBUG); + server->GetRuntime()->SetLogPriority(NKikimrServices::BUILD_INDEX, NLog::PRI_TRACE); + + InitRoot(server, sender); + + { // CreateMainTable + TShardedTableOptions options; + options.EnableOutOfOrder(true); + options.Shards(1); + options.AllowSystemColumnNames(false); + options.Columns({ + {"key", "Uint32", true, true}, + {"text", "String", true, true}, + {"subkey", "Uint32", true, true}, + {"data", "String", false, false}, + }); + CreateShardedTable(server, sender, "/Root", "table-main", options); + } + { // FillMainTable + ExecSQL(server, sender, R"( + UPSERT INTO `/Root/table-main` (key, text, subkey, data) VALUES + (1, "green apple", 11, "one"), + (2, "red apple", 22, "two"), + (3, "yellow apple", 33, "three"), + (4, "red car", 44, "four") + )"); + } + { // CreateIndexTable + TShardedTableOptions options; + options.EnableOutOfOrder(true); + options.Shards(1); + options.AllowSystemColumnNames(true); + options.Columns({ + {TokenColumn, NTableIndex::NFulltext::TokenTypeName, true, true}, + {"key", "Uint32", true, true}, + {"text", "String", true, true}, + {"subkey", "Uint32", true, true}, + {"data", "String", false, false}, + }); + CreateShardedTable(server, sender, "/Root", 
"table-index", options); + } + + auto result = DoBuild(server, sender, [](auto& request) { + request.AddDataColumns("data"); + }); + + UNIT_ASSERT_VALUES_EQUAL(result, R"(__ydb_token = apple, key = 1, text = green apple, subkey = 11, data = one +__ydb_token = apple, key = 2, text = red apple, subkey = 22, data = two +__ydb_token = apple, key = 3, text = yellow apple, subkey = 33, data = three +__ydb_token = car, key = 4, text = red car, subkey = 44, data = four +__ydb_token = green, key = 1, text = green apple, subkey = 11, data = one +__ydb_token = red, key = 2, text = red apple, subkey = 22, data = two +__ydb_token = red, key = 4, text = red car, subkey = 44, data = four +__ydb_token = yellow, key = 3, text = yellow apple, subkey = 33, data = three +)"); + } +} + +} diff --git a/ydb/core/tx/datashard/build_index/ut/ya.make b/ydb/core/tx/datashard/build_index/ut/ya.make index 8da7981617c..b71af3fef00 100644 --- a/ydb/core/tx/datashard/build_index/ut/ya.make +++ b/ydb/core/tx/datashard/build_index/ut/ya.make @@ -27,6 +27,7 @@ PEERDIR( YQL_LAST_ABI_VERSION() SRCS( + ut_fulltext.cpp ut_local_kmeans.cpp ut_prefix_kmeans.cpp ut_recompute_kmeans.cpp diff --git a/ydb/core/tx/datashard/datashard.h b/ydb/core/tx/datashard/datashard.h index 47becb18023..dabc4175395 100644 --- a/ydb/core/tx/datashard/datashard.h +++ b/ydb/core/tx/datashard/datashard.h @@ -367,6 +367,9 @@ namespace TEvDataShard { EvIncrementalRestoreResponse, + EvBuildFulltextIndexRequest, + EvBuildFulltextIndexResponse, + EvEnd }; @@ -1560,6 +1563,18 @@ namespace TEvDataShard { TEvDataShard::EvPrefixKMeansResponse> { }; + struct TEvBuildFulltextIndexRequest + : public TEventPB<TEvBuildFulltextIndexRequest, + NKikimrTxDataShard::TEvBuildFulltextIndexRequest, + TEvDataShard::EvBuildFulltextIndexRequest> { + }; + + struct TEvBuildFulltextIndexResponse + : public TEventPB<TEvBuildFulltextIndexResponse, + NKikimrTxDataShard::TEvBuildFulltextIndexResponse, + TEvDataShard::EvBuildFulltextIndexResponse> { + }; + 
struct TEvIncrementalRestoreResponse : public TEventPB<TEvIncrementalRestoreResponse, NKikimrTxDataShard::TEvIncrementalRestoreResponse, diff --git a/ydb/core/tx/datashard/datashard_impl.h b/ydb/core/tx/datashard/datashard_impl.h index 52f48f77cd6..29a967d6aaf 100644 --- a/ydb/core/tx/datashard/datashard_impl.h +++ b/ydb/core/tx/datashard/datashard_impl.h @@ -241,6 +241,7 @@ class TDataShard class TTxHandleSafeReshuffleKMeansScan; class TTxHandleSafeRecomputeKMeansScan; class TTxHandleSafeStatisticsScan; + class TTxHandleSafeBuildFulltextIndexScan; class TTxMediatorStateRestored; @@ -1342,6 +1343,8 @@ class TDataShard void HandleSafe(TEvDataShard::TEvLocalKMeansRequest::TPtr& ev, const TActorContext& ctx); void Handle(TEvDataShard::TEvPrefixKMeansRequest::TPtr& ev, const TActorContext& ctx); void HandleSafe(TEvDataShard::TEvPrefixKMeansRequest::TPtr& ev, const TActorContext& ctx); + void Handle(TEvDataShard::TEvBuildFulltextIndexRequest::TPtr& ev, const TActorContext& ctx); + void HandleSafe(TEvDataShard::TEvBuildFulltextIndexRequest::TPtr& ev, const TActorContext& ctx); void Handle(TEvDataShard::TEvCdcStreamScanRequest::TPtr& ev, const TActorContext& ctx); void Handle(TEvPrivate::TEvCdcStreamScanRegistered::TPtr& ev, const TActorContext& ctx); void Handle(TEvPrivate::TEvCdcStreamScanProgress::TPtr& ev, const TActorContext& ctx); @@ -3225,6 +3228,7 @@ protected: HFunc(TEvDataShard::TEvRecomputeKMeansRequest, Handle); HFunc(TEvDataShard::TEvLocalKMeansRequest, Handle); HFunc(TEvDataShard::TEvPrefixKMeansRequest, Handle); + HFunc(TEvDataShard::TEvBuildFulltextIndexRequest, Handle); HFunc(TEvDataShard::TEvCdcStreamScanRequest, Handle); HFunc(TEvPrivate::TEvCdcStreamScanRegistered, Handle); HFunc(TEvPrivate::TEvCdcStreamScanProgress, Handle); diff --git a/ydb/core/tx/datashard/ya.make b/ydb/core/tx/datashard/ya.make index c210e96e4f9..ee92b38d455 100644 --- a/ydb/core/tx/datashard/ya.make +++ b/ydb/core/tx/datashard/ya.make @@ -218,13 +218,14 @@ SRCS( 
wait_for_plan_unit.cpp wait_for_stream_clearance_unit.cpp - build_index/prefix_kmeans.cpp + build_index/fulltext.cpp build_index/kmeans_helper.cpp build_index/local_kmeans.cpp - build_index/sample_k.cpp - build_index/secondary_index.cpp + build_index/prefix_kmeans.cpp build_index/recompute_kmeans.cpp build_index/reshuffle_kmeans.cpp + build_index/sample_k.cpp + build_index/secondary_index.cpp build_index/unique_index.cpp ) diff --git a/ydb/core/tx/scheme_board/cache.cpp b/ydb/core/tx/scheme_board/cache.cpp index b9cb865161a..cea2875fe8a 100644 --- a/ydb/core/tx/scheme_board/cache.cpp +++ b/ydb/core/tx/scheme_board/cache.cpp @@ -917,6 +917,8 @@ class TSchemeCache: public TMonitorableActor<TSchemeCache> { return NSchemeCache::ETableKind::KindAsyncIndexTable; case NKikimrSchemeOp::EPathSubTypeVectorKmeansTreeIndexImplTable: return NSchemeCache::ETableKind::KindVectorIndexTable; + case NKikimrSchemeOp::EPathSubTypeFulltextIndexImplTable: + return NSchemeCache::ETableKind::KindFulltextIndexTable; default: return NSchemeCache::ETableKind::KindRegularTable; } diff --git a/ydb/core/tx/scheme_cache/scheme_cache.h b/ydb/core/tx/scheme_cache/scheme_cache.h index 7d57ca289ee..e37c7fa57e3 100644 --- a/ydb/core/tx/scheme_cache/scheme_cache.h +++ b/ydb/core/tx/scheme_cache/scheme_cache.h @@ -157,6 +157,7 @@ enum class ETableKind { KindSyncIndexTable = 2, KindAsyncIndexTable = 3, KindVectorIndexTable = 4, + KindFulltextIndexTable = 5, }; struct TSchemeCacheNavigate { diff --git a/ydb/core/tx/schemeshard/schemeshard__conditional_erase.cpp b/ydb/core/tx/schemeshard/schemeshard__conditional_erase.cpp index 74bb9febe8b..0eeed9290d5 100644 --- a/ydb/core/tx/schemeshard/schemeshard__conditional_erase.cpp +++ b/ydb/core/tx/schemeshard/schemeshard__conditional_erase.cpp @@ -1,6 +1,8 @@ #include "schemeshard_impl.h" #include <util/string/join.h> +#include <ydb/core/base/table_index.h> +#include <ydb/core/protos/flat_scheme_op.pb.h> namespace NKikimr { namespace NSchemeShard { @@ -239,7 
+241,7 @@ private: auto index = GetIndex(childPath); if (index->Type == NKikimrSchemeOp::EIndexTypeGlobalAsync - || index->Type == NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree) { + || !DoesIndexSupportTTL(index->Type)) { continue; } @@ -276,7 +278,7 @@ private: } static TVector<std::pair<ui32, ui32>> MakeColumnIds(TTableInfo::TPtr mainTable, TTableIndexInfo::TPtr index, TTableInfo::TPtr indexImplTable) { - Y_ABORT_UNLESS(index->Type != NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree); + Y_ABORT_UNLESS(DoesIndexSupportTTL(index->Type)); TVector<std::pair<ui32, ui32>> result; THashSet<TString> keys; diff --git a/ydb/core/tx/schemeshard/schemeshard__operation_consistent_copy_tables.cpp b/ydb/core/tx/schemeshard/schemeshard__operation_consistent_copy_tables.cpp index adc9ba3ead1..b571fdde52a 100644 --- a/ydb/core/tx/schemeshard/schemeshard__operation_consistent_copy_tables.cpp +++ b/ydb/core/tx/schemeshard/schemeshard__operation_consistent_copy_tables.cpp @@ -50,11 +50,23 @@ static std::optional<NKikimrSchemeOp::TModifyScheme> CreateIndexTask(NKikimr::NS *operation->MutableDataColumnNames()->Add() = dataColumn; } - if (indexInfo->Type == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { - *operation->MutableVectorIndexKmeansTreeDescription() = - std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(indexInfo->SpecializedIndexDescription); - } else if (!std::holds_alternative<std::monostate>(indexInfo->SpecializedIndexDescription)) { - return {}; + switch (indexInfo->Type) { + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + case NKikimrSchemeOp::EIndexTypeGlobalUnique: + // no specialized index description + Y_ASSERT(std::holds_alternative<std::monostate>(indexInfo->SpecializedIndexDescription)); + break; + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: + *operation->MutableVectorIndexKmeansTreeDescription() = + 
std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(indexInfo->SpecializedIndexDescription); + break; + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: + *operation->MutableFulltextIndexDescription() = + std::get<NKikimrSchemeOp::TFulltextIndexDescription>(indexInfo->SpecializedIndexDescription); + break; + default: + return {}; // reject } return scheme; diff --git a/ydb/core/tx/schemeshard/schemeshard__operation_copy_table.cpp b/ydb/core/tx/schemeshard/schemeshard__operation_copy_table.cpp index e8103b6a4e6..accf49d53c6 100644 --- a/ydb/core/tx/schemeshard/schemeshard__operation_copy_table.cpp +++ b/ydb/core/tx/schemeshard/schemeshard__operation_copy_table.cpp @@ -852,12 +852,24 @@ TVector<ISubOperation::TPtr> CreateCopyTable(TOperationId nextId, const TTxTrans for (const auto& dataColumn: indexInfo->IndexDataColumns) { *operation->MutableDataColumnNames()->Add() = dataColumn; } - if (indexInfo->Type == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { - *operation->MutableVectorIndexKmeansTreeDescription() = - std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(indexInfo->SpecializedIndexDescription); - } else if (!std::holds_alternative<std::monostate>(indexInfo->SpecializedIndexDescription)) { - return {CreateReject(nextId, NKikimrScheme::EStatus::StatusInvalidParameter, - TStringBuilder{} << "Copy table doesn't support table with index type " << indexInfo->Type)}; + + switch (indexInfo->Type) { + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + case NKikimrSchemeOp::EIndexTypeGlobalUnique: + // no specialized index description + Y_ASSERT(std::holds_alternative<std::monostate>(indexInfo->SpecializedIndexDescription)); + break; + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: + *operation->MutableVectorIndexKmeansTreeDescription() = + std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(indexInfo->SpecializedIndexDescription); + break; + case 
NKikimrSchemeOp::EIndexTypeGlobalFulltext: + *operation->MutableFulltextIndexDescription() = + std::get<NKikimrSchemeOp::TFulltextIndexDescription>(indexInfo->SpecializedIndexDescription); + break; + default: + return {CreateReject(nextId, NKikimrScheme::EStatus::StatusInvalidParameter, InvalidIndexType(indexInfo->Type))}; } result.push_back(CreateNewTableIndex(NextPartId(nextId, result), schema)); diff --git a/ydb/core/tx/schemeshard/schemeshard__operation_create_build_index.cpp b/ydb/core/tx/schemeshard/schemeshard__operation_create_build_index.cpp index a135183dfbc..133d9670700 100644 --- a/ydb/core/tx/schemeshard/schemeshard__operation_create_build_index.cpp +++ b/ydb/core/tx/schemeshard/schemeshard__operation_create_build_index.cpp @@ -40,6 +40,30 @@ TVector<ISubOperation::TPtr> CreateBuildIndex(TOperationId opId, const TTxTransa const auto& op = tx.GetInitiateIndexBuild(); const auto& indexDesc = op.GetIndex(); + switch (GetIndexType(indexDesc)) { + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + // no feature flag, everything is fine + break; + case NKikimrSchemeOp::EIndexTypeGlobalUnique: + if (!context.SS->EnableInitialUniqueIndex) { + return {CreateReject(opId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Adding a unique index to an existing table is disabled")}; + } + break; + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: + if (!context.SS->EnableVectorIndex) { + return {CreateReject(opId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Vector index support is disabled")}; + } + break; + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: + if (!context.SS->EnableFulltextIndex) { + return {CreateReject(opId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Fulltext index support is disabled")}; + } + break; + default: + return {CreateReject(opId, NKikimrScheme::EStatus::StatusPreconditionFailed, InvalidIndexType(indexDesc.GetType()))}; + } + const auto table = TPath::Resolve(op.GetTable(), 
context.SS); const auto index = table.Child(indexDesc.GetName()); { @@ -93,15 +117,14 @@ TVector<ISubOperation::TPtr> CreateBuildIndex(TOperationId opId, const TTxTransa } TVector<ISubOperation::TPtr> result; - const NKikimrSchemeOp::EIndexType indexType = indexDesc.HasType() ? indexDesc.GetType() : NKikimrSchemeOp::EIndexTypeGlobal; { auto outTx = TransactionTemplate(table.PathString(), NKikimrSchemeOp::EOperationType::ESchemeOpCreateTableIndex); *outTx.MutableLockGuard() = tx.GetLockGuard(); outTx.MutableCreateTableIndex()->CopyFrom(indexDesc); + outTx.MutableCreateTableIndex()->SetType(GetIndexType(indexDesc)); outTx.MutableCreateTableIndex()->SetState(NKikimrSchemeOp::EIndexStateWriteOnly); outTx.SetInternal(tx.GetInternal()); - outTx.MutableCreateTableIndex()->SetType(indexType); result.push_back(CreateNewTableIndex(NextPartId(opId, result), outTx)); } @@ -118,7 +141,7 @@ TVector<ISubOperation::TPtr> CreateBuildIndex(TOperationId opId, const TTxTransa } auto createImplTable = [&](NKikimrSchemeOp::TTableDescription&& implTableDesc) { - if (indexType != NKikimrSchemeOp::EIndexTypeGlobalUnique) { + if (GetIndexType(indexDesc) != NKikimrSchemeOp::EIndexTypeGlobalUnique) { implTableDesc.MutablePartitionConfig()->SetShadowData(true); } @@ -129,34 +152,56 @@ TVector<ISubOperation::TPtr> CreateBuildIndex(TOperationId opId, const TTxTransa return CreateInitializeBuildIndexImplTable(NextPartId(opId, result), outTx); }; - if (indexDesc.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { - const bool prefixVectorIndex = indexDesc.GetKeyColumnNames().size() > 1; - NKikimrSchemeOp::TTableDescription indexLevelTableDesc, indexPostingTableDesc, indexPrefixTableDesc; - // TODO After IndexImplTableDescriptions are persisted, this should be replaced with Y_ABORT_UNLESS - if (indexDesc.IndexImplTableDescriptionsSize() == 2 + prefixVectorIndex) { - indexLevelTableDesc = indexDesc.GetIndexImplTableDescriptions(0); - indexPostingTableDesc = 
indexDesc.GetIndexImplTableDescriptions(1); - if (prefixVectorIndex) { - indexPrefixTableDesc = indexDesc.GetIndexImplTableDescriptions(2); + switch (GetIndexType(indexDesc)) { + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + case NKikimrSchemeOp::EIndexTypeGlobalUnique: { + NKikimrSchemeOp::TTableDescription indexTableDesc; + // TODO After IndexImplTableDescriptions are persisted, this should be replaced with Y_ABORT_UNLESS + if (indexDesc.IndexImplTableDescriptionsSize() == 1) { + indexTableDesc = indexDesc.GetIndexImplTableDescriptions(0); } + auto implTableDesc = CalcImplTableDesc(tableInfo, implTableColumns, indexTableDesc); + // TODO if keep erase markers also speedup compaction or something else we can enable it for other impl tables too + implTableDesc.MutablePartitionConfig()->MutableCompactionPolicy()->SetKeepEraseMarkers(true); + result.push_back(createImplTable(std::move(implTableDesc))); + break; } - const THashSet<TString> indexDataColumns{indexDesc.GetDataColumnNames().begin(), indexDesc.GetDataColumnNames().end()}; - result.push_back(createImplTable(CalcVectorKmeansTreeLevelImplTableDesc(tableInfo->PartitionConfig(), indexLevelTableDesc))); - result.push_back(createImplTable(CalcVectorKmeansTreePostingImplTableDesc(tableInfo, tableInfo->PartitionConfig(), indexDataColumns, indexPostingTableDesc))); - if (prefixVectorIndex) { - const THashSet<TString> prefixColumns{indexDesc.GetKeyColumnNames().begin(), indexDesc.GetKeyColumnNames().end() - 1}; - result.push_back(createImplTable(CalcVectorKmeansTreePrefixImplTableDesc(prefixColumns, tableInfo, tableInfo->PartitionConfig(), implTableColumns, indexPrefixTableDesc))); + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: { + const bool prefixVectorIndex = indexDesc.GetKeyColumnNames().size() > 1; + NKikimrSchemeOp::TTableDescription indexLevelTableDesc, indexPostingTableDesc, indexPrefixTableDesc; + // TODO After IndexImplTableDescriptions are persisted, 
this should be replaced with Y_ABORT_UNLESS + if (indexDesc.IndexImplTableDescriptionsSize() == 2 + prefixVectorIndex) { + indexLevelTableDesc = indexDesc.GetIndexImplTableDescriptions(0); + indexPostingTableDesc = indexDesc.GetIndexImplTableDescriptions(1); + if (prefixVectorIndex) { + indexPrefixTableDesc = indexDesc.GetIndexImplTableDescriptions(2); + } + } + const THashSet<TString> indexDataColumns{indexDesc.GetDataColumnNames().begin(), indexDesc.GetDataColumnNames().end()}; + result.push_back(createImplTable(CalcVectorKmeansTreeLevelImplTableDesc(tableInfo->PartitionConfig(), indexLevelTableDesc))); + result.push_back(createImplTable(CalcVectorKmeansTreePostingImplTableDesc(tableInfo, tableInfo->PartitionConfig(), indexDataColumns, indexPostingTableDesc))); + if (prefixVectorIndex) { + const THashSet<TString> prefixColumns{indexDesc.GetKeyColumnNames().begin(), indexDesc.GetKeyColumnNames().end() - 1}; + result.push_back(createImplTable(CalcVectorKmeansTreePrefixImplTableDesc(prefixColumns, tableInfo, tableInfo->PartitionConfig(), implTableColumns, indexPrefixTableDesc))); + } + break; } - } else { - NKikimrSchemeOp::TTableDescription indexTableDesc; - // TODO After IndexImplTableDescriptions are persisted, this should be replaced with Y_ABORT_UNLESS - if (indexDesc.IndexImplTableDescriptionsSize() == 1) { - indexTableDesc = indexDesc.GetIndexImplTableDescriptions(0); + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: { + NKikimrSchemeOp::TTableDescription indexTableDesc; + // TODO After IndexImplTableDescriptions are persisted, this should be replaced with Y_ABORT_UNLESS + if (indexDesc.IndexImplTableDescriptionsSize() == 1) { + indexTableDesc = indexDesc.GetIndexImplTableDescriptions(0); + } + const THashSet<TString> indexDataColumns{indexDesc.GetDataColumnNames().begin(), indexDesc.GetDataColumnNames().end()}; + auto implTableDesc = CalcFulltextImplTableDesc(tableInfo, tableInfo->PartitionConfig(), indexDataColumns, indexTableDesc); + 
implTableDesc.MutablePartitionConfig()->MutableCompactionPolicy()->SetKeepEraseMarkers(true); + result.push_back(createImplTable(std::move(implTableDesc))); + break; } - auto implTableDesc = CalcImplTableDesc(tableInfo, implTableColumns, indexTableDesc); - // TODO if keep erase markers also speedup compaction or something else we can enable it for other impl tables too - implTableDesc.MutablePartitionConfig()->MutableCompactionPolicy()->SetKeepEraseMarkers(true); - result.push_back(createImplTable(std::move(implTableDesc))); + default: + Y_DEBUG_ABORT_S(NTableIndex::InvalidIndexType(indexDesc.GetType())); + break; } return result; diff --git a/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp b/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp index ea53e715266..fc648c9dda8 100644 --- a/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp +++ b/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp @@ -127,6 +127,41 @@ TVector<ISubOperation::TPtr> CreateIndexedTable(TOperationId nextId, const TTxTr TTableColumns baseTableColumns = ExtractInfo(baseTableDescription); for (auto& indexDescription: indexedTable.GetIndexDescription()) { const auto& indexName = indexDescription.GetName(); + + switch (GetIndexType(indexDescription)) { + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + // no feature flag, everything is fine + break; + case NKikimrSchemeOp::EIndexTypeGlobalUnique: + if (!context.SS->EnableInitialUniqueIndex) { + return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Unique constraint feature is disabled")}; + } + break; + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: { + if (!context.SS->EnableVectorIndex) { + return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Vector index support is disabled")}; + } + TString msg; + if 
(!NKikimr::NKMeans::ValidateSettings(indexDescription.GetVectorIndexKmeansTreeDescription().GetSettings(), msg)) { + return {CreateReject(nextId, NKikimrScheme::EStatus::StatusInvalidParameter, msg)}; + } + break; + } + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: { + if (!context.SS->EnableFulltextIndex) { + return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Fulltext index support is disabled")}; + } + TString msg; + if (!NKikimr::NFulltext::ValidateSettings(indexDescription.GetFulltextIndexDescription().GetSettings(), msg)) { + return {CreateReject(nextId, NKikimrScheme::EStatus::StatusInvalidParameter, msg)}; + } + break; + } + default: + return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, InvalidIndexType(indexDescription.GetType()))}; + } + bool uniformIndexTable = false; if (indexDescription.IndexImplTableDescriptionsSize()) { if (indexDescription.GetIndexImplTableDescriptions(0).HasUniformPartitionsCount()) { @@ -241,35 +276,6 @@ TVector<ISubOperation::TPtr> CreateIndexedTable(TOperationId nextId, const TTxTr } for (auto& indexDescription: indexedTable.GetIndexDescription()) { - const auto indexType = indexDescription.HasType() - ? 
indexDescription.GetType() - : NKikimrSchemeOp::EIndexTypeGlobal; - - switch (indexType) { - case NKikimrSchemeOp::EIndexTypeInvalid: - return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Invalid index type")}; - case NKikimrSchemeOp::EIndexTypeGlobal: - case NKikimrSchemeOp::EIndexTypeGlobalAsync: - // no feature flag, everything is fine - break; - case NKikimrSchemeOp::EIndexTypeGlobalUnique: { - if (!context.SS->EnableInitialUniqueIndex) { - return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Unique constraint feature is disabled")}; - } - break; - } - case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: { - if (!context.SS->EnableVectorIndex) { - return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Vector index support is disabled")}; - } - TString msg; - if (!NKikimr::NKMeans::ValidateSettings(indexDescription.GetVectorIndexKmeansTreeDescription().GetSettings(), msg)) { - return {CreateReject(nextId, NKikimrScheme::EStatus::StatusInvalidParameter, msg)}; - } - break; - } - } - { auto scheme = TransactionTemplate( tx.GetWorkingDir() + "/" + baseTableDescription.GetName(), @@ -279,7 +285,7 @@ TVector<ISubOperation::TPtr> CreateIndexedTable(TOperationId nextId, const TTxTr scheme.SetInternal(tx.GetInternal()); scheme.MutableCreateTableIndex()->CopyFrom(indexDescription); - scheme.MutableCreateTableIndex()->SetType(indexType); + scheme.MutableCreateTableIndex()->SetType(GetIndexType(indexDescription)); result.push_back(CreateNewTableIndex(NextPartId(nextId, result), scheme)); } @@ -298,32 +304,51 @@ TVector<ISubOperation::TPtr> CreateIndexedTable(TOperationId nextId, const TTxTr }; const auto& implTableColumns = indexes.at(indexDescription.GetName()); - if (indexDescription.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { - const bool prefixVectorIndex = indexDescription.GetKeyColumnNames().size() > 1; - NKikimrSchemeOp::TTableDescription 
userLevelDesc, userPostingDesc, userPrefixDesc; - if (indexDescription.IndexImplTableDescriptionsSize() == 2 + prefixVectorIndex) { - // This description provided by user to override partition policy - userLevelDesc = indexDescription.GetIndexImplTableDescriptions(0); - userPostingDesc = indexDescription.GetIndexImplTableDescriptions(1); - if (prefixVectorIndex) { - userPrefixDesc = indexDescription.GetIndexImplTableDescriptions(2); + switch (GetIndexType(indexDescription)) { + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + case NKikimrSchemeOp::EIndexTypeGlobalUnique: { + NKikimrSchemeOp::TTableDescription userIndexDesc; + if (indexDescription.IndexImplTableDescriptionsSize()) { + // This description provided by user to override partition policy + userIndexDesc = indexDescription.GetIndexImplTableDescriptions(0); } + result.push_back(createIndexImplTable(CalcImplTableDesc(baseTableDescription, implTableColumns, userIndexDesc))); + break; } - const THashSet<TString> indexDataColumns{indexDescription.GetDataColumnNames().begin(), indexDescription.GetDataColumnNames().end()}; - result.push_back(createIndexImplTable(CalcVectorKmeansTreeLevelImplTableDesc(baseTableDescription.GetPartitionConfig(), userLevelDesc))); - result.push_back(createIndexImplTable(CalcVectorKmeansTreePostingImplTableDesc(baseTableDescription, baseTableDescription.GetPartitionConfig(), indexDataColumns, userPostingDesc))); - if (prefixVectorIndex) { - const THashSet<TString> prefixColumns{indexDescription.GetKeyColumnNames().begin(), indexDescription.GetKeyColumnNames().end() - 1}; - result.push_back(createIndexImplTable(CalcVectorKmeansTreePrefixImplTableDesc(prefixColumns, baseTableDescription, baseTableDescription.GetPartitionConfig(), implTableColumns, userPrefixDesc))); + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: { + const bool prefixVectorIndex = indexDescription.GetKeyColumnNames().size() > 1; + NKikimrSchemeOp::TTableDescription 
userLevelDesc, userPostingDesc, userPrefixDesc; + if (indexDescription.IndexImplTableDescriptionsSize() == 2 + prefixVectorIndex) { + // This description provided by user to override partition policy + userLevelDesc = indexDescription.GetIndexImplTableDescriptions(0); + userPostingDesc = indexDescription.GetIndexImplTableDescriptions(1); + if (prefixVectorIndex) { + userPrefixDesc = indexDescription.GetIndexImplTableDescriptions(2); + } + } + const THashSet<TString> indexDataColumns{indexDescription.GetDataColumnNames().begin(), indexDescription.GetDataColumnNames().end()}; + result.push_back(createIndexImplTable(CalcVectorKmeansTreeLevelImplTableDesc(baseTableDescription.GetPartitionConfig(), userLevelDesc))); + result.push_back(createIndexImplTable(CalcVectorKmeansTreePostingImplTableDesc(baseTableDescription, baseTableDescription.GetPartitionConfig(), indexDataColumns, userPostingDesc))); + if (prefixVectorIndex) { + const THashSet<TString> prefixColumns{indexDescription.GetKeyColumnNames().begin(), indexDescription.GetKeyColumnNames().end() - 1}; + result.push_back(createIndexImplTable(CalcVectorKmeansTreePrefixImplTableDesc(prefixColumns, baseTableDescription, baseTableDescription.GetPartitionConfig(), implTableColumns, userPrefixDesc))); + } + break; } - } else { - NKikimrSchemeOp::TTableDescription userIndexDesc; - if (indexDescription.IndexImplTableDescriptionsSize()) { - // This description provided by user to override partition policy - userIndexDesc = indexDescription.GetIndexImplTableDescriptions(0); + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: { + NKikimrSchemeOp::TTableDescription userIndexDesc; + if (indexDescription.IndexImplTableDescriptionsSize()) { + // This description provided by user to override partition policy + userIndexDesc = indexDescription.GetIndexImplTableDescriptions(0); + } + const THashSet<TString> indexDataColumns{indexDescription.GetDataColumnNames().begin(), indexDescription.GetDataColumnNames().end()}; + 
result.push_back(createIndexImplTable(CalcFulltextImplTableDesc(baseTableDescription, baseTableDescription.GetPartitionConfig(), indexDataColumns, userIndexDesc))); + break; } - - result.push_back(createIndexImplTable(CalcImplTableDesc(baseTableDescription, implTableColumns, userIndexDesc))); + default: + Y_DEBUG_ABORT_S(NTableIndex::InvalidIndexType(indexDescription.GetType())); + break; } } diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index.cpp index 65d1401d0c4..8cb8de853d3 100644 --- a/ydb/core/tx/schemeshard/schemeshard_build_index.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_build_index.cpp @@ -1,6 +1,7 @@ #include "schemeshard_build_index.h" #include "schemeshard_impl.h" +#include <ydb/core/protos/flat_scheme_op.pb.h> namespace NKikimr { namespace NSchemeShard { @@ -95,11 +96,25 @@ void TSchemeShard::PersistCreateBuildIndex(NIceDb::TNiceDb& db, const TIndexBuil *serializableRepresentation.AddIndexImplTableDescriptions() = description; } - std::visit([&]<typename T>(const T& specializedDescription) { - if constexpr (std::is_same_v<T, NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>) { - *serializableRepresentation.MutableVectorIndexKmeansTreeDescription() = specializedDescription; - } - }, info.SpecializedIndexDescription); + switch (info.IndexType) { + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + case NKikimrSchemeOp::EIndexTypeGlobalUnique: + // no specialized index description + Y_ASSERT(std::holds_alternative<std::monostate>(info.SpecializedIndexDescription)); + break; + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: + *serializableRepresentation.MutableVectorIndexKmeansTreeDescription() = + std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(info.SpecializedIndexDescription); + break; + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: + *serializableRepresentation.MutableFulltextIndexDescription() = + 
std::get<NKikimrSchemeOp::TFulltextIndexDescription>(info.SpecializedIndexDescription); + break; + default: + Y_DEBUG_ABORT_S(NTableIndex::InvalidIndexType(info.IndexType)); + break; + } persistedBuildIndex.Update( NIceDb::TUpdate<Schema::IndexBuild::CreationConfig>(serializableRepresentation.SerializeAsString()) diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp index ef71de0641e..0b83aae3012 100644 --- a/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp @@ -5,6 +5,7 @@ #include "schemeshard_utils.h" // for NTableIndex::CommonCheck #include "schemeshard_xxport__helpers.h" +#include <ydb/core/protos/flat_scheme_op.pb.h> #include <ydb/core/ydb_convert/table_settings.h> namespace NKikimr::NSchemeShard { @@ -157,6 +158,11 @@ public: return makeReply(explain); } + if (tableInfo->IsTTLEnabled() && !DoesIndexSupportTTL(buildInfo->IndexType)) { + return Reply(Ydb::StatusIds::PRECONDITION_FAILED, + TStringBuilder() << buildInfo->IndexType << " index doesn't support TTL"); + } + NKikimrSchemeOp::TIndexBuildConfig tmpConfig; buildInfo->SerializeToProto(Self, &tmpConfig); const auto indexDesc = tmpConfig.GetIndex(); @@ -234,7 +240,7 @@ private: buildInfo.BuildKind = TIndexBuildInfo::EBuildKind::BuildSecondaryIndex; buildInfo.IndexType = NKikimrSchemeOp::EIndexType::EIndexTypeGlobalAsync; break; - case Ydb::Table::TableIndex::TypeCase::kGlobalUniqueIndex: + case Ydb::Table::TableIndex::TypeCase::kGlobalUniqueIndex: { if (!Self->EnableAddUniqueIndex) { explain = "Adding a unique index to an existing table is disabled"; return false; @@ -242,6 +248,7 @@ private: buildInfo.BuildKind = TIndexBuildInfo::EBuildKind::BuildSecondaryUniqueIndex; buildInfo.IndexType = NKikimrSchemeOp::EIndexType::EIndexTypeGlobalUnique; break; + } case Ydb::Table::TableIndex::TypeCase::kGlobalVectorKmeansTreeIndex: { if (!Self->EnableVectorIndex) 
{ explain = "Vector index support is disabled"; @@ -266,6 +273,21 @@ private: } break; } + case Ydb::Table::TableIndex::TypeCase::kGlobalFulltextIndex: { + if (!Self->EnableFulltextIndex) { + explain = "Fulltext index support is disabled"; + return false; + } + buildInfo.BuildKind = TIndexBuildInfo::EBuildKind::BuildFulltext; + buildInfo.IndexType = NKikimrSchemeOp::EIndexType::EIndexTypeGlobalFulltext; + NKikimrSchemeOp::TFulltextIndexDescription fulltextIndexDescription; + *fulltextIndexDescription.MutableSettings() = index.global_fulltext_index().fulltext_settings(); + if (!NKikimr::NFulltext::ValidateSettings(fulltextIndexDescription.GetSettings(), explain)) { + return false; + } + buildInfo.SpecializedIndexDescription = fulltextIndexDescription; + break; + } }; buildInfo.IndexName = index.name(); diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index_tx_base.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index_tx_base.cpp index 771239e89d8..675b2cde48e 100644 --- a/ydb/core/tx/schemeshard/schemeshard_build_index_tx_base.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_build_index_tx_base.cpp @@ -301,8 +301,11 @@ void TSchemeShard::TIndexBuilder::TTxBase::Fill(NKikimrIndexBuilder::TIndexBuild case NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree: *index.mutable_global_vector_kmeans_tree_index() = Ydb::Table::GlobalVectorKMeansTreeIndex(); break; + case NKikimrSchemeOp::EIndexType::EIndexTypeGlobalFulltext: + *index.mutable_global_fulltext_index() = Ydb::Table::GlobalFulltextIndex(); + break; default: - Y_ABORT("Unreachable"); + Y_ENSURE(false, InvalidIndexType(info.IndexType)); } } else if (info.IsBuildColumns()) { for(const auto& column : info.BuildColumns) { diff --git a/ydb/core/tx/schemeshard/schemeshard_impl.cpp b/ydb/core/tx/schemeshard/schemeshard_impl.cpp index 2b4c4214332..18d789c4d5d 100644 --- a/ydb/core/tx/schemeshard/schemeshard_impl.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_impl.cpp @@ -5089,6 +5089,7 @@ void 
TSchemeShard::OnActivateExecutor(const TActorContext &ctx) { EnableVectorIndex = appData->FeatureFlags.GetEnableVectorIndex(); EnableInitialUniqueIndex = appData->FeatureFlags.GetEnableUniqConstraint(); EnableAddUniqueIndex = appData->FeatureFlags.GetEnableAddUniqueIndex(); + EnableFulltextIndex = appData->FeatureFlags.GetEnableFulltextIndex(); EnableResourcePoolsOnServerless = appData->FeatureFlags.GetEnableResourcePoolsOnServerless(); EnableExternalDataSourcesOnServerless = appData->FeatureFlags.GetEnableExternalDataSourcesOnServerless(); EnableShred = appData->FeatureFlags.GetEnableDataErasure(); @@ -7806,6 +7807,7 @@ void TSchemeShard::ApplyConsoleConfigs(const NKikimrConfig::TFeatureFlags& featu EnableVectorIndex = featureFlags.GetEnableVectorIndex(); EnableInitialUniqueIndex = featureFlags.GetEnableUniqConstraint(); EnableAddUniqueIndex = featureFlags.GetEnableAddUniqueIndex(); + EnableFulltextIndex = featureFlags.GetEnableFulltextIndex(); EnableExternalDataSourcesOnServerless = featureFlags.GetEnableExternalDataSourcesOnServerless(); EnableShred = featureFlags.GetEnableDataErasure(); EnableExternalSourceSchemaInference = featureFlags.GetEnableExternalSourceSchemaInference(); diff --git a/ydb/core/tx/schemeshard/schemeshard_impl.h b/ydb/core/tx/schemeshard/schemeshard_impl.h index 654d61e73de..6fc6d569ca5 100644 --- a/ydb/core/tx/schemeshard/schemeshard_impl.h +++ b/ydb/core/tx/schemeshard/schemeshard_impl.h @@ -359,6 +359,7 @@ public: bool EnableVectorIndex = false; bool EnableInitialUniqueIndex = false; bool EnableAddUniqueIndex = false; + bool EnableFulltextIndex = false; bool EnableExternalDataSourcesOnServerless = false; bool EnableShred = false; bool EnableExternalSourceSchemaInference = false; diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.cpp b/ydb/core/tx/schemeshard/schemeshard_info_types.cpp index e7929c9ad09..f798da43ef4 100644 --- a/ydb/core/tx/schemeshard/schemeshard_info_types.cpp +++ 
b/ydb/core/tx/schemeshard/schemeshard_info_types.cpp @@ -6,10 +6,12 @@ #include <ydb/core/base/appdata.h> #include <ydb/core/base/channel_profiles.h> +#include <ydb/core/base/table_index.h> #include <ydb/core/base/tx_processing.h> #include <ydb/core/engine/minikql/flat_local_tx_factory.h> #include <ydb/core/engine/mkql_proto.h> #include <ydb/core/protos/config.pb.h> +#include <ydb/core/protos/flat_scheme_op.pb.h> #include <ydb/core/scheme/scheme_types_proto.h> #include <ydb/core/tablet/tablet_counters_aggregator.h> #include <ydb/core/tablet/tablet_counters_protobuf.h> @@ -618,8 +620,8 @@ TTableInfo::TAlterDataPtr TTableInfo::CreateAlterData( if (op.HasTTLSettings()) { for (const auto& indexDescription : op.GetTableIndexes()) { - if (indexDescription.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { - errStr = "Table with vector indexes doesn't support TTL"; + if (!DoesIndexSupportTTL(indexDescription.GetType())) { + errStr = TStringBuilder() << "Table with " << indexDescription.GetType() << " index doesn't support TTL"; return nullptr; } } @@ -2292,8 +2294,22 @@ void TIndexBuildInfo::SerializeToProto(TSchemeShard* ss, NKikimrSchemeOp::TIndex ImplTableDescriptions.end() }; - if (IndexType == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { - *index.MutableVectorIndexKmeansTreeDescription() = std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(SpecializedIndexDescription); + switch (IndexType) { + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + case NKikimrSchemeOp::EIndexTypeGlobalUnique: + // no specialized index description + Y_ASSERT(std::holds_alternative<std::monostate>(SpecializedIndexDescription)); + break; + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: + *index.MutableVectorIndexKmeansTreeDescription() = std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(SpecializedIndexDescription); + break; + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: + 
*index.MutableFulltextIndexDescription() = std::get<NKikimrSchemeOp::TFulltextIndexDescription>(SpecializedIndexDescription); + break; + default: + Y_DEBUG_ABORT_S(InvalidIndexType(IndexType)); + break; } } diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.h b/ydb/core/tx/schemeshard/schemeshard_info_types.h index 45a43b4772a..ef04a3c1286 100644 --- a/ydb/core/tx/schemeshard/schemeshard_info_types.h +++ b/ydb/core/tx/schemeshard/schemeshard_info_types.h @@ -8,6 +8,8 @@ #include "schemeshard_tx_infly.h" #include "schemeshard_types.h" +#include <util/generic/yexception.h> +#include <ydb/core/protos/flat_scheme_op.pb.h> #include <ydb/public/api/protos/ydb_cms.pb.h> #include <ydb/public/api/protos/ydb_coordination.pb.h> #include <ydb/public/api/protos/ydb_import.pb.h> @@ -17,6 +19,7 @@ #include <ydb/core/backup/common/encryption.h> #include <ydb/core/backup/common/metadata.h> #include <ydb/core/base/feature_flags.h> +#include <ydb/core/base/fulltext.h> #include <ydb/core/base/kmeans_clusters.h> #include <ydb/core/base/storage_pools.h> #include <ydb/core/base/table_index.h> @@ -2443,9 +2446,30 @@ struct TTableIndexInfo : public TSimpleRefCount<TTableIndexInfo> { , Type(type) , State(state) { - if (type == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { - Y_ENSURE(SpecializedIndexDescription.emplace<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>() - .ParseFromString(description)); + switch (type) { + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + case NKikimrSchemeOp::EIndexTypeGlobalUnique: + // no specialized index description + Y_ASSERT(description.empty()); + break; + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: { + auto success = SpecializedIndexDescription + .emplace<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>() + .ParseFromString(description); + Y_ENSURE(success, description); + break; + } + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: { + auto success = 
SpecializedIndexDescription + .emplace<NKikimrSchemeOp::TFulltextIndexDescription>() + .ParseFromString(description); + Y_ENSURE(success, description); + break; + } + default: + Y_DEBUG_ABORT_S(NTableIndex::InvalidIndexType(type)); + break; } } @@ -2494,8 +2518,21 @@ struct TTableIndexInfo : public TSimpleRefCount<TTableIndexInfo> { alterData->State = config.HasState() ? config.GetState() : EState::EIndexStateReady; - if (config.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { - alterData->SpecializedIndexDescription = config.GetVectorIndexKmeansTreeDescription(); + switch (GetIndexType(config)) { + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + case NKikimrSchemeOp::EIndexTypeGlobalUnique: + // no specialized index description + break; + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: + alterData->SpecializedIndexDescription = config.GetVectorIndexKmeansTreeDescription(); + break; + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: + alterData->SpecializedIndexDescription = config.GetFulltextIndexDescription(); + break; + default: + errMsg += InvalidIndexType(config.GetType()); + return nullptr; } return result; @@ -2510,7 +2547,9 @@ struct TTableIndexInfo : public TSimpleRefCount<TTableIndexInfo> { TTableIndexInfo::TPtr AlterData = nullptr; - std::variant<std::monostate, NKikimrSchemeOp::TVectorIndexKmeansTreeDescription> SpecializedIndexDescription; + std::variant<std::monostate, + NKikimrSchemeOp::TVectorIndexKmeansTreeDescription, + NKikimrSchemeOp::TFulltextIndexDescription> SpecializedIndexDescription; }; struct TCdcStreamSettings { @@ -3127,6 +3166,7 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> { BuildPrefixedVectorIndex = 12, BuildSecondaryUniqueIndex = 13, BuildColumns = 20, + BuildFulltext = 30, }; TActorId CreateSender; @@ -3155,7 +3195,9 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> { TString TargetName; 
TVector<NKikimrSchemeOp::TTableDescription> ImplTableDescriptions; - std::variant<std::monostate, NKikimrSchemeOp::TVectorIndexKmeansTreeDescription> SpecializedIndexDescription; + std::variant<std::monostate, + NKikimrSchemeOp::TVectorIndexKmeansTreeDescription, + NKikimrSchemeOp::TFulltextIndexDescription> SpecializedIndexDescription; struct TKMeans { // TODO(mbkkt) move to TVectorIndexKmeansTreeDescription @@ -3606,11 +3648,17 @@ public: indexInfo->Clusters = NKikimr::NKMeans::CreateClusters(desc.settings().settings(), indexInfo->KMeans.Rounds, createError); Y_ENSURE(indexInfo->Clusters, createError); indexInfo->SpecializedIndexDescription = std::move(desc); - } break; + break; + } + case NKikimrSchemeOp::TIndexCreationConfig::kFulltextIndexDescription: { + auto& desc = *creationConfig.MutableFulltextIndexDescription(); + indexInfo->SpecializedIndexDescription = std::move(desc); + break; + } case NKikimrSchemeOp::TIndexCreationConfig::SPECIALIZEDINDEXDESCRIPTION_NOT_SET: /* do nothing */ break; - } + } } LOG_DEBUG_S(TlsActivationContext->AsActorContext(), NKikimrServices::BUILD_INDEX, diff --git a/ydb/core/tx/schemeshard/schemeshard_path_describer.cpp b/ydb/core/tx/schemeshard/schemeshard_path_describer.cpp index e1c0bf16e34..73d2ae506b1 100644 --- a/ydb/core/tx/schemeshard/schemeshard_path_describer.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_path_describer.cpp @@ -1,5 +1,6 @@ #include "schemeshard_path_describer.h" +#include <ydb/core/protos/flat_scheme_op.pb.h> #include <ydb/public/api/protos/annotations/sensitive.pb.h> #include <ydb/core/base/appdata.h> @@ -216,8 +217,10 @@ TPathElement::EPathSubType TPathDescriber::CalcPathSubType(const TPath& path) { return TPathElement::EPathSubType::EPathSubTypeSyncIndexImplTable; case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: return TPathElement::EPathSubType::EPathSubTypeVectorKmeansTreeIndexImplTable; + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: + return 
TPathElement::EPathSubType::EPathSubTypeFulltextIndexImplTable; default: - Y_DEBUG_ABORT("%s", (TStringBuilder() << "unexpected indexInfo->Type# " << indexInfo->Type).data()); + Y_DEBUG_ABORT_S(NTableIndex::InvalidIndexType(indexInfo->Type)); return TPathElement::EPathSubType::EPathSubTypeEmpty; } } else if (parentPath.IsCdcStream()) { @@ -1471,14 +1474,23 @@ void TSchemeShard::DescribeTableIndex(const TPathId& pathId, const TString& name } entry.SetDataSize(dataSize); - if (indexInfo->Type == NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree) { - if (const auto* vectorIndexKmeansTreeDescription = std::get_if<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(&indexInfo->SpecializedIndexDescription)) { - *entry.MutableVectorIndexKmeansTreeDescription() = *vectorIndexKmeansTreeDescription; - } else { - Y_FAIL_S("SpecializedIndexDescription should be set"); - } + switch (indexInfo->Type) { + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + case NKikimrSchemeOp::EIndexTypeGlobalUnique: + // no specialized index description + Y_ASSERT(std::holds_alternative<std::monostate>(indexInfo->SpecializedIndexDescription)); + break; + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: + *entry.MutableVectorIndexKmeansTreeDescription() = std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(indexInfo->SpecializedIndexDescription); + break; + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: + *entry.MutableFulltextIndexDescription() = std::get<NKikimrSchemeOp::TFulltextIndexDescription>(indexInfo->SpecializedIndexDescription); + break; + default: + Y_DEBUG_ABORT_S(NTableIndex::InvalidIndexType(indexInfo->Type)); + break; } - } void TSchemeShard::DescribeCdcStream(const TPathId& pathId, const TString& name, diff --git a/ydb/core/tx/schemeshard/schemeshard_schema.h b/ydb/core/tx/schemeshard/schemeshard_schema.h index 6122fb4ab53..c829f2ea41d 100644 --- a/ydb/core/tx/schemeshard/schemeshard_schema.h +++ 
b/ydb/core/tx/schemeshard/schemeshard_schema.h @@ -17,6 +17,9 @@ namespace NKikimr::NSchemeShard { inline constexpr auto ClusterIdTypeId = NScheme::NTypeIds::Uint64; +// TODO: support utf-8 in fulltext index +inline constexpr auto TokenTypeId = NScheme::NTypeIds::String; + struct Schema : NIceDb::Schema { struct Paths : Table<1> { struct Id : Column<1, NScheme::NTypeIds::Uint64> { using Type = TLocalPathId; }; diff --git a/ydb/core/tx/schemeshard/schemeshard_utils.cpp b/ydb/core/tx/schemeshard/schemeshard_utils.cpp index 95cd196bdef..7ef907b8cd0 100644 --- a/ydb/core/tx/schemeshard/schemeshard_utils.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_utils.cpp @@ -315,6 +315,36 @@ auto CalcVectorKmeansTreePrefixImplTableDescImpl( return implTableDesc; } +auto CalcFulltextImplTableDescImpl( + const auto& baseTable, + const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig, + const THashSet<TString>& indexDataColumns, + const NKikimrSchemeOp::TTableDescription& indexTableDesc) +{ + auto tableColumns = ExtractInfo(baseTable); + THashSet<TString> indexColumns = indexDataColumns; + for (const auto & keyColumn: tableColumns.Keys) { + indexColumns.insert(keyColumn); + } + + NKikimrSchemeOp::TTableDescription implTableDesc; + implTableDesc.SetName(NTableIndex::ImplTable); + SetImplTablePartitionConfig(baseTablePartitionConfig, indexTableDesc, implTableDesc); + { + auto tokenColumn = implTableDesc.AddColumns(); + tokenColumn->SetName(NFulltext::TokenColumn); + tokenColumn->SetType(NFulltext::TokenTypeName); + tokenColumn->SetTypeId(NSchemeShard::TokenTypeId); + tokenColumn->SetNotNull(true); + } + implTableDesc.AddKeyColumnNames(NFulltext::TokenColumn); + FillIndexImplTableColumns(GetColumns(baseTable), tableColumns.Keys, indexColumns, implTableDesc); + + implTableDesc.SetSystemColumnNamesAllowed(true); + + return implTableDesc; +} + } void FillIndexTableColumns( @@ -421,6 +451,24 @@ NKikimrSchemeOp::TTableDescription CalcVectorKmeansTreePrefixImplTableDesc( return 
CalcVectorKmeansTreePrefixImplTableDescImpl(indexKeyColumns, baseTableDescr, baseTablePartitionConfig, implTableColumns, indexTableDesc); } +NKikimrSchemeOp::TTableDescription CalcFulltextImplTableDesc( + const NSchemeShard::TTableInfo::TPtr& baseTableInfo, + const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig, + const THashSet<TString>& indexDataColumns, + const NKikimrSchemeOp::TTableDescription& indexTableDesc) +{ + return CalcFulltextImplTableDescImpl(baseTableInfo, baseTablePartitionConfig, indexDataColumns, indexTableDesc); +} + +NKikimrSchemeOp::TTableDescription CalcFulltextImplTableDesc( + const NKikimrSchemeOp::TTableDescription& baseTableDescr, + const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig, + const THashSet<TString>& indexDataColumns, + const NKikimrSchemeOp::TTableDescription& indexTableDesc) +{ + return CalcFulltextImplTableDescImpl(baseTableDescr, baseTablePartitionConfig, indexDataColumns, indexTableDesc); +} + bool ExtractTypes(const NKikimrSchemeOp::TTableDescription& baseTableDescr, TColumnTypes& columnTypes, TString& explain) { const NScheme::TTypeRegistry* typeRegistry = AppData()->TypeRegistry; Y_ABORT_UNLESS(typeRegistry); diff --git a/ydb/core/tx/schemeshard/schemeshard_utils.h b/ydb/core/tx/schemeshard/schemeshard_utils.h index a1e04900fe5..5c490b80771 100644 --- a/ydb/core/tx/schemeshard/schemeshard_utils.h +++ b/ydb/core/tx/schemeshard/schemeshard_utils.h @@ -3,6 +3,7 @@ #include "schemeshard_info_types.h" #include "schemeshard_types.h" +#include <ydb/core/base/fulltext.h> #include <ydb/core/base/table_index.h> #include <yql/essentials/minikql/mkql_type_ops.h> @@ -92,6 +93,18 @@ NKikimrSchemeOp::TTableDescription CalcVectorKmeansTreePrefixImplTableDesc( const TTableColumns& implTableColumns, const NKikimrSchemeOp::TTableDescription& indexTableDesc); +NKikimrSchemeOp::TTableDescription CalcFulltextImplTableDesc( + const NSchemeShard::TTableInfo::TPtr& baseTableInfo, + const 
NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig, + const THashSet<TString>& indexDataColumns, + const NKikimrSchemeOp::TTableDescription& indexTableDesc); + +NKikimrSchemeOp::TTableDescription CalcFulltextImplTableDesc( + const NKikimrSchemeOp::TTableDescription& baseTableDescr, + const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig, + const THashSet<TString>& indexDataColumns, + const NKikimrSchemeOp::TTableDescription& indexTableDesc); + TTableColumns ExtractInfo(const NSchemeShard::TTableInfo::TPtr& tableInfo); TTableColumns ExtractInfo(const NKikimrSchemeOp::TTableDescription& tableDesc); TIndexColumns ExtractInfo(const NKikimrSchemeOp::TIndexCreationConfig& indexDesc); @@ -133,7 +146,7 @@ bool CommonCheck(const TTableDesc& tableDesc, const NKikimrSchemeOp::TIndexCreat return false; } - if (!IsCompatibleIndex(indexDesc.GetType(), baseTableColumns, indexKeys, error)) { + if (!IsCompatibleIndex(GetIndexType(indexDesc), baseTableColumns, indexKeys, error)) { status = NKikimrScheme::EStatus::StatusInvalidParameter; return false; } @@ -144,29 +157,66 @@ bool CommonCheck(const TTableDesc& tableDesc, const NKikimrSchemeOp::TIndexCreat return false; } - implTableColumns = CalcTableImplDescription(indexDesc.GetType(), baseTableColumns, indexKeys); - - if (indexDesc.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { - //We have already checked this in IsCompatibleIndex - Y_ABORT_UNLESS(indexKeys.KeyColumns.size() >= 1); - - if (indexKeys.KeyColumns.size() > 1 && !IsCompatibleKeyTypes(baseColumnTypes, implTableColumns, uniformTable, error)) { - status = NKikimrScheme::EStatus::StatusInvalidParameter; - return false; + implTableColumns = CalcTableImplDescription(GetIndexType(indexDesc), baseTableColumns, indexKeys); + + switch (GetIndexType(indexDesc)) { + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + case NKikimrSchemeOp::EIndexTypeGlobalUnique: + if 
(!IsCompatibleKeyTypes(baseColumnTypes, implTableColumns, uniformTable, error)) { + status = NKikimrScheme::EStatus::StatusInvalidParameter; + return false; + } + break; + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: { + // We have already checked this in IsCompatibleIndex + Y_ABORT_UNLESS(indexKeys.KeyColumns.size() >= 1); + + if (indexKeys.KeyColumns.size() > 1 && !IsCompatibleKeyTypes(baseColumnTypes, implTableColumns, uniformTable, error)) { + status = NKikimrScheme::EStatus::StatusInvalidParameter; + return false; + } + + const TString& embeddingColumnName = indexKeys.KeyColumns.back(); + Y_ABORT_UNLESS(baseColumnTypes.contains(embeddingColumnName)); + auto typeInfo = baseColumnTypes.at(embeddingColumnName); + + if (typeInfo.GetTypeId() != NScheme::NTypeIds::String) { + status = NKikimrScheme::EStatus::StatusInvalidParameter; + error = TStringBuilder() << "Embedding column '" << embeddingColumnName << "' expected type 'String' but got " << NScheme::TypeName(typeInfo); + return false; + } + break; } - - const TString& indexColumnName = indexKeys.KeyColumns.back(); - Y_ABORT_UNLESS(baseColumnTypes.contains(indexColumnName)); - auto typeInfo = baseColumnTypes.at(indexColumnName); - - if (typeInfo.GetTypeId() != NScheme::NTypeIds::String) { + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: { + // We have already checked this in IsCompatibleIndex + Y_ABORT_UNLESS(indexKeys.KeyColumns.size() >= 1); + + // Here we only check that fulltext index columns matches table description + // the rest will be checked in NFulltext::ValidateSettings (called separately outside of CommonCheck) + if (!NKikimr::NFulltext::ValidateColumnsMatches(indexKeys.KeyColumns, indexDesc.GetFulltextIndexDescription().GetSettings(), error)) { + status = NKikimrScheme::EStatus::StatusInvalidParameter; + return false; + } + + for (const auto& column : indexDesc.GetFulltextIndexDescription().GetSettings().columns()) { + if (column.has_analyzers()) { + auto typeInfo = 
baseColumnTypes.at(column.column()); + // TODO: support utf-8 in fulltext index + if (typeInfo.GetTypeId() != NScheme::NTypeIds::String) { + status = NKikimrScheme::EStatus::StatusInvalidParameter; + error = TStringBuilder() << "Fulltext column '" << column.column() << "' expected type 'String' but got " << NScheme::TypeName(typeInfo); + return false; + } + } + } + + break; + } + default: status = NKikimrScheme::EStatus::StatusInvalidParameter; - error = TStringBuilder() << "Index column '" << indexColumnName << "' expected type 'String' but got " << NScheme::TypeName(typeInfo); + error = InvalidIndexType(indexDesc.GetType()); return false; - } - } else if (!IsCompatibleKeyTypes(baseColumnTypes, implTableColumns, uniformTable, error)) { - status = NKikimrScheme::EStatus::StatusInvalidParameter; - return false; } if (implTableColumns.Keys.size() > schemeLimits.MaxTableKeyColumns) { diff --git a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp index 6927f9054ee..5193a2ca149 100644 --- a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp +++ b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp @@ -1,5 +1,6 @@ #include "ls_checks.h" +#include <google/protobuf/text_format.h> #include <ydb/public/api/protos/ydb_cms.pb.h> #include <ydb/public/api/protos/ydb_coordination.pb.h> #include <ydb/public/lib/scheme_types/scheme_type_id.h> @@ -919,6 +920,41 @@ TCheckFunc KMeansTreeDescription(Ydb::Table::VectorIndexSettings_Metric metric, }; } +TCheckFunc SpecializedIndexDescription(const TString& proto) { + return [=] (const NKikimrScheme::TEvDescribeSchemeResult& record) { + switch (record.GetPathDescription().GetTableIndex().GetSpecializedIndexDescriptionCase()) { + case NKikimrSchemeOp::TIndexDescription::kVectorIndexKmeansTreeDescription: { + auto actual = record.GetPathDescription().GetTableIndex().GetVectorIndexKmeansTreeDescription().GetSettings(); + Ydb::Table::KMeansTreeSettings expected; + 
UNIT_ASSERT(google::protobuf::TextFormat::ParseFromString(proto, &expected)); + UNIT_ASSERT_C(google::protobuf::util::MessageDifferencer::Equals(actual, expected), + TStringBuilder() << "Expected" + << expected.ShortDebugString() + << " but got " + << actual.ShortDebugString()); + break; + } + case NKikimrSchemeOp::TIndexDescription::kFulltextIndexDescription: { + auto actual = record.GetPathDescription().GetTableIndex().GetFulltextIndexDescription().GetSettings(); + Ydb::Table::FulltextIndexSettings expected; + UNIT_ASSERT(google::protobuf::TextFormat::ParseFromString(proto, &expected)); + UNIT_ASSERT_C(google::protobuf::util::MessageDifferencer::Equals(actual, expected), + TStringBuilder() << "Expected" + << expected.ShortDebugString() + << " but got " + << actual.ShortDebugString()); + break; + } + case NKikimrSchemeOp::TIndexDescription::SPECIALIZEDINDEXDESCRIPTION_NOT_SET: { + UNIT_ASSERT_C(proto == "SPECIALIZEDINDEXDESCRIPTION_NOT_SET", + TStringBuilder() << "Expected" + << proto + << " but got SPECIALIZEDINDEXDESCRIPTION_NOT_SET"); + break; + } + } + }; +} TCheckFunc SequenceName(const TString& name) { return [=] (const NKikimrScheme::TEvDescribeSchemeResult& record) { diff --git a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.h b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.h index 64a5bd1f35f..5c112f69195 100644 --- a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.h +++ b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.h @@ -175,6 +175,8 @@ namespace NLs { ui32 levels ); + TCheckFunc SpecializedIndexDescription(const TString& proto); + TCheckFunc SequenceName(const TString& name); TCheckFunc SequenceIncrement(i64 increment); TCheckFunc SequenceMaxValue(i64 maxValue); diff --git a/ydb/core/tx/schemeshard/ut_helpers/test_env.cpp b/ydb/core/tx/schemeshard/ut_helpers/test_env.cpp index 6ce43b1547e..ab528d3565a 100644 --- a/ydb/core/tx/schemeshard/ut_helpers/test_env.cpp +++ b/ydb/core/tx/schemeshard/ut_helpers/test_env.cpp @@ -594,6 +594,7 @@ 
NSchemeShardUT_Private::TTestEnv::TTestEnv(TTestActorRuntime& runtime, const TTe app.FeatureFlags.SetEnableTableDatetime64(true); app.FeatureFlags.SetEnableVectorIndex(true); app.FeatureFlags.SetEnableAddUniqueIndex(true); + app.FeatureFlags.SetEnableFulltextIndex(true); app.FeatureFlags.SetEnableColumnStore(true); app.FeatureFlags.SetEnableStrictAclCheck(opts.EnableStrictAclCheck_); app.SetEnableMoveIndex(opts.EnableMoveIndex_); diff --git a/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp b/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp new file mode 100644 index 00000000000..4b5b2740545 --- /dev/null +++ b/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp @@ -0,0 +1,340 @@ +#include <ydb/core/base/path.h> +#include <ydb/core/change_exchange/change_exchange.h> +#include <ydb/core/scheme/scheme_tablecell.h> +#include <ydb/core/testlib/tablet_helpers.h> +#include <ydb/core/tx/schemeshard/schemeshard_utils.h> +#include <ydb/core/tx/schemeshard/ut_helpers/helpers.h> + + +using namespace NKikimr; +using namespace NSchemeShard; +using namespace NSchemeShardUT_Private; +using namespace NKikimr::NTableIndex; +using namespace NKikimr::NTableIndex::NFulltext; + +Y_UNIT_TEST_SUITE(TFulltextIndexTests) { + Y_UNIT_TEST(CreateTable) { + TTestBasicRuntime runtime; + TTestEnv env(runtime); + ui64 txId = 100; + + TString fulltextSettings = R"( + layout: FLAT + columns: { + column: "text" + analyzers: { + tokenizer: STANDARD + use_filter_lowercase: true + } + } + )"; + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"( + TableDescription { + Name: "texts" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "text" Type: "String" } + Columns { Name: "covered" Type: "String" } + Columns { Name: "another" Type: "Uint64" } + KeyColumnNames: ["id"] + } + IndexDescription { + Name: "idx_fulltext" + KeyColumnNames: ["text"] + DataColumnNames: ["covered"] + Type: EIndexTypeGlobalFulltext + FulltextIndexDescription: { + Settings: { + %s + } + } + } + )", 
fulltextSettings.c_str())); + env.TestWaitNotification(runtime, txId); + + NKikimrSchemeOp::TDescribeOptions opts; + opts.SetReturnChildren(true); + opts.SetShowPrivateTable(true); + Cout << DescribePath(runtime, "/MyRoot/texts/idx_fulltext/indexImplTable", opts).DebugString() << Endl; + + for (ui32 reboot = 0; reboot < 2; reboot++) { + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{ + NLs::PathExist, + NLs::IndexType(NKikimrSchemeOp::EIndexTypeGlobalFulltext), + NLs::IndexState(NKikimrSchemeOp::EIndexStateReady), + NLs::IndexKeys({"text"}), + NLs::IndexDataColumns({"covered"}), + NLs::SpecializedIndexDescription(fulltextSettings), + NLs::ChildrenCount(1), + }); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext/indexImplTable"),{ + NLs::PathExist, + NLs::CheckColumns("indexImplTable", + { NTableIndex::NFulltext::TokenColumn, "id", "covered" }, {}, + { NTableIndex::NFulltext::TokenColumn, "id" }, true) }); + + Cerr << "Reboot SchemeShard.." 
<< Endl; + TActorId sender = runtime.AllocateEdgeActor(); + RebootTablet(runtime, TTestTxConfig::SchemeShard, sender); + } + } + + Y_UNIT_TEST(CreateTablePrefix) { // not supported for now, maybe later + TTestBasicRuntime runtime; + TTestEnv env(runtime); + ui64 txId = 100; + + TString fulltextSettings = R"( + layout: FLAT + columns: { + column: "text" + analyzers: { + tokenizer: STANDARD + use_filter_lowercase: true + } + } + )"; + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"( + TableDescription { + Name: "texts" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "text" Type: "String" } + Columns { Name: "covered" Type: "String" } + Columns { Name: "another" Type: "Uint64" } + KeyColumnNames: ["id"] + } + IndexDescription { + Name: "idx_fulltext" + KeyColumnNames: [ "another", "text"] + DataColumnNames: ["covered"] + Type: EIndexTypeGlobalFulltext + FulltextIndexDescription: { + Settings: { + %s + } + } + } + )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter}); + env.TestWaitNotification(runtime, txId); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{ + NLs::PathNotExist, + }); + } + + Y_UNIT_TEST(CreateTableMultipleColumns) { // not supported for now, maybe later + TTestBasicRuntime runtime; + TTestEnv env(runtime); + ui64 txId = 100; + + TString fulltextSettings = R"( + layout: FLAT + columns: { + column: "text1" + analyzers: { + tokenizer: STANDARD + use_filter_lowercase: true + } + } + columns: { + column: "text2" + analyzers: { + tokenizer: STANDARD + use_filter_lowercase: true + } + } + )"; + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"( + TableDescription { + Name: "texts" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "text1" Type: "String" } + Columns { Name: "text2" Type: "String" } + Columns { Name: "covered" Type: "String" } + Columns { Name: "another" Type: "Uint64" } + KeyColumnNames: ["id"] + } + IndexDescription { + Name: "idx_fulltext" + 
KeyColumnNames: ["text1", "text2"] + DataColumnNames: ["covered"] + Type: EIndexTypeGlobalFulltext + FulltextIndexDescription: { + Settings: { + %s + } + } + } + )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter}); + env.TestWaitNotification(runtime, txId); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{ + NLs::PathNotExist, + }); + } + + Y_UNIT_TEST(CreateTableNotText) { + TTestBasicRuntime runtime; + TTestEnv env(runtime); + ui64 txId = 100; + + TString fulltextSettings = R"( + layout: FLAT + columns: { + column: "text" + analyzers: { + tokenizer: STANDARD + use_filter_lowercase: true + } + } + )"; + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"( + TableDescription { + Name: "texts" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "text" Type: "Uint64" } + Columns { Name: "covered" Type: "String" } + Columns { Name: "another" Type: "Uint64" } + KeyColumnNames: ["id"] + } + IndexDescription { + Name: "idx_fulltext" + KeyColumnNames: ["text"] + DataColumnNames: ["covered"] + Type: EIndexTypeGlobalFulltext + FulltextIndexDescription: { + Settings: { + %s + } + } + } + )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter}); + env.TestWaitNotification(runtime, txId); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{ + NLs::PathNotExist, + }); + } + + Y_UNIT_TEST(CreateTableColumnsMismatch) { + TTestBasicRuntime runtime; + TTestEnv env(runtime); + ui64 txId = 100; + + TString fulltextSettings = R"( + layout: FLAT + columns: { + column: "text_wrong" + analyzers: { + tokenizer: STANDARD + use_filter_lowercase: true + } + } + )"; + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"( + TableDescription { + Name: "texts" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "text" Type: "String" } + Columns { Name: "covered" Type: "String" } + Columns { Name: "another" Type: "Uint64" } + KeyColumnNames: ["id"] + } + 
IndexDescription { + Name: "idx_fulltext" + KeyColumnNames: ["text"] + DataColumnNames: ["covered"] + Type: EIndexTypeGlobalFulltext + FulltextIndexDescription: { + Settings: { + %s + } + } + } + )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter}); + env.TestWaitNotification(runtime, txId); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{ + NLs::PathNotExist, + }); + } + + Y_UNIT_TEST(CreateTableNoColumnsSettings) { + TTestBasicRuntime runtime; + TTestEnv env(runtime); + ui64 txId = 100; + + TString fulltextSettings = R"( + layout: FLAT + )"; + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"( + TableDescription { + Name: "texts" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "text" Type: "String" } + Columns { Name: "covered" Type: "String" } + Columns { Name: "another" Type: "Uint64" } + KeyColumnNames: ["id"] + } + IndexDescription { + Name: "idx_fulltext" + KeyColumnNames: ["text"] + DataColumnNames: ["covered"] + Type: EIndexTypeGlobalFulltext + FulltextIndexDescription: { + Settings: { + %s + } + } + } + )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter}); + env.TestWaitNotification(runtime, txId); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{ + NLs::PathNotExist, + }); + } + + Y_UNIT_TEST(CreateTableUnsupportedSettings) { + TTestBasicRuntime runtime; + TTestEnv env(runtime); + ui64 txId = 100; + + TString fulltextSettings = R"( + layout: FLAT + columns: { + column: "text" + analyzers: { + tokenizer: STANDARD + use_filter_edge_ngram: true + } + } + )"; + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"( + TableDescription { + Name: "texts" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "text" Type: "String" } + Columns { Name: "covered" Type: "String" } + Columns { Name: "another" Type: "Uint64" } + KeyColumnNames: ["id"] + } + IndexDescription { + Name: "idx_fulltext" + KeyColumnNames: ["text"] + 
DataColumnNames: ["covered"] + Type: EIndexTypeGlobalFulltext + FulltextIndexDescription: { + Settings: { + %s + } + } + } + )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter}); + env.TestWaitNotification(runtime, txId); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{ + NLs::PathNotExist, + }); + } +} diff --git a/ydb/core/tx/schemeshard/ut_index/ya.make b/ydb/core/tx/schemeshard/ut_index/ya.make index ddd365aeee7..cfb8db9df7b 100644 --- a/ydb/core/tx/schemeshard/ut_index/ya.make +++ b/ydb/core/tx/schemeshard/ut_index/ya.make @@ -22,6 +22,7 @@ SRCS( ut_async_index.cpp ut_unique_index.cpp ut_vector_index.cpp + ut_fulltext_index.cpp ) YQL_LAST_ABI_VERSION() diff --git a/ydb/core/ydb_convert/table_description.cpp b/ydb/core/ydb_convert/table_description.cpp index 43596f2a8a7..b882896a8ca 100644 --- a/ydb/core/ydb_convert/table_description.cpp +++ b/ydb/core/ydb_convert/table_description.cpp @@ -5,6 +5,7 @@ #include <ydb/core/base/appdata.h> #include <ydb/core/base/path.h> +#include <ydb/core/base/table_index.h> #include <ydb/core/engine/mkql_proto.h> #include <ydb/core/formats/arrow/switch/switch_type.h> #include <ydb/core/protos/follower_group.pb.h> @@ -1088,7 +1089,18 @@ void FillIndexDescriptionImpl(TYdbProto& out, const NKikimrSchemeOp::TTableDescr break; } + case NKikimrSchemeOp::EIndexTypeGlobalFulltext: + FillGlobalIndexSettings( + *index->mutable_global_fulltext_index()->mutable_settings(), + tableIndex.GetIndexImplTableDescriptions(0) + ); + + *index->mutable_global_fulltext_index()->mutable_fulltext_settings() = tableIndex.GetFulltextIndexDescription().GetSettings(); + + break; default: + Y_DEBUG_ABORT_S(NTableIndex::InvalidIndexType(tableIndex.GetType())); + break; }; @@ -1141,7 +1153,6 @@ bool FillIndexDescription(NKikimrSchemeOp::TIndexedTableCreationConfig& out, } // specific fields - std::vector<NKikimrSchemeOp::TTableDescription> indexImplTableDescriptionsVector; switch (index.type_case()) { case 
Ydb::Table::TableIndex::kGlobalIndex: indexDesc->SetType(NKikimrSchemeOp::EIndexType::EIndexTypeGlobal); @@ -1159,17 +1170,23 @@ bool FillIndexDescription(NKikimrSchemeOp::TIndexedTableCreationConfig& out, indexDesc->SetType(NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree); *indexDesc->MutableVectorIndexKmeansTreeDescription()->MutableSettings() = index.global_vector_kmeans_tree_index().vector_settings(); break; + + case Ydb::Table::TableIndex::kGlobalFulltextIndex: + indexDesc->SetType(NKikimrSchemeOp::EIndexType::EIndexTypeGlobalFulltext); + *indexDesc->MutableFulltextIndexDescription()->MutableSettings() = index.global_fulltext_index().fulltext_settings(); + break; - default: - // pass through - // TODO: maybe return BAD_REQUEST? + case Ydb::Table::TableIndex::TYPE_NOT_SET: + // FIXME: python sdk can create a table with a secondary index without a type + // so it's not possible to return an invalid index type error here for now break; } - if (!FillIndexTablePartitioning(indexImplTableDescriptionsVector, index, status, error)) { + std::vector<NKikimrSchemeOp::TTableDescription> indexImplTableDescriptions; + if (!FillIndexTablePartitioning(indexImplTableDescriptions, index, status, error)) { return false; } - *indexDesc->MutableIndexImplTableDescriptions() = {indexImplTableDescriptionsVector.begin(), indexImplTableDescriptionsVector.end()}; + *indexDesc->MutableIndexImplTableDescriptions() = {indexImplTableDescriptions.begin(), indexImplTableDescriptions.end()}; } return true; diff --git a/ydb/core/ydb_convert/table_settings.cpp b/ydb/core/ydb_convert/table_settings.cpp index a7de2fd31ed..1e513664973 100644 --- a/ydb/core/ydb_convert/table_settings.cpp +++ b/ydb/core/ydb_convert/table_settings.cpp @@ -476,6 +476,13 @@ bool FillIndexTablePartitioning( } break; } + + case Ydb::Table::TableIndex::kGlobalFulltextIndex: + if (!fillIndexPartitioning(index.global_fulltext_index().settings(), indexImplTableDescriptions)) { + return false; + } + break; + case 
Ydb::Table::TableIndex::TYPE_NOT_SET: break; } diff --git a/ydb/library/services/services.proto b/ydb/library/services/services.proto index 93c74186a9c..3941e270baa 100644 --- a/ydb/library/services/services.proto +++ b/ydb/library/services/services.proto @@ -1119,5 +1119,6 @@ message TActivity { SCHEME_BOARD_RESTORE_ACTOR = 669; REPLICATION_CONTROLLER_RESOURCE_ID_RESOLVER = 670; BS_VDISK_METADATA_ACTOR = 671; + BUILD_FULLTEXT_INDEX = 672; }; }; diff --git a/ydb/public/api/protos/ydb_table.proto b/ydb/public/api/protos/ydb_table.proto index 9cb92adb9bc..e22ce614505 100644 --- a/ydb/public/api/protos/ydb_table.proto +++ b/ydb/public/api/protos/ydb_table.proto @@ -113,7 +113,149 @@ message GlobalVectorKMeansTreeIndex { KMeansTreeSettings vector_settings = 3; } -// Represent secondary index +message FulltextIndexSettings { + // Specifies the layout strategy for storing and updating the full-text index + enum Layout { + LAYOUT_UNSPECIFIED = 0; + + // Uses a single flat inverted index table (indexImplTable) + // Example source table: + // ┌────┬────────────────────────────┐ + // │ id │ text │ + // ├────┼────────────────────────────┤ + // │ 1 │ "The quick brown fox" │ + // │ 2 │ "The quick blue hare" │ + // └────┴────────────────────────────┘ + // Example inverted index table (indexImplTable): + // ┌──────────────┬────┐ + // │ __ydb_token │ id │ + // ├──────────────┼────┤ + // │ "blue" │ 2 │ + // │ "brown" │ 1 │ + // │ "fox" │ 1 │ + // │ "hare" │ 2 │ + // │ "quick" │ 1 │ + // │ "quick" │ 2 │ + // │ "The" │ 1 │ + // │ "The" │ 2 │ + // └──────────────┴────┘ + // Supports a single column only + FLAT = 1; + } + + // Specifies how text is tokenized during indexing + enum Tokenizer { + TOKENIZER_UNSPECIFIED = 0; + + // Splits text only by whitespace + // Does not split on punctuation + // Example: + // Text: "foo-bar baz_lorem ipsum" + // Tokens: ["foo-bar", "baz_lorem", "ipsum"] + WHITESPACE = 1; + + // Applies general language-aware tokenization + // Splits text on 
whitespace and punctuation + // Example: + // Text: "foo-bar baz_lorem ipsum" + // Tokens: ["foo", "bar", "baz", "lorem", "ipsum"] + STANDARD = 2; + + // Treats the entire input as a single token + // No splitting is performed + // Example: + // Text: "Hello World!" + // Tokens: ["Hello World!"] + KEYWORD = 3; + } + + // Represents text analyzers settings + message Analyzers { + // See Tokenizer enum + optional Tokenizer tokenizer = 1; + + // Language used for language-sensitive operations like stopword filtering + // Example: language = "english" + // By default is not specified and no language-specific logic is applied + optional string language = 2; + + // Whether to convert tokens to lowercase + // Example: + // Token: "Quick" + // Output: "quick" + optional bool use_filter_lowercase = 100; + + // Whether to remove common stopwords like "the", "a", "is" + // Example: language = "english" + // Tokens: ["the", "quick", "brown"] + // Output: ["quick", "brown"] + optional bool use_filter_stopwords = 110; + + // Whether to apply character n-gram indexing to each token + // Must be used with filter_ngram_min_length and filter_ngram_max_length + // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4 + // Token: "search" + // Output: ["sea", "ear", "arc", "rch", "sear", "earc", "arch"] + optional bool use_filter_ngram = 120; + + // Whether to apply edge n-gram indexing (prefix-based) to each token + // Used with filter_ngram_min_length and filter_ngram_max_length + // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4 + // Token: "search" + // Output: ["sea", "sear"] + optional bool use_filter_edge_ngram = 121; + + // Minimum length of n-grams to generate (inclusive) + // Must be used with use_filter_ngram or use_filter_edge_ngram + // Default value is 3 + optional int32 filter_ngram_min_length = 122 [(Ydb.value) = ">= 0"]; + + // Maximum length of n-grams to generate (inclusive) + // Must be used with use_filter_ngram or 
use_filter_edge_ngram + // Default value is 4 + optional int32 filter_ngram_max_length = 123 [(Ydb.value) = ">= 0"]; + + // Whether to filter tokens by their length + // Must be used with filter_length_min or filter_length_max + // Example: filter_length_min = 4, filter_length_max = 6 + // Tokens: ["foo", "fooba", "foobar", "foobarbaz"] + // Output: ["fooba", "foobar"] + optional bool use_filter_length = 130; + + // Minimum token length to keep (inclusive) + // Must be used with use_filter_length + optional int32 filter_length_min = 131 [(Ydb.value) = ">= 0"]; + + // Maximum token length to keep (inclusive) + // Must be used with use_filter_length + optional int32 filter_length_max = 132 [(Ydb.value) = ">= 0"]; + } + + // Represents text analyzers settings for a specific column + message ColumnAnalyzers { + // Name of the column to be indexed + optional string column = 1; + + // Analyzer settings specific to this column + Analyzers analyzers = 2; + } + + // See Layout enum + optional Layout layout = 1; + + // List of columns and their fulltext settings + // Currently, this list should contain a single entry with specified analyzers + // Later, some columns may not use analyzers and will be indexed as-is + // This list must always match TableIndex.index_columns + repeated ColumnAnalyzers columns = 2; +} + +message GlobalFulltextIndex { + GlobalIndexSettings settings = 1; + FulltextIndexSettings fulltext_settings = 2; +} + +// Represent table index message TableIndex { // Name of index string name = 1; @@ -125,12 +267,13 @@ message TableIndex { GlobalAsyncIndex global_async_index = 4; GlobalUniqueIndex global_unique_index = 6; GlobalVectorKMeansTreeIndex global_vector_kmeans_tree_index = 7; + GlobalFulltextIndex global_fulltext_index = 8; } // list of columns content to be copied in to index table repeated string data_columns = 5; } -// Represent secondary index with index state +// Represent table index with index state message TableIndexDescription { enum Status { 
STATUS_UNSPECIFIED = 0; @@ -149,6 +292,7 @@ message TableIndexDescription { GlobalAsyncIndex global_async_index = 5; GlobalUniqueIndex global_unique_index = 8; GlobalVectorKMeansTreeIndex global_vector_kmeans_tree_index = 9; + GlobalFulltextIndex global_fulltext_index = 10; } Status status = 4; // list of columns content to be copied in to index table @@ -648,7 +792,7 @@ message CreateTableRequest { // Table profile TableProfile profile = 5; Ydb.Operations.OperationParams operation_params = 6; - // List of secondary indexes + // List of table indexes repeated TableIndex indexes = 7; // Table rows time to live settings TtlSettings ttl_settings = 8; @@ -726,9 +870,9 @@ message AlterTableRequest { TtlSettings set_ttl_settings = 7; google.protobuf.Empty drop_ttl_settings = 8; } - // Add secondary indexes + // Add table indexes repeated TableIndex add_indexes = 9; - // Remove secondary indexes + // Remove table indexes repeated string drop_indexes = 10; // Change table storage settings StorageSettings alter_storage_settings = 11; diff --git a/ydb/services/ydb/backup_ut/ydb_backup_ut.cpp b/ydb/services/ydb/backup_ut/ydb_backup_ut.cpp index eb2efdb8e63..c339dc804af 100644 --- a/ydb/services/ydb/backup_ut/ydb_backup_ut.cpp +++ b/ydb/services/ydb/backup_ut/ydb_backup_ut.cpp @@ -2410,6 +2410,9 @@ Y_UNIT_TEST_SUITE(BackupRestore) { case EIndexTypeGlobalUnique: case EIndexTypeGlobalVectorKmeansTree: return TestTableWithIndexBackupRestore(Value); + case EIndexTypeGlobalFulltext: + // TODO: will be added later + break; case EIndexTypeInvalid: break; // not applicable default: @@ -3242,6 +3245,9 @@ Y_UNIT_TEST_SUITE(BackupRestoreS3) { case EIndexTypeGlobalVectorKmeansTree: TestTableWithIndexBackupRestore(Value); break; + case EIndexTypeGlobalFulltext: + // TODO: will be added later + break; case EIndexTypeInvalid: break; // not applicable default: |