summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ydb/core/base/fulltext.cpp257
-rw-r--r--ydb/core/base/fulltext.h17
-rw-r--r--ydb/core/base/kmeans_clusters.cpp2
-rw-r--r--ydb/core/base/table_index.cpp90
-rw-r--r--ydb/core/base/table_index.h17
-rw-r--r--ydb/core/base/ut/fulltext_ut.cpp127
-rw-r--r--ydb/core/base/ut/ya.make5
-rw-r--r--ydb/core/base/ya.make2
-rw-r--r--ydb/core/protos/flat_scheme_op.proto8
-rw-r--r--ydb/core/protos/tx_datashard.proto35
-rw-r--r--ydb/core/sys_view/show_create/create_table_formatter.cpp10
-rw-r--r--ydb/core/tx/datashard/build_index/fulltext.cpp413
-rw-r--r--ydb/core/tx/datashard/build_index/secondary_index.cpp13
-rw-r--r--ydb/core/tx/datashard/build_index/ut/ut_fulltext.cpp357
-rw-r--r--ydb/core/tx/datashard/build_index/ut/ya.make1
-rw-r--r--ydb/core/tx/datashard/datashard.h15
-rw-r--r--ydb/core/tx/datashard/datashard_impl.h4
-rw-r--r--ydb/core/tx/datashard/ya.make7
-rw-r--r--ydb/core/tx/scheme_board/cache.cpp2
-rw-r--r--ydb/core/tx/scheme_cache/scheme_cache.h1
-rw-r--r--ydb/core/tx/schemeshard/schemeshard__conditional_erase.cpp6
-rw-r--r--ydb/core/tx/schemeshard/schemeshard__operation_consistent_copy_tables.cpp22
-rw-r--r--ydb/core/tx/schemeshard/schemeshard__operation_copy_table.cpp24
-rw-r--r--ydb/core/tx/schemeshard/schemeshard__operation_create_build_index.cpp99
-rw-r--r--ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp129
-rw-r--r--ydb/core/tx/schemeshard/schemeshard_build_index.cpp25
-rw-r--r--ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp24
-rw-r--r--ydb/core/tx/schemeshard/schemeshard_build_index_tx_base.cpp5
-rw-r--r--ydb/core/tx/schemeshard/schemeshard_impl.cpp2
-rw-r--r--ydb/core/tx/schemeshard/schemeshard_impl.h1
-rw-r--r--ydb/core/tx/schemeshard/schemeshard_info_types.cpp24
-rw-r--r--ydb/core/tx/schemeshard/schemeshard_info_types.h66
-rw-r--r--ydb/core/tx/schemeshard/schemeshard_path_describer.cpp28
-rw-r--r--ydb/core/tx/schemeshard/schemeshard_schema.h3
-rw-r--r--ydb/core/tx/schemeshard/schemeshard_utils.cpp48
-rw-r--r--ydb/core/tx/schemeshard/schemeshard_utils.h92
-rw-r--r--ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp36
-rw-r--r--ydb/core/tx/schemeshard/ut_helpers/ls_checks.h2
-rw-r--r--ydb/core/tx/schemeshard/ut_helpers/test_env.cpp1
-rw-r--r--ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp340
-rw-r--r--ydb/core/tx/schemeshard/ut_index/ya.make1
-rw-r--r--ydb/core/ydb_convert/table_description.cpp29
-rw-r--r--ydb/core/ydb_convert/table_settings.cpp7
-rw-r--r--ydb/library/services/services.proto1
-rw-r--r--ydb/public/api/protos/ydb_table.proto154
-rw-r--r--ydb/services/ydb/backup_ut/ydb_backup_ut.cpp6
46 files changed, 2380 insertions, 178 deletions
diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp
new file mode 100644
index 00000000000..242484e72af
--- /dev/null
+++ b/ydb/core/base/fulltext.cpp
@@ -0,0 +1,257 @@
+#include "fulltext.h"
+#include <regex>
+
+namespace NKikimr::NFulltext {
+
+namespace {
+
+    // Converts the textual "layout" index setting into its proto enum value.
+    // Only "flat" is recognized; for any other value a message is written to
+    // `error` and LAYOUT_UNSPECIFIED is returned.
+    Ydb::Table::FulltextIndexSettings::Layout ParseLayout(const TString& layout, TString& error) {
+        if (layout != "flat") {
+            error = TStringBuilder() << "Invalid layout: " << layout;
+            return Ydb::Table::FulltextIndexSettings::LAYOUT_UNSPECIFIED;
+        }
+        return Ydb::Table::FulltextIndexSettings::FLAT;
+    }
+
+    // Converts the textual "tokenizer" index setting into its proto enum value.
+    // Recognized names: "whitespace", "standard", "keyword". For any other value
+    // a message is written to `error` and TOKENIZER_UNSPECIFIED is returned.
+    Ydb::Table::FulltextIndexSettings::Tokenizer ParseTokenizer(const TString& tokenizer, TString& error) {
+        if (tokenizer == "whitespace") {
+            return Ydb::Table::FulltextIndexSettings::WHITESPACE;
+        }
+        if (tokenizer == "standard") {
+            return Ydb::Table::FulltextIndexSettings::STANDARD;
+        }
+        if (tokenizer == "keyword") {
+            return Ydb::Table::FulltextIndexSettings::KEYWORD;
+        }
+
+        error = TStringBuilder() << "Invalid tokenizer: " << tokenizer;
+        return Ydb::Table::FulltextIndexSettings::TOKENIZER_UNSPECIFIED;
+    }
+
+    // Parses `value` as a non-negative i32 for the setting called `name`.
+    // When the text is not a number or the number is negative, `error` receives
+    // a message; the (possibly partial) parsed value is returned either way, so
+    // callers must check `error`.
+    i32 ParseInt32(const TString& name, const TString& value, TString& error) {
+        i32 parsed = 0;
+        // proto int32 fields with [(Ydb.value) = ">= 0"] annotation
+        const bool accepted = TryFromString(value, parsed) && parsed >= 0;
+        if (!accepted) {
+            error = TStringBuilder() << "Invalid " << name << ": " << value;
+        }
+        return parsed;
+    }
+
+    // Parses `value` as a boolean for the setting called `name`.
+    // When the text cannot be converted, `error` receives a message; callers
+    // must check `error` before trusting the returned value.
+    bool ParseBool(const TString& name, const TString& value, TString& error) {
+        bool parsed = false;
+        const bool accepted = TryFromString(value, parsed);
+        if (!accepted) {
+            error = TStringBuilder() << "Invalid " << name << ": " << value;
+        }
+        return parsed;
+    }
+
+    // Splits `text` into tokens according to `tokenizer`:
+    //   WHITESPACE - tokens are the whitespace-separated chunks of the text;
+    //   STANDARD   - tokens are the \w+ runs (letters, digits, underscore) between word boundaries;
+    //   KEYWORD    - the whole text is a single token.
+    // Throws via Y_ENSURE for any other tokenizer value.
+    // Note: written by llm, can be optimized a lot later
+    TVector<TString> Tokenize(const TString& text, const Ydb::Table::FulltextIndexSettings::Tokenizer& tokenizer) {
+        TVector<TString> tokens;
+        switch (tokenizer) {
+            case Ydb::Table::FulltextIndexSettings::WHITESPACE: {
+                std::istringstream stream(text);
+                TString token;
+                while (stream >> token) {
+                    tokens.push_back(token);
+                }
+                break;
+            }
+            case Ydb::Table::FulltextIndexSettings::STANDARD: {
+                // Compiling a std::regex is expensive, so the pattern is built once and
+                // reused; it is only read afterwards.
+                static const std::regex wordRegex(R"(\b\w+\b)"); // match alphanumeric words
+                const std::sregex_iterator end;
+                for (std::sregex_iterator it(text.begin(), text.end(), wordRegex); it != end; ++it) {
+                    tokens.push_back(it->str());
+                }
+                break;
+            }
+            case Ydb::Table::FulltextIndexSettings::KEYWORD:
+                tokens.push_back(text);
+                break;
+            default:
+                // Y_ENSURE takes the condition first: passing only the message made the
+                // (non-empty, hence truthy) string the condition, so the check never fired
+                // and an invalid tokenizer silently produced no tokens.
+                Y_ENSURE(false, TStringBuilder() << "Invalid tokenizer: " << static_cast<int>(tokenizer));
+        }
+
+        return tokens;
+    }
+
+    // Validates the analyzer settings of a single column.
+    // A tokenizer must be chosen; language, stopwords, ngram and length filter
+    // settings are rejected as unsupported. On the first failed check the reason
+    // is written to `error` and false is returned. On success `error` is left
+    // untouched.
+    bool ValidateSettings(const Ydb::Table::FulltextIndexSettings::Analyzers& settings, TString& error) {
+        // Stores the reason and produces the failure result in one step.
+        const auto reject = [&error](const char* reason) {
+            error = reason;
+            return false;
+        };
+
+        const bool tokenizerUnset = !settings.has_tokenizer()
+            || settings.tokenizer() == Ydb::Table::FulltextIndexSettings::TOKENIZER_UNSPECIFIED;
+        if (tokenizerUnset) {
+            return reject("tokenizer should be set");
+        }
+
+        if (settings.has_language()) {
+            return reject("Unsupported language setting");
+        }
+
+        if (settings.use_filter_stopwords()) {
+            return reject("Unsupported use_filter_stopwords setting");
+        }
+
+        // ngram filters
+        if (settings.use_filter_ngram()) {
+            return reject("Unsupported use_filter_ngram setting");
+        }
+        if (settings.use_filter_edge_ngram()) {
+            return reject("Unsupported use_filter_edge_ngram setting");
+        }
+        if (settings.has_filter_ngram_min_length()) {
+            return reject("Unsupported filter_ngram_min_length setting");
+        }
+        if (settings.has_filter_ngram_max_length()) {
+            return reject("Unsupported filter_ngram_max_length setting");
+        }
+
+        // length filters
+        if (settings.use_filter_length()) {
+            return reject("Unsupported use_filter_length setting");
+        }
+        if (settings.has_filter_length_min()) {
+            return reject("Unsupported filter_length_min setting");
+        }
+        if (settings.has_filter_length_max()) {
+            return reject("Unsupported filter_length_max setting");
+        }
+
+        return true;
+    }
+}
+
+// Turns `text` into index tokens: tokenizes it with the configured tokenizer
+// and, when use_filter_lowercase is enabled, lowercases every token in place.
+TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings) {
+    TVector<TString> result = Tokenize(text, settings.tokenizer());
+
+    if (!settings.use_filter_lowercase()) {
+        return result;
+    }
+
+    for (TString& word : result) {
+        word.to_lower();
+    }
+    return result;
+}
+
+// Overload for protobuf repeated string fields: copies the names into a
+// TVector and delegates to the TVector overload.
+bool ValidateColumnsMatches(const NProtoBuf::RepeatedPtrField<TString>& columns, const Ydb::Table::FulltextIndexSettings& settings, TString& error) {
+    const TVector<TString> columnNames(columns.begin(), columns.end());
+    return ValidateColumnsMatches(columnNames, settings, error);
+}
+
+// Checks that the column names listed in `settings` are exactly `columns`,
+// in the same order. On mismatch writes a description to `error` and returns
+// false; on success clears `error` and returns true.
+bool ValidateColumnsMatches(const TVector<TString>& columns, const Ydb::Table::FulltextIndexSettings& settings, TString& error) {
+    TVector<TString> settingsColumns(::Reserve(settings.columns().size()));
+    // Iterate by const reference: each element is a proto message and copying
+    // it per iteration is needless work.
+    for (const auto& column : settings.columns()) {
+        settingsColumns.push_back(column.column());
+    }
+
+    if (columns != settingsColumns) {
+        error = TStringBuilder() << "columns " << settingsColumns << " should be " << columns;
+        return false;
+    }
+
+    error = "";
+    return true;
+}
+
+// Validates fulltext index settings as a whole: a layout must be chosen,
+// exactly one column must be configured (current implementation limitation),
+// and that column must have a name and valid analyzers.
+// On failure writes the reason to `error` and returns false; on success
+// clears `error` and returns true.
+bool ValidateSettings(const Ydb::Table::FulltextIndexSettings& settings, TString& error) {
+    if (!settings.has_layout() || settings.layout() == Ydb::Table::FulltextIndexSettings::LAYOUT_UNSPECIFIED) {
+        error = "layout should be set";
+        return false;
+    }
+
+    if (settings.columns().empty()) {
+        error = "columns should be set";
+        return false;
+    }
+
+    // current implementation limitation:
+    if (settings.columns().size() != 1) {
+        error = "columns should have a single value";
+        return false;
+    }
+
+    // Iterate by const reference to avoid copying each proto message.
+    for (const auto& column : settings.columns()) {
+        if (!column.has_column()) {
+            error = "column name should be set";
+            return false;
+        }
+
+        // Check the column being iterated rather than always columns().at(0),
+        // so the loop stays correct once more than one column is allowed.
+        if (!column.has_analyzers()) {
+            error = "column analyzers should be set";
+            return false;
+        }
+        if (!ValidateSettings(column.analyzers(), error)) {
+            return false;
+        }
+    }
+
+    error = "";
+    return true;
+}
+
+// Builds FulltextIndexSettings for the single key column `keyColumn` from a
+// list of (name, value) textual settings.
+// "layout" applies to the index as a whole; all other recognized names fill
+// the analyzers of the column. Parsing stops at the first unknown name or
+// invalid value: `error` receives the reason and the partially filled result
+// is returned. Otherwise the assembled settings are validated, and `error` is
+// empty on success.
+Ydb::Table::FulltextIndexSettings FillSettings(const TString& keyColumn, const TVector<std::pair<TString, TString>>& settings, TString& error) {
+    // Start from a clean state: a stale message left in `error` by the caller
+    // would otherwise be mistaken for a parse failure after the first setting.
+    error = "";
+
+    Ydb::Table::FulltextIndexSettings result;
+    Ydb::Table::FulltextIndexSettings::Analyzers resultAnalyzers;
+
+    for (const auto& [name, value] : settings) {
+        if (name == "layout") {
+            result.set_layout(ParseLayout(value, error));
+        } else if (name == "tokenizer") {
+            resultAnalyzers.set_tokenizer(ParseTokenizer(value, error));
+        } else if (name == "language") {
+            resultAnalyzers.set_language(value);
+        } else if (name == "use_filter_lowercase") {
+            resultAnalyzers.set_use_filter_lowercase(ParseBool(name, value, error));
+        } else if (name == "use_filter_stopwords") {
+            resultAnalyzers.set_use_filter_stopwords(ParseBool(name, value, error));
+        } else if (name == "use_filter_ngram") {
+            resultAnalyzers.set_use_filter_ngram(ParseBool(name, value, error));
+        } else if (name == "use_filter_edge_ngram") {
+            resultAnalyzers.set_use_filter_edge_ngram(ParseBool(name, value, error));
+        } else if (name == "filter_ngram_min_length") {
+            resultAnalyzers.set_filter_ngram_min_length(ParseInt32(name, value, error));
+        } else if (name == "filter_ngram_max_length") {
+            resultAnalyzers.set_filter_ngram_max_length(ParseInt32(name, value, error));
+        } else if (name == "use_filter_length") {
+            resultAnalyzers.set_use_filter_length(ParseBool(name, value, error));
+        } else if (name == "filter_length_min") {
+            resultAnalyzers.set_filter_length_min(ParseInt32(name, value, error));
+        } else if (name == "filter_length_max") {
+            resultAnalyzers.set_filter_length_max(ParseInt32(name, value, error));
+        } else {
+            error = TStringBuilder() << "Unknown index setting: " << name;
+            return result;
+        }
+
+        // A parser reported a problem with the value just processed.
+        if (error) {
+            return result;
+        }
+    }
+
+    {
+        // only single-columned index is supported for now
+        auto columnAnalyzers = result.add_columns();
+        columnAnalyzers->set_column(keyColumn);
+        columnAnalyzers->mutable_analyzers()->CopyFrom(resultAnalyzers);
+    }
+
+    // The outcome is reported through `error`; the boolean result is redundant here.
+    ValidateSettings(result, error);
+
+    return result;
+}
+
+
+}
+
+// Stream output for TVector<TString>: prints "[ " followed by every element
+// with a trailing space, then "]".
+template<> inline
+void Out<TVector<TString>>(IOutputStream& o, const TVector<TString> &vec) {
+    o << "[ ";
+    for (const TString& item : vec) {
+        o << item << ' ';
+    }
+    o << "]";
+}
diff --git a/ydb/core/base/fulltext.h b/ydb/core/base/fulltext.h
new file mode 100644
index 00000000000..b7303613ae2
--- /dev/null
+++ b/ydb/core/base/fulltext.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "defs.h"
+
+#include <ydb/public/api/protos/ydb_table.pb.h>
+
+namespace NKikimr::NFulltext {
+
+// Splits `text` into tokens using the tokenizer selected in `settings` and
+// lowercases the tokens when use_filter_lowercase is enabled.
+TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings);
+
+// Check that the column names listed in `settings` equal `columns` (same names, same order).
+// On mismatch `error` describes the difference; on success `error` is cleared.
+bool ValidateColumnsMatches(const NProtoBuf::RepeatedPtrField<TString>& columns, const Ydb::Table::FulltextIndexSettings& settings, TString& error);
+bool ValidateColumnsMatches(const TVector<TString>& columns, const Ydb::Table::FulltextIndexSettings& settings, TString& error);
+
+// Validates layout, columns and per-column analyzers; `error` holds the reason on failure
+// and is cleared on success.
+bool ValidateSettings(const Ydb::Table::FulltextIndexSettings& settings, TString& error);
+// Builds settings for the single column `keyColumn` from textual (name, value) pairs and
+// validates them; `error` is non-empty when parsing or validation failed.
+Ydb::Table::FulltextIndexSettings FillSettings(const TString& keyColumn, const TVector<std::pair<TString, TString>>& values, TString& error);
+
+}
diff --git a/ydb/core/base/kmeans_clusters.cpp b/ydb/core/base/kmeans_clusters.cpp
index 65fb9874d76..9b938196539 100644
--- a/ydb/core/base/kmeans_clusters.cpp
+++ b/ydb/core/base/kmeans_clusters.cpp
@@ -493,6 +493,7 @@ bool ValidateSettings(const Ydb::Table::KMeansTreeSettings& settings, TString& e
return false;
}
+ error = "";
return true;
}
@@ -525,6 +526,7 @@ bool ValidateSettings(const Ydb::Table::VectorIndexSettings& settings, TString&
return false;
}
+ error = "";
return true;
}
diff --git a/ydb/core/base/table_index.cpp b/ydb/core/base/table_index.cpp
index b24e0b607d5..d82aec792dd 100644
--- a/ydb/core/base/table_index.cpp
+++ b/ydb/core/base/table_index.cpp
@@ -1,5 +1,6 @@
#include "table_index.h"
+#include <ydb/library/yverify_stream/yverify_stream.h>
#include <ydb/core/protos/tx_datashard.pb.h>
namespace NKikimr::NTableIndex {
@@ -61,13 +62,40 @@ constexpr std::string_view PrefixedGlobalKMeansTreeImplTables[] = {
};
static_assert(std::is_sorted(std::begin(PrefixedGlobalKMeansTreeImplTables), std::end(PrefixedGlobalKMeansTreeImplTables)));
+constexpr std::string_view GlobalFulltextImplTables[] = {
+ ImplTable,
+};
+static_assert(std::is_sorted(std::begin(GlobalFulltextImplTables), std::end(GlobalFulltextImplTables)));
+
+// Tells whether `indexType` is one of the plain secondary index kinds
+// (global, global async, global unique). Vector kmeans tree and fulltext
+// indexes are not. Any other value fails Y_ENSURE.
+bool IsSecondaryIndex(NKikimrSchemeOp::EIndexType indexType) {
+    switch (indexType) {
+        case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree:
+        case NKikimrSchemeOp::EIndexTypeGlobalFulltext:
+            return false;
+        case NKikimrSchemeOp::EIndexTypeGlobal:
+        case NKikimrSchemeOp::EIndexTypeGlobalAsync:
+        case NKikimrSchemeOp::EIndexTypeGlobalUnique:
+            return true;
+        default:
+            Y_ENSURE(false, InvalidIndexType(indexType));
+    }
 }
-TTableColumns CalcTableImplDescription(NKikimrSchemeOp::EIndexType type, const TTableColumns& table, const TIndexColumns& index) {
+}
+
+TTableColumns CalcTableImplDescription(NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index) {
TTableColumns result;
- const bool isSecondaryIndex = type != NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree;
- std::for_each(index.KeyColumns.begin(), index.KeyColumns.end() - (isSecondaryIndex ? 0 : 1), [&] (const auto& ik) {
+ const bool isSecondaryIndex = IsSecondaryIndex(indexType);
+
+ auto takeKeyColumns = index.KeyColumns.size();
+ if (!isSecondaryIndex) { // vector and fulltext indexes have special embedding and text key columns
+ Y_ASSERT(indexType == NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree
+ || indexType == NKikimrSchemeOp::EIndexTypeGlobalFulltext);
+ takeKeyColumns--;
+ }
+
+ std::for_each(index.KeyColumns.begin(), index.KeyColumns.begin() + takeKeyColumns, [&] (const auto& ik) {
result.Keys.push_back(ik);
result.Columns.emplace(ik);
});
@@ -85,6 +113,18 @@ TTableColumns CalcTableImplDescription(NKikimrSchemeOp::EIndexType type, const T
return result;
}
+// Returns the index type stored in `indexCreation`, falling back to
+// EIndexTypeGlobal when the Type field is not set.
+// NOTE(review): the config is taken by value, which copies the proto message;
+// switching to a const reference requires changing the header declaration too.
+NKikimrSchemeOp::EIndexType GetIndexType(NKikimrSchemeOp::TIndexCreationConfig indexCreation) {
+    // TODO: always provide EIndexTypeGlobal value instead of null
+    // TODO: do not cast unknown index types to EIndexTypeGlobal (proto2 specific)
+    if (!indexCreation.HasType()) {
+        return NKikimrSchemeOp::EIndexTypeGlobal;
+    }
+    return indexCreation.GetType();
+}
+
+// Formats the diagnostic text used when an unexpected index type value is met.
+TString InvalidIndexType(NKikimrSchemeOp::EIndexType indexType) {
+    TStringBuilder message;
+    message << "Invalid index type " << static_cast<int>(indexType);
+    return message;
+}
+
bool IsCompatibleIndex(NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index, TString& explain) {
if (const auto* broken = IsContains(table.Keys, table.Columns)) {
explain = TStringBuilder()
@@ -127,7 +167,7 @@ bool IsCompatibleIndex(NKikimrSchemeOp::EIndexType indexType, const TTableColumn
return false;
}
- const bool isSecondaryIndex = indexType != NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree;
+ const bool isSecondaryIndex = IsSecondaryIndex(indexType);
if (index.KeyColumns.size() < 1) {
explain = "should be at least single index key column";
@@ -157,7 +197,9 @@ bool IsCompatibleIndex(NKikimrSchemeOp::EIndexType indexType, const TTableColumn
if (isSecondaryIndex) {
tmp.insert(index.KeyColumns.begin(), index.KeyColumns.end());
} else {
- // Vector indexes allow to add all columns both to index & data
+ // Vector and fulltext indexes allow to add all columns both to index & data
+ Y_ASSERT(indexType == NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree
+ || indexType == NKikimrSchemeOp::EIndexTypeGlobalFulltext);
}
if (const auto* broken = IsContains(index.DataColumns, tmp, true)) {
explain = TStringBuilder()
@@ -167,15 +209,37 @@ bool IsCompatibleIndex(NKikimrSchemeOp::EIndexType indexType, const TTableColumn
return true;
}
+// Tells whether `indexType` supports TTL: true for global, global unique and
+// global async indexes, false for vector kmeans tree and fulltext indexes.
+// An unexpected value triggers Y_DEBUG_ABORT_S and is otherwise reported as
+// unsupported.
+bool DoesIndexSupportTTL(NKikimrSchemeOp::EIndexType indexType) {
+    switch (indexType) {
+        case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree:
+        case NKikimrSchemeOp::EIndexTypeGlobalFulltext:
+            return false;
+        case NKikimrSchemeOp::EIndexTypeGlobal:
+        case NKikimrSchemeOp::EIndexTypeGlobalUnique:
+        case NKikimrSchemeOp::EIndexTypeGlobalAsync:
+            return true;
+        default:
+            Y_DEBUG_ABORT_S(InvalidIndexType(indexType));
+            return false;
+    }
+}
+
std::span<const std::string_view> GetImplTables(NKikimrSchemeOp::EIndexType indexType, std::span<const TString> indexKeys) {
- if (indexType == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) {
- if (indexKeys.size() == 1) {
- return GlobalKMeansTreeImplTables;
- } else {
- return PrefixedGlobalKMeansTreeImplTables;
- }
- } else {
- return GlobalSecondaryImplTables;
+ switch (indexType) {
+ case NKikimrSchemeOp::EIndexTypeGlobal:
+ case NKikimrSchemeOp::EIndexTypeGlobalAsync:
+ case NKikimrSchemeOp::EIndexTypeGlobalUnique:
+ return GlobalSecondaryImplTables;
+ case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree:
+ if (indexKeys.size() == 1) {
+ return GlobalKMeansTreeImplTables;
+ } else {
+ return PrefixedGlobalKMeansTreeImplTables;
+ }
+ case NKikimrSchemeOp::EIndexTypeGlobalFulltext:
+ return GlobalFulltextImplTables;
+ default:
+ Y_ENSURE(false, InvalidIndexType(indexType));
}
}
diff --git a/ydb/core/base/table_index.h b/ydb/core/base/table_index.h
index ffe3681f0ae..a251122d0b5 100644
--- a/ydb/core/base/table_index.h
+++ b/ydb/core/base/table_index.h
@@ -38,8 +38,13 @@ struct TIndexColumns {
inline constexpr const char* ImplTable = "indexImplTable";
-bool IsCompatibleIndex(NKikimrSchemeOp::EIndexType type, const TTableColumns& table, const TIndexColumns& index, TString& explain);
-TTableColumns CalcTableImplDescription(NKikimrSchemeOp::EIndexType type, const TTableColumns& table, const TIndexColumns& index);
+bool IsCompatibleIndex(NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index, TString& explain);
+TTableColumns CalcTableImplDescription(NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index);
+
+bool DoesIndexSupportTTL(NKikimrSchemeOp::EIndexType indexType);
+
+NKikimrSchemeOp::EIndexType GetIndexType(NKikimrSchemeOp::TIndexCreationConfig indexCreation);
+TString InvalidIndexType(NKikimrSchemeOp::EIndexType indexType);
std::span<const std::string_view> GetImplTables(NKikimrSchemeOp::EIndexType indexType, std::span<const TString> indexKeys);
bool IsImplTable(std::string_view tableName);
@@ -76,6 +81,14 @@ TClusterId SetPostingParentFlag(TClusterId parent);
}
+// Constants shared by fulltext index code.
+namespace NFulltext {
+ // TODO: support utf-8 in fulltext index
+ // Type of a token value and its textual type name.
+ // NOTE(review): presumably the type of the TokenColumn below - confirm against users.
+ inline constexpr auto TokenType = Ydb::Type::STRING;
+ inline constexpr const char* TokenTypeName = "String";
+
+ // NOTE(review): presumably the name of the index impl table column holding tokens - confirm.
+ inline constexpr const char* TokenColumn = "__ydb_token";
+}
+
TString ToShortDebugString(const NKikimrTxDataShard::TEvReshuffleKMeansRequest& record);
TString ToShortDebugString(const NKikimrTxDataShard::TEvRecomputeKMeansRequest& record);
TString ToShortDebugString(const NKikimrTxDataShard::TEvRecomputeKMeansResponse& record);
diff --git a/ydb/core/base/ut/fulltext_ut.cpp b/ydb/core/base/ut/fulltext_ut.cpp
new file mode 100644
index 00000000000..113c1821153
--- /dev/null
+++ b/ydb/core/base/ut/fulltext_ut.cpp
@@ -0,0 +1,127 @@
+#include "fulltext.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+namespace NKikimr::NFulltext {
+
+// Unit tests for the helpers declared in fulltext.h.
+Y_UNIT_TEST_SUITE(NFulltext) {
+
+ // Column lists must match the settings exactly, including order; the error
+ // text lists both sides and is cleared on success.
+ Y_UNIT_TEST(ValidateColumnsMatches) {
+ TString error;
+
+ Ydb::Table::FulltextIndexSettings settings;
+ settings.add_columns()->set_column("column1");
+ settings.add_columns()->set_column("column2");
+
+ UNIT_ASSERT(!ValidateColumnsMatches(TVector<TString>{"column2"}, settings, error));
+ UNIT_ASSERT_VALUES_EQUAL(error, "columns [ column1 column2 ] should be [ column2 ]");
+
+ UNIT_ASSERT(!ValidateColumnsMatches(TVector<TString>{"column2", "column1"}, settings, error));
+ UNIT_ASSERT_VALUES_EQUAL(error, "columns [ column1 column2 ] should be [ column2 column1 ]");
+
+ UNIT_ASSERT(ValidateColumnsMatches(TVector<TString>{"column1", "column2"}, settings, error));
+ UNIT_ASSERT_VALUES_EQUAL(error, "");
+ }
+
+ // Builds the settings up one field at a time: each step first asserts the
+ // error reported for the still-missing field, then fills that field in.
+ // The statement order is therefore significant.
+ Y_UNIT_TEST(ValidateSettings) {
+ Ydb::Table::FulltextIndexSettings settings;
+ TString error;
+
+ UNIT_ASSERT(!ValidateSettings(settings, error));
+ UNIT_ASSERT_VALUES_EQUAL(error, "layout should be set");
+ settings.set_layout(Ydb::Table::FulltextIndexSettings::FLAT);
+
+ UNIT_ASSERT(!ValidateSettings(settings, error));
+ UNIT_ASSERT_VALUES_EQUAL(error, "columns should be set");
+ auto columnSettings = settings.add_columns();
+
+ UNIT_ASSERT(!ValidateSettings(settings, error));
+ UNIT_ASSERT_VALUES_EQUAL(error, "column name should be set");
+ columnSettings->set_column("text");
+
+ UNIT_ASSERT(!ValidateSettings(settings, error));
+ UNIT_ASSERT_VALUES_EQUAL(error, "column analyzers should be set");
+ auto columnAnalyzers = columnSettings->mutable_analyzers();
+
+ UNIT_ASSERT(!ValidateSettings(settings, error));
+ UNIT_ASSERT_VALUES_EQUAL(error, "tokenizer should be set");
+ columnAnalyzers->set_tokenizer(Ydb::Table::FulltextIndexSettings::STANDARD);
+
+ // fully specified single-column settings are valid
+ UNIT_ASSERT_C(ValidateSettings(settings, error), error);
+ UNIT_ASSERT_VALUES_EQUAL(error, "");
+
+ // a second column is rejected
+ columnSettings = settings.add_columns();
+ columnSettings->set_column("text2");
+ UNIT_ASSERT_C(!ValidateSettings(settings, error), error);
+ UNIT_ASSERT_VALUES_EQUAL(error, "columns should have a single value");
+ }
+
+ // A valid textual settings list is converted into the expected proto fields.
+ Y_UNIT_TEST(FillSettings) {
+ TVector<std::pair<TString, TString>> list{
+ {"layout", "flat"},
+ {"tokenizer", "standard"},
+ {"use_filter_lowercase", "true"}
+ };
+
+ TString error;
+ auto settings = FillSettings("text", list, error);
+ UNIT_ASSERT_VALUES_EQUAL(error, "");
+
+ UNIT_ASSERT_EQUAL(settings.layout(), Ydb::Table::FulltextIndexSettings::FLAT);
+ UNIT_ASSERT_VALUES_EQUAL(settings.columns().size(), 1);
+ UNIT_ASSERT_VALUES_EQUAL(settings.columns().at(0).column(), "text");
+ UNIT_ASSERT_EQUAL(settings.columns().at(0).analyzers().tokenizer(), Ydb::Table::FulltextIndexSettings::STANDARD);
+ UNIT_ASSERT_VALUES_EQUAL(settings.columns().at(0).analyzers().use_filter_lowercase(), true);
+ }
+
+ // Error reporting of FillSettings: unknown setting name, unparsable value,
+ // and settings that parse but fail validation.
+ Y_UNIT_TEST(FillSettingsInvalid) {
+ {
+ TVector<std::pair<TString, TString>> list{
+ {"asdf", "qwer"}
+ };
+ TString error;
+ auto settings = FillSettings("text", list, error);
+ UNIT_ASSERT_VALUES_EQUAL(error, "Unknown index setting: asdf");
+ }
+
+ {
+ TVector<std::pair<TString, TString>> list{
+ {"layout", "flat"},
+ {"tokenizer", "standard"},
+ {"use_filter_lowercase", "asdf"}
+ };
+ TString error;
+ auto settings = FillSettings("text", list, error);
+ UNIT_ASSERT_VALUES_EQUAL(error, "Invalid use_filter_lowercase: asdf");
+ }
+
+ {
+ TVector<std::pair<TString, TString>> list{
+ {"layout", "flat"},
+ };
+ TString error;
+ auto settings = FillSettings("text", list, error);
+ UNIT_ASSERT_VALUES_EQUAL(error, "tokenizer should be set");
+ }
+ }
+
+ // Token output of each tokenizer for the same text, plus the lowercase filter.
+ Y_UNIT_TEST(Analyze) {
+ Ydb::Table::FulltextIndexSettings::Analyzers analyzers;
+ TString text = "apple WaLLet spaced-dog";
+
+ analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE);
+ UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"apple", "WaLLet", "spaced-dog"}));
+
+ // the standard tokenizer also splits on the hyphen
+ analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::STANDARD);
+ UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"apple", "WaLLet", "spaced", "dog"}));
+
+ analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::KEYWORD);
+ UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{text}));
+
+ analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE);
+ analyzers.set_use_filter_lowercase(true);
+ UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"apple", "wallet", "spaced-dog"}));
+ }
+}
+
+}
diff --git a/ydb/core/base/ut/ya.make b/ydb/core/base/ut/ya.make
index 82b340fde09..cc1fd67d984 100644
--- a/ydb/core/base/ut/ya.make
+++ b/ydb/core/base/ut/ya.make
@@ -9,13 +9,14 @@ PEERDIR(
)
SRCS(
- path_ut.cpp
blobstorage_grouptype_ut.cpp
+ fulltext_ut.cpp
localdb_ut.cpp
logoblob_ut.cpp
memory_stats_ut.cpp
- statestorage_ut.cpp
+ path_ut.cpp
statestorage_guardian_impl_ut.cpp
+ statestorage_ut.cpp
table_index_ut.cpp
)
diff --git a/ydb/core/base/ya.make b/ydb/core/base/ya.make
index 78234ead0f8..285913b9bf5 100644
--- a/ydb/core/base/ya.make
+++ b/ydb/core/base/ya.make
@@ -28,6 +28,8 @@ SRCS(
feature_flags.h
feature_flags_service.cpp
feature_flags_service.h
+ fulltext.cpp
+ fulltext.h
group_stat.cpp
group_stat.h
hive.h
diff --git a/ydb/core/protos/flat_scheme_op.proto b/ydb/core/protos/flat_scheme_op.proto
index 1b78127d665..c02dc4eca53 100644
--- a/ydb/core/protos/flat_scheme_op.proto
+++ b/ydb/core/protos/flat_scheme_op.proto
@@ -1158,6 +1158,7 @@ enum EIndexType {
EIndexTypeGlobalAsync = 2;
EIndexTypeGlobalUnique = 3;
EIndexTypeGlobalVectorKmeansTree = 4;
+ EIndexTypeGlobalFulltext = 5;
}
enum EIndexState {
@@ -1171,6 +1172,10 @@ message TVectorIndexKmeansTreeDescription {
optional Ydb.Table.KMeansTreeSettings Settings = 1;
}
+// Fulltext-specific part of an index description; wraps the public
+// Ydb.Table.FulltextIndexSettings message.
+message TFulltextIndexDescription {
+ optional Ydb.Table.FulltextIndexSettings Settings = 1;
+}
+
message TIndexDescription {
optional string Name = 1;
optional uint64 LocalPathId = 2;
@@ -1192,6 +1197,7 @@ message TIndexDescription {
oneof SpecializedIndexDescription {
TVectorIndexKmeansTreeDescription VectorIndexKmeansTreeDescription = 11;
+ TFulltextIndexDescription FulltextIndexDescription = 12;
}
}
@@ -1204,6 +1210,7 @@ message TIndexCreationConfig {
repeated string DataColumnNames = 6; //columns to be denormalized to read data just from index
oneof SpecializedIndexDescription {
TVectorIndexKmeansTreeDescription VectorIndexKmeansTreeDescription = 7;
+ TFulltextIndexDescription FulltextIndexDescription = 8;
}
}
@@ -1990,6 +1997,7 @@ enum EPathSubType {
EPathSubTypeAsyncIndexImplTable = 2;
EPathSubTypeStreamImpl = 3;
EPathSubTypeVectorKmeansTreeIndexImplTable = 4;
+ EPathSubTypeFulltextIndexImplTable = 5;
}
enum EPathState {
diff --git a/ydb/core/protos/tx_datashard.proto b/ydb/core/protos/tx_datashard.proto
index 89cbbdef90d..1f34bbdc935 100644
--- a/ydb/core/protos/tx_datashard.proto
+++ b/ydb/core/protos/tx_datashard.proto
@@ -1787,6 +1787,41 @@ message TEvPrefixKMeansResponse {
optional NKikimrIndexBuilder.TMeteringStats MeteringStats = 12;
}
+// Request for a datashard fulltext index build scan (processed by
+// TBuildFulltextIndexScan in datashard/build_index/fulltext.cpp).
+message TEvBuildFulltextIndexRequest {
+ optional uint64 Id = 1;
+
+ optional uint64 TabletId = 2;
+ optional NKikimrProto.TPathID PathId = 3;
+
+ optional uint64 SnapshotTxId = 4;
+ optional uint64 SnapshotStep = 5;
+
+ // NOTE(review): presumably identifies the request attempt; echoed back as
+ // RequestSeqNoGeneration / RequestSeqNoRound in the response - confirm.
+ optional uint64 SeqNoGeneration = 6;
+ optional uint64 SeqNoRound = 7;
+
+ optional string IndexName = 8;
+
+ optional Ydb.Table.FulltextIndexSettings Settings = 9; // also has key columns
+ repeated string DataColumns = 10;
+
+ optional NKikimrIndexBuilder.TIndexBuildScanSettings ScanSettings = 11;
+}
+
+// Response to TEvBuildFulltextIndexRequest: build status, issues and metering stats.
+message TEvBuildFulltextIndexResponse {
+ optional uint64 Id = 1;
+
+ optional uint64 TabletId = 2;
+ optional NKikimrProto.TPathID PathId = 3;
+
+ // NOTE(review): presumably copies of the request's SeqNoGeneration / SeqNoRound - confirm.
+ optional uint64 RequestSeqNoGeneration = 4;
+ optional uint64 RequestSeqNoRound = 5;
+
+ optional NKikimrIndexBuilder.EBuildStatus Status = 6;
+ repeated Ydb.Issue.IssueMessage Issues = 7;
+
+ optional NKikimrIndexBuilder.TMeteringStats MeteringStats = 8;
+}
+
message TEvCdcStreamScanRequest {
message TLimits {
optional uint32 BatchMaxBytes = 1 [default = 512000];
diff --git a/ydb/core/sys_view/show_create/create_table_formatter.cpp b/ydb/core/sys_view/show_create/create_table_formatter.cpp
index 923644ceac3..151fd8a0f02 100644
--- a/ydb/core/sys_view/show_create/create_table_formatter.cpp
+++ b/ydb/core/sys_view/show_create/create_table_formatter.cpp
@@ -514,6 +514,7 @@ void TCreateTableFormatter::Format(const TableIndex& index) {
Stream << "\tINDEX ";
EscapeName(index.name(), Stream);
std::optional<KMeansTreeSettings> kMeansTreeSettings;
+ std::optional<FulltextIndexSettings> fulltextIndexSettings;
switch (index.type_case()) {
case TableIndex::kGlobalIndex: {
Stream << " GLOBAL SYNC ON ";
@@ -532,6 +533,11 @@ void TCreateTableFormatter::Format(const TableIndex& index) {
kMeansTreeSettings = index.global_vector_kmeans_tree_index().vector_settings();
break;
}
+ case Ydb::Table::TableIndex::kGlobalFulltextIndex: {
+ Stream << " GLOBAL USING fulltext ON ";
+ fulltextIndexSettings = index.global_fulltext_index().fulltext_settings();
+ break;
+ }
case Ydb::Table::TableIndex::TYPE_NOT_SET:
ythrow TFormatFail(Ydb::StatusIds::INTERNAL_ERROR, "Unexpected Ydb::Table::TableIndex::TYPE_NOT_SET");
}
@@ -622,6 +628,10 @@ void TCreateTableFormatter::Format(const TableIndex& index) {
Stream << ")";
}
+
+    if (fulltextIndexSettings) {
+        // Formatting of fulltext index settings is not implemented yet.
+        // Y_ENSURE takes the condition first: with only a string literal the
+        // literal itself was the (always true) condition and nothing was ever
+        // thrown, so an incomplete statement would be emitted silently.
+        Y_ENSURE(false, "todo not implemented");
+    }
}
bool TCreateTableFormatter::Format(const TFamilyDescription& familyDesc) {
diff --git a/ydb/core/tx/datashard/build_index/fulltext.cpp b/ydb/core/tx/datashard/build_index/fulltext.cpp
new file mode 100644
index 00000000000..61aa3112c71
--- /dev/null
+++ b/ydb/core/tx/datashard/build_index/fulltext.cpp
@@ -0,0 +1,413 @@
+#include "common_helper.h"
+#include "../datashard_impl.h"
+#include "../scan_common.h"
+#include "../upload_stats.h"
+#include "../buffer_data.h"
+
+#include <ydb/core/base/appdata.h>
+#include <ydb/core/base/counters.h>
+#include <ydb/core/base/fulltext.h>
+#include <ydb/core/kqp/common/kqp_types.h>
+#include <ydb/core/scheme/scheme_tablecell.h>
+
+#include <ydb/core/tx/tx_proxy/proxy.h>
+#include <ydb/core/tx/tx_proxy/upload_rows.h>
+
+#include <ydb/core/ydb_convert/table_description.h>
+#include <ydb/core/ydb_convert/ydb_convert.h>
+#include <yql/essentials/public/issue/yql_issue_message.h>
+
+#include <util/generic/algorithm.h>
+#include <util/string/builder.h>
+
+namespace NKikimr::NDataShard {
+using namespace NTableIndex::NFulltext;
+using namespace NKikimr::NFulltext;
+
+class TBuildFulltextIndexScan: public TActor<TBuildFulltextIndexScan>, public IActorExceptionHandler, public NTable::IScan { // table scan that emits one index row per token of the text column
+ IDriver* Driver = nullptr; // set in Prepare(), reset to nullptr in Finish()
+
+ ui64 TabletId = 0;
+ ui64 BuildId = 0;
+
+ ui64 ReadRows = 0; // rows passed to Feed(), reported through MeteringStats
+ ui64 ReadBytes = 0;
+
+ TTags ScanTags; // text column tag first, then the data columns other than the text column
+ TString TextColumn;
+ Ydb::Table::FulltextIndexSettings::Analyzers TextAnalyzers;
+
+ TBatchRowsUploader Uploader;
+ TBufferData* UploadBuf = nullptr; // destination buffer returned by Uploader.AddDestination()
+
+ const NKikimrTxDataShard::TEvBuildFulltextIndexRequest Request;
+ const TActorId ResponseActorId;
+ const TAutoPtr<TEvDataShard::TEvBuildFulltextIndexResponse> Response;
+
+public:
+ static constexpr NKikimrServices::TActivity::EType ActorActivityType()
+ {
+ return NKikimrServices::TActivity::BUILD_FULLTEXT_INDEX;
+ }
+
+ TBuildFulltextIndexScan(ui64 tabletId, const TUserTable& table, NKikimrTxDataShard::TEvBuildFulltextIndexRequest request,
+ const TActorId& responseActorId, TAutoPtr<TEvDataShard::TEvBuildFulltextIndexResponse>&& response)
+ : TActor{&TThis::StateWork}
+ , TabletId(tabletId)
+ , BuildId{request.GetId()}
+ , Uploader(request.GetScanSettings())
+ , Request(std::move(request))
+ , ResponseActorId{responseActorId}
+ , Response{std::move(response)}
+ {
+ LOG_I("Create " << Debug());
+
+ Y_ENSURE(Request.settings().columns().size() == 1); // exactly one indexed text column is handled here
+ TextColumn = Request.settings().columns().at(0).column();
+ TextAnalyzers = Request.settings().columns().at(0).analyzers();
+
+ auto tags = GetAllTags(table);
+ auto types = GetAllTypes(table);
+
+ {
+ ScanTags.push_back(tags.at(TextColumn));
+
+ for (const auto& dataColumn : Request.GetDataColumns()) {
+ if (dataColumn != TextColumn) {
+ ScanTags.push_back(tags.at(dataColumn));
+ }
+ }
+ }
+
+ {
+ auto uploadTypes = std::make_shared<NTxProxy::TUploadTypes>();
+ auto addType = [&](const auto& column) { // appends the column once; erasing it from `types` makes repeated names a no-op
+ auto it = types.find(column);
+ if (it != types.end()) {
+ Ydb::Type type;
+ NScheme::ProtoFromTypeInfo(it->second, type);
+ uploadTypes->emplace_back(it->first, type);
+ types.erase(it);
+ }
+ };
+ { // upload layout: token column, then the table key columns, then the data columns
+ Ydb::Type type;
+ type.set_type_id(TokenType);
+ uploadTypes->emplace_back(TokenColumn, type);
+ }
+ for (const auto& column : table.KeyColumnIds) {
+ addType(table.Columns.at(column).Name);
+ }
+ for (const auto& dataColumn : Request.GetDataColumns()) {
+ addType(dataColumn);
+ }
+ UploadBuf = Uploader.AddDestination(Request.GetIndexName(), std::move(uploadTypes));
+ }
+ }
+
+ TInitialState Prepare(IDriver* driver, TIntrusiveConstPtr<TScheme>) final
+ {
+ TActivationContext::AsActorContext().RegisterWithSameMailbox(this);
+ LOG_I("Prepare " << Debug());
+
+ Driver = driver;
+ Uploader.SetOwner(SelfId());
+
+ return {EScan::Feed, {}};
+ }
+
+ EScan Seek(TLead& lead, ui64 seq) final
+ {
+ LOG_T("Seek " << seq << " " << Debug());
+
+ if (seq) { // non-zero seq: re-entered after Exhausted() asked for Reset, only wait for uploads
+ return Uploader.CanFinish()
+ ? EScan::Final
+ : EScan::Sleep;
+ }
+
+ lead.To(ScanTags, {}, NTable::ESeek::Lower);
+
+ return EScan::Feed;
+ }
+
+ EScan Feed(TArrayRef<const TCell> key, const TRow& row) final
+ {
+ // LOG_T("Feed " << Debug());
+
+ ++ReadRows;
+ ReadBytes += CountRowCellBytes(key, *row);
+
+ TVector<TCell> uploadKey(::Reserve(key.size() + 1));
+ TVector<TCell> uploadValue(::Reserve(Request.GetDataColumns().size()));
+
+ TString text((*row).at(0).AsBuf());
+ // data columns do not depend on the token, so the value cells are built once per row
+ size_t index = 1; // skip text column
+ for (const auto& dataColumn : Request.GetDataColumns()) {
+ if (dataColumn != TextColumn) {
+ uploadValue.push_back(row.Get(index++));
+ } else {
+ uploadValue.push_back(TCell(text));
+ }
+ }
+
+ auto tokens = Analyze(text, TextAnalyzers);
+ for (const auto& token : tokens) {
+ uploadKey.clear();
+ uploadKey.push_back(TCell(token)); // index key = token followed by the source row key
+ uploadKey.insert(uploadKey.end(), key.begin(), key.end());
+
+ UploadBuf->AddRow(uploadKey, uploadValue);
+ }
+
+ return Uploader.ShouldWaitUpload() ? EScan::Sleep : EScan::Feed;
+ }
+
+ EScan PageFault() final
+ {
+ LOG_T("PageFault " << Debug());
+ return EScan::Feed;
+ }
+
+ EScan Exhausted() final
+ {
+ LOG_T("Exhausted " << Debug());
+
+ // call Seek to wait uploads
+ return EScan::Reset;
+ }
+
+ TAutoPtr<IDestructable> Finish(const std::exception& exc) final
+ {
+ Uploader.AddIssue(exc);
+ return Finish(EStatus::Exception);
+ }
+
+ TAutoPtr<IDestructable> Finish(EStatus status) final
+ {
+ auto& record = Response->Record;
+ record.MutableMeteringStats()->SetReadRows(ReadRows);
+ record.MutableMeteringStats()->SetReadBytes(ReadBytes);
+ record.MutableMeteringStats()->SetCpuTimeUs(Driver->GetTotalCpuTimeUs());
+
+ Uploader.Finish(record, status);
+
+ if (Response->Record.GetStatus() == NKikimrIndexBuilder::DONE) {
+ LOG_N("Done " << Debug() << " " << Response->Record.ShortDebugString());
+ } else {
+ LOG_E("Failed " << Debug() << " " << Response->Record.ShortDebugString());
+ }
+ Send(ResponseActorId, Response.Release());
+
+ Driver = nullptr; // later upload responses and exceptions see no driver and are ignored
+ this->PassAway();
+ return nullptr;
+ }
+
+ bool OnUnhandledException(const std::exception& exc) final
+ {
+ if (!Driver) {
+ return false;
+ }
+ Driver->Throw(exc);
+ return true;
+ }
+
+ void Describe(IOutputStream& out) const final
+ {
+ out << Debug();
+ }
+
+protected:
+ STFUNC(StateWork)
+ {
+ switch (ev->GetTypeRewrite()) {
+ HFunc(TEvTxUserProxy::TEvUploadRowsResponse, Handle);
+ CFunc(TEvents::TSystem::Wakeup, HandleWakeup);
+ default:
+ LOG_E("StateWork unexpected event type: " << ev->GetTypeRewrite()
+ << " event: " << ev->ToString() << " " << Debug());
+ }
+ }
+
+ void HandleWakeup(const NActors::TActorContext& /*ctx*/)
+ {
+ LOG_D("Retry upload " << Debug());
+
+ Uploader.RetryUpload();
+ }
+
+ void Handle(TEvTxUserProxy::TEvUploadRowsResponse::TPtr& ev, const TActorContext& ctx)
+ {
+ LOG_D("Handle TEvUploadRowsResponse " << Debug()
+ << " ev->Sender: " << ev->Sender.ToString());
+
+ if (!Driver) {
+ return;
+ }
+
+ Uploader.Handle(ev);
+
+ if (Uploader.GetUploadStatus().IsSuccess()) {
+ Driver->Touch(EScan::Feed);
+ return;
+ }
+
+ if (auto retryAfter = Uploader.GetRetryAfter(); retryAfter) {
+ LOG_N("Got retriable error, " << Debug() << " " << Uploader.GetUploadStatus().ToString());
+ ctx.Schedule(*retryAfter, new TEvents::TEvWakeup()); // the wakeup is handled by HandleWakeup(), which retries the upload
+ return;
+ }
+
+ LOG_N("Got error, abort scan, " << Debug() << " " << Uploader.GetUploadStatus().ToString());
+
+ Driver->Touch(EScan::Final);
+ }
+
+ TString Debug() const
+ {
+ return TStringBuilder() << "TBuildFulltextIndexScan TabletId: " << TabletId << " Id: " << BuildId
+ << " " << Uploader.Debug();
+ }
+};
+
+class TDataShard::TTxHandleSafeBuildFulltextIndexScan final: public NTabletFlatExecutor::TTransactionBase<TDataShard> { // transaction whose Execute() forwards the stored request to TDataShard::HandleSafe
+public:
+ TTxHandleSafeBuildFulltextIndexScan(TDataShard* self, TEvDataShard::TEvBuildFulltextIndexRequest::TPtr&& ev)
+ : TTransactionBase(self)
+ , Ev(std::move(ev))
+ {
+ }
+
+ bool Execute(TTransactionContext&, const TActorContext& ctx) final
+ {
+ Self->HandleSafe(Ev, ctx);
+ return true;
+ }
+
+ void Complete(const TActorContext&) final
+ {
+ }
+
+private:
+ TEvDataShard::TEvBuildFulltextIndexRequest::TPtr Ev; // request event moved in by the constructor
+};
+
+void TDataShard::Handle(TEvDataShard::TEvBuildFulltextIndexRequest::TPtr& ev, const TActorContext&) // wraps the request into a TTxHandleSafeBuildFulltextIndexScan transaction
+{
+ Execute(new TTxHandleSafeBuildFulltextIndexScan(this, std::move(ev)));
+}
+
+void TDataShard::HandleSafe(TEvDataShard::TEvBuildFulltextIndexRequest::TPtr& ev, const TActorContext& ctx) // validates the request and starts TBuildFulltextIndexScan, or replies BAD_REQUEST with all collected issues
+{
+ auto& request = ev->Get()->Record;
+ const ui64 id = request.GetId();
+ TRowVersion rowVersion(request.GetSnapshotStep(), request.GetSnapshotTxId());
+ TScanRecord::TSeqNo seqNo = {request.GetSeqNoGeneration(), request.GetSeqNoRound()};
+
+ try {
+ auto response = MakeHolder<TEvDataShard::TEvBuildFulltextIndexResponse>();
+ FillScanResponseCommonFields(*response, id, TabletID(), seqNo);
+
+ LOG_N("Starting TBuildFulltextIndexScan TabletId: " << TabletID()
+ << " " << request.ShortDebugString()
+ << " row version " << rowVersion);
+
+ // Note: it's very unlikely that we have volatile txs before this snapshot
+ if (VolatileTxManager.HasVolatileTxsAtSnapshot(rowVersion)) {
+ VolatileTxManager.AttachWaitingSnapshotEvent(rowVersion, std::unique_ptr<IEventHandle>(ev.Release()));
+ return;
+ }
+
+ auto badRequest = [&](const TString& error) { // marks the response BAD_REQUEST and appends one issue; may be called several times
+ response->Record.SetStatus(NKikimrIndexBuilder::EBuildStatus::BAD_REQUEST);
+ auto issue = response->Record.AddIssues();
+ issue->set_severity(NYql::TSeverityIds::S_ERROR);
+ issue->set_message(error);
+ };
+ auto trySendBadRequest = [&] { // sends the response and returns true if any badRequest() call happened so far
+ if (response->Record.GetStatus() == NKikimrIndexBuilder::EBuildStatus::BAD_REQUEST) {
+ LOG_E("Rejecting TBuildFulltextIndexScan bad request TabletId: " << TabletID()
+ << " " << request.ShortDebugString()
+ << " with response " << response->Record.ShortDebugString());
+ ctx.Send(ev->Sender, std::move(response));
+ return true;
+ } else {
+ return false;
+ }
+ };
+
+ // 1. Validating table and path existence
+ if (request.GetTabletId() != TabletID()) {
+ badRequest(TStringBuilder() << "Wrong shard " << request.GetTabletId() << " this is " << TabletID());
+ }
+ if (!IsStateActive()) {
+ badRequest(TStringBuilder() << "Shard " << TabletID() << " is " << State << " and not ready for requests");
+ }
+ const auto pathId = TPathId::FromProto(request.GetPathId());
+ const auto* userTableIt = GetUserTables().FindPtr(pathId.LocalPathId);
+ if (!userTableIt) {
+ badRequest(TStringBuilder() << "Unknown table id: " << pathId.LocalPathId);
+ }
+ if (trySendBadRequest()) {
+ return;
+ }
+ const auto& userTable = **userTableIt; // safe: a null userTableIt was rejected just above
+
+ // 2. Validating request fields
+ if (!request.HasSnapshotStep() || !request.HasSnapshotTxId()) {
+ badRequest(TStringBuilder() << "Missing snapshot");
+ } else {
+ const TSnapshotKey snapshotKey(pathId, rowVersion.Step, rowVersion.TxId);
+ if (!SnapshotManager.FindAvailable(snapshotKey)) {
+ badRequest(TStringBuilder() << "Unknown snapshot for path id " << pathId.OwnerId << ":" << pathId.LocalPathId
+ << ", snapshot step is " << snapshotKey.Step << ", snapshot tx is " << snapshotKey.TxId);
+ }
+ }
+
+ if (!request.GetIndexName()) {
+ badRequest(TStringBuilder() << "Empty index table name");
+ }
+
+ auto tags = GetAllTags(userTable);
+ for (const auto& column : request.GetSettings().columns()) { // by reference: each element is a protobuf message
+ if (!tags.contains(column.column())) {
+ badRequest(TStringBuilder() << "Unknown key column: " << column.column());
+ }
+ }
+ for (const auto& dataColumn : request.GetDataColumns()) {
+ if (!tags.contains(dataColumn)) {
+ badRequest(TStringBuilder() << "Unknown data column: " << dataColumn);
+ }
+ }
+
+ if (trySendBadRequest()) {
+ return;
+ }
+
+ // 3. Validating fulltext index settings
+ if (!request.HasSettings()) {
+ badRequest(TStringBuilder() << "Missing fulltext index settings");
+ } else {
+ TString error;
+ if (!NKikimr::NFulltext::ValidateSettings(request.GetSettings(), error)) {
+ badRequest(error);
+ }
+ }
+
+ if (trySendBadRequest()) {
+ return;
+ }
+
+ // 4. Creating scan
+ TAutoPtr<NTable::IScan> scan = new TBuildFulltextIndexScan(TabletID(), userTable,
+ request, ev->Sender, std::move(response));
+
+ StartScan(this, std::move(scan), id, seqNo, rowVersion, userTable.LocalTid);
+ } catch (const std::exception& exc) {
+ FailScan<TEvDataShard::TEvBuildFulltextIndexResponse>(id, TabletID(), ev->Sender, seqNo, exc, "TBuildFulltextIndexScan");
+ }
+}
+
+}
diff --git a/ydb/core/tx/datashard/build_index/secondary_index.cpp b/ydb/core/tx/datashard/build_index/secondary_index.cpp
index 20e68c03a5b..40342b06156 100644
--- a/ydb/core/tx/datashard/build_index/secondary_index.cpp
+++ b/ydb/core/tx/datashard/build_index/secondary_index.cpp
@@ -577,12 +577,13 @@ void TDataShard::HandleSafe(TEvDataShard::TEvBuildIndexCreateRequest::TPtr& ev,
// 2. Validating request fields
if (!request.HasSnapshotStep() || !request.HasSnapshotTxId()) {
- badRequest(TStringBuilder() << "Empty snapshot");
- }
- const TSnapshotKey snapshotKey(tableId.PathId, rowVersion.Step, rowVersion.TxId);
- if (!SnapshotManager.FindAvailable(snapshotKey)) {
- badRequest(TStringBuilder() << "Unknown snapshot for path id " << tableId.PathId.OwnerId << ":" << tableId.PathId.LocalPathId
- << ", snapshot step is " << snapshotKey.Step << ", snapshot tx is " << snapshotKey.TxId);
+ badRequest(TStringBuilder() << "Missing snapshot");
+ } else {
+ const TSnapshotKey snapshotKey(tableId.PathId, rowVersion.Step, rowVersion.TxId);
+ if (!SnapshotManager.FindAvailable(snapshotKey)) {
+ badRequest(TStringBuilder() << "Unknown snapshot for path id " << tableId.PathId.OwnerId << ":" << tableId.PathId.LocalPathId
+ << ", snapshot step is " << snapshotKey.Step << ", snapshot tx is " << snapshotKey.TxId);
+ }
}
TSerializedTableRange requestedRange;
diff --git a/ydb/core/tx/datashard/build_index/ut/ut_fulltext.cpp b/ydb/core/tx/datashard/build_index/ut/ut_fulltext.cpp
new file mode 100644
index 00000000000..a0f6884f7bf
--- /dev/null
+++ b/ydb/core/tx/datashard/build_index/ut/ut_fulltext.cpp
@@ -0,0 +1,357 @@
+#include "ut_helpers.h"
+
+#include <ydb/core/base/table_index.h>
+#include <ydb/core/protos/index_builder.pb.h>
+#include <ydb/core/testlib/test_client.h>
+#include <ydb/core/tx/datashard/ut_common/datashard_ut_common.h>
+#include <ydb/core/tx/schemeshard/schemeshard.h>
+#include <ydb/core/tx/tx_proxy/proxy.h>
+#include <ydb/core/tx/tx_proxy/upload_rows.h>
+
+#include <yql/essentials/public/issue/yql_issue_message.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+namespace NKikimr {
+using namespace Tests;
+using Ydb::Table::FulltextIndexSettings;
+using namespace NTableIndex::NFulltext;
+
+static std::atomic<ui64> sId = 1;
+static const TString kMainTable = "/Root/table-main";
+static const TString kIndexTable = "/Root/table-index";
+
+Y_UNIT_TEST_SUITE(TTxDataShardBuildFulltextIndexScan) {
+
+ ui64 FillRequest(Tests::TServer::TPtr server, TActorId sender, // fills a valid default request, applies setupRequest on top, returns the target shard id
+ NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request,
+ std::function<void(NKikimrTxDataShard::TEvBuildFulltextIndexRequest&)> setupRequest)
+ {
+ auto id = sId.fetch_add(1, std::memory_order_relaxed);
+
+ auto snapshot = CreateVolatileSnapshot(server, {kMainTable});
+ auto datashards = GetTableShards(server, sender, kMainTable);
+ TTableId tableId = ResolveTableId(server, sender, kMainTable);
+
+ UNIT_ASSERT(datashards.size() == 1);
+
+ request.SetId(1);
+ request.SetSeqNoGeneration(id); // the unique counter goes into the seqno generation, the request id stays 1
+ request.SetSeqNoRound(1);
+
+ request.SetTabletId(datashards[0]);
+ tableId.PathId.ToProto(request.MutablePathId());
+
+ request.SetSnapshotTxId(snapshot.TxId);
+ request.SetSnapshotStep(snapshot.Step);
+
+ FulltextIndexSettings settings; // defaults: FLAT layout, single column "text", WHITESPACE tokenizer
+ settings.set_layout(FulltextIndexSettings::FLAT);
+ auto column = settings.add_columns();
+ column->set_column("text");
+ column->mutable_analyzers()->set_tokenizer(FulltextIndexSettings::WHITESPACE);
+ *request.MutableSettings() = settings;
+
+ request.SetIndexName(kIndexTable);
+
+ setupRequest(request); // caller-specific overrides are applied last
+
+ return datashards[0];
+ }
+
+ void DoBadRequest(Tests::TServer::TPtr server, TActorId sender, // builds a request via FillRequest and delegates the check to the shared NKikimr::DoBadRequest helper
+ std::function<void(NKikimrTxDataShard::TEvBuildFulltextIndexRequest&)> setupRequest,
+ const TString& expectedError, bool expectedErrorSubstring = false, NKikimrIndexBuilder::EBuildStatus expectedStatus = NKikimrIndexBuilder::EBuildStatus::BAD_REQUEST)
+ {
+ auto ev = std::make_unique<TEvDataShard::TEvBuildFulltextIndexRequest>();
+
+ auto tabletId = FillRequest(server, sender, ev->Record, setupRequest);
+
+ NKikimr::DoBadRequest<TEvDataShard::TEvBuildFulltextIndexResponse>(server, sender, std::move(ev), tabletId, expectedError, expectedErrorSubstring, expectedStatus);
+ }
+
+ TString DoBuild(Tests::TServer::TPtr server, TActorId sender, std::function<void(NKikimrTxDataShard::TEvBuildFulltextIndexRequest&)> setupRequest) { // runs a build, asserts DONE, returns the index table contents as text
+ auto ev1 = std::make_unique<TEvDataShard::TEvBuildFulltextIndexRequest>();
+ auto tabletId = FillRequest(server, sender, ev1->Record, setupRequest);
+
+ auto ev2 = std::make_unique<TEvDataShard::TEvBuildFulltextIndexRequest>(); // identical copy: the same request is sent twice
+ ev2->Record.CopyFrom(ev1->Record);
+
+ auto& runtime = *server->GetRuntime();
+ runtime.SendToPipe(tabletId, sender, ev1.release(), 0, GetPipeConfigWithRetries());
+ runtime.SendToPipe(tabletId, sender, ev2.release(), 0, GetPipeConfigWithRetries());
+
+ TAutoPtr<IEventHandle> handle;
+ auto reply = runtime.GrabEdgeEventRethrow<TEvDataShard::TEvBuildFulltextIndexResponse>(handle);
+
+ UNIT_ASSERT_EQUAL_C(reply->Record.GetStatus(), NKikimrIndexBuilder::EBuildStatus::DONE, reply->Record.ShortDebugString());
+
+ auto index = ReadShardedTable(server, kIndexTable);
+ Cerr << "Index:" << Endl;
+ Cerr << index << Endl;
+ return index; // plain return of the local: std::move here only blocked copy elision
+ }
+
+ void CreateMainTable(Tests::TServer::TPtr server, TActorId sender) { // creates the single-shard /Root/table-main with columns key, text, data
+ TShardedTableOptions options;
+ options.EnableOutOfOrder(true);
+ options.Shards(1);
+ options.AllowSystemColumnNames(false);
+ options.Columns({
+ {"key", "Uint32", true, true},
+ {"text", "String", false, false},
+ {"data", "String", false, false},
+ });
+ CreateShardedTable(server, sender, "/Root", "table-main", options);
+ }
+
+ void FillMainTable(Tests::TServer::TPtr server, TActorId sender) { // upserts the four fixture rows the Build* tests expect
+ ExecSQL(server, sender, R"(
+ UPSERT INTO `/Root/table-main` (key, text, data) VALUES
+ (1, "green apple", "one"),
+ (2, "red apple", "two"),
+ (3, "yellow apple", "three"),
+ (4, "red car", "four")
+ )");
+ }
+
+ void CreateIndexTable(Tests::TServer::TPtr server, TActorId sender) { // creates /Root/table-index keyed by (token column, key) with a data column
+ TShardedTableOptions options;
+ options.EnableOutOfOrder(true);
+ options.Shards(1);
+ options.AllowSystemColumnNames(true); // enabled here, unlike the main table, because of the TokenColumn name
+ options.Columns({
+ {TokenColumn, NTableIndex::NFulltext::TokenTypeName, true, true},
+ {"key", "Uint32", true, true},
+ {"data", "String", false, false},
+ });
+ CreateShardedTable(server, sender, "/Root", "table-index", options);
+ }
+
+ void Setup(Tests::TServer::TPtr server, TActorId sender) { // common fixture: log levels, root, filled main table, empty index table
+ server->GetRuntime()->SetLogPriority(NKikimrServices::TX_DATASHARD, NLog::PRI_DEBUG);
+ server->GetRuntime()->SetLogPriority(NKikimrServices::BUILD_INDEX, NLog::PRI_TRACE);
+
+ InitRoot(server, sender);
+
+ CreateMainTable(server, sender);
+ FillMainTable(server, sender);
+ CreateIndexTable(server, sender);
+ }
+
+ Y_UNIT_TEST(BadRequest) { // each case breaks one field of an otherwise valid request and checks the reported issue text
+ TPortManager pm;
+ TServerSettings serverSettings(pm.GetPort(2134));
+ serverSettings.SetDomainName("Root");
+
+ Tests::TServer::TPtr server = new TServer(serverSettings);
+ auto sender = server->GetRuntime()->AllocateEdgeActor();
+
+ Setup(server, sender);
+
+ DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) {
+ request.SetTabletId(0);
+ }, TStringBuilder() << "{ <main>: Error: Wrong shard 0 this is " << GetTableShards(server, sender, kMainTable)[0] << " }");
+ DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) {
+ TPathId(0, 0).ToProto(request.MutablePathId());
+ }, "{ <main>: Error: Unknown table id: 0 }");
+
+ DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) {
+ request.SetSnapshotStep(request.GetSnapshotStep() + 1);
+ }, "Error: Unknown snapshot", true); // substring match: the full message also carries path id, step and tx id
+ DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) {
+ request.ClearSnapshotStep();
+ }, "{ <main>: Error: Missing snapshot }");
+ DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) {
+ request.SetSnapshotTxId(request.GetSnapshotTxId() + 1);
+ }, "Error: Unknown snapshot", true);
+ DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) {
+ request.ClearSnapshotTxId();
+ }, "{ <main>: Error: Missing snapshot }");
+
+ DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) {
+ request.clear_settings();
+ }, "{ <main>: Error: Missing fulltext index settings }");
+ DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) {
+ request.MutableSettings()->clear_columns();
+ }, "{ <main>: Error: columns should be set }");
+ DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) {
+ request.MutableSettings()->mutable_columns()->at(0).mutable_analyzers()->clear_tokenizer();
+ }, "{ <main>: Error: tokenizer should be set }");
+
+ DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) {
+ request.ClearIndexName();
+ }, "{ <main>: Error: Empty index table name }");
+
+ DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) {
+ request.MutableSettings()->mutable_columns()->at(0).set_column("some");
+ }, "{ <main>: Error: Unknown key column: some }");
+ DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) {
+ request.AddDataColumns("some");
+ }, "{ <main>: Error: Unknown data column: some }");
+
+ // test multiple issues:
+ DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvBuildFulltextIndexRequest& request) {
+ request.ClearIndexName();
+ request.AddDataColumns("some");
+ }, "[ { <main>: Error: Empty index table name } { <main>: Error: Unknown data column: some } ]");
+ }
+
+ Y_UNIT_TEST(Build) { // default request without data columns: the index "data" column is expected to stay empty
+ TPortManager pm;
+ TServerSettings serverSettings(pm.GetPort(2134));
+ serverSettings.SetDomainName("Root");
+
+ Tests::TServer::TPtr server = new TServer(serverSettings);
+ auto sender = server->GetRuntime()->AllocateEdgeActor();
+
+ Setup(server, sender);
+
+ auto result = DoBuild(server, sender, [](auto&){});
+
+ UNIT_ASSERT_VALUES_EQUAL(result, R"(__ydb_token = apple, key = 1, data = (empty maybe)
+__ydb_token = apple, key = 2, data = (empty maybe)
+__ydb_token = apple, key = 3, data = (empty maybe)
+__ydb_token = car, key = 4, data = (empty maybe)
+__ydb_token = green, key = 1, data = (empty maybe)
+__ydb_token = red, key = 2, data = (empty maybe)
+__ydb_token = red, key = 4, data = (empty maybe)
+__ydb_token = yellow, key = 3, data = (empty maybe)
+)");
+ }
+
+ Y_UNIT_TEST(BuildWithData) { // "data" requested as a data column: every token row is expected to carry the source row's data value
+ TPortManager pm;
+ TServerSettings serverSettings(pm.GetPort(2134));
+ serverSettings.SetDomainName("Root");
+
+ Tests::TServer::TPtr server = new TServer(serverSettings);
+ auto sender = server->GetRuntime()->AllocateEdgeActor();
+
+ Setup(server, sender);
+
+ auto result = DoBuild(server, sender, [](auto& request) {
+ request.AddDataColumns("data");
+ });
+
+ UNIT_ASSERT_VALUES_EQUAL(result, R"(__ydb_token = apple, key = 1, data = one
+__ydb_token = apple, key = 2, data = two
+__ydb_token = apple, key = 3, data = three
+__ydb_token = car, key = 4, data = four
+__ydb_token = green, key = 1, data = one
+__ydb_token = red, key = 2, data = two
+__ydb_token = red, key = 4, data = four
+__ydb_token = yellow, key = 3, data = three
+)");
+ }
+
+ Y_UNIT_TEST(BuildWithTextData) { // the indexed "text" column is also requested as a data column and copied into the index
+ TPortManager pm;
+ TServerSettings serverSettings(pm.GetPort(2134));
+ serverSettings.SetDomainName("Root");
+
+ Tests::TServer::TPtr server = new TServer(serverSettings);
+ auto sender = server->GetRuntime()->AllocateEdgeActor();
+
+ InitRoot(server, sender);
+
+ CreateMainTable(server, sender);
+ FillMainTable(server, sender);
+
+ { // CreateIndexTable with text column
+ TShardedTableOptions options;
+ options.EnableOutOfOrder(true);
+ options.Shards(1);
+ options.AllowSystemColumnNames(true);
+ options.Columns({
+ {TokenColumn, NTableIndex::NFulltext::TokenTypeName, true, true},
+ {"key", "Uint32", true, true},
+ {"text", "String", false, false},
+ {"data", "String", false, false},
+ });
+ CreateShardedTable(server, sender, "/Root", "table-index", options);
+ }
+
+ auto result = DoBuild(server, sender, [](auto& request) {
+ request.AddDataColumns("text");
+ request.AddDataColumns("data");
+ });
+
+ UNIT_ASSERT_VALUES_EQUAL(result, R"(__ydb_token = apple, key = 1, text = green apple, data = one
+__ydb_token = apple, key = 2, text = red apple, data = two
+__ydb_token = apple, key = 3, text = yellow apple, data = three
+__ydb_token = car, key = 4, text = red car, data = four
+__ydb_token = green, key = 1, text = green apple, data = one
+__ydb_token = red, key = 2, text = red apple, data = two
+__ydb_token = red, key = 4, text = red car, data = four
+__ydb_token = yellow, key = 3, text = yellow apple, data = three
+)");
+ }
+
+ Y_UNIT_TEST(BuildWithTextFromKey) { // the indexed "text" column is part of the main table's primary key (key, text, subkey)
+ TPortManager pm;
+ TServerSettings serverSettings(pm.GetPort(2134));
+ serverSettings.SetDomainName("Root");
+
+ Tests::TServer::TPtr server = new TServer(serverSettings);
+ auto sender = server->GetRuntime()->AllocateEdgeActor();
+
+ server->GetRuntime()->SetLogPriority(NKikimrServices::TX_DATASHARD, NLog::PRI_DEBUG);
+ server->GetRuntime()->SetLogPriority(NKikimrServices::BUILD_INDEX, NLog::PRI_TRACE);
+
+ InitRoot(server, sender);
+
+ { // CreateMainTable
+ TShardedTableOptions options;
+ options.EnableOutOfOrder(true);
+ options.Shards(1);
+ options.AllowSystemColumnNames(false);
+ options.Columns({
+ {"key", "Uint32", true, true},
+ {"text", "String", true, true},
+ {"subkey", "Uint32", true, true},
+ {"data", "String", false, false},
+ });
+ CreateShardedTable(server, sender, "/Root", "table-main", options);
+ }
+ { // FillMainTable
+ ExecSQL(server, sender, R"(
+ UPSERT INTO `/Root/table-main` (key, text, subkey, data) VALUES
+ (1, "green apple", 11, "one"),
+ (2, "red apple", 22, "two"),
+ (3, "yellow apple", 33, "three"),
+ (4, "red car", 44, "four")
+ )");
+ }
+ { // CreateIndexTable
+ TShardedTableOptions options;
+ options.EnableOutOfOrder(true);
+ options.Shards(1);
+ options.AllowSystemColumnNames(true);
+ options.Columns({
+ {TokenColumn, NTableIndex::NFulltext::TokenTypeName, true, true},
+ {"key", "Uint32", true, true},
+ {"text", "String", true, true},
+ {"subkey", "Uint32", true, true},
+ {"data", "String", false, false},
+ });
+ CreateShardedTable(server, sender, "/Root", "table-index", options);
+ }
+
+ auto result = DoBuild(server, sender, [](auto& request) {
+ request.AddDataColumns("data");
+ });
+
+ UNIT_ASSERT_VALUES_EQUAL(result, R"(__ydb_token = apple, key = 1, text = green apple, subkey = 11, data = one
+__ydb_token = apple, key = 2, text = red apple, subkey = 22, data = two
+__ydb_token = apple, key = 3, text = yellow apple, subkey = 33, data = three
+__ydb_token = car, key = 4, text = red car, subkey = 44, data = four
+__ydb_token = green, key = 1, text = green apple, subkey = 11, data = one
+__ydb_token = red, key = 2, text = red apple, subkey = 22, data = two
+__ydb_token = red, key = 4, text = red car, subkey = 44, data = four
+__ydb_token = yellow, key = 3, text = yellow apple, subkey = 33, data = three
+)");
+ }
+}
+
+}
diff --git a/ydb/core/tx/datashard/build_index/ut/ya.make b/ydb/core/tx/datashard/build_index/ut/ya.make
index 8da7981617c..b71af3fef00 100644
--- a/ydb/core/tx/datashard/build_index/ut/ya.make
+++ b/ydb/core/tx/datashard/build_index/ut/ya.make
@@ -27,6 +27,7 @@ PEERDIR(
YQL_LAST_ABI_VERSION()
SRCS(
+ ut_fulltext.cpp
ut_local_kmeans.cpp
ut_prefix_kmeans.cpp
ut_recompute_kmeans.cpp
diff --git a/ydb/core/tx/datashard/datashard.h b/ydb/core/tx/datashard/datashard.h
index 47becb18023..dabc4175395 100644
--- a/ydb/core/tx/datashard/datashard.h
+++ b/ydb/core/tx/datashard/datashard.h
@@ -367,6 +367,9 @@ namespace TEvDataShard {
EvIncrementalRestoreResponse,
+ EvBuildFulltextIndexRequest,
+ EvBuildFulltextIndexResponse,
+
EvEnd
};
@@ -1560,6 +1563,18 @@ namespace TEvDataShard {
TEvDataShard::EvPrefixKMeansResponse> {
};
+ struct TEvBuildFulltextIndexRequest // protobuf-backed event carrying NKikimrTxDataShard::TEvBuildFulltextIndexRequest
+ : public TEventPB<TEvBuildFulltextIndexRequest,
+ NKikimrTxDataShard::TEvBuildFulltextIndexRequest,
+ TEvDataShard::EvBuildFulltextIndexRequest> {
+ };
+
+ struct TEvBuildFulltextIndexResponse // protobuf-backed event carrying NKikimrTxDataShard::TEvBuildFulltextIndexResponse
+ : public TEventPB<TEvBuildFulltextIndexResponse,
+ NKikimrTxDataShard::TEvBuildFulltextIndexResponse,
+ TEvDataShard::EvBuildFulltextIndexResponse> {
+ };
+
struct TEvIncrementalRestoreResponse
: public TEventPB<TEvIncrementalRestoreResponse,
NKikimrTxDataShard::TEvIncrementalRestoreResponse,
diff --git a/ydb/core/tx/datashard/datashard_impl.h b/ydb/core/tx/datashard/datashard_impl.h
index 52f48f77cd6..29a967d6aaf 100644
--- a/ydb/core/tx/datashard/datashard_impl.h
+++ b/ydb/core/tx/datashard/datashard_impl.h
@@ -241,6 +241,7 @@ class TDataShard
class TTxHandleSafeReshuffleKMeansScan;
class TTxHandleSafeRecomputeKMeansScan;
class TTxHandleSafeStatisticsScan;
+ class TTxHandleSafeBuildFulltextIndexScan;
class TTxMediatorStateRestored;
@@ -1342,6 +1343,8 @@ class TDataShard
void HandleSafe(TEvDataShard::TEvLocalKMeansRequest::TPtr& ev, const TActorContext& ctx);
void Handle(TEvDataShard::TEvPrefixKMeansRequest::TPtr& ev, const TActorContext& ctx);
void HandleSafe(TEvDataShard::TEvPrefixKMeansRequest::TPtr& ev, const TActorContext& ctx);
+ void Handle(TEvDataShard::TEvBuildFulltextIndexRequest::TPtr& ev, const TActorContext& ctx);
+ void HandleSafe(TEvDataShard::TEvBuildFulltextIndexRequest::TPtr& ev, const TActorContext& ctx);
void Handle(TEvDataShard::TEvCdcStreamScanRequest::TPtr& ev, const TActorContext& ctx);
void Handle(TEvPrivate::TEvCdcStreamScanRegistered::TPtr& ev, const TActorContext& ctx);
void Handle(TEvPrivate::TEvCdcStreamScanProgress::TPtr& ev, const TActorContext& ctx);
@@ -3225,6 +3228,7 @@ protected:
HFunc(TEvDataShard::TEvRecomputeKMeansRequest, Handle);
HFunc(TEvDataShard::TEvLocalKMeansRequest, Handle);
HFunc(TEvDataShard::TEvPrefixKMeansRequest, Handle);
+ HFunc(TEvDataShard::TEvBuildFulltextIndexRequest, Handle);
HFunc(TEvDataShard::TEvCdcStreamScanRequest, Handle);
HFunc(TEvPrivate::TEvCdcStreamScanRegistered, Handle);
HFunc(TEvPrivate::TEvCdcStreamScanProgress, Handle);
diff --git a/ydb/core/tx/datashard/ya.make b/ydb/core/tx/datashard/ya.make
index c210e96e4f9..ee92b38d455 100644
--- a/ydb/core/tx/datashard/ya.make
+++ b/ydb/core/tx/datashard/ya.make
@@ -218,13 +218,14 @@ SRCS(
wait_for_plan_unit.cpp
wait_for_stream_clearance_unit.cpp
- build_index/prefix_kmeans.cpp
+ build_index/fulltext.cpp
build_index/kmeans_helper.cpp
build_index/local_kmeans.cpp
- build_index/sample_k.cpp
- build_index/secondary_index.cpp
+ build_index/prefix_kmeans.cpp
build_index/recompute_kmeans.cpp
build_index/reshuffle_kmeans.cpp
+ build_index/sample_k.cpp
+ build_index/secondary_index.cpp
build_index/unique_index.cpp
)
diff --git a/ydb/core/tx/scheme_board/cache.cpp b/ydb/core/tx/scheme_board/cache.cpp
index b9cb865161a..cea2875fe8a 100644
--- a/ydb/core/tx/scheme_board/cache.cpp
+++ b/ydb/core/tx/scheme_board/cache.cpp
@@ -917,6 +917,8 @@ class TSchemeCache: public TMonitorableActor<TSchemeCache> {
return NSchemeCache::ETableKind::KindAsyncIndexTable;
case NKikimrSchemeOp::EPathSubTypeVectorKmeansTreeIndexImplTable:
return NSchemeCache::ETableKind::KindVectorIndexTable;
+ case NKikimrSchemeOp::EPathSubTypeFulltextIndexImplTable:
+ return NSchemeCache::ETableKind::KindFulltextIndexTable;
default:
return NSchemeCache::ETableKind::KindRegularTable;
}
diff --git a/ydb/core/tx/scheme_cache/scheme_cache.h b/ydb/core/tx/scheme_cache/scheme_cache.h
index 7d57ca289ee..e37c7fa57e3 100644
--- a/ydb/core/tx/scheme_cache/scheme_cache.h
+++ b/ydb/core/tx/scheme_cache/scheme_cache.h
@@ -157,6 +157,7 @@ enum class ETableKind {
KindSyncIndexTable = 2,
KindAsyncIndexTable = 3,
KindVectorIndexTable = 4,
+ KindFulltextIndexTable = 5,
};
struct TSchemeCacheNavigate {
diff --git a/ydb/core/tx/schemeshard/schemeshard__conditional_erase.cpp b/ydb/core/tx/schemeshard/schemeshard__conditional_erase.cpp
index 74bb9febe8b..0eeed9290d5 100644
--- a/ydb/core/tx/schemeshard/schemeshard__conditional_erase.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard__conditional_erase.cpp
@@ -1,6 +1,8 @@
#include "schemeshard_impl.h"
#include <util/string/join.h>
+#include <ydb/core/base/table_index.h>
+#include <ydb/core/protos/flat_scheme_op.pb.h>
namespace NKikimr {
namespace NSchemeShard {
@@ -239,7 +241,7 @@ private:
auto index = GetIndex(childPath);
if (index->Type == NKikimrSchemeOp::EIndexTypeGlobalAsync
- || index->Type == NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree) {
+ || !DoesIndexSupportTTL(index->Type)) {
continue;
}
@@ -276,7 +278,7 @@ private:
}
static TVector<std::pair<ui32, ui32>> MakeColumnIds(TTableInfo::TPtr mainTable, TTableIndexInfo::TPtr index, TTableInfo::TPtr indexImplTable) {
- Y_ABORT_UNLESS(index->Type != NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree);
+ Y_ABORT_UNLESS(DoesIndexSupportTTL(index->Type));
TVector<std::pair<ui32, ui32>> result;
THashSet<TString> keys;
diff --git a/ydb/core/tx/schemeshard/schemeshard__operation_consistent_copy_tables.cpp b/ydb/core/tx/schemeshard/schemeshard__operation_consistent_copy_tables.cpp
index adc9ba3ead1..b571fdde52a 100644
--- a/ydb/core/tx/schemeshard/schemeshard__operation_consistent_copy_tables.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard__operation_consistent_copy_tables.cpp
@@ -50,11 +50,23 @@ static std::optional<NKikimrSchemeOp::TModifyScheme> CreateIndexTask(NKikimr::NS
*operation->MutableDataColumnNames()->Add() = dataColumn;
}
- if (indexInfo->Type == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) {
- *operation->MutableVectorIndexKmeansTreeDescription() =
- std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(indexInfo->SpecializedIndexDescription);
- } else if (!std::holds_alternative<std::monostate>(indexInfo->SpecializedIndexDescription)) {
- return {};
+ switch (indexInfo->Type) {
+ case NKikimrSchemeOp::EIndexTypeGlobal:
+ case NKikimrSchemeOp::EIndexTypeGlobalAsync:
+ case NKikimrSchemeOp::EIndexTypeGlobalUnique:
+ // no specialized index description
+ Y_ASSERT(std::holds_alternative<std::monostate>(indexInfo->SpecializedIndexDescription));
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree:
+ *operation->MutableVectorIndexKmeansTreeDescription() =
+ std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(indexInfo->SpecializedIndexDescription);
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalFulltext:
+ *operation->MutableFulltextIndexDescription() =
+ std::get<NKikimrSchemeOp::TFulltextIndexDescription>(indexInfo->SpecializedIndexDescription);
+ break;
+ default:
+ return {}; // reject
}
return scheme;
diff --git a/ydb/core/tx/schemeshard/schemeshard__operation_copy_table.cpp b/ydb/core/tx/schemeshard/schemeshard__operation_copy_table.cpp
index e8103b6a4e6..accf49d53c6 100644
--- a/ydb/core/tx/schemeshard/schemeshard__operation_copy_table.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard__operation_copy_table.cpp
@@ -852,12 +852,24 @@ TVector<ISubOperation::TPtr> CreateCopyTable(TOperationId nextId, const TTxTrans
for (const auto& dataColumn: indexInfo->IndexDataColumns) {
*operation->MutableDataColumnNames()->Add() = dataColumn;
}
- if (indexInfo->Type == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) {
- *operation->MutableVectorIndexKmeansTreeDescription() =
- std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(indexInfo->SpecializedIndexDescription);
- } else if (!std::holds_alternative<std::monostate>(indexInfo->SpecializedIndexDescription)) {
- return {CreateReject(nextId, NKikimrScheme::EStatus::StatusInvalidParameter,
- TStringBuilder{} << "Copy table doesn't support table with index type " << indexInfo->Type)};
+
+ switch (indexInfo->Type) {
+ case NKikimrSchemeOp::EIndexTypeGlobal:
+ case NKikimrSchemeOp::EIndexTypeGlobalAsync:
+ case NKikimrSchemeOp::EIndexTypeGlobalUnique:
+ // no specialized index description
+ Y_ASSERT(std::holds_alternative<std::monostate>(indexInfo->SpecializedIndexDescription));
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree:
+ *operation->MutableVectorIndexKmeansTreeDescription() =
+ std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(indexInfo->SpecializedIndexDescription);
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalFulltext:
+ *operation->MutableFulltextIndexDescription() =
+ std::get<NKikimrSchemeOp::TFulltextIndexDescription>(indexInfo->SpecializedIndexDescription);
+ break;
+ default:
+ return {CreateReject(nextId, NKikimrScheme::EStatus::StatusInvalidParameter, InvalidIndexType(indexInfo->Type))};
}
result.push_back(CreateNewTableIndex(NextPartId(nextId, result), schema));
diff --git a/ydb/core/tx/schemeshard/schemeshard__operation_create_build_index.cpp b/ydb/core/tx/schemeshard/schemeshard__operation_create_build_index.cpp
index a135183dfbc..133d9670700 100644
--- a/ydb/core/tx/schemeshard/schemeshard__operation_create_build_index.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard__operation_create_build_index.cpp
@@ -40,6 +40,30 @@ TVector<ISubOperation::TPtr> CreateBuildIndex(TOperationId opId, const TTxTransa
const auto& op = tx.GetInitiateIndexBuild();
const auto& indexDesc = op.GetIndex();
+ switch (GetIndexType(indexDesc)) {
+ case NKikimrSchemeOp::EIndexTypeGlobal:
+ case NKikimrSchemeOp::EIndexTypeGlobalAsync:
+ // no feature flag, everything is fine
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalUnique:
+ if (!context.SS->EnableInitialUniqueIndex) {
+ return {CreateReject(opId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Adding a unique index to an existing table is disabled")};
+ }
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree:
+ if (!context.SS->EnableVectorIndex) {
+ return {CreateReject(opId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Vector index support is disabled")};
+ }
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalFulltext:
+ if (!context.SS->EnableFulltextIndex) {
+ return {CreateReject(opId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Fulltext index support is disabled")};
+ }
+ break;
+ default:
+ return {CreateReject(opId, NKikimrScheme::EStatus::StatusPreconditionFailed, InvalidIndexType(indexDesc.GetType()))};
+ }
+
const auto table = TPath::Resolve(op.GetTable(), context.SS);
const auto index = table.Child(indexDesc.GetName());
{
@@ -93,15 +117,14 @@ TVector<ISubOperation::TPtr> CreateBuildIndex(TOperationId opId, const TTxTransa
}
TVector<ISubOperation::TPtr> result;
- const NKikimrSchemeOp::EIndexType indexType = indexDesc.HasType() ? indexDesc.GetType() : NKikimrSchemeOp::EIndexTypeGlobal;
{
auto outTx = TransactionTemplate(table.PathString(), NKikimrSchemeOp::EOperationType::ESchemeOpCreateTableIndex);
*outTx.MutableLockGuard() = tx.GetLockGuard();
outTx.MutableCreateTableIndex()->CopyFrom(indexDesc);
+ outTx.MutableCreateTableIndex()->SetType(GetIndexType(indexDesc));
outTx.MutableCreateTableIndex()->SetState(NKikimrSchemeOp::EIndexStateWriteOnly);
outTx.SetInternal(tx.GetInternal());
- outTx.MutableCreateTableIndex()->SetType(indexType);
result.push_back(CreateNewTableIndex(NextPartId(opId, result), outTx));
}
@@ -118,7 +141,7 @@ TVector<ISubOperation::TPtr> CreateBuildIndex(TOperationId opId, const TTxTransa
}
auto createImplTable = [&](NKikimrSchemeOp::TTableDescription&& implTableDesc) {
- if (indexType != NKikimrSchemeOp::EIndexTypeGlobalUnique) {
+ if (GetIndexType(indexDesc) != NKikimrSchemeOp::EIndexTypeGlobalUnique) {
implTableDesc.MutablePartitionConfig()->SetShadowData(true);
}
@@ -129,34 +152,56 @@ TVector<ISubOperation::TPtr> CreateBuildIndex(TOperationId opId, const TTxTransa
return CreateInitializeBuildIndexImplTable(NextPartId(opId, result), outTx);
};
- if (indexDesc.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) {
- const bool prefixVectorIndex = indexDesc.GetKeyColumnNames().size() > 1;
- NKikimrSchemeOp::TTableDescription indexLevelTableDesc, indexPostingTableDesc, indexPrefixTableDesc;
- // TODO After IndexImplTableDescriptions are persisted, this should be replaced with Y_ABORT_UNLESS
- if (indexDesc.IndexImplTableDescriptionsSize() == 2 + prefixVectorIndex) {
- indexLevelTableDesc = indexDesc.GetIndexImplTableDescriptions(0);
- indexPostingTableDesc = indexDesc.GetIndexImplTableDescriptions(1);
- if (prefixVectorIndex) {
- indexPrefixTableDesc = indexDesc.GetIndexImplTableDescriptions(2);
+ switch (GetIndexType(indexDesc)) {
+ case NKikimrSchemeOp::EIndexTypeGlobal:
+ case NKikimrSchemeOp::EIndexTypeGlobalAsync:
+ case NKikimrSchemeOp::EIndexTypeGlobalUnique: {
+ NKikimrSchemeOp::TTableDescription indexTableDesc;
+ // TODO After IndexImplTableDescriptions are persisted, this should be replaced with Y_ABORT_UNLESS
+ if (indexDesc.IndexImplTableDescriptionsSize() == 1) {
+ indexTableDesc = indexDesc.GetIndexImplTableDescriptions(0);
}
+ auto implTableDesc = CalcImplTableDesc(tableInfo, implTableColumns, indexTableDesc);
+ // TODO if keep erase markers also speedup compaction or something else we can enable it for other impl tables too
+ implTableDesc.MutablePartitionConfig()->MutableCompactionPolicy()->SetKeepEraseMarkers(true);
+ result.push_back(createImplTable(std::move(implTableDesc)));
+ break;
}
- const THashSet<TString> indexDataColumns{indexDesc.GetDataColumnNames().begin(), indexDesc.GetDataColumnNames().end()};
- result.push_back(createImplTable(CalcVectorKmeansTreeLevelImplTableDesc(tableInfo->PartitionConfig(), indexLevelTableDesc)));
- result.push_back(createImplTable(CalcVectorKmeansTreePostingImplTableDesc(tableInfo, tableInfo->PartitionConfig(), indexDataColumns, indexPostingTableDesc)));
- if (prefixVectorIndex) {
- const THashSet<TString> prefixColumns{indexDesc.GetKeyColumnNames().begin(), indexDesc.GetKeyColumnNames().end() - 1};
- result.push_back(createImplTable(CalcVectorKmeansTreePrefixImplTableDesc(prefixColumns, tableInfo, tableInfo->PartitionConfig(), implTableColumns, indexPrefixTableDesc)));
+ case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: {
+ const bool prefixVectorIndex = indexDesc.GetKeyColumnNames().size() > 1;
+ NKikimrSchemeOp::TTableDescription indexLevelTableDesc, indexPostingTableDesc, indexPrefixTableDesc;
+ // TODO After IndexImplTableDescriptions are persisted, this should be replaced with Y_ABORT_UNLESS
+ if (indexDesc.IndexImplTableDescriptionsSize() == 2 + prefixVectorIndex) {
+ indexLevelTableDesc = indexDesc.GetIndexImplTableDescriptions(0);
+ indexPostingTableDesc = indexDesc.GetIndexImplTableDescriptions(1);
+ if (prefixVectorIndex) {
+ indexPrefixTableDesc = indexDesc.GetIndexImplTableDescriptions(2);
+ }
+ }
+ const THashSet<TString> indexDataColumns{indexDesc.GetDataColumnNames().begin(), indexDesc.GetDataColumnNames().end()};
+ result.push_back(createImplTable(CalcVectorKmeansTreeLevelImplTableDesc(tableInfo->PartitionConfig(), indexLevelTableDesc)));
+ result.push_back(createImplTable(CalcVectorKmeansTreePostingImplTableDesc(tableInfo, tableInfo->PartitionConfig(), indexDataColumns, indexPostingTableDesc)));
+ if (prefixVectorIndex) {
+ const THashSet<TString> prefixColumns{indexDesc.GetKeyColumnNames().begin(), indexDesc.GetKeyColumnNames().end() - 1};
+ result.push_back(createImplTable(CalcVectorKmeansTreePrefixImplTableDesc(prefixColumns, tableInfo, tableInfo->PartitionConfig(), implTableColumns, indexPrefixTableDesc)));
+ }
+ break;
}
- } else {
- NKikimrSchemeOp::TTableDescription indexTableDesc;
- // TODO After IndexImplTableDescriptions are persisted, this should be replaced with Y_ABORT_UNLESS
- if (indexDesc.IndexImplTableDescriptionsSize() == 1) {
- indexTableDesc = indexDesc.GetIndexImplTableDescriptions(0);
+ case NKikimrSchemeOp::EIndexTypeGlobalFulltext: {
+ NKikimrSchemeOp::TTableDescription indexTableDesc;
+ // TODO After IndexImplTableDescriptions are persisted, this should be replaced with Y_ABORT_UNLESS
+ if (indexDesc.IndexImplTableDescriptionsSize() == 1) {
+ indexTableDesc = indexDesc.GetIndexImplTableDescriptions(0);
+ }
+ const THashSet<TString> indexDataColumns{indexDesc.GetDataColumnNames().begin(), indexDesc.GetDataColumnNames().end()};
+ auto implTableDesc = CalcFulltextImplTableDesc(tableInfo, tableInfo->PartitionConfig(), indexDataColumns, indexTableDesc);
+ implTableDesc.MutablePartitionConfig()->MutableCompactionPolicy()->SetKeepEraseMarkers(true);
+ result.push_back(createImplTable(std::move(implTableDesc)));
+ break;
}
- auto implTableDesc = CalcImplTableDesc(tableInfo, implTableColumns, indexTableDesc);
- // TODO if keep erase markers also speedup compaction or something else we can enable it for other impl tables too
- implTableDesc.MutablePartitionConfig()->MutableCompactionPolicy()->SetKeepEraseMarkers(true);
- result.push_back(createImplTable(std::move(implTableDesc)));
+ default:
+ Y_DEBUG_ABORT_S(NTableIndex::InvalidIndexType(indexDesc.GetType()));
+ break;
}
return result;
diff --git a/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp b/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp
index ea53e715266..fc648c9dda8 100644
--- a/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp
@@ -127,6 +127,41 @@ TVector<ISubOperation::TPtr> CreateIndexedTable(TOperationId nextId, const TTxTr
TTableColumns baseTableColumns = ExtractInfo(baseTableDescription);
for (auto& indexDescription: indexedTable.GetIndexDescription()) {
const auto& indexName = indexDescription.GetName();
+
+ switch (GetIndexType(indexDescription)) {
+ case NKikimrSchemeOp::EIndexTypeGlobal:
+ case NKikimrSchemeOp::EIndexTypeGlobalAsync:
+ // no feature flag, everything is fine
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalUnique:
+ if (!context.SS->EnableInitialUniqueIndex) {
+ return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Unique constraint feature is disabled")};
+ }
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: {
+ if (!context.SS->EnableVectorIndex) {
+ return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Vector index support is disabled")};
+ }
+ TString msg;
+ if (!NKikimr::NKMeans::ValidateSettings(indexDescription.GetVectorIndexKmeansTreeDescription().GetSettings(), msg)) {
+ return {CreateReject(nextId, NKikimrScheme::EStatus::StatusInvalidParameter, msg)};
+ }
+ break;
+ }
+ case NKikimrSchemeOp::EIndexTypeGlobalFulltext: {
+ if (!context.SS->EnableFulltextIndex) {
+ return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Fulltext index support is disabled")};
+ }
+ TString msg;
+ if (!NKikimr::NFulltext::ValidateSettings(indexDescription.GetFulltextIndexDescription().GetSettings(), msg)) {
+ return {CreateReject(nextId, NKikimrScheme::EStatus::StatusInvalidParameter, msg)};
+ }
+ break;
+ }
+ default:
+ return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, InvalidIndexType(indexDescription.GetType()))};
+ }
+
bool uniformIndexTable = false;
if (indexDescription.IndexImplTableDescriptionsSize()) {
if (indexDescription.GetIndexImplTableDescriptions(0).HasUniformPartitionsCount()) {
@@ -241,35 +276,6 @@ TVector<ISubOperation::TPtr> CreateIndexedTable(TOperationId nextId, const TTxTr
}
for (auto& indexDescription: indexedTable.GetIndexDescription()) {
- const auto indexType = indexDescription.HasType()
- ? indexDescription.GetType()
- : NKikimrSchemeOp::EIndexTypeGlobal;
-
- switch (indexType) {
- case NKikimrSchemeOp::EIndexTypeInvalid:
- return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Invalid index type")};
- case NKikimrSchemeOp::EIndexTypeGlobal:
- case NKikimrSchemeOp::EIndexTypeGlobalAsync:
- // no feature flag, everything is fine
- break;
- case NKikimrSchemeOp::EIndexTypeGlobalUnique: {
- if (!context.SS->EnableInitialUniqueIndex) {
- return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Unique constraint feature is disabled")};
- }
- break;
- }
- case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: {
- if (!context.SS->EnableVectorIndex) {
- return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Vector index support is disabled")};
- }
- TString msg;
- if (!NKikimr::NKMeans::ValidateSettings(indexDescription.GetVectorIndexKmeansTreeDescription().GetSettings(), msg)) {
- return {CreateReject(nextId, NKikimrScheme::EStatus::StatusInvalidParameter, msg)};
- }
- break;
- }
- }
-
{
auto scheme = TransactionTemplate(
tx.GetWorkingDir() + "/" + baseTableDescription.GetName(),
@@ -279,7 +285,7 @@ TVector<ISubOperation::TPtr> CreateIndexedTable(TOperationId nextId, const TTxTr
scheme.SetInternal(tx.GetInternal());
scheme.MutableCreateTableIndex()->CopyFrom(indexDescription);
- scheme.MutableCreateTableIndex()->SetType(indexType);
+ scheme.MutableCreateTableIndex()->SetType(GetIndexType(indexDescription));
result.push_back(CreateNewTableIndex(NextPartId(nextId, result), scheme));
}
@@ -298,32 +304,51 @@ TVector<ISubOperation::TPtr> CreateIndexedTable(TOperationId nextId, const TTxTr
};
const auto& implTableColumns = indexes.at(indexDescription.GetName());
- if (indexDescription.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) {
- const bool prefixVectorIndex = indexDescription.GetKeyColumnNames().size() > 1;
- NKikimrSchemeOp::TTableDescription userLevelDesc, userPostingDesc, userPrefixDesc;
- if (indexDescription.IndexImplTableDescriptionsSize() == 2 + prefixVectorIndex) {
- // This description provided by user to override partition policy
- userLevelDesc = indexDescription.GetIndexImplTableDescriptions(0);
- userPostingDesc = indexDescription.GetIndexImplTableDescriptions(1);
- if (prefixVectorIndex) {
- userPrefixDesc = indexDescription.GetIndexImplTableDescriptions(2);
+ switch (GetIndexType(indexDescription)) {
+ case NKikimrSchemeOp::EIndexTypeGlobal:
+ case NKikimrSchemeOp::EIndexTypeGlobalAsync:
+ case NKikimrSchemeOp::EIndexTypeGlobalUnique: {
+ NKikimrSchemeOp::TTableDescription userIndexDesc;
+ if (indexDescription.IndexImplTableDescriptionsSize()) {
+ // This description provided by user to override partition policy
+ userIndexDesc = indexDescription.GetIndexImplTableDescriptions(0);
}
+ result.push_back(createIndexImplTable(CalcImplTableDesc(baseTableDescription, implTableColumns, userIndexDesc)));
+ break;
}
- const THashSet<TString> indexDataColumns{indexDescription.GetDataColumnNames().begin(), indexDescription.GetDataColumnNames().end()};
- result.push_back(createIndexImplTable(CalcVectorKmeansTreeLevelImplTableDesc(baseTableDescription.GetPartitionConfig(), userLevelDesc)));
- result.push_back(createIndexImplTable(CalcVectorKmeansTreePostingImplTableDesc(baseTableDescription, baseTableDescription.GetPartitionConfig(), indexDataColumns, userPostingDesc)));
- if (prefixVectorIndex) {
- const THashSet<TString> prefixColumns{indexDescription.GetKeyColumnNames().begin(), indexDescription.GetKeyColumnNames().end() - 1};
- result.push_back(createIndexImplTable(CalcVectorKmeansTreePrefixImplTableDesc(prefixColumns, baseTableDescription, baseTableDescription.GetPartitionConfig(), implTableColumns, userPrefixDesc)));
+ case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: {
+ const bool prefixVectorIndex = indexDescription.GetKeyColumnNames().size() > 1;
+ NKikimrSchemeOp::TTableDescription userLevelDesc, userPostingDesc, userPrefixDesc;
+ if (indexDescription.IndexImplTableDescriptionsSize() == 2 + prefixVectorIndex) {
+ // This description provided by user to override partition policy
+ userLevelDesc = indexDescription.GetIndexImplTableDescriptions(0);
+ userPostingDesc = indexDescription.GetIndexImplTableDescriptions(1);
+ if (prefixVectorIndex) {
+ userPrefixDesc = indexDescription.GetIndexImplTableDescriptions(2);
+ }
+ }
+ const THashSet<TString> indexDataColumns{indexDescription.GetDataColumnNames().begin(), indexDescription.GetDataColumnNames().end()};
+ result.push_back(createIndexImplTable(CalcVectorKmeansTreeLevelImplTableDesc(baseTableDescription.GetPartitionConfig(), userLevelDesc)));
+ result.push_back(createIndexImplTable(CalcVectorKmeansTreePostingImplTableDesc(baseTableDescription, baseTableDescription.GetPartitionConfig(), indexDataColumns, userPostingDesc)));
+ if (prefixVectorIndex) {
+ const THashSet<TString> prefixColumns{indexDescription.GetKeyColumnNames().begin(), indexDescription.GetKeyColumnNames().end() - 1};
+ result.push_back(createIndexImplTable(CalcVectorKmeansTreePrefixImplTableDesc(prefixColumns, baseTableDescription, baseTableDescription.GetPartitionConfig(), implTableColumns, userPrefixDesc)));
+ }
+ break;
}
- } else {
- NKikimrSchemeOp::TTableDescription userIndexDesc;
- if (indexDescription.IndexImplTableDescriptionsSize()) {
- // This description provided by user to override partition policy
- userIndexDesc = indexDescription.GetIndexImplTableDescriptions(0);
+ case NKikimrSchemeOp::EIndexTypeGlobalFulltext: {
+ NKikimrSchemeOp::TTableDescription userIndexDesc;
+ if (indexDescription.IndexImplTableDescriptionsSize()) {
+ // This description provided by user to override partition policy
+ userIndexDesc = indexDescription.GetIndexImplTableDescriptions(0);
+ }
+ const THashSet<TString> indexDataColumns{indexDescription.GetDataColumnNames().begin(), indexDescription.GetDataColumnNames().end()};
+ result.push_back(createIndexImplTable(CalcFulltextImplTableDesc(baseTableDescription, baseTableDescription.GetPartitionConfig(), indexDataColumns, userIndexDesc)));
+ break;
}
-
- result.push_back(createIndexImplTable(CalcImplTableDesc(baseTableDescription, implTableColumns, userIndexDesc)));
+ default:
+ Y_DEBUG_ABORT_S(NTableIndex::InvalidIndexType(indexDescription.GetType()));
+ break;
}
}
diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index.cpp
index 65d1401d0c4..8cb8de853d3 100644
--- a/ydb/core/tx/schemeshard/schemeshard_build_index.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard_build_index.cpp
@@ -1,6 +1,7 @@
#include "schemeshard_build_index.h"
#include "schemeshard_impl.h"
+#include <ydb/core/protos/flat_scheme_op.pb.h>
namespace NKikimr {
namespace NSchemeShard {
@@ -95,11 +96,25 @@ void TSchemeShard::PersistCreateBuildIndex(NIceDb::TNiceDb& db, const TIndexBuil
*serializableRepresentation.AddIndexImplTableDescriptions() = description;
}
- std::visit([&]<typename T>(const T& specializedDescription) {
- if constexpr (std::is_same_v<T, NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>) {
- *serializableRepresentation.MutableVectorIndexKmeansTreeDescription() = specializedDescription;
- }
- }, info.SpecializedIndexDescription);
+ switch (info.IndexType) {
+ case NKikimrSchemeOp::EIndexTypeGlobal:
+ case NKikimrSchemeOp::EIndexTypeGlobalAsync:
+ case NKikimrSchemeOp::EIndexTypeGlobalUnique:
+ // no specialized index description
+ Y_ASSERT(std::holds_alternative<std::monostate>(info.SpecializedIndexDescription));
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree:
+ *serializableRepresentation.MutableVectorIndexKmeansTreeDescription() =
+ std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(info.SpecializedIndexDescription);
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalFulltext:
+ *serializableRepresentation.MutableFulltextIndexDescription() =
+ std::get<NKikimrSchemeOp::TFulltextIndexDescription>(info.SpecializedIndexDescription);
+ break;
+ default:
+ Y_DEBUG_ABORT_S(NTableIndex::InvalidIndexType(info.IndexType));
+ break;
+ }
persistedBuildIndex.Update(
NIceDb::TUpdate<Schema::IndexBuild::CreationConfig>(serializableRepresentation.SerializeAsString())
diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp
index ef71de0641e..0b83aae3012 100644
--- a/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp
@@ -5,6 +5,7 @@
#include "schemeshard_utils.h" // for NTableIndex::CommonCheck
#include "schemeshard_xxport__helpers.h"
+#include <ydb/core/protos/flat_scheme_op.pb.h>
#include <ydb/core/ydb_convert/table_settings.h>
namespace NKikimr::NSchemeShard {
@@ -157,6 +158,11 @@ public:
return makeReply(explain);
}
+ if (tableInfo->IsTTLEnabled() && !DoesIndexSupportTTL(buildInfo->IndexType)) {
+ return Reply(Ydb::StatusIds::PRECONDITION_FAILED,
+ TStringBuilder() << buildInfo->IndexType << " index doesn't support TTL");
+ }
+
NKikimrSchemeOp::TIndexBuildConfig tmpConfig;
buildInfo->SerializeToProto(Self, &tmpConfig);
const auto indexDesc = tmpConfig.GetIndex();
@@ -234,7 +240,7 @@ private:
buildInfo.BuildKind = TIndexBuildInfo::EBuildKind::BuildSecondaryIndex;
buildInfo.IndexType = NKikimrSchemeOp::EIndexType::EIndexTypeGlobalAsync;
break;
- case Ydb::Table::TableIndex::TypeCase::kGlobalUniqueIndex:
+ case Ydb::Table::TableIndex::TypeCase::kGlobalUniqueIndex: {
if (!Self->EnableAddUniqueIndex) {
explain = "Adding a unique index to an existing table is disabled";
return false;
@@ -242,6 +248,7 @@ private:
buildInfo.BuildKind = TIndexBuildInfo::EBuildKind::BuildSecondaryUniqueIndex;
buildInfo.IndexType = NKikimrSchemeOp::EIndexType::EIndexTypeGlobalUnique;
break;
+ }
case Ydb::Table::TableIndex::TypeCase::kGlobalVectorKmeansTreeIndex: {
if (!Self->EnableVectorIndex) {
explain = "Vector index support is disabled";
@@ -266,6 +273,21 @@ private:
}
break;
}
+ case Ydb::Table::TableIndex::TypeCase::kGlobalFulltextIndex: {
+ if (!Self->EnableFulltextIndex) {
+ explain = "Fulltext index support is disabled";
+ return false;
+ }
+ buildInfo.BuildKind = TIndexBuildInfo::EBuildKind::BuildFulltext;
+ buildInfo.IndexType = NKikimrSchemeOp::EIndexType::EIndexTypeGlobalFulltext;
+ NKikimrSchemeOp::TFulltextIndexDescription fulltextIndexDescription;
+ *fulltextIndexDescription.MutableSettings() = index.global_fulltext_index().fulltext_settings();
+ if (!NKikimr::NFulltext::ValidateSettings(fulltextIndexDescription.GetSettings(), explain)) {
+ return false;
+ }
+ buildInfo.SpecializedIndexDescription = fulltextIndexDescription;
+ break;
+ }
};
buildInfo.IndexName = index.name();
diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index_tx_base.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index_tx_base.cpp
index 771239e89d8..675b2cde48e 100644
--- a/ydb/core/tx/schemeshard/schemeshard_build_index_tx_base.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard_build_index_tx_base.cpp
@@ -301,8 +301,11 @@ void TSchemeShard::TIndexBuilder::TTxBase::Fill(NKikimrIndexBuilder::TIndexBuild
case NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree:
*index.mutable_global_vector_kmeans_tree_index() = Ydb::Table::GlobalVectorKMeansTreeIndex();
break;
+ case NKikimrSchemeOp::EIndexType::EIndexTypeGlobalFulltext:
+ *index.mutable_global_fulltext_index() = Ydb::Table::GlobalFulltextIndex();
+ break;
default:
- Y_ABORT("Unreachable");
+ Y_ENSURE(false, InvalidIndexType(info.IndexType));
}
} else if (info.IsBuildColumns()) {
for(const auto& column : info.BuildColumns) {
diff --git a/ydb/core/tx/schemeshard/schemeshard_impl.cpp b/ydb/core/tx/schemeshard/schemeshard_impl.cpp
index 2b4c4214332..18d789c4d5d 100644
--- a/ydb/core/tx/schemeshard/schemeshard_impl.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard_impl.cpp
@@ -5089,6 +5089,7 @@ void TSchemeShard::OnActivateExecutor(const TActorContext &ctx) {
EnableVectorIndex = appData->FeatureFlags.GetEnableVectorIndex();
EnableInitialUniqueIndex = appData->FeatureFlags.GetEnableUniqConstraint();
EnableAddUniqueIndex = appData->FeatureFlags.GetEnableAddUniqueIndex();
+ EnableFulltextIndex = appData->FeatureFlags.GetEnableFulltextIndex();
EnableResourcePoolsOnServerless = appData->FeatureFlags.GetEnableResourcePoolsOnServerless();
EnableExternalDataSourcesOnServerless = appData->FeatureFlags.GetEnableExternalDataSourcesOnServerless();
EnableShred = appData->FeatureFlags.GetEnableDataErasure();
@@ -7806,6 +7807,7 @@ void TSchemeShard::ApplyConsoleConfigs(const NKikimrConfig::TFeatureFlags& featu
EnableVectorIndex = featureFlags.GetEnableVectorIndex();
EnableInitialUniqueIndex = featureFlags.GetEnableUniqConstraint();
EnableAddUniqueIndex = featureFlags.GetEnableAddUniqueIndex();
+ EnableFulltextIndex = featureFlags.GetEnableFulltextIndex();
EnableExternalDataSourcesOnServerless = featureFlags.GetEnableExternalDataSourcesOnServerless();
EnableShred = featureFlags.GetEnableDataErasure();
EnableExternalSourceSchemaInference = featureFlags.GetEnableExternalSourceSchemaInference();
diff --git a/ydb/core/tx/schemeshard/schemeshard_impl.h b/ydb/core/tx/schemeshard/schemeshard_impl.h
index 654d61e73de..6fc6d569ca5 100644
--- a/ydb/core/tx/schemeshard/schemeshard_impl.h
+++ b/ydb/core/tx/schemeshard/schemeshard_impl.h
@@ -359,6 +359,7 @@ public:
bool EnableVectorIndex = false;
bool EnableInitialUniqueIndex = false;
bool EnableAddUniqueIndex = false;
+ bool EnableFulltextIndex = false;
bool EnableExternalDataSourcesOnServerless = false;
bool EnableShred = false;
bool EnableExternalSourceSchemaInference = false;
diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.cpp b/ydb/core/tx/schemeshard/schemeshard_info_types.cpp
index e7929c9ad09..f798da43ef4 100644
--- a/ydb/core/tx/schemeshard/schemeshard_info_types.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard_info_types.cpp
@@ -6,10 +6,12 @@
#include <ydb/core/base/appdata.h>
#include <ydb/core/base/channel_profiles.h>
+#include <ydb/core/base/table_index.h>
#include <ydb/core/base/tx_processing.h>
#include <ydb/core/engine/minikql/flat_local_tx_factory.h>
#include <ydb/core/engine/mkql_proto.h>
#include <ydb/core/protos/config.pb.h>
+#include <ydb/core/protos/flat_scheme_op.pb.h>
#include <ydb/core/scheme/scheme_types_proto.h>
#include <ydb/core/tablet/tablet_counters_aggregator.h>
#include <ydb/core/tablet/tablet_counters_protobuf.h>
@@ -618,8 +620,8 @@ TTableInfo::TAlterDataPtr TTableInfo::CreateAlterData(
if (op.HasTTLSettings()) {
for (const auto& indexDescription : op.GetTableIndexes()) {
- if (indexDescription.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) {
- errStr = "Table with vector indexes doesn't support TTL";
+ if (!DoesIndexSupportTTL(indexDescription.GetType())) {
+ errStr = TStringBuilder() << "Table with " << indexDescription.GetType() << " index doesn't support TTL";
return nullptr;
}
}
@@ -2292,8 +2294,22 @@ void TIndexBuildInfo::SerializeToProto(TSchemeShard* ss, NKikimrSchemeOp::TIndex
ImplTableDescriptions.end()
};
- if (IndexType == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) {
- *index.MutableVectorIndexKmeansTreeDescription() = std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(SpecializedIndexDescription);
+ switch (IndexType) {
+ case NKikimrSchemeOp::EIndexTypeGlobal:
+ case NKikimrSchemeOp::EIndexTypeGlobalAsync:
+ case NKikimrSchemeOp::EIndexTypeGlobalUnique:
+ // no specialized index description
+ Y_ASSERT(std::holds_alternative<std::monostate>(SpecializedIndexDescription));
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree:
+ *index.MutableVectorIndexKmeansTreeDescription() = std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(SpecializedIndexDescription);
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalFulltext:
+ *index.MutableFulltextIndexDescription() = std::get<NKikimrSchemeOp::TFulltextIndexDescription>(SpecializedIndexDescription);
+ break;
+ default:
+ Y_DEBUG_ABORT_S(InvalidIndexType(IndexType));
+ break;
}
}
diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.h b/ydb/core/tx/schemeshard/schemeshard_info_types.h
index 45a43b4772a..ef04a3c1286 100644
--- a/ydb/core/tx/schemeshard/schemeshard_info_types.h
+++ b/ydb/core/tx/schemeshard/schemeshard_info_types.h
@@ -8,6 +8,8 @@
#include "schemeshard_tx_infly.h"
#include "schemeshard_types.h"
+#include <util/generic/yexception.h>
+#include <ydb/core/protos/flat_scheme_op.pb.h>
#include <ydb/public/api/protos/ydb_cms.pb.h>
#include <ydb/public/api/protos/ydb_coordination.pb.h>
#include <ydb/public/api/protos/ydb_import.pb.h>
@@ -17,6 +19,7 @@
#include <ydb/core/backup/common/encryption.h>
#include <ydb/core/backup/common/metadata.h>
#include <ydb/core/base/feature_flags.h>
+#include <ydb/core/base/fulltext.h>
#include <ydb/core/base/kmeans_clusters.h>
#include <ydb/core/base/storage_pools.h>
#include <ydb/core/base/table_index.h>
@@ -2443,9 +2446,30 @@ struct TTableIndexInfo : public TSimpleRefCount<TTableIndexInfo> {
, Type(type)
, State(state)
{
- if (type == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) {
- Y_ENSURE(SpecializedIndexDescription.emplace<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>()
- .ParseFromString(description));
+ switch (type) {
+ case NKikimrSchemeOp::EIndexTypeGlobal:
+ case NKikimrSchemeOp::EIndexTypeGlobalAsync:
+ case NKikimrSchemeOp::EIndexTypeGlobalUnique:
+ // no specialized index description
+ Y_ASSERT(description.empty());
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: {
+ auto success = SpecializedIndexDescription
+ .emplace<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>()
+ .ParseFromString(description);
+ Y_ENSURE(success, description);
+ break;
+ }
+ case NKikimrSchemeOp::EIndexTypeGlobalFulltext: {
+ auto success = SpecializedIndexDescription
+ .emplace<NKikimrSchemeOp::TFulltextIndexDescription>()
+ .ParseFromString(description);
+ Y_ENSURE(success, description);
+ break;
+ }
+ default:
+ Y_DEBUG_ABORT_S(NTableIndex::InvalidIndexType(type));
+ break;
}
}
@@ -2494,8 +2518,21 @@ struct TTableIndexInfo : public TSimpleRefCount<TTableIndexInfo> {
alterData->State = config.HasState() ? config.GetState() : EState::EIndexStateReady;
- if (config.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) {
- alterData->SpecializedIndexDescription = config.GetVectorIndexKmeansTreeDescription();
+ switch (GetIndexType(config)) {
+ case NKikimrSchemeOp::EIndexTypeGlobal:
+ case NKikimrSchemeOp::EIndexTypeGlobalAsync:
+ case NKikimrSchemeOp::EIndexTypeGlobalUnique:
+ // no specialized index description
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree:
+ alterData->SpecializedIndexDescription = config.GetVectorIndexKmeansTreeDescription();
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalFulltext:
+ alterData->SpecializedIndexDescription = config.GetFulltextIndexDescription();
+ break;
+ default:
+ errMsg += InvalidIndexType(config.GetType());
+ return nullptr;
}
return result;
@@ -2510,7 +2547,9 @@ struct TTableIndexInfo : public TSimpleRefCount<TTableIndexInfo> {
TTableIndexInfo::TPtr AlterData = nullptr;
- std::variant<std::monostate, NKikimrSchemeOp::TVectorIndexKmeansTreeDescription> SpecializedIndexDescription;
+ std::variant<std::monostate,
+ NKikimrSchemeOp::TVectorIndexKmeansTreeDescription,
+ NKikimrSchemeOp::TFulltextIndexDescription> SpecializedIndexDescription;
};
struct TCdcStreamSettings {
@@ -3127,6 +3166,7 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> {
BuildPrefixedVectorIndex = 12,
BuildSecondaryUniqueIndex = 13,
BuildColumns = 20,
+ BuildFulltext = 30,
};
TActorId CreateSender;
@@ -3155,7 +3195,9 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> {
TString TargetName;
TVector<NKikimrSchemeOp::TTableDescription> ImplTableDescriptions;
- std::variant<std::monostate, NKikimrSchemeOp::TVectorIndexKmeansTreeDescription> SpecializedIndexDescription;
+ std::variant<std::monostate,
+ NKikimrSchemeOp::TVectorIndexKmeansTreeDescription,
+ NKikimrSchemeOp::TFulltextIndexDescription> SpecializedIndexDescription;
struct TKMeans {
// TODO(mbkkt) move to TVectorIndexKmeansTreeDescription
@@ -3606,11 +3648,17 @@ public:
indexInfo->Clusters = NKikimr::NKMeans::CreateClusters(desc.settings().settings(), indexInfo->KMeans.Rounds, createError);
Y_ENSURE(indexInfo->Clusters, createError);
indexInfo->SpecializedIndexDescription = std::move(desc);
- } break;
+ break;
+ }
+ case NKikimrSchemeOp::TIndexCreationConfig::kFulltextIndexDescription: {
+ auto& desc = *creationConfig.MutableFulltextIndexDescription();
+ indexInfo->SpecializedIndexDescription = std::move(desc);
+ break;
+ }
case NKikimrSchemeOp::TIndexCreationConfig::SPECIALIZEDINDEXDESCRIPTION_NOT_SET:
/* do nothing */
break;
- }
+ }
}
LOG_DEBUG_S(TlsActivationContext->AsActorContext(), NKikimrServices::BUILD_INDEX,
diff --git a/ydb/core/tx/schemeshard/schemeshard_path_describer.cpp b/ydb/core/tx/schemeshard/schemeshard_path_describer.cpp
index e1c0bf16e34..73d2ae506b1 100644
--- a/ydb/core/tx/schemeshard/schemeshard_path_describer.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard_path_describer.cpp
@@ -1,5 +1,6 @@
#include "schemeshard_path_describer.h"
+#include <ydb/core/protos/flat_scheme_op.pb.h>
#include <ydb/public/api/protos/annotations/sensitive.pb.h>
#include <ydb/core/base/appdata.h>
@@ -216,8 +217,10 @@ TPathElement::EPathSubType TPathDescriber::CalcPathSubType(const TPath& path) {
return TPathElement::EPathSubType::EPathSubTypeSyncIndexImplTable;
case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree:
return TPathElement::EPathSubType::EPathSubTypeVectorKmeansTreeIndexImplTable;
+ case NKikimrSchemeOp::EIndexTypeGlobalFulltext:
+ return TPathElement::EPathSubType::EPathSubTypeFulltextIndexImplTable;
default:
- Y_DEBUG_ABORT("%s", (TStringBuilder() << "unexpected indexInfo->Type# " << indexInfo->Type).data());
+ Y_DEBUG_ABORT_S(NTableIndex::InvalidIndexType(indexInfo->Type));
return TPathElement::EPathSubType::EPathSubTypeEmpty;
}
} else if (parentPath.IsCdcStream()) {
@@ -1471,14 +1474,23 @@ void TSchemeShard::DescribeTableIndex(const TPathId& pathId, const TString& name
}
entry.SetDataSize(dataSize);
- if (indexInfo->Type == NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree) {
- if (const auto* vectorIndexKmeansTreeDescription = std::get_if<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(&indexInfo->SpecializedIndexDescription)) {
- *entry.MutableVectorIndexKmeansTreeDescription() = *vectorIndexKmeansTreeDescription;
- } else {
- Y_FAIL_S("SpecializedIndexDescription should be set");
- }
+ switch (indexInfo->Type) {
+ case NKikimrSchemeOp::EIndexTypeGlobal:
+ case NKikimrSchemeOp::EIndexTypeGlobalAsync:
+ case NKikimrSchemeOp::EIndexTypeGlobalUnique:
+ // no specialized index description
+ Y_ASSERT(std::holds_alternative<std::monostate>(indexInfo->SpecializedIndexDescription));
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree:
+ *entry.MutableVectorIndexKmeansTreeDescription() = std::get<NKikimrSchemeOp::TVectorIndexKmeansTreeDescription>(indexInfo->SpecializedIndexDescription);
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalFulltext:
+ *entry.MutableFulltextIndexDescription() = std::get<NKikimrSchemeOp::TFulltextIndexDescription>(indexInfo->SpecializedIndexDescription);
+ break;
+ default:
+ Y_DEBUG_ABORT_S(NTableIndex::InvalidIndexType(indexInfo->Type));
+ break;
}
-
}
void TSchemeShard::DescribeCdcStream(const TPathId& pathId, const TString& name,
diff --git a/ydb/core/tx/schemeshard/schemeshard_schema.h b/ydb/core/tx/schemeshard/schemeshard_schema.h
index 6122fb4ab53..c829f2ea41d 100644
--- a/ydb/core/tx/schemeshard/schemeshard_schema.h
+++ b/ydb/core/tx/schemeshard/schemeshard_schema.h
@@ -17,6 +17,9 @@ namespace NKikimr::NSchemeShard {
inline constexpr auto ClusterIdTypeId = NScheme::NTypeIds::Uint64;
+// Type id of the token column of a fulltext index impl table.
+// TODO: support utf-8 in fulltext index
+inline constexpr auto TokenTypeId = NScheme::NTypeIds::String;
+
struct Schema : NIceDb::Schema {
struct Paths : Table<1> {
struct Id : Column<1, NScheme::NTypeIds::Uint64> { using Type = TLocalPathId; };
diff --git a/ydb/core/tx/schemeshard/schemeshard_utils.cpp b/ydb/core/tx/schemeshard/schemeshard_utils.cpp
index 95cd196bdef..7ef907b8cd0 100644
--- a/ydb/core/tx/schemeshard/schemeshard_utils.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard_utils.cpp
@@ -315,6 +315,36 @@ auto CalcVectorKmeansTreePrefixImplTableDescImpl(
return implTableDesc;
}
+// Builds the description of the impl table backing a fulltext index.
+// baseTable is only accessed through ExtractInfo() and GetColumns(), so it may be
+// any base table representation those helpers accept.
+// The impl table gets a NOT NULL token column, registered as its first key column;
+// the remaining columns are filled by FillIndexImplTableColumns from the base table
+// key columns and the index data columns.
+auto CalcFulltextImplTableDescImpl(
+    const auto& baseTable,
+    const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig,
+    const THashSet<TString>& indexDataColumns,
+    const NKikimrSchemeOp::TTableDescription& indexTableDesc)
+{
+    auto baseColumns = ExtractInfo(baseTable);
+
+    // Columns stored next to the token: index data columns plus base table keys
+    THashSet<TString> storedColumns(indexDataColumns);
+    for (const auto& baseKey : baseColumns.Keys) {
+        storedColumns.insert(baseKey);
+    }
+
+    NKikimrSchemeOp::TTableDescription implTableDesc;
+    implTableDesc.SetName(NTableIndex::ImplTable);
+    SetImplTablePartitionConfig(baseTablePartitionConfig, indexTableDesc, implTableDesc);
+
+    // The token column is added before any base table column, both as a column and as a key
+    auto& tokenColumn = *implTableDesc.AddColumns();
+    tokenColumn.SetName(NFulltext::TokenColumn);
+    tokenColumn.SetType(NFulltext::TokenTypeName);
+    tokenColumn.SetTypeId(NSchemeShard::TokenTypeId);
+    tokenColumn.SetNotNull(true);
+    implTableDesc.AddKeyColumnNames(NFulltext::TokenColumn);
+
+    FillIndexImplTableColumns(GetColumns(baseTable), baseColumns.Keys, storedColumns, implTableDesc);
+    implTableDesc.SetSystemColumnNamesAllowed(true);
+
+    return implTableDesc;
+}
+
}
void FillIndexTableColumns(
@@ -421,6 +451,24 @@ NKikimrSchemeOp::TTableDescription CalcVectorKmeansTreePrefixImplTableDesc(
return CalcVectorKmeansTreePrefixImplTableDescImpl(indexKeyColumns, baseTableDescr, baseTablePartitionConfig, implTableColumns, indexTableDesc);
}
+// Fulltext impl table description for a base table given as schemeshard table info.
+// Forwards to the shared CalcFulltextImplTableDescImpl.
+NKikimrSchemeOp::TTableDescription CalcFulltextImplTableDesc(
+    const NSchemeShard::TTableInfo::TPtr& baseTableInfo,
+    const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig,
+    const THashSet<TString>& indexDataColumns,
+    const NKikimrSchemeOp::TTableDescription& indexTableDesc)
+{
+    return CalcFulltextImplTableDescImpl(
+        baseTableInfo,
+        baseTablePartitionConfig,
+        indexDataColumns,
+        indexTableDesc);
+}
+
+// Fulltext impl table description for a base table given as a table description proto.
+// Forwards to the shared CalcFulltextImplTableDescImpl.
+NKikimrSchemeOp::TTableDescription CalcFulltextImplTableDesc(
+    const NKikimrSchemeOp::TTableDescription& baseTableDescr,
+    const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig,
+    const THashSet<TString>& indexDataColumns,
+    const NKikimrSchemeOp::TTableDescription& indexTableDesc)
+{
+    return CalcFulltextImplTableDescImpl(
+        baseTableDescr,
+        baseTablePartitionConfig,
+        indexDataColumns,
+        indexTableDesc);
+}
+
bool ExtractTypes(const NKikimrSchemeOp::TTableDescription& baseTableDescr, TColumnTypes& columnTypes, TString& explain) {
const NScheme::TTypeRegistry* typeRegistry = AppData()->TypeRegistry;
Y_ABORT_UNLESS(typeRegistry);
diff --git a/ydb/core/tx/schemeshard/schemeshard_utils.h b/ydb/core/tx/schemeshard/schemeshard_utils.h
index a1e04900fe5..5c490b80771 100644
--- a/ydb/core/tx/schemeshard/schemeshard_utils.h
+++ b/ydb/core/tx/schemeshard/schemeshard_utils.h
@@ -3,6 +3,7 @@
#include "schemeshard_info_types.h"
#include "schemeshard_types.h"
+#include <ydb/core/base/fulltext.h>
#include <ydb/core/base/table_index.h>
#include <yql/essentials/minikql/mkql_type_ops.h>
@@ -92,6 +93,18 @@ NKikimrSchemeOp::TTableDescription CalcVectorKmeansTreePrefixImplTableDesc(
const TTableColumns& implTableColumns,
const NKikimrSchemeOp::TTableDescription& indexTableDesc);
+// Calculates the description of a fulltext index impl table
+// for a base table given as schemeshard table info.
+// indexDataColumns are the data columns to store in the impl table;
+// indexTableDesc and baseTablePartitionConfig are used to set up its partition config.
+NKikimrSchemeOp::TTableDescription CalcFulltextImplTableDesc(
+ const NSchemeShard::TTableInfo::TPtr& baseTableInfo,
+ const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig,
+ const THashSet<TString>& indexDataColumns,
+ const NKikimrSchemeOp::TTableDescription& indexTableDesc);
+
+// Calculates the description of a fulltext index impl table
+// for a base table given as a table description proto.
+// indexDataColumns are the data columns to store in the impl table;
+// indexTableDesc and baseTablePartitionConfig are used to set up its partition config.
+NKikimrSchemeOp::TTableDescription CalcFulltextImplTableDesc(
+ const NKikimrSchemeOp::TTableDescription& baseTableDescr,
+ const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig,
+ const THashSet<TString>& indexDataColumns,
+ const NKikimrSchemeOp::TTableDescription& indexTableDesc);
+
TTableColumns ExtractInfo(const NSchemeShard::TTableInfo::TPtr& tableInfo);
TTableColumns ExtractInfo(const NKikimrSchemeOp::TTableDescription& tableDesc);
TIndexColumns ExtractInfo(const NKikimrSchemeOp::TIndexCreationConfig& indexDesc);
@@ -133,7 +146,7 @@ bool CommonCheck(const TTableDesc& tableDesc, const NKikimrSchemeOp::TIndexCreat
return false;
}
- if (!IsCompatibleIndex(indexDesc.GetType(), baseTableColumns, indexKeys, error)) {
+ if (!IsCompatibleIndex(GetIndexType(indexDesc), baseTableColumns, indexKeys, error)) {
status = NKikimrScheme::EStatus::StatusInvalidParameter;
return false;
}
@@ -144,29 +157,66 @@ bool CommonCheck(const TTableDesc& tableDesc, const NKikimrSchemeOp::TIndexCreat
return false;
}
- implTableColumns = CalcTableImplDescription(indexDesc.GetType(), baseTableColumns, indexKeys);
-
- if (indexDesc.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) {
- //We have already checked this in IsCompatibleIndex
- Y_ABORT_UNLESS(indexKeys.KeyColumns.size() >= 1);
-
- if (indexKeys.KeyColumns.size() > 1 && !IsCompatibleKeyTypes(baseColumnTypes, implTableColumns, uniformTable, error)) {
- status = NKikimrScheme::EStatus::StatusInvalidParameter;
- return false;
+ implTableColumns = CalcTableImplDescription(GetIndexType(indexDesc), baseTableColumns, indexKeys);
+
+ switch (GetIndexType(indexDesc)) {
+ case NKikimrSchemeOp::EIndexTypeGlobal:
+ case NKikimrSchemeOp::EIndexTypeGlobalAsync:
+ case NKikimrSchemeOp::EIndexTypeGlobalUnique:
+ if (!IsCompatibleKeyTypes(baseColumnTypes, implTableColumns, uniformTable, error)) {
+ status = NKikimrScheme::EStatus::StatusInvalidParameter;
+ return false;
+ }
+ break;
+ case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: {
+ // We have already checked this in IsCompatibleIndex
+ Y_ABORT_UNLESS(indexKeys.KeyColumns.size() >= 1);
+
+ if (indexKeys.KeyColumns.size() > 1 && !IsCompatibleKeyTypes(baseColumnTypes, implTableColumns, uniformTable, error)) {
+ status = NKikimrScheme::EStatus::StatusInvalidParameter;
+ return false;
+ }
+
+ const TString& embeddingColumnName = indexKeys.KeyColumns.back();
+ Y_ABORT_UNLESS(baseColumnTypes.contains(embeddingColumnName));
+ auto typeInfo = baseColumnTypes.at(embeddingColumnName);
+
+ if (typeInfo.GetTypeId() != NScheme::NTypeIds::String) {
+ status = NKikimrScheme::EStatus::StatusInvalidParameter;
+ error = TStringBuilder() << "Embedding column '" << embeddingColumnName << "' expected type 'String' but got " << NScheme::TypeName(typeInfo);
+ return false;
+ }
+ break;
}
-
- const TString& indexColumnName = indexKeys.KeyColumns.back();
- Y_ABORT_UNLESS(baseColumnTypes.contains(indexColumnName));
- auto typeInfo = baseColumnTypes.at(indexColumnName);
-
- if (typeInfo.GetTypeId() != NScheme::NTypeIds::String) {
+ case NKikimrSchemeOp::EIndexTypeGlobalFulltext: {
+ // We have already checked this in IsCompatibleIndex
+ Y_ABORT_UNLESS(indexKeys.KeyColumns.size() >= 1);
+
+ // Here we only check that the fulltext settings columns match the index key columns;
+ // the rest will be checked in NFulltext::ValidateSettings (called separately outside of CommonCheck)
+ if (!NKikimr::NFulltext::ValidateColumnsMatches(indexKeys.KeyColumns, indexDesc.GetFulltextIndexDescription().GetSettings(), error)) {
+ status = NKikimrScheme::EStatus::StatusInvalidParameter;
+ return false;
+ }
+
+ for (const auto& column : indexDesc.GetFulltextIndexDescription().GetSettings().columns()) {
+ if (column.has_analyzers()) {
+ auto typeInfo = baseColumnTypes.at(column.column());
+ // TODO: support utf-8 in fulltext index
+ if (typeInfo.GetTypeId() != NScheme::NTypeIds::String) {
+ status = NKikimrScheme::EStatus::StatusInvalidParameter;
+ error = TStringBuilder() << "Fulltext column '" << column.column() << "' expected type 'String' but got " << NScheme::TypeName(typeInfo);
+ return false;
+ }
+ }
+ }
+
+ break;
+ }
+ default:
status = NKikimrScheme::EStatus::StatusInvalidParameter;
- error = TStringBuilder() << "Index column '" << indexColumnName << "' expected type 'String' but got " << NScheme::TypeName(typeInfo);
+ error = InvalidIndexType(indexDesc.GetType());
return false;
- }
- } else if (!IsCompatibleKeyTypes(baseColumnTypes, implTableColumns, uniformTable, error)) {
- status = NKikimrScheme::EStatus::StatusInvalidParameter;
- return false;
}
if (implTableColumns.Keys.size() > schemeLimits.MaxTableKeyColumns) {
diff --git a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp
index 6927f9054ee..5193a2ca149 100644
--- a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp
+++ b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp
@@ -1,5 +1,6 @@
#include "ls_checks.h"
+#include <google/protobuf/text_format.h>
#include <ydb/public/api/protos/ydb_cms.pb.h>
#include <ydb/public/api/protos/ydb_coordination.pb.h>
#include <ydb/public/lib/scheme_types/scheme_type_id.h>
@@ -919,6 +920,41 @@ TCheckFunc KMeansTreeDescription(Ydb::Table::VectorIndexSettings_Metric metric,
};
}
+// Returns a check that the described table index carries the expected specialized settings.
+// `proto` is the text-format representation of the settings message matching the index kind
+// (Ydb::Table::KMeansTreeSettings for a vector kmeans tree index,
+// Ydb::Table::FulltextIndexSettings for a fulltext index).
+// For an index without a specialized description `proto` must be
+// the literal "SPECIALIZEDINDEXDESCRIPTION_NOT_SET".
+TCheckFunc SpecializedIndexDescription(const TString& proto) {
+    // proto is captured by copy: the returned check outlives this call
+    return [=] (const NKikimrScheme::TEvDescribeSchemeResult& record) {
+        const auto& tableIndex = record.GetPathDescription().GetTableIndex();
+        switch (tableIndex.GetSpecializedIndexDescriptionCase()) {
+            case NKikimrSchemeOp::TIndexDescription::kVectorIndexKmeansTreeDescription: {
+                const auto& actual = tableIndex.GetVectorIndexKmeansTreeDescription().GetSettings();
+                Ydb::Table::KMeansTreeSettings expected;
+                UNIT_ASSERT(google::protobuf::TextFormat::ParseFromString(proto, &expected));
+                UNIT_ASSERT_C(google::protobuf::util::MessageDifferencer::Equals(actual, expected),
+                    TStringBuilder() << "Expected "
+                        << expected.ShortDebugString()
+                        << " but got "
+                        << actual.ShortDebugString());
+                break;
+            }
+            case NKikimrSchemeOp::TIndexDescription::kFulltextIndexDescription: {
+                const auto& actual = tableIndex.GetFulltextIndexDescription().GetSettings();
+                Ydb::Table::FulltextIndexSettings expected;
+                UNIT_ASSERT(google::protobuf::TextFormat::ParseFromString(proto, &expected));
+                UNIT_ASSERT_C(google::protobuf::util::MessageDifferencer::Equals(actual, expected),
+                    TStringBuilder() << "Expected "
+                        << expected.ShortDebugString()
+                        << " but got "
+                        << actual.ShortDebugString());
+                break;
+            }
+            case NKikimrSchemeOp::TIndexDescription::SPECIALIZEDINDEXDESCRIPTION_NOT_SET: {
+                UNIT_ASSERT_C(proto == "SPECIALIZEDINDEXDESCRIPTION_NOT_SET",
+                    TStringBuilder() << "Expected "
+                        << proto
+                        << " but got SPECIALIZEDINDEXDESCRIPTION_NOT_SET");
+                break;
+            }
+        }
+    };
+}
TCheckFunc SequenceName(const TString& name) {
return [=] (const NKikimrScheme::TEvDescribeSchemeResult& record) {
diff --git a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.h b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.h
index 64a5bd1f35f..5c112f69195 100644
--- a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.h
+++ b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.h
@@ -175,6 +175,8 @@ namespace NLs {
ui32 levels
);
+ TCheckFunc SpecializedIndexDescription(const TString& proto);
+
TCheckFunc SequenceName(const TString& name);
TCheckFunc SequenceIncrement(i64 increment);
TCheckFunc SequenceMaxValue(i64 maxValue);
diff --git a/ydb/core/tx/schemeshard/ut_helpers/test_env.cpp b/ydb/core/tx/schemeshard/ut_helpers/test_env.cpp
index 6ce43b1547e..ab528d3565a 100644
--- a/ydb/core/tx/schemeshard/ut_helpers/test_env.cpp
+++ b/ydb/core/tx/schemeshard/ut_helpers/test_env.cpp
@@ -594,6 +594,7 @@ NSchemeShardUT_Private::TTestEnv::TTestEnv(TTestActorRuntime& runtime, const TTe
app.FeatureFlags.SetEnableTableDatetime64(true);
app.FeatureFlags.SetEnableVectorIndex(true);
app.FeatureFlags.SetEnableAddUniqueIndex(true);
+ app.FeatureFlags.SetEnableFulltextIndex(true);
app.FeatureFlags.SetEnableColumnStore(true);
app.FeatureFlags.SetEnableStrictAclCheck(opts.EnableStrictAclCheck_);
app.SetEnableMoveIndex(opts.EnableMoveIndex_);
diff --git a/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp b/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp
new file mode 100644
index 00000000000..4b5b2740545
--- /dev/null
+++ b/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp
@@ -0,0 +1,340 @@
+#include <ydb/core/base/path.h>
+#include <ydb/core/change_exchange/change_exchange.h>
+#include <ydb/core/scheme/scheme_tablecell.h>
+#include <ydb/core/testlib/tablet_helpers.h>
+#include <ydb/core/tx/schemeshard/schemeshard_utils.h>
+#include <ydb/core/tx/schemeshard/ut_helpers/helpers.h>
+
+
+using namespace NKikimr;
+using namespace NSchemeShard;
+using namespace NSchemeShardUT_Private;
+using namespace NKikimr::NTableIndex;
+using namespace NKikimr::NTableIndex::NFulltext;
+
+Y_UNIT_TEST_SUITE(TFulltextIndexTests) {
+ // Creates a table with a fulltext index over the String column "text" that covers "covered",
+ // then verifies the index description and the impl table columns/keys.
+ // The checks run twice with a SchemeShard reboot in between, so they also cover the restored state.
+ Y_UNIT_TEST(CreateTable) {
+ TTestBasicRuntime runtime;
+ TTestEnv env(runtime);
+ ui64 txId = 100;
+
+ TString fulltextSettings = R"(
+ layout: FLAT
+ columns: {
+ column: "text"
+ analyzers: {
+ tokenizer: STANDARD
+ use_filter_lowercase: true
+ }
+ }
+ )";
+ TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"(
+ TableDescription {
+ Name: "texts"
+ Columns { Name: "id" Type: "Uint64" }
+ Columns { Name: "text" Type: "String" }
+ Columns { Name: "covered" Type: "String" }
+ Columns { Name: "another" Type: "Uint64" }
+ KeyColumnNames: ["id"]
+ }
+ IndexDescription {
+ Name: "idx_fulltext"
+ KeyColumnNames: ["text"]
+ DataColumnNames: ["covered"]
+ Type: EIndexTypeGlobalFulltext
+ FulltextIndexDescription: {
+ Settings: {
+ %s
+ }
+ }
+ }
+ )", fulltextSettings.c_str()));
+ env.TestWaitNotification(runtime, txId);
+
+ // Print the impl table description to the test output
+ NKikimrSchemeOp::TDescribeOptions opts;
+ opts.SetReturnChildren(true);
+ opts.SetShowPrivateTable(true);
+ Cout << DescribePath(runtime, "/MyRoot/texts/idx_fulltext/indexImplTable", opts).DebugString() << Endl;
+
+ for (ui32 reboot = 0; reboot < 2; reboot++) {
+ TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{
+ NLs::PathExist,
+ NLs::IndexType(NKikimrSchemeOp::EIndexTypeGlobalFulltext),
+ NLs::IndexState(NKikimrSchemeOp::EIndexStateReady),
+ NLs::IndexKeys({"text"}),
+ NLs::IndexDataColumns({"covered"}),
+ NLs::SpecializedIndexDescription(fulltextSettings),
+ NLs::ChildrenCount(1),
+ });
+
+ // Impl table: token column + base table key + covered column, keyed by (token, id)
+ TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext/indexImplTable"),{
+ NLs::PathExist,
+ NLs::CheckColumns("indexImplTable",
+ { NTableIndex::NFulltext::TokenColumn, "id", "covered" }, {},
+ { NTableIndex::NFulltext::TokenColumn, "id" }, true) });
+
+ Cerr << "Reboot SchemeShard.." << Endl;
+ TActorId sender = runtime.AllocateEdgeActor();
+ RebootTablet(runtime, TTestTxConfig::SchemeShard, sender);
+ }
+ }
+
+ // A fulltext index with an extra leading key column ("another") in front of the text column
+ // must be rejected with StatusInvalidParameter, and the index path must not be created.
+ Y_UNIT_TEST(CreateTablePrefix) { // not supported for now, maybe later
+ TTestBasicRuntime runtime;
+ TTestEnv env(runtime);
+ ui64 txId = 100;
+
+ TString fulltextSettings = R"(
+ layout: FLAT
+ columns: {
+ column: "text"
+ analyzers: {
+ tokenizer: STANDARD
+ use_filter_lowercase: true
+ }
+ }
+ )";
+ TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"(
+ TableDescription {
+ Name: "texts"
+ Columns { Name: "id" Type: "Uint64" }
+ Columns { Name: "text" Type: "String" }
+ Columns { Name: "covered" Type: "String" }
+ Columns { Name: "another" Type: "Uint64" }
+ KeyColumnNames: ["id"]
+ }
+ IndexDescription {
+ Name: "idx_fulltext"
+ KeyColumnNames: [ "another", "text"]
+ DataColumnNames: ["covered"]
+ Type: EIndexTypeGlobalFulltext
+ FulltextIndexDescription: {
+ Settings: {
+ %s
+ }
+ }
+ }
+ )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter});
+ env.TestWaitNotification(runtime, txId);
+
+ TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{
+ NLs::PathNotExist,
+ });
+ }
+
+ // A fulltext index over two text columns ("text1", "text2"), each with its own analyzers,
+ // must be rejected with StatusInvalidParameter, and the index path must not be created.
+ Y_UNIT_TEST(CreateTableMultipleColumns) { // not supported for now, maybe later
+ TTestBasicRuntime runtime;
+ TTestEnv env(runtime);
+ ui64 txId = 100;
+
+ TString fulltextSettings = R"(
+ layout: FLAT
+ columns: {
+ column: "text1"
+ analyzers: {
+ tokenizer: STANDARD
+ use_filter_lowercase: true
+ }
+ }
+ columns: {
+ column: "text2"
+ analyzers: {
+ tokenizer: STANDARD
+ use_filter_lowercase: true
+ }
+ }
+ )";
+ TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"(
+ TableDescription {
+ Name: "texts"
+ Columns { Name: "id" Type: "Uint64" }
+ Columns { Name: "text1" Type: "String" }
+ Columns { Name: "text2" Type: "String" }
+ Columns { Name: "covered" Type: "String" }
+ Columns { Name: "another" Type: "Uint64" }
+ KeyColumnNames: ["id"]
+ }
+ IndexDescription {
+ Name: "idx_fulltext"
+ KeyColumnNames: ["text1", "text2"]
+ DataColumnNames: ["covered"]
+ Type: EIndexTypeGlobalFulltext
+ FulltextIndexDescription: {
+ Settings: {
+ %s
+ }
+ }
+ }
+ )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter});
+ env.TestWaitNotification(runtime, txId);
+
+ TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{
+ NLs::PathNotExist,
+ });
+ }
+
+ // The indexed column "text" is declared as Uint64 instead of String:
+ // the index creation must be rejected with StatusInvalidParameter, and the index path must not be created.
+ Y_UNIT_TEST(CreateTableNotText) {
+ TTestBasicRuntime runtime;
+ TTestEnv env(runtime);
+ ui64 txId = 100;
+
+ TString fulltextSettings = R"(
+ layout: FLAT
+ columns: {
+ column: "text"
+ analyzers: {
+ tokenizer: STANDARD
+ use_filter_lowercase: true
+ }
+ }
+ )";
+ TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"(
+ TableDescription {
+ Name: "texts"
+ Columns { Name: "id" Type: "Uint64" }
+ Columns { Name: "text" Type: "Uint64" }
+ Columns { Name: "covered" Type: "String" }
+ Columns { Name: "another" Type: "Uint64" }
+ KeyColumnNames: ["id"]
+ }
+ IndexDescription {
+ Name: "idx_fulltext"
+ KeyColumnNames: ["text"]
+ DataColumnNames: ["covered"]
+ Type: EIndexTypeGlobalFulltext
+ FulltextIndexDescription: {
+ Settings: {
+ %s
+ }
+ }
+ }
+ )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter});
+ env.TestWaitNotification(runtime, txId);
+
+ TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{
+ NLs::PathNotExist,
+ });
+ }
+
+ // The fulltext settings refer to column "text_wrong" while the index key column is "text":
+ // the index creation must be rejected with StatusInvalidParameter, and the index path must not be created.
+ Y_UNIT_TEST(CreateTableColumnsMismatch) {
+ TTestBasicRuntime runtime;
+ TTestEnv env(runtime);
+ ui64 txId = 100;
+
+ TString fulltextSettings = R"(
+ layout: FLAT
+ columns: {
+ column: "text_wrong"
+ analyzers: {
+ tokenizer: STANDARD
+ use_filter_lowercase: true
+ }
+ }
+ )";
+ TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"(
+ TableDescription {
+ Name: "texts"
+ Columns { Name: "id" Type: "Uint64" }
+ Columns { Name: "text" Type: "String" }
+ Columns { Name: "covered" Type: "String" }
+ Columns { Name: "another" Type: "Uint64" }
+ KeyColumnNames: ["id"]
+ }
+ IndexDescription {
+ Name: "idx_fulltext"
+ KeyColumnNames: ["text"]
+ DataColumnNames: ["covered"]
+ Type: EIndexTypeGlobalFulltext
+ FulltextIndexDescription: {
+ Settings: {
+ %s
+ }
+ }
+ }
+ )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter});
+ env.TestWaitNotification(runtime, txId);
+
+ TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{
+ NLs::PathNotExist,
+ });
+ }
+
+ // The fulltext settings specify only the layout and no per-column settings:
+ // the index creation must be rejected with StatusInvalidParameter, and the index path must not be created.
+ Y_UNIT_TEST(CreateTableNoColumnsSettings) {
+ TTestBasicRuntime runtime;
+ TTestEnv env(runtime);
+ ui64 txId = 100;
+
+ TString fulltextSettings = R"(
+ layout: FLAT
+ )";
+ TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"(
+ TableDescription {
+ Name: "texts"
+ Columns { Name: "id" Type: "Uint64" }
+ Columns { Name: "text" Type: "String" }
+ Columns { Name: "covered" Type: "String" }
+ Columns { Name: "another" Type: "Uint64" }
+ KeyColumnNames: ["id"]
+ }
+ IndexDescription {
+ Name: "idx_fulltext"
+ KeyColumnNames: ["text"]
+ DataColumnNames: ["covered"]
+ Type: EIndexTypeGlobalFulltext
+ FulltextIndexDescription: {
+ Settings: {
+ %s
+ }
+ }
+ }
+ )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter});
+ env.TestWaitNotification(runtime, txId);
+
+ TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{
+ NLs::PathNotExist,
+ });
+ }
+
+ // The analyzers request use_filter_edge_ngram (judging by the test name, a setting that is not supported yet):
+ // the index creation must be rejected with StatusInvalidParameter, and the index path must not be created.
+ Y_UNIT_TEST(CreateTableUnsupportedSettings) {
+ TTestBasicRuntime runtime;
+ TTestEnv env(runtime);
+ ui64 txId = 100;
+
+ TString fulltextSettings = R"(
+ layout: FLAT
+ columns: {
+ column: "text"
+ analyzers: {
+ tokenizer: STANDARD
+ use_filter_edge_ngram: true
+ }
+ }
+ )";
+ TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"(
+ TableDescription {
+ Name: "texts"
+ Columns { Name: "id" Type: "Uint64" }
+ Columns { Name: "text" Type: "String" }
+ Columns { Name: "covered" Type: "String" }
+ Columns { Name: "another" Type: "Uint64" }
+ KeyColumnNames: ["id"]
+ }
+ IndexDescription {
+ Name: "idx_fulltext"
+ KeyColumnNames: ["text"]
+ DataColumnNames: ["covered"]
+ Type: EIndexTypeGlobalFulltext
+ FulltextIndexDescription: {
+ Settings: {
+ %s
+ }
+ }
+ }
+ )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter});
+ env.TestWaitNotification(runtime, txId);
+
+ TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{
+ NLs::PathNotExist,
+ });
+ }
+}
diff --git a/ydb/core/tx/schemeshard/ut_index/ya.make b/ydb/core/tx/schemeshard/ut_index/ya.make
index ddd365aeee7..cfb8db9df7b 100644
--- a/ydb/core/tx/schemeshard/ut_index/ya.make
+++ b/ydb/core/tx/schemeshard/ut_index/ya.make
@@ -22,6 +22,7 @@ SRCS(
ut_async_index.cpp
ut_unique_index.cpp
ut_vector_index.cpp
+ ut_fulltext_index.cpp
)
YQL_LAST_ABI_VERSION()
diff --git a/ydb/core/ydb_convert/table_description.cpp b/ydb/core/ydb_convert/table_description.cpp
index 43596f2a8a7..b882896a8ca 100644
--- a/ydb/core/ydb_convert/table_description.cpp
+++ b/ydb/core/ydb_convert/table_description.cpp
@@ -5,6 +5,7 @@
#include <ydb/core/base/appdata.h>
#include <ydb/core/base/path.h>
+#include <ydb/core/base/table_index.h>
#include <ydb/core/engine/mkql_proto.h>
#include <ydb/core/formats/arrow/switch/switch_type.h>
#include <ydb/core/protos/follower_group.pb.h>
@@ -1088,7 +1089,18 @@ void FillIndexDescriptionImpl(TYdbProto& out, const NKikimrSchemeOp::TTableDescr
break;
}
+ case NKikimrSchemeOp::EIndexTypeGlobalFulltext:
+ FillGlobalIndexSettings(
+ *index->mutable_global_fulltext_index()->mutable_settings(),
+ tableIndex.GetIndexImplTableDescriptions(0)
+ );
+
+ *index->mutable_global_fulltext_index()->mutable_fulltext_settings() = tableIndex.GetFulltextIndexDescription().GetSettings();
+
+ break;
default:
+ Y_DEBUG_ABORT_S(NTableIndex::InvalidIndexType(tableIndex.GetType()));
+
break;
};
@@ -1141,7 +1153,6 @@ bool FillIndexDescription(NKikimrSchemeOp::TIndexedTableCreationConfig& out,
}
// specific fields
- std::vector<NKikimrSchemeOp::TTableDescription> indexImplTableDescriptionsVector;
switch (index.type_case()) {
case Ydb::Table::TableIndex::kGlobalIndex:
indexDesc->SetType(NKikimrSchemeOp::EIndexType::EIndexTypeGlobal);
@@ -1159,17 +1170,23 @@ bool FillIndexDescription(NKikimrSchemeOp::TIndexedTableCreationConfig& out,
indexDesc->SetType(NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree);
*indexDesc->MutableVectorIndexKmeansTreeDescription()->MutableSettings() = index.global_vector_kmeans_tree_index().vector_settings();
break;
+
+ case Ydb::Table::TableIndex::kGlobalFulltextIndex:
+ indexDesc->SetType(NKikimrSchemeOp::EIndexType::EIndexTypeGlobalFulltext);
+ *indexDesc->MutableFulltextIndexDescription()->MutableSettings() = index.global_fulltext_index().fulltext_settings();
+ break;
- default:
- // pass through
- // TODO: maybe return BAD_REQUEST?
+ case Ydb::Table::TableIndex::TYPE_NOT_SET:
+ // FIXME: python sdk can create a table with a secondary index without a type
+ // so it's not possible to return an invalid index type error here for now
break;
}
- if (!FillIndexTablePartitioning(indexImplTableDescriptionsVector, index, status, error)) {
+ std::vector<NKikimrSchemeOp::TTableDescription> indexImplTableDescriptions;
+ if (!FillIndexTablePartitioning(indexImplTableDescriptions, index, status, error)) {
return false;
}
- *indexDesc->MutableIndexImplTableDescriptions() = {indexImplTableDescriptionsVector.begin(), indexImplTableDescriptionsVector.end()};
+ *indexDesc->MutableIndexImplTableDescriptions() = {indexImplTableDescriptions.begin(), indexImplTableDescriptions.end()};
}
return true;
diff --git a/ydb/core/ydb_convert/table_settings.cpp b/ydb/core/ydb_convert/table_settings.cpp
index a7de2fd31ed..1e513664973 100644
--- a/ydb/core/ydb_convert/table_settings.cpp
+++ b/ydb/core/ydb_convert/table_settings.cpp
@@ -476,6 +476,13 @@ bool FillIndexTablePartitioning(
}
break;
}
+
+ case Ydb::Table::TableIndex::kGlobalFulltextIndex:
+ if (!fillIndexPartitioning(index.global_fulltext_index().settings(), indexImplTableDescriptions)) {
+ return false;
+ }
+ break;
+
case Ydb::Table::TableIndex::TYPE_NOT_SET:
break;
}
diff --git a/ydb/library/services/services.proto b/ydb/library/services/services.proto
index 93c74186a9c..3941e270baa 100644
--- a/ydb/library/services/services.proto
+++ b/ydb/library/services/services.proto
@@ -1119,5 +1119,6 @@ message TActivity {
SCHEME_BOARD_RESTORE_ACTOR = 669;
REPLICATION_CONTROLLER_RESOURCE_ID_RESOLVER = 670;
BS_VDISK_METADATA_ACTOR = 671;
+ BUILD_FULLTEXT_INDEX = 672;
};
};
diff --git a/ydb/public/api/protos/ydb_table.proto b/ydb/public/api/protos/ydb_table.proto
index 9cb92adb9bc..e22ce614505 100644
--- a/ydb/public/api/protos/ydb_table.proto
+++ b/ydb/public/api/protos/ydb_table.proto
@@ -113,7 +113,149 @@ message GlobalVectorKMeansTreeIndex {
KMeansTreeSettings vector_settings = 3;
}
-// Represent secondary index
+message FulltextIndexSettings {
+ // Specifies the layout strategy for storing and updating the full-text index
+ enum Layout {
+ LAYOUT_UNSPECIFIED = 0;
+
+ // Uses a single flat inverted index table (indexImplTable)
+ // Example source table:
+ // ┌────┬────────────────────────────┐
+ // │ id │ text │
+ // ├────┼────────────────────────────┤
+ // │ 1 │ "The quick brown fox" │
+ // │ 2 │ "The quick blue hare" │
+ // └────┴────────────────────────────┘
+ // Example inverted index table (indexImplTable):
+ // ┌──────────────┬────┐
+ // │ __ydb_token │ id │
+ // ├──────────────┼────┤
+ // │ "blue" │ 2 │
+ // │ "brown" │ 1 │
+ // │ "fox" │ 1 │
+ // │ "hare" │ 2 │
+ // │ "quick" │ 1 │
+ // │ "quick" │ 2 │
+ // │ "The" │ 1 │
+ // │ "The" │ 2 │
+ // └──────────────┴────┘
+ // Supports a single column only
+ FLAT = 1;
+ }
+
+ // Specifies how text is tokenized during indexing
+ enum Tokenizer {
+ TOKENIZER_UNSPECIFIED = 0;
+
+ // Splits text only by whitespace
+ // Does not split on punctuation
+ // Example:
+ // Text: "foo-bar baz_lorem ipsum"
+ // Tokens: ["foo-bar", "baz_lorem", "ipsum"]
+ WHITESPACE = 1;
+
+ // Applies general language-aware tokenization
+ // Splits text on whitespace and punctuation
+ // Example:
+ // Text: "foo-bar baz_lorem ipsum"
+ // Tokens: ["foo", "bar", "baz", "lorem", "ipsum"]
+ STANDARD = 2;
+
+ // Treats the entire input as a single token
+ // No splitting is performed
+ // Example:
+ // Text: "Hello World!"
+ // Tokens: ["Hello World!"]
+ KEYWORD = 3;
+ }
+
+ // Represents text analyzer settings
+ message Analyzers {
+ // See Tokenizer enum
+ optional Tokenizer tokenizer = 1;
+
+ // Language used for language-sensitive operations like stopword filtering
+ // Example: language = "english"
+ // Not specified by default, in which case no language-specific logic is applied
+ optional string language = 2;
+
+ // Whether to convert tokens to lowercase
+ // Example:
+ // Token: "Quick"
+ // Output: "quick"
+ optional bool use_filter_lowercase = 100;
+
+ // Whether to remove common stopwords like "the", "a", "is"
+ // Example: language = "english"
+ // Tokens: ["the", "quick", "brown"]
+ // Output: ["quick", "brown"]
+ optional bool use_filter_stopwords = 110;
+
+ // Whether to apply character n-gram indexing to each token
+ // Used with filter_ngram_min_length and filter_ngram_max_length
+ // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
+ // Token: "search"
+ // Output: ["sea", "ear", "arc", "rch", "sear", "earc", "arch"]
+ optional bool use_filter_ngram = 120;
+
+ // Whether to apply edge n-gram indexing (prefix-based) to each token
+ // Used with filter_ngram_min_length and filter_ngram_max_length
+ // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
+ // Token: "search"
+ // Output: ["sea", "sear"]
+ optional bool use_filter_edge_ngram = 121;
+
+ // Minimum length of n-grams to generate (inclusive)
+ // Must be used with use_filter_ngram or use_filter_edge_ngram
+ // Default value is 3
+ optional int32 filter_ngram_min_length = 122 [(Ydb.value) = ">= 0"];
+
+ // Maximum length of n-grams to generate (inclusive)
+ // Must be used with use_filter_ngram or use_filter_edge_ngram
+ // Default value is 4
+ optional int32 filter_ngram_max_length = 123 [(Ydb.value) = ">= 0"];
+
+ // Whether to filter tokens by their length
+ // Must be used with filter_length_min or filter_length_max
+ // Example: filter_length_min = 4, filter_length_max = 6
+ // Tokens: ["foo", "fooba", "foobar", "foobarbaz"]
+ // Output: ["fooba", "foobar"]
+ optional bool use_filter_length = 130;
+
+ // Minimum token length to keep (inclusive)
+ // Must be used with use_filter_length
+ optional int32 filter_length_min = 131 [(Ydb.value) = ">= 0"];
+
+ // Maximum token length to keep (inclusive)
+ // Must be used with use_filter_length
+ optional int32 filter_length_max = 132 [(Ydb.value) = ">= 0"];
+ }
+
+ // Represents text analyzer settings for a specific column
+ message ColumnAnalyzers {
+ // Name of the column to be indexed
+ optional string column = 1;
+
+ // Analyzer settings specific to this column
+ Analyzers analyzers = 2;
+ }
+
+ // See Layout enum
+ optional Layout layout = 1;
+
+ // List of columns and their fulltext settings
+ // Currently, this list should contain a single entry with specified analyzers
+ // Later, some columns may not use analyzers and will be indexed as-is
+ // This list must always match TableIndex.index_columns
+ repeated ColumnAnalyzers columns = 2;
+}
+
+message GlobalFulltextIndex {
+ GlobalIndexSettings settings = 1;
+ FulltextIndexSettings fulltext_settings = 2;
+}
+
+// Represents a table index
message TableIndex {
// Name of index
string name = 1;
@@ -125,12 +267,13 @@ message TableIndex {
GlobalAsyncIndex global_async_index = 4;
GlobalUniqueIndex global_unique_index = 6;
GlobalVectorKMeansTreeIndex global_vector_kmeans_tree_index = 7;
+ GlobalFulltextIndex global_fulltext_index = 8;
}
// list of columns content to be copied in to index table
repeated string data_columns = 5;
}
-// Represent secondary index with index state
+// Represents a table index together with its state
message TableIndexDescription {
enum Status {
STATUS_UNSPECIFIED = 0;
@@ -149,6 +292,7 @@ message TableIndexDescription {
GlobalAsyncIndex global_async_index = 5;
GlobalUniqueIndex global_unique_index = 8;
GlobalVectorKMeansTreeIndex global_vector_kmeans_tree_index = 9;
+ GlobalFulltextIndex global_fulltext_index = 10;
}
Status status = 4;
// list of columns content to be copied in to index table
@@ -648,7 +792,7 @@ message CreateTableRequest {
// Table profile
TableProfile profile = 5;
Ydb.Operations.OperationParams operation_params = 6;
- // List of secondary indexes
+ // List of table indexes
repeated TableIndex indexes = 7;
// Table rows time to live settings
TtlSettings ttl_settings = 8;
@@ -726,9 +870,9 @@ message AlterTableRequest {
TtlSettings set_ttl_settings = 7;
google.protobuf.Empty drop_ttl_settings = 8;
}
- // Add secondary indexes
+ // Add table indexes
repeated TableIndex add_indexes = 9;
- // Remove secondary indexes
+ // Remove table indexes
repeated string drop_indexes = 10;
// Change table storage settings
StorageSettings alter_storage_settings = 11;
diff --git a/ydb/services/ydb/backup_ut/ydb_backup_ut.cpp b/ydb/services/ydb/backup_ut/ydb_backup_ut.cpp
index eb2efdb8e63..c339dc804af 100644
--- a/ydb/services/ydb/backup_ut/ydb_backup_ut.cpp
+++ b/ydb/services/ydb/backup_ut/ydb_backup_ut.cpp
@@ -2410,6 +2410,9 @@ Y_UNIT_TEST_SUITE(BackupRestore) {
case EIndexTypeGlobalUnique:
case EIndexTypeGlobalVectorKmeansTree:
return TestTableWithIndexBackupRestore(Value);
+ case EIndexTypeGlobalFulltext:
+ // TODO: fulltext indexes are not covered by this test yet; will be added later
+ break;
case EIndexTypeInvalid:
break; // not applicable
default:
@@ -3242,6 +3245,9 @@ Y_UNIT_TEST_SUITE(BackupRestoreS3) {
case EIndexTypeGlobalVectorKmeansTree:
TestTableWithIndexBackupRestore(Value);
break;
+ case EIndexTypeGlobalFulltext:
+ // TODO: fulltext indexes are not covered by this test yet; will be added later
+ break;
case EIndexTypeInvalid:
break; // not applicable
default: