diff options
author | kungurtsev <[email protected]> | 2025-05-22 14:52:20 +0200 |
---|---|---|
committer | GitHub <[email protected]> | 2025-05-22 14:52:20 +0200 |
commit | 0323e5e78ecf518557e4e5cfad9f1d5b6827baa6 (patch) | |
tree | 6ce065291be1c0cfe7baa4f98d876a1db40ddad3 | |
parent | f25d7be4b4c78257c58a785ca5398d57822169eb (diff) |
Mark leaf cluster ids with the highest bit set (#18646)
-rw-r--r-- | ydb/core/base/table_index.cpp | 12 | ||||
-rw-r--r-- | ydb/core/base/table_index.h | 5 | ||||
-rw-r--r-- | ydb/core/tx/datashard/build_index/common_helper.h | 1 | ||||
-rw-r--r-- | ydb/core/tx/datashard/build_index/kmeans_helper.cpp | 35 | ||||
-rw-r--r-- | ydb/core/tx/datashard/build_index/kmeans_helper.h | 18 | ||||
-rw-r--r-- | ydb/core/tx/datashard/build_index/local_kmeans.cpp | 30 | ||||
-rw-r--r-- | ydb/core/tx/datashard/build_index/prefix_kmeans.cpp | 21 | ||||
-rw-r--r-- | ydb/core/tx/datashard/build_index/reshuffle_kmeans.cpp | 16 | ||||
-rw-r--r-- | ydb/core/tx/datashard/build_index/ut/ut_local_kmeans.cpp | 141 | ||||
-rw-r--r-- | ydb/core/tx/datashard/build_index/ut/ut_prefix_kmeans.cpp | 86 | ||||
-rw-r--r-- | ydb/core/tx/datashard/build_index/ut/ut_reshuffle_kmeans.cpp | 86 | ||||
-rw-r--r-- | ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp | 32 | ||||
-rw-r--r-- | ydb/core/tx/schemeshard/schemeshard_info_types.h | 1 |
13 files changed, 292 insertions, 192 deletions
diff --git a/ydb/core/base/table_index.cpp b/ydb/core/base/table_index.cpp index f3c828d8db1..3dbac3e564c 100644 --- a/ydb/core/base/table_index.cpp +++ b/ydb/core/base/table_index.cpp @@ -189,4 +189,16 @@ bool IsBuildImplTable(std::string_view tableName) { || tableName.ends_with(NTableVectorKmeansTreeIndex::BuildSuffix1); } +static constexpr TClusterId PostingParentFlag = (1ull << 63ull); + +// Note: if cluster id is too big, something is wrong with cluster enumeration +void EnsureNoPostingParentFlag(TClusterId parent) { + Y_ENSURE((parent & PostingParentFlag) == 0); +} + +TClusterId SetPostingParentFlag(TClusterId parent) { + EnsureNoPostingParentFlag(parent); + return (parent | PostingParentFlag); +} + } diff --git a/ydb/core/base/table_index.h b/ydb/core/base/table_index.h index 55652ac78d5..eebb0815cb6 100644 --- a/ydb/core/base/table_index.h +++ b/ydb/core/base/table_index.h @@ -38,9 +38,12 @@ bool IsImplTable(std::string_view tableName); bool IsBuildImplTable(std::string_view tableName); using TClusterId = ui64; - inline constexpr auto ClusterIdType = Ydb::Type::UINT64; inline constexpr const char* ClusterIdTypeName = "Uint64"; +void EnsureNoPostingParentFlag(TClusterId parent); + +TClusterId SetPostingParentFlag(TClusterId parent); + } } diff --git a/ydb/core/tx/datashard/build_index/common_helper.h b/ydb/core/tx/datashard/build_index/common_helper.h index 81c020fae55..3487fa552c4 100644 --- a/ydb/core/tx/datashard/build_index/common_helper.h +++ b/ydb/core/tx/datashard/build_index/common_helper.h @@ -5,6 +5,7 @@ #include <ydb/library/actors/core/log.h> namespace NKikimr::NDataShard { +using namespace NTableIndex; #define LOG_T(stream) LOG_TRACE_S (*TlsActivationContext, NKikimrServices::BUILD_INDEX, stream) #define LOG_D(stream) LOG_DEBUG_S (*TlsActivationContext, NKikimrServices::BUILD_INDEX, stream) diff --git a/ydb/core/tx/datashard/build_index/kmeans_helper.cpp b/ydb/core/tx/datashard/build_index/kmeans_helper.cpp index 17326a93018..ca950741bff 100644 --- a/ydb/core/tx/datashard/build_index/kmeans_helper.cpp +++ b/ydb/core/tx/datashard/build_index/kmeans_helper.cpp @@ -5,7 +5,7 @@ namespace NKikimr::NDataShard::NKMeans { -TTableRange CreateRangeFrom(const TUserTable& table, NTableIndex::TClusterId parent, TCell& from, TCell& to) { +TTableRange CreateRangeFrom(const TUserTable& table, TClusterId parent, TCell& from, TCell& to) { if (parent == 0) { return table.GetTableRange(); } @@ -28,7 +28,26 @@ NTable::TLead CreateLeadFrom(const TTableRange& range) { return lead; } -void AddRowMain2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row) { +void AddRowToLevel(TBufferData& buffer, TClusterId parent, TClusterId child, const TString& embedding, bool isPostingLevel) { + if (isPostingLevel) { + child = SetPostingParentFlag(child); + } else { + EnsureNoPostingParentFlag(child); + } + + std::array<TCell, 2> pk; + pk[0] = TCell::Make(parent); + pk[1] = TCell::Make(child); + + std::array<TCell, 1> data; + data[0] = TCell{embedding}; + + buffer.AddRow(TSerializedCellVec{pk}, TSerializedCellVec::Serialize(data)); +} + +void AddRowMainToBuild(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row) { + EnsureNoPostingParentFlag(parent); + std::array<TCell, 1> cells; cells[0] = TCell::Make(parent); auto pk = TSerializedCellVec::Serialize(cells); @@ -37,9 +56,11 @@ void AddRowMain2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArra TSerializedCellVec{key}); } -void AddRowMain2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, +void AddRowMainToPosting(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, ui32 dataPos) { + parent = SetPostingParentFlag(parent); + std::array<TCell, 1> cells; cells[0] = TCell::Make(parent); auto pk = TSerializedCellVec::Serialize(cells); @@ -48,9 +69,11 @@ void AddRowMain2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TAr TSerializedCellVec{key}); } -void AddRowBuild2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, +void AddRowBuildToBuild(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, ui32 prefixColumns) { + EnsureNoPostingParentFlag(parent); + std::array<TCell, 1> cells; cells[0] = TCell::Make(parent); auto pk = TSerializedCellVec::Serialize(cells); @@ -59,9 +82,11 @@ void AddRowBuild2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArr TSerializedCellVec{key}); } -void AddRowBuild2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, +void AddRowBuildToPosting(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, ui32 dataPos, ui32 prefixColumns) { + parent = SetPostingParentFlag(parent); + std::array<TCell, 1> cells; cells[0] = TCell::Make(parent); auto pk = TSerializedCellVec::Serialize(cells); diff --git a/ydb/core/tx/datashard/build_index/kmeans_helper.h b/ydb/core/tx/datashard/build_index/kmeans_helper.h index 63de9a4d52c..90d58086340 100644 --- a/ydb/core/tx/datashard/build_index/kmeans_helper.h +++ b/ydb/core/tx/datashard/build_index/kmeans_helper.h @@ -44,7 +44,7 @@ Y_PURE_FUNCTION TTriWayDotProduct<TRes> CosineImpl(const ui8* lhs, const ui8* rh return {static_cast<TRes>(ll), static_cast<TRes>(lr), static_cast<TRes>(rr)}; } -TTableRange CreateRangeFrom(const TUserTable& table, NTableIndex::TClusterId parent, TCell& from, TCell& to); +TTableRange CreateRangeFrom(const TUserTable& table, TClusterId parent, TCell& from, TCell& to); NTable::TLead CreateLeadFrom(const TTableRange& range); @@ -138,15 +138,17 @@ struct TMaxInnerProductSimilarity : TMetric<TCoord> { } }; -void AddRowMain2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row); +void AddRowToLevel(TBufferData& buffer, TClusterId parent, TClusterId child, const TString& embedding, bool isPostingLevel); -void AddRowMain2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, +void AddRowMainToBuild(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row); + +void AddRowMainToPosting(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, ui32 dataPos); -void AddRowBuild2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, +void AddRowBuildToBuild(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, ui32 prefixColumns = 1); -void AddRowBuild2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, +void AddRowBuildToPosting(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, ui32 dataPos, ui32 prefixColumns = 1); TTags MakeUploadTags(const TUserTable& table, const TProtoStringType& embedding, @@ -410,16 +412,16 @@ public: return true; } - ui32 FindCluster(TArrayRef<const TCell> row, NTable::TPos embeddingPos) + std::optional<ui32> FindCluster(TArrayRef<const TCell> row, NTable::TPos embeddingPos) { Y_ASSERT(embeddingPos < row.size()); const auto embedding = row.at(embeddingPos).AsRef(); if (!IsExpectedSize<TCoord>(embedding, Dimensions)) { - return Max<ui32>(); + return {}; } auto min = TMetric::Init(); - ui32 closest = Max<ui32>(); + std::optional<ui32> closest = {}; for (size_t i = 0; const auto& cluster : Clusters) { auto distance = TMetric::Distance(cluster.data(), embedding.data(), Dimensions); if (distance < min) { diff --git a/ydb/core/tx/datashard/build_index/local_kmeans.cpp b/ydb/core/tx/datashard/build_index/local_kmeans.cpp index 733733c4ed4..c6d82368a64 100644 --- a/ydb/core/tx/datashard/build_index/local_kmeans.cpp +++ b/ydb/core/tx/datashard/build_index/local_kmeans.cpp @@ -436,48 +436,46 @@ private: void FeedKMeans(TArrayRef<const TCell> row) { - if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) { - Clusters.AggregateToCluster(pos, row.at(EmbeddingPos).Data()); + if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) { + Clusters.AggregateToCluster(*pos, row.at(EmbeddingPos).Data()); } } void FeedUploadMain2Build(TArrayRef<const TCell> key, TArrayRef<const TCell> row) { - if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) { - AddRowMain2Build(*PostingBuf, Child + pos, key, row); + if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) { + AddRowMainToBuild(*PostingBuf, Child + *pos, key, row); } } void FeedUploadMain2Posting(TArrayRef<const TCell> key, TArrayRef<const TCell> row) { - if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) { - AddRowMain2Posting(*PostingBuf, Child + pos, key, row, DataPos); + if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) { + AddRowMainToPosting(*PostingBuf, Child + *pos, key, row, DataPos); } } void FeedUploadBuild2Build(TArrayRef<const TCell> key, TArrayRef<const TCell> row) { - if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) { - AddRowBuild2Build(*PostingBuf, Child + pos, key, row); + if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) { + AddRowBuildToBuild(*PostingBuf, Child + *pos, key, row); } } void FeedUploadBuild2Posting(TArrayRef<const TCell> key, TArrayRef<const TCell> row) { - if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) { - AddRowBuild2Posting(*PostingBuf, Child + pos, key, row, DataPos); + if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) { + AddRowBuildToPosting(*PostingBuf, Child + *pos, key, row, DataPos); } } void FormLevelRows() { - std::array<TCell, 2> pk; - std::array<TCell, 1> data; + const bool isPostingLevel = UploadState == NKikimrTxDataShard::UPLOAD_MAIN_TO_POSTING + || UploadState == NKikimrTxDataShard::UPLOAD_BUILD_TO_POSTING; + for (NTable::TPos pos = 0; const auto& row : Clusters.GetClusters()) { - pk[0] = TCell::Make(Parent); - pk[1] = TCell::Make(Child + pos); - data[0] = TCell{row}; - LevelBuf->AddRow(TSerializedCellVec{pk}, TSerializedCellVec::Serialize(data)); + AddRowToLevel(*LevelBuf, Parent, Child + pos, row, isPostingLevel); ++pos; } } diff --git a/ydb/core/tx/datashard/build_index/prefix_kmeans.cpp b/ydb/core/tx/datashard/build_index/prefix_kmeans.cpp index d56ad48254f..aba4c4e55ec 100644 --- a/ydb/core/tx/datashard/build_index/prefix_kmeans.cpp +++ b/ydb/core/tx/datashard/build_index/prefix_kmeans.cpp @@ -431,34 +431,31 @@ private: void FeedKMeans(TArrayRef<const TCell> row) { - if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) { - Clusters.AggregateToCluster(pos, row.at(EmbeddingPos).Data()); + if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) { + Clusters.AggregateToCluster(*pos, row.at(EmbeddingPos).Data()); } } void FeedUploadBuild2Build(TArrayRef<const TCell> key, TArrayRef<const TCell> row) { - if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) { - AddRowBuild2Build(*PostingBuf, Child + pos, key, row, PrefixColumns); + if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) { + AddRowBuildToBuild(*PostingBuf, Child + *pos, key, row, PrefixColumns); } } void FeedUploadBuild2Posting(TArrayRef<const TCell> key, TArrayRef<const TCell> row) { - if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) { - AddRowBuild2Posting(*PostingBuf, Child + pos, key, row, DataPos, PrefixColumns); + if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) { + AddRowBuildToPosting(*PostingBuf, Child + *pos, key, row, DataPos, PrefixColumns); } } void FormLevelRows() { - std::array<TCell, 2> pk; - std::array<TCell, 1> data; + const bool isPostingLevel = UploadState == NKikimrTxDataShard::UPLOAD_BUILD_TO_POSTING; + for (NTable::TPos pos = 0; const auto& row : Clusters.GetClusters()) { - pk[0] = TCell::Make(Parent); - pk[1] = TCell::Make(Child + pos); - data[0] = TCell{row}; - LevelBuf->AddRow(TSerializedCellVec{pk}, TSerializedCellVec::Serialize(data)); + AddRowToLevel(*LevelBuf, Parent, Child + pos, row, isPostingLevel); ++pos; } } diff --git a/ydb/core/tx/datashard/build_index/reshuffle_kmeans.cpp b/ydb/core/tx/datashard/build_index/reshuffle_kmeans.cpp index ac61488159f..1c647ed6d4c 100644 --- a/ydb/core/tx/datashard/build_index/reshuffle_kmeans.cpp +++ b/ydb/core/tx/datashard/build_index/reshuffle_kmeans.cpp @@ -271,29 +271,29 @@ private: void FeedUploadMain2Build(TArrayRef<const TCell> key, TArrayRef<const TCell> row) { - if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) { - AddRowMain2Build(*PostingBuf, Child + pos, key, row); + if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) { + AddRowMainToBuild(*PostingBuf, Child + *pos, key, row); } } void FeedUploadMain2Posting(TArrayRef<const TCell> key, TArrayRef<const TCell> row) { - if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) { - AddRowMain2Posting(*PostingBuf, Child + pos, key, row, DataPos); + if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) { + AddRowMainToPosting(*PostingBuf, Child + *pos, key, row, DataPos); } } void FeedUploadBuild2Build(TArrayRef<const TCell> key, TArrayRef<const TCell> row) { - if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) { - AddRowBuild2Build(*PostingBuf, Child + pos, key, row); + if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) { + AddRowBuildToBuild(*PostingBuf, Child + *pos, key, row); } } void FeedUploadBuild2Posting(TArrayRef<const TCell> key, TArrayRef<const TCell> row) { - if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) { - AddRowBuild2Posting(*PostingBuf, Child + pos, key, row, DataPos); + if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) { + AddRowBuildToPosting(*PostingBuf, Child + *pos, key, row, DataPos); } } }; diff --git a/ydb/core/tx/datashard/build_index/ut/ut_local_kmeans.cpp b/ydb/core/tx/datashard/build_index/ut/ut_local_kmeans.cpp index b8743ac4dd1..cad4285aa9e 100644 --- a/ydb/core/tx/datashard/build_index/ut/ut_local_kmeans.cpp +++ b/ydb/core/tx/datashard/build_index/ut/ut_local_kmeans.cpp @@ -51,7 +51,7 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) { VectorIndexSettings settings; settings.set_vector_dimension(2); - settings.set_vector_type(VectorIndexSettings::VECTOR_TYPE_FLOAT); + settings.set_vector_type(VectorIndexSettings::VECTOR_TYPE_UINT8); settings.set_metric(VectorIndexSettings::DISTANCE_COSINE); *rec.MutableSettings() = settings; @@ -235,6 +235,17 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) { options.EnableOutOfOrder(true); // TODO(mbkkt) what is it? options.Shards(1); CreateMainTable(server, sender, options); + // Upsert some initial values + ExecSQL(server, sender, + R"( + UPSERT INTO `/Root/table-main` + (key, embedding, data) + VALUES )" + "(1, \"\x30\x30\3\", \"one\")," + "(2, \"\x31\x31\3\", \"two\")," + "(3, \"\x32\x32\3\", \"three\")," + "(4, \"\x65\x65\3\", \"four\")," + "(5, \"\x75\x75\3\", \"five\");"); DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvLocalKMeansRequest& request) { request.SetTabletId(0); @@ -300,6 +311,42 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) { }, "[ { <main>: Error: Should be requested partition on at least two rows } { <main>: Error: Unknown embedding column: some } ]"); } + Y_UNIT_TEST (TooManyClusters) { + TPortManager pm; + TServerSettings serverSettings(pm.GetPort(2134)); + serverSettings.SetDomainName("Root"); + + Tests::TServer::TPtr server = new TServer(serverSettings); + auto& runtime = *server->GetRuntime(); + auto sender = runtime.AllocateEdgeActor(); + + runtime.SetLogPriority(NKikimrServices::TX_DATASHARD, NLog::PRI_DEBUG); + runtime.SetLogPriority(NKikimrServices::BUILD_INDEX, NLog::PRI_TRACE); + + InitRoot(server, sender); + + TShardedTableOptions options; + options.EnableOutOfOrder(true); + options.Shards(1); + CreateMainTable(server, sender, options); + // Upsert some initial values + ExecSQL(server, sender, + R"( + UPSERT INTO `/Root/table-main` + (key, embedding, data) + VALUES )" + "(1, \"\x30\x30\3\", \"one\")," + "(2, \"\x31\x31\3\", \"two\")," + "(3, \"\x32\x32\3\", \"three\")," + "(4, \"\x65\x65\3\", \"four\")," + "(5, \"\x75\x75\3\", \"five\");"); + + // TODO: https://github.com/ydb-platform/ydb/issues/18656 + // DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvLocalKMeansRequest& request) { + // request.SetChild(Max<ui64>() - 100); + // }, TStringBuilder() << ""); + } + Y_UNIT_TEST (MainToPosting) { TPortManager pm; TServerSettings serverSettings(pm.GetPort(2134)); @@ -350,13 +397,13 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) { auto [level, posting] = DoLocalKMeans(server, sender, 0, 0, seed, k, NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING, VectorIndexSettings::VECTOR_TYPE_UINT8, distance); - UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 0, __ydb_id = 1, __ydb_centroid = mm\3\n" - "__ydb_parent = 0, __ydb_id = 2, __ydb_centroid = 11\3\n"); - UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 1, key = 4, data = four\n" - "__ydb_parent = 1, key = 5, data = five\n" - "__ydb_parent = 2, key = 1, data = one\n" - "__ydb_parent = 2, key = 2, data = two\n" - "__ydb_parent = 2, key = 3, data = three\n"); + UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 0, __ydb_id = 9223372036854775809, __ydb_centroid = mm\3\n" + "__ydb_parent = 0, __ydb_id = 9223372036854775810, __ydb_centroid = 11\3\n"); + UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775809, key = 4, data = four\n" + "__ydb_parent = 9223372036854775809, key = 5, data = five\n" + "__ydb_parent = 9223372036854775810, key = 1, data = one\n" + "__ydb_parent = 9223372036854775810, key = 2, data = two\n" + "__ydb_parent = 9223372036854775810, key = 3, data = three\n"); recreate(); } @@ -365,13 +412,13 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) { auto [level, posting] = DoLocalKMeans(server, sender, 0, 0, seed, k, NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING, VectorIndexSettings::VECTOR_TYPE_UINT8, distance); - UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 0, __ydb_id = 1, __ydb_centroid = 11\3\n" - "__ydb_parent = 0, __ydb_id = 2, __ydb_centroid = mm\3\n"); - UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 1, key = 1, data = one\n" - "__ydb_parent = 1, key = 2, data = two\n" - "__ydb_parent = 1, key = 3, data = three\n" - "__ydb_parent = 2, key = 4, data = four\n" - "__ydb_parent = 2, key = 5, data = five\n"); + UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 0, __ydb_id = 9223372036854775809, __ydb_centroid = 11\3\n" + "__ydb_parent = 0, __ydb_id = 9223372036854775810, __ydb_centroid = mm\3\n"); + UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775809, key = 1, data = one\n" + "__ydb_parent = 9223372036854775809, key = 2, data = two\n" + "__ydb_parent = 9223372036854775809, key = 3, data = three\n" + "__ydb_parent = 9223372036854775810, key = 4, data = four\n" + "__ydb_parent = 9223372036854775810, key = 5, data = five\n"); recreate(); } seed = 32; @@ -381,12 +428,12 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) { auto [level, posting] = DoLocalKMeans(server, sender, 0, 0, seed, k, NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING, VectorIndexSettings::VECTOR_TYPE_UINT8, similarity); - UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 0, __ydb_id = 1, __ydb_centroid = II\3\n"); - UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 1, key = 1, data = one\n" - "__ydb_parent = 1, key = 2, data = two\n" - "__ydb_parent = 1, key = 3, data = three\n" - "__ydb_parent = 1, key = 4, data = four\n" - "__ydb_parent = 1, key = 5, data = five\n"); + UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 0, __ydb_id = 9223372036854775809, __ydb_centroid = II\3\n"); + UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775809, key = 1, data = one\n" + "__ydb_parent = 9223372036854775809, key = 2, data = two\n" + "__ydb_parent = 9223372036854775809, key = 3, data = three\n" + "__ydb_parent = 9223372036854775809, key = 4, data = four\n" + "__ydb_parent = 9223372036854775809, key = 5, data = five\n"); recreate(); } } @@ -532,30 +579,30 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) { seed = 0; for (auto distance : {VectorIndexSettings::DISTANCE_MANHATTAN, VectorIndexSettings::DISTANCE_EUCLIDEAN}) { auto [level, posting] = DoLocalKMeans(server, sender, 40, 40, seed, k, - NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING, - VectorIndexSettings::VECTOR_TYPE_UINT8, distance); - UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 40, __ydb_id = 41, __ydb_centroid = mm\3\n" - "__ydb_parent = 40, __ydb_id = 42, __ydb_centroid = 11\3\n"); - UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 41, key = 4, data = four\n" - "__ydb_parent = 41, key = 5, data = five\n" - "__ydb_parent = 42, key = 1, data = one\n" - "__ydb_parent = 42, key = 2, data = two\n" - "__ydb_parent = 42, key = 3, data = three\n"); + NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING, + VectorIndexSettings::VECTOR_TYPE_UINT8, distance); + UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 40, __ydb_id = 9223372036854775849, __ydb_centroid = mm\3\n" + "__ydb_parent = 40, __ydb_id = 9223372036854775850, __ydb_centroid = 11\3\n"); + UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775849, key = 4, data = four\n" + "__ydb_parent = 9223372036854775849, key = 5, data = five\n" + "__ydb_parent = 9223372036854775850, key = 1, data = one\n" + "__ydb_parent = 9223372036854775850, key = 2, data = two\n" + "__ydb_parent = 9223372036854775850, key = 3, data = three\n"); recreate(); } seed = 111; for (auto distance : {VectorIndexSettings::DISTANCE_MANHATTAN, VectorIndexSettings::DISTANCE_EUCLIDEAN}) { auto [level, posting] = DoLocalKMeans(server, sender, 40, 40, seed, k, - NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING, - VectorIndexSettings::VECTOR_TYPE_UINT8, distance); - UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 40, __ydb_id = 41, __ydb_centroid = 11\3\n" - "__ydb_parent = 40, __ydb_id = 42, __ydb_centroid = mm\3\n"); - UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 41, key = 1, data = one\n" - "__ydb_parent = 41, key = 2, data = two\n" - "__ydb_parent = 41, key = 3, data = three\n" - "__ydb_parent = 42, key = 4, data = four\n" - "__ydb_parent = 42, key = 5, data = five\n"); + NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING, + VectorIndexSettings::VECTOR_TYPE_UINT8, distance); + UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 40, __ydb_id = 9223372036854775849, __ydb_centroid = 11\3\n" + "__ydb_parent = 40, __ydb_id = 9223372036854775850, __ydb_centroid = mm\3\n"); + UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775849, key = 1, data = one\n" + "__ydb_parent = 9223372036854775849, key = 2, data = two\n" + "__ydb_parent = 9223372036854775849, key = 3, data = three\n" + "__ydb_parent = 9223372036854775850, key = 4, data = four\n" + "__ydb_parent = 9223372036854775850, key = 5, data = five\n"); recreate(); } seed = 32; @@ -563,14 +610,14 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) { VectorIndexSettings::DISTANCE_COSINE}) { auto [level, posting] = DoLocalKMeans(server, sender, 40, 40, seed, k, - NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING, - VectorIndexSettings::VECTOR_TYPE_UINT8, similarity); - UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 40, __ydb_id = 41, __ydb_centroid = II\3\n"); - UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 41, key = 1, data = one\n" - "__ydb_parent = 41, key = 2, data = two\n" - "__ydb_parent = 41, key = 3, data = three\n" - "__ydb_parent = 41, key = 4, data = four\n" - "__ydb_parent = 41, key = 5, data = five\n"); + NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING, + VectorIndexSettings::VECTOR_TYPE_UINT8, similarity); + UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 40, __ydb_id = 9223372036854775849, __ydb_centroid = II\3\n"); + UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775849, key = 1, data = one\n" + "__ydb_parent = 9223372036854775849, key = 2, data = two\n" + "__ydb_parent = 9223372036854775849, key = 3, data = three\n" + "__ydb_parent = 9223372036854775849, key = 4, data = four\n" + "__ydb_parent = 9223372036854775849, key = 5, data = five\n"); recreate(); } } diff --git a/ydb/core/tx/datashard/build_index/ut/ut_prefix_kmeans.cpp b/ydb/core/tx/datashard/build_index/ut/ut_prefix_kmeans.cpp index 2f7afaa6527..b18d51902e2 100644 --- a/ydb/core/tx/datashard/build_index/ut/ut_prefix_kmeans.cpp +++ b/ydb/core/tx/datashard/build_index/ut/ut_prefix_kmeans.cpp @@ -373,24 +373,24 @@ Y_UNIT_TEST_SUITE (TTxDataShardPrefixKMeansScan) { "user = user-2, __ydb_id = 43\n" ); UNIT_ASSERT_VALUES_EQUAL(level, - "__ydb_parent = 40, __ydb_id = 41, __ydb_centroid = mm\3\n" - "__ydb_parent = 40, __ydb_id = 42, __ydb_centroid = 11\3\n" + "__ydb_parent = 40, __ydb_id = 9223372036854775849, __ydb_centroid = mm\3\n" + "__ydb_parent = 40, __ydb_id = 9223372036854775850, __ydb_centroid = 11\3\n" - "__ydb_parent = 43, __ydb_id = 44, __ydb_centroid = 11\3\n" - "__ydb_parent = 43, __ydb_id = 45, __ydb_centroid = mm\3\n" + "__ydb_parent = 43, __ydb_id = 9223372036854775852, __ydb_centroid = 11\3\n" + "__ydb_parent = 43, __ydb_id = 9223372036854775853, __ydb_centroid = mm\3\n" ); UNIT_ASSERT_VALUES_EQUAL(posting, - "__ydb_parent = 41, key = 14, data = 1-four\n" - "__ydb_parent = 41, key = 15, data = 1-five\n" - "__ydb_parent = 42, key = 11, data = 1-one\n" - "__ydb_parent = 42, key = 12, data = 1-two\n" - "__ydb_parent = 42, key = 13, data = 1-three\n" - - "__ydb_parent = 44, key = 21, data = 2-one\n" - "__ydb_parent = 44, key = 22, data = 2-two\n" - "__ydb_parent = 44, key = 23, data = 2-three\n" - "__ydb_parent = 45, key = 24, data = 2-four\n" - "__ydb_parent = 45, key = 25, data = 2-five\n" + "__ydb_parent = 9223372036854775849, key = 14, data = 1-four\n" + "__ydb_parent = 9223372036854775849, key = 15, data = 1-five\n" + "__ydb_parent = 9223372036854775850, key = 11, data = 1-one\n" + "__ydb_parent = 9223372036854775850, key = 12, data = 1-two\n" + "__ydb_parent = 9223372036854775850, key = 13, data = 1-three\n" + + "__ydb_parent = 9223372036854775852, key = 21, data = 2-one\n" + "__ydb_parent = 9223372036854775852, key = 22, data = 2-two\n" + "__ydb_parent = 9223372036854775852, key = 23, data = 2-three\n" + "__ydb_parent = 9223372036854775853, key = 24, data = 2-four\n" + "__ydb_parent = 9223372036854775853, key = 25, data = 2-five\n" ); recreate(); }} @@ -407,24 +407,24 @@ Y_UNIT_TEST_SUITE (TTxDataShardPrefixKMeansScan) { "user = user-2, __ydb_id = 43\n" ); UNIT_ASSERT_VALUES_EQUAL(level, - "__ydb_parent = 40, __ydb_id = 41, __ydb_centroid = 11\3\n" - "__ydb_parent = 40, __ydb_id = 42, __ydb_centroid = mm\3\n" + "__ydb_parent = 40, __ydb_id = 9223372036854775849, __ydb_centroid = 11\3\n" + "__ydb_parent = 40, __ydb_id = 9223372036854775850, __ydb_centroid = mm\3\n" - "__ydb_parent = 43, __ydb_id = 44, __ydb_centroid = 11\3\n" - "__ydb_parent = 43, __ydb_id = 45, __ydb_centroid = mm\3\n" + "__ydb_parent = 43, __ydb_id = 9223372036854775852, __ydb_centroid = 11\3\n" + "__ydb_parent = 43, __ydb_id = 9223372036854775853, __ydb_centroid = mm\3\n" ); UNIT_ASSERT_VALUES_EQUAL(posting, - "__ydb_parent = 41, key = 11, data = 1-one\n" - "__ydb_parent = 41, key = 12, data = 1-two\n" - "__ydb_parent = 41, key = 13, data = 1-three\n" - "__ydb_parent = 42, key = 14, data = 1-four\n" - "__ydb_parent = 42, key = 15, data = 1-five\n" - - "__ydb_parent = 44, key = 21, data = 2-one\n" - "__ydb_parent = 44, key = 22, data = 2-two\n" - "__ydb_parent = 44, key = 23, data = 2-three\n" - "__ydb_parent = 45, key = 24, data = 2-four\n" - "__ydb_parent = 45, key = 25, data = 2-five\n" + "__ydb_parent = 9223372036854775849, key = 11, data = 1-one\n" + "__ydb_parent = 9223372036854775849, key = 12, data = 1-two\n" + "__ydb_parent = 9223372036854775849, key = 13, data = 1-three\n" + "__ydb_parent = 9223372036854775850, key = 14, data = 1-four\n" + "__ydb_parent = 9223372036854775850, key = 15, data = 1-five\n" + + "__ydb_parent = 9223372036854775852, key = 21, data = 2-one\n" + "__ydb_parent = 9223372036854775852, key = 22, data = 2-two\n" + "__ydb_parent = 9223372036854775852, key = 23, data = 2-three\n" + "__ydb_parent = 9223372036854775853, key = 24, data = 2-four\n" + "__ydb_parent = 9223372036854775853, key = 25, data = 2-five\n" ); recreate(); }} @@ -440,22 +440,22 @@ Y_UNIT_TEST_SUITE (TTxDataShardPrefixKMeansScan) { "user = user-2, __ydb_id = 43\n" ); UNIT_ASSERT_VALUES_EQUAL(level, - "__ydb_parent = 40, __ydb_id = 41, __ydb_centroid = II\3\n" + "__ydb_parent = 40, __ydb_id = 9223372036854775849, __ydb_centroid = II\3\n" - "__ydb_parent = 43, __ydb_id = 44, __ydb_centroid = II\3\n" + "__ydb_parent = 43, __ydb_id = 9223372036854775852, __ydb_centroid = II\3\n" ); UNIT_ASSERT_VALUES_EQUAL(posting, - "__ydb_parent = 41, key = 11, data = 1-one\n" - "__ydb_parent = 41, key = 12, data = 1-two\n" - "__ydb_parent = 41, key = 13, data = 1-three\n" - "__ydb_parent = 41, key = 14, data = 1-four\n" - "__ydb_parent = 41, key = 15, data = 1-five\n" - - "__ydb_parent = 44, key = 21, data = 2-one\n" - "__ydb_parent = 44, key = 22, data = 2-two\n" - "__ydb_parent = 44, key = 23, data = 2-three\n" - "__ydb_parent = 44, key = 24, data = 2-four\n" - "__ydb_parent = 44, key = 25, data = 2-five\n" + "__ydb_parent = 9223372036854775849, key = 11, data = 1-one\n" + "__ydb_parent = 9223372036854775849, key = 12, data = 1-two\n" + "__ydb_parent = 9223372036854775849, key = 13, data = 1-three\n" + "__ydb_parent = 9223372036854775849, key = 14, data = 1-four\n" + "__ydb_parent = 9223372036854775849, key = 15, data = 1-five\n" + + "__ydb_parent = 9223372036854775852, key = 21, data = 2-one\n" + "__ydb_parent = 9223372036854775852, key = 22, data = 2-two\n" + "__ydb_parent = 9223372036854775852, key = 23, data = 2-three\n" + "__ydb_parent = 9223372036854775852, key = 24, data = 2-four\n" + "__ydb_parent = 9223372036854775852, key = 25, data = 2-five\n" ); recreate(); }} diff --git a/ydb/core/tx/datashard/build_index/ut/ut_reshuffle_kmeans.cpp b/ydb/core/tx/datashard/build_index/ut/ut_reshuffle_kmeans.cpp index 9c8212987d0..ea77d33aadc 100644 --- a/ydb/core/tx/datashard/build_index/ut/ut_reshuffle_kmeans.cpp +++ b/ydb/core/tx/datashard/build_index/ut/ut_reshuffle_kmeans.cpp @@ -145,6 +145,8 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) { } auto posting = ReadShardedTable(server, kPostingTable); + Cerr << "Posting:" << Endl; + Cerr << posting << Endl; return std::move(posting); } @@ -305,13 +307,13 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) { "11\3", }; auto posting = DoReshuffleKMeans(server, sender, 0, level, - NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING, - VectorIndexSettings::VECTOR_TYPE_UINT8, distance); - UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 1, key = 4, data = four\n" - "__ydb_parent = 1, key = 5, data = five\n" - "__ydb_parent = 2, key = 1, data = one\n" - "__ydb_parent = 2, key = 2, data = two\n" - "__ydb_parent = 2, key = 3, data = three\n"); + NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING, + VectorIndexSettings::VECTOR_TYPE_UINT8, distance); + UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775809, key = 4, data = four\n" + "__ydb_parent = 9223372036854775809, key = 5, data = five\n" + "__ydb_parent = 9223372036854775810, key = 1, data = one\n" + "__ydb_parent = 9223372036854775810, key = 2, data = two\n" + "__ydb_parent = 9223372036854775810, key = 3, data = three\n"); recreate(); } for (auto distance : {VectorIndexSettings::DISTANCE_MANHATTAN, VectorIndexSettings::DISTANCE_EUCLIDEAN}) { @@ -320,13 +322,13 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) { "mm\3", }; auto posting = DoReshuffleKMeans(server, sender, 0, level, - NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING, - VectorIndexSettings::VECTOR_TYPE_UINT8, distance); - UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 1, key = 1, data = one\n" - "__ydb_parent = 1, key = 2, data = two\n" - "__ydb_parent = 1, key = 3, data = three\n" - "__ydb_parent = 2, key = 4, data = four\n" - "__ydb_parent = 2, key = 5, data = five\n"); + NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING, + VectorIndexSettings::VECTOR_TYPE_UINT8, distance); + UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775809, key = 1, data = one\n" + "__ydb_parent = 9223372036854775809, key = 2, data = two\n" + "__ydb_parent = 9223372036854775809, key = 3, data = three\n" + "__ydb_parent = 9223372036854775810, key = 4, data = four\n" + "__ydb_parent = 9223372036854775810, key = 5, data = five\n"); recreate(); } for (auto similarity : {VectorIndexSettings::SIMILARITY_INNER_PRODUCT, VectorIndexSettings::SIMILARITY_COSINE, @@ -336,13 +338,13 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) { "II\3", }; auto posting = DoReshuffleKMeans(server, sender, 0, level, - NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING, - VectorIndexSettings::VECTOR_TYPE_UINT8, similarity); - UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 1, key = 1, data = one\n" - "__ydb_parent = 1, key = 2, data = two\n" - "__ydb_parent = 1, key = 3, data = three\n" - "__ydb_parent = 1, key = 4, data = four\n" - "__ydb_parent = 1, key = 5, data = five\n"); + NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING, + VectorIndexSettings::VECTOR_TYPE_UINT8, similarity); + UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775809, key = 1, data = one\n" + "__ydb_parent = 9223372036854775809, key = 2, data = two\n" + "__ydb_parent = 9223372036854775809, key = 3, data = three\n" + "__ydb_parent = 9223372036854775809, key = 4, data = four\n" + "__ydb_parent = 9223372036854775809, key = 5, data = five\n"); recreate(); } } @@ -479,13 +481,13 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) { "11\3", }; auto posting = DoReshuffleKMeans(server, sender, 40, level, - NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING, - VectorIndexSettings::VECTOR_TYPE_UINT8, distance); - UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 41, key = 4, data = four\n" - "__ydb_parent = 41, key = 5, data = five\n" - "__ydb_parent = 42, key = 1, data = one\n" - "__ydb_parent = 42, key = 2, data = two\n" - "__ydb_parent = 42, key = 3, data = three\n"); + NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING, + VectorIndexSettings::VECTOR_TYPE_UINT8, distance); + UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775849, key = 4, data = four\n" + "__ydb_parent = 9223372036854775849, key = 5, data = five\n" + "__ydb_parent = 9223372036854775850, key = 1, data = one\n" + "__ydb_parent = 9223372036854775850, key = 2, data = two\n" + "__ydb_parent = 9223372036854775850, key = 3, data = three\n"); recreate(); } for (auto distance : {VectorIndexSettings::DISTANCE_MANHATTAN, VectorIndexSettings::DISTANCE_EUCLIDEAN}) { @@ -494,13 +496,13 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) { "mm\3", }; auto posting = DoReshuffleKMeans(server, sender, 40, level, - NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING, - VectorIndexSettings::VECTOR_TYPE_UINT8, distance); - UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 41, key = 1, data = one\n" - "__ydb_parent = 41, key = 2, data = two\n" - "__ydb_parent = 41, key = 3, data = three\n" - "__ydb_parent = 42, key = 4, data = four\n" - "__ydb_parent = 42, key = 5, data = five\n"); + NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING, + VectorIndexSettings::VECTOR_TYPE_UINT8, distance); + UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775849, key = 1, data = one\n" + "__ydb_parent = 9223372036854775849, key = 2, data = two\n" + "__ydb_parent = 9223372036854775849, key = 3, data = three\n" + "__ydb_parent = 9223372036854775850, key = 4, data = four\n" + "__ydb_parent = 9223372036854775850, key = 5, data = five\n"); recreate(); } for (auto similarity : {VectorIndexSettings::SIMILARITY_INNER_PRODUCT, VectorIndexSettings::SIMILARITY_COSINE, @@ -510,13 +512,13 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) { "II\3", }; auto posting = DoReshuffleKMeans(server, sender, 40, level, - NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING, - VectorIndexSettings::VECTOR_TYPE_UINT8, similarity); - UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 41, key = 1, data = one\n" - "__ydb_parent = 41, key = 2, data = two\n" - "__ydb_parent = 41, key = 3, data = three\n" - "__ydb_parent = 41, key = 4, data = four\n" - "__ydb_parent = 41, key = 5, data = five\n"); + NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING, + VectorIndexSettings::VECTOR_TYPE_UINT8, similarity); + UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775849, key = 1, data = one\n" + "__ydb_parent = 9223372036854775849, key = 2, data = two\n" + "__ydb_parent = 9223372036854775849, key = 3, data = three\n" + "__ydb_parent = 9223372036854775849, key = 4, data = four\n" + "__ydb_parent = 9223372036854775849, key = 5, data = five\n"); recreate(); } } diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp index 5a700bcd2e6..6d55717ed0d 100644 --- a/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp @@ -45,12 +45,13 @@ class TUploadSampleK: public TActorBootstrapped<TUploadSampleK> { protected: TString LogPrefix; - TString TargetTable; + const TString TargetTable; + const bool IsPostingLevel; const NKikimrIndexBuilder::TIndexBuildScanSettings ScanSettings; - TActorId ResponseActorId; - ui64 BuildIndexId = 0; + const TActorId ResponseActorId; + const ui64 BuildIndexId = 0; TIndexBuildInfo::TSample::TRows Init; std::shared_ptr<NTxProxy::TUploadTypes> Types; @@ -59,13 +60,14 @@ protected: TActorId Uploader; ui32 RetryCount = 0; ui32 RowsBytes = 0; - NTableIndex::TClusterId Parent = 0; + const NTableIndex::TClusterId Parent = 0; NTableIndex::TClusterId Child = 0; NDataShard::TUploadStatus UploadStatus; public: TUploadSampleK(TString targetTable, + bool isPostingLevel, const NKikimrIndexBuilder::TIndexBuildScanSettings& scanSettings, const TActorId& responseActorId, ui64 buildIndexId, @@ -73,6 +75,7 @@ public: NTableIndex::TClusterId parent, NTableIndex::TClusterId child) : TargetTable(std::move(targetTable)) + , IsPostingLevel(isPostingLevel) , ScanSettings(scanSettings) , ResponseActorId(responseActorId) , BuildIndexId(buildIndexId) @@ -108,16 +111,23 @@ public: void Bootstrap() { Rows = std::make_shared<NTxProxy::TUploadRows>(); Rows->reserve(Init.size()); - std::array<TCell, 2> PrimaryKeys; - PrimaryKeys[0] = TCell::Make(Parent); + std::array<TCell, 2> pk; + pk[0] = TCell::Make(Parent); for (auto& [_, row] : Init) { RowsBytes += row.size(); - PrimaryKeys[1] = TCell::Make(Child++); + auto child = Child++; + if (IsPostingLevel) { + child = SetPostingParentFlag(child); + } else { + EnsureNoPostingParentFlag(child); + } + pk[1] = TCell::Make(child); + // TODO(mbkkt) we can avoid serialization of PrimaryKeys every iter - Rows->emplace_back(TSerializedCellVec{PrimaryKeys}, std::move(row)); + Rows->emplace_back(TSerializedCellVec{pk}, std::move(row)); } Init = {}; // release memory - RowsBytes += Rows->size() * TSerializedCellVec::SerializedSize(PrimaryKeys); + RowsBytes += Rows->size() * TSerializedCellVec::SerializedSize(pk); Types = std::make_shared<NTxProxy::TUploadTypes>(3); Ydb::Type type; @@ -748,12 +758,14 @@ private: buildInfo.Sample.MakeStrictTop(buildInfo.KMeans.K); auto path = GetBuildPath(Self, buildInfo, NTableIndex::NTableVectorKmeansTreeIndex::LevelTable); Y_ASSERT(buildInfo.Sample.Rows.size() <= buildInfo.KMeans.K); - auto actor = new TUploadSampleK(path.PathString(), + auto actor = new TUploadSampleK(path.PathString(), !buildInfo.KMeans.NeedsAnotherLevel(), buildInfo.ScanSettings, Self->SelfId(), ui64(BuildId), buildInfo.Sample.Rows, buildInfo.KMeans.Parent, buildInfo.KMeans.Child); TActivationContext::AsActorContext().MakeFor(Self->SelfId()).Register(actor); buildInfo.Sample.State = TIndexBuildInfo::TSample::EState::Upload; + + LOG_N("TTxBuildProgress: TUploadSampleK: " << buildInfo); } void ClearAfterFill(const TActorContext& ctx, TIndexBuildInfo& buildInfo) { diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.h b/ydb/core/tx/schemeshard/schemeshard_info_types.h index 2a3c875a4a7..9f6483fbd2e 100644 --- a/ydb/core/tx/schemeshard/schemeshard_info_types.h +++ b/ydb/core/tx/schemeshard/schemeshard_info_types.h @@ -56,6 +56,7 @@ namespace NKikimr { namespace NSchemeShard { +using namespace NTableIndex; class TSchemeShard; |