summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: kungurtsev <[email protected]> 2025-05-22 14:52:20 +0200
committer: GitHub <[email protected]> 2025-05-22 14:52:20 +0200
commit: 0323e5e78ecf518557e4e5cfad9f1d5b6827baa6 (patch)
tree: 6ce065291be1c0cfe7baa4f98d876a1db40ddad3
parent: f25d7be4b4c78257c58a785ca5398d57822169eb (diff)
Mark leaf cluster ids with the highest bit set (#18646)
-rw-r--r-- ydb/core/base/table_index.cpp 12
-rw-r--r-- ydb/core/base/table_index.h 5
-rw-r--r-- ydb/core/tx/datashard/build_index/common_helper.h 1
-rw-r--r-- ydb/core/tx/datashard/build_index/kmeans_helper.cpp 35
-rw-r--r-- ydb/core/tx/datashard/build_index/kmeans_helper.h 18
-rw-r--r-- ydb/core/tx/datashard/build_index/local_kmeans.cpp 30
-rw-r--r-- ydb/core/tx/datashard/build_index/prefix_kmeans.cpp 21
-rw-r--r-- ydb/core/tx/datashard/build_index/reshuffle_kmeans.cpp 16
-rw-r--r-- ydb/core/tx/datashard/build_index/ut/ut_local_kmeans.cpp 141
-rw-r--r-- ydb/core/tx/datashard/build_index/ut/ut_prefix_kmeans.cpp 86
-rw-r--r-- ydb/core/tx/datashard/build_index/ut/ut_reshuffle_kmeans.cpp 86
-rw-r--r-- ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp 32
-rw-r--r-- ydb/core/tx/schemeshard/schemeshard_info_types.h 1
13 files changed, 292 insertions, 192 deletions
diff --git a/ydb/core/base/table_index.cpp b/ydb/core/base/table_index.cpp
index f3c828d8db1..3dbac3e564c 100644
--- a/ydb/core/base/table_index.cpp
+++ b/ydb/core/base/table_index.cpp
@@ -189,4 +189,16 @@ bool IsBuildImplTable(std::string_view tableName) {
|| tableName.ends_with(NTableVectorKmeansTreeIndex::BuildSuffix1);
}
+static constexpr TClusterId PostingParentFlag = (1ull << 63ull);
+
+// Note: if cluster id is too big, something is wrong with cluster enumeration
+void EnsureNoPostingParentFlag(TClusterId parent) {
+ Y_ENSURE((parent & PostingParentFlag) == 0);
+}
+
+TClusterId SetPostingParentFlag(TClusterId parent) {
+ EnsureNoPostingParentFlag(parent);
+ return (parent | PostingParentFlag);
+}
+
}
diff --git a/ydb/core/base/table_index.h b/ydb/core/base/table_index.h
index 55652ac78d5..eebb0815cb6 100644
--- a/ydb/core/base/table_index.h
+++ b/ydb/core/base/table_index.h
@@ -38,9 +38,12 @@ bool IsImplTable(std::string_view tableName);
bool IsBuildImplTable(std::string_view tableName);
using TClusterId = ui64;
-
inline constexpr auto ClusterIdType = Ydb::Type::UINT64;
inline constexpr const char* ClusterIdTypeName = "Uint64";
+void EnsureNoPostingParentFlag(TClusterId parent);
+
+TClusterId SetPostingParentFlag(TClusterId parent);
+
}
}
diff --git a/ydb/core/tx/datashard/build_index/common_helper.h b/ydb/core/tx/datashard/build_index/common_helper.h
index 81c020fae55..3487fa552c4 100644
--- a/ydb/core/tx/datashard/build_index/common_helper.h
+++ b/ydb/core/tx/datashard/build_index/common_helper.h
@@ -5,6 +5,7 @@
#include <ydb/library/actors/core/log.h>
namespace NKikimr::NDataShard {
+using namespace NTableIndex;
#define LOG_T(stream) LOG_TRACE_S (*TlsActivationContext, NKikimrServices::BUILD_INDEX, stream)
#define LOG_D(stream) LOG_DEBUG_S (*TlsActivationContext, NKikimrServices::BUILD_INDEX, stream)
diff --git a/ydb/core/tx/datashard/build_index/kmeans_helper.cpp b/ydb/core/tx/datashard/build_index/kmeans_helper.cpp
index 17326a93018..ca950741bff 100644
--- a/ydb/core/tx/datashard/build_index/kmeans_helper.cpp
+++ b/ydb/core/tx/datashard/build_index/kmeans_helper.cpp
@@ -5,7 +5,7 @@
namespace NKikimr::NDataShard::NKMeans {
-TTableRange CreateRangeFrom(const TUserTable& table, NTableIndex::TClusterId parent, TCell& from, TCell& to) {
+TTableRange CreateRangeFrom(const TUserTable& table, TClusterId parent, TCell& from, TCell& to) {
if (parent == 0) {
return table.GetTableRange();
}
@@ -28,7 +28,26 @@ NTable::TLead CreateLeadFrom(const TTableRange& range) {
return lead;
}
-void AddRowMain2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row) {
+void AddRowToLevel(TBufferData& buffer, TClusterId parent, TClusterId child, const TString& embedding, bool isPostingLevel) {
+ if (isPostingLevel) {
+ child = SetPostingParentFlag(child);
+ } else {
+ EnsureNoPostingParentFlag(child);
+ }
+
+ std::array<TCell, 2> pk;
+ pk[0] = TCell::Make(parent);
+ pk[1] = TCell::Make(child);
+
+ std::array<TCell, 1> data;
+ data[0] = TCell{embedding};
+
+ buffer.AddRow(TSerializedCellVec{pk}, TSerializedCellVec::Serialize(data));
+}
+
+void AddRowMainToBuild(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row) {
+ EnsureNoPostingParentFlag(parent);
+
std::array<TCell, 1> cells;
cells[0] = TCell::Make(parent);
auto pk = TSerializedCellVec::Serialize(cells);
@@ -37,9 +56,11 @@ void AddRowMain2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArra
TSerializedCellVec{key});
}
-void AddRowMain2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
+void AddRowMainToPosting(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
ui32 dataPos)
{
+ parent = SetPostingParentFlag(parent);
+
std::array<TCell, 1> cells;
cells[0] = TCell::Make(parent);
auto pk = TSerializedCellVec::Serialize(cells);
@@ -48,9 +69,11 @@ void AddRowMain2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TAr
TSerializedCellVec{key});
}
-void AddRowBuild2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
+void AddRowBuildToBuild(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
ui32 prefixColumns)
{
+ EnsureNoPostingParentFlag(parent);
+
std::array<TCell, 1> cells;
cells[0] = TCell::Make(parent);
auto pk = TSerializedCellVec::Serialize(cells);
@@ -59,9 +82,11 @@ void AddRowBuild2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArr
TSerializedCellVec{key});
}
-void AddRowBuild2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
+void AddRowBuildToPosting(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
ui32 dataPos, ui32 prefixColumns)
{
+ parent = SetPostingParentFlag(parent);
+
std::array<TCell, 1> cells;
cells[0] = TCell::Make(parent);
auto pk = TSerializedCellVec::Serialize(cells);
diff --git a/ydb/core/tx/datashard/build_index/kmeans_helper.h b/ydb/core/tx/datashard/build_index/kmeans_helper.h
index 63de9a4d52c..90d58086340 100644
--- a/ydb/core/tx/datashard/build_index/kmeans_helper.h
+++ b/ydb/core/tx/datashard/build_index/kmeans_helper.h
@@ -44,7 +44,7 @@ Y_PURE_FUNCTION TTriWayDotProduct<TRes> CosineImpl(const ui8* lhs, const ui8* rh
return {static_cast<TRes>(ll), static_cast<TRes>(lr), static_cast<TRes>(rr)};
}
-TTableRange CreateRangeFrom(const TUserTable& table, NTableIndex::TClusterId parent, TCell& from, TCell& to);
+TTableRange CreateRangeFrom(const TUserTable& table, TClusterId parent, TCell& from, TCell& to);
NTable::TLead CreateLeadFrom(const TTableRange& range);
@@ -138,15 +138,17 @@ struct TMaxInnerProductSimilarity : TMetric<TCoord> {
}
};
-void AddRowMain2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row);
+void AddRowToLevel(TBufferData& buffer, TClusterId parent, TClusterId child, const TString& embedding, bool isPostingLevel);
-void AddRowMain2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
+void AddRowMainToBuild(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row);
+
+void AddRowMainToPosting(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
ui32 dataPos);
-void AddRowBuild2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
+void AddRowBuildToBuild(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
ui32 prefixColumns = 1);
-void AddRowBuild2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
+void AddRowBuildToPosting(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
ui32 dataPos, ui32 prefixColumns = 1);
TTags MakeUploadTags(const TUserTable& table, const TProtoStringType& embedding,
@@ -410,16 +412,16 @@ public:
return true;
}
- ui32 FindCluster(TArrayRef<const TCell> row, NTable::TPos embeddingPos)
+ std::optional<ui32> FindCluster(TArrayRef<const TCell> row, NTable::TPos embeddingPos)
{
Y_ASSERT(embeddingPos < row.size());
const auto embedding = row.at(embeddingPos).AsRef();
if (!IsExpectedSize<TCoord>(embedding, Dimensions)) {
- return Max<ui32>();
+ return {};
}
auto min = TMetric::Init();
- ui32 closest = Max<ui32>();
+ std::optional<ui32> closest = {};
for (size_t i = 0; const auto& cluster : Clusters) {
auto distance = TMetric::Distance(cluster.data(), embedding.data(), Dimensions);
if (distance < min) {
diff --git a/ydb/core/tx/datashard/build_index/local_kmeans.cpp b/ydb/core/tx/datashard/build_index/local_kmeans.cpp
index 733733c4ed4..c6d82368a64 100644
--- a/ydb/core/tx/datashard/build_index/local_kmeans.cpp
+++ b/ydb/core/tx/datashard/build_index/local_kmeans.cpp
@@ -436,48 +436,46 @@ private:
void FeedKMeans(TArrayRef<const TCell> row)
{
- if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) {
- Clusters.AggregateToCluster(pos, row.at(EmbeddingPos).Data());
+ if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) {
+ Clusters.AggregateToCluster(*pos, row.at(EmbeddingPos).Data());
}
}
void FeedUploadMain2Build(TArrayRef<const TCell> key, TArrayRef<const TCell> row)
{
- if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) {
- AddRowMain2Build(*PostingBuf, Child + pos, key, row);
+ if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) {
+ AddRowMainToBuild(*PostingBuf, Child + *pos, key, row);
}
}
void FeedUploadMain2Posting(TArrayRef<const TCell> key, TArrayRef<const TCell> row)
{
- if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) {
- AddRowMain2Posting(*PostingBuf, Child + pos, key, row, DataPos);
+ if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) {
+ AddRowMainToPosting(*PostingBuf, Child + *pos, key, row, DataPos);
}
}
void FeedUploadBuild2Build(TArrayRef<const TCell> key, TArrayRef<const TCell> row)
{
- if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) {
- AddRowBuild2Build(*PostingBuf, Child + pos, key, row);
+ if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) {
+ AddRowBuildToBuild(*PostingBuf, Child + *pos, key, row);
}
}
void FeedUploadBuild2Posting(TArrayRef<const TCell> key, TArrayRef<const TCell> row)
{
- if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) {
- AddRowBuild2Posting(*PostingBuf, Child + pos, key, row, DataPos);
+ if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) {
+ AddRowBuildToPosting(*PostingBuf, Child + *pos, key, row, DataPos);
}
}
void FormLevelRows()
{
- std::array<TCell, 2> pk;
- std::array<TCell, 1> data;
+ const bool isPostingLevel = UploadState == NKikimrTxDataShard::UPLOAD_MAIN_TO_POSTING
+ || UploadState == NKikimrTxDataShard::UPLOAD_BUILD_TO_POSTING;
+
for (NTable::TPos pos = 0; const auto& row : Clusters.GetClusters()) {
- pk[0] = TCell::Make(Parent);
- pk[1] = TCell::Make(Child + pos);
- data[0] = TCell{row};
- LevelBuf->AddRow(TSerializedCellVec{pk}, TSerializedCellVec::Serialize(data));
+ AddRowToLevel(*LevelBuf, Parent, Child + pos, row, isPostingLevel);
++pos;
}
}
diff --git a/ydb/core/tx/datashard/build_index/prefix_kmeans.cpp b/ydb/core/tx/datashard/build_index/prefix_kmeans.cpp
index d56ad48254f..aba4c4e55ec 100644
--- a/ydb/core/tx/datashard/build_index/prefix_kmeans.cpp
+++ b/ydb/core/tx/datashard/build_index/prefix_kmeans.cpp
@@ -431,34 +431,31 @@ private:
void FeedKMeans(TArrayRef<const TCell> row)
{
- if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) {
- Clusters.AggregateToCluster(pos, row.at(EmbeddingPos).Data());
+ if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) {
+ Clusters.AggregateToCluster(*pos, row.at(EmbeddingPos).Data());
}
}
void FeedUploadBuild2Build(TArrayRef<const TCell> key, TArrayRef<const TCell> row)
{
- if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) {
- AddRowBuild2Build(*PostingBuf, Child + pos, key, row, PrefixColumns);
+ if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) {
+ AddRowBuildToBuild(*PostingBuf, Child + *pos, key, row, PrefixColumns);
}
}
void FeedUploadBuild2Posting(TArrayRef<const TCell> key, TArrayRef<const TCell> row)
{
- if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) {
- AddRowBuild2Posting(*PostingBuf, Child + pos, key, row, DataPos, PrefixColumns);
+ if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) {
+ AddRowBuildToPosting(*PostingBuf, Child + *pos, key, row, DataPos, PrefixColumns);
}
}
void FormLevelRows()
{
- std::array<TCell, 2> pk;
- std::array<TCell, 1> data;
+ const bool isPostingLevel = UploadState == NKikimrTxDataShard::UPLOAD_BUILD_TO_POSTING;
+
for (NTable::TPos pos = 0; const auto& row : Clusters.GetClusters()) {
- pk[0] = TCell::Make(Parent);
- pk[1] = TCell::Make(Child + pos);
- data[0] = TCell{row};
- LevelBuf->AddRow(TSerializedCellVec{pk}, TSerializedCellVec::Serialize(data));
+ AddRowToLevel(*LevelBuf, Parent, Child + pos, row, isPostingLevel);
++pos;
}
}
diff --git a/ydb/core/tx/datashard/build_index/reshuffle_kmeans.cpp b/ydb/core/tx/datashard/build_index/reshuffle_kmeans.cpp
index ac61488159f..1c647ed6d4c 100644
--- a/ydb/core/tx/datashard/build_index/reshuffle_kmeans.cpp
+++ b/ydb/core/tx/datashard/build_index/reshuffle_kmeans.cpp
@@ -271,29 +271,29 @@ private:
void FeedUploadMain2Build(TArrayRef<const TCell> key, TArrayRef<const TCell> row)
{
- if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) {
- AddRowMain2Build(*PostingBuf, Child + pos, key, row);
+ if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) {
+ AddRowMainToBuild(*PostingBuf, Child + *pos, key, row);
}
}
void FeedUploadMain2Posting(TArrayRef<const TCell> key, TArrayRef<const TCell> row)
{
- if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) {
- AddRowMain2Posting(*PostingBuf, Child + pos, key, row, DataPos);
+ if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) {
+ AddRowMainToPosting(*PostingBuf, Child + *pos, key, row, DataPos);
}
}
void FeedUploadBuild2Build(TArrayRef<const TCell> key, TArrayRef<const TCell> row)
{
- if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) {
- AddRowBuild2Build(*PostingBuf, Child + pos, key, row);
+ if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) {
+ AddRowBuildToBuild(*PostingBuf, Child + *pos, key, row);
}
}
void FeedUploadBuild2Posting(TArrayRef<const TCell> key, TArrayRef<const TCell> row)
{
- if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos != Max<ui32>()) {
- AddRowBuild2Posting(*PostingBuf, Child + pos, key, row, DataPos);
+ if (auto pos = Clusters.FindCluster(row, EmbeddingPos); pos) {
+ AddRowBuildToPosting(*PostingBuf, Child + *pos, key, row, DataPos);
}
}
};
diff --git a/ydb/core/tx/datashard/build_index/ut/ut_local_kmeans.cpp b/ydb/core/tx/datashard/build_index/ut/ut_local_kmeans.cpp
index b8743ac4dd1..cad4285aa9e 100644
--- a/ydb/core/tx/datashard/build_index/ut/ut_local_kmeans.cpp
+++ b/ydb/core/tx/datashard/build_index/ut/ut_local_kmeans.cpp
@@ -51,7 +51,7 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) {
VectorIndexSettings settings;
settings.set_vector_dimension(2);
- settings.set_vector_type(VectorIndexSettings::VECTOR_TYPE_FLOAT);
+ settings.set_vector_type(VectorIndexSettings::VECTOR_TYPE_UINT8);
settings.set_metric(VectorIndexSettings::DISTANCE_COSINE);
*rec.MutableSettings() = settings;
@@ -235,6 +235,17 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) {
options.EnableOutOfOrder(true); // TODO(mbkkt) what is it?
options.Shards(1);
CreateMainTable(server, sender, options);
+ // Upsert some initial values
+ ExecSQL(server, sender,
+ R"(
+ UPSERT INTO `/Root/table-main`
+ (key, embedding, data)
+ VALUES )"
+ "(1, \"\x30\x30\3\", \"one\"),"
+ "(2, \"\x31\x31\3\", \"two\"),"
+ "(3, \"\x32\x32\3\", \"three\"),"
+ "(4, \"\x65\x65\3\", \"four\"),"
+ "(5, \"\x75\x75\3\", \"five\");");
DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvLocalKMeansRequest& request) {
request.SetTabletId(0);
@@ -300,6 +311,42 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) {
}, "[ { <main>: Error: Should be requested partition on at least two rows } { <main>: Error: Unknown embedding column: some } ]");
}
+ Y_UNIT_TEST (TooManyClusters) {
+ TPortManager pm;
+ TServerSettings serverSettings(pm.GetPort(2134));
+ serverSettings.SetDomainName("Root");
+
+ Tests::TServer::TPtr server = new TServer(serverSettings);
+ auto& runtime = *server->GetRuntime();
+ auto sender = runtime.AllocateEdgeActor();
+
+ runtime.SetLogPriority(NKikimrServices::TX_DATASHARD, NLog::PRI_DEBUG);
+ runtime.SetLogPriority(NKikimrServices::BUILD_INDEX, NLog::PRI_TRACE);
+
+ InitRoot(server, sender);
+
+ TShardedTableOptions options;
+ options.EnableOutOfOrder(true);
+ options.Shards(1);
+ CreateMainTable(server, sender, options);
+ // Upsert some initial values
+ ExecSQL(server, sender,
+ R"(
+ UPSERT INTO `/Root/table-main`
+ (key, embedding, data)
+ VALUES )"
+ "(1, \"\x30\x30\3\", \"one\"),"
+ "(2, \"\x31\x31\3\", \"two\"),"
+ "(3, \"\x32\x32\3\", \"three\"),"
+ "(4, \"\x65\x65\3\", \"four\"),"
+ "(5, \"\x75\x75\3\", \"five\");");
+
+ // TODO: https://github.com/ydb-platform/ydb/issues/18656
+ // DoBadRequest(server, sender, [](NKikimrTxDataShard::TEvLocalKMeansRequest& request) {
+ // request.SetChild(Max<ui64>() - 100);
+ // }, TStringBuilder() << "");
+ }
+
Y_UNIT_TEST (MainToPosting) {
TPortManager pm;
TServerSettings serverSettings(pm.GetPort(2134));
@@ -350,13 +397,13 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) {
auto [level, posting] = DoLocalKMeans(server, sender, 0, 0, seed, k,
NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING,
VectorIndexSettings::VECTOR_TYPE_UINT8, distance);
- UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 0, __ydb_id = 1, __ydb_centroid = mm\3\n"
- "__ydb_parent = 0, __ydb_id = 2, __ydb_centroid = 11\3\n");
- UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 1, key = 4, data = four\n"
- "__ydb_parent = 1, key = 5, data = five\n"
- "__ydb_parent = 2, key = 1, data = one\n"
- "__ydb_parent = 2, key = 2, data = two\n"
- "__ydb_parent = 2, key = 3, data = three\n");
+ UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 0, __ydb_id = 9223372036854775809, __ydb_centroid = mm\3\n"
+ "__ydb_parent = 0, __ydb_id = 9223372036854775810, __ydb_centroid = 11\3\n");
+ UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775809, key = 4, data = four\n"
+ "__ydb_parent = 9223372036854775809, key = 5, data = five\n"
+ "__ydb_parent = 9223372036854775810, key = 1, data = one\n"
+ "__ydb_parent = 9223372036854775810, key = 2, data = two\n"
+ "__ydb_parent = 9223372036854775810, key = 3, data = three\n");
recreate();
}
@@ -365,13 +412,13 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) {
auto [level, posting] = DoLocalKMeans(server, sender, 0, 0, seed, k,
NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING,
VectorIndexSettings::VECTOR_TYPE_UINT8, distance);
- UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 0, __ydb_id = 1, __ydb_centroid = 11\3\n"
- "__ydb_parent = 0, __ydb_id = 2, __ydb_centroid = mm\3\n");
- UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 1, key = 1, data = one\n"
- "__ydb_parent = 1, key = 2, data = two\n"
- "__ydb_parent = 1, key = 3, data = three\n"
- "__ydb_parent = 2, key = 4, data = four\n"
- "__ydb_parent = 2, key = 5, data = five\n");
+ UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 0, __ydb_id = 9223372036854775809, __ydb_centroid = 11\3\n"
+ "__ydb_parent = 0, __ydb_id = 9223372036854775810, __ydb_centroid = mm\3\n");
+ UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775809, key = 1, data = one\n"
+ "__ydb_parent = 9223372036854775809, key = 2, data = two\n"
+ "__ydb_parent = 9223372036854775809, key = 3, data = three\n"
+ "__ydb_parent = 9223372036854775810, key = 4, data = four\n"
+ "__ydb_parent = 9223372036854775810, key = 5, data = five\n");
recreate();
}
seed = 32;
@@ -381,12 +428,12 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) {
auto [level, posting] = DoLocalKMeans(server, sender, 0, 0, seed, k,
NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING,
VectorIndexSettings::VECTOR_TYPE_UINT8, similarity);
- UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 0, __ydb_id = 1, __ydb_centroid = II\3\n");
- UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 1, key = 1, data = one\n"
- "__ydb_parent = 1, key = 2, data = two\n"
- "__ydb_parent = 1, key = 3, data = three\n"
- "__ydb_parent = 1, key = 4, data = four\n"
- "__ydb_parent = 1, key = 5, data = five\n");
+ UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 0, __ydb_id = 9223372036854775809, __ydb_centroid = II\3\n");
+ UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775809, key = 1, data = one\n"
+ "__ydb_parent = 9223372036854775809, key = 2, data = two\n"
+ "__ydb_parent = 9223372036854775809, key = 3, data = three\n"
+ "__ydb_parent = 9223372036854775809, key = 4, data = four\n"
+ "__ydb_parent = 9223372036854775809, key = 5, data = five\n");
recreate();
}
}
@@ -532,30 +579,30 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) {
seed = 0;
for (auto distance : {VectorIndexSettings::DISTANCE_MANHATTAN, VectorIndexSettings::DISTANCE_EUCLIDEAN}) {
auto [level, posting] = DoLocalKMeans(server, sender, 40, 40, seed, k,
- NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING,
- VectorIndexSettings::VECTOR_TYPE_UINT8, distance);
- UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 40, __ydb_id = 41, __ydb_centroid = mm\3\n"
- "__ydb_parent = 40, __ydb_id = 42, __ydb_centroid = 11\3\n");
- UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 41, key = 4, data = four\n"
- "__ydb_parent = 41, key = 5, data = five\n"
- "__ydb_parent = 42, key = 1, data = one\n"
- "__ydb_parent = 42, key = 2, data = two\n"
- "__ydb_parent = 42, key = 3, data = three\n");
+ NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING,
+ VectorIndexSettings::VECTOR_TYPE_UINT8, distance);
+ UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 40, __ydb_id = 9223372036854775849, __ydb_centroid = mm\3\n"
+ "__ydb_parent = 40, __ydb_id = 9223372036854775850, __ydb_centroid = 11\3\n");
+ UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775849, key = 4, data = four\n"
+ "__ydb_parent = 9223372036854775849, key = 5, data = five\n"
+ "__ydb_parent = 9223372036854775850, key = 1, data = one\n"
+ "__ydb_parent = 9223372036854775850, key = 2, data = two\n"
+ "__ydb_parent = 9223372036854775850, key = 3, data = three\n");
recreate();
}
seed = 111;
for (auto distance : {VectorIndexSettings::DISTANCE_MANHATTAN, VectorIndexSettings::DISTANCE_EUCLIDEAN}) {
auto [level, posting] = DoLocalKMeans(server, sender, 40, 40, seed, k,
- NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING,
- VectorIndexSettings::VECTOR_TYPE_UINT8, distance);
- UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 40, __ydb_id = 41, __ydb_centroid = 11\3\n"
- "__ydb_parent = 40, __ydb_id = 42, __ydb_centroid = mm\3\n");
- UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 41, key = 1, data = one\n"
- "__ydb_parent = 41, key = 2, data = two\n"
- "__ydb_parent = 41, key = 3, data = three\n"
- "__ydb_parent = 42, key = 4, data = four\n"
- "__ydb_parent = 42, key = 5, data = five\n");
+ NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING,
+ VectorIndexSettings::VECTOR_TYPE_UINT8, distance);
+ UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 40, __ydb_id = 9223372036854775849, __ydb_centroid = 11\3\n"
+ "__ydb_parent = 40, __ydb_id = 9223372036854775850, __ydb_centroid = mm\3\n");
+ UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775849, key = 1, data = one\n"
+ "__ydb_parent = 9223372036854775849, key = 2, data = two\n"
+ "__ydb_parent = 9223372036854775849, key = 3, data = three\n"
+ "__ydb_parent = 9223372036854775850, key = 4, data = four\n"
+ "__ydb_parent = 9223372036854775850, key = 5, data = five\n");
recreate();
}
seed = 32;
@@ -563,14 +610,14 @@ Y_UNIT_TEST_SUITE(TTxDataShardLocalKMeansScan) {
VectorIndexSettings::DISTANCE_COSINE})
{
auto [level, posting] = DoLocalKMeans(server, sender, 40, 40, seed, k,
- NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING,
- VectorIndexSettings::VECTOR_TYPE_UINT8, similarity);
- UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 40, __ydb_id = 41, __ydb_centroid = II\3\n");
- UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 41, key = 1, data = one\n"
- "__ydb_parent = 41, key = 2, data = two\n"
- "__ydb_parent = 41, key = 3, data = three\n"
- "__ydb_parent = 41, key = 4, data = four\n"
- "__ydb_parent = 41, key = 5, data = five\n");
+ NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING,
+ VectorIndexSettings::VECTOR_TYPE_UINT8, similarity);
+ UNIT_ASSERT_VALUES_EQUAL(level, "__ydb_parent = 40, __ydb_id = 9223372036854775849, __ydb_centroid = II\3\n");
+ UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775849, key = 1, data = one\n"
+ "__ydb_parent = 9223372036854775849, key = 2, data = two\n"
+ "__ydb_parent = 9223372036854775849, key = 3, data = three\n"
+ "__ydb_parent = 9223372036854775849, key = 4, data = four\n"
+ "__ydb_parent = 9223372036854775849, key = 5, data = five\n");
recreate();
}
}
diff --git a/ydb/core/tx/datashard/build_index/ut/ut_prefix_kmeans.cpp b/ydb/core/tx/datashard/build_index/ut/ut_prefix_kmeans.cpp
index 2f7afaa6527..b18d51902e2 100644
--- a/ydb/core/tx/datashard/build_index/ut/ut_prefix_kmeans.cpp
+++ b/ydb/core/tx/datashard/build_index/ut/ut_prefix_kmeans.cpp
@@ -373,24 +373,24 @@ Y_UNIT_TEST_SUITE (TTxDataShardPrefixKMeansScan) {
"user = user-2, __ydb_id = 43\n"
);
UNIT_ASSERT_VALUES_EQUAL(level,
- "__ydb_parent = 40, __ydb_id = 41, __ydb_centroid = mm\3\n"
- "__ydb_parent = 40, __ydb_id = 42, __ydb_centroid = 11\3\n"
+ "__ydb_parent = 40, __ydb_id = 9223372036854775849, __ydb_centroid = mm\3\n"
+ "__ydb_parent = 40, __ydb_id = 9223372036854775850, __ydb_centroid = 11\3\n"
- "__ydb_parent = 43, __ydb_id = 44, __ydb_centroid = 11\3\n"
- "__ydb_parent = 43, __ydb_id = 45, __ydb_centroid = mm\3\n"
+ "__ydb_parent = 43, __ydb_id = 9223372036854775852, __ydb_centroid = 11\3\n"
+ "__ydb_parent = 43, __ydb_id = 9223372036854775853, __ydb_centroid = mm\3\n"
);
UNIT_ASSERT_VALUES_EQUAL(posting,
- "__ydb_parent = 41, key = 14, data = 1-four\n"
- "__ydb_parent = 41, key = 15, data = 1-five\n"
- "__ydb_parent = 42, key = 11, data = 1-one\n"
- "__ydb_parent = 42, key = 12, data = 1-two\n"
- "__ydb_parent = 42, key = 13, data = 1-three\n"
-
- "__ydb_parent = 44, key = 21, data = 2-one\n"
- "__ydb_parent = 44, key = 22, data = 2-two\n"
- "__ydb_parent = 44, key = 23, data = 2-three\n"
- "__ydb_parent = 45, key = 24, data = 2-four\n"
- "__ydb_parent = 45, key = 25, data = 2-five\n"
+ "__ydb_parent = 9223372036854775849, key = 14, data = 1-four\n"
+ "__ydb_parent = 9223372036854775849, key = 15, data = 1-five\n"
+ "__ydb_parent = 9223372036854775850, key = 11, data = 1-one\n"
+ "__ydb_parent = 9223372036854775850, key = 12, data = 1-two\n"
+ "__ydb_parent = 9223372036854775850, key = 13, data = 1-three\n"
+
+ "__ydb_parent = 9223372036854775852, key = 21, data = 2-one\n"
+ "__ydb_parent = 9223372036854775852, key = 22, data = 2-two\n"
+ "__ydb_parent = 9223372036854775852, key = 23, data = 2-three\n"
+ "__ydb_parent = 9223372036854775853, key = 24, data = 2-four\n"
+ "__ydb_parent = 9223372036854775853, key = 25, data = 2-five\n"
);
recreate();
}}
@@ -407,24 +407,24 @@ Y_UNIT_TEST_SUITE (TTxDataShardPrefixKMeansScan) {
"user = user-2, __ydb_id = 43\n"
);
UNIT_ASSERT_VALUES_EQUAL(level,
- "__ydb_parent = 40, __ydb_id = 41, __ydb_centroid = 11\3\n"
- "__ydb_parent = 40, __ydb_id = 42, __ydb_centroid = mm\3\n"
+ "__ydb_parent = 40, __ydb_id = 9223372036854775849, __ydb_centroid = 11\3\n"
+ "__ydb_parent = 40, __ydb_id = 9223372036854775850, __ydb_centroid = mm\3\n"
- "__ydb_parent = 43, __ydb_id = 44, __ydb_centroid = 11\3\n"
- "__ydb_parent = 43, __ydb_id = 45, __ydb_centroid = mm\3\n"
+ "__ydb_parent = 43, __ydb_id = 9223372036854775852, __ydb_centroid = 11\3\n"
+ "__ydb_parent = 43, __ydb_id = 9223372036854775853, __ydb_centroid = mm\3\n"
);
UNIT_ASSERT_VALUES_EQUAL(posting,
- "__ydb_parent = 41, key = 11, data = 1-one\n"
- "__ydb_parent = 41, key = 12, data = 1-two\n"
- "__ydb_parent = 41, key = 13, data = 1-three\n"
- "__ydb_parent = 42, key = 14, data = 1-four\n"
- "__ydb_parent = 42, key = 15, data = 1-five\n"
-
- "__ydb_parent = 44, key = 21, data = 2-one\n"
- "__ydb_parent = 44, key = 22, data = 2-two\n"
- "__ydb_parent = 44, key = 23, data = 2-three\n"
- "__ydb_parent = 45, key = 24, data = 2-four\n"
- "__ydb_parent = 45, key = 25, data = 2-five\n"
+ "__ydb_parent = 9223372036854775849, key = 11, data = 1-one\n"
+ "__ydb_parent = 9223372036854775849, key = 12, data = 1-two\n"
+ "__ydb_parent = 9223372036854775849, key = 13, data = 1-three\n"
+ "__ydb_parent = 9223372036854775850, key = 14, data = 1-four\n"
+ "__ydb_parent = 9223372036854775850, key = 15, data = 1-five\n"
+
+ "__ydb_parent = 9223372036854775852, key = 21, data = 2-one\n"
+ "__ydb_parent = 9223372036854775852, key = 22, data = 2-two\n"
+ "__ydb_parent = 9223372036854775852, key = 23, data = 2-three\n"
+ "__ydb_parent = 9223372036854775853, key = 24, data = 2-four\n"
+ "__ydb_parent = 9223372036854775853, key = 25, data = 2-five\n"
);
recreate();
}}
@@ -440,22 +440,22 @@ Y_UNIT_TEST_SUITE (TTxDataShardPrefixKMeansScan) {
"user = user-2, __ydb_id = 43\n"
);
UNIT_ASSERT_VALUES_EQUAL(level,
- "__ydb_parent = 40, __ydb_id = 41, __ydb_centroid = II\3\n"
+ "__ydb_parent = 40, __ydb_id = 9223372036854775849, __ydb_centroid = II\3\n"
- "__ydb_parent = 43, __ydb_id = 44, __ydb_centroid = II\3\n"
+ "__ydb_parent = 43, __ydb_id = 9223372036854775852, __ydb_centroid = II\3\n"
);
UNIT_ASSERT_VALUES_EQUAL(posting,
- "__ydb_parent = 41, key = 11, data = 1-one\n"
- "__ydb_parent = 41, key = 12, data = 1-two\n"
- "__ydb_parent = 41, key = 13, data = 1-three\n"
- "__ydb_parent = 41, key = 14, data = 1-four\n"
- "__ydb_parent = 41, key = 15, data = 1-five\n"
-
- "__ydb_parent = 44, key = 21, data = 2-one\n"
- "__ydb_parent = 44, key = 22, data = 2-two\n"
- "__ydb_parent = 44, key = 23, data = 2-three\n"
- "__ydb_parent = 44, key = 24, data = 2-four\n"
- "__ydb_parent = 44, key = 25, data = 2-five\n"
+ "__ydb_parent = 9223372036854775849, key = 11, data = 1-one\n"
+ "__ydb_parent = 9223372036854775849, key = 12, data = 1-two\n"
+ "__ydb_parent = 9223372036854775849, key = 13, data = 1-three\n"
+ "__ydb_parent = 9223372036854775849, key = 14, data = 1-four\n"
+ "__ydb_parent = 9223372036854775849, key = 15, data = 1-five\n"
+
+ "__ydb_parent = 9223372036854775852, key = 21, data = 2-one\n"
+ "__ydb_parent = 9223372036854775852, key = 22, data = 2-two\n"
+ "__ydb_parent = 9223372036854775852, key = 23, data = 2-three\n"
+ "__ydb_parent = 9223372036854775852, key = 24, data = 2-four\n"
+ "__ydb_parent = 9223372036854775852, key = 25, data = 2-five\n"
);
recreate();
}}
diff --git a/ydb/core/tx/datashard/build_index/ut/ut_reshuffle_kmeans.cpp b/ydb/core/tx/datashard/build_index/ut/ut_reshuffle_kmeans.cpp
index 9c8212987d0..ea77d33aadc 100644
--- a/ydb/core/tx/datashard/build_index/ut/ut_reshuffle_kmeans.cpp
+++ b/ydb/core/tx/datashard/build_index/ut/ut_reshuffle_kmeans.cpp
@@ -145,6 +145,8 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) {
}
auto posting = ReadShardedTable(server, kPostingTable);
+ Cerr << "Posting:" << Endl;
+ Cerr << posting << Endl;
return std::move(posting);
}
@@ -305,13 +307,13 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) {
"11\3",
};
auto posting = DoReshuffleKMeans(server, sender, 0, level,
- NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING,
- VectorIndexSettings::VECTOR_TYPE_UINT8, distance);
- UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 1, key = 4, data = four\n"
- "__ydb_parent = 1, key = 5, data = five\n"
- "__ydb_parent = 2, key = 1, data = one\n"
- "__ydb_parent = 2, key = 2, data = two\n"
- "__ydb_parent = 2, key = 3, data = three\n");
+ NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING,
+ VectorIndexSettings::VECTOR_TYPE_UINT8, distance);
+ UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775809, key = 4, data = four\n"
+ "__ydb_parent = 9223372036854775809, key = 5, data = five\n"
+ "__ydb_parent = 9223372036854775810, key = 1, data = one\n"
+ "__ydb_parent = 9223372036854775810, key = 2, data = two\n"
+ "__ydb_parent = 9223372036854775810, key = 3, data = three\n");
recreate();
}
for (auto distance : {VectorIndexSettings::DISTANCE_MANHATTAN, VectorIndexSettings::DISTANCE_EUCLIDEAN}) {
@@ -320,13 +322,13 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) {
"mm\3",
};
auto posting = DoReshuffleKMeans(server, sender, 0, level,
- NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING,
- VectorIndexSettings::VECTOR_TYPE_UINT8, distance);
- UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 1, key = 1, data = one\n"
- "__ydb_parent = 1, key = 2, data = two\n"
- "__ydb_parent = 1, key = 3, data = three\n"
- "__ydb_parent = 2, key = 4, data = four\n"
- "__ydb_parent = 2, key = 5, data = five\n");
+ NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING,
+ VectorIndexSettings::VECTOR_TYPE_UINT8, distance);
+ UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775809, key = 1, data = one\n"
+ "__ydb_parent = 9223372036854775809, key = 2, data = two\n"
+ "__ydb_parent = 9223372036854775809, key = 3, data = three\n"
+ "__ydb_parent = 9223372036854775810, key = 4, data = four\n"
+ "__ydb_parent = 9223372036854775810, key = 5, data = five\n");
recreate();
}
for (auto similarity : {VectorIndexSettings::SIMILARITY_INNER_PRODUCT, VectorIndexSettings::SIMILARITY_COSINE,
@@ -336,13 +338,13 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) {
"II\3",
};
auto posting = DoReshuffleKMeans(server, sender, 0, level,
- NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING,
- VectorIndexSettings::VECTOR_TYPE_UINT8, similarity);
- UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 1, key = 1, data = one\n"
- "__ydb_parent = 1, key = 2, data = two\n"
- "__ydb_parent = 1, key = 3, data = three\n"
- "__ydb_parent = 1, key = 4, data = four\n"
- "__ydb_parent = 1, key = 5, data = five\n");
+ NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING,
+ VectorIndexSettings::VECTOR_TYPE_UINT8, similarity);
+ UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775809, key = 1, data = one\n"
+ "__ydb_parent = 9223372036854775809, key = 2, data = two\n"
+ "__ydb_parent = 9223372036854775809, key = 3, data = three\n"
+ "__ydb_parent = 9223372036854775809, key = 4, data = four\n"
+ "__ydb_parent = 9223372036854775809, key = 5, data = five\n");
recreate();
}
}
@@ -479,13 +481,13 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) {
"11\3",
};
auto posting = DoReshuffleKMeans(server, sender, 40, level,
- NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING,
- VectorIndexSettings::VECTOR_TYPE_UINT8, distance);
- UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 41, key = 4, data = four\n"
- "__ydb_parent = 41, key = 5, data = five\n"
- "__ydb_parent = 42, key = 1, data = one\n"
- "__ydb_parent = 42, key = 2, data = two\n"
- "__ydb_parent = 42, key = 3, data = three\n");
+ NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING,
+ VectorIndexSettings::VECTOR_TYPE_UINT8, distance);
+ UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775849, key = 4, data = four\n"
+ "__ydb_parent = 9223372036854775849, key = 5, data = five\n"
+ "__ydb_parent = 9223372036854775850, key = 1, data = one\n"
+ "__ydb_parent = 9223372036854775850, key = 2, data = two\n"
+ "__ydb_parent = 9223372036854775850, key = 3, data = three\n");
recreate();
}
for (auto distance : {VectorIndexSettings::DISTANCE_MANHATTAN, VectorIndexSettings::DISTANCE_EUCLIDEAN}) {
@@ -494,13 +496,13 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) {
"mm\3",
};
auto posting = DoReshuffleKMeans(server, sender, 40, level,
- NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING,
- VectorIndexSettings::VECTOR_TYPE_UINT8, distance);
- UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 41, key = 1, data = one\n"
- "__ydb_parent = 41, key = 2, data = two\n"
- "__ydb_parent = 41, key = 3, data = three\n"
- "__ydb_parent = 42, key = 4, data = four\n"
- "__ydb_parent = 42, key = 5, data = five\n");
+ NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING,
+ VectorIndexSettings::VECTOR_TYPE_UINT8, distance);
+ UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775849, key = 1, data = one\n"
+ "__ydb_parent = 9223372036854775849, key = 2, data = two\n"
+ "__ydb_parent = 9223372036854775849, key = 3, data = three\n"
+ "__ydb_parent = 9223372036854775850, key = 4, data = four\n"
+ "__ydb_parent = 9223372036854775850, key = 5, data = five\n");
recreate();
}
for (auto similarity : {VectorIndexSettings::SIMILARITY_INNER_PRODUCT, VectorIndexSettings::SIMILARITY_COSINE,
@@ -510,13 +512,13 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) {
"II\3",
};
auto posting = DoReshuffleKMeans(server, sender, 40, level,
- NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING,
- VectorIndexSettings::VECTOR_TYPE_UINT8, similarity);
- UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 41, key = 1, data = one\n"
- "__ydb_parent = 41, key = 2, data = two\n"
- "__ydb_parent = 41, key = 3, data = three\n"
- "__ydb_parent = 41, key = 4, data = four\n"
- "__ydb_parent = 41, key = 5, data = five\n");
+ NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING,
+ VectorIndexSettings::VECTOR_TYPE_UINT8, similarity);
+ UNIT_ASSERT_VALUES_EQUAL(posting, "__ydb_parent = 9223372036854775849, key = 1, data = one\n"
+ "__ydb_parent = 9223372036854775849, key = 2, data = two\n"
+ "__ydb_parent = 9223372036854775849, key = 3, data = three\n"
+ "__ydb_parent = 9223372036854775849, key = 4, data = four\n"
+ "__ydb_parent = 9223372036854775849, key = 5, data = five\n");
recreate();
}
}
diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp
index 5a700bcd2e6..6d55717ed0d 100644
--- a/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp
@@ -45,12 +45,13 @@ class TUploadSampleK: public TActorBootstrapped<TUploadSampleK> {
protected:
TString LogPrefix;
- TString TargetTable;
+ const TString TargetTable;
+ const bool IsPostingLevel;
const NKikimrIndexBuilder::TIndexBuildScanSettings ScanSettings;
- TActorId ResponseActorId;
- ui64 BuildIndexId = 0;
+ const TActorId ResponseActorId;
+ const ui64 BuildIndexId = 0;
TIndexBuildInfo::TSample::TRows Init;
std::shared_ptr<NTxProxy::TUploadTypes> Types;
@@ -59,13 +60,14 @@ protected:
TActorId Uploader;
ui32 RetryCount = 0;
ui32 RowsBytes = 0;
- NTableIndex::TClusterId Parent = 0;
+ const NTableIndex::TClusterId Parent = 0;
NTableIndex::TClusterId Child = 0;
NDataShard::TUploadStatus UploadStatus;
public:
TUploadSampleK(TString targetTable,
+ bool isPostingLevel,
const NKikimrIndexBuilder::TIndexBuildScanSettings& scanSettings,
const TActorId& responseActorId,
ui64 buildIndexId,
@@ -73,6 +75,7 @@ public:
NTableIndex::TClusterId parent,
NTableIndex::TClusterId child)
: TargetTable(std::move(targetTable))
+ , IsPostingLevel(isPostingLevel)
, ScanSettings(scanSettings)
, ResponseActorId(responseActorId)
, BuildIndexId(buildIndexId)
@@ -108,16 +111,23 @@ public:
void Bootstrap() {
Rows = std::make_shared<NTxProxy::TUploadRows>();
Rows->reserve(Init.size());
- std::array<TCell, 2> PrimaryKeys;
- PrimaryKeys[0] = TCell::Make(Parent);
+ std::array<TCell, 2> pk;
+ pk[0] = TCell::Make(Parent);
for (auto& [_, row] : Init) {
RowsBytes += row.size();
- PrimaryKeys[1] = TCell::Make(Child++);
+ auto child = Child++;
+ if (IsPostingLevel) {
+ child = SetPostingParentFlag(child);
+ } else {
+ EnsureNoPostingParentFlag(child);
+ }
+ pk[1] = TCell::Make(child);
+
// TODO(mbkkt) we can avoid serialization of PrimaryKeys every iter
- Rows->emplace_back(TSerializedCellVec{PrimaryKeys}, std::move(row));
+ Rows->emplace_back(TSerializedCellVec{pk}, std::move(row));
}
Init = {}; // release memory
- RowsBytes += Rows->size() * TSerializedCellVec::SerializedSize(PrimaryKeys);
+ RowsBytes += Rows->size() * TSerializedCellVec::SerializedSize(pk);
Types = std::make_shared<NTxProxy::TUploadTypes>(3);
Ydb::Type type;
@@ -748,12 +758,14 @@ private:
buildInfo.Sample.MakeStrictTop(buildInfo.KMeans.K);
auto path = GetBuildPath(Self, buildInfo, NTableIndex::NTableVectorKmeansTreeIndex::LevelTable);
Y_ASSERT(buildInfo.Sample.Rows.size() <= buildInfo.KMeans.K);
- auto actor = new TUploadSampleK(path.PathString(),
+ auto actor = new TUploadSampleK(path.PathString(), !buildInfo.KMeans.NeedsAnotherLevel(),
buildInfo.ScanSettings, Self->SelfId(), ui64(BuildId),
buildInfo.Sample.Rows, buildInfo.KMeans.Parent, buildInfo.KMeans.Child);
TActivationContext::AsActorContext().MakeFor(Self->SelfId()).Register(actor);
buildInfo.Sample.State = TIndexBuildInfo::TSample::EState::Upload;
+
+ LOG_N("TTxBuildProgress: TUploadSampleK: " << buildInfo);
}
void ClearAfterFill(const TActorContext& ctx, TIndexBuildInfo& buildInfo) {
diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.h b/ydb/core/tx/schemeshard/schemeshard_info_types.h
index 2a3c875a4a7..9f6483fbd2e 100644
--- a/ydb/core/tx/schemeshard/schemeshard_info_types.h
+++ b/ydb/core/tx/schemeshard/schemeshard_info_types.h
@@ -56,6 +56,7 @@
namespace NKikimr {
namespace NSchemeShard {
+using namespace NTableIndex;
class TSchemeShard;