summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorrisenberg <[email protected]>2026-04-22 14:31:31 +0300
committerGitHub <[email protected]>2026-04-22 14:31:31 +0300
commit59554ea611ced63df1bb2ec8b1df46cf83d30ecc (patch)
treec27a6ff2799df369fb9a713a4a84ae7f96cf580e
parent23b5824fe3ca7745558138234adb49b1c2cef7fb (diff)
Speed up building bloom_ngram index (#36689)
-rw-r--r--ydb/core/tx/columnshard/blobs_action/abstract/ya.make1
-rw-r--r--ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/abstract.h8
-rw-r--r--ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.cpp37
-rw-r--r--ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.h81
-rw-r--r--ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.cpp5
-rw-r--r--ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.h3
-rw-r--r--ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.cpp4
-rw-r--r--ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.h1
-rw-r--r--ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/bits_storage_ut.cpp86
-rw-r--r--ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ut_bits_storage.cpp49
-rw-r--r--ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ya.make9
-rw-r--r--ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ya.make1
-rw-r--r--ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h5
-rw-r--r--ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp172
-rw-r--r--ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/ya.make3
-rw-r--r--ydb/core/tx/columnshard/engines/storage/indexes/skip_index/ya.make1
16 files changed, 266 insertions, 200 deletions
diff --git a/ydb/core/tx/columnshard/blobs_action/abstract/ya.make b/ydb/core/tx/columnshard/blobs_action/abstract/ya.make
index 1ce1d711629..d53a409da29 100644
--- a/ydb/core/tx/columnshard/blobs_action/abstract/ya.make
+++ b/ydb/core/tx/columnshard/blobs_action/abstract/ya.make
@@ -23,6 +23,7 @@ PEERDIR(
ydb/core/tx/columnshard/data_sharing/protos
ydb/core/tx/columnshard/blobs_action/events
ydb/core/tx/columnshard/blobs_action/protos
+ ydb/core/tx/columnshard/blobs_action/counters
)
END()
diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/abstract.h b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/abstract.h
index d23651c6301..b45f6c0c485 100644
--- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/abstract.h
+++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/abstract.h
@@ -1,8 +1,12 @@
#pragma once
+
+#include "array_power2.h"
+
#include <ydb/library/conclusion/result.h>
#include <ydb/library/conclusion/status.h>
#include <library/cpp/object_factory/object_factory.h>
+
#include <util/generic/bitmap.h>
#include <util/generic/string.h>
@@ -33,6 +37,7 @@ public:
class IBitsStorageConstructor {
private:
virtual TString DoSerializeToString(TDynBitMap&& bm) const = 0;
+ virtual TString DoSerializeToString(const TArrayPower2BitsStorage& storage) const = 0;
virtual TConclusion<std::shared_ptr<IBitsStorageViewer>> DoRestore(const TString& data) const = 0;
public:
@@ -44,6 +49,9 @@ public:
[[nodiscard]] TString SerializeToString(TDynBitMap&& bm) const {
return DoSerializeToString(std::move(bm));
}
+ [[nodiscard]] TString SerializeToString(const TArrayPower2BitsStorage& storage) const {
+ return DoSerializeToString(storage);
+ }
[[nodiscard]] TConclusion<std::shared_ptr<IBitsStorageViewer>> Restore(const TString& data) const {
return DoRestore(data);
diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.cpp
new file mode 100644
index 00000000000..b0308497693
--- /dev/null
+++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.cpp
@@ -0,0 +1,37 @@
+#include "array_power2.h"
+
+#include <util/stream/output.h>
+#include <util/stream/str.h>
+#include <util/ysaveload.h>
+
+namespace NKikimr::NOlap::NIndexes {
+
+TString TArrayPower2BitsStorage::SerializeDynBitMapCompatible() const {
+ // TBitSetStorage requires serialization format of TDynBitMap, so reproduce it here
+ TString result;
+ result.reserve(DataSize * sizeof(ui64) + 2 * sizeof(ui64));
+ TStringOutput out(result);
+
+ ::Save(&out, ui8(sizeof(ui64)));
+ ::Save(&out, ui64(BitSize()));
+ ::SavePodArray(&out, Data.get(), DataSize);
+ return result;
+}
+
+TArrayPower2BitsStorage TArrayPower2BitsStorage::Fold(ui32 times) const {
+ Y_ABORT_UNLESS(std::has_single_bit(times));
+ Y_ABORT_UNLESS(times < DataSize);
+ auto newDataSize = DataSize / times;
+ std::unique_ptr<ui64[]> newData(new ui64[newDataSize]);
+ std::fill(newData.get(), newData.get() + newDataSize, 0);
+ for (size_t i = 0; i < times; ++i) {
+ auto start = newDataSize * i;
+ for (size_t j = 0; j < newDataSize; ++j) {
+ newData[j] |= Data[start + j];
+ }
+ }
+
+ return TArrayPower2BitsStorage(newDataSize, std::move(newData));
+}
+
+} // namespace NKikimr::NOlap::NIndexes
diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.h b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.h
new file mode 100644
index 00000000000..9822dcfb009
--- /dev/null
+++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.h
@@ -0,0 +1,81 @@
+#pragma once
+
+#include <util/generic/string.h>
+#include <util/system/yassert.h>
+
+#include <algorithm>
+#include <bit>
+#include <climits>
+#include <memory>
+
+namespace NKikimr::NOlap::NIndexes {
+
+// Bitset builder over an array with size a power of 2. In comparison to util/generic/bitmap.h impls it simultaneously
+// * does absolute minimum of operations on insertion
+// * has runtime-defined size
+class TArrayPower2BitsStorage {
+private:
+ const ui32 DataSize;
+ // ui64[] is slightly faster than ui8[], perhaps because there is no conversion inside operator()
+ std::unique_ptr<ui64[]> Data;
+ const ui32 SizeMask;
+ static_assert(CHAR_BIT == 8);
+ static constexpr ui64 BitsInItem = sizeof(ui64) * CHAR_BIT;
+ static constexpr ui64 ItemMask = BitsInItem - 1;
+ static constexpr ui64 SizeShift = std::popcount(ItemMask);
+
+ TArrayPower2BitsStorage(ui32 dataSize, std::unique_ptr<ui64[]> data)
+ : DataSize(dataSize)
+ , Data(std::move(data))
+ , SizeMask(DataSize - 1) {
+ Y_ABORT_UNLESS(std::has_single_bit(dataSize));
+ }
+
+ Y_FORCE_INLINE ui64 ItemMaskForHash(ui64 hash) const {
+ return (ui64{1} << (hash & ItemMask));
+ }
+
+ Y_FORCE_INLINE ui64 IndexForHash(ui64 hash) const {
+ return (hash >> SizeShift) & SizeMask;
+ }
+
+public:
+ TArrayPower2BitsStorage(ui32 bitSize)
+ : DataSize(bitSize / BitsInItem)
+ , Data(new ui64[DataSize])
+ , SizeMask(DataSize - 1) {
+ Y_ABORT_UNLESS(bitSize >= BitsInItem);
+ std::fill(Data.get(), Data.get() + DataSize, 0);
+ Y_ABORT_UNLESS(std::has_single_bit(bitSize));
+ }
+
+ TString SerializeToString() const {
+ return TString(reinterpret_cast<char*>(Data.get()), DataSize * sizeof(ui64));
+ }
+
+ TString SerializeDynBitMapCompatible() const;
+
+ Y_FORCE_INLINE void operator()(const ui64 hash) {
+ Data[IndexForHash(hash)] |= ItemMaskForHash(hash);
+ }
+
+ bool Get(const ui64 hash) const {
+ return Data[IndexForHash(hash)] & ItemMaskForHash(hash);
+ }
+
+ ui32 CountSetBits() const {
+ ui32 result = 0;
+ for (size_t i = 0; i < DataSize; ++i) {
+ result += std::popcount(Data[i]);
+ }
+ return result;
+ }
+
+ ui32 BitSize() const {
+ return DataSize * BitsInItem;
+ }
+
+ TArrayPower2BitsStorage Fold(ui32 times) const;
+};
+
+} // namespace NKikimr::NOlap::NIndexes
diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.cpp
index d5ca883a19a..f812ef3afec 100644
--- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.cpp
+++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.cpp
@@ -14,6 +14,11 @@ TString TBitSetStorageConstructor::DoSerializeToString(TDynBitMap&& bm) const {
return result;
}
+
+TString TBitSetStorageConstructor::DoSerializeToString(const TArrayPower2BitsStorage& storage) const {
+ return storage.SerializeDynBitMapCompatible();
+}
+
TConclusion<std::shared_ptr<IBitsStorageViewer>> TBitSetStorageConstructor::DoRestore(const TString& data) const {
try {
TStringInput input(data);
diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.h b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.h
index e9affb2b856..18bb88fefc1 100644
--- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.h
+++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.h
@@ -29,7 +29,8 @@ public:
}
private:
- virtual TString DoSerializeToString(TDynBitMap&& bm) const override;
+ virtual TString DoSerializeToString(TDynBitMap&& bitsVector) const override;
+ virtual TString DoSerializeToString(const TArrayPower2BitsStorage& storage) const override;
virtual TConclusion<std::shared_ptr<IBitsStorageViewer>> DoRestore(const TString& data) const override;
static inline const auto Registrator = TFactory::TRegistrator<TBitSetStorageConstructor>(GetClassNameStatic());
diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.cpp
index a416a43d8a5..2a40e5b40fe 100644
--- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.cpp
+++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.cpp
@@ -41,4 +41,8 @@ TString TFixStringBitsStorageConstructor::DoSerializeToString(TDynBitMap&& bitsV
return result;
}
+TString TFixStringBitsStorageConstructor::DoSerializeToString(const TArrayPower2BitsStorage& storage) const {
+ return storage.SerializeToString();
+}
+
} // namespace NKikimr::NOlap::NIndexes
diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.h b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.h
index 5ee804ae98b..8a09475d58f 100644
--- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.h
+++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.h
@@ -30,6 +30,7 @@ public:
private:
virtual TString DoSerializeToString(TDynBitMap&& bitsVector) const override;
+ virtual TString DoSerializeToString(const TArrayPower2BitsStorage& storage) const override;
virtual TConclusion<std::shared_ptr<IBitsStorageViewer>> DoRestore(const TString& data) const override {
return std::make_shared<TFixStringBitsStorage>(data);
diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/bits_storage_ut.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/bits_storage_ut.cpp
new file mode 100644
index 00000000000..9213d04fe1c
--- /dev/null
+++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/bits_storage_ut.cpp
@@ -0,0 +1,86 @@
+#include <ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.h>
+#include <ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.h>
+#include <ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+#include <util/random/fast.h>
+
+#include <unordered_set>
+
+namespace NKikimr::NOlap::NIndexes {
+
+void CheckInsertedValues(const std::unordered_set<ui64>& inserted, ui32 size, const IBitsStorageViewer& storage) {
+ for (ui64 i = 0; i < size; ++i) {
+ if (inserted.contains(i)) {
+ UNIT_ASSERT_C(storage.Get(i), TStringBuilder() << "value " << i << " must be set but is not");
+ } else {
+ UNIT_ASSERT_C(!storage.Get(i), TStringBuilder() << "value " << i << " must not be set but it is");
+ }
+ }
+}
+
+void TestSerializeRestoreArrayStorage(IBitsStorageConstructor& constructor) {
+ constexpr ui32 size = 16384;
+
+ TFastRng64 rng(100500);
+ std::unordered_set<ui64> inserted;
+ TArrayPower2BitsStorage storage(size);
+ for (ui32 i = 0; i < size / 2; ++i) {
+ auto item = rng.Uniform(size);
+ storage(item);
+ inserted.insert(item);
+ }
+ TString serialized = constructor.SerializeToString(storage);
+ auto restoredConclusion = constructor.Restore(serialized);
+ UNIT_ASSERT_C(restoredConclusion.IsSuccess(), restoredConclusion.GetErrorMessage());
+ auto restored = restoredConclusion.DetachResult();
+
+ CheckInsertedValues(inserted, size, *restored);
+}
+
+void TestSerializeRestoreDynBitMap(IBitsStorageConstructor& constructor) {
+ constexpr ui32 size = 16384;
+
+ TFastRng64 rng(100500);
+ std::unordered_set<ui64> inserted;
+ TDynBitMap bitmap;
+ bitmap.Reserve(size);
+ for (ui32 i = 0; i < size / 2; ++i) {
+ auto item = rng.Uniform(size);
+ bitmap.Set(item);
+ inserted.insert(item);
+ }
+ TString serialized = constructor.SerializeToString(std::move(bitmap));
+ auto restoredConclusion = constructor.Restore(serialized);
+ UNIT_ASSERT_C(restoredConclusion.IsSuccess(), restoredConclusion.GetErrorMessage());
+ auto restored = restoredConclusion.DetachResult();
+
+ CheckInsertedValues(inserted, size, *restored);
+}
+
+
+
+Y_UNIT_TEST_SUITE(TBitsStorageTests) {
+ Y_UNIT_TEST(TestSerializeRestoreArrayStorageToFixString) {
+ TFixStringBitsStorageConstructor constructor;
+ TestSerializeRestoreArrayStorage(constructor);
+ }
+
+ Y_UNIT_TEST(TestSerializeRestoreArrayStorageToBitset) {
+ TBitSetStorageConstructor constructor;
+ TestSerializeRestoreArrayStorage(constructor);
+ }
+
+
+ Y_UNIT_TEST(TestSerializeRestoreDynBitMapToFixString) {
+ TFixStringBitsStorageConstructor constructor;
+ TestSerializeRestoreDynBitMap(constructor);
+ }
+
+ Y_UNIT_TEST(TestSerializeRestoreDynBitMapToBitset) {
+ TBitSetStorageConstructor constructor;
+ TestSerializeRestoreDynBitMap(constructor);
+ }
+}
+
+} // namespace NKikimr::NOlap::NIndexes
diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ut_bits_storage.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ut_bits_storage.cpp
deleted file mode 100644
index 6e34d63fb2a..00000000000
--- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ut_bits_storage.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-#include <ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.h>
-#include <ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.h>
-
-#include <library/cpp/testing/unittest/registar.h>
-#include <util/random/fast.h>
-
-#include <unordered_set>
-
-namespace NKikimr::NOlap::NIndexes {
-
-void TestSerializeRestore(IBitsStorageConstructor& constructor) {
- ui32 size = 16384;
-
- TFastRng64 rng(100500);
- std::unordered_set<ui64> inserted;
- TDynBitMap bitmap;
- bitmap.Reserve(size);
- for (ui32 i = 0; i < size / 2; ++i) {
- auto item = rng.Uniform(size);
- bitmap.Set(item);
- inserted.insert(item);
- }
- TString serialized = constructor.SerializeToString(std::move(bitmap));
- auto restoredConclusion = constructor.Restore(serialized);
- UNIT_ASSERT_C(restoredConclusion.IsSuccess(), restoredConclusion.GetErrorMessage());
- auto restored = restoredConclusion.DetachResult();
-
- for (ui64 i = 0; i < size; ++i) {
- if (inserted.contains(i)) {
- UNIT_ASSERT_C(restored->Get(i), TStringBuilder() << "value " << i << " must be set but is not");
- } else {
- UNIT_ASSERT_C(!restored->Get(i), TStringBuilder() << "value " << i << " must not be set but it is");
- }
- }
-}
-
-Y_UNIT_TEST_SUITE(TBitsStorageTests) {
- Y_UNIT_TEST(TestSerializeRestoreFixString) {
- TFixStringBitsStorageConstructor constructor;
- TestSerializeRestore(constructor);
- }
-
- Y_UNIT_TEST(TestSerializeRestoreBitset) {
- TBitSetStorageConstructor constructor;
- TestSerializeRestore(constructor);
- }
-}
-
-} // namespace NKikimr::NOlap::NIndexes
diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ya.make b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ya.make
index 6529a3e5a58..85e33ae6871 100644
--- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ya.make
+++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ya.make
@@ -1,7 +1,12 @@
UNITTEST_FOR(ydb/core/tx/columnshard/engines/storage/indexes/bits_storage)
SRCS(
- ut_bits_storage.cpp
+ bits_storage_ut.cpp
)
-END() \ No newline at end of file
+PEERDIR(
+ yql/essentials/public/udf/service/stub
+ yql/essentials/sql/pg_dummy
+)
+
+END()
diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ya.make b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ya.make
index f23cff41c81..308d4f32e6c 100644
--- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ya.make
+++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ya.make
@@ -2,6 +2,7 @@ LIBRARY()
SRCS(
abstract.cpp
+ array_power2.cpp
GLOBAL bitset.cpp
GLOBAL fix_string.cpp
)
diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h
index a2db12a4522..0f0316570f0 100644
--- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h
+++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h
@@ -7,6 +7,9 @@
#include <ydb/core/tx/columnshard/engines/storage/indexes/helper/index_defaults.h>
#include <ydb/library/conclusion/status.h>
+#include <util/generic/string.h>
+#include <util/system/types.h>
+
namespace NKikimr::NOlap::NIndexes::NBloomNGramm {
struct TDerivedSettings {
@@ -26,7 +29,9 @@ public:
static constexpr ui32 MaxHashesCount = 8;
static constexpr ui32 DeprecatedRecordsCount = 10000;
static constexpr ui32 MinFilterSizeBytes = 128;
+ static constexpr ui32 MinFilterSizeBits = MinFilterSizeBytes * 8;
static constexpr ui32 MaxFilterSizeBytes = 1 << 20;
+ static constexpr ui32 MaxFilterSizeBits = MaxFilterSizeBytes * 8;
static constexpr ui32 MinRecordsCount = 128;
static constexpr ui32 MaxRecordsCount = 1000000;
diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp
index 15beef0b083..ad4ee2278aa 100644
--- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp
+++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp
@@ -4,6 +4,7 @@
#include <ydb/core/formats/arrow/hash/calcer.h>
#include <ydb/core/tx/columnshard/engines/storage/indexes/helper/case_helper.h>
#include <ydb/core/tx/columnshard/engines/storage/chunks/data.h>
+#include <ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.h>
#include <ydb/core/tx/program/program.h>
#include <ydb/core/tx/schemeshard/olap/schema/schema.h>
@@ -185,88 +186,6 @@ public:
}
};
-class TVectorInserter {
-private:
- TDynBitMap Values;
- const ui32 Size;
-
-public:
- TDynBitMap ExtractBits() {
- return std::move(Values);
- }
-
- TVectorInserter(const ui32 bitsSize)
- : Size(bitsSize)
- {
- AFL_VERIFY(bitsSize);
- Values.Reserve(bitsSize);
- }
-
- void operator()(const ui64 hash) {
- Values.Set(hash % Size);
- }
-};
-
-template <ui64 BitsSize>
-class TVectorInserterPower2 {
-private:
- TBitMapOps<TFixedBitMapTraits<BitsSize, ui64>> Values;
- static constexpr ui32 SizeMask = BitsSize - 1;
- static_assert(((BitsSize - 1) & BitsSize) == 0);
-
-public:
- TBitMapOps<TFixedBitMapTraits<BitsSize, ui64>> ExtractBits() {
- return std::move(Values);
- }
-
- void operator()(const ui64 hash) {
- Values.Set(hash & SizeMask);
- }
-};
-
-namespace {
-
-template <ui64 Size>
-class TBitmapDetector {
-private:
- const TSkipBitmapIndex* Meta;
- const ui32 ExtSize;
- static constexpr ui64 NextSize = Size >> 1;
-
-public:
- TBitmapDetector(const TSkipBitmapIndex* meta, const ui32 size)
- : Meta(meta)
- , ExtSize(size)
- {
- AFL_VERIFY(ExtSize <= Size);
- }
-
- template <class TFiller>
- TString Detector(const TFiller& filler) const {
- if (ExtSize == Size) {
- TVectorInserterPower2<Size> inserter;
- filler(inserter);
- return Meta->GetBitsStorageConstructor()->SerializeToString(inserter.ExtractBits());
- } else {
- return TBitmapDetector<NextSize>(Meta, ExtSize).Detector(filler);
- }
- }
-};
-
-template <>
-class TBitmapDetector<0> {
-public:
- TBitmapDetector(const TSkipBitmapIndex* /*meta*/, const ui32 /*size*/) {
- AFL_VERIFY(false);
- }
-
- template <class TFiller>
- static TString Detector(const TFiller& /*filler*/) {
- AFL_VERIFY(false);
- return "";
- }
-};
-} // namespace
namespace {
@@ -305,13 +224,11 @@ std::vector<std::shared_ptr<NChunks::TPortionIndexChunk>> TIndexMeta::DoBuildInd
if (!UseOldSizing) {
static constexpr ui64 BitsPerUi64 = sizeof(ui64) * CHAR_BIT;
static constexpr ui64 MaxBitsSize = static_cast<ui64>(TConstants::MaxFilterSizeBytes) * CHAR_BIT;
- static constexpr ui64 MaxChunkCount = MaxBitsSize / BitsPerUi64;
- TVectorInserter maxInserter(MaxBitsSize);
- VisitAllChunksWithBuilder(reader, GetDataExtractor(), NGrammSize, builder, maxInserter);
+ TArrayPower2BitsStorage maxStorage(MaxBitsSize);
+ VisitAllChunksWithBuilder(reader, GetDataExtractor(), NGrammSize, builder, maxStorage);
- auto maxBits = maxInserter.ExtractBits();
- const ui64 setBitsCount = maxBits.Count();
+ const ui64 setBitsCount = maxStorage.CountSetBits();
const double m = static_cast<double>(MaxBitsSize);
const double k = static_cast<double>(HashesCount);
@@ -323,26 +240,10 @@ std::vector<std::shared_ptr<NChunks::TPortionIndexChunk>> TIndexMeta::DoBuildInd
const double requestedBitsSizeDouble = std::ceil((-k * estimatedUniqueCount) / std::log(1.0 - std::pow(FalsePositiveProbability, 1.0 / k)));
const ui64 requestedBitsSize = std::max<ui64>(BitsPerUi64, static_cast<ui64>(requestedBitsSizeDouble));
const ui32 targetSize = std::min<ui64>(MaxBitsSize, std::bit_ceil(requestedBitsSize));
- const size_t targetChunkCount = targetSize / BitsPerUi64;
- const ui64* srcChunks = maxBits.GetChunks();
- std::vector<ui64> folded(targetChunkCount, 0);
- for (size_t i = 0; i < MaxChunkCount; ++i) {
- folded[i % targetChunkCount] |= srcChunks[i];
- }
+ auto foldedStorage = targetSize < MaxBitsSize ? maxStorage.Fold(MaxBitsSize / targetSize) : std::move(maxStorage);
- TDynBitMap resultBits;
- resultBits.Reserve(targetSize);
- for (size_t i = 0; i < targetChunkCount; ++i) {
- ui64 chunk = folded[i];
- const size_t base = i * BitsPerUi64;
- while (chunk) {
- resultBits.Set(base + CountTrailingZeroBits(chunk));
- chunk &= chunk - 1;
- }
- }
-
- TString indexData = GetBitsStorageConstructor()->SerializeToString(std::move(resultBits));
+ TString indexData = GetBitsStorageConstructor()->SerializeToString(foldedStorage);
return { std::make_shared<NChunks::TPortionIndexChunk>(TChunkAddress(GetIndexId(), 0), recordsCount, indexData.size(), indexData) };
}
@@ -358,52 +259,29 @@ std::vector<std::shared_ptr<NChunks::TPortionIndexChunk>> TIndexMeta::DoBuildInd
}
size = std::max<ui32>(16, size);
- const auto doFillFilter = [&](auto& inserter) {
- for (reader.Start(); reader.IsCorrect();) {
- AFL_VERIFY(reader.GetColumnsCount() == 1);
- for (auto&& r : reader) {
- GetDataExtractor()->VisitAll(
- r.GetCurrentChunk(),
- [&](const std::shared_ptr<arrow::Array>& arr, const ui32 /*hashBase*/) {
- builder.FillNGrammHashes(NGrammSize, arr, inserter);
- },
- [&](const NArrow::NAccessor::TBinaryJsonValueView& data, const ui32 /*hashBase*/) {
- auto view = data.GetScalarOptional();
- if (!view.has_value()) {
- return;
- }
-
- builder.BuildNGramms(view->data(), view->size(), {}, NGrammSize, inserter);
- });
- }
+ TArrayPower2BitsStorage storage(size);
+ for (reader.Start(); reader.IsCorrect();) {
+ AFL_VERIFY(reader.GetColumnsCount() == 1);
+ for (auto&& r : reader) {
+ GetDataExtractor()->VisitAll(
+ r.GetCurrentChunk(),
+ [&](const std::shared_ptr<arrow::Array>& arr, const ui32 /*hashBase*/) {
+ builder.FillNGrammHashes(NGrammSize, arr, storage);
+ },
+ [&](const NArrow::NAccessor::TBinaryJsonValueView& data, const ui32 /*hashBase*/) {
+ auto view = data.GetScalarOptional();
+ if (!view.has_value()) {
+ return;
+ }
- reader.ReadNext(reader.begin()->GetCurrentChunk()->GetRecordsCount());
+ builder.BuildNGramms(view->data(), view->size(), {}, NGrammSize, storage);
+ });
}
- };
- TString indexData;
- if ((size & (size - 1)) == 0) {
- if (size == 1024) {
- indexData = TBitmapDetector<1024>(this, 1024).Detector(doFillFilter);
- } else if (size == 2048) {
- indexData = TBitmapDetector<2048>(this, 2048).Detector(doFillFilter);
- } else if (size == 4096) {
- indexData = TBitmapDetector<4096>(this, 4096).Detector(doFillFilter);
- } else if (size == 4096 * 2) {
- indexData = TBitmapDetector<4096 * 2>(this, 4096 * 2).Detector(doFillFilter);
- } else if (size == 4096 * 4) {
- indexData = TBitmapDetector<4096 * 4>(this, 4096 * 4).Detector(doFillFilter);
- } else if (size == 4096 * 8) {
- indexData = TBitmapDetector<4096 * 8>(this, 4096 * 8).Detector(doFillFilter);
- } else if (size == 4096 * 16) {
- indexData = TBitmapDetector<4096 * 16>(this, 4096 * 16).Detector(doFillFilter);
- }
- }
- if (!indexData) {
- TVectorInserter inserter(size);
- doFillFilter(inserter);
- indexData = GetBitsStorageConstructor()->SerializeToString(inserter.ExtractBits());
+ reader.ReadNext(reader.begin()->GetCurrentChunk()->GetRecordsCount());
}
+
+ TString indexData = GetBitsStorageConstructor()->SerializeToString(storage);
return { std::make_shared<NChunks::TPortionIndexChunk>(TChunkAddress(GetIndexId(), 0), recordsCount, indexData.size(), indexData) };
}
diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/ya.make b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/ya.make
index cf9ff42b330..1e5401a0b63 100644
--- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/ya.make
+++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/ya.make
@@ -8,9 +8,10 @@ SRCS(
PEERDIR(
ydb/core/protos
- ydb/core/formats/arrow
+ ydb/core/formats/arrow/hash
ydb/core/tx/columnshard/engines/storage/indexes/portions
ydb/core/tx/columnshard/engines/storage/indexes/helper
+ ydb/core/tx/columnshard/engines/storage/indexes/skip_index
ydb/library/conclusion
)
diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/skip_index/ya.make b/ydb/core/tx/columnshard/engines/storage/indexes/skip_index/ya.make
index 23b77adf127..d36230ebf12 100644
--- a/ydb/core/tx/columnshard/engines/storage/indexes/skip_index/ya.make
+++ b/ydb/core/tx/columnshard/engines/storage/indexes/skip_index/ya.make
@@ -9,6 +9,7 @@ SRCS(
PEERDIR(
ydb/core/tx/columnshard/engines/scheme/abstract
ydb/core/tx/columnshard/engines/scheme/indexes/abstract
+ ydb/core/tx/columnshard/engines/storage/indexes/bits_storage
)
END()