diff options
| author | risenberg <[email protected]> | 2026-04-22 14:31:31 +0300 |
|---|---|---|
| committer | GitHub <[email protected]> | 2026-04-22 14:31:31 +0300 |
| commit | 59554ea611ced63df1bb2ec8b1df46cf83d30ecc (patch) | |
| tree | c27a6ff2799df369fb9a713a4a84ae7f96cf580e | |
| parent | 23b5824fe3ca7745558138234adb49b1c2cef7fb (diff) | |
Speed up building bloom_ngram index (#36689)
16 files changed, 266 insertions, 200 deletions
diff --git a/ydb/core/tx/columnshard/blobs_action/abstract/ya.make b/ydb/core/tx/columnshard/blobs_action/abstract/ya.make index 1ce1d711629..d53a409da29 100644 --- a/ydb/core/tx/columnshard/blobs_action/abstract/ya.make +++ b/ydb/core/tx/columnshard/blobs_action/abstract/ya.make @@ -23,6 +23,7 @@ PEERDIR( ydb/core/tx/columnshard/data_sharing/protos ydb/core/tx/columnshard/blobs_action/events ydb/core/tx/columnshard/blobs_action/protos + ydb/core/tx/columnshard/blobs_action/counters ) END() diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/abstract.h b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/abstract.h index d23651c6301..b45f6c0c485 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/abstract.h +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/abstract.h @@ -1,8 +1,12 @@ #pragma once + +#include "array_power2.h" + #include <ydb/library/conclusion/result.h> #include <ydb/library/conclusion/status.h> #include <library/cpp/object_factory/object_factory.h> + #include <util/generic/bitmap.h> #include <util/generic/string.h> @@ -33,6 +37,7 @@ public: class IBitsStorageConstructor { private: virtual TString DoSerializeToString(TDynBitMap&& bm) const = 0; + virtual TString DoSerializeToString(const TArrayPower2BitsStorage& storage) const = 0; virtual TConclusion<std::shared_ptr<IBitsStorageViewer>> DoRestore(const TString& data) const = 0; public: @@ -44,6 +49,9 @@ public: [[nodiscard]] TString SerializeToString(TDynBitMap&& bm) const { return DoSerializeToString(std::move(bm)); } + [[nodiscard]] TString SerializeToString(const TArrayPower2BitsStorage& storage) const { + return DoSerializeToString(storage); + } [[nodiscard]] TConclusion<std::shared_ptr<IBitsStorageViewer>> Restore(const TString& data) const { return DoRestore(data); diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.cpp new file mode 100644 index 00000000000..b0308497693 --- /dev/null +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.cpp @@ -0,0 +1,37 @@ +#include "array_power2.h" + +#include <util/stream/output.h> +#include <util/stream/str.h> +#include <util/ysaveload.h> + +namespace NKikimr::NOlap::NIndexes { + +TString TArrayPower2BitsStorage::SerializeDynBitMapCompatible() const { + // TBitSetStorage requires serialization format of TDynBitMap, so reproduce it here + TString result; + result.reserve(DataSize * sizeof(ui64) + 2 * sizeof(ui64)); + TStringOutput out(result); + + ::Save(&out, ui8(sizeof(ui64))); + ::Save(&out, ui64(BitSize())); + ::SavePodArray(&out, Data.get(), DataSize); + return result; +} + +TArrayPower2BitsStorage TArrayPower2BitsStorage::Fold(ui32 times) const { + Y_ABORT_UNLESS(std::has_single_bit(times)); + Y_ABORT_UNLESS(times < DataSize); + auto newDataSize = DataSize / times; + std::unique_ptr<ui64[]> newData(new ui64[newDataSize]); + std::fill(newData.get(), newData.get() + newDataSize, 0); + for (size_t i = 0; i < times; ++i) { + auto start = newDataSize * i; + for (size_t j = 0; j < newDataSize; ++j) { + newData[j] |= Data[start + j]; + } + } + + return TArrayPower2BitsStorage(newDataSize, std::move(newData)); +} + +} // namespace NKikimr::NOlap::NIndexes diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.h b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.h new file mode 100644 index 00000000000..9822dcfb009 --- /dev/null +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.h @@ -0,0 +1,81 @@ +#pragma once + +#include <util/generic/string.h> +#include <util/system/yassert.h> + +#include <algorithm> +#include <bit> +#include <climits> +#include <memory> + +namespace NKikimr::NOlap::NIndexes { + +// Bitset builder over an array with size a power of 2. In comparison to util/generic/bitmap.h impls it simultaneously +// * does absolute minimum of operations on insertion +// * has runtime-defined size +class TArrayPower2BitsStorage { +private: + const ui32 DataSize; + // ui64[] is slightly faster than ui8[], perhaps because there is no conversion inside operator() + std::unique_ptr<ui64[]> Data; + const ui32 SizeMask; + static_assert(CHAR_BIT == 8); + static constexpr ui64 BitsInItem = sizeof(ui64) * CHAR_BIT; + static constexpr ui64 ItemMask = BitsInItem - 1; + static constexpr ui64 SizeShift = std::popcount(ItemMask); + + TArrayPower2BitsStorage(ui32 dataSize, std::unique_ptr<ui64[]> data) + : DataSize(dataSize) + , Data(std::move(data)) + , SizeMask(DataSize - 1) { + Y_ABORT_UNLESS(std::has_single_bit(dataSize)); + } + + Y_FORCE_INLINE ui64 ItemMaskForHash(ui64 hash) const { + return (ui64{1} << (hash & ItemMask)); + } + + Y_FORCE_INLINE ui64 IndexForHash(ui64 hash) const { + return (hash >> SizeShift) & SizeMask; + } + +public: + TArrayPower2BitsStorage(ui32 bitSize) + : DataSize(bitSize / BitsInItem) + , Data(new ui64[DataSize]) + , SizeMask(DataSize - 1) { + Y_ABORT_UNLESS(bitSize >= BitsInItem); + std::fill(Data.get(), Data.get() + DataSize, 0); + Y_ABORT_UNLESS(std::has_single_bit(bitSize)); + } + + TString SerializeToString() const { + return TString(reinterpret_cast<char*>(Data.get()), DataSize * sizeof(ui64)); + } + + TString SerializeDynBitMapCompatible() const; + + Y_FORCE_INLINE void operator()(const ui64 hash) { + Data[IndexForHash(hash)] |= ItemMaskForHash(hash); + } + + bool Get(const ui64 hash) const { + return Data[IndexForHash(hash)] & ItemMaskForHash(hash); + } + + ui32 CountSetBits() const { + ui32 result = 0; + for (size_t i = 0; i < DataSize; ++i) { + result += std::popcount(Data[i]); + } + return result; + } + + ui32 BitSize() const { + return DataSize * BitsInItem; + } + + TArrayPower2BitsStorage Fold(ui32 times) const; +}; + +} // namespace NKikimr::NOlap::NIndexes diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.cpp index d5ca883a19a..f812ef3afec 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.cpp @@ -14,6 +14,11 @@ TString TBitSetStorageConstructor::DoSerializeToString(TDynBitMap&& bm) const { return result; } + +TString TBitSetStorageConstructor::DoSerializeToString(const TArrayPower2BitsStorage& storage) const { + return storage.SerializeDynBitMapCompatible(); +} + TConclusion<std::shared_ptr<IBitsStorageViewer>> TBitSetStorageConstructor::DoRestore(const TString& data) const { try { TStringInput input(data); diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.h b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.h index e9affb2b856..18bb88fefc1 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.h +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.h @@ -29,7 +29,8 @@ public: } private: - virtual TString DoSerializeToString(TDynBitMap&& bm) const override; + virtual TString DoSerializeToString(TDynBitMap&& bitsVector) const override; + virtual TString DoSerializeToString(const TArrayPower2BitsStorage& storage) const override; virtual TConclusion<std::shared_ptr<IBitsStorageViewer>> DoRestore(const TString& data) const override; static inline const auto Registrator = TFactory::TRegistrator<TBitSetStorageConstructor>(GetClassNameStatic()); diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.cpp index a416a43d8a5..2a40e5b40fe 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.cpp @@ -41,4 +41,8 @@ TString TFixStringBitsStorageConstructor::DoSerializeToString(TDynBitMap&& bitsV return result; } +TString TFixStringBitsStorageConstructor::DoSerializeToString(const TArrayPower2BitsStorage& storage) const { + return storage.SerializeToString(); +} + } // namespace NKikimr::NOlap::NIndexes diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.h b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.h index 5ee804ae98b..8a09475d58f 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.h +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.h @@ -30,6 +30,7 @@ public: private: virtual TString DoSerializeToString(TDynBitMap&& bitsVector) const override; + virtual TString DoSerializeToString(const TArrayPower2BitsStorage& storage) const override; virtual TConclusion<std::shared_ptr<IBitsStorageViewer>> DoRestore(const TString& data) const override { return std::make_shared<TFixStringBitsStorage>(data); diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/bits_storage_ut.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/bits_storage_ut.cpp new file mode 100644 index 00000000000..9213d04fe1c --- /dev/null +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/bits_storage_ut.cpp @@ -0,0 +1,86 @@ +#include <ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.h> +#include <ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.h> +#include <ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.h> + +#include <library/cpp/testing/unittest/registar.h> +#include <util/random/fast.h> + +#include <unordered_set> + +namespace NKikimr::NOlap::NIndexes { + +void CheckInsertedValues(const std::unordered_set<ui64>& inserted, ui32 size, const IBitsStorageViewer& storage) { + for (ui64 i = 0; i < size; ++i) { + if (inserted.contains(i)) { + UNIT_ASSERT_C(storage.Get(i), TStringBuilder() << "value " << i << " must be set but is not"); + } else { + UNIT_ASSERT_C(!storage.Get(i), TStringBuilder() << "value " << i << " must not be set but it is"); + } + } +} + +void TestSerializeRestoreArrayStorage(IBitsStorageConstructor& constructor) { + constexpr ui32 size = 16384; + + TFastRng64 rng(100500); + std::unordered_set<ui64> inserted; + TArrayPower2BitsStorage storage(size); + for (ui32 i = 0; i < size / 2; ++i) { + auto item = rng.Uniform(size); + storage(item); + inserted.insert(item); + } + TString serialized = constructor.SerializeToString(storage); + auto restoredConclusion = constructor.Restore(serialized); + UNIT_ASSERT_C(restoredConclusion.IsSuccess(), restoredConclusion.GetErrorMessage()); + auto restored = restoredConclusion.DetachResult(); + + CheckInsertedValues(inserted, size, *restored); +} + +void TestSerializeRestoreDynBitMap(IBitsStorageConstructor& constructor) { + constexpr ui32 size = 16384; + + TFastRng64 rng(100500); + std::unordered_set<ui64> inserted; + TDynBitMap bitmap; + bitmap.Reserve(size); + for (ui32 i = 0; i < size / 2; ++i) { + auto item = rng.Uniform(size); + bitmap.Set(item); + inserted.insert(item); + } + TString serialized = constructor.SerializeToString(std::move(bitmap)); + auto restoredConclusion = constructor.Restore(serialized); + UNIT_ASSERT_C(restoredConclusion.IsSuccess(), restoredConclusion.GetErrorMessage()); + auto restored = restoredConclusion.DetachResult(); + + CheckInsertedValues(inserted, size, *restored); +} + + + +Y_UNIT_TEST_SUITE(TBitsStorageTests) { + Y_UNIT_TEST(TestSerializeRestoreArrayStorageToFixString) { + TFixStringBitsStorageConstructor constructor; + TestSerializeRestoreArrayStorage(constructor); + } + + Y_UNIT_TEST(TestSerializeRestoreArrayStorageToBitset) { + TBitSetStorageConstructor constructor; + TestSerializeRestoreArrayStorage(constructor); + } + + + Y_UNIT_TEST(TestSerializeRestoreDynBitMapToFixString) { + TFixStringBitsStorageConstructor constructor; + TestSerializeRestoreDynBitMap(constructor); + } + + Y_UNIT_TEST(TestSerializeRestoreDynBitMapToBitset) { + TBitSetStorageConstructor constructor; + TestSerializeRestoreDynBitMap(constructor); + } +} + +} // namespace NKikimr::NOlap::NIndexes diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ut_bits_storage.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ut_bits_storage.cpp deleted file mode 100644 index 6e34d63fb2a..00000000000 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ut_bits_storage.cpp +++ /dev/null @@ -1,49 +0,0 @@ -#include <ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/bitset.h> -#include <ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/fix_string.h> - -#include <library/cpp/testing/unittest/registar.h> -#include <util/random/fast.h> - -#include <unordered_set> - -namespace NKikimr::NOlap::NIndexes { - -void TestSerializeRestore(IBitsStorageConstructor& constructor) { - ui32 size = 16384; - - TFastRng64 rng(100500); - std::unordered_set<ui64> inserted; - TDynBitMap bitmap; - bitmap.Reserve(size); - for (ui32 i = 0; i < size / 2; ++i) { - auto item = rng.Uniform(size); - bitmap.Set(item); - inserted.insert(item); - } - TString serialized = constructor.SerializeToString(std::move(bitmap)); - auto restoredConclusion = constructor.Restore(serialized); - UNIT_ASSERT_C(restoredConclusion.IsSuccess(), restoredConclusion.GetErrorMessage()); - auto restored = restoredConclusion.DetachResult(); - - for (ui64 i = 0; i < size; ++i) { - if (inserted.contains(i)) { - UNIT_ASSERT_C(restored->Get(i), TStringBuilder() << "value " << i << " must be set but is not"); - } else { - UNIT_ASSERT_C(!restored->Get(i), TStringBuilder() << "value " << i << " must not be set but it is"); - } - } -} - -Y_UNIT_TEST_SUITE(TBitsStorageTests) { - Y_UNIT_TEST(TestSerializeRestoreFixString) { - TFixStringBitsStorageConstructor constructor; - TestSerializeRestore(constructor); - } - - Y_UNIT_TEST(TestSerializeRestoreBitset) { - TBitSetStorageConstructor constructor; - TestSerializeRestore(constructor); - } -} - -} // namespace NKikimr::NOlap::NIndexes diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ya.make b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ya.make index 6529a3e5a58..85e33ae6871 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ya.make +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ut/ya.make @@ -1,7 +1,12 @@ UNITTEST_FOR(ydb/core/tx/columnshard/engines/storage/indexes/bits_storage) SRCS( - ut_bits_storage.cpp + bits_storage_ut.cpp ) -END()
\ No newline at end of file +PEERDIR( + yql/essentials/public/udf/service/stub + yql/essentials/sql/pg_dummy +) + +END() diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ya.make b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ya.make index f23cff41c81..308d4f32e6c 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ya.make +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/ya.make @@ -2,6 +2,7 @@ LIBRARY() SRCS( abstract.cpp + array_power2.cpp GLOBAL bitset.cpp GLOBAL fix_string.cpp ) diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h index a2db12a4522..0f0316570f0 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h @@ -7,6 +7,9 @@ #include <ydb/core/tx/columnshard/engines/storage/indexes/helper/index_defaults.h> #include <ydb/library/conclusion/status.h> +#include <util/generic/string.h> +#include <util/system/types.h> + namespace NKikimr::NOlap::NIndexes::NBloomNGramm { struct TDerivedSettings { @@ -26,7 +29,9 @@ public: static constexpr ui32 MaxHashesCount = 8; static constexpr ui32 DeprecatedRecordsCount = 10000; static constexpr ui32 MinFilterSizeBytes = 128; + static constexpr ui32 MinFilterSizeBits = MinFilterSizeBytes * 8; static constexpr ui32 MaxFilterSizeBytes = 1 << 20; + static constexpr ui32 MaxFilterSizeBits = MaxFilterSizeBytes * 8; static constexpr ui32 MinRecordsCount = 128; static constexpr ui32 MaxRecordsCount = 1000000; diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp index 15beef0b083..ad4ee2278aa 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp @@ -4,6 +4,7 @@ #include <ydb/core/formats/arrow/hash/calcer.h> #include <ydb/core/tx/columnshard/engines/storage/indexes/helper/case_helper.h> #include <ydb/core/tx/columnshard/engines/storage/chunks/data.h> +#include <ydb/core/tx/columnshard/engines/storage/indexes/bits_storage/array_power2.h> #include <ydb/core/tx/program/program.h> #include <ydb/core/tx/schemeshard/olap/schema/schema.h> @@ -185,88 +186,6 @@ public: } }; -class TVectorInserter { -private: - TDynBitMap Values; - const ui32 Size; - -public: - TDynBitMap ExtractBits() { - return std::move(Values); - } - - TVectorInserter(const ui32 bitsSize) - : Size(bitsSize) - { - AFL_VERIFY(bitsSize); - Values.Reserve(bitsSize); - } - - void operator()(const ui64 hash) { - Values.Set(hash % Size); - } -}; - -template <ui64 BitsSize> -class TVectorInserterPower2 { -private: - TBitMapOps<TFixedBitMapTraits<BitsSize, ui64>> Values; - static constexpr ui32 SizeMask = BitsSize - 1; - static_assert(((BitsSize - 1) & BitsSize) == 0); - -public: - TBitMapOps<TFixedBitMapTraits<BitsSize, ui64>> ExtractBits() { - return std::move(Values); - } - - void operator()(const ui64 hash) { - Values.Set(hash & SizeMask); - } -}; - -namespace { - -template <ui64 Size> -class TBitmapDetector { -private: - const TSkipBitmapIndex* Meta; - const ui32 ExtSize; - static constexpr ui64 NextSize = Size >> 1; - -public: - TBitmapDetector(const TSkipBitmapIndex* meta, const ui32 size) - : Meta(meta) - , ExtSize(size) - { - AFL_VERIFY(ExtSize <= Size); - } - - template <class TFiller> - TString Detector(const TFiller& filler) const { - if (ExtSize == Size) { - TVectorInserterPower2<Size> inserter; - filler(inserter); - return Meta->GetBitsStorageConstructor()->SerializeToString(inserter.ExtractBits()); - } else { - return TBitmapDetector<NextSize>(Meta, ExtSize).Detector(filler); - } - } -}; - -template <> -class TBitmapDetector<0> { -public: - TBitmapDetector(const TSkipBitmapIndex* /*meta*/, const ui32 /*size*/) { - AFL_VERIFY(false); - } - - template <class TFiller> - static TString Detector(const TFiller& /*filler*/) { - AFL_VERIFY(false); - return ""; - } -}; -} // namespace namespace { @@ -305,13 +224,11 @@ std::vector<std::shared_ptr<NChunks::TPortionIndexChunk>> TIndexMeta::DoBuildInd if (!UseOldSizing) { static constexpr ui64 BitsPerUi64 = sizeof(ui64) * CHAR_BIT; static constexpr ui64 MaxBitsSize = static_cast<ui64>(TConstants::MaxFilterSizeBytes) * CHAR_BIT; - static constexpr ui64 MaxChunkCount = MaxBitsSize / BitsPerUi64; - TVectorInserter maxInserter(MaxBitsSize); - VisitAllChunksWithBuilder(reader, GetDataExtractor(), NGrammSize, builder, maxInserter); + TArrayPower2BitsStorage maxStorage(MaxBitsSize); + VisitAllChunksWithBuilder(reader, GetDataExtractor(), NGrammSize, builder, maxStorage); - auto maxBits = maxInserter.ExtractBits(); - const ui64 setBitsCount = maxBits.Count(); + const ui64 setBitsCount = maxStorage.CountSetBits(); const double m = static_cast<double>(MaxBitsSize); const double k = static_cast<double>(HashesCount); @@ -323,26 +240,10 @@ std::vector<std::shared_ptr<NChunks::TPortionIndexChunk>> TIndexMeta::DoBuildInd const double requestedBitsSizeDouble = std::ceil((-k * estimatedUniqueCount) / std::log(1.0 - std::pow(FalsePositiveProbability, 1.0 / k))); const ui64 requestedBitsSize = std::max<ui64>(BitsPerUi64, static_cast<ui64>(requestedBitsSizeDouble)); const ui32 targetSize = std::min<ui64>(MaxBitsSize, std::bit_ceil(requestedBitsSize)); - const size_t targetChunkCount = targetSize / BitsPerUi64; - const ui64* srcChunks = maxBits.GetChunks(); - std::vector<ui64> folded(targetChunkCount, 0); - for (size_t i = 0; i < MaxChunkCount; ++i) { - folded[i % targetChunkCount] |= srcChunks[i]; - } + auto foldedStorage = targetSize < MaxBitsSize ? maxStorage.Fold(MaxBitsSize / targetSize) : std::move(maxStorage); - TDynBitMap resultBits; - resultBits.Reserve(targetSize); - for (size_t i = 0; i < targetChunkCount; ++i) { - ui64 chunk = folded[i]; - const size_t base = i * BitsPerUi64; - while (chunk) { - resultBits.Set(base + CountTrailingZeroBits(chunk)); - chunk &= chunk - 1; - } - } - - TString indexData = GetBitsStorageConstructor()->SerializeToString(std::move(resultBits)); + TString indexData = GetBitsStorageConstructor()->SerializeToString(foldedStorage); return { std::make_shared<NChunks::TPortionIndexChunk>(TChunkAddress(GetIndexId(), 0), recordsCount, indexData.size(), indexData) }; } @@ -358,52 +259,29 @@ std::vector<std::shared_ptr<NChunks::TPortionIndexChunk>> TIndexMeta::DoBuildInd } size = std::max<ui32>(16, size); - const auto doFillFilter = [&](auto& inserter) { - for (reader.Start(); reader.IsCorrect();) { - AFL_VERIFY(reader.GetColumnsCount() == 1); - for (auto&& r : reader) { - GetDataExtractor()->VisitAll( - r.GetCurrentChunk(), - [&](const std::shared_ptr<arrow::Array>& arr, const ui32 /*hashBase*/) { - builder.FillNGrammHashes(NGrammSize, arr, inserter); - }, - [&](const NArrow::NAccessor::TBinaryJsonValueView& data, const ui32 /*hashBase*/) { - auto view = data.GetScalarOptional(); - if (!view.has_value()) { - return; - } - - builder.BuildNGramms(view->data(), view->size(), {}, NGrammSize, inserter); - }); - } + TArrayPower2BitsStorage storage(size); + for (reader.Start(); reader.IsCorrect();) { + AFL_VERIFY(reader.GetColumnsCount() == 1); + for (auto&& r : reader) { + GetDataExtractor()->VisitAll( + r.GetCurrentChunk(), + [&](const std::shared_ptr<arrow::Array>& arr, const ui32 /*hashBase*/) { + builder.FillNGrammHashes(NGrammSize, arr, storage); + }, + [&](const NArrow::NAccessor::TBinaryJsonValueView& data, const ui32 /*hashBase*/) { + auto view = data.GetScalarOptional(); + if (!view.has_value()) { + return; + } - reader.ReadNext(reader.begin()->GetCurrentChunk()->GetRecordsCount()); + builder.BuildNGramms(view->data(), view->size(), {}, NGrammSize, storage); + }); } - }; - TString indexData; - if ((size & (size - 1)) == 0) { - if (size == 1024) { - indexData = TBitmapDetector<1024>(this, 1024).Detector(doFillFilter); - } else if (size == 2048) { - indexData = TBitmapDetector<2048>(this, 2048).Detector(doFillFilter); - } else if (size == 4096) { - indexData = TBitmapDetector<4096>(this, 4096).Detector(doFillFilter); - } else if (size == 4096 * 2) { - indexData = TBitmapDetector<4096 * 2>(this, 4096 * 2).Detector(doFillFilter); - } else if (size == 4096 * 4) { - indexData = TBitmapDetector<4096 * 4>(this, 4096 * 4).Detector(doFillFilter); - } else if (size == 4096 * 8) { - indexData = TBitmapDetector<4096 * 8>(this, 4096 * 8).Detector(doFillFilter); - } else if (size == 4096 * 16) { - indexData = TBitmapDetector<4096 * 16>(this, 4096 * 16).Detector(doFillFilter); - } - } - if (!indexData) { - TVectorInserter inserter(size); - doFillFilter(inserter); - indexData = GetBitsStorageConstructor()->SerializeToString(inserter.ExtractBits()); + reader.ReadNext(reader.begin()->GetCurrentChunk()->GetRecordsCount()); } + + TString indexData = GetBitsStorageConstructor()->SerializeToString(storage); return { std::make_shared<NChunks::TPortionIndexChunk>(TChunkAddress(GetIndexId(), 0), recordsCount, indexData.size(), indexData) }; } diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/ya.make b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/ya.make index cf9ff42b330..1e5401a0b63 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/ya.make +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/ya.make @@ -8,9 +8,10 @@ SRCS( PEERDIR( ydb/core/protos - ydb/core/formats/arrow + ydb/core/formats/arrow/hash ydb/core/tx/columnshard/engines/storage/indexes/portions ydb/core/tx/columnshard/engines/storage/indexes/helper + ydb/core/tx/columnshard/engines/storage/indexes/skip_index ydb/library/conclusion ) diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/skip_index/ya.make b/ydb/core/tx/columnshard/engines/storage/indexes/skip_index/ya.make index 23b77adf127..d36230ebf12 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/skip_index/ya.make +++ b/ydb/core/tx/columnshard/engines/storage/indexes/skip_index/ya.make @@ -9,6 +9,7 @@ SRCS( PEERDIR( ydb/core/tx/columnshard/engines/scheme/abstract ydb/core/tx/columnshard/engines/scheme/indexes/abstract + ydb/core/tx/columnshard/engines/storage/indexes/bits_storage ) END() |
