diff options
author | deniskhalikov <[email protected]> | 2025-03-31 15:42:14 +0300 |
---|---|---|
committer | deniskhalikov <[email protected]> | 2025-03-31 15:58:39 +0300 |
commit | 06c199b566aca570faf8661b96ea873a229f923a (patch) | |
tree | 63cb078ebb2702770eb89056fba85fa2b6a9a7db /yql/essentials/core/histogram/eq_width_histogram.cpp | |
parent | 3e9cb82822c4878edee4ef5852b16116424db357 (diff) |
Add equal width histogram.
This patch add equal width histogram
for use in assessing predicate selectivity for CBO.
commit_hash:8d2d3a7f89fb9b2261af1b6ba201e8d2a8b098b7
Diffstat (limited to 'yql/essentials/core/histogram/eq_width_histogram.cpp')
-rw-r--r-- | yql/essentials/core/histogram/eq_width_histogram.cpp | 73 |
1 files changed, 73 insertions, 0 deletions
diff --git a/yql/essentials/core/histogram/eq_width_histogram.cpp b/yql/essentials/core/histogram/eq_width_histogram.cpp new file mode 100644 index 00000000000..3c5a452fdbd --- /dev/null +++ b/yql/essentials/core/histogram/eq_width_histogram.cpp @@ -0,0 +1,73 @@ +#include "eq_width_histogram.h" + +namespace NKikimr { + +TEqWidthHistogram::TEqWidthHistogram(ui32 numBuckets, EHistogramValueType valueType) + : ValueType(valueType), Buckets(numBuckets) { + // Exptected at least one bucket for histogram. + Y_ASSERT(numBuckets >= 1); +} + +TEqWidthHistogram::TEqWidthHistogram(const char *str, ui64 size) { + Y_ASSERT(str && size); + const ui32 numBuckets = *reinterpret_cast<const ui32 *>(str); + Y_ABORT_UNLESS(GetBinarySize(numBuckets) == size); + ui32 offset = sizeof(ui32); + ValueType = *reinterpret_cast<const EHistogramValueType *>(str + offset); + offset += sizeof(EHistogramValueType); + Buckets = TVector<TBucket>(numBuckets); + for (ui32 i = 0; i < numBuckets; ++i) { + std::memcpy(&Buckets[i], reinterpret_cast<const char *>(str + offset), sizeof(TBucket)); + offset += sizeof(TBucket); + } +} + +ui64 TEqWidthHistogram::GetBinarySize(ui32 nBuckets) const { + return sizeof(ui32) + sizeof(EHistogramValueType) + sizeof(TBucket) * nBuckets; +} + +// Binary layout: +// [4 byte: number of buckets][1 byte: value type] +// [sizeof(Bucket)[0]... sizeof(Bucket)[n]]. +std::unique_ptr<char> TEqWidthHistogram::Serialize(ui64 &binarySize) const { + binarySize = GetBinarySize(GetNumBuckets()); + std::unique_ptr<char> binaryData(new char[binarySize]); + ui32 offset = 0; + const ui32 numBuckets = GetNumBuckets(); + // 4 byte - number of buckets. + std::memcpy(binaryData.get(), &numBuckets, sizeof(ui32)); + offset += sizeof(ui32); + // 1 byte - values type. + std::memcpy(binaryData.get() + offset, &ValueType, sizeof(EHistogramValueType)); + offset += sizeof(EHistogramValueType); + // Buckets. + for (ui32 i = 0; i < numBuckets; ++i) { + std::memcpy(binaryData.get() + offset, &Buckets[i], sizeof(TBucket)); + offset += sizeof(TBucket); + } + return binaryData; +} + +TEqWidthHistogramEstimator::TEqWidthHistogramEstimator(std::shared_ptr<TEqWidthHistogram> histogram) + : Histogram(histogram) { + const auto numBuckets = Histogram->GetNumBuckets(); + PrefixSum = TVector<ui64>(numBuckets); + SuffixSum = TVector<ui64>(numBuckets); + CreatePrefixSum(numBuckets); + CreateSuffixSum(numBuckets); +} + +void TEqWidthHistogramEstimator::CreatePrefixSum(ui32 numBuckets) { + PrefixSum[0] = Histogram->GetNumElementsInBucket(0); + for (ui32 i = 1; i < numBuckets; ++i) { + PrefixSum[i] = PrefixSum[i - 1] + Histogram->GetNumElementsInBucket(i); + } +} + +void TEqWidthHistogramEstimator::CreateSuffixSum(ui32 numBuckets) { + SuffixSum[numBuckets - 1] = Histogram->GetNumElementsInBucket(numBuckets - 1); + for (i32 i = static_cast<i32>(numBuckets) - 2; i >= 0; --i) { + SuffixSum[i] = SuffixSum[i + 1] + Histogram->GetNumElementsInBucket(i); + } +} +} // namespace NKikimr |