diff options
author | deniskhalikov <[email protected]> | 2025-03-31 15:42:14 +0300 |
---|---|---|
committer | deniskhalikov <[email protected]> | 2025-03-31 15:58:39 +0300 |
commit | 06c199b566aca570faf8661b96ea873a229f923a (patch) | |
tree | 63cb078ebb2702770eb89056fba85fa2b6a9a7db | |
parent | 3e9cb82822c4878edee4ef5852b16116424db357 (diff) |
Add equal width histogram.
This patch add equal width histogram
for use in assessing predicate selectivity for CBO.
commit_hash:8d2d3a7f89fb9b2261af1b6ba201e8d2a8b098b7
-rw-r--r-- | yql/essentials/core/histogram/eq_width_histogram.cpp | 73 | ||||
-rw-r--r-- | yql/essentials/core/histogram/eq_width_histogram.h | 228 | ||||
-rw-r--r-- | yql/essentials/core/histogram/ut/eq_width_histogram_ut.cpp | 127 | ||||
-rw-r--r-- | yql/essentials/core/histogram/ut/ya.make | 8 | ||||
-rw-r--r-- | yql/essentials/core/histogram/ya.make | 12 | ||||
-rw-r--r-- | yql/essentials/core/minsketch/ut/ya.make | 11 | ||||
-rw-r--r-- | yql/essentials/core/ya.make | 1 | ||||
-rw-r--r-- | yql/essentials/core/yql_statistics.cpp | 10 | ||||
-rw-r--r-- | yql/essentials/core/yql_statistics.h | 2 |
9 files changed, 462 insertions, 10 deletions
diff --git a/yql/essentials/core/histogram/eq_width_histogram.cpp b/yql/essentials/core/histogram/eq_width_histogram.cpp new file mode 100644 index 00000000000..3c5a452fdbd --- /dev/null +++ b/yql/essentials/core/histogram/eq_width_histogram.cpp @@ -0,0 +1,73 @@ +#include "eq_width_histogram.h" + +namespace NKikimr { + +TEqWidthHistogram::TEqWidthHistogram(ui32 numBuckets, EHistogramValueType valueType) + : ValueType(valueType), Buckets(numBuckets) { + // Exptected at least one bucket for histogram. + Y_ASSERT(numBuckets >= 1); +} + +TEqWidthHistogram::TEqWidthHistogram(const char *str, ui64 size) { + Y_ASSERT(str && size); + const ui32 numBuckets = *reinterpret_cast<const ui32 *>(str); + Y_ABORT_UNLESS(GetBinarySize(numBuckets) == size); + ui32 offset = sizeof(ui32); + ValueType = *reinterpret_cast<const EHistogramValueType *>(str + offset); + offset += sizeof(EHistogramValueType); + Buckets = TVector<TBucket>(numBuckets); + for (ui32 i = 0; i < numBuckets; ++i) { + std::memcpy(&Buckets[i], reinterpret_cast<const char *>(str + offset), sizeof(TBucket)); + offset += sizeof(TBucket); + } +} + +ui64 TEqWidthHistogram::GetBinarySize(ui32 nBuckets) const { + return sizeof(ui32) + sizeof(EHistogramValueType) + sizeof(TBucket) * nBuckets; +} + +// Binary layout: +// [4 byte: number of buckets][1 byte: value type] +// [sizeof(Bucket)[0]... sizeof(Bucket)[n]]. +std::unique_ptr<char> TEqWidthHistogram::Serialize(ui64 &binarySize) const { + binarySize = GetBinarySize(GetNumBuckets()); + std::unique_ptr<char> binaryData(new char[binarySize]); + ui32 offset = 0; + const ui32 numBuckets = GetNumBuckets(); + // 4 byte - number of buckets. + std::memcpy(binaryData.get(), &numBuckets, sizeof(ui32)); + offset += sizeof(ui32); + // 1 byte - values type. + std::memcpy(binaryData.get() + offset, &ValueType, sizeof(EHistogramValueType)); + offset += sizeof(EHistogramValueType); + // Buckets. + for (ui32 i = 0; i < numBuckets; ++i) { + std::memcpy(binaryData.get() + offset, &Buckets[i], sizeof(TBucket)); + offset += sizeof(TBucket); + } + return binaryData; +} + +TEqWidthHistogramEstimator::TEqWidthHistogramEstimator(std::shared_ptr<TEqWidthHistogram> histogram) + : Histogram(histogram) { + const auto numBuckets = Histogram->GetNumBuckets(); + PrefixSum = TVector<ui64>(numBuckets); + SuffixSum = TVector<ui64>(numBuckets); + CreatePrefixSum(numBuckets); + CreateSuffixSum(numBuckets); +} + +void TEqWidthHistogramEstimator::CreatePrefixSum(ui32 numBuckets) { + PrefixSum[0] = Histogram->GetNumElementsInBucket(0); + for (ui32 i = 1; i < numBuckets; ++i) { + PrefixSum[i] = PrefixSum[i - 1] + Histogram->GetNumElementsInBucket(i); + } +} + +void TEqWidthHistogramEstimator::CreateSuffixSum(ui32 numBuckets) { + SuffixSum[numBuckets - 1] = Histogram->GetNumElementsInBucket(numBuckets - 1); + for (i32 i = static_cast<i32>(numBuckets) - 2; i >= 0; --i) { + SuffixSum[i] = SuffixSum[i + 1] + Histogram->GetNumElementsInBucket(i); + } +} +} // namespace NKikimr diff --git a/yql/essentials/core/histogram/eq_width_histogram.h b/yql/essentials/core/histogram/eq_width_histogram.h new file mode 100644 index 00000000000..97c660af76b --- /dev/null +++ b/yql/essentials/core/histogram/eq_width_histogram.h @@ -0,0 +1,228 @@ +#pragma once + +#include <util/generic/strbuf.h> +#include <util/generic/vector.h> +#include <util/stream/output.h> +#include <util/system/types.h> +#include <cmath> + +namespace NKikimr { + + // Helper functions to work with histogram values. +template <typename T> +inline T LoadFrom(const ui8 *storage) { + T val; + std::memcpy(&val, storage, sizeof(T)); + return val; +} +template <typename T> +inline void StoreTo(ui8 *storage, T value) { + std::memcpy(storage, &value, sizeof(T)); +} +template <typename T> +inline bool CmpEqual(T left, T right) { + return left == right; +} +template <> +inline bool CmpEqual(double left, double right) { + return std::fabs(left - right) < std::numeric_limits<double>::epsilon(); +} +template <typename T> +inline bool CmpLess(T left, T right) { + return left < right; +} + +// Represents value types supported by histogram. +enum class EHistogramValueType : ui8 { Int16, Int32, Int64, Uint16, Uint32, Uint64, Double, NotSupported }; + +// Bucket storage size for Equal width histogram. +constexpr const ui32 EqWidthHistogramBucketStorageSize = 8; + +// This class represents an `Equal-width` histogram. +// Each bucket represents a range of contiguous values of equal width, and the +// aggregate summary stored in the bucket is the number of rows whose value lies +// within that range. +class TEqWidthHistogram { + public: +#pragma pack(push, 1) + struct TBucket { + // The number of values in a bucket. + ui64 Count{0}; + // The `start` value of a bucket, the `end` of the bucket is a next start. + // [start = start[i], end = start[i + 1]) + ui8 Start[EqWidthHistogramBucketStorageSize]; + }; + struct TBucketRange { + ui8 Start[EqWidthHistogramBucketStorageSize]; + ui8 End[EqWidthHistogramBucketStorageSize]; + }; +#pragma pack(pop) + + // Have to specify the number of buckets and type of the values. + TEqWidthHistogram(ui32 numBuckets = 1, EHistogramValueType type = EHistogramValueType::Int32); + // From serialized data. + TEqWidthHistogram(const char *str, ui64 size); + + // Adds the given `val` to a histogram. + template <typename T> + void AddElement(T val) { + const auto index = FindBucketIndex(val); + // The given `index` in range [0, numBuckets - 1]. + const T bucketValue = LoadFrom<T>(Buckets[index].Start); + if (!index || ((CmpEqual<T>(bucketValue, val) || CmpLess<T>(bucketValue, val)))) { + Buckets[index].Count++; + } else { + Buckets[index - 1].Count++; + } + } + + // Returns an index of the bucket which stores the given `val`. + // Returned index in range [0, numBuckets - 1]. + // Not using `std::lower_bound()` here because need an index to map to `suffix` and `prefix` sum. + template <typename T> + ui32 FindBucketIndex(T val) const { + ui32 start = 0; + ui32 end = GetNumBuckets() - 1; + while (start < end) { + auto it = start + (end - start) / 2; + if (CmpLess<T>(LoadFrom<T>(Buckets[it].Start), val)) { + start = it + 1; + } else { + end = it; + } + } + return start; + } + + // Returns a number of buckets in a histogram. + ui32 GetNumBuckets() const { return Buckets.size(); } + + template <typename T> + ui32 GetBucketWidth() const { + Y_ASSERT(GetNumBuckets()); + if (GetNumBuckets() == 1) { + return std::max(static_cast<ui32>(LoadFrom<T>(Buckets.front().Start)), 1U); + } else { + return std::max(static_cast<ui32>(LoadFrom<T>(Buckets[1].Start) - LoadFrom<T>(Buckets[0].Start)), 1U); + } + } + + template <> + ui32 GetBucketWidth<double>() const { + return 1; + } + + // Returns histogram type. + EHistogramValueType GetType() const { return ValueType; } + // Returns a number of elements in a bucket by the given `index`. + ui64 GetNumElementsInBucket(ui32 index) const { return Buckets[index].Count; } + + // Initializes buckets with a given `range`. + template <typename T> + void InitializeBuckets(const TBucketRange &range) { + Y_ASSERT(CmpLess<T>(LoadFrom<T>(range.Start), LoadFrom<T>(range.End))); + T rangeLen = LoadFrom<T>(range.End) - LoadFrom<T>(range.Start); + std::memcpy(Buckets[0].Start, range.Start, sizeof(range.Start)); + for (ui32 i = 1; i < GetNumBuckets(); ++i) { + const T prevStart = LoadFrom<T>(Buckets[i - 1].Start); + StoreTo<T>(Buckets[i].Start, prevStart + rangeLen); + } + } + + // Seriailizes to a binary representation + std::unique_ptr<char> Serialize(ui64 &binSize) const; + // Returns buckets. + const TVector<TBucket> &GetBuckets() const { return Buckets; } + + template <typename T> + void Aggregate(const TEqWidthHistogram &other) { + if ((this->ValueType != other.GetType()) || (!BucketsEqual<T>(other))) { + // Should we fail? + return; + } + for (ui32 i = 0; i < Buckets.size(); ++i) { + Buckets[i].Count += other.GetBuckets()[i].Count; + } + } + + private: + template <typename T> + bool BucketsEqual(const TEqWidthHistogram &other) { + if (Buckets.size() != other.GetNumBuckets()) { + return false; + } + for (ui32 i = 0; i < Buckets.size(); ++i) { + if (!CmpEqual<T>(LoadFrom<T>(Buckets[i].Start), LoadFrom<T>(GetBuckets()[i].Start))) { + return false; + } + } + return true; + } + + // Returns binary size of the histogram. + ui64 GetBinarySize(ui32 nBuckets) const; + EHistogramValueType ValueType; + TVector<TBucket> Buckets; +}; + +// This class represents a machinery to estimate a value in a histogram. +class TEqWidthHistogramEstimator { + public: + TEqWidthHistogramEstimator(std::shared_ptr<TEqWidthHistogram> histogram); + + // Methods to estimate values. + template <typename T> + ui64 EstimateLessOrEqual(T val) const { + return EstimateOrEqual<T>(val, PrefixSum); + } + + template <typename T> + ui64 EstimateGreaterOrEqual(T val) const { + return EstimateOrEqual<T>(val, SuffixSum); + } + + template <typename T> + ui64 EstimateLess(T val) const { + return EstimateNotEqual<T>(val, PrefixSum); + } + + template <typename T> + ui64 EstimateGreater(T val) const { + return EstimateNotEqual<T>(val, SuffixSum); + } + + template <typename T> + ui64 EstimateEqual(T val) const { + const auto index = Histogram->FindBucketIndex(val); + // Assuming uniform distribution. + return std::max(1U, static_cast<ui32>(Histogram->GetNumElementsInBucket(index) / Histogram->template GetBucketWidth<T>())); + } + + // Returns the total number elements in histogram. + // Could be used to adjust scale. + ui64 GetNumElements() const { return PrefixSum.back(); } + + private: + template <typename T> + ui64 EstimateOrEqual(T val, const TVector<ui64> &sumArray) const { + const auto index = Histogram->FindBucketIndex(val); + return sumArray[index]; + } + + template <typename T> + ui64 EstimateNotEqual(T val, const TVector<ui64> &sumArray) const { + const auto index = Histogram->FindBucketIndex(val); + // Take the previous backet if it's not the first one. + if (!index) { + return sumArray[index]; + } + return sumArray[index - 1]; + } + + void CreatePrefixSum(ui32 numBuckets); + void CreateSuffixSum(ui32 numBuckets); + std::shared_ptr<TEqWidthHistogram> Histogram; + TVector<ui64> PrefixSum; + TVector<ui64> SuffixSum; +}; +} // namespace NKikimr diff --git a/yql/essentials/core/histogram/ut/eq_width_histogram_ut.cpp b/yql/essentials/core/histogram/ut/eq_width_histogram_ut.cpp new file mode 100644 index 00000000000..9c1b1d969fe --- /dev/null +++ b/yql/essentials/core/histogram/ut/eq_width_histogram_ut.cpp @@ -0,0 +1,127 @@ +#include <library/cpp/testing/unittest/registar.h> + +#include "eq_width_histogram.h" + +namespace NKikimr { + +template <typename T> +bool EqualHistograms(const std::shared_ptr<TEqWidthHistogram> &left, const std::shared_ptr<TEqWidthHistogram> &right) { + // Not expecting any nullptr. + if (!left || !right) return false; + + if (left->GetNumBuckets() != right->GetNumBuckets()) { + return false; + } + if (left->GetType() != right->GetType()) { + return false; + } + + for (ui32 i = 0; i < left->GetNumBuckets(); ++i) { + const auto &leftBucket = left->GetBuckets()[i]; + const auto &rightBucket = right->GetBuckets()[i]; + if (leftBucket.Count != rightBucket.Count) { + return false; + } + if (!CmpEqual<T>(LoadFrom<T>(leftBucket.Start), LoadFrom<T>(rightBucket.Start))) { + return false; + } + } + + return true; +} + +template <typename T> +std::shared_ptr<TEqWidthHistogram> CreateHistogram(ui32 numBuckets, T start, T range, EHistogramValueType valueType) { + std::shared_ptr<TEqWidthHistogram> histogram(std::make_shared<TEqWidthHistogram>(numBuckets, valueType)); + TEqWidthHistogram::TBucketRange bucketRange; + StoreTo<T>(bucketRange.Start, start); + StoreTo<T>(bucketRange.End, range); + histogram->InitializeBuckets<T>(bucketRange); + return histogram; +} + +template <typename T> +void PopulateHistogram(std::shared_ptr<TEqWidthHistogram> histogram, const std::pair<ui32, ui32> &range) { + for (ui32 i = range.first; i < range.second; ++i) { + histogram->AddElement<T>(i); + } +} + +template <typename T> +void TestHistogramBasic(ui32 numBuckets, std::pair<ui32, ui32> range, std::pair<T, T> bucketRange, + EHistogramValueType valueType, std::pair<T, ui64> less, std::pair<T, ui64> greater) { + auto histogram = CreateHistogram<T>(numBuckets, bucketRange.first, bucketRange.second, valueType); + UNIT_ASSERT_VALUES_EQUAL(histogram->GetNumBuckets(), numBuckets); + PopulateHistogram<T>(histogram, range); + TEqWidthHistogramEstimator estimator(histogram); + UNIT_ASSERT_VALUES_EQUAL(estimator.EstimateLessOrEqual<T>(less.first), less.second); + UNIT_ASSERT_VALUES_EQUAL(estimator.EstimateGreaterOrEqual<T>(greater.first), greater.second); +} + +template <typename T> +void TestHistogramSerialization(ui32 numBuckets, std::pair<ui32, ui32> range, std::pair<T, T> bucketRange, + EHistogramValueType valueType) { + auto histogram = CreateHistogram<T>(numBuckets, bucketRange.first, bucketRange.second, valueType); + UNIT_ASSERT(histogram); + PopulateHistogram<T>(histogram, range); + ui64 binarySize = 0; + auto binaryData = histogram->Serialize(binarySize); + UNIT_ASSERT(binaryData && binarySize); + TString hString(binaryData.get(), binarySize); + auto histogramFromString = std::make_shared<TEqWidthHistogram>(hString.data(), hString.size()); + UNIT_ASSERT(histogramFromString); + UNIT_ASSERT(EqualHistograms<T>(histogram, histogramFromString)); +} + +template <typename T> +void TestHistogramAggregate(ui32 numBuckets, std::pair<ui32, ui32> range, std::pair<T, T> bucketRange, + EHistogramValueType valueType, ui32 numCombine, const TVector<ui64> &resultCount) { + auto histogram = CreateHistogram<T>(numBuckets, bucketRange.first, bucketRange.second, valueType); + UNIT_ASSERT(histogram); + PopulateHistogram<T>(histogram, range); + auto histogramToAdd = CreateHistogram<T>(numBuckets, bucketRange.first, bucketRange.second, valueType); + PopulateHistogram<T>(histogramToAdd, range); + UNIT_ASSERT(histogram); + for (ui32 i = 0; i < numCombine; ++i) histogram->template Aggregate<T>(*histogramToAdd); + for (ui32 i = 0; i < histogram->GetNumBuckets(); ++i) { + UNIT_ASSERT(histogram->GetBuckets()[i].Count == resultCount[i]); + } +} + +Y_UNIT_TEST_SUITE(EqWidthHistogram) { + Y_UNIT_TEST(Basic) { + TestHistogramBasic<ui32>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, EHistogramValueType::Uint32, + /*{value, result}=*/{9, 10}, + /*{value, result}=*/{10, 0}); + TestHistogramBasic<ui64>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, EHistogramValueType::Uint64, + /*{value, result}=*/{9, 10}, + /*{value, result}=*/{10, 0}); + TestHistogramBasic<i32>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, EHistogramValueType::Int32, + /*{value, result}=*/{9, 10}, + /*{value, result}=*/{10, 0}); + TestHistogramBasic<i64>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, EHistogramValueType::Int64, + /*{value, result}=*/{9, 10}, + /*{value, result}=*/{10, 0}); + TestHistogramBasic<double>(10, /*values range=*/{0.0, 10.0}, /*bucket range=*/{0.0, 2.0}, + EHistogramValueType::Double, + /*{value, result}=*/{9.0, 10}, + /*{value, result}=*/{10.0, 0}); + } + + Y_UNIT_TEST(Serialization) { + TestHistogramSerialization<ui32>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, + EHistogramValueType::Uint32); + TestHistogramSerialization<ui64>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, + EHistogramValueType::Uint64); + TestHistogramSerialization<i32>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, EHistogramValueType::Int32); + TestHistogramSerialization<i64>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, EHistogramValueType::Int64); + TestHistogramSerialization<double>(10, /*values range=*/{0.0, 10.0}, /*bucket range=*/{0.0, 2.0}, + EHistogramValueType::Double); + } + Y_UNIT_TEST(AggregateHistogram) { + TVector<ui64> resultCount{20, 20, 20, 20, 20, 0, 0, 0, 0, 0}; + TestHistogramAggregate<ui32>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, EHistogramValueType::Uint32, 9, + resultCount); + } +} +} // namespace NKikimr diff --git a/yql/essentials/core/histogram/ut/ya.make b/yql/essentials/core/histogram/ut/ya.make new file mode 100644 index 00000000000..17e420ad072 --- /dev/null +++ b/yql/essentials/core/histogram/ut/ya.make @@ -0,0 +1,8 @@ +UNITTEST_FOR(yql/essentials/core/histogram) + +SIZE(MEDIUM) +SRCS( + eq_width_histogram_ut.cpp +) + +END() diff --git a/yql/essentials/core/histogram/ya.make b/yql/essentials/core/histogram/ya.make new file mode 100644 index 00000000000..bcc309c3798 --- /dev/null +++ b/yql/essentials/core/histogram/ya.make @@ -0,0 +1,12 @@ +LIBRARY() + +SRCS( + eq_width_histogram.h + eq_width_histogram.cpp +) + +END() + +RECURSE_FOR_TESTS( + ut +) diff --git a/yql/essentials/core/minsketch/ut/ya.make b/yql/essentials/core/minsketch/ut/ya.make index 30f60c7229a..d951550facc 100644 --- a/yql/essentials/core/minsketch/ut/ya.make +++ b/yql/essentials/core/minsketch/ut/ya.make @@ -1,15 +1,6 @@ UNITTEST_FOR(yql/essentials/core/minsketch) -FORK_SUBTESTS() -IF (WITH_VALGRIND) - SPLIT_FACTOR(30) - TIMEOUT(1200) - SIZE(LARGE) - TAG(ya:fat) -ELSE() - TIMEOUT(600) - SIZE(MEDIUM) -ENDIF() +SIZE(MEDIUM) SRCS( count_min_sketch_ut.cpp diff --git a/yql/essentials/core/ya.make b/yql/essentials/core/ya.make index d1142fe083b..8626132b6f4 100644 --- a/yql/essentials/core/ya.make +++ b/yql/essentials/core/ya.make @@ -71,6 +71,7 @@ PEERDIR( yql/essentials/minikql yql/essentials/minikql/jsonpath/parser yql/essentials/core/minsketch + yql/essentials/core/histogram yql/essentials/protos yql/essentials/public/udf yql/essentials/public/udf/tz diff --git a/yql/essentials/core/yql_statistics.cpp b/yql/essentials/core/yql_statistics.cpp index f0958b8976e..cf70ad2cf55 100644 --- a/yql/essentials/core/yql_statistics.cpp +++ b/yql/essentials/core/yql_statistics.cpp @@ -189,6 +189,16 @@ std::shared_ptr<TOptimizerStatistics> NYql::OverrideStatistics(const NYql::TOpti Base64StrictDecode(countMinBase64, countMinRaw); cStat.CountMinSketch.reset(NKikimr::TCountMinSketch::FromString(countMinRaw.data(), countMinRaw.size())); } + if (auto eqWidthHistogram = colMap.find("histogram"); eqWidthHistogram != colMap.end()) { + TString histogramBase64 = eqWidthHistogram->second.GetStringSafe(); + + TString histogramBinary{}; + Base64StrictDecode(histogramBase64, histogramBinary); + auto histogram = std::make_shared<NKikimr::TEqWidthHistogram>( + histogramBinary.data(), histogramBinary.size()); + cStat.EqWidthHistogramEstimator = + std::make_shared<NKikimr::TEqWidthHistogramEstimator>(histogram); + } res->ColumnStatistics->Data[columnName] = cStat; } diff --git a/yql/essentials/core/yql_statistics.h b/yql/essentials/core/yql_statistics.h index f3875138f30..e96e8b9f50a 100644 --- a/yql/essentials/core/yql_statistics.h +++ b/yql/essentials/core/yql_statistics.h @@ -2,6 +2,7 @@ #include "yql_cost_function.h" #include <yql/essentials/core/minsketch/count_min_sketch.h> +#include <yql/essentials/core/histogram/eq_width_histogram.h> #include <library/cpp/json/json_reader.h> @@ -36,6 +37,7 @@ struct TColumnStatistics { std::optional<double> NumUniqueVals; std::optional<double> HyperLogLog; std::shared_ptr<NKikimr::TCountMinSketch> CountMinSketch; + std::shared_ptr<NKikimr::TEqWidthHistogramEstimator> EqWidthHistogramEstimator; TString Type; TColumnStatistics() {} |