summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authordeniskhalikov <[email protected]>2025-03-31 15:42:14 +0300
committerdeniskhalikov <[email protected]>2025-03-31 15:58:39 +0300
commit06c199b566aca570faf8661b96ea873a229f923a (patch)
tree63cb078ebb2702770eb89056fba85fa2b6a9a7db
parent3e9cb82822c4878edee4ef5852b16116424db357 (diff)
Add equal width histogram.
This patch add equal width histogram for use in assessing predicate selectivity for CBO. commit_hash:8d2d3a7f89fb9b2261af1b6ba201e8d2a8b098b7
-rw-r--r--yql/essentials/core/histogram/eq_width_histogram.cpp73
-rw-r--r--yql/essentials/core/histogram/eq_width_histogram.h228
-rw-r--r--yql/essentials/core/histogram/ut/eq_width_histogram_ut.cpp127
-rw-r--r--yql/essentials/core/histogram/ut/ya.make8
-rw-r--r--yql/essentials/core/histogram/ya.make12
-rw-r--r--yql/essentials/core/minsketch/ut/ya.make11
-rw-r--r--yql/essentials/core/ya.make1
-rw-r--r--yql/essentials/core/yql_statistics.cpp10
-rw-r--r--yql/essentials/core/yql_statistics.h2
9 files changed, 462 insertions, 10 deletions
diff --git a/yql/essentials/core/histogram/eq_width_histogram.cpp b/yql/essentials/core/histogram/eq_width_histogram.cpp
new file mode 100644
index 00000000000..3c5a452fdbd
--- /dev/null
+++ b/yql/essentials/core/histogram/eq_width_histogram.cpp
@@ -0,0 +1,73 @@
+#include "eq_width_histogram.h"
+
+namespace NKikimr {
+
+TEqWidthHistogram::TEqWidthHistogram(ui32 numBuckets, EHistogramValueType valueType)
+ : ValueType(valueType), Buckets(numBuckets) {
+ // Exptected at least one bucket for histogram.
+ Y_ASSERT(numBuckets >= 1);
+}
+
+TEqWidthHistogram::TEqWidthHistogram(const char *str, ui64 size) {
+ Y_ASSERT(str && size);
+ const ui32 numBuckets = *reinterpret_cast<const ui32 *>(str);
+ Y_ABORT_UNLESS(GetBinarySize(numBuckets) == size);
+ ui32 offset = sizeof(ui32);
+ ValueType = *reinterpret_cast<const EHistogramValueType *>(str + offset);
+ offset += sizeof(EHistogramValueType);
+ Buckets = TVector<TBucket>(numBuckets);
+ for (ui32 i = 0; i < numBuckets; ++i) {
+ std::memcpy(&Buckets[i], reinterpret_cast<const char *>(str + offset), sizeof(TBucket));
+ offset += sizeof(TBucket);
+ }
+}
+
+ui64 TEqWidthHistogram::GetBinarySize(ui32 nBuckets) const {
+ return sizeof(ui32) + sizeof(EHistogramValueType) + sizeof(TBucket) * nBuckets;
+}
+
+// Binary layout:
+// [4 byte: number of buckets][1 byte: value type]
+// [sizeof(Bucket)[0]... sizeof(Bucket)[n]].
+std::unique_ptr<char> TEqWidthHistogram::Serialize(ui64 &binarySize) const {
+ binarySize = GetBinarySize(GetNumBuckets());
+ std::unique_ptr<char> binaryData(new char[binarySize]);
+ ui32 offset = 0;
+ const ui32 numBuckets = GetNumBuckets();
+ // 4 byte - number of buckets.
+ std::memcpy(binaryData.get(), &numBuckets, sizeof(ui32));
+ offset += sizeof(ui32);
+ // 1 byte - values type.
+ std::memcpy(binaryData.get() + offset, &ValueType, sizeof(EHistogramValueType));
+ offset += sizeof(EHistogramValueType);
+ // Buckets.
+ for (ui32 i = 0; i < numBuckets; ++i) {
+ std::memcpy(binaryData.get() + offset, &Buckets[i], sizeof(TBucket));
+ offset += sizeof(TBucket);
+ }
+ return binaryData;
+}
+
+TEqWidthHistogramEstimator::TEqWidthHistogramEstimator(std::shared_ptr<TEqWidthHistogram> histogram)
+ : Histogram(histogram) {
+ const auto numBuckets = Histogram->GetNumBuckets();
+ PrefixSum = TVector<ui64>(numBuckets);
+ SuffixSum = TVector<ui64>(numBuckets);
+ CreatePrefixSum(numBuckets);
+ CreateSuffixSum(numBuckets);
+}
+
+void TEqWidthHistogramEstimator::CreatePrefixSum(ui32 numBuckets) {
+ PrefixSum[0] = Histogram->GetNumElementsInBucket(0);
+ for (ui32 i = 1; i < numBuckets; ++i) {
+ PrefixSum[i] = PrefixSum[i - 1] + Histogram->GetNumElementsInBucket(i);
+ }
+}
+
+void TEqWidthHistogramEstimator::CreateSuffixSum(ui32 numBuckets) {
+ SuffixSum[numBuckets - 1] = Histogram->GetNumElementsInBucket(numBuckets - 1);
+ for (i32 i = static_cast<i32>(numBuckets) - 2; i >= 0; --i) {
+ SuffixSum[i] = SuffixSum[i + 1] + Histogram->GetNumElementsInBucket(i);
+ }
+}
+} // namespace NKikimr
diff --git a/yql/essentials/core/histogram/eq_width_histogram.h b/yql/essentials/core/histogram/eq_width_histogram.h
new file mode 100644
index 00000000000..97c660af76b
--- /dev/null
+++ b/yql/essentials/core/histogram/eq_width_histogram.h
@@ -0,0 +1,228 @@
+#pragma once
+
+#include <util/generic/strbuf.h>
+#include <util/generic/vector.h>
+#include <util/stream/output.h>
+#include <util/system/types.h>
+#include <cmath>
+
+namespace NKikimr {
+
+ // Helper functions to work with histogram values.
+template <typename T>
+inline T LoadFrom(const ui8 *storage) {
+ T val;
+ std::memcpy(&val, storage, sizeof(T));
+ return val;
+}
+template <typename T>
+inline void StoreTo(ui8 *storage, T value) {
+ std::memcpy(storage, &value, sizeof(T));
+}
+template <typename T>
+inline bool CmpEqual(T left, T right) {
+ return left == right;
+}
+template <>
+inline bool CmpEqual(double left, double right) {
+ return std::fabs(left - right) < std::numeric_limits<double>::epsilon();
+}
+template <typename T>
+inline bool CmpLess(T left, T right) {
+ return left < right;
+}
+
+// Represents value types supported by histogram.
+enum class EHistogramValueType : ui8 { Int16, Int32, Int64, Uint16, Uint32, Uint64, Double, NotSupported };
+
+// Bucket storage size for Equal width histogram.
+constexpr const ui32 EqWidthHistogramBucketStorageSize = 8;
+
+// This class represents an `Equal-width` histogram.
+// Each bucket represents a range of contiguous values of equal width, and the
+// aggregate summary stored in the bucket is the number of rows whose value lies
+// within that range.
+class TEqWidthHistogram {
+ public:
+#pragma pack(push, 1)
+ struct TBucket {
+ // The number of values in a bucket.
+ ui64 Count{0};
+ // The `start` value of a bucket, the `end` of the bucket is a next start.
+ // [start = start[i], end = start[i + 1])
+ ui8 Start[EqWidthHistogramBucketStorageSize];
+ };
+ struct TBucketRange {
+ ui8 Start[EqWidthHistogramBucketStorageSize];
+ ui8 End[EqWidthHistogramBucketStorageSize];
+ };
+#pragma pack(pop)
+
+ // Have to specify the number of buckets and type of the values.
+ TEqWidthHistogram(ui32 numBuckets = 1, EHistogramValueType type = EHistogramValueType::Int32);
+ // From serialized data.
+ TEqWidthHistogram(const char *str, ui64 size);
+
+ // Adds the given `val` to a histogram.
+ template <typename T>
+ void AddElement(T val) {
+ const auto index = FindBucketIndex(val);
+ // The given `index` in range [0, numBuckets - 1].
+ const T bucketValue = LoadFrom<T>(Buckets[index].Start);
+ if (!index || ((CmpEqual<T>(bucketValue, val) || CmpLess<T>(bucketValue, val)))) {
+ Buckets[index].Count++;
+ } else {
+ Buckets[index - 1].Count++;
+ }
+ }
+
+ // Returns an index of the bucket which stores the given `val`.
+ // Returned index in range [0, numBuckets - 1].
+ // Not using `std::lower_bound()` here because need an index to map to `suffix` and `prefix` sum.
+ template <typename T>
+ ui32 FindBucketIndex(T val) const {
+ ui32 start = 0;
+ ui32 end = GetNumBuckets() - 1;
+ while (start < end) {
+ auto it = start + (end - start) / 2;
+ if (CmpLess<T>(LoadFrom<T>(Buckets[it].Start), val)) {
+ start = it + 1;
+ } else {
+ end = it;
+ }
+ }
+ return start;
+ }
+
+ // Returns a number of buckets in a histogram.
+ ui32 GetNumBuckets() const { return Buckets.size(); }
+
+ template <typename T>
+ ui32 GetBucketWidth() const {
+ Y_ASSERT(GetNumBuckets());
+ if (GetNumBuckets() == 1) {
+ return std::max(static_cast<ui32>(LoadFrom<T>(Buckets.front().Start)), 1U);
+ } else {
+ return std::max(static_cast<ui32>(LoadFrom<T>(Buckets[1].Start) - LoadFrom<T>(Buckets[0].Start)), 1U);
+ }
+ }
+
+ template <>
+ ui32 GetBucketWidth<double>() const {
+ return 1;
+ }
+
+ // Returns histogram type.
+ EHistogramValueType GetType() const { return ValueType; }
+ // Returns a number of elements in a bucket by the given `index`.
+ ui64 GetNumElementsInBucket(ui32 index) const { return Buckets[index].Count; }
+
+ // Initializes buckets with a given `range`.
+ template <typename T>
+ void InitializeBuckets(const TBucketRange &range) {
+ Y_ASSERT(CmpLess<T>(LoadFrom<T>(range.Start), LoadFrom<T>(range.End)));
+ T rangeLen = LoadFrom<T>(range.End) - LoadFrom<T>(range.Start);
+ std::memcpy(Buckets[0].Start, range.Start, sizeof(range.Start));
+ for (ui32 i = 1; i < GetNumBuckets(); ++i) {
+ const T prevStart = LoadFrom<T>(Buckets[i - 1].Start);
+ StoreTo<T>(Buckets[i].Start, prevStart + rangeLen);
+ }
+ }
+
+ // Seriailizes to a binary representation
+ std::unique_ptr<char> Serialize(ui64 &binSize) const;
+ // Returns buckets.
+ const TVector<TBucket> &GetBuckets() const { return Buckets; }
+
+ template <typename T>
+ void Aggregate(const TEqWidthHistogram &other) {
+ if ((this->ValueType != other.GetType()) || (!BucketsEqual<T>(other))) {
+ // Should we fail?
+ return;
+ }
+ for (ui32 i = 0; i < Buckets.size(); ++i) {
+ Buckets[i].Count += other.GetBuckets()[i].Count;
+ }
+ }
+
+ private:
+ template <typename T>
+ bool BucketsEqual(const TEqWidthHistogram &other) {
+ if (Buckets.size() != other.GetNumBuckets()) {
+ return false;
+ }
+ for (ui32 i = 0; i < Buckets.size(); ++i) {
+ if (!CmpEqual<T>(LoadFrom<T>(Buckets[i].Start), LoadFrom<T>(GetBuckets()[i].Start))) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Returns binary size of the histogram.
+ ui64 GetBinarySize(ui32 nBuckets) const;
+ EHistogramValueType ValueType;
+ TVector<TBucket> Buckets;
+};
+
+// This class represents a machinery to estimate a value in a histogram.
+class TEqWidthHistogramEstimator {
+ public:
+ TEqWidthHistogramEstimator(std::shared_ptr<TEqWidthHistogram> histogram);
+
+ // Methods to estimate values.
+ template <typename T>
+ ui64 EstimateLessOrEqual(T val) const {
+ return EstimateOrEqual<T>(val, PrefixSum);
+ }
+
+ template <typename T>
+ ui64 EstimateGreaterOrEqual(T val) const {
+ return EstimateOrEqual<T>(val, SuffixSum);
+ }
+
+ template <typename T>
+ ui64 EstimateLess(T val) const {
+ return EstimateNotEqual<T>(val, PrefixSum);
+ }
+
+ template <typename T>
+ ui64 EstimateGreater(T val) const {
+ return EstimateNotEqual<T>(val, SuffixSum);
+ }
+
+ template <typename T>
+ ui64 EstimateEqual(T val) const {
+ const auto index = Histogram->FindBucketIndex(val);
+ // Assuming uniform distribution.
+ return std::max(1U, static_cast<ui32>(Histogram->GetNumElementsInBucket(index) / Histogram->template GetBucketWidth<T>()));
+ }
+
+ // Returns the total number elements in histogram.
+ // Could be used to adjust scale.
+ ui64 GetNumElements() const { return PrefixSum.back(); }
+
+ private:
+ template <typename T>
+ ui64 EstimateOrEqual(T val, const TVector<ui64> &sumArray) const {
+ const auto index = Histogram->FindBucketIndex(val);
+ return sumArray[index];
+ }
+
+ template <typename T>
+ ui64 EstimateNotEqual(T val, const TVector<ui64> &sumArray) const {
+ const auto index = Histogram->FindBucketIndex(val);
+ // Take the previous backet if it's not the first one.
+ if (!index) {
+ return sumArray[index];
+ }
+ return sumArray[index - 1];
+ }
+
+ void CreatePrefixSum(ui32 numBuckets);
+ void CreateSuffixSum(ui32 numBuckets);
+ std::shared_ptr<TEqWidthHistogram> Histogram;
+ TVector<ui64> PrefixSum;
+ TVector<ui64> SuffixSum;
+};
+} // namespace NKikimr
diff --git a/yql/essentials/core/histogram/ut/eq_width_histogram_ut.cpp b/yql/essentials/core/histogram/ut/eq_width_histogram_ut.cpp
new file mode 100644
index 00000000000..9c1b1d969fe
--- /dev/null
+++ b/yql/essentials/core/histogram/ut/eq_width_histogram_ut.cpp
@@ -0,0 +1,127 @@
+#include <library/cpp/testing/unittest/registar.h>
+
+#include "eq_width_histogram.h"
+
+namespace NKikimr {
+
+template <typename T>
+bool EqualHistograms(const std::shared_ptr<TEqWidthHistogram> &left, const std::shared_ptr<TEqWidthHistogram> &right) {
+ // Not expecting any nullptr.
+ if (!left || !right) return false;
+
+ if (left->GetNumBuckets() != right->GetNumBuckets()) {
+ return false;
+ }
+ if (left->GetType() != right->GetType()) {
+ return false;
+ }
+
+ for (ui32 i = 0; i < left->GetNumBuckets(); ++i) {
+ const auto &leftBucket = left->GetBuckets()[i];
+ const auto &rightBucket = right->GetBuckets()[i];
+ if (leftBucket.Count != rightBucket.Count) {
+ return false;
+ }
+ if (!CmpEqual<T>(LoadFrom<T>(leftBucket.Start), LoadFrom<T>(rightBucket.Start))) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+template <typename T>
+std::shared_ptr<TEqWidthHistogram> CreateHistogram(ui32 numBuckets, T start, T range, EHistogramValueType valueType) {
+ std::shared_ptr<TEqWidthHistogram> histogram(std::make_shared<TEqWidthHistogram>(numBuckets, valueType));
+ TEqWidthHistogram::TBucketRange bucketRange;
+ StoreTo<T>(bucketRange.Start, start);
+ StoreTo<T>(bucketRange.End, range);
+ histogram->InitializeBuckets<T>(bucketRange);
+ return histogram;
+}
+
+template <typename T>
+void PopulateHistogram(std::shared_ptr<TEqWidthHistogram> histogram, const std::pair<ui32, ui32> &range) {
+ for (ui32 i = range.first; i < range.second; ++i) {
+ histogram->AddElement<T>(i);
+ }
+}
+
+template <typename T>
+void TestHistogramBasic(ui32 numBuckets, std::pair<ui32, ui32> range, std::pair<T, T> bucketRange,
+ EHistogramValueType valueType, std::pair<T, ui64> less, std::pair<T, ui64> greater) {
+ auto histogram = CreateHistogram<T>(numBuckets, bucketRange.first, bucketRange.second, valueType);
+ UNIT_ASSERT_VALUES_EQUAL(histogram->GetNumBuckets(), numBuckets);
+ PopulateHistogram<T>(histogram, range);
+ TEqWidthHistogramEstimator estimator(histogram);
+ UNIT_ASSERT_VALUES_EQUAL(estimator.EstimateLessOrEqual<T>(less.first), less.second);
+ UNIT_ASSERT_VALUES_EQUAL(estimator.EstimateGreaterOrEqual<T>(greater.first), greater.second);
+}
+
+template <typename T>
+void TestHistogramSerialization(ui32 numBuckets, std::pair<ui32, ui32> range, std::pair<T, T> bucketRange,
+ EHistogramValueType valueType) {
+ auto histogram = CreateHistogram<T>(numBuckets, bucketRange.first, bucketRange.second, valueType);
+ UNIT_ASSERT(histogram);
+ PopulateHistogram<T>(histogram, range);
+ ui64 binarySize = 0;
+ auto binaryData = histogram->Serialize(binarySize);
+ UNIT_ASSERT(binaryData && binarySize);
+ TString hString(binaryData.get(), binarySize);
+ auto histogramFromString = std::make_shared<TEqWidthHistogram>(hString.data(), hString.size());
+ UNIT_ASSERT(histogramFromString);
+ UNIT_ASSERT(EqualHistograms<T>(histogram, histogramFromString));
+}
+
+template <typename T>
+void TestHistogramAggregate(ui32 numBuckets, std::pair<ui32, ui32> range, std::pair<T, T> bucketRange,
+ EHistogramValueType valueType, ui32 numCombine, const TVector<ui64> &resultCount) {
+ auto histogram = CreateHistogram<T>(numBuckets, bucketRange.first, bucketRange.second, valueType);
+ UNIT_ASSERT(histogram);
+ PopulateHistogram<T>(histogram, range);
+ auto histogramToAdd = CreateHistogram<T>(numBuckets, bucketRange.first, bucketRange.second, valueType);
+ PopulateHistogram<T>(histogramToAdd, range);
+ UNIT_ASSERT(histogram);
+ for (ui32 i = 0; i < numCombine; ++i) histogram->template Aggregate<T>(*histogramToAdd);
+ for (ui32 i = 0; i < histogram->GetNumBuckets(); ++i) {
+ UNIT_ASSERT(histogram->GetBuckets()[i].Count == resultCount[i]);
+ }
+}
+
+Y_UNIT_TEST_SUITE(EqWidthHistogram) {
+ Y_UNIT_TEST(Basic) {
+ TestHistogramBasic<ui32>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, EHistogramValueType::Uint32,
+ /*{value, result}=*/{9, 10},
+ /*{value, result}=*/{10, 0});
+ TestHistogramBasic<ui64>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, EHistogramValueType::Uint64,
+ /*{value, result}=*/{9, 10},
+ /*{value, result}=*/{10, 0});
+ TestHistogramBasic<i32>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, EHistogramValueType::Int32,
+ /*{value, result}=*/{9, 10},
+ /*{value, result}=*/{10, 0});
+ TestHistogramBasic<i64>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, EHistogramValueType::Int64,
+ /*{value, result}=*/{9, 10},
+ /*{value, result}=*/{10, 0});
+ TestHistogramBasic<double>(10, /*values range=*/{0.0, 10.0}, /*bucket range=*/{0.0, 2.0},
+ EHistogramValueType::Double,
+ /*{value, result}=*/{9.0, 10},
+ /*{value, result}=*/{10.0, 0});
+ }
+
+ Y_UNIT_TEST(Serialization) {
+ TestHistogramSerialization<ui32>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2},
+ EHistogramValueType::Uint32);
+ TestHistogramSerialization<ui64>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2},
+ EHistogramValueType::Uint64);
+ TestHistogramSerialization<i32>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, EHistogramValueType::Int32);
+ TestHistogramSerialization<i64>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, EHistogramValueType::Int64);
+ TestHistogramSerialization<double>(10, /*values range=*/{0.0, 10.0}, /*bucket range=*/{0.0, 2.0},
+ EHistogramValueType::Double);
+ }
+ Y_UNIT_TEST(AggregateHistogram) {
+ TVector<ui64> resultCount{20, 20, 20, 20, 20, 0, 0, 0, 0, 0};
+ TestHistogramAggregate<ui32>(10, /*values range=*/{0, 10}, /*bucket range=*/{0, 2}, EHistogramValueType::Uint32, 9,
+ resultCount);
+ }
+}
+} // namespace NKikimr
diff --git a/yql/essentials/core/histogram/ut/ya.make b/yql/essentials/core/histogram/ut/ya.make
new file mode 100644
index 00000000000..17e420ad072
--- /dev/null
+++ b/yql/essentials/core/histogram/ut/ya.make
@@ -0,0 +1,8 @@
+UNITTEST_FOR(yql/essentials/core/histogram)
+
+SIZE(MEDIUM)
+SRCS(
+ eq_width_histogram_ut.cpp
+)
+
+END()
diff --git a/yql/essentials/core/histogram/ya.make b/yql/essentials/core/histogram/ya.make
new file mode 100644
index 00000000000..bcc309c3798
--- /dev/null
+++ b/yql/essentials/core/histogram/ya.make
@@ -0,0 +1,12 @@
+LIBRARY()
+
+SRCS(
+ eq_width_histogram.h
+ eq_width_histogram.cpp
+)
+
+END()
+
+RECURSE_FOR_TESTS(
+ ut
+)
diff --git a/yql/essentials/core/minsketch/ut/ya.make b/yql/essentials/core/minsketch/ut/ya.make
index 30f60c7229a..d951550facc 100644
--- a/yql/essentials/core/minsketch/ut/ya.make
+++ b/yql/essentials/core/minsketch/ut/ya.make
@@ -1,15 +1,6 @@
UNITTEST_FOR(yql/essentials/core/minsketch)
-FORK_SUBTESTS()
-IF (WITH_VALGRIND)
- SPLIT_FACTOR(30)
- TIMEOUT(1200)
- SIZE(LARGE)
- TAG(ya:fat)
-ELSE()
- TIMEOUT(600)
- SIZE(MEDIUM)
-ENDIF()
+SIZE(MEDIUM)
SRCS(
count_min_sketch_ut.cpp
diff --git a/yql/essentials/core/ya.make b/yql/essentials/core/ya.make
index d1142fe083b..8626132b6f4 100644
--- a/yql/essentials/core/ya.make
+++ b/yql/essentials/core/ya.make
@@ -71,6 +71,7 @@ PEERDIR(
yql/essentials/minikql
yql/essentials/minikql/jsonpath/parser
yql/essentials/core/minsketch
+ yql/essentials/core/histogram
yql/essentials/protos
yql/essentials/public/udf
yql/essentials/public/udf/tz
diff --git a/yql/essentials/core/yql_statistics.cpp b/yql/essentials/core/yql_statistics.cpp
index f0958b8976e..cf70ad2cf55 100644
--- a/yql/essentials/core/yql_statistics.cpp
+++ b/yql/essentials/core/yql_statistics.cpp
@@ -189,6 +189,16 @@ std::shared_ptr<TOptimizerStatistics> NYql::OverrideStatistics(const NYql::TOpti
Base64StrictDecode(countMinBase64, countMinRaw);
cStat.CountMinSketch.reset(NKikimr::TCountMinSketch::FromString(countMinRaw.data(), countMinRaw.size()));
}
+ if (auto eqWidthHistogram = colMap.find("histogram"); eqWidthHistogram != colMap.end()) {
+ TString histogramBase64 = eqWidthHistogram->second.GetStringSafe();
+
+ TString histogramBinary{};
+ Base64StrictDecode(histogramBase64, histogramBinary);
+ auto histogram = std::make_shared<NKikimr::TEqWidthHistogram>(
+ histogramBinary.data(), histogramBinary.size());
+ cStat.EqWidthHistogramEstimator =
+ std::make_shared<NKikimr::TEqWidthHistogramEstimator>(histogram);
+ }
res->ColumnStatistics->Data[columnName] = cStat;
}
diff --git a/yql/essentials/core/yql_statistics.h b/yql/essentials/core/yql_statistics.h
index f3875138f30..e96e8b9f50a 100644
--- a/yql/essentials/core/yql_statistics.h
+++ b/yql/essentials/core/yql_statistics.h
@@ -2,6 +2,7 @@
#include "yql_cost_function.h"
#include <yql/essentials/core/minsketch/count_min_sketch.h>
+#include <yql/essentials/core/histogram/eq_width_histogram.h>
#include <library/cpp/json/json_reader.h>
@@ -36,6 +37,7 @@ struct TColumnStatistics {
std::optional<double> NumUniqueVals;
std::optional<double> HyperLogLog;
std::shared_ptr<NKikimr::TCountMinSketch> CountMinSketch;
+ std::shared_ptr<NKikimr::TEqWidthHistogramEstimator> EqWidthHistogramEstimator;
TString Type;
TColumnStatistics() {}