aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/histogram
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/histogram
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/histogram')
-rw-r--r--library/cpp/histogram/adaptive/README.md1
-rw-r--r--library/cpp/histogram/adaptive/adaptive_histogram.cpp637
-rw-r--r--library/cpp/histogram/adaptive/adaptive_histogram.h131
-rw-r--r--library/cpp/histogram/adaptive/auto_histogram.h148
-rw-r--r--library/cpp/histogram/adaptive/block_histogram.cpp593
-rw-r--r--library/cpp/histogram/adaptive/block_histogram.h148
-rw-r--r--library/cpp/histogram/adaptive/common.cpp19
-rw-r--r--library/cpp/histogram/adaptive/common.h12
-rw-r--r--library/cpp/histogram/adaptive/fixed_bin_histogram.cpp538
-rw-r--r--library/cpp/histogram/adaptive/fixed_bin_histogram.h91
-rw-r--r--library/cpp/histogram/adaptive/histogram.h66
-rw-r--r--library/cpp/histogram/adaptive/merger.h68
-rw-r--r--library/cpp/histogram/adaptive/multi_histogram.h143
-rw-r--r--library/cpp/histogram/adaptive/protos/histo.proto36
-rw-r--r--library/cpp/histogram/adaptive/protos/python/ya.make3
-rw-r--r--library/cpp/histogram/adaptive/protos/ya.make13
-rw-r--r--library/cpp/histogram/adaptive/ya.make20
-rw-r--r--library/cpp/histogram/ya.make9
18 files changed, 2676 insertions, 0 deletions
diff --git a/library/cpp/histogram/adaptive/README.md b/library/cpp/histogram/adaptive/README.md
new file mode 100644
index 0000000000..17a4edbe64
--- /dev/null
+++ b/library/cpp/histogram/adaptive/README.md
@@ -0,0 +1 @@
+See https://wiki.yandex-team.ru/robot/newdesign/histograms/
diff --git a/library/cpp/histogram/adaptive/adaptive_histogram.cpp b/library/cpp/histogram/adaptive/adaptive_histogram.cpp
new file mode 100644
index 0000000000..cbfc494021
--- /dev/null
+++ b/library/cpp/histogram/adaptive/adaptive_histogram.cpp
@@ -0,0 +1,637 @@
+#include "adaptive_histogram.h"
+
+#include <util/generic/algorithm.h>
+#include <util/generic/yexception.h>
+#include <util/generic/ymath.h>
+#include <util/string/printf.h>
+
+#include <util/system/backtrace.h>
+
+namespace NKiwiAggr {
+ TAdaptiveHistogram::TAdaptiveHistogram(size_t intervals, ui64 id, TQualityFunction qualityFunc)
+ : Id(id)
+ , Sum(0.0)
+ , Intervals(intervals)
+ , CalcQuality(qualityFunc)
+ {
+ }
+
+ TAdaptiveHistogram::TAdaptiveHistogram(const THistogram& histo, size_t defaultIntervals, ui64 defaultId, TQualityFunction qualityFunc)
+ : TAdaptiveHistogram(defaultIntervals, defaultId, qualityFunc)
+ {
+ FromProto(histo);
+ }
+
+ TAdaptiveHistogram::TAdaptiveHistogram(IHistogram* histo, size_t defaultIntervals, ui64 defaultId, TQualityFunction qualityFunc)
+ : TAdaptiveHistogram(defaultIntervals, defaultId, qualityFunc)
+ {
+ TAdaptiveHistogram* adaptiveHisto = dynamic_cast<TAdaptiveHistogram*>(histo);
+ if (!adaptiveHisto) {
+ FromIHistogram(histo);
+ return;
+ }
+ Id = adaptiveHisto->Id;
+ MinValue = adaptiveHisto->MinValue;
+ MaxValue = adaptiveHisto->MaxValue;
+ Sum = adaptiveHisto->Sum;
+ Intervals = adaptiveHisto->Intervals;
+ Bins = adaptiveHisto->Bins;
+ BinsByQuality = adaptiveHisto->BinsByQuality;
+ if (CalcQuality == nullptr) {
+ CalcQuality = adaptiveHisto->CalcQuality;
+ }
+ }
+
+ TQualityFunction TAdaptiveHistogram::GetQualityFunc() {
+ return CalcQuality;
+ }
+
+ void TAdaptiveHistogram::Clear() {
+ Sum = 0.0;
+ Bins.clear();
+ BinsByQuality.clear();
+ }
+
+ void TAdaptiveHistogram::Add(const THistoRec& histoRec) {
+ if (!histoRec.HasId() || histoRec.GetId() == Id) {
+ Add(histoRec.GetValue(), histoRec.GetWeight());
+ }
+ }
+
+ void TAdaptiveHistogram::Add(double value, double weight) {
+ if (!IsValidFloat(value) || !IsValidFloat(weight)) {
+ ythrow yexception() << Sprintf("Histogram id %lu: bad value %f weight %f", Id, value, weight);
+ }
+ TWeightedValue weightedValue(value, weight);
+ Add(weightedValue, true);
+ PrecomputedBins.clear();
+ }
+
+ void TAdaptiveHistogram::Merge(const THistogram& histo, double multiplier) {
+ if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue())) {
+ fprintf(stderr, "Merging in histogram id %lu: skip bad histo with minvalue %f maxvalue %f\n", Id, histo.GetMinValue(), histo.GetMaxValue());
+ return;
+ }
+ if (histo.FreqSize() == 0) {
+ return; // skip empty histos
+ }
+ if (histo.GetType() == HT_ADAPTIVE_DISTANCE_HISTOGRAM ||
+ histo.GetType() == HT_ADAPTIVE_WEIGHT_HISTOGRAM ||
+ histo.GetType() == HT_ADAPTIVE_WARD_HISTOGRAM ||
+ histo.GetType() == HT_ADAPTIVE_HISTOGRAM)
+ {
+ Y_VERIFY(histo.FreqSize() == histo.PositionSize(), "Corrupted histo");
+ for (size_t j = 0; j < histo.FreqSize(); ++j) {
+ double value = histo.GetPosition(j);
+ double weight = histo.GetFreq(j);
+ if (!IsValidFloat(value) || !IsValidFloat(weight)) {
+ fprintf(stderr, "Merging in histogram id %lu: skip bad value %f weight %f\n", Id, value, weight);
+ continue;
+ }
+ Add(value, weight * multiplier);
+ }
+
+ MinValue = Min(MinValue, histo.GetMinValue());
+ MaxValue = Max(MaxValue, histo.GetMaxValue());
+ } else if (histo.GetType() == HT_FIXED_BIN_HISTOGRAM) {
+ double pos = histo.GetMinValue() + histo.GetBinRange() / 2.0;
+ for (size_t j = 0; j < histo.FreqSize(); ++j) {
+ double weight = histo.GetFreq(j);
+ if (!IsValidFloat(pos) || !IsValidFloat(weight)) {
+ fprintf(stderr, "Merging in histogram id %lu: skip bad value %f weight %f\n", Id, pos, weight);
+ pos += histo.GetBinRange();
+ continue;
+ }
+ Add(pos, weight * multiplier);
+ pos += histo.GetBinRange();
+ }
+
+ MinValue = Min(MinValue, histo.GetMinValue());
+ MaxValue = Max(MaxValue, histo.GetMaxValue());
+ } else {
+ ythrow yexception() << "Unknown THistogram type";
+ }
+ }
+
+ void TAdaptiveHistogram::Merge(const TVector<THistogram>& histogramsToMerge) {
+ for (size_t i = 0; i < histogramsToMerge.size(); ++i) {
+ Merge(histogramsToMerge[i], 1.0);
+ }
+ }
+
+ void TAdaptiveHistogram::Merge(TVector<IHistogramPtr> histogramsToMerge) {
+ TVector<IHistogramPtr> histogramsToMergeRepacked(0);
+ TVector<TAdaptiveHistogram*> histograms(0);
+ for (size_t i = 0; i < histogramsToMerge.size(); ++i) {
+ if (!histogramsToMerge[i] || histogramsToMerge[i]->Empty()) {
+ continue;
+ }
+ TAdaptiveHistogram* adaptiveHisto = dynamic_cast<TAdaptiveHistogram*>(histogramsToMerge[i].Get());
+ if (adaptiveHisto) {
+ histogramsToMergeRepacked.push_back(histogramsToMerge[i]);
+ } else {
+ histogramsToMergeRepacked.push_back(IHistogramPtr(new TAdaptiveHistogram(histogramsToMerge[i].Get(), Intervals, Id, CalcQuality))); // Convert histograms that are not of TFixedBinHistogram type
+ }
+ if (histogramsToMergeRepacked.back()->Empty()) {
+ continue;
+ }
+ histograms.push_back(dynamic_cast<TAdaptiveHistogram*>(histogramsToMergeRepacked.back().Get()));
+ }
+
+ if (histograms.size() == 0) {
+ return;
+ }
+
+ for (size_t histoIndex = 0; histoIndex < histograms.size(); ++histoIndex) {
+ TAdaptiveHistogram* histo = histograms[histoIndex];
+ for (TPairSet::const_iterator it = histo->Bins.begin(); it != histo->Bins.end(); ++it) {
+ Add(*it, true);
+ }
+ }
+
+ for (size_t i = 0; i < histograms.size(); ++i) {
+ MinValue = Min(MinValue, histograms[i]->MinValue);
+ MaxValue = Max(MaxValue, histograms[i]->MaxValue);
+ }
+ }
+
+ void TAdaptiveHistogram::Multiply(double factor) {
+ if (!IsValidFloat(factor) || factor <= 0) {
+ ythrow yexception() << "Not valid factor in IHistogram::Multiply(): " << factor;
+ }
+ Sum *= factor;
+
+ TPairSet newBins;
+ for (TPairSet::iterator it = Bins.begin(); it != Bins.end(); ++it) {
+ newBins.insert(TWeightedValue(it->first, it->second * factor));
+ }
+ Bins = newBins;
+
+ TPairSet newBinsByQuality;
+ for (TPairSet::iterator it = Bins.begin(); it != Bins.end(); ++it) {
+ TPairSet::iterator rightBin = it;
+ ++rightBin;
+ if (rightBin == Bins.end()) {
+ break;
+ }
+ newBinsByQuality.insert(CalcQuality(*it, *rightBin));
+ }
+ BinsByQuality = newBinsByQuality;
+ }
+
+ void TAdaptiveHistogram::FromProto(const THistogram& histo) {
+ Y_VERIFY(histo.HasType(), "Attempt to parse TAdaptiveHistogram from THistogram protobuf with no Type field set");
+ ;
+ switch (histo.GetType()) { // check that histogram type could be deduced
+ case HT_ADAPTIVE_DISTANCE_HISTOGRAM:
+ case HT_ADAPTIVE_WEIGHT_HISTOGRAM:
+ case HT_ADAPTIVE_WARD_HISTOGRAM:
+ break; // ok
+ case HT_ADAPTIVE_HISTOGRAM:
+ if (CalcQuality != nullptr)
+ break; // ok
+ [[fallthrough]];
+ default: // not ok
+ ythrow yexception() << "Attempt to parse TAdaptiveHistogram from THistogram protobuf record of type = " << (ui32)histo.GetType();
+ }
+
+ if (histo.FreqSize() != histo.PositionSize()) {
+ ythrow yexception() << "Attempt to parse TAdaptiveHistogram from THistogram protobuf record where FreqSize != PositionSize. FreqSize == " << (ui32)histo.FreqSize() << ", PositionSize == " << (ui32)histo.PositionSize();
+ }
+ if (CalcQuality == nullptr) {
+ if (histo.GetType() == HT_ADAPTIVE_DISTANCE_HISTOGRAM) {
+ CalcQuality = CalcDistanceQuality;
+ } else if (histo.GetType() == HT_ADAPTIVE_WEIGHT_HISTOGRAM) {
+ CalcQuality = CalcWeightQuality;
+ } else if (histo.GetType() == HT_ADAPTIVE_WARD_HISTOGRAM) {
+ CalcQuality = CalcWardQuality;
+ } else {
+ ythrow yexception() << "Attempt to parse an HT_ADAPTIVE_HISTOGRAM without default quality function";
+ }
+ }
+ Id = histo.GetId();
+ Sum = 0.0;
+ Intervals = Max(Intervals, histo.FreqSize());
+ for (size_t i = 0; i < histo.FreqSize(); ++i) {
+ double value = histo.GetPosition(i);
+ double weight = histo.GetFreq(i);
+ if (!IsValidFloat(value) || !IsValidFloat(weight)) {
+ fprintf(stderr, "FromProto in histogram id %lu: skip bad value %f weight %f\n", Id, value, weight);
+ continue;
+ }
+ Add(value, weight);
+ }
+
+ if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue())) {
+ ythrow yexception() << Sprintf("FromProto in histogram id %lu: skip bad histo with minvalue %f maxvalue %f", Id, histo.GetMinValue(), histo.GetMaxValue());
+ }
+
+ MinValue = histo.GetMinValue();
+ MaxValue = histo.GetMaxValue();
+ }
+
+ void TAdaptiveHistogram::ToProto(THistogram& histo) {
+ histo.Clear();
+ if (CalcQuality == CalcDistanceQuality) {
+ histo.SetType(HT_ADAPTIVE_DISTANCE_HISTOGRAM);
+ } else if (CalcQuality == CalcWeightQuality) {
+ histo.SetType(HT_ADAPTIVE_WEIGHT_HISTOGRAM);
+ } else if (CalcQuality == CalcWardQuality) {
+ histo.SetType(HT_ADAPTIVE_WARD_HISTOGRAM);
+ } else {
+ histo.SetType(HT_ADAPTIVE_HISTOGRAM);
+ }
+ histo.SetId(Id);
+ if (Empty()) {
+ return;
+ }
+ histo.SetMinValue(MinValue);
+ histo.SetMaxValue(MaxValue);
+ for (TPairSet::const_iterator it = Bins.begin(); it != Bins.end(); ++it) {
+ histo.AddFreq(it->second);
+ histo.AddPosition(it->first);
+ }
+ }
+
+ void TAdaptiveHistogram::SetId(ui64 id) {
+ Id = id;
+ }
+
+ ui64 TAdaptiveHistogram::GetId() {
+ return Id;
+ }
+
+ bool TAdaptiveHistogram::Empty() {
+ return Bins.size() == 0;
+ }
+
+ double TAdaptiveHistogram::GetMinValue() {
+ return MinValue;
+ }
+
+ double TAdaptiveHistogram::GetMaxValue() {
+ return MaxValue;
+ }
+
+ double TAdaptiveHistogram::GetSum() {
+ return Sum;
+ }
+
+ double TAdaptiveHistogram::GetSumInRange(double leftBound, double rightBound) {
+ if (leftBound > rightBound) {
+ return 0.0;
+ }
+ return GetSumAboveBound(leftBound) + GetSumBelowBound(rightBound) - Sum;
+ }
+
+ double TAdaptiveHistogram::GetSumAboveBound(double bound) {
+ if (Empty()) {
+ return 0.0;
+ }
+ if (bound < MinValue) {
+ return Sum;
+ }
+ if (bound > MaxValue) {
+ return 0.0;
+ }
+
+ if (!PrecomputedBins.empty()) {
+ return GetSumAboveBoundImpl(
+ bound,
+ PrecomputedBins,
+ LowerBound(PrecomputedBins.begin(), PrecomputedBins.end(), TFastBin{bound, -1.0, 0, 0}),
+ [](const auto& it) { return it->SumAbove; });
+ } else {
+ return GetSumAboveBoundImpl(
+ bound,
+ Bins,
+ Bins.lower_bound(TWeightedValue(bound, -1.0)),
+ [this](TPairSet::const_iterator rightBin) {
+ ++rightBin;
+ double sum = 0;
+ for (TPairSet::const_iterator it = rightBin; it != Bins.end(); ++it) {
+ sum += it->second;
+ }
+ return sum;
+ });
+ }
+ }
+
+ double TAdaptiveHistogram::GetSumBelowBound(double bound) {
+ if (Empty()) {
+ return 0.0;
+ }
+ if (bound < MinValue) {
+ return 0.0;
+ }
+ if (bound > MaxValue) {
+ return Sum;
+ }
+
+ if (!PrecomputedBins.empty()) {
+ return GetSumBelowBoundImpl(
+ bound,
+ PrecomputedBins,
+ LowerBound(PrecomputedBins.begin(), PrecomputedBins.end(), TFastBin{bound, -1.0, 0, 0}),
+ [](const auto& it) { return it->SumBelow; });
+ } else {
+ return GetSumBelowBoundImpl(
+ bound,
+ Bins,
+ Bins.lower_bound(TWeightedValue(bound, -1.0)),
+ [this](TPairSet::const_iterator rightBin) {
+ double sum = 0;
+ for (TPairSet::iterator it = Bins.begin(); it != rightBin; ++it) {
+ sum += it->second;
+ }
+ return sum;
+ });
+ }
+ }
+
+ double TAdaptiveHistogram::CalcUpperBound(double sum) {
+ Y_VERIFY(sum >= 0, "Sum must be >= 0");
+ if (sum == 0.0) {
+ return MinValue;
+ }
+ if (Empty()) {
+ return MaxValue;
+ }
+ TPairSet::iterator current = Bins.begin();
+ double gatheredSum = 0.0;
+ while (current != Bins.end() && gatheredSum < sum) {
+ gatheredSum += current->second;
+ ++current;
+ }
+ --current;
+ if (gatheredSum < sum) {
+ return MaxValue;
+ }
+ TWeightedValue left(MinValue, 0.0);
+ TWeightedValue right(MaxValue, 0.0);
+ if (current != Bins.begin()) {
+ TPairSet::iterator leftBin = current;
+ --leftBin;
+ left = *leftBin;
+ }
+ {
+ TPairSet::iterator rightBin = current;
+ ++rightBin;
+ if (rightBin != Bins.end()) {
+ right = *rightBin;
+ }
+ }
+ double sumToAdd = sum - (gatheredSum - current->second - left.second / 2);
+ if (sumToAdd <= ((current->second + left.second) / 2)) {
+ return left.first + 2 * sumToAdd * (current->first - left.first) / (current->second + left.second);
+ } else {
+ sumToAdd -= (current->second + left.second) / 2;
+ return current->first + 2 * sumToAdd * (right.first - current->first) / (right.second + current->second);
+ }
+ }
+
+ double TAdaptiveHistogram::CalcLowerBound(double sum) {
+ Y_VERIFY(sum >= 0, "Sum must be >= 0");
+ if (sum == 0.0) {
+ return MaxValue;
+ }
+ if (Empty()) {
+ return MinValue;
+ }
+ TPairSet::iterator current = Bins.end();
+ double gatheredSum = 0.0;
+ while (current != Bins.begin() && gatheredSum < sum) {
+ --current;
+ gatheredSum += current->second;
+ }
+ if (gatheredSum < sum) {
+ return MinValue;
+ }
+ TWeightedValue left(MinValue, 0.0);
+ TWeightedValue right(MaxValue, 0.0);
+ if (current != Bins.begin()) {
+ TPairSet::iterator leftBin = current;
+ --leftBin;
+ left = *leftBin;
+ }
+ {
+ TPairSet::iterator rightBin = current;
+ ++rightBin;
+ if (rightBin != Bins.end()) {
+ right = *rightBin;
+ }
+ }
+ double sumToAdd = sum - (gatheredSum - current->second - right.second / 2);
+ if (sumToAdd <= ((current->second + right.second) / 2)) {
+ return right.first - 2 * sumToAdd * (right.first - current->first) / (current->second + right.second);
+ } else {
+ sumToAdd -= (current->second + right.second) / 2;
+ return current->first - 2 * sumToAdd * (current->first - left.first) / (left.second + current->second);
+ }
+ }
+
+ double TAdaptiveHistogram::CalcUpperBoundSafe(double sum) {
+ if (!Empty()) {
+ sum = Max(Bins.begin()->second, sum);
+ }
+ return CalcUpperBound(sum);
+ }
+
+ double TAdaptiveHistogram::CalcLowerBoundSafe(double sum) {
+ if (!Empty()) {
+ sum = Max(Bins.rbegin()->second, sum);
+ }
+ return CalcLowerBound(sum);
+ }
+
+ void TAdaptiveHistogram::FromIHistogram(IHistogram* histo) {
+ if (!histo) {
+ ythrow yexception() << "Attempt to create TAdaptiveHistogram from a NULL pointer";
+ }
+ if (CalcQuality == CalcWardQuality) {
+ ythrow yexception() << "Not implemented";
+ } else if (CalcQuality != CalcDistanceQuality && CalcQuality != CalcWeightQuality) {
+ ythrow yexception() << "Attempt to create TAdaptiveHistogram from a pointer without default CalcQuality";
+ }
+ Id = histo->GetId();
+ if (histo->Empty()) {
+ return;
+ }
+ double sum = histo->GetSum();
+ double minValue = histo->GetMinValue();
+ double maxValue = histo->GetMaxValue();
+ if (minValue == maxValue) {
+ Add(minValue, sum);
+ return;
+ }
+ if (CalcQuality == CalcDistanceQuality) {
+ double binRange = (maxValue - minValue) / (Intervals);
+ for (size_t i = 0; i < Intervals; ++i) {
+ Add(minValue + binRange * (i + 0.5), histo->GetSumInRange(minValue + binRange * i, minValue + binRange * (i + 1)));
+ }
+ } else if (CalcQuality == CalcWeightQuality && sum != 0.0) {
+ double slab = sum / Intervals;
+ double prevBound = minValue;
+ for (size_t i = 0; i < Intervals; ++i) {
+ double bound = histo->CalcUpperBound(slab * (i + 1));
+ Add((bound + prevBound) / 2, slab);
+ prevBound = bound;
+ }
+ }
+ MinValue = minValue;
+ MaxValue = maxValue;
+ }
+
+ void TAdaptiveHistogram::Add(const TWeightedValue& weightedValue, bool initial) {
+ const double& value = weightedValue.first;
+ const double& weight = weightedValue.second;
+ if (weight <= 0.0) {
+ return; // all zero-weighted values should be skipped because they don't affect the distribution, negative weights are forbidden
+ }
+ if (initial) {
+ Sum += weight;
+ }
+ if (Bins.size() == 0) {
+ MinValue = value;
+ MaxValue = value;
+ Bins.insert(weightedValue);
+ return;
+ }
+ if (value < MinValue) {
+ MinValue = value;
+ }
+ if (value > MaxValue) {
+ MaxValue = value;
+ }
+ TPairSet::iterator rightBin = Bins.lower_bound(TWeightedValue(value, -1.0));
+ if (rightBin != Bins.end() && rightBin->first == value) {
+ TPairSet::iterator currentBin = rightBin;
+ ++rightBin;
+ TWeightedValue newBin(value, weight + currentBin->second);
+ if (rightBin != Bins.end()) {
+ Y_VERIFY(BinsByQuality.erase(CalcQuality(*currentBin, *rightBin)) == 1, "Erase failed");
+ BinsByQuality.insert(CalcQuality(newBin, *rightBin));
+ }
+ if (currentBin != Bins.begin()) {
+ TPairSet::iterator leftBin = currentBin;
+ --leftBin;
+ Y_VERIFY(BinsByQuality.erase(CalcQuality(*leftBin, *currentBin)) == 1, "Erase failed");
+ BinsByQuality.insert(CalcQuality(*leftBin, newBin));
+ }
+ Bins.erase(currentBin);
+ Bins.insert(newBin);
+ return;
+ }
+ if (rightBin == Bins.begin()) {
+ BinsByQuality.insert(CalcQuality(weightedValue, *rightBin));
+ } else {
+ TPairSet::iterator leftBin = rightBin;
+ --leftBin;
+ if (rightBin == Bins.end()) {
+ BinsByQuality.insert(CalcQuality(*leftBin, weightedValue));
+ } else {
+ Y_VERIFY(BinsByQuality.erase(CalcQuality(*leftBin, *rightBin)) == 1, "Erase failed");
+ BinsByQuality.insert(CalcQuality(*leftBin, weightedValue));
+ BinsByQuality.insert(CalcQuality(weightedValue, *rightBin));
+ }
+ }
+ Bins.insert(weightedValue);
+ if (Bins.size() > Intervals) {
+ Shrink();
+ }
+ }
+
+ void TAdaptiveHistogram::Erase(double value) {
+ TPairSet::iterator currentBin = Bins.lower_bound(TWeightedValue(value, -1.0));
+ Y_VERIFY(currentBin != Bins.end() && currentBin->first == value, "Can't find bin that should be erased");
+ TPairSet::iterator rightBin = currentBin;
+ ++rightBin;
+ if (currentBin == Bins.begin()) {
+ Y_VERIFY(rightBin != Bins.end(), "No right bin for the first bin");
+ Y_VERIFY(BinsByQuality.erase(CalcQuality(*currentBin, *rightBin)) != 0, "Erase failed");
+ } else {
+ TPairSet::iterator leftBin = currentBin;
+ --leftBin;
+ if (rightBin == Bins.end()) {
+ Y_VERIFY(BinsByQuality.erase(CalcQuality(*leftBin, *currentBin)) != 0, "Erase failed");
+ } else {
+ Y_VERIFY(BinsByQuality.erase(CalcQuality(*leftBin, *currentBin)) != 0, "Erase failed");
+ Y_VERIFY(BinsByQuality.erase(CalcQuality(*currentBin, *rightBin)) != 0, "Erase failed");
+ BinsByQuality.insert(CalcQuality(*leftBin, *rightBin));
+ }
+ }
+ Bins.erase(currentBin);
+ }
+
+ void TAdaptiveHistogram::Shrink() {
+ TPairSet::iterator worstBin = BinsByQuality.begin();
+ Y_VERIFY(worstBin != BinsByQuality.end(), "No right bin for the first bin");
+ TPairSet::iterator leftBin = Bins.lower_bound(TWeightedValue(worstBin->second, -1.0));
+ Y_VERIFY(leftBin != Bins.end() && leftBin->first == worstBin->second, "Can't find worst bin");
+ TPairSet::iterator rightBin = leftBin;
+ ++rightBin;
+ Y_VERIFY(rightBin != Bins.end(), "Can't find right bin");
+
+ TWeightedValue newBin((leftBin->first * leftBin->second + rightBin->first * rightBin->second) / (leftBin->second + rightBin->second), leftBin->second + rightBin->second);
+ if (Bins.size() > 2) {
+ Erase(leftBin->first);
+ Erase(rightBin->first);
+ } else {
+ Bins.clear();
+ BinsByQuality.clear();
+ }
+ Add(newBin, false);
+ }
+
+ void TAdaptiveHistogram::PrecomputePartialSums() {
+ PrecomputedBins.clear();
+ PrecomputedBins.reserve(Bins.size());
+ double currentSum = 0;
+ for (const auto& bin : Bins) {
+ PrecomputedBins.emplace_back(bin.first, bin.second, currentSum, Sum - currentSum - bin.second);
+ currentSum += bin.second;
+ }
+ }
+
+ template <typename TBins, typename TGetSumAbove>
+ double TAdaptiveHistogram::GetSumAboveBoundImpl(double bound, const TBins& bins, typename TBins::const_iterator rightBin, const TGetSumAbove& getSumAbove) const {
+ typename TBins::value_type left(MinValue, 0.0);
+ typename TBins::value_type right(MaxValue, 0.0);
+ if (rightBin != bins.end()) {
+ right = *rightBin;
+ }
+ if (rightBin != bins.begin()) {
+ typename TBins::const_iterator leftBin = rightBin;
+ --leftBin;
+ left = *leftBin;
+ }
+ double sum = (right.second / 2) + ((right.first == left.first) ? ((left.second + right.second) / 2) : (((left.second + right.second) / 2) * (right.first - bound) / (right.first - left.first)));
+ if (rightBin == bins.end()) {
+ return sum;
+ }
+ sum += getSumAbove(rightBin);
+ return sum;
+ }
+
+ template <typename TBins, typename TGetSumBelow>
+ double TAdaptiveHistogram::GetSumBelowBoundImpl(double bound, const TBins& bins, typename TBins::const_iterator rightBin, const TGetSumBelow& getSumBelow) const {
+ typename TBins::value_type left(MinValue, 0.0);
+ typename TBins::value_type right(MaxValue, 0.0);
+ if (rightBin != bins.end()) {
+ right = *rightBin;
+ }
+ if (rightBin != bins.begin()) {
+ typename TBins::const_iterator leftBin = rightBin;
+ --leftBin;
+ left = *leftBin;
+ }
+ double sum = (left.second / 2) + ((right.first == left.first) ? ((left.second + right.second) / 2) : (((left.second + right.second) / 2) * (bound - left.first) / (right.first - left.first)));
+ if (rightBin == bins.begin()) {
+ return sum;
+ }
+ --rightBin;
+ sum += getSumBelow(rightBin);
+ return sum;
+ }
+
+}
diff --git a/library/cpp/histogram/adaptive/adaptive_histogram.h b/library/cpp/histogram/adaptive/adaptive_histogram.h
new file mode 100644
index 0000000000..fa8f48433f
--- /dev/null
+++ b/library/cpp/histogram/adaptive/adaptive_histogram.h
@@ -0,0 +1,131 @@
+#pragma once
+
+#include "histogram.h"
+#include "common.h"
+
+#include <library/cpp/histogram/adaptive/protos/histo.pb.h>
+
+#include <util/generic/ptr.h>
+#include <util/generic/set.h>
+#include <util/generic/vector.h>
+
+namespace NKiwiAggr {
+ class TAdaptiveHistogram: private TNonCopyable, public IHistogram {
+ protected:
+ static const size_t DEFAULT_INTERVALS = 100;
+
+ private:
+ using TPairSet = TSet<TWeightedValue>;
+
+ struct TFastBin {
+ // these names are for compatibility with TWeightedValue
+ double first;
+ double second;
+ // both sums do not include current bin
+ double SumBelow;
+ double SumAbove;
+
+ TFastBin(double first_, double second_, double sumBelow = 0, double sumAbove = 0)
+ : first(first_)
+ , second(second_)
+ , SumBelow(sumBelow)
+ , SumAbove(sumAbove)
+ {
+ }
+
+ bool operator<(const TFastBin& rhs) const {
+ return first < rhs.first;
+ }
+ };
+
+ ui64 Id;
+ double MinValue;
+ double MaxValue;
+ double Sum;
+ size_t Intervals;
+ TPairSet Bins;
+ TPairSet BinsByQuality;
+ TQualityFunction CalcQuality;
+
+ TVector<TFastBin> PrecomputedBins;
+
+ public:
+ TAdaptiveHistogram(size_t intervals, ui64 id = 0, TQualityFunction qualityFunc = CalcWeightQuality);
+ TAdaptiveHistogram(const THistogram& histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0, TQualityFunction qualityFunc = nullptr);
+ TAdaptiveHistogram(IHistogram* histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0, TQualityFunction qualityFunc = CalcWeightQuality);
+
+ ~TAdaptiveHistogram() override {
+ }
+
+ TQualityFunction GetQualityFunc();
+
+ void Clear() override;
+
+ void Add(double value, double weight) override;
+ void Add(const THistoRec& histoRec) override;
+
+ void Merge(const THistogram& histo, double multiplier) final;
+ void Merge(const TVector<THistogram>& histogramsToMerge) final;
+ void Merge(TVector<IHistogramPtr> histogramsToMerge) final;
+
+ void Multiply(double factor) final;
+
+ void FromProto(const THistogram& histo) final;
+ void ToProto(THistogram& histo) final;
+
+ void SetId(ui64 id) final;
+ ui64 GetId() final;
+ bool Empty() final;
+ double GetMinValue() final;
+ double GetMaxValue() final;
+ double GetSum() final;
+ double GetSumInRange(double leftBound, double rightBound) final;
+ double GetSumAboveBound(double bound) final;
+ double GetSumBelowBound(double bound) final;
+ double CalcUpperBound(double sum) final;
+ double CalcLowerBound(double sum) final;
+ double CalcUpperBoundSafe(double sum) final;
+ double CalcLowerBoundSafe(double sum) final;
+
+ void PrecomputePartialSums() final;
+
+ private:
+ void FromIHistogram(IHistogram* histo);
+ void Add(const TWeightedValue& weightedValue, bool initial);
+ void Erase(double value);
+ void Shrink();
+
+ template <typename TBins, typename TGetSumAbove>
+ double GetSumAboveBoundImpl(double bound, const TBins& bins, typename TBins::const_iterator rightBin, const TGetSumAbove& getSumAbove) const;
+
+ template <typename TBins, typename TGetSumBelow>
+ double GetSumBelowBoundImpl(double bound, const TBins& bins, typename TBins::const_iterator rightBin, const TGetSumBelow& getSumBelow) const;
+ };
+
+ template <TQualityFunction QualityFunction>
+ class TDefinedAdaptiveHistogram: public TAdaptiveHistogram {
+ public:
+ TDefinedAdaptiveHistogram(size_t intervals, ui64 id = 0)
+ : TAdaptiveHistogram(intervals, id, QualityFunction)
+ {
+ }
+
+ TDefinedAdaptiveHistogram(const THistogram& histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0)
+ : TAdaptiveHistogram(histo, defaultIntervals, defaultId, QualityFunction)
+ {
+ }
+
+ TDefinedAdaptiveHistogram(IHistogram* histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0)
+ : TAdaptiveHistogram(histo, defaultIntervals, defaultId, QualityFunction)
+ {
+ }
+
+ ~TDefinedAdaptiveHistogram() override {
+ }
+ };
+
+ typedef TDefinedAdaptiveHistogram<CalcDistanceQuality> TAdaptiveDistanceHistogram;
+ typedef TDefinedAdaptiveHistogram<CalcWeightQuality> TAdaptiveWeightHistogram;
+ typedef TDefinedAdaptiveHistogram<CalcWardQuality> TAdaptiveWardHistogram;
+
+}
diff --git a/library/cpp/histogram/adaptive/auto_histogram.h b/library/cpp/histogram/adaptive/auto_histogram.h
new file mode 100644
index 0000000000..9fdf0b9abe
--- /dev/null
+++ b/library/cpp/histogram/adaptive/auto_histogram.h
@@ -0,0 +1,148 @@
+#pragma once
+
+#include "adaptive_histogram.h"
+#include "fixed_bin_histogram.h"
+#include "histogram.h"
+
+#include <library/cpp/histogram/adaptive/protos/histo.pb.h>
+
+#include <util/generic/ptr.h>
+#include <util/generic/yexception.h>
+
+namespace NKiwiAggr {
+ class TAutoHistogram: private TNonCopyable, public IHistogram {
+ private:
+ static const size_t DEFAULT_INTERVALS = 100;
+
+ ui64 Id;
+ size_t Intervals;
+ THolder<IHistogram> HistogramImpl;
+
+ public:
+ TAutoHistogram(size_t intervals, ui64 id = 0) {
+ Y_UNUSED(intervals);
+ Y_UNUSED(id);
+ ythrow yexception() << "Empty constructor is not defined for TAutoHistogram";
+ }
+
+ TAutoHistogram(const THistogram& histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0)
+ : Id(defaultId)
+ , Intervals(defaultIntervals)
+ {
+ FromProto(histo);
+ }
+
+ TAutoHistogram(IHistogram* histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0) {
+ Y_UNUSED(histo);
+ Y_UNUSED(defaultIntervals);
+ Y_UNUSED(defaultId);
+ ythrow yexception() << "IHistogram constructor is not defined for TAutoHistogram";
+ }
+
+ virtual ~TAutoHistogram() {
+ }
+
+ virtual void Clear() {
+ HistogramImpl->Clear();
+ }
+
+ virtual void Add(double value, double weight) {
+ HistogramImpl->Add(value, weight);
+ }
+
+ virtual void Add(const THistoRec& histoRec) {
+ HistogramImpl->Add(histoRec);
+ }
+
+ virtual void Merge(const THistogram& histo, double multiplier) {
+ HistogramImpl->Merge(histo, multiplier);
+ }
+
+ virtual void Merge(const TVector<THistogram>& histogramsToMerge) {
+ HistogramImpl->Merge(histogramsToMerge);
+ }
+
+ virtual void Merge(TVector<IHistogramPtr> histogramsToMerge) {
+ HistogramImpl->Merge(histogramsToMerge);
+ }
+
+ virtual void Multiply(double factor) {
+ HistogramImpl->Multiply(factor);
+ }
+
+ virtual void FromProto(const THistogram& histo) {
+ if (!histo.HasType() || histo.GetType() == HT_FIXED_BIN_HISTOGRAM) {
+ HistogramImpl.Reset(new TFixedBinHistogram(histo, Intervals, Id));
+ } else if (histo.GetType() == HT_ADAPTIVE_DISTANCE_HISTOGRAM) {
+ HistogramImpl.Reset(new TAdaptiveDistanceHistogram(histo, Intervals, Id));
+ } else if (histo.GetType() == HT_ADAPTIVE_WEIGHT_HISTOGRAM) {
+ HistogramImpl.Reset(new TAdaptiveWeightHistogram(histo, Intervals, Id));
+ } else if (histo.GetType() == HT_ADAPTIVE_WARD_HISTOGRAM) {
+ HistogramImpl.Reset(new TAdaptiveWardHistogram(histo, Intervals, Id));
+ } else {
+ ythrow yexception() << "Can't parse TAutoHistogram from a THistogram of type " << (ui32)histo.GetType();
+ }
+ }
+
+ virtual void ToProto(THistogram& histo) {
+ HistogramImpl->ToProto(histo);
+ }
+
+ virtual void SetId(ui64 id) {
+ HistogramImpl->SetId(id);
+ }
+
+ virtual ui64 GetId() {
+ return HistogramImpl->GetId();
+ }
+
+ virtual bool Empty() {
+ return HistogramImpl->Empty();
+ }
+
+ virtual double GetMinValue() {
+ return HistogramImpl->GetMinValue();
+ }
+
+ virtual double GetMaxValue() {
+ return HistogramImpl->GetMaxValue();
+ }
+
+ virtual double GetSum() {
+ return HistogramImpl->GetSum();
+ }
+
+ virtual double GetSumInRange(double leftBound, double rightBound) {
+ return HistogramImpl->GetSumInRange(leftBound, rightBound);
+ }
+
+ virtual double GetSumAboveBound(double bound) {
+ return HistogramImpl->GetSumAboveBound(bound);
+ }
+
+ virtual double GetSumBelowBound(double bound) {
+ return HistogramImpl->GetSumBelowBound(bound);
+ }
+
+ virtual double CalcUpperBound(double sum) {
+ return HistogramImpl->CalcUpperBound(sum);
+ }
+
+ virtual double CalcLowerBound(double sum) {
+ return HistogramImpl->CalcLowerBound(sum);
+ }
+
+ virtual double CalcUpperBoundSafe(double sum) {
+ return HistogramImpl->CalcUpperBoundSafe(sum);
+ }
+
+ virtual double CalcLowerBoundSafe(double sum) {
+ return HistogramImpl->CalcLowerBoundSafe(sum);
+ }
+
+ virtual void PrecomputePartialSums() {
+ return HistogramImpl->PrecomputePartialSums();
+ }
+ };
+
+}
diff --git a/library/cpp/histogram/adaptive/block_histogram.cpp b/library/cpp/histogram/adaptive/block_histogram.cpp
new file mode 100644
index 0000000000..6586d13ff6
--- /dev/null
+++ b/library/cpp/histogram/adaptive/block_histogram.cpp
@@ -0,0 +1,593 @@
+#include "block_histogram.h"
+
+#include <library/cpp/histogram/adaptive/protos/histo.pb.h>
+
+#include <util/generic/algorithm.h>
+#include <util/generic/yexception.h>
+#include <util/generic/intrlist.h>
+#include <util/generic/ptr.h>
+#include <util/generic/queue.h>
+#include <util/generic/ymath.h>
+#include <util/string/printf.h>
+
+namespace {
+ struct TEmpty {
+ };
+
+ class TSmartHeap {
+ private:
+ TVector<ui32> A;
+ TVector<ui32> Pos;
+ const TVector<double>& Weights;
+
+ public:
+ TSmartHeap(const TVector<double>& weights)
+ : A(weights.size())
+ , Pos(weights.size())
+ , Weights(weights)
+ {
+ for (ui32 i = 0; i < weights.size(); ++i) {
+ A[i] = i;
+ Pos[i] = i;
+ }
+ for (ui32 i = weights.size() / 2; i > 0; --i) {
+ Down(i - 1);
+ }
+ }
+
+ ui32 IdOfMin() {
+ return A[0];
+ }
+
+ void Pop() {
+ A[0] = A.back();
+ Pos[A[0]] = 0;
+ A.pop_back();
+ Down(0);
+ }
+
+ void DownElement(ui32 id) {
+ Down(Pos[id]);
+ }
+
+ private:
+ void SwapPositions(ui32 x, ui32 y) {
+ std::swap(A[x], A[y]);
+ Pos[A[x]] = x;
+ Pos[A[y]] = y;
+ }
+
+ void Down(ui32 pos) {
+ while (1) {
+ ui32 left = pos * 2 + 1;
+ ui32 right = pos * 2 + 2;
+ ui32 min = pos;
+ if (left < A.size() && Weights[A[min]] > Weights[A[left]])
+ min = left;
+ if (right < A.size() && Weights[A[min]] > Weights[A[right]])
+ min = right;
+ if (pos == min)
+ break;
+ SwapPositions(min, pos);
+ pos = min;
+ }
+ }
+ };
+
+}
+
+namespace NKiwiAggr {
+ ///////////////////
+ // TBlockHistogram
+ ///////////////////
+
+ TBlockHistogram::TBlockHistogram(EHistogramType type, TQualityFunction calcQuality,
+ size_t intervals, ui64 id, size_t shrinkSize)
+ : Type(type)
+ , CalcQuality(calcQuality)
+ , Intervals(intervals)
+ , ShrinkSize(shrinkSize)
+ , PrevSize(0)
+ , Id(id)
+ , Sum(0)
+ , MinValue(0)
+ , MaxValue(0)
+ {
+ CorrectShrinkSize();
+ }
+
+ void TBlockHistogram::Clear() {
+ PrevSize = 0;
+ Sum = 0.0;
+ MinValue = 0.0;
+ MaxValue = 0.0;
+ Bins.clear();
+ }
+
+ void TBlockHistogram::Add(const THistoRec& rec) {
+ if (!rec.HasId() || rec.GetId() == Id) {
+ Add(rec.GetValue(), rec.GetWeight());
+ }
+ }
+
+ void TBlockHistogram::Add(double value, double weight) {
+ if (!IsValidFloat(value) || !IsValidFloat(weight)) {
+ ythrow yexception() << Sprintf("Histogram id %lu: bad value %f weight %f", Id, value, weight);
+ }
+
+ if (weight <= 0.0) {
+ return; // all zero-weighted values should be skipped because they don't affect the distribution, negative weights are forbidden
+ }
+
+ if (Bins.empty()) {
+ MinValue = value;
+ MaxValue = value;
+ } else {
+ MinValue = Min(MinValue, value);
+ MaxValue = Max(MaxValue, value);
+ }
+
+ Sum += weight;
+
+ if (Bins.size() > ShrinkSize) {
+ SortAndShrink(Intervals * SHRINK_MULTIPLIER);
+ }
+
+ Bins.push_back(TWeightedValue(value, weight));
+ }
+
+ void TBlockHistogram::Merge(const THistogram& histo, double multiplier) {
+ if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue())) {
+ fprintf(stderr, "Merging in histogram id %lu: skip bad histo with minvalue %f maxvalue %f\n", Id, histo.GetMinValue(), histo.GetMaxValue());
+ return;
+ }
+ if (histo.FreqSize() == 0) {
+ return; // skip empty histos
+ }
+ if (histo.GetType() == HT_ADAPTIVE_DISTANCE_HISTOGRAM ||
+ histo.GetType() == HT_ADAPTIVE_WEIGHT_HISTOGRAM ||
+ histo.GetType() == HT_ADAPTIVE_WARD_HISTOGRAM ||
+ histo.GetType() == HT_ADAPTIVE_HISTOGRAM)
+ {
+ Y_VERIFY(histo.FreqSize() == histo.PositionSize(), "Corrupted histo");
+ for (size_t j = 0; j < histo.FreqSize(); ++j) {
+ double value = histo.GetPosition(j);
+ double weight = histo.GetFreq(j);
+ if (!IsValidFloat(value) || !IsValidFloat(weight)) {
+ fprintf(stderr, "Merging in histogram id %lu: skip bad value %f weight %f\n", Id, value, weight);
+ continue;
+ }
+ Add(value, weight * multiplier);
+ }
+
+ MinValue = Min(MinValue, histo.GetMinValue());
+ MaxValue = Max(MaxValue, histo.GetMaxValue());
+ } else if (histo.GetType() == HT_FIXED_BIN_HISTOGRAM) {
+ double pos = histo.GetMinValue() + histo.GetBinRange() / 2.0;
+ for (size_t j = 0; j < histo.FreqSize(); ++j) {
+ double weight = histo.GetFreq(j);
+ if (!IsValidFloat(pos) || !IsValidFloat(weight)) {
+ fprintf(stderr, "Merging in histogram id %lu: skip bad value %f weight %f\n", Id, pos, weight);
+ pos += histo.GetBinRange();
+ continue;
+ }
+ Add(pos, weight * multiplier);
+ pos += histo.GetBinRange();
+ }
+
+ MinValue = Min(MinValue, histo.GetMinValue());
+ MaxValue = Max(MaxValue, histo.GetMaxValue());
+ } else {
+ ythrow yexception() << "Unknown THistogram type";
+ }
+ }
+
+ void TBlockHistogram::Merge(const TVector<THistogram>& histogramsToMerge) {
+ for (size_t i = 0; i < histogramsToMerge.size(); ++i) {
+ Merge(histogramsToMerge[i], 1.0);
+ }
+ }
+
+ void TBlockHistogram::Merge(TVector<IHistogramPtr> histogramsToMerge) {
+ Y_UNUSED(histogramsToMerge);
+ ythrow yexception() << "IHistogram::Merge(TVector<IHistogramPtr>) is not defined for TBlockHistogram";
+ }
+
+ void TBlockHistogram::Multiply(double factor) {
+ if (!IsValidFloat(factor) || factor <= 0) {
+ ythrow yexception() << "Not valid factor in IHistogram::Multiply(): " << factor;
+ }
+ Sum *= factor;
+ for (TVector<TWeightedValue>::iterator it = Bins.begin(); it != Bins.end(); ++it) {
+ it->second *= factor;
+ }
+ }
+
+ void TBlockHistogram::FromProto(const THistogram& histo) {
+ Y_VERIFY(histo.HasType(), "Attempt to parse TBlockHistogram from THistogram protobuf with no Type field set");
+ ;
+ switch (histo.GetType()) { // check that histogram type is correct
+ case HT_ADAPTIVE_DISTANCE_HISTOGRAM:
+ case HT_ADAPTIVE_WEIGHT_HISTOGRAM:
+ case HT_ADAPTIVE_WARD_HISTOGRAM:
+ case HT_ADAPTIVE_HISTOGRAM:
+ break; // ok
+ default: // not ok
+ ythrow yexception() << "Attempt to parse TBlockHistogram from THistogram protobuf record of type = " << (ui32)histo.GetType();
+ }
+
+ if (histo.FreqSize() != histo.PositionSize()) {
+ ythrow yexception() << "Attempt to parse TBlockHistogram from THistogram protobuf record where FreqSize != PositionSize. FreqSize == " << (ui32)histo.FreqSize() << ", PositionSize == " << (ui32)histo.PositionSize();
+ }
+ Id = histo.GetId();
+ Sum = 0;
+ Intervals = Max(Intervals, histo.FreqSize());
+ CorrectShrinkSize();
+ Bins.resize(histo.FreqSize());
+ PrevSize = Bins.size();
+ for (size_t i = 0; i < histo.FreqSize(); ++i) {
+ double value = histo.GetPosition(i);
+ double weight = histo.GetFreq(i);
+ if (!IsValidFloat(value) || !IsValidFloat(weight)) {
+ fprintf(stderr, "FromProto in histogram id %lu: skip bad value %f weight %f\n", Id, value, weight);
+ continue;
+ }
+ Bins[i].first = value;
+ Bins[i].second = weight;
+ Sum += Bins[i].second;
+ }
+
+ if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue())) {
+ ythrow yexception() << Sprintf("FromProto in histogram id %lu: skip bad histo with minvalue %f maxvalue %f", Id, histo.GetMinValue(), histo.GetMaxValue());
+ }
+ MinValue = histo.GetMinValue();
+ MaxValue = histo.GetMaxValue();
+ }
+
+ void TBlockHistogram::ToProto(THistogram& histo) {
+ histo.Clear();
+ histo.SetType(Type);
+ histo.SetId(Id);
+ if (Empty()) {
+ return;
+ }
+
+ SortAndShrink(Intervals, true);
+ histo.SetMinValue(MinValue);
+ histo.SetMaxValue(MaxValue);
+ for (TVector<TWeightedValue>::const_iterator it = Bins.begin(); it != Bins.end(); ++it) {
+ histo.AddFreq(it->second);
+ histo.AddPosition(it->first);
+ }
+ }
+
+ void TBlockHistogram::SetId(ui64 id) {
+ Id = id;
+ }
+
+ ui64 TBlockHistogram::GetId() {
+ return Id;
+ }
+
+ bool TBlockHistogram::Empty() {
+ return Bins.empty();
+ }
+
+ double TBlockHistogram::GetMinValue() {
+ return MinValue;
+ }
+
+ double TBlockHistogram::GetMaxValue() {
+ return MaxValue;
+ }
+
+ double TBlockHistogram::GetSum() {
+ return Sum;
+ }
+
+ void TBlockHistogram::SortAndShrink(size_t intervals, bool final) {
+ Y_VERIFY(intervals > 0);
+
+ if (Bins.size() <= intervals) {
+ return;
+ }
+
+ if (Bins.size() >= Intervals * GREEDY_SHRINK_MULTIPLIER) {
+ SortBins();
+ UniquifyBins();
+ FastGreedyShrink(intervals);
+
+ if (final) {
+ SlowShrink(intervals);
+ }
+
+ } else {
+ SortBins();
+ UniquifyBins();
+ SlowShrink(intervals);
+ }
+ }
+
+ void TBlockHistogram::SortBins() {
+ Sort(Bins.begin() + PrevSize, Bins.end());
+ if (PrevSize != 0) {
+ TVector<TWeightedValue> temp(Bins.begin(), Bins.begin() + PrevSize);
+ std::merge(temp.begin(), temp.end(), Bins.begin() + PrevSize, Bins.end(), Bins.begin());
+ }
+ }
+
+ void TBlockHistogram::UniquifyBins() {
+ if (Bins.empty())
+ return;
+
+ auto it1 = Bins.begin();
+ auto it2 = Bins.begin();
+ while (++it2 != Bins.end()) {
+ if (it1->first == it2->first) {
+ it1->second += it2->second;
+ } else {
+ *(++it1) = *it2;
+ }
+ }
+
+ Bins.erase(++it1, Bins.end());
+ }
+
+ void TBlockHistogram::CorrectShrinkSize() {
+ ShrinkSize = Max(ShrinkSize, Intervals * (SHRINK_MULTIPLIER + GREEDY_SHRINK_MULTIPLIER));
+ }
+
+ void TBlockHistogram::SlowShrink(size_t intervals) {
+ {
+ size_t pos = 0;
+ for (size_t i = 1; i < Bins.size(); ++i)
+ if (Bins[i].first - Bins[pos].first < 1e-9) {
+ Bins[pos].second += Bins[i].second;
+ } else {
+ ++pos;
+ Bins[pos] = Bins[i];
+ }
+ Bins.resize(pos + 1);
+ PrevSize = pos + 1;
+ }
+
+ if (Bins.size() <= intervals) {
+ return;
+ }
+
+ typedef TIntrusiveListItem<TEmpty> TListItem;
+
+ ui32 n = Bins.size() - 1;
+ const ui32 end = (ui32)Bins.size();
+
+ TArrayHolder<TListItem> listElementsHolder(new TListItem[end + 1]);
+ TListItem* const bins = listElementsHolder.Get();
+
+ for (ui32 i = 1; i <= end; ++i) {
+ bins[i].LinkAfter(&bins[i - 1]);
+ }
+
+ TVector<double> pairWeights(n);
+
+ for (ui32 i = 0; i < n; ++i) {
+ pairWeights[i] = CalcQuality(Bins[i], Bins[i + 1]).first;
+ }
+
+ TSmartHeap heap(pairWeights);
+
+ while (n + 1 > intervals) {
+ ui32 b = heap.IdOfMin();
+ heap.Pop();
+
+ ui32 a = (ui32)(bins[b].Prev() - bins);
+ ui32 c = (ui32)(bins[b].Next() - bins);
+ ui32 d = (ui32)(bins[b].Next()->Next() - bins);
+ Y_VERIFY(Bins[c].second != -1);
+
+ double mass = Bins[b].second + Bins[c].second;
+ Bins[c].first = (Bins[b].first * Bins[b].second + Bins[c].first * Bins[c].second) / mass;
+ Bins[c].second = mass;
+
+ bins[b].Unlink();
+ Bins[b].second = -1;
+
+ if (a != end) {
+ pairWeights[a] = CalcQuality(Bins[a], Bins[c]).first;
+ heap.DownElement(a);
+ }
+
+ if (d != end && c + 1 != Bins.size()) {
+ pairWeights[c] = CalcQuality(Bins[c], Bins[d]).first;
+ heap.DownElement(c);
+ }
+
+ --n;
+ }
+
+ size_t pos = 0;
+ for (TListItem* it = bins[end].Next(); it != &bins[end]; it = it->Next()) {
+ Bins[pos++] = Bins[it - bins];
+ }
+
+ Bins.resize(pos);
+ PrevSize = pos;
+ Y_VERIFY(pos == intervals);
+ }
+
+ double TBlockHistogram::GetSumInRange(double leftBound, double rightBound) {
+ Y_UNUSED(leftBound);
+ Y_UNUSED(rightBound);
+ ythrow yexception() << "Method is not implemented for TBlockHistogram";
+ return 0;
+ }
+
+ double TBlockHistogram::GetSumAboveBound(double bound) {
+ Y_UNUSED(bound);
+ ythrow yexception() << "Method is not implemented for TBlockHistogram";
+ return 0;
+ }
+
+ double TBlockHistogram::GetSumBelowBound(double bound) {
+ Y_UNUSED(bound);
+ ythrow yexception() << "Method is not implemented for TBlockHistogram";
+ return 0;
+ }
+
+ double TBlockHistogram::CalcUpperBound(double sum) {
+ Y_UNUSED(sum);
+ ythrow yexception() << "Method is not implemented for TBlockHistogram";
+ return 0;
+ }
+
+ double TBlockHistogram::CalcLowerBound(double sum) {
+ Y_UNUSED(sum);
+ ythrow yexception() << "Method is not implemented for TBlockHistogram";
+ return 0;
+ }
+
+ double TBlockHistogram::CalcUpperBoundSafe(double sum) {
+ Y_UNUSED(sum);
+ ythrow yexception() << "Method is not implemented for TBlockHistogram";
+ return 0;
+ }
+
+ double TBlockHistogram::CalcLowerBoundSafe(double sum) {
+ Y_UNUSED(sum);
+ ythrow yexception() << "Method is not implemented for TBlockHistogram";
+ return 0;
+ }
+
+ /////////////////////////
+ // TBlockWeightHistogram
+ /////////////////////////
+
+ TBlockWeightHistogram::TBlockWeightHistogram(size_t intervals, ui64 id, size_t shrinkSize)
+ : TBlockHistogram(HT_ADAPTIVE_WEIGHT_HISTOGRAM, CalcWeightQuality, intervals, id, shrinkSize)
+ {
+ }
+
+ void TBlockWeightHistogram::FastGreedyShrink(size_t intervals) {
+ if (Bins.size() <= intervals)
+ return;
+
+ double slab = Sum / intervals;
+
+ size_t i = 0;
+ size_t pos = 0;
+ while (i < Bins.size()) {
+ double curW = Bins[i].second;
+ double curMul = Bins[i].first * Bins[i].second;
+ ++i;
+ while (i < Bins.size() && curW + Bins[i].second <= slab && pos + Bins.size() - i >= intervals) {
+ curW += Bins[i].second;
+ curMul += Bins[i].first * Bins[i].second;
+ ++i;
+ }
+ Bins[pos++] = TWeightedValue(curMul / curW, curW);
+ }
+
+ Bins.resize(pos);
+ PrevSize = pos;
+ }
+
+ ///////////////////////
+ // TBlockWardHistogram
+ ///////////////////////
+
+ TBlockWardHistogram::TBlockWardHistogram(size_t intervals, ui64 id, size_t shrinkSize)
+ : TBlockHistogram(HT_ADAPTIVE_WARD_HISTOGRAM, CalcWardQuality, intervals, id, shrinkSize)
+ {
+ }
+
+ bool TBlockWardHistogram::CalcSplitInfo(
+ const TCumulatives::const_iterator beg,
+ const TCumulatives::const_iterator end, // (!) points to the final element
+ TSplitInfo& splitInfo // out
+ ) {
+ if (end - beg < 2) {
+ return false;
+ }
+
+ TCumulatives::const_iterator mid = LowerBound(beg, end + 1, TCumulative{(beg->first + end->first) / 2, 0.});
+
+ if (mid == beg) {
+ mid++;
+ } else if (mid == end) {
+ mid--;
+ }
+
+ // derived from Ward's minimum variance criterion
+ double profit = 0.0;
+ profit += (mid->second - beg->second) * (mid->second - beg->second) / (mid->first - beg->first);
+ profit += (end->second - mid->second) * (end->second - mid->second) / (end->first - mid->first);
+ profit -= (end->second - beg->second) * (end->second - beg->second) / (end->first - beg->first);
+
+ splitInfo = {profit, beg, mid, end};
+
+ return true;
+ }
+
+ void TBlockWardHistogram::FastGreedyShrink(size_t intervals) {
+ Y_VERIFY(intervals > 0);
+
+ if (Bins.size() <= intervals) {
+ return;
+ }
+
+ // fill cumulative sums
+ // sum at index i equals to the sum of all values before i
+ // sum at index i+1 equals to the sum of all values before i with the value at i added
+ TCumulatives cumulatives;
+ cumulatives.reserve(Bins.size() + 1);
+
+ TCumulative cumulative = {0., 0.};
+ cumulatives.push_back(cumulative);
+ for (size_t i = 0; i < Bins.size(); i++) {
+ cumulative.first += Bins[i].second;
+ cumulative.second += Bins[i].second * Bins[i].first;
+ cumulatives.push_back(cumulative);
+ }
+
+ TVector<TCumulatives::const_iterator> splits;
+ splits.reserve(intervals + 1);
+ splits.push_back(cumulatives.begin());
+ splits.push_back(cumulatives.end() - 1);
+
+ TPriorityQueue<TSplitInfo> candidates;
+
+ // explicitly add first split
+ TSplitInfo newSplitInfo;
+ if (CalcSplitInfo(cumulatives.begin(), cumulatives.end() - 1, newSplitInfo)) {
+ candidates.push(newSplitInfo);
+ }
+
+ // recursively split until done
+ for (size_t split = 0; split < intervals - 1 && !candidates.empty(); split++) {
+ TSplitInfo curSplitInfo = candidates.top();
+ candidates.pop();
+
+ splits.push_back(curSplitInfo.mid);
+
+ if (CalcSplitInfo(curSplitInfo.beg, curSplitInfo.mid, newSplitInfo)) {
+ candidates.push(newSplitInfo);
+ }
+ if (CalcSplitInfo(curSplitInfo.mid, curSplitInfo.end, newSplitInfo)) {
+ candidates.push(newSplitInfo);
+ }
+ }
+
+ // calclate new bin centers and weights
+ Sort(splits.begin(), splits.end());
+
+ Bins.clear();
+ for (auto it = splits.begin(); it + 1 != splits.end(); ++it) {
+ auto splitBeg = *it;
+ auto splitEnd = *(it + 1);
+ double cnt = (splitEnd->first - splitBeg->first);
+ double mu = (splitEnd->second - splitBeg->second) / cnt;
+
+ Bins.push_back(TWeightedValue(mu, cnt));
+ }
+ }
+
+}
diff --git a/library/cpp/histogram/adaptive/block_histogram.h b/library/cpp/histogram/adaptive/block_histogram.h
new file mode 100644
index 0000000000..266bb2f2b2
--- /dev/null
+++ b/library/cpp/histogram/adaptive/block_histogram.h
@@ -0,0 +1,148 @@
+#pragma once
+
+#include "histogram.h"
+#include "common.h"
+
+#include <library/cpp/histogram/adaptive/protos/histo.pb.h>
+
+#include <util/generic/ptr.h>
+#include <util/generic/vector.h>
+#include <utility>
+
+namespace NKiwiAggr {
+ ///////////////////
+ // TBlockHistogram
+ ///////////////////
+
+ /**
+ * Contrary to adaptive histogram, block histogram doesn't rebuild bins
+ * after the addition of each point. Instead, it accumulates points and in case the amount
+ * of points overflows specified limits, it shrinks all the points at once to produce histogram
+ * Indeed, there exist two limits and two shrinkage operations:
+ * 1) FastGreedyShrink is fast but coarse. It is used to shrink from upper limit to intermediate limit
+ * (override FastGreedyShrink to set specific behaviour)
+ * 2) SlowShrink is slow, but produces finer histogram. It shrinks from the intermediate limit to the
+ * actual number of bins in a manner similar to that in adaptive histogram (set CalcQuality in constuctor)
+ * While FastGreedyShrink is used most of the time, SlowShrink is mostly used for histogram finalization
+ */
+ class TBlockHistogram: private TNonCopyable, public IHistogram {
+ protected:
+ static const size_t SHRINK_MULTIPLIER = 2;
+ static const size_t GREEDY_SHRINK_MULTIPLIER = 4;
+ static const size_t DEFAULT_INTERVALS = 100;
+ static const size_t DEFAULT_SHRINK_SIZE = DEFAULT_INTERVALS * (SHRINK_MULTIPLIER + GREEDY_SHRINK_MULTIPLIER);
+
+ const EHistogramType Type;
+ const TQualityFunction CalcQuality;
+
+ size_t Intervals;
+ size_t ShrinkSize;
+ size_t PrevSize;
+
+ ui64 Id;
+
+ double Sum;
+ double MinValue;
+ double MaxValue;
+
+ TVector<TWeightedValue> Bins;
+
+ public:
+ TBlockHistogram(EHistogramType type, TQualityFunction calcQuality,
+ size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE);
+
+ virtual ~TBlockHistogram() {
+ }
+
+ virtual void Clear();
+
+ virtual void Add(double value, double weight);
+ virtual void Add(const THistoRec& histoRec);
+
+ virtual void Merge(const THistogram& histo, double multiplier);
+ virtual void Merge(const TVector<THistogram>& histogramsToMerge);
+ virtual void Merge(TVector<IHistogramPtr> histogramsToMerge); // not implemented
+
+ virtual void Multiply(double factor);
+
+ virtual void FromProto(const THistogram& histo);
+ virtual void ToProto(THistogram& histo);
+
+ virtual void SetId(ui64 id);
+ virtual ui64 GetId();
+ virtual bool Empty();
+ virtual double GetMinValue();
+ virtual double GetMaxValue();
+ virtual double GetSum();
+
+ // methods below are not implemented
+ virtual double GetSumInRange(double leftBound, double rightBound);
+ virtual double GetSumAboveBound(double bound);
+ virtual double GetSumBelowBound(double bound);
+ virtual double CalcUpperBound(double sum);
+ virtual double CalcLowerBound(double sum);
+ virtual double CalcUpperBoundSafe(double sum);
+ virtual double CalcLowerBoundSafe(double sum);
+
+ private:
+ void SortBins();
+ void UniquifyBins();
+ void CorrectShrinkSize();
+
+ void SortAndShrink(size_t intervals, bool final = false);
+
+ void SlowShrink(size_t intervals);
+ virtual void FastGreedyShrink(size_t intervals) = 0;
+ };
+
+ /////////////////////////
+ // TBlockWeightHistogram
+ /////////////////////////
+
+ class TBlockWeightHistogram: public TBlockHistogram {
+ public:
+ TBlockWeightHistogram(size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE);
+
+ virtual ~TBlockWeightHistogram() {
+ }
+
+ private:
+ virtual void FastGreedyShrink(size_t intervals) final;
+ };
+
+ ///////////////////////
+ // TBlockWardHistogram
+ ///////////////////////
+
+ class TBlockWardHistogram: public TBlockHistogram {
+ public:
+ TBlockWardHistogram(size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE);
+
+ virtual ~TBlockWardHistogram() {
+ }
+
+ private:
+ using TCumulative = std::pair<double, double>; // cumulative sum of (weights, weighted centers)
+ using TCumulatives = TVector<TCumulative>;
+
+ struct TSplitInfo {
+ double profit;
+
+ TCumulatives::const_iterator beg;
+ TCumulatives::const_iterator mid;
+ TCumulatives::const_iterator end;
+
+ bool operator<(const TSplitInfo& other) const {
+ return profit < other.profit;
+ }
+ };
+
+ private:
+ virtual void FastGreedyShrink(size_t intervals) final;
+
+ static bool CalcSplitInfo(const TCumulatives::const_iterator beg,
+ const TCumulatives::const_iterator end,
+ TSplitInfo& splitInfo);
+ };
+
+}
diff --git a/library/cpp/histogram/adaptive/common.cpp b/library/cpp/histogram/adaptive/common.cpp
new file mode 100644
index 0000000000..afc6322fce
--- /dev/null
+++ b/library/cpp/histogram/adaptive/common.cpp
@@ -0,0 +1,19 @@
+#include "common.h"
+
+namespace NKiwiAggr {
+ TWeightedValue CalcDistanceQuality(const TWeightedValue& left, const TWeightedValue& right) {
+ return TWeightedValue(right.first - left.first, left.first);
+ }
+
+ TWeightedValue CalcWeightQuality(const TWeightedValue& left, const TWeightedValue& right) {
+ return TWeightedValue(right.second + left.second, left.first);
+ }
+
+ TWeightedValue CalcWardQuality(const TWeightedValue& left, const TWeightedValue& right) {
+ const double N1 = left.second;
+ const double N2 = right.second;
+ const double mu1 = left.first;
+ const double mu2 = right.first;
+ return TWeightedValue(N1 * N2 / (N1 + N2) * (mu1 - mu2) * (mu1 - mu2), left.first);
+ }
+}
diff --git a/library/cpp/histogram/adaptive/common.h b/library/cpp/histogram/adaptive/common.h
new file mode 100644
index 0000000000..c0f7dfb26b
--- /dev/null
+++ b/library/cpp/histogram/adaptive/common.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <utility>
+
+namespace NKiwiAggr {
+ using TWeightedValue = std::pair<double, double>; // value, weight
+ using TQualityFunction = TWeightedValue (*)(const TWeightedValue&, const TWeightedValue&);
+
+ TWeightedValue CalcDistanceQuality(const TWeightedValue& left, const TWeightedValue& right);
+ TWeightedValue CalcWeightQuality(const TWeightedValue& left, const TWeightedValue& right);
+ TWeightedValue CalcWardQuality(const TWeightedValue& left, const TWeightedValue& right);
+}
diff --git a/library/cpp/histogram/adaptive/fixed_bin_histogram.cpp b/library/cpp/histogram/adaptive/fixed_bin_histogram.cpp
new file mode 100644
index 0000000000..558aba9e2d
--- /dev/null
+++ b/library/cpp/histogram/adaptive/fixed_bin_histogram.cpp
@@ -0,0 +1,538 @@
+#include "fixed_bin_histogram.h"
+#include "auto_histogram.h"
+
+#include <library/cpp/histogram/adaptive/protos/histo.pb.h>
+
+#include <util/generic/algorithm.h>
+#include <util/generic/yexception.h>
+#include <util/generic/ymath.h>
+#include <util/string/printf.h>
+
+namespace NKiwiAggr {
+ TFixedBinHistogram::TFixedBinHistogram(size_t intervals, ui64 id, size_t trainingSetSize)
+ : TrainingSetSize(trainingSetSize)
+ , IsInitialized(false)
+ , IsEmpty(true)
+ , Id(id)
+ , Sum(0.0)
+ , Freqs(0)
+ , ReserveFreqs(0)
+ , BinRange(0.0)
+ , Intervals(intervals)
+ , BaseIndex(intervals / 2)
+ {
+ }
+
+ TFixedBinHistogram::TFixedBinHistogram(const THistogram& histo, size_t defaultIntervals, ui64 defaultId, size_t trainingSetSize)
+ : TrainingSetSize(trainingSetSize)
+ , IsInitialized(false)
+ , IsEmpty(true)
+ , Id(defaultId)
+ , Sum(0.0)
+ , Freqs(0)
+ , ReserveFreqs(0)
+ , BinRange(0.0)
+ , Intervals(defaultIntervals)
+ , BaseIndex(defaultIntervals / 2)
+ {
+ FromProto(histo);
+ }
+
+ TFixedBinHistogram::TFixedBinHistogram(IHistogram* histo, size_t defaultIntervals, ui64 defaultId, size_t trainingSetSize)
+ : TrainingSetSize(trainingSetSize)
+ , IsInitialized(false)
+ , IsEmpty(true)
+ , Id(defaultId)
+ , Sum(0.0)
+ , Freqs(0)
+ , ReserveFreqs(0)
+ , BinRange(0.0)
+ , Intervals(defaultIntervals)
+ , BaseIndex(defaultIntervals / 2)
+ {
+ TFixedBinHistogram* fixedBinHisto = dynamic_cast<TFixedBinHistogram*>(histo);
+ if (!fixedBinHisto) {
+ FromIHistogram(histo);
+ return;
+ }
+ fixedBinHisto->Initialize();
+ TrainingSetSize = fixedBinHisto->TrainingSetSize;
+ IsInitialized = fixedBinHisto->IsInitialized;
+ IsEmpty = fixedBinHisto->IsEmpty;
+ Id = fixedBinHisto->Id;
+ MinValue = fixedBinHisto->MinValue;
+ MaxValue = fixedBinHisto->MaxValue;
+ Sum = fixedBinHisto->Sum;
+ Freqs.assign(fixedBinHisto->Freqs.begin(), fixedBinHisto->Freqs.end());
+ ReserveFreqs.assign(fixedBinHisto->ReserveFreqs.begin(), fixedBinHisto->ReserveFreqs.end());
+ ReferencePoint = fixedBinHisto->ReferencePoint;
+ BinRange = fixedBinHisto->BinRange;
+ Intervals = fixedBinHisto->Intervals;
+ FirstUsedBin = fixedBinHisto->FirstUsedBin;
+ LastUsedBin = fixedBinHisto->LastUsedBin;
+ BaseIndex = fixedBinHisto->BaseIndex;
+ }
+
+ void TFixedBinHistogram::Clear() {
+ TrainingSet.Destroy();
+ IsInitialized = false;
+ IsEmpty = true;
+ Sum = 0.0;
+ Freqs.clear();
+ ReserveFreqs.clear();
+ BinRange = 0.0;
+ }
+
+ void TFixedBinHistogram::Add(const THistoRec& rec) {
+ if (!rec.HasId() || rec.GetId() == Id) {
+ Add(rec.GetValue(), rec.GetWeight());
+ }
+ }
+
+ void TFixedBinHistogram::Add(double value, double weight) {
+ if (!IsValidFloat(value) || !IsValidFloat(weight)) {
+ ythrow yexception() << Sprintf("Histogram id %lu: bad value %f weight %f", Id, value, weight);
+ }
+
+ if (weight <= 0.0) {
+ return; // all zero-weighted values should be skipped because they don't affect the distribution, negative weights are forbidden
+ }
+
+ Sum += weight;
+
+ if (!IsInitialized) {
+ if (!TrainingSet) {
+ TrainingSet.Reset(new TVector<TWeightedValue>(0));
+ }
+ TrainingSet->push_back(TWeightedValue(value, weight));
+ if (TrainingSet->size() >= TrainingSetSize) {
+ Initialize();
+ }
+ return;
+ }
+
+ i32 bin = CalcBin(value);
+ if (bin < 0 || bin >= (i32)Freqs.size() || (BinRange == 0.0 && value != ReferencePoint)) {
+ Shrink(Min(value, MinValue), Max(value, MaxValue));
+ Freqs[CalcBin(value)] += weight;
+ } else {
+ MinValue = Min(value, MinValue);
+ MaxValue = Max(value, MaxValue);
+ FirstUsedBin = Min(FirstUsedBin, bin);
+ LastUsedBin = Max(LastUsedBin, bin);
+ Freqs[bin] += weight;
+ }
+ }
+
+ void TFixedBinHistogram::Merge(const THistogram& /*histo*/, double /*multiplier*/) {
+ ythrow yexception() << "Method is not implemented for TFixedBinHistogram";
+ }
+
+ void TFixedBinHistogram::Merge(const TVector<THistogram>& histogramsToMerge) {
+ TVector<IHistogramPtr> parsedHistogramsToMerge;
+ for (size_t i = 0; i < histogramsToMerge.size(); ++i) {
+ parsedHistogramsToMerge.push_back(IHistogramPtr(new TAutoHistogram(histogramsToMerge[i], Intervals, Id)));
+ }
+ Merge(parsedHistogramsToMerge);
+ }
+
+ void TFixedBinHistogram::Merge(TVector<IHistogramPtr> histogramsToMerge) {
+ TVector<IHistogramPtr> histogramsToMergeRepacked(0);
+ TVector<TFixedBinHistogram*> histograms(0);
+
+ // put current histogram to the vector of histograms to merge and clear self
+ if (!Empty()) {
+ histogramsToMergeRepacked.push_back(IHistogramPtr(new TFixedBinHistogram(this, Intervals, Id, TrainingSetSize)));
+ histograms.push_back(dynamic_cast<TFixedBinHistogram*>(histogramsToMergeRepacked.back().Get()));
+ }
+ Clear();
+
+ for (size_t i = 0; i < histogramsToMerge.size(); ++i) {
+ if (!histogramsToMerge[i] || histogramsToMerge[i]->Empty()) {
+ continue;
+ }
+ TFixedBinHistogram* fixedBinHisto = dynamic_cast<TFixedBinHistogram*>(histogramsToMerge[i].Get());
+ if (fixedBinHisto) {
+ fixedBinHisto->Initialize();
+ histogramsToMergeRepacked.push_back(histogramsToMerge[i]);
+ } else {
+ histogramsToMergeRepacked.push_back(IHistogramPtr(new TFixedBinHistogram(histogramsToMerge[i].Get(), Intervals, Id, TrainingSetSize))); // Convert histograms that are not of TFixedBinHistogram type
+ }
+ histograms.push_back(dynamic_cast<TFixedBinHistogram*>(histogramsToMergeRepacked.back().Get()));
+ }
+
+ if (histograms.size() == 0) {
+ return;
+ }
+
+ double minValue = histograms[0]->MinValue;
+ double maxValue = histograms[0]->MaxValue;
+ Sum = histograms[0]->Sum;
+ for (size_t i = 1; i < histograms.size(); ++i) {
+ minValue = Min(minValue, histograms[i]->MinValue);
+ maxValue = Max(maxValue, histograms[i]->MaxValue);
+ Sum += histograms[i]->Sum;
+ }
+ SetFrame(minValue, maxValue, true);
+
+ if (BinRange == 0.0) {
+ Freqs[BaseIndex] = Sum;
+ return;
+ }
+
+ for (size_t histoIndex = 0; histoIndex < histograms.size(); ++histoIndex) {
+ TFixedBinHistogram* histo = histograms[histoIndex];
+ for (i32 bin = histo->FirstUsedBin; bin <= histo->LastUsedBin; ++bin) {
+ double binStart = histo->BinStart(bin);
+ double binEnd = histo->BinEnd(bin);
+ double freq = histo->Freqs[bin];
+ if (binStart == binEnd) {
+ Freqs[CalcBin(binStart)] += freq;
+ continue;
+ }
+ size_t firstCross = CalcBin(binStart);
+ size_t lastCross = CalcBin(binEnd);
+ for (size_t i = firstCross; i <= lastCross; ++i) {
+ double mergedBinStart = BinStart(i);
+ double mergedBinEnd = BinEnd(i);
+ double crossStart = Max(mergedBinStart, binStart);
+ double crossEnd = Min(mergedBinEnd, binEnd);
+ if (binStart == binEnd) {
+ }
+ Freqs[i] += freq * (crossEnd - crossStart) / (binEnd - binStart);
+ }
+ }
+ }
+ }
+
+ void TFixedBinHistogram::Multiply(double factor) {
+ if (!IsValidFloat(factor) || factor <= 0) {
+ ythrow yexception() << "Not valid factor in IHistogram::Multiply(): " << factor;
+ }
+ if (!IsInitialized) {
+ Initialize();
+ }
+ Sum *= factor;
+ for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) {
+ Freqs[i] *= factor;
+ }
+ }
+
+ void TFixedBinHistogram::FromProto(const THistogram& histo) {
+ if (histo.HasType() && histo.GetType() != HT_FIXED_BIN_HISTOGRAM) {
+ ythrow yexception() << "Attempt to parse TFixedBinHistogram from THistogram protobuf record of wrong type = " << (ui32)histo.GetType();
+ }
+ TrainingSet.Destroy();
+ IsInitialized = false;
+ Sum = 0.0;
+
+ Id = histo.GetId();
+ size_t intervals = histo.FreqSize();
+ if (intervals == 0) {
+ IsEmpty = true;
+ return;
+ }
+ Intervals = intervals;
+ TrainingSetSize = Intervals;
+ BaseIndex = Intervals / 2;
+
+ if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue()) || !IsValidFloat(histo.GetBinRange())) {
+ ythrow yexception() << Sprintf("FromProto in histogram id %lu: skip bad histo with minvalue %f maxvalue %f binrange %f", Id, histo.GetMinValue(), histo.GetMaxValue(), histo.GetBinRange());
+ }
+
+ double minValue = histo.GetMinValue();
+ double binRange = histo.GetBinRange();
+ double maxValue = histo.HasMaxValue() ? histo.GetMaxValue() : minValue + binRange * Intervals;
+ SetFrame(minValue, maxValue, true);
+ BinRange = binRange;
+ for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) {
+ Freqs[i] = histo.GetFreq(i - BaseIndex);
+ if (!IsValidFloat(Freqs[i])) {
+ ythrow yexception() << Sprintf("FromProto in histogram id %lu: bad value %f", Id, Freqs[i]);
+ }
+ Sum += Freqs[i];
+ }
+ }
+
+ void TFixedBinHistogram::ToProto(THistogram& histo) {
+ histo.Clear();
+ if (!IsInitialized) {
+ Initialize();
+ }
+ histo.SetType(HT_FIXED_BIN_HISTOGRAM);
+ histo.SetId(Id);
+ if (IsEmpty) {
+ return;
+ }
+ if (FirstUsedBin < (i32)BaseIndex || (LastUsedBin - FirstUsedBin + 1) > (i32)Intervals) {
+ Shrink(MinValue, MaxValue);
+ }
+ histo.SetMinValue(MinValue);
+ histo.SetMaxValue(MaxValue);
+ histo.SetBinRange(BinRange);
+ for (ui32 i = BaseIndex; i < BaseIndex + Intervals; ++i) {
+ histo.AddFreq(Freqs[i]);
+ }
+ }
+
+ void TFixedBinHistogram::SetId(ui64 id) {
+ Id = id;
+ }
+
+ ui64 TFixedBinHistogram::GetId() {
+ return Id;
+ }
+
+ bool TFixedBinHistogram::Empty() {
+ if (!IsInitialized) {
+ Initialize();
+ }
+ return IsEmpty;
+ }
+
+ double TFixedBinHistogram::GetMinValue() {
+ if (!IsInitialized) {
+ Initialize();
+ }
+ return MinValue;
+ }
+
+ double TFixedBinHistogram::GetMaxValue() {
+ if (!IsInitialized) {
+ Initialize();
+ }
+ return MaxValue;
+ }
+
+ double TFixedBinHistogram::GetSum() {
+ return Sum;
+ }
+
+ double TFixedBinHistogram::GetSumInRange(double leftBound, double rightBound) {
+ if (!IsInitialized) {
+ Initialize();
+ }
+ if (leftBound > rightBound) {
+ return 0.0;
+ }
+ return GetSumAboveBound(leftBound) + GetSumBelowBound(rightBound) - Sum;
+ }
+
+ double TFixedBinHistogram::GetSumAboveBound(double bound) {
+ if (!IsInitialized) {
+ Initialize();
+ }
+ if (IsEmpty) {
+ return 0.0;
+ }
+ if (BinRange == 0.0) { // special case - all values added to histogram are the same
+ return (bound <= ReferencePoint) ? Sum : 0.0;
+ }
+ i32 bin = CalcBin(bound);
+ if (bin < FirstUsedBin) {
+ return Sum;
+ }
+ if (bin > LastUsedBin) {
+ return 0.0;
+ }
+ double binStart = BinStart(bin);
+ double binEnd = BinEnd(bin);
+ double result = (bound < binStart) ? Freqs[bin] : Freqs[bin] * (binEnd - bound) / (binEnd - binStart);
+ for (i32 i = bin + 1; i <= LastUsedBin; ++i) {
+ result += Freqs[i];
+ }
+ return result;
+ }
+
+ double TFixedBinHistogram::GetSumBelowBound(double bound) {
+ if (!IsInitialized) {
+ Initialize();
+ }
+ if (IsEmpty) {
+ return 0.0;
+ }
+ if (BinRange == 0.0) { // special case - all values added to histogram are the same
+ return (bound > ReferencePoint) ? Sum : 0.0;
+ }
+ i32 bin = CalcBin(bound);
+ if (bin < FirstUsedBin) {
+ return 0.0;
+ }
+ if (bin > LastUsedBin) {
+ return Sum;
+ }
+ double binStart = BinStart(bin);
+ double binEnd = BinEnd(bin);
+ double result = (bound > binEnd) ? Freqs[bin] : Freqs[bin] * (bound - binStart) / (binEnd - binStart);
+ for (i32 i = bin - 1; i >= FirstUsedBin; --i) {
+ result += Freqs[i];
+ }
+ return result;
+ }
+
+ double TFixedBinHistogram::CalcUpperBound(double sum) {
+ if (!IsInitialized) {
+ Initialize();
+ }
+ if (sum == 0.0) {
+ return MinValue;
+ }
+ if (IsEmpty) {
+ return MaxValue;
+ }
+ i32 currentBin = FirstUsedBin;
+ double gatheredSum = 0.0;
+ while (gatheredSum < sum && currentBin <= LastUsedBin) {
+ gatheredSum += Freqs[currentBin];
+ ++currentBin;
+ }
+ --currentBin;
+ if ((gatheredSum <= sum && currentBin == LastUsedBin) || (Freqs[currentBin] == 0)) {
+ return MaxValue;
+ }
+ double binStart = BinStart(currentBin);
+ double binEnd = BinEnd(currentBin);
+ return binEnd - (binEnd - binStart) * (gatheredSum - sum) / Freqs[currentBin];
+ }
+
+ double TFixedBinHistogram::CalcLowerBound(double sum) {
+ if (!IsInitialized) {
+ Initialize();
+ }
+ if (sum == 0.0) {
+ return MaxValue;
+ }
+ if (IsEmpty) {
+ return MinValue;
+ }
+ i32 currentBin = LastUsedBin;
+ double gatheredSum = 0.0;
+ while (gatheredSum < sum && currentBin >= FirstUsedBin) {
+ gatheredSum += Freqs[currentBin];
+ --currentBin;
+ }
+ ++currentBin;
+ if ((gatheredSum <= sum && currentBin == FirstUsedBin) || (Freqs[currentBin] == 0)) {
+ return MinValue;
+ }
+ double binStart = BinStart(currentBin);
+ double binEnd = BinEnd(currentBin);
+ return binStart + (binEnd - binStart) * (gatheredSum - sum) / Freqs[currentBin];
+ }
+
+ double TFixedBinHistogram::CalcUpperBoundSafe(double sum) {
+ if (!Empty()) {
+ sum = Max(Freqs[FirstUsedBin], sum);
+ }
+ return CalcUpperBound(sum);
+ }
+
+ double TFixedBinHistogram::CalcLowerBoundSafe(double sum) {
+ if (!Empty()) {
+ sum = Max(Freqs[LastUsedBin], sum);
+ }
+ return CalcLowerBound(sum);
+ }
+
+ double TFixedBinHistogram::CalcBinRange(double referencePoint, double maxValue) {
+ return (maxValue - referencePoint) / ((double)Intervals - 0.02);
+ }
+
+ void TFixedBinHistogram::SetFrame(double minValue, double maxValue, bool clear) {
+ MinValue = minValue;
+ MaxValue = maxValue;
+ ReferencePoint = MinValue;
+ BinRange = CalcBinRange(ReferencePoint, MaxValue);
+ FirstUsedBin = BaseIndex;
+ LastUsedBin = (BinRange == 0.0) ? BaseIndex : BaseIndex + Intervals - 1;
+ if (clear) {
+ Freqs.assign(2 * Intervals, 0.0);
+ ReserveFreqs.assign(2 * Intervals, 0.0);
+ IsEmpty = false;
+ IsInitialized = true;
+ }
+ }
+
+ void TFixedBinHistogram::FromIHistogram(IHistogram* histo) {
+ if (!histo) {
+ ythrow yexception() << "Attempt to create TFixedBinFistogram from a NULL pointer";
+ }
+ Id = histo->GetId();
+ if (histo->Empty()) {
+ IsInitialized = false;
+ IsEmpty = true;
+ return;
+ }
+ SetFrame(histo->GetMinValue(), histo->GetMaxValue(), true);
+ Sum = histo->GetSum();
+ if (BinRange == 0.0) {
+ Freqs[BaseIndex] = Sum;
+ return;
+ }
+ for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) {
+ Freqs[i] = histo->GetSumInRange(BinStart(i), BinEnd(i));
+ }
+ return;
+ }
+
+ void TFixedBinHistogram::Initialize() {
+ if (IsInitialized) {
+ return;
+ }
+ if (!TrainingSet || TrainingSet->size() == 0) {
+ IsEmpty = true;
+ return;
+ }
+ SetFrame(MinElement(TrainingSet->begin(), TrainingSet->end(), CompareWeightedValue)->first,
+ MaxElement(TrainingSet->begin(), TrainingSet->end(), CompareWeightedValue)->first, true);
+ for (TVector<TWeightedValue>::const_iterator it = TrainingSet->begin(); it != TrainingSet->end(); ++it) {
+ Freqs[CalcBin(it->first)] += it->second;
+ }
+ TrainingSet.Destroy();
+ }
+
+ i32 TFixedBinHistogram::CalcBin(double value) {
+ return (BinRange == 0.0) ? BaseIndex : static_cast<i32>(BaseIndex + (value - ReferencePoint) / BinRange);
+ }
+
+ double TFixedBinHistogram::CalcDensity(double value) {
+ i32 bin = CalcBin(value);
+ if (bin < 0 || bin >= (i32)Freqs.size() || BinRange == 0.0 || GetSum() == 0) {
+ return 0.0;
+ }
+ return Freqs[bin] / GetSum() / BinRange;
+ }
+
+ double TFixedBinHistogram::BinStart(i32 i) {
+ return Max(ReferencePoint + (i - BaseIndex) * BinRange, MinValue);
+ }
+
+ double TFixedBinHistogram::BinEnd(i32 i) {
+ return Min(ReferencePoint + (i + 1 - BaseIndex) * BinRange, MaxValue);
+ }
+
+ void TFixedBinHistogram::Shrink(double newReferencePoint, double newMaxValue) {
+ Y_VERIFY(newReferencePoint < newMaxValue, "Invalid Shrink()");
+ memset(&(ReserveFreqs[0]), 0, ReserveFreqs.size() * sizeof(double));
+
+ double newBinRange = CalcBinRange(newReferencePoint, newMaxValue);
+ for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) {
+ double binStart = BinStart(i);
+ double binEnd = BinEnd(i);
+ double freq = Freqs[i];
+ i32 firstCross = static_cast<i32>(BaseIndex + (binStart - newReferencePoint) / newBinRange);
+ i32 lastCross = static_cast<i32>(BaseIndex + (binEnd - newReferencePoint) / newBinRange);
+ for (i32 j = firstCross; j <= lastCross; ++j) {
+ double newBinStart = newReferencePoint + (j - BaseIndex) * newBinRange;
+ double newBinEnd = newReferencePoint + (j + 1 - BaseIndex) * newBinRange;
+ double crossStart = Max(newBinStart, binStart);
+ double crossEnd = Min(newBinEnd, binEnd);
+ ReserveFreqs[j] += (binStart == binEnd) ? freq : freq * (crossEnd - crossStart) / (binEnd - binStart);
+ }
+ }
+
+ Freqs.swap(ReserveFreqs);
+ SetFrame(newReferencePoint, newMaxValue, false);
+ }
+
+}
diff --git a/library/cpp/histogram/adaptive/fixed_bin_histogram.h b/library/cpp/histogram/adaptive/fixed_bin_histogram.h
new file mode 100644
index 0000000000..bd380bd94a
--- /dev/null
+++ b/library/cpp/histogram/adaptive/fixed_bin_histogram.h
@@ -0,0 +1,91 @@
+#pragma once
+
+#include "histogram.h"
+
+#include <library/cpp/histogram/adaptive/protos/histo.pb.h>
+
+#include <util/generic/ptr.h>
+#include <util/generic/vector.h>
+#include <utility>
+
+namespace NKiwiAggr {
+ class TFixedBinHistogram: private TNonCopyable, public IHistogram {
+ private:
+ static const size_t DEFAULT_TRAINING_SET_SIZE = 10000;
+ static const size_t DEFAULT_INTERVALS = 100;
+
+ typedef std::pair<double, double> TWeightedValue; // value, weight
+ THolder<TVector<TWeightedValue>> TrainingSet;
+ size_t TrainingSetSize;
+
+ bool IsInitialized;
+ bool IsEmpty;
+
+ ui64 Id;
+ double MinValue;
+ double MaxValue;
+ double Sum;
+
+ TVector<double> Freqs;
+ TVector<double> ReserveFreqs;
+ double ReferencePoint;
+ double BinRange;
+ size_t Intervals;
+ i32 FirstUsedBin;
+ i32 LastUsedBin;
+ i32 BaseIndex;
+
+ public:
+ TFixedBinHistogram(size_t intervals, ui64 id = 0, size_t trainingSetSize = DEFAULT_TRAINING_SET_SIZE);
+ TFixedBinHistogram(const THistogram& histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0, size_t trainingSetSize = DEFAULT_TRAINING_SET_SIZE);
+ TFixedBinHistogram(IHistogram* histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0, size_t trainingSetSize = DEFAULT_TRAINING_SET_SIZE);
+
+ virtual ~TFixedBinHistogram() {
+ }
+
+ virtual void Clear();
+
+ virtual void Add(double value, double weight);
+ virtual void Add(const THistoRec& histoRec);
+
+ virtual void Merge(const THistogram& histo, double multiplier);
+ virtual void Merge(const TVector<THistogram>& histogramsToMerge);
+ virtual void Merge(TVector<IHistogramPtr> histogramsToMerge);
+
+ virtual void Multiply(double factor);
+
+ virtual void FromProto(const THistogram& histo);
+ virtual void ToProto(THistogram& histo);
+
+ virtual void SetId(ui64 id);
+ virtual ui64 GetId();
+ virtual bool Empty();
+ virtual double GetMinValue();
+ virtual double GetMaxValue();
+ virtual double GetSum();
+ virtual double GetSumInRange(double leftBound, double rightBound);
+ virtual double GetSumAboveBound(double bound);
+ virtual double GetSumBelowBound(double bound);
+ virtual double CalcUpperBound(double sum);
+ virtual double CalcLowerBound(double sum);
+ virtual double CalcUpperBoundSafe(double sum);
+ virtual double CalcLowerBoundSafe(double sum);
+
+ double CalcDensity(double value);
+
+ private:
+ double CalcBinRange(double referencePoint, double maxValue);
+ void SetFrame(double minValue, double maxValue, bool clear);
+ void FromIHistogram(IHistogram* histo);
+ void Initialize();
+ i32 CalcBin(double value);
+ double BinStart(i32 i);
+ double BinEnd(i32 i);
+ void Shrink(double newMinValue, double newMaxValue);
+
+ static bool CompareWeightedValue(const TWeightedValue& left, const TWeightedValue& right) {
+ return left.first < right.first;
+ }
+ };
+
+}
diff --git a/library/cpp/histogram/adaptive/histogram.h b/library/cpp/histogram/adaptive/histogram.h
new file mode 100644
index 0000000000..360fd9a693
--- /dev/null
+++ b/library/cpp/histogram/adaptive/histogram.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include <util/generic/ptr.h>
+#include <util/generic/vector.h>
+
+namespace NKiwiAggr {
+ class THistogram;
+ class THistoRec;
+
+ class IHistogram;
+ typedef TAtomicSharedPtr<IHistogram> IHistogramPtr;
+
+ class IHistogram {
+ public:
+ // Supposed constructors:
+ //
+ // TSomeHistogram(size_t intervals, ui64 id = 0); // where intervals is some constant that defines histogram accuracy
+ // TSomeHistogram(const THistogram& histo); // histo must be acceptable for TSomeHistogram, for example, only with HT_FIXED_BIN_HISTOGRAM for TFixedBinHistogram
+ // TSomeHistogram(IHistogram* histo); // any kind of IHistogram
+
+ virtual ~IHistogram() {
+ }
+
+ virtual void Clear() = 0;
+
+ // zero- or negative-weighted values are skipped
+ virtual void Add(double value, double weight) = 0;
+ virtual void Add(const THistoRec& histoRec) = 0;
+
+ // Merge some other histos into current
+ virtual void Merge(const THistogram& histo, double multiplier) = 0;
+ virtual void Merge(const TVector<THistogram>& histogramsToMerge) = 0;
+ virtual void Merge(TVector<IHistogramPtr> histogramsToMerge) = 0;
+
+ // factor should be greater then zero
+ virtual void Multiply(double factor) = 0;
+
+ virtual void FromProto(const THistogram& histo) = 0; // throws exception in case of wrong histogram type of histo
+ virtual void ToProto(THistogram& histo) = 0;
+
+ virtual void SetId(ui64 id) = 0;
+ virtual ui64 GetId() = 0;
+ virtual bool Empty() = 0;
+ virtual double GetMinValue() = 0;
+ virtual double GetMaxValue() = 0;
+ virtual double GetSum() = 0;
+ virtual double GetSumInRange(double leftBound, double rightBound) = 0;
+ virtual double GetSumAboveBound(double bound) = 0;
+ virtual double GetSumBelowBound(double bound) = 0;
+ virtual double CalcUpperBound(double sum) = 0;
+ virtual double CalcLowerBound(double sum) = 0;
+ virtual double CalcUpperBoundSafe(double sum) = 0;
+ virtual double CalcLowerBoundSafe(double sum) = 0;
+ double GetValueAtPercentile(double percentile) {
+ return CalcUpperBound(percentile * GetSum());
+ }
+ double GetValueAtPercentileSafe(double percentile) {
+ return CalcUpperBoundSafe(percentile * GetSum());
+ }
+
+ // Histogram implementation is supposed to clear all precomputed values() if Add() is called after PrecomputePartialSums()
+ virtual void PrecomputePartialSums() {
+ }
+ };
+
+}
diff --git a/library/cpp/histogram/adaptive/merger.h b/library/cpp/histogram/adaptive/merger.h
new file mode 100644
index 0000000000..fc9a6b6a4f
--- /dev/null
+++ b/library/cpp/histogram/adaptive/merger.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <util/generic/buffer.h>
+
+namespace NKiwiAggr {
+ class IMerger {
+ private:
+ bool IsMerged;
+ ui32 AutoMergeInterval; // Call Merge() after each AutoMergeInterval calls of Add(); zero means no autoMerge
+ ui32 NotMergedCount;
+
+ public:
+ IMerger(ui32 autoMergeInterval = 0)
+ : IsMerged(true)
+ , AutoMergeInterval(autoMergeInterval)
+ , NotMergedCount(0)
+ {
+ }
+
+ virtual ~IMerger() {
+ }
+
+ // returns true if something is added
+ virtual bool Add(const void* data, size_t size) {
+ if (AddImpl(data, size)) {
+ AutoMerge();
+ return true;
+ }
+ return false;
+ }
+
+ virtual void Merge() {
+ if (!IsMerged) {
+ MergeImpl();
+ IsMerged = true;
+ }
+ }
+
+ virtual void Reset() {
+ ResetImpl();
+ IsMerged = true;
+ }
+
+ // You can add some more result-getters if you want.
+ // Do not forget to call Merge() in the beginning of each merger.
+ virtual void GetResult(TBuffer& buffer) = 0;
+
+ protected:
+ // AutoMerge() is called in Add() after each AddImpl()
+ void AutoMerge() {
+ IsMerged = false;
+ if (AutoMergeInterval) {
+ ++NotMergedCount;
+ if (NotMergedCount >= AutoMergeInterval) {
+ MergeImpl();
+ IsMerged = true;
+ NotMergedCount = 0;
+ }
+ }
+ }
+
+ // Implementation of merger: define it in derivatives
+ virtual bool AddImpl(const void* data, size_t size) = 0; // returns true if something is added
+ virtual void MergeImpl() = 0;
+ virtual void ResetImpl() = 0;
+ };
+
+}
diff --git a/library/cpp/histogram/adaptive/multi_histogram.h b/library/cpp/histogram/adaptive/multi_histogram.h
new file mode 100644
index 0000000000..41caac5ba6
--- /dev/null
+++ b/library/cpp/histogram/adaptive/multi_histogram.h
@@ -0,0 +1,143 @@
+#pragma once
+
+#include "histogram.h"
+#include "auto_histogram.h"
+
+#include <library/cpp/histogram/adaptive/protos/histo.pb.h>
+
+#include <util/generic/hash.h>
+#include <util/generic/ptr.h>
+#include <utility>
+
+namespace NKiwiAggr {
+ template <class TMyHistogram>
+ class TMultiHistogram {
+ private:
+ static const size_t DEFAULT_INTERVALS = 100;
+
+ typedef THashMap<ui64, IHistogramPtr> THistogramsMap;
+ THistogramsMap Histograms;
+ size_t Intervals;
+
+ public:
+ TMultiHistogram(size_t intervals = DEFAULT_INTERVALS)
+ : Intervals(intervals)
+ {
+ }
+
+ TMultiHistogram(const THistograms& histograms, size_t defaultIntervals = DEFAULT_INTERVALS)
+ : Intervals(defaultIntervals)
+ {
+ FromProto(histograms);
+ }
+
+ virtual ~TMultiHistogram() {
+ }
+
+ void Clear() {
+ Histograms.clear();
+ }
+
+ void Add(const THistoRecs& histoRecs) {
+ for (size_t i = 0; i < histoRecs.HistoRecsSize(); ++i) {
+ Add(histoRecs.GetHistoRecs(i).GetId(), histoRecs.GetHistoRecs(i).GetValue(), histoRecs.GetHistoRecs(i).GetWeight());
+ }
+ }
+
+ void Add(const THistoRec& histoRec) {
+ Add(histoRec.GetId(), histoRec.GetValue(), histoRec.GetWeight());
+ }
+
+ void Add(ui64 id, double value, double weight) {
+ THistogramsMap::const_iterator it = Histograms.find(id);
+ if (it == Histograms.end()) {
+ it = Histograms.insert(std::make_pair(id, IHistogramPtr(new TMyHistogram(Intervals, id)))).first;
+ }
+ it->second->Add(value, weight);
+ }
+
+ void Multiply(double factor) {
+ for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) {
+ it->second->Multiply(factor);
+ }
+ }
+
+ TVector<ui64> GetIds() const {
+ TVector<ui64> result(0);
+ for (THistogramsMap::const_iterator it = Histograms.begin(); it != Histograms.end(); ++it) {
+ result.push_back(it->first);
+ }
+ return result;
+ }
+
+ IHistogramPtr GetHistogram(ui64 id) const {
+ THistogramsMap::const_iterator it = Histograms.find(id);
+ if (it != Histograms.end()) {
+ return it->second;
+ }
+ return IHistogramPtr();
+ }
+
+ double GetMaxHistoSum() const {
+ double sum = 0.0;
+ for (THistogramsMap::const_iterator it = Histograms.begin(); it != Histograms.end(); ++it) {
+ sum = std::max(sum, it->second->GetSum());
+ }
+ return sum;
+ }
+
+ bool Empty() {
+ for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) {
+ if (!it->second->Empty()) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ virtual double OverallSum() {
+ double sum = 0.0;
+ for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) {
+ sum += it->second->GetSum();
+ }
+ return sum;
+ }
+
+ void FromProto(const THistograms& histograms) {
+ for (size_t i = 0; i < histograms.HistoRecsSize(); ++i) {
+ IHistogramPtr newHisto(new TMyHistogram(histograms.GetHistoRecs(i), Intervals));
+ if (!newHisto->Empty()) {
+ Histograms[newHisto->GetId()] = newHisto;
+ }
+ }
+ }
+
+ void ToProto(THistograms& histograms) {
+ histograms.Clear();
+ for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) {
+ THistogram* histo = histograms.AddHistoRecs();
+ it->second->ToProto(*histo);
+ }
+ }
+
+ void PrecomputePartialSums() {
+ for (auto& it : Histograms) {
+ it.second->PrecomputePartialSums();
+ }
+ }
+ };
+
+ template <class TMerger, class TSomeMultiHistogram>
+ static void MergeToMultiHistogram(const void* data, size_t size, TSomeMultiHistogram& multiHistogram, ui32 intervals = 300) {
+ TMerger merger(intervals);
+ merger.Add(data, size);
+ THistograms histograms;
+ merger.GetResult(histograms);
+ multiHistogram.FromProto(histograms);
+ }
+
+ // Good for parsing from THistograms protobuf
+ typedef TMultiHistogram<TAutoHistogram> TAutoMultiHistogram;
+ typedef TAtomicSharedPtr<TAutoMultiHistogram> TAutoMultiHistogramPtr;
+
+}
diff --git a/library/cpp/histogram/adaptive/protos/histo.proto b/library/cpp/histogram/adaptive/protos/histo.proto
new file mode 100644
index 0000000000..8961fef022
--- /dev/null
+++ b/library/cpp/histogram/adaptive/protos/histo.proto
@@ -0,0 +1,36 @@
+package NKiwiAggr;
+
+
+// THistoRec represents a value record added to a multihistogram
+message THistoRec {
+ optional uint64 Id = 1; // Current histogram identifier
+ optional double Value = 2;
+ optional double Weight = 3 [default = 1.0]; // You can set a certain weight to each record or just skip records using Weight=0
+}
+
+message THistoRecs {
+ repeated THistoRec HistoRecs = 1;
+}
+
+enum EHistogramType {
+ HT_FIXED_BIN_HISTOGRAM = 1;
+ HT_ADAPTIVE_DISTANCE_HISTOGRAM = 2;
+ HT_ADAPTIVE_WEIGHT_HISTOGRAM = 3;
+ HT_ADAPTIVE_HISTOGRAM = 4; // if the quality function is unknown
+ HT_ADAPTIVE_WARD_HISTOGRAM = 5;
+}
+
+message THistogram {
+ optional uint64 Id = 1;
+ optional double MinValue = 2;
+ optional double BinRange = 4; // for FIXED_BIN_HISTOGRAM only. And it's OK that it is 4 after 2
+ repeated float Freq = 5;
+ repeated float Position = 6; // for ADAPTIVE histograms only
+ optional double MaxValue = 7;
+ optional EHistogramType Type = 8; // Empty field means FIXED_BIN_HISTOGRAM
+}
+
+// Multihistogam
+message THistograms {
+ repeated THistogram HistoRecs = 1;
+}
diff --git a/library/cpp/histogram/adaptive/protos/python/ya.make b/library/cpp/histogram/adaptive/protos/python/ya.make
new file mode 100644
index 0000000000..3328c27965
--- /dev/null
+++ b/library/cpp/histogram/adaptive/protos/python/ya.make
@@ -0,0 +1,3 @@
+OWNER(abogutskiy)
+
+PY_PROTOS_FOR(library/cpp/histogram/adaptive/protos)
diff --git a/library/cpp/histogram/adaptive/protos/ya.make b/library/cpp/histogram/adaptive/protos/ya.make
new file mode 100644
index 0000000000..7635cfcb8c
--- /dev/null
+++ b/library/cpp/histogram/adaptive/protos/ya.make
@@ -0,0 +1,13 @@
+PROTO_LIBRARY()
+
+OWNER(g:crawl)
+
+SRCS(
+ histo.proto
+)
+
+IF (NOT PY_PROTOS_FOR)
+ EXCLUDE_TAGS(GO_PROTO)
+ENDIF()
+
+END()
diff --git a/library/cpp/histogram/adaptive/ya.make b/library/cpp/histogram/adaptive/ya.make
new file mode 100644
index 0000000000..b589801b27
--- /dev/null
+++ b/library/cpp/histogram/adaptive/ya.make
@@ -0,0 +1,20 @@
+LIBRARY()
+
+OWNER(
+ zosimov
+ svirg
+)
+
+SRCS(
+ common.cpp
+ adaptive_histogram.cpp
+ block_histogram.cpp
+ fixed_bin_histogram.cpp
+)
+
+PEERDIR(
+ contrib/libs/protobuf
+ library/cpp/histogram/adaptive/protos
+)
+
+END()
diff --git a/library/cpp/histogram/ya.make b/library/cpp/histogram/ya.make
new file mode 100644
index 0000000000..77cfd96902
--- /dev/null
+++ b/library/cpp/histogram/ya.make
@@ -0,0 +1,9 @@
+RECURSE(
+ adaptive
+ hdr
+ hdr/ut
+ simple
+ simple/ut
+ rt
+ rt/ut
+)