diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/histogram | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/histogram')
18 files changed, 2676 insertions, 0 deletions
diff --git a/library/cpp/histogram/adaptive/README.md b/library/cpp/histogram/adaptive/README.md new file mode 100644 index 0000000000..17a4edbe64 --- /dev/null +++ b/library/cpp/histogram/adaptive/README.md @@ -0,0 +1 @@ +See https://wiki.yandex-team.ru/robot/newdesign/histograms/ diff --git a/library/cpp/histogram/adaptive/adaptive_histogram.cpp b/library/cpp/histogram/adaptive/adaptive_histogram.cpp new file mode 100644 index 0000000000..cbfc494021 --- /dev/null +++ b/library/cpp/histogram/adaptive/adaptive_histogram.cpp @@ -0,0 +1,637 @@ +#include "adaptive_histogram.h" + +#include <util/generic/algorithm.h> +#include <util/generic/yexception.h> +#include <util/generic/ymath.h> +#include <util/string/printf.h> + +#include <util/system/backtrace.h> + +namespace NKiwiAggr { + TAdaptiveHistogram::TAdaptiveHistogram(size_t intervals, ui64 id, TQualityFunction qualityFunc) + : Id(id) + , Sum(0.0) + , Intervals(intervals) + , CalcQuality(qualityFunc) + { + } + + TAdaptiveHistogram::TAdaptiveHistogram(const THistogram& histo, size_t defaultIntervals, ui64 defaultId, TQualityFunction qualityFunc) + : TAdaptiveHistogram(defaultIntervals, defaultId, qualityFunc) + { + FromProto(histo); + } + + TAdaptiveHistogram::TAdaptiveHistogram(IHistogram* histo, size_t defaultIntervals, ui64 defaultId, TQualityFunction qualityFunc) + : TAdaptiveHistogram(defaultIntervals, defaultId, qualityFunc) + { + TAdaptiveHistogram* adaptiveHisto = dynamic_cast<TAdaptiveHistogram*>(histo); + if (!adaptiveHisto) { + FromIHistogram(histo); + return; + } + Id = adaptiveHisto->Id; + MinValue = adaptiveHisto->MinValue; + MaxValue = adaptiveHisto->MaxValue; + Sum = adaptiveHisto->Sum; + Intervals = adaptiveHisto->Intervals; + Bins = adaptiveHisto->Bins; + BinsByQuality = adaptiveHisto->BinsByQuality; + if (CalcQuality == nullptr) { + CalcQuality = adaptiveHisto->CalcQuality; + } + } + + TQualityFunction TAdaptiveHistogram::GetQualityFunc() { + return CalcQuality; + } + + void TAdaptiveHistogram::Clear() { + Sum = 0.0; + Bins.clear(); + BinsByQuality.clear(); + } + + void TAdaptiveHistogram::Add(const THistoRec& histoRec) { + if (!histoRec.HasId() || histoRec.GetId() == Id) { + Add(histoRec.GetValue(), histoRec.GetWeight()); + } + } + + void TAdaptiveHistogram::Add(double value, double weight) { + if (!IsValidFloat(value) || !IsValidFloat(weight)) { + ythrow yexception() << Sprintf("Histogram id %lu: bad value %f weight %f", Id, value, weight); + } + TWeightedValue weightedValue(value, weight); + Add(weightedValue, true); + PrecomputedBins.clear(); + } + + void TAdaptiveHistogram::Merge(const THistogram& histo, double multiplier) { + if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue())) { + fprintf(stderr, "Merging in histogram id %lu: skip bad histo with minvalue %f maxvalue %f\n", Id, histo.GetMinValue(), histo.GetMaxValue()); + return; + } + if (histo.FreqSize() == 0) { + return; // skip empty histos + } + if (histo.GetType() == HT_ADAPTIVE_DISTANCE_HISTOGRAM || + histo.GetType() == HT_ADAPTIVE_WEIGHT_HISTOGRAM || + histo.GetType() == HT_ADAPTIVE_WARD_HISTOGRAM || + histo.GetType() == HT_ADAPTIVE_HISTOGRAM) + { + Y_VERIFY(histo.FreqSize() == histo.PositionSize(), "Corrupted histo"); + for (size_t j = 0; j < histo.FreqSize(); ++j) { + double value = histo.GetPosition(j); + double weight = histo.GetFreq(j); + if (!IsValidFloat(value) || !IsValidFloat(weight)) { + fprintf(stderr, "Merging in histogram id %lu: skip bad value %f weight %f\n", Id, value, weight); + continue; + } + Add(value, weight * multiplier); + } + + MinValue = Min(MinValue, histo.GetMinValue()); + MaxValue = Max(MaxValue, histo.GetMaxValue()); + } else if (histo.GetType() == HT_FIXED_BIN_HISTOGRAM) { + double pos = histo.GetMinValue() + histo.GetBinRange() / 2.0; + for (size_t j = 0; j < histo.FreqSize(); ++j) { + double weight = histo.GetFreq(j); + if (!IsValidFloat(pos) || !IsValidFloat(weight)) { + fprintf(stderr, "Merging in histogram id %lu: skip bad value %f weight %f\n", Id, pos, weight); + pos += histo.GetBinRange(); + continue; + } + Add(pos, weight * multiplier); + pos += histo.GetBinRange(); + } + + MinValue = Min(MinValue, histo.GetMinValue()); + MaxValue = Max(MaxValue, histo.GetMaxValue()); + } else { + ythrow yexception() << "Unknown THistogram type"; + } + } + + void TAdaptiveHistogram::Merge(const TVector<THistogram>& histogramsToMerge) { + for (size_t i = 0; i < histogramsToMerge.size(); ++i) { + Merge(histogramsToMerge[i], 1.0); + } + } + + void TAdaptiveHistogram::Merge(TVector<IHistogramPtr> histogramsToMerge) { + TVector<IHistogramPtr> histogramsToMergeRepacked(0); + TVector<TAdaptiveHistogram*> histograms(0); + for (size_t i = 0; i < histogramsToMerge.size(); ++i) { + if (!histogramsToMerge[i] || histogramsToMerge[i]->Empty()) { + continue; + } + TAdaptiveHistogram* adaptiveHisto = dynamic_cast<TAdaptiveHistogram*>(histogramsToMerge[i].Get()); + if (adaptiveHisto) { + histogramsToMergeRepacked.push_back(histogramsToMerge[i]); + } else { + histogramsToMergeRepacked.push_back(IHistogramPtr(new TAdaptiveHistogram(histogramsToMerge[i].Get(), Intervals, Id, CalcQuality))); // Convert histograms that are not of TFixedBinHistogram type + } + if (histogramsToMergeRepacked.back()->Empty()) { + continue; + } + histograms.push_back(dynamic_cast<TAdaptiveHistogram*>(histogramsToMergeRepacked.back().Get())); + } + + if (histograms.size() == 0) { + return; + } + + for (size_t histoIndex = 0; histoIndex < histograms.size(); ++histoIndex) { + TAdaptiveHistogram* histo = histograms[histoIndex]; + for (TPairSet::const_iterator it = histo->Bins.begin(); it != histo->Bins.end(); ++it) { + Add(*it, true); + } + } + + for (size_t i = 0; i < histograms.size(); ++i) { + MinValue = Min(MinValue, histograms[i]->MinValue); + MaxValue = Max(MaxValue, histograms[i]->MaxValue); + } + } + + void TAdaptiveHistogram::Multiply(double factor) { + if (!IsValidFloat(factor) || factor <= 0) { + ythrow yexception() << "Not valid factor in IHistogram::Multiply(): " << factor; + } + Sum *= factor; + + TPairSet newBins; + for (TPairSet::iterator it = Bins.begin(); it != Bins.end(); ++it) { + newBins.insert(TWeightedValue(it->first, it->second * factor)); + } + Bins = newBins; + + TPairSet newBinsByQuality; + for (TPairSet::iterator it = Bins.begin(); it != Bins.end(); ++it) { + TPairSet::iterator rightBin = it; + ++rightBin; + if (rightBin == Bins.end()) { + break; + } + newBinsByQuality.insert(CalcQuality(*it, *rightBin)); + } + BinsByQuality = newBinsByQuality; + } + + void TAdaptiveHistogram::FromProto(const THistogram& histo) { + Y_VERIFY(histo.HasType(), "Attempt to parse TAdaptiveHistogram from THistogram protobuf with no Type field set"); + ; + switch (histo.GetType()) { // check that histogram type could be deduced + case HT_ADAPTIVE_DISTANCE_HISTOGRAM: + case HT_ADAPTIVE_WEIGHT_HISTOGRAM: + case HT_ADAPTIVE_WARD_HISTOGRAM: + break; // ok + case HT_ADAPTIVE_HISTOGRAM: + if (CalcQuality != nullptr) + break; // ok + [[fallthrough]]; + default: // not ok + ythrow yexception() << "Attempt to parse TAdaptiveHistogram from THistogram protobuf record of type = " << (ui32)histo.GetType(); + } + + if (histo.FreqSize() != histo.PositionSize()) { + ythrow yexception() << "Attempt to parse TAdaptiveHistogram from THistogram protobuf record where FreqSize != PositionSize. FreqSize == " << (ui32)histo.FreqSize() << ", PositionSize == " << (ui32)histo.PositionSize(); + } + if (CalcQuality == nullptr) { + if (histo.GetType() == HT_ADAPTIVE_DISTANCE_HISTOGRAM) { + CalcQuality = CalcDistanceQuality; + } else if (histo.GetType() == HT_ADAPTIVE_WEIGHT_HISTOGRAM) { + CalcQuality = CalcWeightQuality; + } else if (histo.GetType() == HT_ADAPTIVE_WARD_HISTOGRAM) { + CalcQuality = CalcWardQuality; + } else { + ythrow yexception() << "Attempt to parse an HT_ADAPTIVE_HISTOGRAM without default quality function"; + } + } + Id = histo.GetId(); + Sum = 0.0; + Intervals = Max(Intervals, histo.FreqSize()); + for (size_t i = 0; i < histo.FreqSize(); ++i) { + double value = histo.GetPosition(i); + double weight = histo.GetFreq(i); + if (!IsValidFloat(value) || !IsValidFloat(weight)) { + fprintf(stderr, "FromProto in histogram id %lu: skip bad value %f weight %f\n", Id, value, weight); + continue; + } + Add(value, weight); + } + + if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue())) { + ythrow yexception() << Sprintf("FromProto in histogram id %lu: skip bad histo with minvalue %f maxvalue %f", Id, histo.GetMinValue(), histo.GetMaxValue()); + } + + MinValue = histo.GetMinValue(); + MaxValue = histo.GetMaxValue(); + } + + void TAdaptiveHistogram::ToProto(THistogram& histo) { + histo.Clear(); + if (CalcQuality == CalcDistanceQuality) { + histo.SetType(HT_ADAPTIVE_DISTANCE_HISTOGRAM); + } else if (CalcQuality == CalcWeightQuality) { + histo.SetType(HT_ADAPTIVE_WEIGHT_HISTOGRAM); + } else if (CalcQuality == CalcWardQuality) { + histo.SetType(HT_ADAPTIVE_WARD_HISTOGRAM); + } else { + histo.SetType(HT_ADAPTIVE_HISTOGRAM); + } + histo.SetId(Id); + if (Empty()) { + return; + } + histo.SetMinValue(MinValue); + histo.SetMaxValue(MaxValue); + for (TPairSet::const_iterator it = Bins.begin(); it != Bins.end(); ++it) { + histo.AddFreq(it->second); + histo.AddPosition(it->first); + } + } + + void TAdaptiveHistogram::SetId(ui64 id) { + Id = id; + } + + ui64 TAdaptiveHistogram::GetId() { + return Id; + } + + bool TAdaptiveHistogram::Empty() { + return Bins.size() == 0; + } + + double TAdaptiveHistogram::GetMinValue() { + return MinValue; + } + + double TAdaptiveHistogram::GetMaxValue() { + return MaxValue; + } + + double TAdaptiveHistogram::GetSum() { + return Sum; + } + + double TAdaptiveHistogram::GetSumInRange(double leftBound, double rightBound) { + if (leftBound > rightBound) { + return 0.0; + } + return GetSumAboveBound(leftBound) + GetSumBelowBound(rightBound) - Sum; + } + + double TAdaptiveHistogram::GetSumAboveBound(double bound) { + if (Empty()) { + return 0.0; + } + if (bound < MinValue) { + return Sum; + } + if (bound > MaxValue) { + return 0.0; + } + + if (!PrecomputedBins.empty()) { + return GetSumAboveBoundImpl( + bound, + PrecomputedBins, + LowerBound(PrecomputedBins.begin(), PrecomputedBins.end(), TFastBin{bound, -1.0, 0, 0}), + [](const auto& it) { return it->SumAbove; }); + } else { + return GetSumAboveBoundImpl( + bound, + Bins, + Bins.lower_bound(TWeightedValue(bound, -1.0)), + [this](TPairSet::const_iterator rightBin) { + ++rightBin; + double sum = 0; + for (TPairSet::const_iterator it = rightBin; it != Bins.end(); ++it) { + sum += it->second; + } + return sum; + }); + } + } + + double TAdaptiveHistogram::GetSumBelowBound(double bound) { + if (Empty()) { + return 0.0; + } + if (bound < MinValue) { + return 0.0; + } + if (bound > MaxValue) { + return Sum; + } + + if (!PrecomputedBins.empty()) { + return GetSumBelowBoundImpl( + bound, + PrecomputedBins, + LowerBound(PrecomputedBins.begin(), PrecomputedBins.end(), TFastBin{bound, -1.0, 0, 0}), + [](const auto& it) { return it->SumBelow; }); + } else { + return GetSumBelowBoundImpl( + bound, + Bins, + Bins.lower_bound(TWeightedValue(bound, -1.0)), + [this](TPairSet::const_iterator rightBin) { + double sum = 0; + for (TPairSet::iterator it = Bins.begin(); it != rightBin; ++it) { + sum += it->second; + } + return sum; + }); + } + } + + double TAdaptiveHistogram::CalcUpperBound(double sum) { + Y_VERIFY(sum >= 0, "Sum must be >= 0"); + if (sum == 0.0) { + return MinValue; + } + if (Empty()) { + return MaxValue; + } + TPairSet::iterator current = Bins.begin(); + double gatheredSum = 0.0; + while (current != Bins.end() && gatheredSum < sum) { + gatheredSum += current->second; + ++current; + } + --current; + if (gatheredSum < sum) { + return MaxValue; + } + TWeightedValue left(MinValue, 0.0); + TWeightedValue right(MaxValue, 0.0); + if (current != Bins.begin()) { + TPairSet::iterator leftBin = current; + --leftBin; + left = *leftBin; + } + { + TPairSet::iterator rightBin = current; + ++rightBin; + if (rightBin != Bins.end()) { + right = *rightBin; + } + } + double sumToAdd = sum - (gatheredSum - current->second - left.second / 2); + if (sumToAdd <= ((current->second + left.second) / 2)) { + return left.first + 2 * sumToAdd * (current->first - left.first) / (current->second + left.second); + } else { + sumToAdd -= (current->second + left.second) / 2; + return current->first + 2 * sumToAdd * (right.first - current->first) / (right.second + current->second); + } + } + + double TAdaptiveHistogram::CalcLowerBound(double sum) { + Y_VERIFY(sum >= 0, "Sum must be >= 0"); + if (sum == 0.0) { + return MaxValue; + } + if (Empty()) { + return MinValue; + } + TPairSet::iterator current = Bins.end(); + double gatheredSum = 0.0; + while (current != Bins.begin() && gatheredSum < sum) { + --current; + gatheredSum += current->second; + } + if (gatheredSum < sum) { + return MinValue; + } + TWeightedValue left(MinValue, 0.0); + TWeightedValue right(MaxValue, 0.0); + if (current != Bins.begin()) { + TPairSet::iterator leftBin = current; + --leftBin; + left = *leftBin; + } + { + TPairSet::iterator rightBin = current; + ++rightBin; + if (rightBin != Bins.end()) { + right = *rightBin; + } + } + double sumToAdd = sum - (gatheredSum - current->second - right.second / 2); + if (sumToAdd <= ((current->second + right.second) / 2)) { + return right.first - 2 * sumToAdd * (right.first - current->first) / (current->second + right.second); + } else { + sumToAdd -= (current->second + right.second) / 2; + return current->first - 2 * sumToAdd * (current->first - left.first) / (left.second + current->second); + } + } + + double TAdaptiveHistogram::CalcUpperBoundSafe(double sum) { + if (!Empty()) { + sum = Max(Bins.begin()->second, sum); + } + return CalcUpperBound(sum); + } + + double TAdaptiveHistogram::CalcLowerBoundSafe(double sum) { + if (!Empty()) { + sum = Max(Bins.rbegin()->second, sum); + } + return CalcLowerBound(sum); + } + + void TAdaptiveHistogram::FromIHistogram(IHistogram* histo) { + if (!histo) { + ythrow yexception() << "Attempt to create TAdaptiveHistogram from a NULL pointer"; + } + if (CalcQuality == CalcWardQuality) { + ythrow yexception() << "Not implemented"; + } else if (CalcQuality != CalcDistanceQuality && CalcQuality != CalcWeightQuality) { + ythrow yexception() << "Attempt to create TAdaptiveHistogram from a pointer without default CalcQuality"; + } + Id = histo->GetId(); + if (histo->Empty()) { + return; + } + double sum = histo->GetSum(); + double minValue = histo->GetMinValue(); + double maxValue = histo->GetMaxValue(); + if (minValue == maxValue) { + Add(minValue, sum); + return; + } + if (CalcQuality == CalcDistanceQuality) { + double binRange = (maxValue - minValue) / (Intervals); + for (size_t i = 0; i < Intervals; ++i) { + Add(minValue + binRange * (i + 0.5), histo->GetSumInRange(minValue + binRange * i, minValue + binRange * (i + 1))); + } + } else if (CalcQuality == CalcWeightQuality && sum != 0.0) { + double slab = sum / Intervals; + double prevBound = minValue; + for (size_t i = 0; i < Intervals; ++i) { + double bound = histo->CalcUpperBound(slab * (i + 1)); + Add((bound + prevBound) / 2, slab); + prevBound = bound; + } + } + MinValue = minValue; + MaxValue = maxValue; + } + + void TAdaptiveHistogram::Add(const TWeightedValue& weightedValue, bool initial) { + const double& value = weightedValue.first; + const double& weight = weightedValue.second; + if (weight <= 0.0) { + return; // all zero-weighted values should be skipped because they don't affect the distribution, negative weights are forbidden + } + if (initial) { + Sum += weight; + } + if (Bins.size() == 0) { + MinValue = value; + MaxValue = value; + Bins.insert(weightedValue); + return; + } + if (value < MinValue) { + MinValue = value; + } + if (value > MaxValue) { + MaxValue = value; + } + TPairSet::iterator rightBin = Bins.lower_bound(TWeightedValue(value, -1.0)); + if (rightBin != Bins.end() && rightBin->first == value) { + TPairSet::iterator currentBin = rightBin; + ++rightBin; + TWeightedValue newBin(value, weight + currentBin->second); + if (rightBin != Bins.end()) { + Y_VERIFY(BinsByQuality.erase(CalcQuality(*currentBin, *rightBin)) == 1, "Erase failed"); + BinsByQuality.insert(CalcQuality(newBin, *rightBin)); + } + if (currentBin != Bins.begin()) { + TPairSet::iterator leftBin = currentBin; + --leftBin; + Y_VERIFY(BinsByQuality.erase(CalcQuality(*leftBin, *currentBin)) == 1, "Erase failed"); + BinsByQuality.insert(CalcQuality(*leftBin, newBin)); + } + Bins.erase(currentBin); + Bins.insert(newBin); + return; + } + if (rightBin == Bins.begin()) { + BinsByQuality.insert(CalcQuality(weightedValue, *rightBin)); + } else { + TPairSet::iterator leftBin = rightBin; + --leftBin; + if (rightBin == Bins.end()) { + BinsByQuality.insert(CalcQuality(*leftBin, weightedValue)); + } else { + Y_VERIFY(BinsByQuality.erase(CalcQuality(*leftBin, *rightBin)) == 1, "Erase failed"); + BinsByQuality.insert(CalcQuality(*leftBin, weightedValue)); + BinsByQuality.insert(CalcQuality(weightedValue, *rightBin)); + } + } + Bins.insert(weightedValue); + if (Bins.size() > Intervals) { + Shrink(); + } + } + + void TAdaptiveHistogram::Erase(double value) { + TPairSet::iterator currentBin = Bins.lower_bound(TWeightedValue(value, -1.0)); + Y_VERIFY(currentBin != Bins.end() && currentBin->first == value, "Can't find bin that should be erased"); + TPairSet::iterator rightBin = currentBin; + ++rightBin; + if (currentBin == Bins.begin()) { + Y_VERIFY(rightBin != Bins.end(), "No right bin for the first bin"); + Y_VERIFY(BinsByQuality.erase(CalcQuality(*currentBin, *rightBin)) != 0, "Erase failed"); + } else { + TPairSet::iterator leftBin = currentBin; + --leftBin; + if (rightBin == Bins.end()) { + Y_VERIFY(BinsByQuality.erase(CalcQuality(*leftBin, *currentBin)) != 0, "Erase failed"); + } else { + Y_VERIFY(BinsByQuality.erase(CalcQuality(*leftBin, *currentBin)) != 0, "Erase failed"); + Y_VERIFY(BinsByQuality.erase(CalcQuality(*currentBin, *rightBin)) != 0, "Erase failed"); + BinsByQuality.insert(CalcQuality(*leftBin, *rightBin)); + } + } + Bins.erase(currentBin); + } + + void TAdaptiveHistogram::Shrink() { + TPairSet::iterator worstBin = BinsByQuality.begin(); + Y_VERIFY(worstBin != BinsByQuality.end(), "No right bin for the first bin"); + TPairSet::iterator leftBin = Bins.lower_bound(TWeightedValue(worstBin->second, -1.0)); + Y_VERIFY(leftBin != Bins.end() && leftBin->first == worstBin->second, "Can't find worst bin"); + TPairSet::iterator rightBin = leftBin; + ++rightBin; + Y_VERIFY(rightBin != Bins.end(), "Can't find right bin"); + + TWeightedValue newBin((leftBin->first * leftBin->second + rightBin->first * rightBin->second) / (leftBin->second + rightBin->second), leftBin->second + rightBin->second); + if (Bins.size() > 2) { + Erase(leftBin->first); + Erase(rightBin->first); + } else { + Bins.clear(); + BinsByQuality.clear(); + } + Add(newBin, false); + } + + void TAdaptiveHistogram::PrecomputePartialSums() { + PrecomputedBins.clear(); + PrecomputedBins.reserve(Bins.size()); + double currentSum = 0; + for (const auto& bin : Bins) { + PrecomputedBins.emplace_back(bin.first, bin.second, currentSum, Sum - currentSum - bin.second); + currentSum += bin.second; + } + } + + template <typename TBins, typename TGetSumAbove> + double TAdaptiveHistogram::GetSumAboveBoundImpl(double bound, const TBins& bins, typename TBins::const_iterator rightBin, const TGetSumAbove& getSumAbove) const { + typename TBins::value_type left(MinValue, 0.0); + typename TBins::value_type right(MaxValue, 0.0); + if (rightBin != bins.end()) { + right = *rightBin; + } + if (rightBin != bins.begin()) { + typename TBins::const_iterator leftBin = rightBin; + --leftBin; + left = *leftBin; + } + double sum = (right.second / 2) + ((right.first == left.first) ? ((left.second + right.second) / 2) : (((left.second + right.second) / 2) * (right.first - bound) / (right.first - left.first))); + if (rightBin == bins.end()) { + return sum; + } + sum += getSumAbove(rightBin); + return sum; + } + + template <typename TBins, typename TGetSumBelow> + double TAdaptiveHistogram::GetSumBelowBoundImpl(double bound, const TBins& bins, typename TBins::const_iterator rightBin, const TGetSumBelow& getSumBelow) const { + typename TBins::value_type left(MinValue, 0.0); + typename TBins::value_type right(MaxValue, 0.0); + if (rightBin != bins.end()) { + right = *rightBin; + } + if (rightBin != bins.begin()) { + typename TBins::const_iterator leftBin = rightBin; + --leftBin; + left = *leftBin; + } + double sum = (left.second / 2) + ((right.first == left.first) ? ((left.second + right.second) / 2) : (((left.second + right.second) / 2) * (bound - left.first) / (right.first - left.first))); + if (rightBin == bins.begin()) { + return sum; + } + --rightBin; + sum += getSumBelow(rightBin); + return sum; + } + +} diff --git a/library/cpp/histogram/adaptive/adaptive_histogram.h b/library/cpp/histogram/adaptive/adaptive_histogram.h new file mode 100644 index 0000000000..fa8f48433f --- /dev/null +++ b/library/cpp/histogram/adaptive/adaptive_histogram.h @@ -0,0 +1,131 @@ +#pragma once + +#include "histogram.h" +#include "common.h" + +#include <library/cpp/histogram/adaptive/protos/histo.pb.h> + +#include <util/generic/ptr.h> +#include <util/generic/set.h> +#include <util/generic/vector.h> + +namespace NKiwiAggr { + class TAdaptiveHistogram: private TNonCopyable, public IHistogram { + protected: + static const size_t DEFAULT_INTERVALS = 100; + + private: + using TPairSet = TSet<TWeightedValue>; + + struct TFastBin { + // these names are for compatibility with TWeightedValue + double first; + double second; + // both sums do not include current bin + double SumBelow; + double SumAbove; + + TFastBin(double first_, double second_, double sumBelow = 0, double sumAbove = 0) + : first(first_) + , second(second_) + , SumBelow(sumBelow) + , SumAbove(sumAbove) + { + } + + bool operator<(const TFastBin& rhs) const { + return first < rhs.first; + } + }; + + ui64 Id; + double MinValue; + double MaxValue; + double Sum; + size_t Intervals; + TPairSet Bins; + TPairSet BinsByQuality; + TQualityFunction CalcQuality; + + TVector<TFastBin> PrecomputedBins; + + public: + TAdaptiveHistogram(size_t intervals, ui64 id = 0, TQualityFunction qualityFunc = CalcWeightQuality); + TAdaptiveHistogram(const THistogram& histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0, TQualityFunction qualityFunc = nullptr); + TAdaptiveHistogram(IHistogram* histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0, TQualityFunction qualityFunc = CalcWeightQuality); + + ~TAdaptiveHistogram() override { + } + + TQualityFunction GetQualityFunc(); + + void Clear() override; + + void Add(double value, double weight) override; + void Add(const THistoRec& histoRec) override; + + void Merge(const THistogram& histo, double multiplier) final; + void Merge(const TVector<THistogram>& histogramsToMerge) final; + void Merge(TVector<IHistogramPtr> histogramsToMerge) final; + + void Multiply(double factor) final; + + void FromProto(const THistogram& histo) final; + void ToProto(THistogram& histo) final; + + void SetId(ui64 id) final; + ui64 GetId() final; + bool Empty() final; + double GetMinValue() final; + double GetMaxValue() final; + double GetSum() final; + double GetSumInRange(double leftBound, double rightBound) final; + double GetSumAboveBound(double bound) final; + double GetSumBelowBound(double bound) final; + double CalcUpperBound(double sum) final; + double CalcLowerBound(double sum) final; + double CalcUpperBoundSafe(double sum) final; + double CalcLowerBoundSafe(double sum) final; + + void PrecomputePartialSums() final; + + private: + void FromIHistogram(IHistogram* histo); + void Add(const TWeightedValue& weightedValue, bool initial); + void Erase(double value); + void Shrink(); + + template <typename TBins, typename TGetSumAbove> + double GetSumAboveBoundImpl(double bound, const TBins& bins, typename TBins::const_iterator rightBin, const TGetSumAbove& getSumAbove) const; + + template <typename TBins, typename TGetSumBelow> + double GetSumBelowBoundImpl(double bound, const TBins& bins, typename TBins::const_iterator rightBin, const TGetSumBelow& getSumBelow) const; + }; + + template <TQualityFunction QualityFunction> + class TDefinedAdaptiveHistogram: public TAdaptiveHistogram { + public: + TDefinedAdaptiveHistogram(size_t intervals, ui64 id = 0) + : TAdaptiveHistogram(intervals, id, QualityFunction) + { + } + + TDefinedAdaptiveHistogram(const THistogram& histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0) + : TAdaptiveHistogram(histo, defaultIntervals, defaultId, QualityFunction) + { + } + + TDefinedAdaptiveHistogram(IHistogram* histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0) + : TAdaptiveHistogram(histo, defaultIntervals, defaultId, QualityFunction) + { + } + + ~TDefinedAdaptiveHistogram() override { + } + }; + + typedef TDefinedAdaptiveHistogram<CalcDistanceQuality> TAdaptiveDistanceHistogram; + typedef TDefinedAdaptiveHistogram<CalcWeightQuality> TAdaptiveWeightHistogram; + typedef TDefinedAdaptiveHistogram<CalcWardQuality> TAdaptiveWardHistogram; + +} diff --git a/library/cpp/histogram/adaptive/auto_histogram.h b/library/cpp/histogram/adaptive/auto_histogram.h new file mode 100644 index 0000000000..9fdf0b9abe --- /dev/null +++ b/library/cpp/histogram/adaptive/auto_histogram.h @@ -0,0 +1,148 @@ +#pragma once + +#include "adaptive_histogram.h" +#include "fixed_bin_histogram.h" +#include "histogram.h" + +#include <library/cpp/histogram/adaptive/protos/histo.pb.h> + +#include <util/generic/ptr.h> +#include <util/generic/yexception.h> + +namespace NKiwiAggr { + class TAutoHistogram: private TNonCopyable, public IHistogram { + private: + static const size_t DEFAULT_INTERVALS = 100; + + ui64 Id; + size_t Intervals; + THolder<IHistogram> HistogramImpl; + + public: + TAutoHistogram(size_t intervals, ui64 id = 0) { + Y_UNUSED(intervals); + Y_UNUSED(id); + ythrow yexception() << "Empty constructor is not defined for TAutoHistogram"; + } + + TAutoHistogram(const THistogram& histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0) + : Id(defaultId) + , Intervals(defaultIntervals) + { + FromProto(histo); + } + + TAutoHistogram(IHistogram* histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0) { + Y_UNUSED(histo); + Y_UNUSED(defaultIntervals); + Y_UNUSED(defaultId); + ythrow yexception() << "IHistogram constructor is not defined for TAutoHistogram"; + } + + virtual ~TAutoHistogram() { + } + + virtual void Clear() { + HistogramImpl->Clear(); + } + + virtual void Add(double value, double weight) { + HistogramImpl->Add(value, weight); + } + + virtual void Add(const THistoRec& histoRec) { + HistogramImpl->Add(histoRec); + } + + virtual void Merge(const THistogram& histo, double multiplier) { + HistogramImpl->Merge(histo, multiplier); + } + + virtual void Merge(const TVector<THistogram>& histogramsToMerge) { + HistogramImpl->Merge(histogramsToMerge); + } + + virtual void Merge(TVector<IHistogramPtr> histogramsToMerge) { + HistogramImpl->Merge(histogramsToMerge); + } + + virtual void Multiply(double factor) { + HistogramImpl->Multiply(factor); + } + + virtual void FromProto(const THistogram& histo) { + if (!histo.HasType() || histo.GetType() == HT_FIXED_BIN_HISTOGRAM) { + HistogramImpl.Reset(new TFixedBinHistogram(histo, Intervals, Id)); + } else if (histo.GetType() == HT_ADAPTIVE_DISTANCE_HISTOGRAM) { + HistogramImpl.Reset(new TAdaptiveDistanceHistogram(histo, Intervals, Id)); + } else if (histo.GetType() == HT_ADAPTIVE_WEIGHT_HISTOGRAM) { + HistogramImpl.Reset(new TAdaptiveWeightHistogram(histo, Intervals, Id)); + } else if (histo.GetType() == HT_ADAPTIVE_WARD_HISTOGRAM) { + HistogramImpl.Reset(new TAdaptiveWardHistogram(histo, Intervals, Id)); + } else { + ythrow yexception() << "Can't parse TAutoHistogram from a THistogram of type " << (ui32)histo.GetType(); + } + } + + virtual void ToProto(THistogram& histo) { + HistogramImpl->ToProto(histo); + } + + virtual void SetId(ui64 id) { + HistogramImpl->SetId(id); + } + + virtual ui64 GetId() { + return HistogramImpl->GetId(); + } + + virtual bool Empty() { + return HistogramImpl->Empty(); + } + + virtual double GetMinValue() { + return HistogramImpl->GetMinValue(); + } + + virtual double GetMaxValue() { + return HistogramImpl->GetMaxValue(); + } + + virtual double GetSum() { + return HistogramImpl->GetSum(); + } + + virtual double GetSumInRange(double leftBound, double rightBound) { + return HistogramImpl->GetSumInRange(leftBound, rightBound); + } + + virtual double GetSumAboveBound(double bound) { + return HistogramImpl->GetSumAboveBound(bound); + } + + virtual double GetSumBelowBound(double bound) { + return HistogramImpl->GetSumBelowBound(bound); + } + + virtual double CalcUpperBound(double sum) { + return HistogramImpl->CalcUpperBound(sum); + } + + virtual double CalcLowerBound(double sum) { + return HistogramImpl->CalcLowerBound(sum); + } + + virtual double CalcUpperBoundSafe(double sum) { + return HistogramImpl->CalcUpperBoundSafe(sum); + } + + virtual double CalcLowerBoundSafe(double sum) { + return HistogramImpl->CalcLowerBoundSafe(sum); + } + + virtual void PrecomputePartialSums() { + return HistogramImpl->PrecomputePartialSums(); + } + }; + +} diff --git a/library/cpp/histogram/adaptive/block_histogram.cpp b/library/cpp/histogram/adaptive/block_histogram.cpp new file mode 100644 index 0000000000..6586d13ff6 --- /dev/null +++ b/library/cpp/histogram/adaptive/block_histogram.cpp @@ -0,0 +1,593 @@ +#include "block_histogram.h" + +#include <library/cpp/histogram/adaptive/protos/histo.pb.h> + +#include <util/generic/algorithm.h> +#include <util/generic/yexception.h> +#include <util/generic/intrlist.h> +#include <util/generic/ptr.h> +#include <util/generic/queue.h> +#include <util/generic/ymath.h> +#include <util/string/printf.h> + +namespace { + struct TEmpty { + }; + + class TSmartHeap { + private: + TVector<ui32> A; + TVector<ui32> Pos; + const TVector<double>& Weights; + + public: + TSmartHeap(const TVector<double>& weights) + : A(weights.size()) + , Pos(weights.size()) + , Weights(weights) + { + for (ui32 i = 0; i < weights.size(); ++i) { + A[i] = i; + Pos[i] = i; + } + for (ui32 i = weights.size() / 2; i > 0; --i) { + Down(i - 1); + } + } + + ui32 IdOfMin() { + return A[0]; + } + + void Pop() { + A[0] = A.back(); + Pos[A[0]] = 0; + A.pop_back(); + Down(0); + } + + void DownElement(ui32 id) { + Down(Pos[id]); + } + + private: + void SwapPositions(ui32 x, ui32 y) { + std::swap(A[x], A[y]); + Pos[A[x]] = x; + Pos[A[y]] = y; + } + + void Down(ui32 pos) { + while (1) { + ui32 left = pos * 2 + 1; + ui32 right = pos * 2 + 2; + ui32 min = pos; + if (left < A.size() && Weights[A[min]] > Weights[A[left]]) + min = left; + if (right < A.size() && Weights[A[min]] > Weights[A[right]]) + min = right; + if (pos == min) + break; + SwapPositions(min, pos); + pos = min; + } + } + }; + +} + +namespace NKiwiAggr { + /////////////////// + // TBlockHistogram + /////////////////// + + TBlockHistogram::TBlockHistogram(EHistogramType type, TQualityFunction calcQuality, + size_t intervals, ui64 id, size_t shrinkSize) + : Type(type) + , CalcQuality(calcQuality) + , Intervals(intervals) + , ShrinkSize(shrinkSize) + , PrevSize(0) + , Id(id) + , Sum(0) + , MinValue(0) + , MaxValue(0) + { + CorrectShrinkSize(); + } + + void TBlockHistogram::Clear() { + PrevSize = 0; + Sum = 0.0; + MinValue = 0.0; + MaxValue = 0.0; + Bins.clear(); + } + + void TBlockHistogram::Add(const THistoRec& rec) { + if (!rec.HasId() || rec.GetId() == Id) { + Add(rec.GetValue(), rec.GetWeight()); + } + } + + void TBlockHistogram::Add(double value, double weight) { + if (!IsValidFloat(value) || !IsValidFloat(weight)) { + ythrow yexception() << Sprintf("Histogram id %lu: bad value %f weight %f", Id, value, weight); + } + + if (weight <= 0.0) { + return; // all zero-weighted values should be skipped because they don't affect the distribution, negative weights are forbidden + } + + if (Bins.empty()) { + MinValue = value; + MaxValue = value; + } else { + MinValue = Min(MinValue, value); + MaxValue = Max(MaxValue, value); + } + + Sum += weight; + + if (Bins.size() > ShrinkSize) { + SortAndShrink(Intervals * SHRINK_MULTIPLIER); + } + + Bins.push_back(TWeightedValue(value, weight)); + } + + void TBlockHistogram::Merge(const THistogram& histo, double multiplier) { + if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue())) { + fprintf(stderr, "Merging in histogram id %lu: skip bad histo with minvalue %f maxvalue %f\n", Id, histo.GetMinValue(), histo.GetMaxValue()); + return; + } + if (histo.FreqSize() == 0) { + return; // skip empty histos + } + if (histo.GetType() == HT_ADAPTIVE_DISTANCE_HISTOGRAM || + histo.GetType() == HT_ADAPTIVE_WEIGHT_HISTOGRAM || + histo.GetType() == HT_ADAPTIVE_WARD_HISTOGRAM || + histo.GetType() == HT_ADAPTIVE_HISTOGRAM) + { + Y_VERIFY(histo.FreqSize() == histo.PositionSize(), "Corrupted histo"); + for (size_t j = 0; j < histo.FreqSize(); ++j) { + double value = histo.GetPosition(j); + double weight = histo.GetFreq(j); + if (!IsValidFloat(value) || !IsValidFloat(weight)) { + fprintf(stderr, "Merging in histogram id %lu: skip bad value %f weight %f\n", Id, value, weight); + continue; + } + Add(value, weight * multiplier); + } + + MinValue = Min(MinValue, histo.GetMinValue()); + MaxValue = Max(MaxValue, histo.GetMaxValue()); + } else if (histo.GetType() == HT_FIXED_BIN_HISTOGRAM) { + double pos = histo.GetMinValue() + histo.GetBinRange() / 2.0; + for (size_t j = 0; j < histo.FreqSize(); ++j) { + double weight = histo.GetFreq(j); + if (!IsValidFloat(pos) || !IsValidFloat(weight)) { + fprintf(stderr, "Merging in histogram id %lu: skip bad value %f weight %f\n", Id, pos, weight); + pos += histo.GetBinRange(); + continue; + } + Add(pos, weight * multiplier); + pos += histo.GetBinRange(); + } + + MinValue = Min(MinValue, histo.GetMinValue()); + MaxValue = Max(MaxValue, histo.GetMaxValue()); + } else { + ythrow yexception() << "Unknown THistogram type"; + } + } + + void TBlockHistogram::Merge(const TVector<THistogram>& histogramsToMerge) { + for (size_t i = 0; i < histogramsToMerge.size(); ++i) { + Merge(histogramsToMerge[i], 1.0); + } + } + + void TBlockHistogram::Merge(TVector<IHistogramPtr> histogramsToMerge) { + Y_UNUSED(histogramsToMerge); + ythrow yexception() << "IHistogram::Merge(TVector<IHistogramPtr>) is not defined for TBlockHistogram"; + } + + void TBlockHistogram::Multiply(double factor) { + if (!IsValidFloat(factor) || factor <= 0) { + ythrow yexception() << "Not valid factor in IHistogram::Multiply(): " << factor; + } + Sum *= factor; + for (TVector<TWeightedValue>::iterator it = Bins.begin(); it != Bins.end(); ++it) { + it->second *= factor; + } + } + + void TBlockHistogram::FromProto(const THistogram& histo) { + Y_VERIFY(histo.HasType(), "Attempt to parse TBlockHistogram from THistogram protobuf with no Type field set"); + ; + switch (histo.GetType()) { // check that histogram type is correct + case HT_ADAPTIVE_DISTANCE_HISTOGRAM: + case HT_ADAPTIVE_WEIGHT_HISTOGRAM: + case HT_ADAPTIVE_WARD_HISTOGRAM: + case HT_ADAPTIVE_HISTOGRAM: + break; // ok + default: // not ok + ythrow yexception() << "Attempt to parse TBlockHistogram from THistogram protobuf record of type = " << (ui32)histo.GetType(); + } + + if (histo.FreqSize() != histo.PositionSize()) { + ythrow yexception() << "Attempt to parse TBlockHistogram from THistogram protobuf record where FreqSize != PositionSize. FreqSize == " << (ui32)histo.FreqSize() << ", PositionSize == " << (ui32)histo.PositionSize(); + } + Id = histo.GetId(); + Sum = 0; + Intervals = Max(Intervals, histo.FreqSize()); + CorrectShrinkSize(); + Bins.resize(histo.FreqSize()); + PrevSize = Bins.size(); + for (size_t i = 0; i < histo.FreqSize(); ++i) { + double value = histo.GetPosition(i); + double weight = histo.GetFreq(i); + if (!IsValidFloat(value) || !IsValidFloat(weight)) { + fprintf(stderr, "FromProto in histogram id %lu: skip bad value %f weight %f\n", Id, value, weight); + continue; + } + Bins[i].first = value; + Bins[i].second = weight; + Sum += Bins[i].second; + } + + if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue())) { + ythrow yexception() << Sprintf("FromProto in histogram id %lu: skip bad histo with minvalue %f maxvalue %f", Id, histo.GetMinValue(), histo.GetMaxValue()); + } + MinValue = histo.GetMinValue(); + MaxValue = histo.GetMaxValue(); + } + + void TBlockHistogram::ToProto(THistogram& histo) { + histo.Clear(); + histo.SetType(Type); + histo.SetId(Id); + if (Empty()) { + return; + } + + SortAndShrink(Intervals, true); + histo.SetMinValue(MinValue); + histo.SetMaxValue(MaxValue); + for (TVector<TWeightedValue>::const_iterator it = Bins.begin(); it != Bins.end(); ++it) { + histo.AddFreq(it->second); + histo.AddPosition(it->first); + } + } + + void TBlockHistogram::SetId(ui64 id) { + Id = id; + } + + ui64 TBlockHistogram::GetId() { + return Id; + } + + bool TBlockHistogram::Empty() { + return Bins.empty(); + } + + double TBlockHistogram::GetMinValue() { + return MinValue; + } + + double TBlockHistogram::GetMaxValue() { + return MaxValue; + } + + double TBlockHistogram::GetSum() { + return Sum; + } + + void TBlockHistogram::SortAndShrink(size_t intervals, bool final) { + Y_VERIFY(intervals > 0); + + if (Bins.size() <= intervals) { + return; + } + + if (Bins.size() >= Intervals * GREEDY_SHRINK_MULTIPLIER) { + SortBins(); + UniquifyBins(); + FastGreedyShrink(intervals); + + if (final) { + SlowShrink(intervals); + } + + } else { + SortBins(); + UniquifyBins(); + SlowShrink(intervals); + } + } + + void TBlockHistogram::SortBins() { + Sort(Bins.begin() + PrevSize, Bins.end()); + if (PrevSize != 0) { + TVector<TWeightedValue> temp(Bins.begin(), Bins.begin() + PrevSize); + std::merge(temp.begin(), temp.end(), Bins.begin() + PrevSize, Bins.end(), Bins.begin()); + } + } + + void TBlockHistogram::UniquifyBins() { + if (Bins.empty()) + return; + + auto it1 = Bins.begin(); + auto it2 = Bins.begin(); + while (++it2 != Bins.end()) { + if (it1->first == it2->first) { + it1->second += it2->second; + } else { + *(++it1) = *it2; + } + } + + Bins.erase(++it1, Bins.end()); + } + + void TBlockHistogram::CorrectShrinkSize() { + ShrinkSize = Max(ShrinkSize, Intervals * (SHRINK_MULTIPLIER + GREEDY_SHRINK_MULTIPLIER)); + } + + void TBlockHistogram::SlowShrink(size_t intervals) { + { + size_t pos = 0; + for (size_t i = 1; i < Bins.size(); ++i) + if (Bins[i].first - Bins[pos].first < 1e-9) { + Bins[pos].second += Bins[i].second; + } else { + ++pos; + Bins[pos] = Bins[i]; + } + Bins.resize(pos + 1); + PrevSize = pos + 1; + } + + if (Bins.size() <= intervals) { + return; + } + + typedef TIntrusiveListItem<TEmpty> TListItem; + + ui32 n = Bins.size() - 1; + const ui32 end = (ui32)Bins.size(); + + TArrayHolder<TListItem> listElementsHolder(new TListItem[end + 1]); + TListItem* const bins = listElementsHolder.Get(); + + for (ui32 i = 1; i <= end; ++i) { + bins[i].LinkAfter(&bins[i - 1]); + } + + TVector<double> pairWeights(n); + + for (ui32 i = 0; i < n; ++i) { + pairWeights[i] = CalcQuality(Bins[i], Bins[i + 1]).first; + } + + TSmartHeap heap(pairWeights); + + while (n + 1 > intervals) { + ui32 b = heap.IdOfMin(); + heap.Pop(); + + ui32 a = (ui32)(bins[b].Prev() - bins); + ui32 c = (ui32)(bins[b].Next() - bins); + ui32 d = (ui32)(bins[b].Next()->Next() - bins); + Y_VERIFY(Bins[c].second != -1); + + double mass = Bins[b].second + Bins[c].second; + Bins[c].first = (Bins[b].first * Bins[b].second + Bins[c].first * Bins[c].second) / mass; + Bins[c].second = mass; + + bins[b].Unlink(); + Bins[b].second = -1; + + if (a != end) { + pairWeights[a] = CalcQuality(Bins[a], Bins[c]).first; + heap.DownElement(a); + } + + if (d != end && c + 1 != Bins.size()) { + pairWeights[c] = CalcQuality(Bins[c], Bins[d]).first; + heap.DownElement(c); + } + + --n; + } + + size_t pos = 0; + for (TListItem* it = bins[end].Next(); it != &bins[end]; it = it->Next()) { + Bins[pos++] = Bins[it - bins]; + } + + Bins.resize(pos); + PrevSize = pos; + Y_VERIFY(pos == intervals); + } + + double TBlockHistogram::GetSumInRange(double leftBound, double rightBound) { + Y_UNUSED(leftBound); + Y_UNUSED(rightBound); + ythrow yexception() << "Method is not implemented for TBlockHistogram"; + return 0; + } + + double TBlockHistogram::GetSumAboveBound(double bound) { + Y_UNUSED(bound); + ythrow yexception() << "Method is not implemented for TBlockHistogram"; + return 0; + } + + double TBlockHistogram::GetSumBelowBound(double bound) { + Y_UNUSED(bound); + ythrow yexception() << "Method is not implemented for TBlockHistogram"; + return 0; + } + + double TBlockHistogram::CalcUpperBound(double sum) { + Y_UNUSED(sum); + ythrow yexception() << "Method is not implemented for TBlockHistogram"; + return 0; + } + + double TBlockHistogram::CalcLowerBound(double sum) { + Y_UNUSED(sum); + ythrow yexception() << "Method is not implemented for TBlockHistogram"; + return 0; + } + + double TBlockHistogram::CalcUpperBoundSafe(double sum) { + Y_UNUSED(sum); + ythrow yexception() << "Method is not implemented for TBlockHistogram"; + return 0; + } + + double TBlockHistogram::CalcLowerBoundSafe(double sum) { + Y_UNUSED(sum); + ythrow yexception() << "Method is not implemented for TBlockHistogram"; + return 0; + } + + ///////////////////////// + // TBlockWeightHistogram + ///////////////////////// + + TBlockWeightHistogram::TBlockWeightHistogram(size_t intervals, ui64 id, size_t shrinkSize) + : TBlockHistogram(HT_ADAPTIVE_WEIGHT_HISTOGRAM, CalcWeightQuality, intervals, id, shrinkSize) + { + } + + void TBlockWeightHistogram::FastGreedyShrink(size_t intervals) { + if (Bins.size() <= intervals) + return; + + double slab = Sum / intervals; + + size_t i = 0; + size_t pos = 0; + while (i < Bins.size()) { + double curW = Bins[i].second; + double curMul = Bins[i].first * Bins[i].second; + ++i; + while (i < Bins.size() && curW + Bins[i].second <= slab && pos + Bins.size() - i >= intervals) { + curW += Bins[i].second; + curMul += Bins[i].first * Bins[i].second; + ++i; + } + Bins[pos++] = TWeightedValue(curMul / curW, curW); + } + + Bins.resize(pos); + PrevSize = pos; + } + + /////////////////////// + // TBlockWardHistogram + /////////////////////// + + TBlockWardHistogram::TBlockWardHistogram(size_t intervals, ui64 id, size_t shrinkSize) + : TBlockHistogram(HT_ADAPTIVE_WARD_HISTOGRAM, CalcWardQuality, intervals, id, shrinkSize) + { + } + + bool TBlockWardHistogram::CalcSplitInfo( + const TCumulatives::const_iterator beg, + const TCumulatives::const_iterator end, // (!) points to the final element + TSplitInfo& splitInfo // out + ) { + if (end - beg < 2) { + return false; + } + + TCumulatives::const_iterator mid = LowerBound(beg, end + 1, TCumulative{(beg->first + end->first) / 2, 0.}); + + if (mid == beg) { + mid++; + } else if (mid == end) { + mid--; + } + + // derived from Ward's minimum variance criterion + double profit = 0.0; + profit += (mid->second - beg->second) * (mid->second - beg->second) / (mid->first - beg->first); + profit += (end->second - mid->second) * (end->second - mid->second) / (end->first - mid->first); + profit -= (end->second - beg->second) * (end->second - beg->second) / (end->first - beg->first); + + splitInfo = {profit, beg, mid, end}; + + return true; + } + + void TBlockWardHistogram::FastGreedyShrink(size_t intervals) { + Y_VERIFY(intervals > 0); + + if (Bins.size() <= intervals) { + return; + } + + // fill cumulative sums + // sum at index i equals to the sum of all values before i + // sum at index i+1 equals to the sum of all values before i with the value at i added + TCumulatives cumulatives; + cumulatives.reserve(Bins.size() + 1); + + TCumulative cumulative = {0., 0.}; + cumulatives.push_back(cumulative); + for (size_t i = 0; i < Bins.size(); i++) { + cumulative.first += Bins[i].second; + cumulative.second += Bins[i].second * Bins[i].first; + cumulatives.push_back(cumulative); + } + + TVector<TCumulatives::const_iterator> splits; + splits.reserve(intervals + 1); + splits.push_back(cumulatives.begin()); + splits.push_back(cumulatives.end() - 1); + + TPriorityQueue<TSplitInfo> candidates; + + // explicitly add first split + TSplitInfo newSplitInfo; + if (CalcSplitInfo(cumulatives.begin(), cumulatives.end() - 1, newSplitInfo)) { + candidates.push(newSplitInfo); + } + + // recursively split until done + for (size_t split = 0; split < intervals - 1 && !candidates.empty(); split++) { + TSplitInfo curSplitInfo = candidates.top(); + candidates.pop(); + + splits.push_back(curSplitInfo.mid); + + if (CalcSplitInfo(curSplitInfo.beg, curSplitInfo.mid, newSplitInfo)) { + candidates.push(newSplitInfo); + } + if (CalcSplitInfo(curSplitInfo.mid, curSplitInfo.end, newSplitInfo)) { + candidates.push(newSplitInfo); + } + } + + // calclate new bin centers and weights + Sort(splits.begin(), splits.end()); + + Bins.clear(); + for (auto it = splits.begin(); it + 1 != splits.end(); ++it) { + auto splitBeg = *it; + auto splitEnd = *(it + 1); + double cnt = (splitEnd->first - splitBeg->first); + double mu = (splitEnd->second - splitBeg->second) / cnt; + + Bins.push_back(TWeightedValue(mu, cnt)); + } + } + +} diff --git a/library/cpp/histogram/adaptive/block_histogram.h b/library/cpp/histogram/adaptive/block_histogram.h new file mode 100644 index 0000000000..266bb2f2b2 --- /dev/null +++ b/library/cpp/histogram/adaptive/block_histogram.h @@ -0,0 +1,148 @@ +#pragma once + +#include "histogram.h" +#include "common.h" + +#include <library/cpp/histogram/adaptive/protos/histo.pb.h> + +#include <util/generic/ptr.h> +#include <util/generic/vector.h> +#include <utility> + +namespace NKiwiAggr { + /////////////////// + // TBlockHistogram + /////////////////// + + /** + * Contrary to adaptive histogram, block histogram doesn't rebuild bins + * after the addition of each point. Instead, it accumulates points and in case the amount + * of points overflows specified limits, it shrinks all the points at once to produce histogram + * Indeed, there exist two limits and two shrinkage operations: + * 1) FastGreedyShrink is fast but coarse. It is used to shrink from upper limit to intermediate limit + * (override FastGreedyShrink to set specific behaviour) + * 2) SlowShrink is slow, but produces finer histogram. It shrinks from the intermediate limit to the + * actual number of bins in a manner similar to that in adaptive histogram (set CalcQuality in constuctor) + * While FastGreedyShrink is used most of the time, SlowShrink is mostly used for histogram finalization + */ + class TBlockHistogram: private TNonCopyable, public IHistogram { + protected: + static const size_t SHRINK_MULTIPLIER = 2; + static const size_t GREEDY_SHRINK_MULTIPLIER = 4; + static const size_t DEFAULT_INTERVALS = 100; + static const size_t DEFAULT_SHRINK_SIZE = DEFAULT_INTERVALS * (SHRINK_MULTIPLIER + GREEDY_SHRINK_MULTIPLIER); + + const EHistogramType Type; + const TQualityFunction CalcQuality; + + size_t Intervals; + size_t ShrinkSize; + size_t PrevSize; + + ui64 Id; + + double Sum; + double MinValue; + double MaxValue; + + TVector<TWeightedValue> Bins; + + public: + TBlockHistogram(EHistogramType type, TQualityFunction calcQuality, + size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE); + + virtual ~TBlockHistogram() { + } + + virtual void Clear(); + + virtual void Add(double value, double weight); + virtual void Add(const THistoRec& histoRec); + + virtual void Merge(const THistogram& histo, double multiplier); + virtual void Merge(const TVector<THistogram>& histogramsToMerge); + virtual void Merge(TVector<IHistogramPtr> histogramsToMerge); // not implemented + + virtual void Multiply(double factor); + + virtual void FromProto(const THistogram& histo); + virtual void ToProto(THistogram& histo); + + virtual void SetId(ui64 id); + virtual ui64 GetId(); + virtual bool Empty(); + virtual double GetMinValue(); + virtual double GetMaxValue(); + virtual double GetSum(); + + // methods below are not implemented + virtual double GetSumInRange(double leftBound, double rightBound); + virtual double GetSumAboveBound(double bound); + virtual double GetSumBelowBound(double bound); + virtual double CalcUpperBound(double sum); + virtual double CalcLowerBound(double sum); + virtual double CalcUpperBoundSafe(double sum); + virtual double CalcLowerBoundSafe(double sum); + + private: + void SortBins(); + void UniquifyBins(); + void CorrectShrinkSize(); + + void SortAndShrink(size_t intervals, bool final = false); + + void SlowShrink(size_t intervals); + virtual void FastGreedyShrink(size_t intervals) = 0; + }; + + ///////////////////////// + // TBlockWeightHistogram + ///////////////////////// + + class TBlockWeightHistogram: public TBlockHistogram { + public: + TBlockWeightHistogram(size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE); + + virtual ~TBlockWeightHistogram() { + } + + private: + virtual void FastGreedyShrink(size_t intervals) final; + }; + + /////////////////////// + // TBlockWardHistogram + /////////////////////// + + class TBlockWardHistogram: public TBlockHistogram { + public: + TBlockWardHistogram(size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE); + + virtual ~TBlockWardHistogram() { + } + + private: + using TCumulative = std::pair<double, double>; // cumulative sum of (weights, weighted centers) + using TCumulatives = TVector<TCumulative>; + + struct TSplitInfo { + double profit; + + TCumulatives::const_iterator beg; + TCumulatives::const_iterator mid; + TCumulatives::const_iterator end; + + bool operator<(const TSplitInfo& other) const { + return profit < other.profit; + } + }; + + private: + virtual void FastGreedyShrink(size_t intervals) final; + + static bool CalcSplitInfo(const TCumulatives::const_iterator beg, + const TCumulatives::const_iterator end, + TSplitInfo& splitInfo); + }; + +} diff --git a/library/cpp/histogram/adaptive/common.cpp b/library/cpp/histogram/adaptive/common.cpp new file mode 100644 index 0000000000..afc6322fce --- /dev/null +++ b/library/cpp/histogram/adaptive/common.cpp @@ -0,0 +1,19 @@ +#include "common.h" + +namespace NKiwiAggr { + TWeightedValue CalcDistanceQuality(const TWeightedValue& left, const TWeightedValue& right) { + return TWeightedValue(right.first - left.first, left.first); + } + + TWeightedValue CalcWeightQuality(const TWeightedValue& left, const TWeightedValue& right) { + return TWeightedValue(right.second + left.second, left.first); + } + + TWeightedValue CalcWardQuality(const TWeightedValue& left, const TWeightedValue& right) { + const double N1 = left.second; + const double N2 = right.second; + const double mu1 = left.first; + const double mu2 = right.first; + return TWeightedValue(N1 * N2 / (N1 + N2) * (mu1 - mu2) * (mu1 - mu2), left.first); + } +} diff --git a/library/cpp/histogram/adaptive/common.h b/library/cpp/histogram/adaptive/common.h new file mode 100644 index 0000000000..c0f7dfb26b --- /dev/null +++ b/library/cpp/histogram/adaptive/common.h @@ -0,0 +1,12 @@ +#pragma once + +#include <utility> + +namespace NKiwiAggr { + using TWeightedValue = std::pair<double, double>; // value, weight + using TQualityFunction = TWeightedValue (*)(const TWeightedValue&, const TWeightedValue&); + + TWeightedValue CalcDistanceQuality(const TWeightedValue& left, const TWeightedValue& right); + TWeightedValue CalcWeightQuality(const TWeightedValue& left, const TWeightedValue& right); + TWeightedValue CalcWardQuality(const TWeightedValue& left, const TWeightedValue& right); +} diff --git a/library/cpp/histogram/adaptive/fixed_bin_histogram.cpp b/library/cpp/histogram/adaptive/fixed_bin_histogram.cpp new file mode 100644 index 0000000000..558aba9e2d --- /dev/null +++ b/library/cpp/histogram/adaptive/fixed_bin_histogram.cpp @@ -0,0 +1,538 @@ +#include "fixed_bin_histogram.h" +#include "auto_histogram.h" + +#include <library/cpp/histogram/adaptive/protos/histo.pb.h> + +#include <util/generic/algorithm.h> +#include <util/generic/yexception.h> +#include <util/generic/ymath.h> +#include <util/string/printf.h> + +namespace NKiwiAggr { + TFixedBinHistogram::TFixedBinHistogram(size_t intervals, ui64 id, size_t trainingSetSize) + : TrainingSetSize(trainingSetSize) + , IsInitialized(false) + , IsEmpty(true) + , Id(id) + , Sum(0.0) + , Freqs(0) + , ReserveFreqs(0) + , BinRange(0.0) + , Intervals(intervals) + , BaseIndex(intervals / 2) + { + } + + TFixedBinHistogram::TFixedBinHistogram(const THistogram& histo, size_t defaultIntervals, ui64 defaultId, size_t trainingSetSize) + : TrainingSetSize(trainingSetSize) + , IsInitialized(false) + , IsEmpty(true) + , Id(defaultId) + , Sum(0.0) + , Freqs(0) + , ReserveFreqs(0) + , BinRange(0.0) + , Intervals(defaultIntervals) + , BaseIndex(defaultIntervals / 2) + { + FromProto(histo); + } + + TFixedBinHistogram::TFixedBinHistogram(IHistogram* histo, size_t defaultIntervals, ui64 defaultId, size_t trainingSetSize) + : TrainingSetSize(trainingSetSize) + , IsInitialized(false) + , IsEmpty(true) + , Id(defaultId) + , Sum(0.0) + , Freqs(0) + , ReserveFreqs(0) + , BinRange(0.0) + , Intervals(defaultIntervals) + , BaseIndex(defaultIntervals / 2) + { + TFixedBinHistogram* fixedBinHisto = dynamic_cast<TFixedBinHistogram*>(histo); + if (!fixedBinHisto) { + FromIHistogram(histo); + return; + } + fixedBinHisto->Initialize(); + TrainingSetSize = fixedBinHisto->TrainingSetSize; + IsInitialized = fixedBinHisto->IsInitialized; + IsEmpty = fixedBinHisto->IsEmpty; + Id = fixedBinHisto->Id; + MinValue = fixedBinHisto->MinValue; + MaxValue = fixedBinHisto->MaxValue; + Sum = fixedBinHisto->Sum; + Freqs.assign(fixedBinHisto->Freqs.begin(), fixedBinHisto->Freqs.end()); + ReserveFreqs.assign(fixedBinHisto->ReserveFreqs.begin(), fixedBinHisto->ReserveFreqs.end()); + ReferencePoint = fixedBinHisto->ReferencePoint; + BinRange = fixedBinHisto->BinRange; + Intervals = fixedBinHisto->Intervals; + FirstUsedBin = fixedBinHisto->FirstUsedBin; + LastUsedBin = fixedBinHisto->LastUsedBin; + BaseIndex = fixedBinHisto->BaseIndex; + } + + void TFixedBinHistogram::Clear() { + TrainingSet.Destroy(); + IsInitialized = false; + IsEmpty = true; + Sum = 0.0; + Freqs.clear(); + ReserveFreqs.clear(); + BinRange = 0.0; + } + + void TFixedBinHistogram::Add(const THistoRec& rec) { + if (!rec.HasId() || rec.GetId() == Id) { + Add(rec.GetValue(), rec.GetWeight()); + } + } + + void TFixedBinHistogram::Add(double value, double weight) { + if (!IsValidFloat(value) || !IsValidFloat(weight)) { + ythrow yexception() << Sprintf("Histogram id %lu: bad value %f weight %f", Id, value, weight); + } + + if (weight <= 0.0) { + return; // all zero-weighted values should be skipped because they don't affect the distribution, negative weights are forbidden + } + + Sum += weight; + + if (!IsInitialized) { + if (!TrainingSet) { + TrainingSet.Reset(new TVector<TWeightedValue>(0)); + } + TrainingSet->push_back(TWeightedValue(value, weight)); + if (TrainingSet->size() >= TrainingSetSize) { + Initialize(); + } + return; + } + + i32 bin = CalcBin(value); + if (bin < 0 || bin >= (i32)Freqs.size() || (BinRange == 0.0 && value != ReferencePoint)) { + Shrink(Min(value, MinValue), Max(value, MaxValue)); + Freqs[CalcBin(value)] += weight; + } else { + MinValue = Min(value, MinValue); + MaxValue = Max(value, MaxValue); + FirstUsedBin = Min(FirstUsedBin, bin); + LastUsedBin = Max(LastUsedBin, bin); + Freqs[bin] += weight; + } + } + + void TFixedBinHistogram::Merge(const THistogram& /*histo*/, double /*multiplier*/) { + ythrow yexception() << "Method is not implemented for TFixedBinHistogram"; + } + + void TFixedBinHistogram::Merge(const TVector<THistogram>& histogramsToMerge) { + TVector<IHistogramPtr> parsedHistogramsToMerge; + for (size_t i = 0; i < histogramsToMerge.size(); ++i) { + parsedHistogramsToMerge.push_back(IHistogramPtr(new TAutoHistogram(histogramsToMerge[i], Intervals, Id))); + } + Merge(parsedHistogramsToMerge); + } + + void TFixedBinHistogram::Merge(TVector<IHistogramPtr> histogramsToMerge) { + TVector<IHistogramPtr> histogramsToMergeRepacked(0); + TVector<TFixedBinHistogram*> histograms(0); + + // put current histogram to the vector of histograms to merge and clear self + if (!Empty()) { + histogramsToMergeRepacked.push_back(IHistogramPtr(new TFixedBinHistogram(this, Intervals, Id, TrainingSetSize))); + histograms.push_back(dynamic_cast<TFixedBinHistogram*>(histogramsToMergeRepacked.back().Get())); + } + Clear(); + + for (size_t i = 0; i < histogramsToMerge.size(); ++i) { + if (!histogramsToMerge[i] || histogramsToMerge[i]->Empty()) { + continue; + } + TFixedBinHistogram* fixedBinHisto = dynamic_cast<TFixedBinHistogram*>(histogramsToMerge[i].Get()); + if (fixedBinHisto) { + fixedBinHisto->Initialize(); + histogramsToMergeRepacked.push_back(histogramsToMerge[i]); + } else { + histogramsToMergeRepacked.push_back(IHistogramPtr(new TFixedBinHistogram(histogramsToMerge[i].Get(), Intervals, Id, TrainingSetSize))); // Convert histograms that are not of TFixedBinHistogram type + } + histograms.push_back(dynamic_cast<TFixedBinHistogram*>(histogramsToMergeRepacked.back().Get())); + } + + if (histograms.size() == 0) { + return; + } + + double minValue = histograms[0]->MinValue; + double maxValue = histograms[0]->MaxValue; + Sum = histograms[0]->Sum; + for (size_t i = 1; i < histograms.size(); ++i) { + minValue = Min(minValue, histograms[i]->MinValue); + maxValue = Max(maxValue, histograms[i]->MaxValue); + Sum += histograms[i]->Sum; + } + SetFrame(minValue, maxValue, true); + + if (BinRange == 0.0) { + Freqs[BaseIndex] = Sum; + return; + } + + for (size_t histoIndex = 0; histoIndex < histograms.size(); ++histoIndex) { + TFixedBinHistogram* histo = histograms[histoIndex]; + for (i32 bin = histo->FirstUsedBin; bin <= histo->LastUsedBin; ++bin) { + double binStart = histo->BinStart(bin); + double binEnd = histo->BinEnd(bin); + double freq = histo->Freqs[bin]; + if (binStart == binEnd) { + Freqs[CalcBin(binStart)] += freq; + continue; + } + size_t firstCross = CalcBin(binStart); + size_t lastCross = CalcBin(binEnd); + for (size_t i = firstCross; i <= lastCross; ++i) { + double mergedBinStart = BinStart(i); + double mergedBinEnd = BinEnd(i); + double crossStart = Max(mergedBinStart, binStart); + double crossEnd = Min(mergedBinEnd, binEnd); + if (binStart == binEnd) { + } + Freqs[i] += freq * (crossEnd - crossStart) / (binEnd - binStart); + } + } + } + } + + void TFixedBinHistogram::Multiply(double factor) { + if (!IsValidFloat(factor) || factor <= 0) { + ythrow yexception() << "Not valid factor in IHistogram::Multiply(): " << factor; + } + if (!IsInitialized) { + Initialize(); + } + Sum *= factor; + for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) { + Freqs[i] *= factor; + } + } + + void TFixedBinHistogram::FromProto(const THistogram& histo) { + if (histo.HasType() && histo.GetType() != HT_FIXED_BIN_HISTOGRAM) { + ythrow yexception() << "Attempt to parse TFixedBinHistogram from THistogram protobuf record of wrong type = " << (ui32)histo.GetType(); + } + TrainingSet.Destroy(); + IsInitialized = false; + Sum = 0.0; + + Id = histo.GetId(); + size_t intervals = histo.FreqSize(); + if (intervals == 0) { + IsEmpty = true; + return; + } + Intervals = intervals; + TrainingSetSize = Intervals; + BaseIndex = Intervals / 2; + + if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue()) || !IsValidFloat(histo.GetBinRange())) { + ythrow yexception() << Sprintf("FromProto in histogram id %lu: skip bad histo with minvalue %f maxvalue %f binrange %f", Id, histo.GetMinValue(), histo.GetMaxValue(), histo.GetBinRange()); + } + + double minValue = histo.GetMinValue(); + double binRange = histo.GetBinRange(); + double maxValue = histo.HasMaxValue() ? histo.GetMaxValue() : minValue + binRange * Intervals; + SetFrame(minValue, maxValue, true); + BinRange = binRange; + for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) { + Freqs[i] = histo.GetFreq(i - BaseIndex); + if (!IsValidFloat(Freqs[i])) { + ythrow yexception() << Sprintf("FromProto in histogram id %lu: bad value %f", Id, Freqs[i]); + } + Sum += Freqs[i]; + } + } + + void TFixedBinHistogram::ToProto(THistogram& histo) { + histo.Clear(); + if (!IsInitialized) { + Initialize(); + } + histo.SetType(HT_FIXED_BIN_HISTOGRAM); + histo.SetId(Id); + if (IsEmpty) { + return; + } + if (FirstUsedBin < (i32)BaseIndex || (LastUsedBin - FirstUsedBin + 1) > (i32)Intervals) { + Shrink(MinValue, MaxValue); + } + histo.SetMinValue(MinValue); + histo.SetMaxValue(MaxValue); + histo.SetBinRange(BinRange); + for (ui32 i = BaseIndex; i < BaseIndex + Intervals; ++i) { + histo.AddFreq(Freqs[i]); + } + } + + void TFixedBinHistogram::SetId(ui64 id) { + Id = id; + } + + ui64 TFixedBinHistogram::GetId() { + return Id; + } + + bool TFixedBinHistogram::Empty() { + if (!IsInitialized) { + Initialize(); + } + return IsEmpty; + } + + double TFixedBinHistogram::GetMinValue() { + if (!IsInitialized) { + Initialize(); + } + return MinValue; + } + + double TFixedBinHistogram::GetMaxValue() { + if (!IsInitialized) { + Initialize(); + } + return MaxValue; + } + + double TFixedBinHistogram::GetSum() { + return Sum; + } + + double TFixedBinHistogram::GetSumInRange(double leftBound, double rightBound) { + if (!IsInitialized) { + Initialize(); + } + if (leftBound > rightBound) { + return 0.0; + } + return GetSumAboveBound(leftBound) + GetSumBelowBound(rightBound) - Sum; + } + + double TFixedBinHistogram::GetSumAboveBound(double bound) { + if (!IsInitialized) { + Initialize(); + } + if (IsEmpty) { + return 0.0; + } + if (BinRange == 0.0) { // special case - all values added to histogram are the same + return (bound <= ReferencePoint) ? Sum : 0.0; + } + i32 bin = CalcBin(bound); + if (bin < FirstUsedBin) { + return Sum; + } + if (bin > LastUsedBin) { + return 0.0; + } + double binStart = BinStart(bin); + double binEnd = BinEnd(bin); + double result = (bound < binStart) ? Freqs[bin] : Freqs[bin] * (binEnd - bound) / (binEnd - binStart); + for (i32 i = bin + 1; i <= LastUsedBin; ++i) { + result += Freqs[i]; + } + return result; + } + + double TFixedBinHistogram::GetSumBelowBound(double bound) { + if (!IsInitialized) { + Initialize(); + } + if (IsEmpty) { + return 0.0; + } + if (BinRange == 0.0) { // special case - all values added to histogram are the same + return (bound > ReferencePoint) ? Sum : 0.0; + } + i32 bin = CalcBin(bound); + if (bin < FirstUsedBin) { + return 0.0; + } + if (bin > LastUsedBin) { + return Sum; + } + double binStart = BinStart(bin); + double binEnd = BinEnd(bin); + double result = (bound > binEnd) ? Freqs[bin] : Freqs[bin] * (bound - binStart) / (binEnd - binStart); + for (i32 i = bin - 1; i >= FirstUsedBin; --i) { + result += Freqs[i]; + } + return result; + } + + double TFixedBinHistogram::CalcUpperBound(double sum) { + if (!IsInitialized) { + Initialize(); + } + if (sum == 0.0) { + return MinValue; + } + if (IsEmpty) { + return MaxValue; + } + i32 currentBin = FirstUsedBin; + double gatheredSum = 0.0; + while (gatheredSum < sum && currentBin <= LastUsedBin) { + gatheredSum += Freqs[currentBin]; + ++currentBin; + } + --currentBin; + if ((gatheredSum <= sum && currentBin == LastUsedBin) || (Freqs[currentBin] == 0)) { + return MaxValue; + } + double binStart = BinStart(currentBin); + double binEnd = BinEnd(currentBin); + return binEnd - (binEnd - binStart) * (gatheredSum - sum) / Freqs[currentBin]; + } + + double TFixedBinHistogram::CalcLowerBound(double sum) { + if (!IsInitialized) { + Initialize(); + } + if (sum == 0.0) { + return MaxValue; + } + if (IsEmpty) { + return MinValue; + } + i32 currentBin = LastUsedBin; + double gatheredSum = 0.0; + while (gatheredSum < sum && currentBin >= FirstUsedBin) { + gatheredSum += Freqs[currentBin]; + --currentBin; + } + ++currentBin; + if ((gatheredSum <= sum && currentBin == FirstUsedBin) || (Freqs[currentBin] == 0)) { + return MinValue; + } + double binStart = BinStart(currentBin); + double binEnd = BinEnd(currentBin); + return binStart + (binEnd - binStart) * (gatheredSum - sum) / Freqs[currentBin]; + } + + double TFixedBinHistogram::CalcUpperBoundSafe(double sum) { + if (!Empty()) { + sum = Max(Freqs[FirstUsedBin], sum); + } + return CalcUpperBound(sum); + } + + double TFixedBinHistogram::CalcLowerBoundSafe(double sum) { + if (!Empty()) { + sum = Max(Freqs[LastUsedBin], sum); + } + return CalcLowerBound(sum); + } + + double TFixedBinHistogram::CalcBinRange(double referencePoint, double maxValue) { + return (maxValue - referencePoint) / ((double)Intervals - 0.02); + } + + void TFixedBinHistogram::SetFrame(double minValue, double maxValue, bool clear) { + MinValue = minValue; + MaxValue = maxValue; + ReferencePoint = MinValue; + BinRange = CalcBinRange(ReferencePoint, MaxValue); + FirstUsedBin = BaseIndex; + LastUsedBin = (BinRange == 0.0) ? BaseIndex : BaseIndex + Intervals - 1; + if (clear) { + Freqs.assign(2 * Intervals, 0.0); + ReserveFreqs.assign(2 * Intervals, 0.0); + IsEmpty = false; + IsInitialized = true; + } + } + + void TFixedBinHistogram::FromIHistogram(IHistogram* histo) { + if (!histo) { + ythrow yexception() << "Attempt to create TFixedBinFistogram from a NULL pointer"; + } + Id = histo->GetId(); + if (histo->Empty()) { + IsInitialized = false; + IsEmpty = true; + return; + } + SetFrame(histo->GetMinValue(), histo->GetMaxValue(), true); + Sum = histo->GetSum(); + if (BinRange == 0.0) { + Freqs[BaseIndex] = Sum; + return; + } + for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) { + Freqs[i] = histo->GetSumInRange(BinStart(i), BinEnd(i)); + } + return; + } + + void TFixedBinHistogram::Initialize() { + if (IsInitialized) { + return; + } + if (!TrainingSet || TrainingSet->size() == 0) { + IsEmpty = true; + return; + } + SetFrame(MinElement(TrainingSet->begin(), TrainingSet->end(), CompareWeightedValue)->first, + MaxElement(TrainingSet->begin(), TrainingSet->end(), CompareWeightedValue)->first, true); + for (TVector<TWeightedValue>::const_iterator it = TrainingSet->begin(); it != TrainingSet->end(); ++it) { + Freqs[CalcBin(it->first)] += it->second; + } + TrainingSet.Destroy(); + } + + i32 TFixedBinHistogram::CalcBin(double value) { + return (BinRange == 0.0) ? BaseIndex : static_cast<i32>(BaseIndex + (value - ReferencePoint) / BinRange); + } + + double TFixedBinHistogram::CalcDensity(double value) { + i32 bin = CalcBin(value); + if (bin < 0 || bin >= (i32)Freqs.size() || BinRange == 0.0 || GetSum() == 0) { + return 0.0; + } + return Freqs[bin] / GetSum() / BinRange; + } + + double TFixedBinHistogram::BinStart(i32 i) { + return Max(ReferencePoint + (i - BaseIndex) * BinRange, MinValue); + } + + double TFixedBinHistogram::BinEnd(i32 i) { + return Min(ReferencePoint + (i + 1 - BaseIndex) * BinRange, MaxValue); + } + + void TFixedBinHistogram::Shrink(double newReferencePoint, double newMaxValue) { + Y_VERIFY(newReferencePoint < newMaxValue, "Invalid Shrink()"); + memset(&(ReserveFreqs[0]), 0, ReserveFreqs.size() * sizeof(double)); + + double newBinRange = CalcBinRange(newReferencePoint, newMaxValue); + for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) { + double binStart = BinStart(i); + double binEnd = BinEnd(i); + double freq = Freqs[i]; + i32 firstCross = static_cast<i32>(BaseIndex + (binStart - newReferencePoint) / newBinRange); + i32 lastCross = static_cast<i32>(BaseIndex + (binEnd - newReferencePoint) / newBinRange); + for (i32 j = firstCross; j <= lastCross; ++j) { + double newBinStart = newReferencePoint + (j - BaseIndex) * newBinRange; + double newBinEnd = newReferencePoint + (j + 1 - BaseIndex) * newBinRange; + double crossStart = Max(newBinStart, binStart); + double crossEnd = Min(newBinEnd, binEnd); + ReserveFreqs[j] += (binStart == binEnd) ? freq : freq * (crossEnd - crossStart) / (binEnd - binStart); + } + } + + Freqs.swap(ReserveFreqs); + SetFrame(newReferencePoint, newMaxValue, false); + } + +} diff --git a/library/cpp/histogram/adaptive/fixed_bin_histogram.h b/library/cpp/histogram/adaptive/fixed_bin_histogram.h new file mode 100644 index 0000000000..bd380bd94a --- /dev/null +++ b/library/cpp/histogram/adaptive/fixed_bin_histogram.h @@ -0,0 +1,91 @@ +#pragma once + +#include "histogram.h" + +#include <library/cpp/histogram/adaptive/protos/histo.pb.h> + +#include <util/generic/ptr.h> +#include <util/generic/vector.h> +#include <utility> + +namespace NKiwiAggr { + class TFixedBinHistogram: private TNonCopyable, public IHistogram { + private: + static const size_t DEFAULT_TRAINING_SET_SIZE = 10000; + static const size_t DEFAULT_INTERVALS = 100; + + typedef std::pair<double, double> TWeightedValue; // value, weight + THolder<TVector<TWeightedValue>> TrainingSet; + size_t TrainingSetSize; + + bool IsInitialized; + bool IsEmpty; + + ui64 Id; + double MinValue; + double MaxValue; + double Sum; + + TVector<double> Freqs; + TVector<double> ReserveFreqs; + double ReferencePoint; + double BinRange; + size_t Intervals; + i32 FirstUsedBin; + i32 LastUsedBin; + i32 BaseIndex; + + public: + TFixedBinHistogram(size_t intervals, ui64 id = 0, size_t trainingSetSize = DEFAULT_TRAINING_SET_SIZE); + TFixedBinHistogram(const THistogram& histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0, size_t trainingSetSize = DEFAULT_TRAINING_SET_SIZE); + TFixedBinHistogram(IHistogram* histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0, size_t trainingSetSize = DEFAULT_TRAINING_SET_SIZE); + + virtual ~TFixedBinHistogram() { + } + + virtual void Clear(); + + virtual void Add(double value, double weight); + virtual void Add(const THistoRec& histoRec); + + virtual void Merge(const THistogram& histo, double multiplier); + virtual void Merge(const TVector<THistogram>& histogramsToMerge); + virtual void Merge(TVector<IHistogramPtr> histogramsToMerge); + + virtual void Multiply(double factor); + + virtual void FromProto(const THistogram& histo); + virtual void ToProto(THistogram& histo); + + virtual void SetId(ui64 id); + virtual ui64 GetId(); + virtual bool Empty(); + virtual double GetMinValue(); + virtual double GetMaxValue(); + virtual double GetSum(); + virtual double GetSumInRange(double leftBound, double rightBound); + virtual double GetSumAboveBound(double bound); + virtual double GetSumBelowBound(double bound); + virtual double CalcUpperBound(double sum); + virtual double CalcLowerBound(double sum); + virtual double CalcUpperBoundSafe(double sum); + virtual double CalcLowerBoundSafe(double sum); + + double CalcDensity(double value); + + private: + double CalcBinRange(double referencePoint, double maxValue); + void SetFrame(double minValue, double maxValue, bool clear); + void FromIHistogram(IHistogram* histo); + void Initialize(); + i32 CalcBin(double value); + double BinStart(i32 i); + double BinEnd(i32 i); + void Shrink(double newMinValue, double newMaxValue); + + static bool CompareWeightedValue(const TWeightedValue& left, const TWeightedValue& right) { + return left.first < right.first; + } + }; + +} diff --git a/library/cpp/histogram/adaptive/histogram.h b/library/cpp/histogram/adaptive/histogram.h new file mode 100644 index 0000000000..360fd9a693 --- /dev/null +++ b/library/cpp/histogram/adaptive/histogram.h @@ -0,0 +1,66 @@ +#pragma once + +#include <util/generic/ptr.h> +#include <util/generic/vector.h> + +namespace NKiwiAggr { + class THistogram; + class THistoRec; + + class IHistogram; + typedef TAtomicSharedPtr<IHistogram> IHistogramPtr; + + class IHistogram { + public: + // Supposed constructors: + // + // TSomeHistogram(size_t intervals, ui64 id = 0); // where intervals is some constant that defines histogram accuracy + // TSomeHistogram(const THistogram& histo); // histo must be acceptable for TSomeHistogram, for example, only with HT_FIXED_BIN_HISTOGRAM for TFixedBinHistogram + // TSomeHistogram(IHistogram* histo); // any kind of IHistogram + + virtual ~IHistogram() { + } + + virtual void Clear() = 0; + + // zero- or negative-weighted values are skipped + virtual void Add(double value, double weight) = 0; + virtual void Add(const THistoRec& histoRec) = 0; + + // Merge some other histos into current + virtual void Merge(const THistogram& histo, double multiplier) = 0; + virtual void Merge(const TVector<THistogram>& histogramsToMerge) = 0; + virtual void Merge(TVector<IHistogramPtr> histogramsToMerge) = 0; + + // factor should be greater then zero + virtual void Multiply(double factor) = 0; + + virtual void FromProto(const THistogram& histo) = 0; // throws exception in case of wrong histogram type of histo + virtual void ToProto(THistogram& histo) = 0; + + virtual void SetId(ui64 id) = 0; + virtual ui64 GetId() = 0; + virtual bool Empty() = 0; + virtual double GetMinValue() = 0; + virtual double GetMaxValue() = 0; + virtual double GetSum() = 0; + virtual double GetSumInRange(double leftBound, double rightBound) = 0; + virtual double GetSumAboveBound(double bound) = 0; + virtual double GetSumBelowBound(double bound) = 0; + virtual double CalcUpperBound(double sum) = 0; + virtual double CalcLowerBound(double sum) = 0; + virtual double CalcUpperBoundSafe(double sum) = 0; + virtual double CalcLowerBoundSafe(double sum) = 0; + double GetValueAtPercentile(double percentile) { + return CalcUpperBound(percentile * GetSum()); + } + double GetValueAtPercentileSafe(double percentile) { + return CalcUpperBoundSafe(percentile * GetSum()); + } + + // Histogram implementation is supposed to clear all precomputed values() if Add() is called after PrecomputePartialSums() + virtual void PrecomputePartialSums() { + } + }; + +} diff --git a/library/cpp/histogram/adaptive/merger.h b/library/cpp/histogram/adaptive/merger.h new file mode 100644 index 0000000000..fc9a6b6a4f --- /dev/null +++ b/library/cpp/histogram/adaptive/merger.h @@ -0,0 +1,68 @@ +#pragma once + +#include <util/generic/buffer.h> + +namespace NKiwiAggr { + class IMerger { + private: + bool IsMerged; + ui32 AutoMergeInterval; // Call Merge() after each AutoMergeInterval calls of Add(); zero means no autoMerge + ui32 NotMergedCount; + + public: + IMerger(ui32 autoMergeInterval = 0) + : IsMerged(true) + , AutoMergeInterval(autoMergeInterval) + , NotMergedCount(0) + { + } + + virtual ~IMerger() { + } + + // returns true if something is added + virtual bool Add(const void* data, size_t size) { + if (AddImpl(data, size)) { + AutoMerge(); + return true; + } + return false; + } + + virtual void Merge() { + if (!IsMerged) { + MergeImpl(); + IsMerged = true; + } + } + + virtual void Reset() { + ResetImpl(); + IsMerged = true; + } + + // You can add some more result-getters if you want. + // Do not forget to call Merge() in the beginning of each merger. + virtual void GetResult(TBuffer& buffer) = 0; + + protected: + // AutoMerge() is called in Add() after each AddImpl() + void AutoMerge() { + IsMerged = false; + if (AutoMergeInterval) { + ++NotMergedCount; + if (NotMergedCount >= AutoMergeInterval) { + MergeImpl(); + IsMerged = true; + NotMergedCount = 0; + } + } + } + + // Implementation of merger: define it in derivatives + virtual bool AddImpl(const void* data, size_t size) = 0; // returns true if something is added + virtual void MergeImpl() = 0; + virtual void ResetImpl() = 0; + }; + +} diff --git a/library/cpp/histogram/adaptive/multi_histogram.h b/library/cpp/histogram/adaptive/multi_histogram.h new file mode 100644 index 0000000000..41caac5ba6 --- /dev/null +++ b/library/cpp/histogram/adaptive/multi_histogram.h @@ -0,0 +1,143 @@ +#pragma once + +#include "histogram.h" +#include "auto_histogram.h" + +#include <library/cpp/histogram/adaptive/protos/histo.pb.h> + +#include <util/generic/hash.h> +#include <util/generic/ptr.h> +#include <utility> + +namespace NKiwiAggr { + template <class TMyHistogram> + class TMultiHistogram { + private: + static const size_t DEFAULT_INTERVALS = 100; + + typedef THashMap<ui64, IHistogramPtr> THistogramsMap; + THistogramsMap Histograms; + size_t Intervals; + + public: + TMultiHistogram(size_t intervals = DEFAULT_INTERVALS) + : Intervals(intervals) + { + } + + TMultiHistogram(const THistograms& histograms, size_t defaultIntervals = DEFAULT_INTERVALS) + : Intervals(defaultIntervals) + { + FromProto(histograms); + } + + virtual ~TMultiHistogram() { + } + + void Clear() { + Histograms.clear(); + } + + void Add(const THistoRecs& histoRecs) { + for (size_t i = 0; i < histoRecs.HistoRecsSize(); ++i) { + Add(histoRecs.GetHistoRecs(i).GetId(), histoRecs.GetHistoRecs(i).GetValue(), histoRecs.GetHistoRecs(i).GetWeight()); + } + } + + void Add(const THistoRec& histoRec) { + Add(histoRec.GetId(), histoRec.GetValue(), histoRec.GetWeight()); + } + + void Add(ui64 id, double value, double weight) { + THistogramsMap::const_iterator it = Histograms.find(id); + if (it == Histograms.end()) { + it = Histograms.insert(std::make_pair(id, IHistogramPtr(new TMyHistogram(Intervals, id)))).first; + } + it->second->Add(value, weight); + } + + void Multiply(double factor) { + for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) { + it->second->Multiply(factor); + } + } + + TVector<ui64> GetIds() const { + TVector<ui64> result(0); + for (THistogramsMap::const_iterator it = Histograms.begin(); it != Histograms.end(); ++it) { + result.push_back(it->first); + } + return result; + } + + IHistogramPtr GetHistogram(ui64 id) const { + THistogramsMap::const_iterator it = Histograms.find(id); + if (it != Histograms.end()) { + return it->second; + } + return IHistogramPtr(); + } + + double GetMaxHistoSum() const { + double sum = 0.0; + for (THistogramsMap::const_iterator it = Histograms.begin(); it != Histograms.end(); ++it) { + sum = std::max(sum, it->second->GetSum()); + } + return sum; + } + + bool Empty() { + for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) { + if (!it->second->Empty()) { + return false; + } + } + return true; + } + + virtual double OverallSum() { + double sum = 0.0; + for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) { + sum += it->second->GetSum(); + } + return sum; + } + + void FromProto(const THistograms& histograms) { + for (size_t i = 0; i < histograms.HistoRecsSize(); ++i) { + IHistogramPtr newHisto(new TMyHistogram(histograms.GetHistoRecs(i), Intervals)); + if (!newHisto->Empty()) { + Histograms[newHisto->GetId()] = newHisto; + } + } + } + + void ToProto(THistograms& histograms) { + histograms.Clear(); + for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) { + THistogram* histo = histograms.AddHistoRecs(); + it->second->ToProto(*histo); + } + } + + void PrecomputePartialSums() { + for (auto& it : Histograms) { + it.second->PrecomputePartialSums(); + } + } + }; + + template <class TMerger, class TSomeMultiHistogram> + static void MergeToMultiHistogram(const void* data, size_t size, TSomeMultiHistogram& multiHistogram, ui32 intervals = 300) { + TMerger merger(intervals); + merger.Add(data, size); + THistograms histograms; + merger.GetResult(histograms); + multiHistogram.FromProto(histograms); + } + + // Good for parsing from THistograms protobuf + typedef TMultiHistogram<TAutoHistogram> TAutoMultiHistogram; + typedef TAtomicSharedPtr<TAutoMultiHistogram> TAutoMultiHistogramPtr; + +} diff --git a/library/cpp/histogram/adaptive/protos/histo.proto b/library/cpp/histogram/adaptive/protos/histo.proto new file mode 100644 index 0000000000..8961fef022 --- /dev/null +++ b/library/cpp/histogram/adaptive/protos/histo.proto @@ -0,0 +1,36 @@ +package NKiwiAggr; + + +// THistoRec represents a value record added to a multihistogram +message THistoRec { + optional uint64 Id = 1; // Current histogram identifier + optional double Value = 2; + optional double Weight = 3 [default = 1.0]; // You can set a certain weight to each record or just skip records using Weight=0 +} + +message THistoRecs { + repeated THistoRec HistoRecs = 1; +} + +enum EHistogramType { + HT_FIXED_BIN_HISTOGRAM = 1; + HT_ADAPTIVE_DISTANCE_HISTOGRAM = 2; + HT_ADAPTIVE_WEIGHT_HISTOGRAM = 3; + HT_ADAPTIVE_HISTOGRAM = 4; // if the quality function is unknown + HT_ADAPTIVE_WARD_HISTOGRAM = 5; +} + +message THistogram { + optional uint64 Id = 1; + optional double MinValue = 2; + optional double BinRange = 4; // for FIXED_BIN_HISTOGRAM only. And it's OK that it is 4 after 2 + repeated float Freq = 5; + repeated float Position = 6; // for ADAPTIVE histograms only + optional double MaxValue = 7; + optional EHistogramType Type = 8; // Empty field means FIXED_BIN_HISTOGRAM +} + +// Multihistogam +message THistograms { + repeated THistogram HistoRecs = 1; +} diff --git a/library/cpp/histogram/adaptive/protos/python/ya.make b/library/cpp/histogram/adaptive/protos/python/ya.make new file mode 100644 index 0000000000..3328c27965 --- /dev/null +++ b/library/cpp/histogram/adaptive/protos/python/ya.make @@ -0,0 +1,3 @@ +OWNER(abogutskiy) + +PY_PROTOS_FOR(library/cpp/histogram/adaptive/protos) diff --git a/library/cpp/histogram/adaptive/protos/ya.make b/library/cpp/histogram/adaptive/protos/ya.make new file mode 100644 index 0000000000..7635cfcb8c --- /dev/null +++ b/library/cpp/histogram/adaptive/protos/ya.make @@ -0,0 +1,13 @@ +PROTO_LIBRARY() + +OWNER(g:crawl) + +SRCS( + histo.proto +) + +IF (NOT PY_PROTOS_FOR) + EXCLUDE_TAGS(GO_PROTO) +ENDIF() + +END() diff --git a/library/cpp/histogram/adaptive/ya.make b/library/cpp/histogram/adaptive/ya.make new file mode 100644 index 0000000000..b589801b27 --- /dev/null +++ b/library/cpp/histogram/adaptive/ya.make @@ -0,0 +1,20 @@ +LIBRARY() + +OWNER( + zosimov + svirg +) + +SRCS( + common.cpp + adaptive_histogram.cpp + block_histogram.cpp + fixed_bin_histogram.cpp +) + +PEERDIR( + contrib/libs/protobuf + library/cpp/histogram/adaptive/protos +) + +END() diff --git a/library/cpp/histogram/ya.make b/library/cpp/histogram/ya.make new file mode 100644 index 0000000000..77cfd96902 --- /dev/null +++ b/library/cpp/histogram/ya.make @@ -0,0 +1,9 @@ +RECURSE( + adaptive + hdr + hdr/ut + simple + simple/ut + rt + rt/ut +) |