diff options
author | zosimov <zosimov@yandex-team.ru> | 2022-02-10 16:50:32 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:50:32 +0300 |
commit | a1d67d6a31f789aa011250c3edce5751c0cadad2 (patch) | |
tree | 5d5cb817648f650d76cf1076100726fd9b8448e8 /library/cpp/histogram | |
parent | a8f009e06d613c9567eb4c0f461dbed5e0d8092b (diff) | |
download | ydb-a1d67d6a31f789aa011250c3edce5751c0cadad2.tar.gz |
Restoring authorship annotation for <zosimov@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/histogram')
-rw-r--r-- | library/cpp/histogram/adaptive/adaptive_histogram.cpp | 850 | ||||
-rw-r--r-- | library/cpp/histogram/adaptive/adaptive_histogram.h | 120 | ||||
-rw-r--r-- | library/cpp/histogram/adaptive/auto_histogram.h | 216 | ||||
-rw-r--r-- | library/cpp/histogram/adaptive/block_histogram.cpp | 106 | ||||
-rw-r--r-- | library/cpp/histogram/adaptive/block_histogram.h | 10 | ||||
-rw-r--r-- | library/cpp/histogram/adaptive/fixed_bin_histogram.cpp | 946 | ||||
-rw-r--r-- | library/cpp/histogram/adaptive/fixed_bin_histogram.h | 122 | ||||
-rw-r--r-- | library/cpp/histogram/adaptive/histogram.h | 84 | ||||
-rw-r--r-- | library/cpp/histogram/adaptive/merger.h | 130 | ||||
-rw-r--r-- | library/cpp/histogram/adaptive/multi_histogram.h | 230 | ||||
-rw-r--r-- | library/cpp/histogram/adaptive/protos/histo.proto | 28 | ||||
-rw-r--r-- | library/cpp/histogram/adaptive/protos/ya.make | 2 | ||||
-rw-r--r-- | library/cpp/histogram/adaptive/ya.make | 2 |
13 files changed, 1423 insertions, 1423 deletions
diff --git a/library/cpp/histogram/adaptive/adaptive_histogram.cpp b/library/cpp/histogram/adaptive/adaptive_histogram.cpp index e73b45c2c6..cbfc494021 100644 --- a/library/cpp/histogram/adaptive/adaptive_histogram.cpp +++ b/library/cpp/histogram/adaptive/adaptive_histogram.cpp @@ -1,72 +1,72 @@ -#include "adaptive_histogram.h" - +#include "adaptive_histogram.h" + #include <util/generic/algorithm.h> -#include <util/generic/yexception.h> -#include <util/generic/ymath.h> -#include <util/string/printf.h> - +#include <util/generic/yexception.h> +#include <util/generic/ymath.h> +#include <util/string/printf.h> + #include <util/system/backtrace.h> -namespace NKiwiAggr { +namespace NKiwiAggr { TAdaptiveHistogram::TAdaptiveHistogram(size_t intervals, ui64 id, TQualityFunction qualityFunc) - : Id(id) - , Sum(0.0) - , Intervals(intervals) - , CalcQuality(qualityFunc) - { - } - + : Id(id) + , Sum(0.0) + , Intervals(intervals) + , CalcQuality(qualityFunc) + { + } + TAdaptiveHistogram::TAdaptiveHistogram(const THistogram& histo, size_t defaultIntervals, ui64 defaultId, TQualityFunction qualityFunc) : TAdaptiveHistogram(defaultIntervals, defaultId, qualityFunc) - { - FromProto(histo); - } - + { + FromProto(histo); + } + TAdaptiveHistogram::TAdaptiveHistogram(IHistogram* histo, size_t defaultIntervals, ui64 defaultId, TQualityFunction qualityFunc) : TAdaptiveHistogram(defaultIntervals, defaultId, qualityFunc) - { - TAdaptiveHistogram* adaptiveHisto = dynamic_cast<TAdaptiveHistogram*>(histo); - if (!adaptiveHisto) { - FromIHistogram(histo); - return; - } - Id = adaptiveHisto->Id; - MinValue = adaptiveHisto->MinValue; - MaxValue = adaptiveHisto->MaxValue; - Sum = adaptiveHisto->Sum; - Intervals = adaptiveHisto->Intervals; - Bins = adaptiveHisto->Bins; - BinsByQuality = adaptiveHisto->BinsByQuality; + { + TAdaptiveHistogram* adaptiveHisto = dynamic_cast<TAdaptiveHistogram*>(histo); + if (!adaptiveHisto) { + FromIHistogram(histo); + return; + } + Id = adaptiveHisto->Id; + MinValue = adaptiveHisto->MinValue; + MaxValue = adaptiveHisto->MaxValue; + Sum = adaptiveHisto->Sum; + Intervals = adaptiveHisto->Intervals; + Bins = adaptiveHisto->Bins; + BinsByQuality = adaptiveHisto->BinsByQuality; if (CalcQuality == nullptr) { - CalcQuality = adaptiveHisto->CalcQuality; - } - } - + CalcQuality = adaptiveHisto->CalcQuality; + } + } + TQualityFunction TAdaptiveHistogram::GetQualityFunc() { - return CalcQuality; - } - - void TAdaptiveHistogram::Clear() { - Sum = 0.0; - Bins.clear(); - BinsByQuality.clear(); - } - - void TAdaptiveHistogram::Add(const THistoRec& histoRec) { - if (!histoRec.HasId() || histoRec.GetId() == Id) { - Add(histoRec.GetValue(), histoRec.GetWeight()); - } - } - - void TAdaptiveHistogram::Add(double value, double weight) { - if (!IsValidFloat(value) || !IsValidFloat(weight)) { + return CalcQuality; + } + + void TAdaptiveHistogram::Clear() { + Sum = 0.0; + Bins.clear(); + BinsByQuality.clear(); + } + + void TAdaptiveHistogram::Add(const THistoRec& histoRec) { + if (!histoRec.HasId() || histoRec.GetId() == Id) { + Add(histoRec.GetValue(), histoRec.GetWeight()); + } + } + + void TAdaptiveHistogram::Add(double value, double weight) { + if (!IsValidFloat(value) || !IsValidFloat(weight)) { ythrow yexception() << Sprintf("Histogram id %lu: bad value %f weight %f", Id, value, weight); - } - TWeightedValue weightedValue(value, weight); - Add(weightedValue, true); + } + TWeightedValue weightedValue(value, weight); + Add(weightedValue, true); PrecomputedBins.clear(); - } - + } + void TAdaptiveHistogram::Merge(const THistogram& histo, double multiplier) { if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue())) { fprintf(stderr, "Merging in histogram id %lu: skip bad histo with minvalue %f maxvalue %f\n", Id, histo.GetMinValue(), histo.GetMaxValue()); @@ -87,7 +87,7 @@ namespace NKiwiAggr { if (!IsValidFloat(value) || !IsValidFloat(weight)) { fprintf(stderr, "Merging in histogram id %lu: skip bad value %f weight %f\n", Id, value, weight); continue; - } + } Add(value, weight * multiplier); } @@ -99,20 +99,20 @@ namespace NKiwiAggr { double weight = histo.GetFreq(j); if (!IsValidFloat(pos) || !IsValidFloat(weight)) { fprintf(stderr, "Merging in histogram id %lu: skip bad value %f weight %f\n", Id, pos, weight); - pos += histo.GetBinRange(); + pos += histo.GetBinRange(); continue; - } + } Add(pos, weight * multiplier); pos += histo.GetBinRange(); } - + MinValue = Min(MinValue, histo.GetMinValue()); MaxValue = Max(MaxValue, histo.GetMaxValue()); } else { ythrow yexception() << "Unknown THistogram type"; - } - } - + } + } + void TAdaptiveHistogram::Merge(const TVector<THistogram>& histogramsToMerge) { for (size_t i = 0; i < histogramsToMerge.size(); ++i) { Merge(histogramsToMerge[i], 1.0); @@ -122,64 +122,64 @@ namespace NKiwiAggr { void TAdaptiveHistogram::Merge(TVector<IHistogramPtr> histogramsToMerge) { TVector<IHistogramPtr> histogramsToMergeRepacked(0); TVector<TAdaptiveHistogram*> histograms(0); - for (size_t i = 0; i < histogramsToMerge.size(); ++i) { - if (!histogramsToMerge[i] || histogramsToMerge[i]->Empty()) { - continue; - } - TAdaptiveHistogram* adaptiveHisto = dynamic_cast<TAdaptiveHistogram*>(histogramsToMerge[i].Get()); - if (adaptiveHisto) { - histogramsToMergeRepacked.push_back(histogramsToMerge[i]); - } else { + for (size_t i = 0; i < histogramsToMerge.size(); ++i) { + if (!histogramsToMerge[i] || histogramsToMerge[i]->Empty()) { + continue; + } + TAdaptiveHistogram* adaptiveHisto = dynamic_cast<TAdaptiveHistogram*>(histogramsToMerge[i].Get()); + if (adaptiveHisto) { + histogramsToMergeRepacked.push_back(histogramsToMerge[i]); + } else { histogramsToMergeRepacked.push_back(IHistogramPtr(new TAdaptiveHistogram(histogramsToMerge[i].Get(), Intervals, Id, CalcQuality))); // Convert histograms that are not of TFixedBinHistogram type - } - if (histogramsToMergeRepacked.back()->Empty()) { - continue; - } - histograms.push_back(dynamic_cast<TAdaptiveHistogram*>(histogramsToMergeRepacked.back().Get())); - } - - if (histograms.size() == 0) { - return; - } - - for (size_t histoIndex = 0; histoIndex < histograms.size(); ++histoIndex) { - TAdaptiveHistogram* histo = histograms[histoIndex]; - for (TPairSet::const_iterator it = histo->Bins.begin(); it != histo->Bins.end(); ++it) { - Add(*it, true); - } - } - - for (size_t i = 0; i < histograms.size(); ++i) { - MinValue = Min(MinValue, histograms[i]->MinValue); - MaxValue = Max(MaxValue, histograms[i]->MaxValue); - } - } - - void TAdaptiveHistogram::Multiply(double factor) { - if (!IsValidFloat(factor) || factor <= 0) { - ythrow yexception() << "Not valid factor in IHistogram::Multiply(): " << factor; - } - Sum *= factor; - - TPairSet newBins; - for (TPairSet::iterator it = Bins.begin(); it != Bins.end(); ++it) { - newBins.insert(TWeightedValue(it->first, it->second * factor)); - } - Bins = newBins; - - TPairSet newBinsByQuality; - for (TPairSet::iterator it = Bins.begin(); it != Bins.end(); ++it) { - TPairSet::iterator rightBin = it; - ++rightBin; - if (rightBin == Bins.end()) { - break; - } - newBinsByQuality.insert(CalcQuality(*it, *rightBin)); - } - BinsByQuality = newBinsByQuality; - } - - void TAdaptiveHistogram::FromProto(const THistogram& histo) { + } + if (histogramsToMergeRepacked.back()->Empty()) { + continue; + } + histograms.push_back(dynamic_cast<TAdaptiveHistogram*>(histogramsToMergeRepacked.back().Get())); + } + + if (histograms.size() == 0) { + return; + } + + for (size_t histoIndex = 0; histoIndex < histograms.size(); ++histoIndex) { + TAdaptiveHistogram* histo = histograms[histoIndex]; + for (TPairSet::const_iterator it = histo->Bins.begin(); it != histo->Bins.end(); ++it) { + Add(*it, true); + } + } + + for (size_t i = 0; i < histograms.size(); ++i) { + MinValue = Min(MinValue, histograms[i]->MinValue); + MaxValue = Max(MaxValue, histograms[i]->MaxValue); + } + } + + void TAdaptiveHistogram::Multiply(double factor) { + if (!IsValidFloat(factor) || factor <= 0) { + ythrow yexception() << "Not valid factor in IHistogram::Multiply(): " << factor; + } + Sum *= factor; + + TPairSet newBins; + for (TPairSet::iterator it = Bins.begin(); it != Bins.end(); ++it) { + newBins.insert(TWeightedValue(it->first, it->second * factor)); + } + Bins = newBins; + + TPairSet newBinsByQuality; + for (TPairSet::iterator it = Bins.begin(); it != Bins.end(); ++it) { + TPairSet::iterator rightBin = it; + ++rightBin; + if (rightBin == Bins.end()) { + break; + } + newBinsByQuality.insert(CalcQuality(*it, *rightBin)); + } + BinsByQuality = newBinsByQuality; + } + + void TAdaptiveHistogram::FromProto(const THistogram& histo) { Y_VERIFY(histo.HasType(), "Attempt to parse TAdaptiveHistogram from THistogram protobuf with no Type field set"); ; switch (histo.GetType()) { // check that histogram type could be deduced @@ -193,107 +193,107 @@ namespace NKiwiAggr { [[fallthrough]]; default: // not ok ythrow yexception() << "Attempt to parse TAdaptiveHistogram from THistogram protobuf record of type = " << (ui32)histo.GetType(); - } + } - if (histo.FreqSize() != histo.PositionSize()) { - ythrow yexception() << "Attempt to parse TAdaptiveHistogram from THistogram protobuf record where FreqSize != PositionSize. FreqSize == " << (ui32)histo.FreqSize() << ", PositionSize == " << (ui32)histo.PositionSize(); - } + if (histo.FreqSize() != histo.PositionSize()) { + ythrow yexception() << "Attempt to parse TAdaptiveHistogram from THistogram protobuf record where FreqSize != PositionSize. FreqSize == " << (ui32)histo.FreqSize() << ", PositionSize == " << (ui32)histo.PositionSize(); + } if (CalcQuality == nullptr) { - if (histo.GetType() == HT_ADAPTIVE_DISTANCE_HISTOGRAM) { - CalcQuality = CalcDistanceQuality; - } else if (histo.GetType() == HT_ADAPTIVE_WEIGHT_HISTOGRAM) { - CalcQuality = CalcWeightQuality; + if (histo.GetType() == HT_ADAPTIVE_DISTANCE_HISTOGRAM) { + CalcQuality = CalcDistanceQuality; + } else if (histo.GetType() == HT_ADAPTIVE_WEIGHT_HISTOGRAM) { + CalcQuality = CalcWeightQuality; } else if (histo.GetType() == HT_ADAPTIVE_WARD_HISTOGRAM) { CalcQuality = CalcWardQuality; - } else { - ythrow yexception() << "Attempt to parse an HT_ADAPTIVE_HISTOGRAM without default quality function"; - } - } - Id = histo.GetId(); - Sum = 0.0; - Intervals = Max(Intervals, histo.FreqSize()); - for (size_t i = 0; i < histo.FreqSize(); ++i) { - double value = histo.GetPosition(i); - double weight = histo.GetFreq(i); - if (!IsValidFloat(value) || !IsValidFloat(weight)) { + } else { + ythrow yexception() << "Attempt to parse an HT_ADAPTIVE_HISTOGRAM without default quality function"; + } + } + Id = histo.GetId(); + Sum = 0.0; + Intervals = Max(Intervals, histo.FreqSize()); + for (size_t i = 0; i < histo.FreqSize(); ++i) { + double value = histo.GetPosition(i); + double weight = histo.GetFreq(i); + if (!IsValidFloat(value) || !IsValidFloat(weight)) { fprintf(stderr, "FromProto in histogram id %lu: skip bad value %f weight %f\n", Id, value, weight); - continue; - } - Add(value, weight); - } - - if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue())) { + continue; + } + Add(value, weight); + } + + if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue())) { ythrow yexception() << Sprintf("FromProto in histogram id %lu: skip bad histo with minvalue %f maxvalue %f", Id, histo.GetMinValue(), histo.GetMaxValue()); - } - - MinValue = histo.GetMinValue(); - MaxValue = histo.GetMaxValue(); - } - - void TAdaptiveHistogram::ToProto(THistogram& histo) { - histo.Clear(); - if (CalcQuality == CalcDistanceQuality) { - histo.SetType(HT_ADAPTIVE_DISTANCE_HISTOGRAM); - } else if (CalcQuality == CalcWeightQuality) { - histo.SetType(HT_ADAPTIVE_WEIGHT_HISTOGRAM); + } + + MinValue = histo.GetMinValue(); + MaxValue = histo.GetMaxValue(); + } + + void TAdaptiveHistogram::ToProto(THistogram& histo) { + histo.Clear(); + if (CalcQuality == CalcDistanceQuality) { + histo.SetType(HT_ADAPTIVE_DISTANCE_HISTOGRAM); + } else if (CalcQuality == CalcWeightQuality) { + histo.SetType(HT_ADAPTIVE_WEIGHT_HISTOGRAM); } else if (CalcQuality == CalcWardQuality) { histo.SetType(HT_ADAPTIVE_WARD_HISTOGRAM); - } else { - histo.SetType(HT_ADAPTIVE_HISTOGRAM); - } - histo.SetId(Id); - if (Empty()) { - return; - } - histo.SetMinValue(MinValue); - histo.SetMaxValue(MaxValue); - for (TPairSet::const_iterator it = Bins.begin(); it != Bins.end(); ++it) { - histo.AddFreq(it->second); - histo.AddPosition(it->first); - } - } - + } else { + histo.SetType(HT_ADAPTIVE_HISTOGRAM); + } + histo.SetId(Id); + if (Empty()) { + return; + } + histo.SetMinValue(MinValue); + histo.SetMaxValue(MaxValue); + for (TPairSet::const_iterator it = Bins.begin(); it != Bins.end(); ++it) { + histo.AddFreq(it->second); + histo.AddPosition(it->first); + } + } + void TAdaptiveHistogram::SetId(ui64 id) { - Id = id; - } - + Id = id; + } + ui64 TAdaptiveHistogram::GetId() { - return Id; - } - - bool TAdaptiveHistogram::Empty() { - return Bins.size() == 0; - } - - double TAdaptiveHistogram::GetMinValue() { - return MinValue; - } - - double TAdaptiveHistogram::GetMaxValue() { - return MaxValue; - } - - double TAdaptiveHistogram::GetSum() { - return Sum; - } - - double TAdaptiveHistogram::GetSumInRange(double leftBound, double rightBound) { - if (leftBound > rightBound) { - return 0.0; - } - return GetSumAboveBound(leftBound) + GetSumBelowBound(rightBound) - Sum; - } - - double TAdaptiveHistogram::GetSumAboveBound(double bound) { - if (Empty()) { - return 0.0; - } - if (bound < MinValue) { - return Sum; - } - if (bound > MaxValue) { - return 0.0; - } + return Id; + } + + bool TAdaptiveHistogram::Empty() { + return Bins.size() == 0; + } + + double TAdaptiveHistogram::GetMinValue() { + return MinValue; + } + + double TAdaptiveHistogram::GetMaxValue() { + return MaxValue; + } + + double TAdaptiveHistogram::GetSum() { + return Sum; + } + + double TAdaptiveHistogram::GetSumInRange(double leftBound, double rightBound) { + if (leftBound > rightBound) { + return 0.0; + } + return GetSumAboveBound(leftBound) + GetSumBelowBound(rightBound) - Sum; + } + + double TAdaptiveHistogram::GetSumAboveBound(double bound) { + if (Empty()) { + return 0.0; + } + if (bound < MinValue) { + return Sum; + } + if (bound > MaxValue) { + return 0.0; + } if (!PrecomputedBins.empty()) { return GetSumAboveBoundImpl( @@ -314,19 +314,19 @@ namespace NKiwiAggr { } return sum; }); - } - } - - double TAdaptiveHistogram::GetSumBelowBound(double bound) { - if (Empty()) { - return 0.0; - } - if (bound < MinValue) { - return 0.0; - } - if (bound > MaxValue) { - return Sum; - } + } + } + + double TAdaptiveHistogram::GetSumBelowBound(double bound) { + if (Empty()) { + return 0.0; + } + if (bound < MinValue) { + return 0.0; + } + if (bound > MaxValue) { + return Sum; + } if (!PrecomputedBins.empty()) { return GetSumBelowBoundImpl( @@ -346,90 +346,90 @@ namespace NKiwiAggr { } return sum; }); - } - } - - double TAdaptiveHistogram::CalcUpperBound(double sum) { + } + } + + double TAdaptiveHistogram::CalcUpperBound(double sum) { Y_VERIFY(sum >= 0, "Sum must be >= 0"); if (sum == 0.0) { return MinValue; } - if (Empty()) { - return MaxValue; - } - TPairSet::iterator current = Bins.begin(); - double gatheredSum = 0.0; - while (current != Bins.end() && gatheredSum < sum) { - gatheredSum += current->second; - ++current; - } - --current; - if (gatheredSum < sum) { - return MaxValue; - } - TWeightedValue left(MinValue, 0.0); - TWeightedValue right(MaxValue, 0.0); - if (current != Bins.begin()) { - TPairSet::iterator leftBin = current; - --leftBin; - left = *leftBin; - } - { - TPairSet::iterator rightBin = current; - ++rightBin; - if (rightBin != Bins.end()) { - right = *rightBin; - } - } - double sumToAdd = sum - (gatheredSum - current->second - left.second / 2); - if (sumToAdd <= ((current->second + left.second) / 2)) { - return left.first + 2 * sumToAdd * (current->first - left.first) / (current->second + left.second); - } else { - sumToAdd -= (current->second + left.second) / 2; - return current->first + 2 * sumToAdd * (right.first - current->first) / (right.second + current->second); - } - } - - double TAdaptiveHistogram::CalcLowerBound(double sum) { + if (Empty()) { + return MaxValue; + } + TPairSet::iterator current = Bins.begin(); + double gatheredSum = 0.0; + while (current != Bins.end() && gatheredSum < sum) { + gatheredSum += current->second; + ++current; + } + --current; + if (gatheredSum < sum) { + return MaxValue; + } + TWeightedValue left(MinValue, 0.0); + TWeightedValue right(MaxValue, 0.0); + if (current != Bins.begin()) { + TPairSet::iterator leftBin = current; + --leftBin; + left = *leftBin; + } + { + TPairSet::iterator rightBin = current; + ++rightBin; + if (rightBin != Bins.end()) { + right = *rightBin; + } + } + double sumToAdd = sum - (gatheredSum - current->second - left.second / 2); + if (sumToAdd <= ((current->second + left.second) / 2)) { + return left.first + 2 * sumToAdd * (current->first - left.first) / (current->second + left.second); + } else { + sumToAdd -= (current->second + left.second) / 2; + return current->first + 2 * sumToAdd * (right.first - current->first) / (right.second + current->second); + } + } + + double TAdaptiveHistogram::CalcLowerBound(double sum) { Y_VERIFY(sum >= 0, "Sum must be >= 0"); if (sum == 0.0) { return MaxValue; } - if (Empty()) { - return MinValue; - } - TPairSet::iterator current = Bins.end(); - double gatheredSum = 0.0; - while (current != Bins.begin() && gatheredSum < sum) { - --current; - gatheredSum += current->second; - } - if (gatheredSum < sum) { - return MinValue; - } - TWeightedValue left(MinValue, 0.0); - TWeightedValue right(MaxValue, 0.0); - if (current != Bins.begin()) { - TPairSet::iterator leftBin = current; - --leftBin; - left = *leftBin; - } - { - TPairSet::iterator rightBin = current; - ++rightBin; - if (rightBin != Bins.end()) { - right = *rightBin; - } - } - double sumToAdd = sum - (gatheredSum - current->second - right.second / 2); - if (sumToAdd <= ((current->second + right.second) / 2)) { - return right.first - 2 * sumToAdd * (right.first - current->first) / (current->second + right.second); - } else { - sumToAdd -= (current->second + right.second) / 2; - return current->first - 2 * sumToAdd * (current->first - left.first) / (left.second + current->second); - } - } - + if (Empty()) { + return MinValue; + } + TPairSet::iterator current = Bins.end(); + double gatheredSum = 0.0; + while (current != Bins.begin() && gatheredSum < sum) { + --current; + gatheredSum += current->second; + } + if (gatheredSum < sum) { + return MinValue; + } + TWeightedValue left(MinValue, 0.0); + TWeightedValue right(MaxValue, 0.0); + if (current != Bins.begin()) { + TPairSet::iterator leftBin = current; + --leftBin; + left = *leftBin; + } + { + TPairSet::iterator rightBin = current; + ++rightBin; + if (rightBin != Bins.end()) { + right = *rightBin; + } + } + double sumToAdd = sum - (gatheredSum - current->second - right.second / 2); + if (sumToAdd <= ((current->second + right.second) / 2)) { + return right.first - 2 * sumToAdd * (right.first - current->first) / (current->second + right.second); + } else { + sumToAdd -= (current->second + right.second) / 2; + return current->first - 2 * sumToAdd * (current->first - left.first) / (left.second + current->second); + } + } + double TAdaptiveHistogram::CalcUpperBoundSafe(double sum) { if (!Empty()) { sum = Max(Bins.begin()->second, sum); @@ -444,145 +444,145 @@ namespace NKiwiAggr { return CalcLowerBound(sum); } - void TAdaptiveHistogram::FromIHistogram(IHistogram* histo) { - if (!histo) { - ythrow yexception() << "Attempt to create TAdaptiveHistogram from a NULL pointer"; - } + void TAdaptiveHistogram::FromIHistogram(IHistogram* histo) { + if (!histo) { + ythrow yexception() << "Attempt to create TAdaptiveHistogram from a NULL pointer"; + } if (CalcQuality == CalcWardQuality) { ythrow yexception() << "Not implemented"; } else if (CalcQuality != CalcDistanceQuality && CalcQuality != CalcWeightQuality) { - ythrow yexception() << "Attempt to create TAdaptiveHistogram from a pointer without default CalcQuality"; - } - Id = histo->GetId(); - if (histo->Empty()) { - return; - } - double sum = histo->GetSum(); - double minValue = histo->GetMinValue(); - double maxValue = histo->GetMaxValue(); - if (minValue == maxValue) { - Add(minValue, sum); - return; - } - if (CalcQuality == CalcDistanceQuality) { - double binRange = (maxValue - minValue) / (Intervals); - for (size_t i = 0; i < Intervals; ++i) { - Add(minValue + binRange * (i + 0.5), histo->GetSumInRange(minValue + binRange * i, minValue + binRange * (i + 1))); - } - } else if (CalcQuality == CalcWeightQuality && sum != 0.0) { - double slab = sum / Intervals; - double prevBound = minValue; - for (size_t i = 0; i < Intervals; ++i) { - double bound = histo->CalcUpperBound(slab * (i + 1)); - Add((bound + prevBound) / 2, slab); - prevBound = bound; - } - } - MinValue = minValue; - MaxValue = maxValue; - } - - void TAdaptiveHistogram::Add(const TWeightedValue& weightedValue, bool initial) { - const double& value = weightedValue.first; - const double& weight = weightedValue.second; - if (weight <= 0.0) { - return; // all zero-weighted values should be skipped because they don't affect the distribution, negative weights are forbidden - } - if (initial) { - Sum += weight; - } - if (Bins.size() == 0) { - MinValue = value; - MaxValue = value; - Bins.insert(weightedValue); - return; - } - if (value < MinValue) { - MinValue = value; - } - if (value > MaxValue) { - MaxValue = value; - } - TPairSet::iterator rightBin = Bins.lower_bound(TWeightedValue(value, -1.0)); - if (rightBin != Bins.end() && rightBin->first == value) { - TPairSet::iterator currentBin = rightBin; - ++rightBin; - TWeightedValue newBin(value, weight + currentBin->second); - if (rightBin != Bins.end()) { + ythrow yexception() << "Attempt to create TAdaptiveHistogram from a pointer without default CalcQuality"; + } + Id = histo->GetId(); + if (histo->Empty()) { + return; + } + double sum = histo->GetSum(); + double minValue = histo->GetMinValue(); + double maxValue = histo->GetMaxValue(); + if (minValue == maxValue) { + Add(minValue, sum); + return; + } + if (CalcQuality == CalcDistanceQuality) { + double binRange = (maxValue - minValue) / (Intervals); + for (size_t i = 0; i < Intervals; ++i) { + Add(minValue + binRange * (i + 0.5), histo->GetSumInRange(minValue + binRange * i, minValue + binRange * (i + 1))); + } + } else if (CalcQuality == CalcWeightQuality && sum != 0.0) { + double slab = sum / Intervals; + double prevBound = minValue; + for (size_t i = 0; i < Intervals; ++i) { + double bound = histo->CalcUpperBound(slab * (i + 1)); + Add((bound + prevBound) / 2, slab); + prevBound = bound; + } + } + MinValue = minValue; + MaxValue = maxValue; + } + + void TAdaptiveHistogram::Add(const TWeightedValue& weightedValue, bool initial) { + const double& value = weightedValue.first; + const double& weight = weightedValue.second; + if (weight <= 0.0) { + return; // all zero-weighted values should be skipped because they don't affect the distribution, negative weights are forbidden + } + if (initial) { + Sum += weight; + } + if (Bins.size() == 0) { + MinValue = value; + MaxValue = value; + Bins.insert(weightedValue); + return; + } + if (value < MinValue) { + MinValue = value; + } + if (value > MaxValue) { + MaxValue = value; + } + TPairSet::iterator rightBin = Bins.lower_bound(TWeightedValue(value, -1.0)); + if (rightBin != Bins.end() && rightBin->first == value) { + TPairSet::iterator currentBin = rightBin; + ++rightBin; + TWeightedValue newBin(value, weight + currentBin->second); + if (rightBin != Bins.end()) { Y_VERIFY(BinsByQuality.erase(CalcQuality(*currentBin, *rightBin)) == 1, "Erase failed"); - BinsByQuality.insert(CalcQuality(newBin, *rightBin)); - } - if (currentBin != Bins.begin()) { - TPairSet::iterator leftBin = currentBin; - --leftBin; + BinsByQuality.insert(CalcQuality(newBin, *rightBin)); + } + if (currentBin != Bins.begin()) { + TPairSet::iterator leftBin = currentBin; + --leftBin; Y_VERIFY(BinsByQuality.erase(CalcQuality(*leftBin, *currentBin)) == 1, "Erase failed"); - BinsByQuality.insert(CalcQuality(*leftBin, newBin)); - } - Bins.erase(currentBin); - Bins.insert(newBin); - return; - } - if (rightBin == Bins.begin()) { - BinsByQuality.insert(CalcQuality(weightedValue, *rightBin)); - } else { - TPairSet::iterator leftBin = rightBin; - --leftBin; - if (rightBin == Bins.end()) { - BinsByQuality.insert(CalcQuality(*leftBin, weightedValue)); - } else { + BinsByQuality.insert(CalcQuality(*leftBin, newBin)); + } + Bins.erase(currentBin); + Bins.insert(newBin); + return; + } + if (rightBin == Bins.begin()) { + BinsByQuality.insert(CalcQuality(weightedValue, *rightBin)); + } else { + TPairSet::iterator leftBin = rightBin; + --leftBin; + if (rightBin == Bins.end()) { + BinsByQuality.insert(CalcQuality(*leftBin, weightedValue)); + } else { Y_VERIFY(BinsByQuality.erase(CalcQuality(*leftBin, *rightBin)) == 1, "Erase failed"); - BinsByQuality.insert(CalcQuality(*leftBin, weightedValue)); - BinsByQuality.insert(CalcQuality(weightedValue, *rightBin)); - } - } - Bins.insert(weightedValue); - if (Bins.size() > Intervals) { - Shrink(); - } - } - - void TAdaptiveHistogram::Erase(double value) { - TPairSet::iterator currentBin = Bins.lower_bound(TWeightedValue(value, -1.0)); + BinsByQuality.insert(CalcQuality(*leftBin, weightedValue)); + BinsByQuality.insert(CalcQuality(weightedValue, *rightBin)); + } + } + Bins.insert(weightedValue); + if (Bins.size() > Intervals) { + Shrink(); + } + } + + void TAdaptiveHistogram::Erase(double value) { + TPairSet::iterator currentBin = Bins.lower_bound(TWeightedValue(value, -1.0)); Y_VERIFY(currentBin != Bins.end() && currentBin->first == value, "Can't find bin that should be erased"); - TPairSet::iterator rightBin = currentBin; - ++rightBin; - if (currentBin == Bins.begin()) { + TPairSet::iterator rightBin = currentBin; + ++rightBin; + if (currentBin == Bins.begin()) { Y_VERIFY(rightBin != Bins.end(), "No right bin for the first bin"); Y_VERIFY(BinsByQuality.erase(CalcQuality(*currentBin, *rightBin)) != 0, "Erase failed"); - } else { - TPairSet::iterator leftBin = currentBin; - --leftBin; - if (rightBin == Bins.end()) { + } else { + TPairSet::iterator leftBin = currentBin; + --leftBin; + if (rightBin == Bins.end()) { Y_VERIFY(BinsByQuality.erase(CalcQuality(*leftBin, *currentBin)) != 0, "Erase failed"); - } else { + } else { Y_VERIFY(BinsByQuality.erase(CalcQuality(*leftBin, *currentBin)) != 0, "Erase failed"); Y_VERIFY(BinsByQuality.erase(CalcQuality(*currentBin, *rightBin)) != 0, "Erase failed"); - BinsByQuality.insert(CalcQuality(*leftBin, *rightBin)); - } - } - Bins.erase(currentBin); - } - - void TAdaptiveHistogram::Shrink() { - TPairSet::iterator worstBin = BinsByQuality.begin(); + BinsByQuality.insert(CalcQuality(*leftBin, *rightBin)); + } + } + Bins.erase(currentBin); + } + + void TAdaptiveHistogram::Shrink() { + TPairSet::iterator worstBin = BinsByQuality.begin(); Y_VERIFY(worstBin != BinsByQuality.end(), "No right bin for the first bin"); - TPairSet::iterator leftBin = Bins.lower_bound(TWeightedValue(worstBin->second, -1.0)); + TPairSet::iterator leftBin = Bins.lower_bound(TWeightedValue(worstBin->second, -1.0)); Y_VERIFY(leftBin != Bins.end() && leftBin->first == worstBin->second, "Can't find worst bin"); - TPairSet::iterator rightBin = leftBin; - ++rightBin; + TPairSet::iterator rightBin = leftBin; + ++rightBin; Y_VERIFY(rightBin != Bins.end(), "Can't find right bin"); - - TWeightedValue newBin((leftBin->first * leftBin->second + rightBin->first * rightBin->second) / (leftBin->second + rightBin->second), leftBin->second + rightBin->second); - if (Bins.size() > 2) { - Erase(leftBin->first); - Erase(rightBin->first); - } else { - Bins.clear(); - BinsByQuality.clear(); - } - Add(newBin, false); - } - + + TWeightedValue newBin((leftBin->first * leftBin->second + rightBin->first * rightBin->second) / (leftBin->second + rightBin->second), leftBin->second + rightBin->second); + if (Bins.size() > 2) { + Erase(leftBin->first); + Erase(rightBin->first); + } else { + Bins.clear(); + BinsByQuality.clear(); + } + Add(newBin, false); + } + void TAdaptiveHistogram::PrecomputePartialSums() { PrecomputedBins.clear(); PrecomputedBins.reserve(Bins.size()); diff --git a/library/cpp/histogram/adaptive/adaptive_histogram.h b/library/cpp/histogram/adaptive/adaptive_histogram.h index daf772584a..fa8f48433f 100644 --- a/library/cpp/histogram/adaptive/adaptive_histogram.h +++ b/library/cpp/histogram/adaptive/adaptive_histogram.h @@ -1,22 +1,22 @@ -#pragma once - -#include "histogram.h" +#pragma once + +#include "histogram.h" #include "common.h" - + #include <library/cpp/histogram/adaptive/protos/histo.pb.h> - -#include <util/generic/ptr.h> -#include <util/generic/set.h> -#include <util/generic/vector.h> - -namespace NKiwiAggr { + +#include <util/generic/ptr.h> +#include <util/generic/set.h> +#include <util/generic/vector.h> + +namespace NKiwiAggr { class TAdaptiveHistogram: private TNonCopyable, public IHistogram { - protected: - static const size_t DEFAULT_INTERVALS = 100; - - private: + protected: + static const size_t DEFAULT_INTERVALS = 100; + + private: using TPairSet = TSet<TWeightedValue>; - + struct TFastBin { // these names are for compatibility with TWeightedValue double first; @@ -39,40 +39,40 @@ namespace NKiwiAggr { }; ui64 Id; - double MinValue; - double MaxValue; - double Sum; - size_t Intervals; - TPairSet Bins; - TPairSet BinsByQuality; - TQualityFunction CalcQuality; - + double MinValue; + double MaxValue; + double Sum; + size_t Intervals; + TPairSet Bins; + TPairSet BinsByQuality; + TQualityFunction CalcQuality; + TVector<TFastBin> PrecomputedBins; - public: + public: TAdaptiveHistogram(size_t intervals, ui64 id = 0, TQualityFunction qualityFunc = CalcWeightQuality); TAdaptiveHistogram(const THistogram& histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0, TQualityFunction qualityFunc = nullptr); TAdaptiveHistogram(IHistogram* histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0, TQualityFunction qualityFunc = CalcWeightQuality); - + ~TAdaptiveHistogram() override { - } - - TQualityFunction GetQualityFunc(); - + } + + TQualityFunction GetQualityFunc(); + void Clear() override; - + void Add(double value, double weight) override; void Add(const THistoRec& histoRec) override; - + void Merge(const THistogram& histo, double multiplier) final; void Merge(const TVector<THistogram>& histogramsToMerge) final; void Merge(TVector<IHistogramPtr> histogramsToMerge) final; - + void Multiply(double factor) final; - + void FromProto(const THistogram& histo) final; void ToProto(THistogram& histo) final; - + void SetId(ui64 id) final; ui64 GetId() final; bool Empty() final; @@ -86,46 +86,46 @@ namespace NKiwiAggr { double CalcLowerBound(double sum) final; double CalcUpperBoundSafe(double sum) final; double CalcLowerBoundSafe(double sum) final; - + void PrecomputePartialSums() final; - private: - void FromIHistogram(IHistogram* histo); - void Add(const TWeightedValue& weightedValue, bool initial); - void Erase(double value); - void Shrink(); + private: + void FromIHistogram(IHistogram* histo); + void Add(const TWeightedValue& weightedValue, bool initial); + void Erase(double value); + void Shrink(); template <typename TBins, typename TGetSumAbove> double GetSumAboveBoundImpl(double bound, const TBins& bins, typename TBins::const_iterator rightBin, const TGetSumAbove& getSumAbove) const; template <typename TBins, typename TGetSumBelow> double GetSumBelowBoundImpl(double bound, const TBins& bins, typename TBins::const_iterator rightBin, const TGetSumBelow& getSumBelow) const; - }; - + }; + template <TQualityFunction QualityFunction> - class TDefinedAdaptiveHistogram: public TAdaptiveHistogram { - public: + class TDefinedAdaptiveHistogram: public TAdaptiveHistogram { + public: TDefinedAdaptiveHistogram(size_t intervals, ui64 id = 0) - : TAdaptiveHistogram(intervals, id, QualityFunction) - { - } - + : TAdaptiveHistogram(intervals, id, QualityFunction) + { + } + TDefinedAdaptiveHistogram(const THistogram& histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0) - : TAdaptiveHistogram(histo, defaultIntervals, defaultId, QualityFunction) - { - } - + : TAdaptiveHistogram(histo, defaultIntervals, defaultId, QualityFunction) + { + } + TDefinedAdaptiveHistogram(IHistogram* histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0) - : TAdaptiveHistogram(histo, defaultIntervals, defaultId, QualityFunction) - { - } - + : TAdaptiveHistogram(histo, defaultIntervals, defaultId, QualityFunction) + { + } + ~TDefinedAdaptiveHistogram() override { - } - }; - + } + }; + typedef TDefinedAdaptiveHistogram<CalcDistanceQuality> TAdaptiveDistanceHistogram; typedef TDefinedAdaptiveHistogram<CalcWeightQuality> TAdaptiveWeightHistogram; typedef TDefinedAdaptiveHistogram<CalcWardQuality> TAdaptiveWardHistogram; - + } diff --git a/library/cpp/histogram/adaptive/auto_histogram.h b/library/cpp/histogram/adaptive/auto_histogram.h index 57c4c7b7fe..9fdf0b9abe 100644 --- a/library/cpp/histogram/adaptive/auto_histogram.h +++ b/library/cpp/histogram/adaptive/auto_histogram.h @@ -1,136 +1,136 @@ -#pragma once - -#include "adaptive_histogram.h" -#include "fixed_bin_histogram.h" -#include "histogram.h" - +#pragma once + +#include "adaptive_histogram.h" +#include "fixed_bin_histogram.h" +#include "histogram.h" + #include <library/cpp/histogram/adaptive/protos/histo.pb.h> - -#include <util/generic/ptr.h> + +#include <util/generic/ptr.h> #include <util/generic/yexception.h> - -namespace NKiwiAggr { + +namespace NKiwiAggr { class TAutoHistogram: private TNonCopyable, public IHistogram { - private: - static const size_t DEFAULT_INTERVALS = 100; - + private: + static const size_t DEFAULT_INTERVALS = 100; + ui64 Id; - size_t Intervals; - THolder<IHistogram> HistogramImpl; - - public: + size_t Intervals; + THolder<IHistogram> HistogramImpl; + + public: TAutoHistogram(size_t intervals, ui64 id = 0) { Y_UNUSED(intervals); Y_UNUSED(id); - ythrow yexception() << "Empty constructor is not defined for TAutoHistogram"; - } - + ythrow yexception() << "Empty constructor is not defined for TAutoHistogram"; + } + TAutoHistogram(const THistogram& histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0) - : Id(defaultId) - , Intervals(defaultIntervals) - { - FromProto(histo); - } - + : Id(defaultId) + , Intervals(defaultIntervals) + { + FromProto(histo); + } + TAutoHistogram(IHistogram* histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0) { Y_UNUSED(histo); Y_UNUSED(defaultIntervals); Y_UNUSED(defaultId); - ythrow yexception() << "IHistogram constructor is not defined for TAutoHistogram"; - } - + ythrow yexception() << "IHistogram constructor is not defined for TAutoHistogram"; + } + virtual ~TAutoHistogram() { - } - - virtual void Clear() { - HistogramImpl->Clear(); - } - - virtual void Add(double value, double weight) { - HistogramImpl->Add(value, weight); - } - - virtual void Add(const THistoRec& histoRec) { - HistogramImpl->Add(histoRec); - } - + } + + virtual void Clear() { + HistogramImpl->Clear(); + } + + virtual void Add(double value, double weight) { + HistogramImpl->Add(value, weight); + } + + virtual void Add(const THistoRec& histoRec) { + HistogramImpl->Add(histoRec); + } + virtual void Merge(const THistogram& histo, double multiplier) { HistogramImpl->Merge(histo, multiplier); } virtual void Merge(const TVector<THistogram>& histogramsToMerge) { - HistogramImpl->Merge(histogramsToMerge); - } - + HistogramImpl->Merge(histogramsToMerge); + } + virtual void Merge(TVector<IHistogramPtr> histogramsToMerge) { - HistogramImpl->Merge(histogramsToMerge); - } - - virtual void Multiply(double factor) { - HistogramImpl->Multiply(factor); - } - - virtual void FromProto(const THistogram& histo) { - if (!histo.HasType() || histo.GetType() == HT_FIXED_BIN_HISTOGRAM) { - HistogramImpl.Reset(new TFixedBinHistogram(histo, Intervals, Id)); + HistogramImpl->Merge(histogramsToMerge); + } + + virtual void Multiply(double factor) { + HistogramImpl->Multiply(factor); + } + + virtual void FromProto(const THistogram& histo) { + if (!histo.HasType() || histo.GetType() == HT_FIXED_BIN_HISTOGRAM) { + HistogramImpl.Reset(new TFixedBinHistogram(histo, Intervals, Id)); } else if (histo.GetType() == HT_ADAPTIVE_DISTANCE_HISTOGRAM) { - HistogramImpl.Reset(new TAdaptiveDistanceHistogram(histo, Intervals, Id)); + HistogramImpl.Reset(new TAdaptiveDistanceHistogram(histo, Intervals, Id)); } else if (histo.GetType() == HT_ADAPTIVE_WEIGHT_HISTOGRAM) { - HistogramImpl.Reset(new TAdaptiveWeightHistogram(histo, Intervals, Id)); + HistogramImpl.Reset(new TAdaptiveWeightHistogram(histo, Intervals, Id)); } else if (histo.GetType() == HT_ADAPTIVE_WARD_HISTOGRAM) { HistogramImpl.Reset(new TAdaptiveWardHistogram(histo, Intervals, Id)); } else { ythrow yexception() << "Can't parse TAutoHistogram from a THistogram of type " << (ui32)histo.GetType(); - } - } - - virtual void ToProto(THistogram& histo) { - HistogramImpl->ToProto(histo); - } - + } + } + + virtual void ToProto(THistogram& histo) { + HistogramImpl->ToProto(histo); + } + virtual void SetId(ui64 id) { - HistogramImpl->SetId(id); - } - + HistogramImpl->SetId(id); + } + virtual ui64 GetId() { - return HistogramImpl->GetId(); - } - - virtual bool Empty() { - return HistogramImpl->Empty(); - } - - virtual double GetMinValue() { - return HistogramImpl->GetMinValue(); - } - - virtual double GetMaxValue() { - return HistogramImpl->GetMaxValue(); - } - - virtual double GetSum() { - return HistogramImpl->GetSum(); - } - - virtual double GetSumInRange(double leftBound, double rightBound) { - return HistogramImpl->GetSumInRange(leftBound, rightBound); - } - - virtual double GetSumAboveBound(double bound) { - return HistogramImpl->GetSumAboveBound(bound); - } - - virtual double GetSumBelowBound(double bound) { - return HistogramImpl->GetSumBelowBound(bound); - } - - virtual double CalcUpperBound(double sum) { - return HistogramImpl->CalcUpperBound(sum); - } - - virtual double CalcLowerBound(double sum) { - return HistogramImpl->CalcLowerBound(sum); - } + return HistogramImpl->GetId(); + } + + virtual bool Empty() { + return HistogramImpl->Empty(); + } + + virtual double GetMinValue() { + return HistogramImpl->GetMinValue(); + } + + virtual double GetMaxValue() { + return HistogramImpl->GetMaxValue(); + } + + virtual double GetSum() { + return HistogramImpl->GetSum(); + } + + virtual double GetSumInRange(double leftBound, double rightBound) { + return HistogramImpl->GetSumInRange(leftBound, rightBound); + } + + virtual double GetSumAboveBound(double bound) { + return HistogramImpl->GetSumAboveBound(bound); + } + + virtual double GetSumBelowBound(double bound) { + return HistogramImpl->GetSumBelowBound(bound); + } + + virtual double CalcUpperBound(double sum) { + return HistogramImpl->CalcUpperBound(sum); + } + + virtual double CalcLowerBound(double sum) { + return HistogramImpl->CalcLowerBound(sum); + } virtual double CalcUpperBoundSafe(double sum) { return HistogramImpl->CalcUpperBoundSafe(sum); @@ -143,6 +143,6 @@ namespace NKiwiAggr { virtual void PrecomputePartialSums() { return HistogramImpl->PrecomputePartialSums(); } - }; - + }; + } diff --git a/library/cpp/histogram/adaptive/block_histogram.cpp b/library/cpp/histogram/adaptive/block_histogram.cpp index d1528b64ba..6586d13ff6 100644 --- a/library/cpp/histogram/adaptive/block_histogram.cpp +++ b/library/cpp/histogram/adaptive/block_histogram.cpp @@ -7,8 +7,8 @@ #include <util/generic/intrlist.h> #include <util/generic/ptr.h> #include <util/generic/queue.h> -#include <util/generic/ymath.h> -#include <util/string/printf.h> +#include <util/generic/ymath.h> +#include <util/string/printf.h> namespace { struct TEmpty { @@ -97,11 +97,11 @@ namespace NKiwiAggr { } void TBlockHistogram::Clear() { - PrevSize = 0; - Sum = 0.0; - MinValue = 0.0; - MaxValue = 0.0; - Bins.clear(); + PrevSize = 0; + Sum = 0.0; + MinValue = 0.0; + MaxValue = 0.0; + Bins.clear(); } void TBlockHistogram::Add(const THistoRec& rec) { @@ -111,10 +111,10 @@ namespace NKiwiAggr { } void TBlockHistogram::Add(double value, double weight) { - if (!IsValidFloat(value) || !IsValidFloat(weight)) { + if (!IsValidFloat(value) || !IsValidFloat(weight)) { ythrow yexception() << Sprintf("Histogram id %lu: bad value %f weight %f", Id, value, weight); - } - + } + if (weight <= 0.0) { return; // all zero-weighted values should be skipped because they don't affect the distribution, negative weights are forbidden } @@ -129,9 +129,9 @@ namespace NKiwiAggr { Sum += weight; - if (Bins.size() > ShrinkSize) { + if (Bins.size() > ShrinkSize) { SortAndShrink(Intervals * SHRINK_MULTIPLIER); - } + } Bins.push_back(TWeightedValue(value, weight)); } @@ -156,7 +156,7 @@ namespace NKiwiAggr { if (!IsValidFloat(value) || !IsValidFloat(weight)) { fprintf(stderr, "Merging in histogram id %lu: skip bad value %f weight %f\n", Id, value, weight); continue; - } + } Add(value, weight * multiplier); } @@ -168,20 +168,20 @@ namespace NKiwiAggr { double weight = histo.GetFreq(j); if (!IsValidFloat(pos) || !IsValidFloat(weight)) { fprintf(stderr, "Merging in histogram id %lu: skip bad value %f weight %f\n", Id, pos, weight); - pos += histo.GetBinRange(); + pos += histo.GetBinRange(); continue; - } + } Add(pos, weight * multiplier); pos += histo.GetBinRange(); } - + MinValue = Min(MinValue, histo.GetMinValue()); MaxValue = Max(MaxValue, histo.GetMaxValue()); } else { ythrow yexception() << "Unknown THistogram type"; - } - } - + } + } + void TBlockHistogram::Merge(const TVector<THistogram>& histogramsToMerge) { for (size_t i = 0; i < histogramsToMerge.size(); ++i) { Merge(histogramsToMerge[i], 1.0); @@ -191,18 +191,18 @@ namespace NKiwiAggr { void TBlockHistogram::Merge(TVector<IHistogramPtr> histogramsToMerge) { Y_UNUSED(histogramsToMerge); ythrow yexception() << "IHistogram::Merge(TVector<IHistogramPtr>) is not defined for TBlockHistogram"; - } - + } + void TBlockHistogram::Multiply(double factor) { - if (!IsValidFloat(factor) || factor <= 0) { - ythrow yexception() << "Not valid factor in IHistogram::Multiply(): " << factor; - } - Sum *= factor; + if (!IsValidFloat(factor) || factor <= 0) { + ythrow yexception() << "Not valid factor in IHistogram::Multiply(): " << factor; + } + Sum *= factor; for (TVector<TWeightedValue>::iterator it = Bins.begin(); it != Bins.end(); ++it) { - it->second *= factor; - } - } - + it->second *= factor; + } + } + void TBlockHistogram::FromProto(const THistogram& histo) { Y_VERIFY(histo.HasType(), "Attempt to parse TBlockHistogram from THistogram protobuf with no Type field set"); ; @@ -226,20 +226,20 @@ namespace NKiwiAggr { Bins.resize(histo.FreqSize()); PrevSize = Bins.size(); for (size_t i = 0; i < histo.FreqSize(); ++i) { - double value = histo.GetPosition(i); - double weight = histo.GetFreq(i); - if (!IsValidFloat(value) || !IsValidFloat(weight)) { + double value = histo.GetPosition(i); + double weight = histo.GetFreq(i); + if (!IsValidFloat(value) || !IsValidFloat(weight)) { fprintf(stderr, "FromProto in histogram id %lu: skip bad value %f weight %f\n", Id, value, weight); - continue; - } - Bins[i].first = value; - Bins[i].second = weight; + continue; + } + Bins[i].first = value; + Bins[i].second = weight; Sum += Bins[i].second; } - - if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue())) { + + if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue())) { ythrow yexception() << Sprintf("FromProto in histogram id %lu: skip bad histo with minvalue %f maxvalue %f", Id, histo.GetMinValue(), histo.GetMaxValue()); - } + } MinValue = histo.GetMinValue(); MaxValue = histo.GetMaxValue(); } @@ -262,9 +262,9 @@ namespace NKiwiAggr { } void TBlockHistogram::SetId(ui64 id) { - Id = id; - } - + Id = id; + } + ui64 TBlockHistogram::GetId() { return Id; } @@ -274,11 +274,11 @@ namespace NKiwiAggr { } double TBlockHistogram::GetMinValue() { - return MinValue; + return MinValue; } double TBlockHistogram::GetMaxValue() { - return MaxValue; + return MaxValue; } double TBlockHistogram::GetSum() { @@ -288,9 +288,9 @@ namespace NKiwiAggr { void TBlockHistogram::SortAndShrink(size_t intervals, bool final) { Y_VERIFY(intervals > 0); - if (Bins.size() <= intervals) { + if (Bins.size() <= intervals) { return; - } + } if (Bins.size() >= Intervals * GREEDY_SHRINK_MULTIPLIER) { SortBins(); @@ -301,7 +301,7 @@ namespace NKiwiAggr { SlowShrink(intervals); } - } else { + } else { SortBins(); UniquifyBins(); SlowShrink(intervals); @@ -351,9 +351,9 @@ namespace NKiwiAggr { PrevSize = pos + 1; } - if (Bins.size() <= intervals) { + if (Bins.size() <= intervals) { return; - } + } typedef TIntrusiveListItem<TEmpty> TListItem; @@ -363,15 +363,15 @@ namespace NKiwiAggr { TArrayHolder<TListItem> listElementsHolder(new TListItem[end + 1]); TListItem* const bins = listElementsHolder.Get(); - for (ui32 i = 1; i <= end; ++i) { + for (ui32 i = 1; i <= end; ++i) { bins[i].LinkAfter(&bins[i - 1]); - } + } TVector<double> pairWeights(n); - for (ui32 i = 0; i < n; ++i) { + for (ui32 i = 0; i < n; ++i) { pairWeights[i] = CalcQuality(Bins[i], Bins[i + 1]).first; - } + } TSmartHeap heap(pairWeights); diff --git a/library/cpp/histogram/adaptive/block_histogram.h b/library/cpp/histogram/adaptive/block_histogram.h index 3a8d91af01..266bb2f2b2 100644 --- a/library/cpp/histogram/adaptive/block_histogram.h +++ b/library/cpp/histogram/adaptive/block_histogram.h @@ -54,17 +54,17 @@ namespace NKiwiAggr { virtual ~TBlockHistogram() { } - virtual void Clear(); - + virtual void Clear(); + virtual void Add(double value, double weight); virtual void Add(const THistoRec& histoRec); virtual void Merge(const THistogram& histo, double multiplier); virtual void Merge(const TVector<THistogram>& histogramsToMerge); virtual void Merge(TVector<IHistogramPtr> histogramsToMerge); // not implemented - - virtual void Multiply(double factor); - + + virtual void Multiply(double factor); + virtual void FromProto(const THistogram& histo); virtual void ToProto(THistogram& histo); diff --git a/library/cpp/histogram/adaptive/fixed_bin_histogram.cpp b/library/cpp/histogram/adaptive/fixed_bin_histogram.cpp index 44d0cc57c6..558aba9e2d 100644 --- a/library/cpp/histogram/adaptive/fixed_bin_histogram.cpp +++ b/library/cpp/histogram/adaptive/fixed_bin_histogram.cpp @@ -1,425 +1,425 @@ #include "fixed_bin_histogram.h" -#include "auto_histogram.h" +#include "auto_histogram.h" #include <library/cpp/histogram/adaptive/protos/histo.pb.h> #include <util/generic/algorithm.h> -#include <util/generic/yexception.h> -#include <util/generic/ymath.h> -#include <util/string/printf.h> - +#include <util/generic/yexception.h> +#include <util/generic/ymath.h> +#include <util/string/printf.h> + namespace NKiwiAggr { TFixedBinHistogram::TFixedBinHistogram(size_t intervals, ui64 id, size_t trainingSetSize) - : TrainingSetSize(trainingSetSize) - , IsInitialized(false) - , IsEmpty(true) - , Id(id) - , Sum(0.0) - , Freqs(0) - , ReserveFreqs(0) - , BinRange(0.0) - , Intervals(intervals) - , BaseIndex(intervals / 2) - { - } - + : TrainingSetSize(trainingSetSize) + , IsInitialized(false) + , IsEmpty(true) + , Id(id) + , Sum(0.0) + , Freqs(0) + , ReserveFreqs(0) + , BinRange(0.0) + , Intervals(intervals) + , BaseIndex(intervals / 2) + { + } + TFixedBinHistogram::TFixedBinHistogram(const THistogram& histo, size_t defaultIntervals, ui64 defaultId, size_t trainingSetSize) - : TrainingSetSize(trainingSetSize) - , IsInitialized(false) - , IsEmpty(true) - , Id(defaultId) - , Sum(0.0) - , Freqs(0) - , ReserveFreqs(0) - , BinRange(0.0) - , Intervals(defaultIntervals) - , BaseIndex(defaultIntervals / 2) - { - FromProto(histo); - } + : TrainingSetSize(trainingSetSize) + , IsInitialized(false) + , IsEmpty(true) + , Id(defaultId) + , Sum(0.0) + , Freqs(0) + , ReserveFreqs(0) + , BinRange(0.0) + , Intervals(defaultIntervals) + , BaseIndex(defaultIntervals / 2) + { + FromProto(histo); + } TFixedBinHistogram::TFixedBinHistogram(IHistogram* histo, size_t defaultIntervals, ui64 defaultId, size_t trainingSetSize) - : TrainingSetSize(trainingSetSize) - , IsInitialized(false) - , IsEmpty(true) - , Id(defaultId) - , Sum(0.0) - , Freqs(0) - , ReserveFreqs(0) - , BinRange(0.0) - , Intervals(defaultIntervals) - , BaseIndex(defaultIntervals / 2) - { - TFixedBinHistogram* fixedBinHisto = dynamic_cast<TFixedBinHistogram*>(histo); - if (!fixedBinHisto) { - FromIHistogram(histo); - return; - } - fixedBinHisto->Initialize(); - TrainingSetSize = fixedBinHisto->TrainingSetSize; - IsInitialized = fixedBinHisto->IsInitialized; - IsEmpty = fixedBinHisto->IsEmpty; - Id = fixedBinHisto->Id; - MinValue = fixedBinHisto->MinValue; - MaxValue = fixedBinHisto->MaxValue; - Sum = fixedBinHisto->Sum; - Freqs.assign(fixedBinHisto->Freqs.begin(), fixedBinHisto->Freqs.end()); - ReserveFreqs.assign(fixedBinHisto->ReserveFreqs.begin(), fixedBinHisto->ReserveFreqs.end()); - ReferencePoint = fixedBinHisto->ReferencePoint; - BinRange = fixedBinHisto->BinRange; - Intervals = fixedBinHisto->Intervals; - FirstUsedBin = fixedBinHisto->FirstUsedBin; - LastUsedBin = fixedBinHisto->LastUsedBin; - BaseIndex = fixedBinHisto->BaseIndex; - } - - void TFixedBinHistogram::Clear() { - TrainingSet.Destroy(); - IsInitialized = false; - IsEmpty = true; - Sum = 0.0; - Freqs.clear(); - ReserveFreqs.clear(); - BinRange = 0.0; - } - - void TFixedBinHistogram::Add(const THistoRec& rec) { - if (!rec.HasId() || rec.GetId() == Id) { - Add(rec.GetValue(), rec.GetWeight()); - } - } - - void TFixedBinHistogram::Add(double value, double weight) { - if (!IsValidFloat(value) || !IsValidFloat(weight)) { + : TrainingSetSize(trainingSetSize) + , IsInitialized(false) + , IsEmpty(true) + , Id(defaultId) + , Sum(0.0) + , Freqs(0) + , ReserveFreqs(0) + , BinRange(0.0) + , Intervals(defaultIntervals) + , BaseIndex(defaultIntervals / 2) + { + TFixedBinHistogram* fixedBinHisto = dynamic_cast<TFixedBinHistogram*>(histo); + if (!fixedBinHisto) { + FromIHistogram(histo); + return; + } + fixedBinHisto->Initialize(); + TrainingSetSize = fixedBinHisto->TrainingSetSize; + IsInitialized = fixedBinHisto->IsInitialized; + IsEmpty = fixedBinHisto->IsEmpty; + Id = fixedBinHisto->Id; + MinValue = fixedBinHisto->MinValue; + MaxValue = fixedBinHisto->MaxValue; + Sum = fixedBinHisto->Sum; + Freqs.assign(fixedBinHisto->Freqs.begin(), fixedBinHisto->Freqs.end()); + ReserveFreqs.assign(fixedBinHisto->ReserveFreqs.begin(), fixedBinHisto->ReserveFreqs.end()); + ReferencePoint = fixedBinHisto->ReferencePoint; + BinRange = fixedBinHisto->BinRange; + Intervals = fixedBinHisto->Intervals; + FirstUsedBin = fixedBinHisto->FirstUsedBin; + LastUsedBin = fixedBinHisto->LastUsedBin; + BaseIndex = fixedBinHisto->BaseIndex; + } + + void TFixedBinHistogram::Clear() { + TrainingSet.Destroy(); + IsInitialized = false; + IsEmpty = true; + Sum = 0.0; + Freqs.clear(); + ReserveFreqs.clear(); + BinRange = 0.0; + } + + void TFixedBinHistogram::Add(const THistoRec& rec) { + if (!rec.HasId() || rec.GetId() == Id) { + Add(rec.GetValue(), rec.GetWeight()); + } + } + + void TFixedBinHistogram::Add(double value, double weight) { + if (!IsValidFloat(value) || !IsValidFloat(weight)) { ythrow yexception() << Sprintf("Histogram id %lu: bad value %f weight %f", Id, value, weight); - } - - if (weight <= 0.0) { - return; // all zero-weighted values should be skipped because they don't affect the distribution, negative weights are forbidden - } - - Sum += weight; - - if (!IsInitialized) { - if (!TrainingSet) { + } + + if (weight <= 0.0) { + return; // all zero-weighted values should be skipped because they don't affect the distribution, negative weights are forbidden + } + + Sum += weight; + + if (!IsInitialized) { + if (!TrainingSet) { TrainingSet.Reset(new TVector<TWeightedValue>(0)); - } - TrainingSet->push_back(TWeightedValue(value, weight)); - if (TrainingSet->size() >= TrainingSetSize) { - Initialize(); - } - return; - } - - i32 bin = CalcBin(value); - if (bin < 0 || bin >= (i32)Freqs.size() || (BinRange == 0.0 && value != ReferencePoint)) { - Shrink(Min(value, MinValue), Max(value, MaxValue)); - Freqs[CalcBin(value)] += weight; - } else { - MinValue = Min(value, MinValue); - MaxValue = Max(value, MaxValue); - FirstUsedBin = Min(FirstUsedBin, bin); - LastUsedBin = Max(LastUsedBin, bin); - Freqs[bin] += weight; - } - } - + } + TrainingSet->push_back(TWeightedValue(value, weight)); + if (TrainingSet->size() >= TrainingSetSize) { + Initialize(); + } + return; + } + + i32 bin = CalcBin(value); + if (bin < 0 || bin >= (i32)Freqs.size() || (BinRange == 0.0 && value != ReferencePoint)) { + Shrink(Min(value, MinValue), Max(value, MaxValue)); + Freqs[CalcBin(value)] += weight; + } else { + MinValue = Min(value, MinValue); + MaxValue = Max(value, MaxValue); + FirstUsedBin = Min(FirstUsedBin, bin); + LastUsedBin = Max(LastUsedBin, bin); + Freqs[bin] += weight; + } + } + void TFixedBinHistogram::Merge(const THistogram& /*histo*/, double /*multiplier*/) { ythrow yexception() << "Method is not implemented for TFixedBinHistogram"; } void TFixedBinHistogram::Merge(const TVector<THistogram>& histogramsToMerge) { TVector<IHistogramPtr> parsedHistogramsToMerge; - for (size_t i = 0; i < histogramsToMerge.size(); ++i) { - parsedHistogramsToMerge.push_back(IHistogramPtr(new TAutoHistogram(histogramsToMerge[i], Intervals, Id))); - } - Merge(parsedHistogramsToMerge); - } - + for (size_t i = 0; i < histogramsToMerge.size(); ++i) { + parsedHistogramsToMerge.push_back(IHistogramPtr(new TAutoHistogram(histogramsToMerge[i], Intervals, Id))); + } + Merge(parsedHistogramsToMerge); + } + void TFixedBinHistogram::Merge(TVector<IHistogramPtr> histogramsToMerge) { TVector<IHistogramPtr> histogramsToMergeRepacked(0); TVector<TFixedBinHistogram*> histograms(0); - - // put current histogram to the vector of histograms to merge and clear self - if (!Empty()) { - histogramsToMergeRepacked.push_back(IHistogramPtr(new TFixedBinHistogram(this, Intervals, Id, TrainingSetSize))); - histograms.push_back(dynamic_cast<TFixedBinHistogram*>(histogramsToMergeRepacked.back().Get())); - } - Clear(); - - for (size_t i = 0; i < histogramsToMerge.size(); ++i) { - if (!histogramsToMerge[i] || histogramsToMerge[i]->Empty()) { - continue; - } - TFixedBinHistogram* fixedBinHisto = dynamic_cast<TFixedBinHistogram*>(histogramsToMerge[i].Get()); - if (fixedBinHisto) { - fixedBinHisto->Initialize(); - histogramsToMergeRepacked.push_back(histogramsToMerge[i]); - } else { + + // put current histogram to the vector of histograms to merge and clear self + if (!Empty()) { + histogramsToMergeRepacked.push_back(IHistogramPtr(new TFixedBinHistogram(this, Intervals, Id, TrainingSetSize))); + histograms.push_back(dynamic_cast<TFixedBinHistogram*>(histogramsToMergeRepacked.back().Get())); + } + Clear(); + + for (size_t i = 0; i < histogramsToMerge.size(); ++i) { + if (!histogramsToMerge[i] || histogramsToMerge[i]->Empty()) { + continue; + } + TFixedBinHistogram* fixedBinHisto = dynamic_cast<TFixedBinHistogram*>(histogramsToMerge[i].Get()); + if (fixedBinHisto) { + fixedBinHisto->Initialize(); + histogramsToMergeRepacked.push_back(histogramsToMerge[i]); + } else { histogramsToMergeRepacked.push_back(IHistogramPtr(new TFixedBinHistogram(histogramsToMerge[i].Get(), Intervals, Id, TrainingSetSize))); // Convert histograms that are not of TFixedBinHistogram type - } - histograms.push_back(dynamic_cast<TFixedBinHistogram*>(histogramsToMergeRepacked.back().Get())); - } - - if (histograms.size() == 0) { - return; - } - - double minValue = histograms[0]->MinValue; - double maxValue = histograms[0]->MaxValue; - Sum = histograms[0]->Sum; - for (size_t i = 1; i < histograms.size(); ++i) { - minValue = Min(minValue, histograms[i]->MinValue); - maxValue = Max(maxValue, histograms[i]->MaxValue); - Sum += histograms[i]->Sum; - } - SetFrame(minValue, maxValue, true); - - if (BinRange == 0.0) { - Freqs[BaseIndex] = Sum; - return; - } - - for (size_t histoIndex = 0; histoIndex < histograms.size(); ++histoIndex) { - TFixedBinHistogram* histo = histograms[histoIndex]; - for (i32 bin = histo->FirstUsedBin; bin <= histo->LastUsedBin; ++bin) { - double binStart = histo->BinStart(bin); - double binEnd = histo->BinEnd(bin); - double freq = histo->Freqs[bin]; - if (binStart == binEnd) { - Freqs[CalcBin(binStart)] += freq; - continue; - } - size_t firstCross = CalcBin(binStart); - size_t lastCross = CalcBin(binEnd); - for (size_t i = firstCross; i <= lastCross; ++i) { - double mergedBinStart = BinStart(i); - double mergedBinEnd = BinEnd(i); - double crossStart = Max(mergedBinStart, binStart); - double crossEnd = Min(mergedBinEnd, binEnd); - if (binStart == binEnd) { - } - Freqs[i] += freq * (crossEnd - crossStart) / (binEnd - binStart); - } - } - } - } - - void TFixedBinHistogram::Multiply(double factor) { - if (!IsValidFloat(factor) || factor <= 0) { - ythrow yexception() << "Not valid factor in IHistogram::Multiply(): " << factor; - } - if (!IsInitialized) { - Initialize(); - } - Sum *= factor; + } + histograms.push_back(dynamic_cast<TFixedBinHistogram*>(histogramsToMergeRepacked.back().Get())); + } + + if (histograms.size() == 0) { + return; + } + + double minValue = histograms[0]->MinValue; + double maxValue = histograms[0]->MaxValue; + Sum = histograms[0]->Sum; + for (size_t i = 1; i < histograms.size(); ++i) { + minValue = Min(minValue, histograms[i]->MinValue); + maxValue = Max(maxValue, histograms[i]->MaxValue); + Sum += histograms[i]->Sum; + } + SetFrame(minValue, maxValue, true); + + if (BinRange == 0.0) { + Freqs[BaseIndex] = Sum; + return; + } + + for (size_t histoIndex = 0; histoIndex < histograms.size(); ++histoIndex) { + TFixedBinHistogram* histo = histograms[histoIndex]; + for (i32 bin = histo->FirstUsedBin; bin <= histo->LastUsedBin; ++bin) { + double binStart = histo->BinStart(bin); + double binEnd = histo->BinEnd(bin); + double freq = histo->Freqs[bin]; + if (binStart == binEnd) { + Freqs[CalcBin(binStart)] += freq; + continue; + } + size_t firstCross = CalcBin(binStart); + size_t lastCross = CalcBin(binEnd); + for (size_t i = firstCross; i <= lastCross; ++i) { + double mergedBinStart = BinStart(i); + double mergedBinEnd = BinEnd(i); + double crossStart = Max(mergedBinStart, binStart); + double crossEnd = Min(mergedBinEnd, binEnd); + if (binStart == binEnd) { + } + Freqs[i] += freq * (crossEnd - crossStart) / (binEnd - binStart); + } + } + } + } + + void TFixedBinHistogram::Multiply(double factor) { + if (!IsValidFloat(factor) || factor <= 0) { + ythrow yexception() << "Not valid factor in IHistogram::Multiply(): " << factor; + } + if (!IsInitialized) { + Initialize(); + } + Sum *= factor; for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) { - Freqs[i] *= factor; - } - } - - void TFixedBinHistogram::FromProto(const THistogram& histo) { - if (histo.HasType() && histo.GetType() != HT_FIXED_BIN_HISTOGRAM) { - ythrow yexception() << "Attempt to parse TFixedBinHistogram from THistogram protobuf record of wrong type = " << (ui32)histo.GetType(); - } - TrainingSet.Destroy(); - IsInitialized = false; - Sum = 0.0; - - Id = histo.GetId(); - size_t intervals = histo.FreqSize(); - if (intervals == 0) { - IsEmpty = true; - return; - } - Intervals = intervals; - TrainingSetSize = Intervals; - BaseIndex = Intervals / 2; - - if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue()) || !IsValidFloat(histo.GetBinRange())) { + Freqs[i] *= factor; + } + } + + void TFixedBinHistogram::FromProto(const THistogram& histo) { + if (histo.HasType() && histo.GetType() != HT_FIXED_BIN_HISTOGRAM) { + ythrow yexception() << "Attempt to parse TFixedBinHistogram from THistogram protobuf record of wrong type = " << (ui32)histo.GetType(); + } + TrainingSet.Destroy(); + IsInitialized = false; + Sum = 0.0; + + Id = histo.GetId(); + size_t intervals = histo.FreqSize(); + if (intervals == 0) { + IsEmpty = true; + return; + } + Intervals = intervals; + TrainingSetSize = Intervals; + BaseIndex = Intervals / 2; + + if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue()) || !IsValidFloat(histo.GetBinRange())) { ythrow yexception() << Sprintf("FromProto in histogram id %lu: skip bad histo with minvalue %f maxvalue %f binrange %f", Id, histo.GetMinValue(), histo.GetMaxValue(), histo.GetBinRange()); - } - - double minValue = histo.GetMinValue(); - double binRange = histo.GetBinRange(); - double maxValue = histo.HasMaxValue() ? histo.GetMaxValue() : minValue + binRange * Intervals; - SetFrame(minValue, maxValue, true); - BinRange = binRange; - for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) { - Freqs[i] = histo.GetFreq(i - BaseIndex); - if (!IsValidFloat(Freqs[i])) { + } + + double minValue = histo.GetMinValue(); + double binRange = histo.GetBinRange(); + double maxValue = histo.HasMaxValue() ? histo.GetMaxValue() : minValue + binRange * Intervals; + SetFrame(minValue, maxValue, true); + BinRange = binRange; + for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) { + Freqs[i] = histo.GetFreq(i - BaseIndex); + if (!IsValidFloat(Freqs[i])) { ythrow yexception() << Sprintf("FromProto in histogram id %lu: bad value %f", Id, Freqs[i]); - } - Sum += Freqs[i]; - } - } - - void TFixedBinHistogram::ToProto(THistogram& histo) { - histo.Clear(); - if (!IsInitialized) { - Initialize(); - } - histo.SetType(HT_FIXED_BIN_HISTOGRAM); - histo.SetId(Id); - if (IsEmpty) { - return; - } - if (FirstUsedBin < (i32)BaseIndex || (LastUsedBin - FirstUsedBin + 1) > (i32)Intervals) { - Shrink(MinValue, MaxValue); - } - histo.SetMinValue(MinValue); - histo.SetMaxValue(MaxValue); - histo.SetBinRange(BinRange); - for (ui32 i = BaseIndex; i < BaseIndex + Intervals; ++i) { - histo.AddFreq(Freqs[i]); - } - } - + } + Sum += Freqs[i]; + } + } + + void TFixedBinHistogram::ToProto(THistogram& histo) { + histo.Clear(); + if (!IsInitialized) { + Initialize(); + } + histo.SetType(HT_FIXED_BIN_HISTOGRAM); + histo.SetId(Id); + if (IsEmpty) { + return; + } + if (FirstUsedBin < (i32)BaseIndex || (LastUsedBin - FirstUsedBin + 1) > (i32)Intervals) { + Shrink(MinValue, MaxValue); + } + histo.SetMinValue(MinValue); + histo.SetMaxValue(MaxValue); + histo.SetBinRange(BinRange); + for (ui32 i = BaseIndex; i < BaseIndex + Intervals; ++i) { + histo.AddFreq(Freqs[i]); + } + } + void TFixedBinHistogram::SetId(ui64 id) { - Id = id; - } - + Id = id; + } + ui64 TFixedBinHistogram::GetId() { - return Id; - } - - bool TFixedBinHistogram::Empty() { - if (!IsInitialized) { - Initialize(); - } - return IsEmpty; - } - - double TFixedBinHistogram::GetMinValue() { - if (!IsInitialized) { - Initialize(); - } - return MinValue; - } - - double TFixedBinHistogram::GetMaxValue() { - if (!IsInitialized) { - Initialize(); - } - return MaxValue; - } - - double TFixedBinHistogram::GetSum() { - return Sum; - } - - double TFixedBinHistogram::GetSumInRange(double leftBound, double rightBound) { - if (!IsInitialized) { - Initialize(); - } - if (leftBound > rightBound) { - return 0.0; - } - return GetSumAboveBound(leftBound) + GetSumBelowBound(rightBound) - Sum; - } - - double TFixedBinHistogram::GetSumAboveBound(double bound) { - if (!IsInitialized) { - Initialize(); - } - if (IsEmpty) { - return 0.0; - } + return Id; + } + + bool TFixedBinHistogram::Empty() { + if (!IsInitialized) { + Initialize(); + } + return IsEmpty; + } + + double TFixedBinHistogram::GetMinValue() { + if (!IsInitialized) { + Initialize(); + } + return MinValue; + } + + double TFixedBinHistogram::GetMaxValue() { + if (!IsInitialized) { + Initialize(); + } + return MaxValue; + } + + double TFixedBinHistogram::GetSum() { + return Sum; + } + + double TFixedBinHistogram::GetSumInRange(double leftBound, double rightBound) { + if (!IsInitialized) { + Initialize(); + } + if (leftBound > rightBound) { + return 0.0; + } + return GetSumAboveBound(leftBound) + GetSumBelowBound(rightBound) - Sum; + } + + double TFixedBinHistogram::GetSumAboveBound(double bound) { + if (!IsInitialized) { + Initialize(); + } + if (IsEmpty) { + return 0.0; + } if (BinRange == 0.0) { // special case - all values added to histogram are the same - return (bound <= ReferencePoint) ? Sum : 0.0; - } - i32 bin = CalcBin(bound); - if (bin < FirstUsedBin) { - return Sum; - } - if (bin > LastUsedBin) { - return 0.0; - } - double binStart = BinStart(bin); - double binEnd = BinEnd(bin); + return (bound <= ReferencePoint) ? Sum : 0.0; + } + i32 bin = CalcBin(bound); + if (bin < FirstUsedBin) { + return Sum; + } + if (bin > LastUsedBin) { + return 0.0; + } + double binStart = BinStart(bin); + double binEnd = BinEnd(bin); double result = (bound < binStart) ? Freqs[bin] : Freqs[bin] * (binEnd - bound) / (binEnd - binStart); - for (i32 i = bin + 1; i <= LastUsedBin; ++i) { - result += Freqs[i]; - } - return result; - } - - double TFixedBinHistogram::GetSumBelowBound(double bound) { - if (!IsInitialized) { - Initialize(); - } - if (IsEmpty) { - return 0.0; - } + for (i32 i = bin + 1; i <= LastUsedBin; ++i) { + result += Freqs[i]; + } + return result; + } + + double TFixedBinHistogram::GetSumBelowBound(double bound) { + if (!IsInitialized) { + Initialize(); + } + if (IsEmpty) { + return 0.0; + } if (BinRange == 0.0) { // special case - all values added to histogram are the same - return (bound > ReferencePoint) ? Sum : 0.0; - } - i32 bin = CalcBin(bound); - if (bin < FirstUsedBin) { - return 0.0; - } - if (bin > LastUsedBin) { - return Sum; - } - double binStart = BinStart(bin); - double binEnd = BinEnd(bin); + return (bound > ReferencePoint) ? Sum : 0.0; + } + i32 bin = CalcBin(bound); + if (bin < FirstUsedBin) { + return 0.0; + } + if (bin > LastUsedBin) { + return Sum; + } + double binStart = BinStart(bin); + double binEnd = BinEnd(bin); double result = (bound > binEnd) ? Freqs[bin] : Freqs[bin] * (bound - binStart) / (binEnd - binStart); - for (i32 i = bin - 1; i >= FirstUsedBin; --i) { - result += Freqs[i]; - } - return result; - } - - double TFixedBinHistogram::CalcUpperBound(double sum) { - if (!IsInitialized) { - Initialize(); - } + for (i32 i = bin - 1; i >= FirstUsedBin; --i) { + result += Freqs[i]; + } + return result; + } + + double TFixedBinHistogram::CalcUpperBound(double sum) { + if (!IsInitialized) { + Initialize(); + } if (sum == 0.0) { return MinValue; } - if (IsEmpty) { - return MaxValue; - } - i32 currentBin = FirstUsedBin; - double gatheredSum = 0.0; - while (gatheredSum < sum && currentBin <= LastUsedBin) { - gatheredSum += Freqs[currentBin]; - ++currentBin; - } - --currentBin; + if (IsEmpty) { + return MaxValue; + } + i32 currentBin = FirstUsedBin; + double gatheredSum = 0.0; + while (gatheredSum < sum && currentBin <= LastUsedBin) { + gatheredSum += Freqs[currentBin]; + ++currentBin; + } + --currentBin; if ((gatheredSum <= sum && currentBin == LastUsedBin) || (Freqs[currentBin] == 0)) { - return MaxValue; - } - double binStart = BinStart(currentBin); - double binEnd = BinEnd(currentBin); - return binEnd - (binEnd - binStart) * (gatheredSum - sum) / Freqs[currentBin]; - } - - double TFixedBinHistogram::CalcLowerBound(double sum) { - if (!IsInitialized) { - Initialize(); - } + return MaxValue; + } + double binStart = BinStart(currentBin); + double binEnd = BinEnd(currentBin); + return binEnd - (binEnd - binStart) * (gatheredSum - sum) / Freqs[currentBin]; + } + + double TFixedBinHistogram::CalcLowerBound(double sum) { + if (!IsInitialized) { + Initialize(); + } if (sum == 0.0) { return MaxValue; } - if (IsEmpty) { - return MinValue; - } - i32 currentBin = LastUsedBin; - double gatheredSum = 0.0; - while (gatheredSum < sum && currentBin >= FirstUsedBin) { - gatheredSum += Freqs[currentBin]; - --currentBin; - } - ++currentBin; + if (IsEmpty) { + return MinValue; + } + i32 currentBin = LastUsedBin; + double gatheredSum = 0.0; + while (gatheredSum < sum && currentBin >= FirstUsedBin) { + gatheredSum += Freqs[currentBin]; + --currentBin; + } + ++currentBin; if ((gatheredSum <= sum && currentBin == FirstUsedBin) || (Freqs[currentBin] == 0)) { - return MinValue; - } - double binStart = BinStart(currentBin); - double binEnd = BinEnd(currentBin); - return binStart + (binEnd - binStart) * (gatheredSum - sum) / Freqs[currentBin]; - } - + return MinValue; + } + double binStart = BinStart(currentBin); + double binEnd = BinEnd(currentBin); + return binStart + (binEnd - binStart) * (gatheredSum - sum) / Freqs[currentBin]; + } + double TFixedBinHistogram::CalcUpperBoundSafe(double sum) { if (!Empty()) { sum = Max(Freqs[FirstUsedBin], sum); @@ -434,67 +434,67 @@ namespace NKiwiAggr { return CalcLowerBound(sum); } - double TFixedBinHistogram::CalcBinRange(double referencePoint, double maxValue) { - return (maxValue - referencePoint) / ((double)Intervals - 0.02); - } - - void TFixedBinHistogram::SetFrame(double minValue, double maxValue, bool clear) { - MinValue = minValue; - MaxValue = maxValue; - ReferencePoint = MinValue; - BinRange = CalcBinRange(ReferencePoint, MaxValue); - FirstUsedBin = BaseIndex; - LastUsedBin = (BinRange == 0.0) ? BaseIndex : BaseIndex + Intervals - 1; - if (clear) { - Freqs.assign(2 * Intervals, 0.0); - ReserveFreqs.assign(2 * Intervals, 0.0); - IsEmpty = false; - IsInitialized = true; - } - } - - void TFixedBinHistogram::FromIHistogram(IHistogram* histo) { - if (!histo) { - ythrow yexception() << "Attempt to create TFixedBinFistogram from a NULL pointer"; - } - Id = histo->GetId(); - if (histo->Empty()) { - IsInitialized = false; - IsEmpty = true; - return; - } - SetFrame(histo->GetMinValue(), histo->GetMaxValue(), true); - Sum = histo->GetSum(); - if (BinRange == 0.0) { - Freqs[BaseIndex] = Sum; - return; - } - for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) { - Freqs[i] = histo->GetSumInRange(BinStart(i), BinEnd(i)); - } - return; - } - - void TFixedBinHistogram::Initialize() { - if (IsInitialized) { - return; - } - if (!TrainingSet || TrainingSet->size() == 0) { - IsEmpty = true; - return; - } - SetFrame(MinElement(TrainingSet->begin(), TrainingSet->end(), CompareWeightedValue)->first, - MaxElement(TrainingSet->begin(), TrainingSet->end(), CompareWeightedValue)->first, true); + double TFixedBinHistogram::CalcBinRange(double referencePoint, double maxValue) { + return (maxValue - referencePoint) / ((double)Intervals - 0.02); + } + + void TFixedBinHistogram::SetFrame(double minValue, double maxValue, bool clear) { + MinValue = minValue; + MaxValue = maxValue; + ReferencePoint = MinValue; + BinRange = CalcBinRange(ReferencePoint, MaxValue); + FirstUsedBin = BaseIndex; + LastUsedBin = (BinRange == 0.0) ? BaseIndex : BaseIndex + Intervals - 1; + if (clear) { + Freqs.assign(2 * Intervals, 0.0); + ReserveFreqs.assign(2 * Intervals, 0.0); + IsEmpty = false; + IsInitialized = true; + } + } + + void TFixedBinHistogram::FromIHistogram(IHistogram* histo) { + if (!histo) { + ythrow yexception() << "Attempt to create TFixedBinFistogram from a NULL pointer"; + } + Id = histo->GetId(); + if (histo->Empty()) { + IsInitialized = false; + IsEmpty = true; + return; + } + SetFrame(histo->GetMinValue(), histo->GetMaxValue(), true); + Sum = histo->GetSum(); + if (BinRange == 0.0) { + Freqs[BaseIndex] = Sum; + return; + } + for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) { + Freqs[i] = histo->GetSumInRange(BinStart(i), BinEnd(i)); + } + return; + } + + void TFixedBinHistogram::Initialize() { + if (IsInitialized) { + return; + } + if (!TrainingSet || TrainingSet->size() == 0) { + IsEmpty = true; + return; + } + SetFrame(MinElement(TrainingSet->begin(), TrainingSet->end(), CompareWeightedValue)->first, + MaxElement(TrainingSet->begin(), TrainingSet->end(), CompareWeightedValue)->first, true); for (TVector<TWeightedValue>::const_iterator it = TrainingSet->begin(); it != TrainingSet->end(); ++it) { - Freqs[CalcBin(it->first)] += it->second; - } - TrainingSet.Destroy(); - } - - i32 TFixedBinHistogram::CalcBin(double value) { - return (BinRange == 0.0) ? BaseIndex : static_cast<i32>(BaseIndex + (value - ReferencePoint) / BinRange); - } - + Freqs[CalcBin(it->first)] += it->second; + } + TrainingSet.Destroy(); + } + + i32 TFixedBinHistogram::CalcBin(double value) { + return (BinRange == 0.0) ? BaseIndex : static_cast<i32>(BaseIndex + (value - ReferencePoint) / BinRange); + } + double TFixedBinHistogram::CalcDensity(double value) { i32 bin = CalcBin(value); if (bin < 0 || bin >= (i32)Freqs.size() || BinRange == 0.0 || GetSum() == 0) { @@ -503,36 +503,36 @@ namespace NKiwiAggr { return Freqs[bin] / GetSum() / BinRange; } - double TFixedBinHistogram::BinStart(i32 i) { - return Max(ReferencePoint + (i - BaseIndex) * BinRange, MinValue); - } - - double TFixedBinHistogram::BinEnd(i32 i) { - return Min(ReferencePoint + (i + 1 - BaseIndex) * BinRange, MaxValue); - } - - void TFixedBinHistogram::Shrink(double newReferencePoint, double newMaxValue) { + double TFixedBinHistogram::BinStart(i32 i) { + return Max(ReferencePoint + (i - BaseIndex) * BinRange, MinValue); + } + + double TFixedBinHistogram::BinEnd(i32 i) { + return Min(ReferencePoint + (i + 1 - BaseIndex) * BinRange, MaxValue); + } + + void TFixedBinHistogram::Shrink(double newReferencePoint, double newMaxValue) { Y_VERIFY(newReferencePoint < newMaxValue, "Invalid Shrink()"); - memset(&(ReserveFreqs[0]), 0, ReserveFreqs.size() * sizeof(double)); - - double newBinRange = CalcBinRange(newReferencePoint, newMaxValue); + memset(&(ReserveFreqs[0]), 0, ReserveFreqs.size() * sizeof(double)); + + double newBinRange = CalcBinRange(newReferencePoint, newMaxValue); for (i32 i = FirstUsedBin; i <= LastUsedBin; ++i) { - double binStart = BinStart(i); - double binEnd = BinEnd(i); - double freq = Freqs[i]; - i32 firstCross = static_cast<i32>(BaseIndex + (binStart - newReferencePoint) / newBinRange); - i32 lastCross = static_cast<i32>(BaseIndex + (binEnd - newReferencePoint) / newBinRange); - for (i32 j = firstCross; j <= lastCross; ++j) { - double newBinStart = newReferencePoint + (j - BaseIndex) * newBinRange; - double newBinEnd = newReferencePoint + (j + 1 - BaseIndex) * newBinRange; - double crossStart = Max(newBinStart, binStart); - double crossEnd = Min(newBinEnd, binEnd); - ReserveFreqs[j] += (binStart == binEnd) ? freq : freq * (crossEnd - crossStart) / (binEnd - binStart); - } - } - - Freqs.swap(ReserveFreqs); - SetFrame(newReferencePoint, newMaxValue, false); - } - + double binStart = BinStart(i); + double binEnd = BinEnd(i); + double freq = Freqs[i]; + i32 firstCross = static_cast<i32>(BaseIndex + (binStart - newReferencePoint) / newBinRange); + i32 lastCross = static_cast<i32>(BaseIndex + (binEnd - newReferencePoint) / newBinRange); + for (i32 j = firstCross; j <= lastCross; ++j) { + double newBinStart = newReferencePoint + (j - BaseIndex) * newBinRange; + double newBinEnd = newReferencePoint + (j + 1 - BaseIndex) * newBinRange; + double crossStart = Max(newBinStart, binStart); + double crossEnd = Min(newBinEnd, binEnd); + ReserveFreqs[j] += (binStart == binEnd) ? freq : freq * (crossEnd - crossStart) / (binEnd - binStart); + } + } + + Freqs.swap(ReserveFreqs); + SetFrame(newReferencePoint, newMaxValue, false); + } + } diff --git a/library/cpp/histogram/adaptive/fixed_bin_histogram.h b/library/cpp/histogram/adaptive/fixed_bin_histogram.h index 326d962c46..bd380bd94a 100644 --- a/library/cpp/histogram/adaptive/fixed_bin_histogram.h +++ b/library/cpp/histogram/adaptive/fixed_bin_histogram.h @@ -1,91 +1,91 @@ #pragma once -#include "histogram.h" - +#include "histogram.h" + #include <library/cpp/histogram/adaptive/protos/histo.pb.h> - -#include <util/generic/ptr.h> + +#include <util/generic/ptr.h> #include <util/generic/vector.h> #include <utility> namespace NKiwiAggr { - class TFixedBinHistogram: private TNonCopyable, public IHistogram { - private: - static const size_t DEFAULT_TRAINING_SET_SIZE = 10000; - static const size_t DEFAULT_INTERVALS = 100; + class TFixedBinHistogram: private TNonCopyable, public IHistogram { + private: + static const size_t DEFAULT_TRAINING_SET_SIZE = 10000; + static const size_t DEFAULT_INTERVALS = 100; typedef std::pair<double, double> TWeightedValue; // value, weight THolder<TVector<TWeightedValue>> TrainingSet; - size_t TrainingSetSize; - - bool IsInitialized; - bool IsEmpty; + size_t TrainingSetSize; + + bool IsInitialized; + bool IsEmpty; ui64 Id; - double MinValue; - double MaxValue; - double Sum; - + double MinValue; + double MaxValue; + double Sum; + TVector<double> Freqs; TVector<double> ReserveFreqs; - double ReferencePoint; - double BinRange; - size_t Intervals; - i32 FirstUsedBin; - i32 LastUsedBin; - i32 BaseIndex; - - public: + double ReferencePoint; + double BinRange; + size_t Intervals; + i32 FirstUsedBin; + i32 LastUsedBin; + i32 BaseIndex; + + public: TFixedBinHistogram(size_t intervals, ui64 id = 0, size_t trainingSetSize = DEFAULT_TRAINING_SET_SIZE); TFixedBinHistogram(const THistogram& histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0, size_t trainingSetSize = DEFAULT_TRAINING_SET_SIZE); TFixedBinHistogram(IHistogram* histo, size_t defaultIntervals = DEFAULT_INTERVALS, ui64 defaultId = 0, size_t trainingSetSize = DEFAULT_TRAINING_SET_SIZE); virtual ~TFixedBinHistogram() { - } + } + + virtual void Clear(); + + virtual void Add(double value, double weight); + virtual void Add(const THistoRec& histoRec); - virtual void Clear(); - - virtual void Add(double value, double weight); - virtual void Add(const THistoRec& histoRec); - virtual void Merge(const THistogram& histo, double multiplier); virtual void Merge(const TVector<THistogram>& histogramsToMerge); virtual void Merge(TVector<IHistogramPtr> histogramsToMerge); - - virtual void Multiply(double factor); - - virtual void FromProto(const THistogram& histo); - virtual void ToProto(THistogram& histo); - + + virtual void Multiply(double factor); + + virtual void FromProto(const THistogram& histo); + virtual void ToProto(THistogram& histo); + virtual void SetId(ui64 id); virtual ui64 GetId(); - virtual bool Empty(); - virtual double GetMinValue(); - virtual double GetMaxValue(); - virtual double GetSum(); - virtual double GetSumInRange(double leftBound, double rightBound); - virtual double GetSumAboveBound(double bound); - virtual double GetSumBelowBound(double bound); - virtual double CalcUpperBound(double sum); - virtual double CalcLowerBound(double sum); + virtual bool Empty(); + virtual double GetMinValue(); + virtual double GetMaxValue(); + virtual double GetSum(); + virtual double GetSumInRange(double leftBound, double rightBound); + virtual double GetSumAboveBound(double bound); + virtual double GetSumBelowBound(double bound); + virtual double CalcUpperBound(double sum); + virtual double CalcLowerBound(double sum); virtual double CalcUpperBoundSafe(double sum); virtual double CalcLowerBoundSafe(double sum); - + double CalcDensity(double value); - private: - double CalcBinRange(double referencePoint, double maxValue); - void SetFrame(double minValue, double maxValue, bool clear); - void FromIHistogram(IHistogram* histo); - void Initialize(); - i32 CalcBin(double value); - double BinStart(i32 i); - double BinEnd(i32 i); - void Shrink(double newMinValue, double newMaxValue); - - static bool CompareWeightedValue(const TWeightedValue& left, const TWeightedValue& right) { - return left.first < right.first; - } - }; - + private: + double CalcBinRange(double referencePoint, double maxValue); + void SetFrame(double minValue, double maxValue, bool clear); + void FromIHistogram(IHistogram* histo); + void Initialize(); + i32 CalcBin(double value); + double BinStart(i32 i); + double BinEnd(i32 i); + void Shrink(double newMinValue, double newMaxValue); + + static bool CompareWeightedValue(const TWeightedValue& left, const TWeightedValue& right) { + return left.first < right.first; + } + }; + } diff --git a/library/cpp/histogram/adaptive/histogram.h b/library/cpp/histogram/adaptive/histogram.h index 58a6af4605..360fd9a693 100644 --- a/library/cpp/histogram/adaptive/histogram.h +++ b/library/cpp/histogram/adaptive/histogram.h @@ -1,54 +1,54 @@ -#pragma once - -#include <util/generic/ptr.h> -#include <util/generic/vector.h> - -namespace NKiwiAggr { +#pragma once + +#include <util/generic/ptr.h> +#include <util/generic/vector.h> + +namespace NKiwiAggr { class THistogram; class THistoRec; - - class IHistogram; + + class IHistogram; typedef TAtomicSharedPtr<IHistogram> IHistogramPtr; - - class IHistogram { - public: - // Supposed constructors: - // + + class IHistogram { + public: + // Supposed constructors: + // // TSomeHistogram(size_t intervals, ui64 id = 0); // where intervals is some constant that defines histogram accuracy - // TSomeHistogram(const THistogram& histo); // histo must be acceptable for TSomeHistogram, for example, only with HT_FIXED_BIN_HISTOGRAM for TFixedBinHistogram - // TSomeHistogram(IHistogram* histo); // any kind of IHistogram - + // TSomeHistogram(const THistogram& histo); // histo must be acceptable for TSomeHistogram, for example, only with HT_FIXED_BIN_HISTOGRAM for TFixedBinHistogram + // TSomeHistogram(IHistogram* histo); // any kind of IHistogram + virtual ~IHistogram() { - } - - virtual void Clear() = 0; - - // zero- or negative-weighted values are skipped - virtual void Add(double value, double weight) = 0; - virtual void Add(const THistoRec& histoRec) = 0; - - // Merge some other histos into current + } + + virtual void Clear() = 0; + + // zero- or negative-weighted values are skipped + virtual void Add(double value, double weight) = 0; + virtual void Add(const THistoRec& histoRec) = 0; + + // Merge some other histos into current virtual void Merge(const THistogram& histo, double multiplier) = 0; virtual void Merge(const TVector<THistogram>& histogramsToMerge) = 0; virtual void Merge(TVector<IHistogramPtr> histogramsToMerge) = 0; - - // factor should be greater then zero - virtual void Multiply(double factor) = 0; - + + // factor should be greater then zero + virtual void Multiply(double factor) = 0; + virtual void FromProto(const THistogram& histo) = 0; // throws exception in case of wrong histogram type of histo - virtual void ToProto(THistogram& histo) = 0; - + virtual void ToProto(THistogram& histo) = 0; + virtual void SetId(ui64 id) = 0; virtual ui64 GetId() = 0; - virtual bool Empty() = 0; - virtual double GetMinValue() = 0; - virtual double GetMaxValue() = 0; - virtual double GetSum() = 0; - virtual double GetSumInRange(double leftBound, double rightBound) = 0; - virtual double GetSumAboveBound(double bound) = 0; - virtual double GetSumBelowBound(double bound) = 0; - virtual double CalcUpperBound(double sum) = 0; - virtual double CalcLowerBound(double sum) = 0; + virtual bool Empty() = 0; + virtual double GetMinValue() = 0; + virtual double GetMaxValue() = 0; + virtual double GetSum() = 0; + virtual double GetSumInRange(double leftBound, double rightBound) = 0; + virtual double GetSumAboveBound(double bound) = 0; + virtual double GetSumBelowBound(double bound) = 0; + virtual double CalcUpperBound(double sum) = 0; + virtual double CalcLowerBound(double sum) = 0; virtual double CalcUpperBoundSafe(double sum) = 0; virtual double CalcLowerBoundSafe(double sum) = 0; double GetValueAtPercentile(double percentile) { @@ -61,6 +61,6 @@ namespace NKiwiAggr { // Histogram implementation is supposed to clear all precomputed values() if Add() is called after PrecomputePartialSums() virtual void PrecomputePartialSums() { } - }; - + }; + } diff --git a/library/cpp/histogram/adaptive/merger.h b/library/cpp/histogram/adaptive/merger.h index a1db5e4c4b..fc9a6b6a4f 100644 --- a/library/cpp/histogram/adaptive/merger.h +++ b/library/cpp/histogram/adaptive/merger.h @@ -1,68 +1,68 @@ -#pragma once - -#include <util/generic/buffer.h> - -namespace NKiwiAggr { - class IMerger { - private: - bool IsMerged; +#pragma once + +#include <util/generic/buffer.h> + +namespace NKiwiAggr { + class IMerger { + private: + bool IsMerged; ui32 AutoMergeInterval; // Call Merge() after each AutoMergeInterval calls of Add(); zero means no autoMerge - ui32 NotMergedCount; - - public: - IMerger(ui32 autoMergeInterval = 0) - : IsMerged(true) - , AutoMergeInterval(autoMergeInterval) - , NotMergedCount(0) - { - } - + ui32 NotMergedCount; + + public: + IMerger(ui32 autoMergeInterval = 0) + : IsMerged(true) + , AutoMergeInterval(autoMergeInterval) + , NotMergedCount(0) + { + } + virtual ~IMerger() { - } - - // returns true if something is added - virtual bool Add(const void* data, size_t size) { - if (AddImpl(data, size)) { - AutoMerge(); - return true; - } - return false; - } - - virtual void Merge() { - if (!IsMerged) { - MergeImpl(); - IsMerged = true; - } - } - - virtual void Reset() { - ResetImpl(); - IsMerged = true; - } - - // You can add some more result-getters if you want. - // Do not forget to call Merge() in the beginning of each merger. - virtual void GetResult(TBuffer& buffer) = 0; - - protected: - // AutoMerge() is called in Add() after each AddImpl() - void AutoMerge() { - IsMerged = false; - if (AutoMergeInterval) { - ++NotMergedCount; - if (NotMergedCount >= AutoMergeInterval) { - MergeImpl(); - IsMerged = true; - NotMergedCount = 0; - } - } - } - - // Implementation of merger: define it in derivatives - virtual bool AddImpl(const void* data, size_t size) = 0; // returns true if something is added - virtual void MergeImpl() = 0; - virtual void ResetImpl() = 0; - }; - + } + + // returns true if something is added + virtual bool Add(const void* data, size_t size) { + if (AddImpl(data, size)) { + AutoMerge(); + return true; + } + return false; + } + + virtual void Merge() { + if (!IsMerged) { + MergeImpl(); + IsMerged = true; + } + } + + virtual void Reset() { + ResetImpl(); + IsMerged = true; + } + + // You can add some more result-getters if you want. + // Do not forget to call Merge() in the beginning of each merger. + virtual void GetResult(TBuffer& buffer) = 0; + + protected: + // AutoMerge() is called in Add() after each AddImpl() + void AutoMerge() { + IsMerged = false; + if (AutoMergeInterval) { + ++NotMergedCount; + if (NotMergedCount >= AutoMergeInterval) { + MergeImpl(); + IsMerged = true; + NotMergedCount = 0; + } + } + } + + // Implementation of merger: define it in derivatives + virtual bool AddImpl(const void* data, size_t size) = 0; // returns true if something is added + virtual void MergeImpl() = 0; + virtual void ResetImpl() = 0; + }; + } diff --git a/library/cpp/histogram/adaptive/multi_histogram.h b/library/cpp/histogram/adaptive/multi_histogram.h index 5672407068..41caac5ba6 100644 --- a/library/cpp/histogram/adaptive/multi_histogram.h +++ b/library/cpp/histogram/adaptive/multi_histogram.h @@ -1,83 +1,83 @@ -#pragma once - -#include "histogram.h" -#include "auto_histogram.h" - +#pragma once + +#include "histogram.h" +#include "auto_histogram.h" + #include <library/cpp/histogram/adaptive/protos/histo.pb.h> - + #include <util/generic/hash.h> -#include <util/generic/ptr.h> +#include <util/generic/ptr.h> #include <utility> - -namespace NKiwiAggr { - template <class TMyHistogram> - class TMultiHistogram { - private: - static const size_t DEFAULT_INTERVALS = 100; - + +namespace NKiwiAggr { + template <class TMyHistogram> + class TMultiHistogram { + private: + static const size_t DEFAULT_INTERVALS = 100; + typedef THashMap<ui64, IHistogramPtr> THistogramsMap; - THistogramsMap Histograms; - size_t Intervals; - - public: - TMultiHistogram(size_t intervals = DEFAULT_INTERVALS) - : Intervals(intervals) - { - } - - TMultiHistogram(const THistograms& histograms, size_t defaultIntervals = DEFAULT_INTERVALS) - : Intervals(defaultIntervals) - { - FromProto(histograms); - } - + THistogramsMap Histograms; + size_t Intervals; + + public: + TMultiHistogram(size_t intervals = DEFAULT_INTERVALS) + : Intervals(intervals) + { + } + + TMultiHistogram(const THistograms& histograms, size_t defaultIntervals = DEFAULT_INTERVALS) + : Intervals(defaultIntervals) + { + FromProto(histograms); + } + virtual ~TMultiHistogram() { - } - - void Clear() { - Histograms.clear(); - } - - void Add(const THistoRecs& histoRecs) { - for (size_t i = 0; i < histoRecs.HistoRecsSize(); ++i) { - Add(histoRecs.GetHistoRecs(i).GetId(), histoRecs.GetHistoRecs(i).GetValue(), histoRecs.GetHistoRecs(i).GetWeight()); - } - } - - void Add(const THistoRec& histoRec) { - Add(histoRec.GetId(), histoRec.GetValue(), histoRec.GetWeight()); - } - + } + + void Clear() { + Histograms.clear(); + } + + void Add(const THistoRecs& histoRecs) { + for (size_t i = 0; i < histoRecs.HistoRecsSize(); ++i) { + Add(histoRecs.GetHistoRecs(i).GetId(), histoRecs.GetHistoRecs(i).GetValue(), histoRecs.GetHistoRecs(i).GetWeight()); + } + } + + void Add(const THistoRec& histoRec) { + Add(histoRec.GetId(), histoRec.GetValue(), histoRec.GetWeight()); + } + void Add(ui64 id, double value, double weight) { - THistogramsMap::const_iterator it = Histograms.find(id); - if (it == Histograms.end()) { + THistogramsMap::const_iterator it = Histograms.find(id); + if (it == Histograms.end()) { it = Histograms.insert(std::make_pair(id, IHistogramPtr(new TMyHistogram(Intervals, id)))).first; - } - it->second->Add(value, weight); - } - - void Multiply(double factor) { - for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) { - it->second->Multiply(factor); - } - } - + } + it->second->Add(value, weight); + } + + void Multiply(double factor) { + for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) { + it->second->Multiply(factor); + } + } + TVector<ui64> GetIds() const { TVector<ui64> result(0); for (THistogramsMap::const_iterator it = Histograms.begin(); it != Histograms.end(); ++it) { result.push_back(it->first); - } - return result; - } - + } + return result; + } + IHistogramPtr GetHistogram(ui64 id) const { - THistogramsMap::const_iterator it = Histograms.find(id); - if (it != Histograms.end()) { - return it->second; - } - return IHistogramPtr(); - } - + THistogramsMap::const_iterator it = Histograms.find(id); + if (it != Histograms.end()) { + return it->second; + } + return IHistogramPtr(); + } + double GetMaxHistoSum() const { double sum = 0.0; for (THistogramsMap::const_iterator it = Histograms.begin(); it != Histograms.end(); ++it) { @@ -86,58 +86,58 @@ namespace NKiwiAggr { return sum; } - bool Empty() { - for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) { - if (!it->second->Empty()) { - return false; - } - } - return true; - } - - virtual double OverallSum() { - double sum = 0.0; - for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) { - sum += it->second->GetSum(); - } - return sum; - } - - void FromProto(const THistograms& histograms) { - for (size_t i = 0; i < histograms.HistoRecsSize(); ++i) { - IHistogramPtr newHisto(new TMyHistogram(histograms.GetHistoRecs(i), Intervals)); - if (!newHisto->Empty()) { - Histograms[newHisto->GetId()] = newHisto; - } - } - } - - void ToProto(THistograms& histograms) { - histograms.Clear(); - for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) { - THistogram* histo = histograms.AddHistoRecs(); - it->second->ToProto(*histo); - } - } + bool Empty() { + for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) { + if (!it->second->Empty()) { + return false; + } + } + return true; + } + + virtual double OverallSum() { + double sum = 0.0; + for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) { + sum += it->second->GetSum(); + } + return sum; + } + + void FromProto(const THistograms& histograms) { + for (size_t i = 0; i < histograms.HistoRecsSize(); ++i) { + IHistogramPtr newHisto(new TMyHistogram(histograms.GetHistoRecs(i), Intervals)); + if (!newHisto->Empty()) { + Histograms[newHisto->GetId()] = newHisto; + } + } + } + + void ToProto(THistograms& histograms) { + histograms.Clear(); + for (THistogramsMap::iterator it = Histograms.begin(); it != Histograms.end(); ++it) { + THistogram* histo = histograms.AddHistoRecs(); + it->second->ToProto(*histo); + } + } void PrecomputePartialSums() { for (auto& it : Histograms) { it.second->PrecomputePartialSums(); } } - }; - - template <class TMerger, class TSomeMultiHistogram> - static void MergeToMultiHistogram(const void* data, size_t size, TSomeMultiHistogram& multiHistogram, ui32 intervals = 300) { - TMerger merger(intervals); - merger.Add(data, size); - THistograms histograms; - merger.GetResult(histograms); - multiHistogram.FromProto(histograms); - } - - // Good for parsing from THistograms protobuf - typedef TMultiHistogram<TAutoHistogram> TAutoMultiHistogram; + }; + + template <class TMerger, class TSomeMultiHistogram> + static void MergeToMultiHistogram(const void* data, size_t size, TSomeMultiHistogram& multiHistogram, ui32 intervals = 300) { + TMerger merger(intervals); + merger.Add(data, size); + THistograms histograms; + merger.GetResult(histograms); + multiHistogram.FromProto(histograms); + } + + // Good for parsing from THistograms protobuf + typedef TMultiHistogram<TAutoHistogram> TAutoMultiHistogram; typedef TAtomicSharedPtr<TAutoMultiHistogram> TAutoMultiHistogramPtr; - + } diff --git a/library/cpp/histogram/adaptive/protos/histo.proto b/library/cpp/histogram/adaptive/protos/histo.proto index 3ca752d4d7..8961fef022 100644 --- a/library/cpp/histogram/adaptive/protos/histo.proto +++ b/library/cpp/histogram/adaptive/protos/histo.proto @@ -5,32 +5,32 @@ package NKiwiAggr; message THistoRec { optional uint64 Id = 1; // Current histogram identifier optional double Value = 2; - optional double Weight = 3 [default = 1.0]; // You can set a certain weight to each record or just skip records using Weight=0 + optional double Weight = 3 [default = 1.0]; // You can set a certain weight to each record or just skip records using Weight=0 } message THistoRecs { repeated THistoRec HistoRecs = 1; } -enum EHistogramType { - HT_FIXED_BIN_HISTOGRAM = 1; - HT_ADAPTIVE_DISTANCE_HISTOGRAM = 2; - HT_ADAPTIVE_WEIGHT_HISTOGRAM = 3; - HT_ADAPTIVE_HISTOGRAM = 4; // if the quality function is unknown +enum EHistogramType { + HT_FIXED_BIN_HISTOGRAM = 1; + HT_ADAPTIVE_DISTANCE_HISTOGRAM = 2; + HT_ADAPTIVE_WEIGHT_HISTOGRAM = 3; + HT_ADAPTIVE_HISTOGRAM = 4; // if the quality function is unknown HT_ADAPTIVE_WARD_HISTOGRAM = 5; -} - +} + message THistogram { optional uint64 Id = 1; optional double MinValue = 2; - optional double BinRange = 4; // for FIXED_BIN_HISTOGRAM only. And it's OK that it is 4 after 2 - repeated float Freq = 5; - repeated float Position = 6; // for ADAPTIVE histograms only - optional double MaxValue = 7; - optional EHistogramType Type = 8; // Empty field means FIXED_BIN_HISTOGRAM + optional double BinRange = 4; // for FIXED_BIN_HISTOGRAM only. And it's OK that it is 4 after 2 + repeated float Freq = 5; + repeated float Position = 6; // for ADAPTIVE histograms only + optional double MaxValue = 7; + optional EHistogramType Type = 8; // Empty field means FIXED_BIN_HISTOGRAM } // Multihistogam message THistograms { repeated THistogram HistoRecs = 1; -} +} diff --git a/library/cpp/histogram/adaptive/protos/ya.make b/library/cpp/histogram/adaptive/protos/ya.make index b5a0c42114..7635cfcb8c 100644 --- a/library/cpp/histogram/adaptive/protos/ya.make +++ b/library/cpp/histogram/adaptive/protos/ya.make @@ -1,6 +1,6 @@ PROTO_LIBRARY() -OWNER(g:crawl) +OWNER(g:crawl) SRCS( histo.proto diff --git a/library/cpp/histogram/adaptive/ya.make b/library/cpp/histogram/adaptive/ya.make index 54c8d8af9a..b589801b27 100644 --- a/library/cpp/histogram/adaptive/ya.make +++ b/library/cpp/histogram/adaptive/ya.make @@ -7,7 +7,7 @@ OWNER( SRCS( common.cpp - adaptive_histogram.cpp + adaptive_histogram.cpp block_histogram.cpp fixed_bin_histogram.cpp ) |