diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/histogram/adaptive/block_histogram.h | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/histogram/adaptive/block_histogram.h')
-rw-r--r-- | library/cpp/histogram/adaptive/block_histogram.h | 148 |
1 files changed, 148 insertions, 0 deletions
diff --git a/library/cpp/histogram/adaptive/block_histogram.h b/library/cpp/histogram/adaptive/block_histogram.h new file mode 100644 index 0000000000..266bb2f2b2 --- /dev/null +++ b/library/cpp/histogram/adaptive/block_histogram.h @@ -0,0 +1,148 @@ +#pragma once + +#include "histogram.h" +#include "common.h" + +#include <library/cpp/histogram/adaptive/protos/histo.pb.h> + +#include <util/generic/ptr.h> +#include <util/generic/vector.h> +#include <utility> + +namespace NKiwiAggr { + /////////////////// + // TBlockHistogram + /////////////////// + + /** + * Contrary to adaptive histogram, block histogram doesn't rebuild bins + * after the addition of each point. Instead, it accumulates points and in case the amount + * of points overflows specified limits, it shrinks all the points at once to produce histogram + * Indeed, there exist two limits and two shrinkage operations: + * 1) FastGreedyShrink is fast but coarse. It is used to shrink from upper limit to intermediate limit + * (override FastGreedyShrink to set specific behaviour) + * 2) SlowShrink is slow, but produces finer histogram. It shrinks from the intermediate limit to the + * actual number of bins in a manner similar to that in adaptive histogram (set CalcQuality in constuctor) + * While FastGreedyShrink is used most of the time, SlowShrink is mostly used for histogram finalization + */ + class TBlockHistogram: private TNonCopyable, public IHistogram { + protected: + static const size_t SHRINK_MULTIPLIER = 2; + static const size_t GREEDY_SHRINK_MULTIPLIER = 4; + static const size_t DEFAULT_INTERVALS = 100; + static const size_t DEFAULT_SHRINK_SIZE = DEFAULT_INTERVALS * (SHRINK_MULTIPLIER + GREEDY_SHRINK_MULTIPLIER); + + const EHistogramType Type; + const TQualityFunction CalcQuality; + + size_t Intervals; + size_t ShrinkSize; + size_t PrevSize; + + ui64 Id; + + double Sum; + double MinValue; + double MaxValue; + + TVector<TWeightedValue> Bins; + + public: + TBlockHistogram(EHistogramType type, TQualityFunction calcQuality, + size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE); + + virtual ~TBlockHistogram() { + } + + virtual void Clear(); + + virtual void Add(double value, double weight); + virtual void Add(const THistoRec& histoRec); + + virtual void Merge(const THistogram& histo, double multiplier); + virtual void Merge(const TVector<THistogram>& histogramsToMerge); + virtual void Merge(TVector<IHistogramPtr> histogramsToMerge); // not implemented + + virtual void Multiply(double factor); + + virtual void FromProto(const THistogram& histo); + virtual void ToProto(THistogram& histo); + + virtual void SetId(ui64 id); + virtual ui64 GetId(); + virtual bool Empty(); + virtual double GetMinValue(); + virtual double GetMaxValue(); + virtual double GetSum(); + + // methods below are not implemented + virtual double GetSumInRange(double leftBound, double rightBound); + virtual double GetSumAboveBound(double bound); + virtual double GetSumBelowBound(double bound); + virtual double CalcUpperBound(double sum); + virtual double CalcLowerBound(double sum); + virtual double CalcUpperBoundSafe(double sum); + virtual double CalcLowerBoundSafe(double sum); + + private: + void SortBins(); + void UniquifyBins(); + void CorrectShrinkSize(); + + void SortAndShrink(size_t intervals, bool final = false); + + void SlowShrink(size_t intervals); + virtual void FastGreedyShrink(size_t intervals) = 0; + }; + + ///////////////////////// + // TBlockWeightHistogram + ///////////////////////// + + class TBlockWeightHistogram: public TBlockHistogram { + public: + TBlockWeightHistogram(size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE); + + virtual ~TBlockWeightHistogram() { + } + + private: + virtual void FastGreedyShrink(size_t intervals) final; + }; + + /////////////////////// + // TBlockWardHistogram + /////////////////////// + + class TBlockWardHistogram: public TBlockHistogram { + public: + TBlockWardHistogram(size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE); + + virtual ~TBlockWardHistogram() { + } + + private: + using TCumulative = std::pair<double, double>; // cumulative sum of (weights, weighted centers) + using TCumulatives = TVector<TCumulative>; + + struct TSplitInfo { + double profit; + + TCumulatives::const_iterator beg; + TCumulatives::const_iterator mid; + TCumulatives::const_iterator end; + + bool operator<(const TSplitInfo& other) const { + return profit < other.profit; + } + }; + + private: + virtual void FastGreedyShrink(size_t intervals) final; + + static bool CalcSplitInfo(const TCumulatives::const_iterator beg, + const TCumulatives::const_iterator end, + TSplitInfo& splitInfo); + }; + +} |