aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/histogram/adaptive/block_histogram.h
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/histogram/adaptive/block_histogram.h
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/histogram/adaptive/block_histogram.h')
-rw-r--r--library/cpp/histogram/adaptive/block_histogram.h148
1 files changed, 148 insertions, 0 deletions
diff --git a/library/cpp/histogram/adaptive/block_histogram.h b/library/cpp/histogram/adaptive/block_histogram.h
new file mode 100644
index 0000000000..266bb2f2b2
--- /dev/null
+++ b/library/cpp/histogram/adaptive/block_histogram.h
@@ -0,0 +1,148 @@
+#pragma once
+
+#include "histogram.h"
+#include "common.h"
+
+#include <library/cpp/histogram/adaptive/protos/histo.pb.h>
+
+#include <util/generic/ptr.h>
+#include <util/generic/vector.h>
+#include <utility>
+
+namespace NKiwiAggr {
+ ///////////////////
+ // TBlockHistogram
+ ///////////////////
+
+ /**
+ * Contrary to adaptive histogram, block histogram doesn't rebuild bins
+ * after the addition of each point. Instead, it accumulates points and in case the amount
+ * of points overflows specified limits, it shrinks all the points at once to produce histogram
+ * Indeed, there exist two limits and two shrinkage operations:
+ * 1) FastGreedyShrink is fast but coarse. It is used to shrink from upper limit to intermediate limit
+ * (override FastGreedyShrink to set specific behaviour)
+ * 2) SlowShrink is slow, but produces finer histogram. It shrinks from the intermediate limit to the
+ * actual number of bins in a manner similar to that in adaptive histogram (set CalcQuality in constuctor)
+ * While FastGreedyShrink is used most of the time, SlowShrink is mostly used for histogram finalization
+ */
+ class TBlockHistogram: private TNonCopyable, public IHistogram {
+ protected:
+ static const size_t SHRINK_MULTIPLIER = 2;
+ static const size_t GREEDY_SHRINK_MULTIPLIER = 4;
+ static const size_t DEFAULT_INTERVALS = 100;
+ static const size_t DEFAULT_SHRINK_SIZE = DEFAULT_INTERVALS * (SHRINK_MULTIPLIER + GREEDY_SHRINK_MULTIPLIER);
+
+ const EHistogramType Type;
+ const TQualityFunction CalcQuality;
+
+ size_t Intervals;
+ size_t ShrinkSize;
+ size_t PrevSize;
+
+ ui64 Id;
+
+ double Sum;
+ double MinValue;
+ double MaxValue;
+
+ TVector<TWeightedValue> Bins;
+
+ public:
+ TBlockHistogram(EHistogramType type, TQualityFunction calcQuality,
+ size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE);
+
+ virtual ~TBlockHistogram() {
+ }
+
+ virtual void Clear();
+
+ virtual void Add(double value, double weight);
+ virtual void Add(const THistoRec& histoRec);
+
+ virtual void Merge(const THistogram& histo, double multiplier);
+ virtual void Merge(const TVector<THistogram>& histogramsToMerge);
+ virtual void Merge(TVector<IHistogramPtr> histogramsToMerge); // not implemented
+
+ virtual void Multiply(double factor);
+
+ virtual void FromProto(const THistogram& histo);
+ virtual void ToProto(THistogram& histo);
+
+ virtual void SetId(ui64 id);
+ virtual ui64 GetId();
+ virtual bool Empty();
+ virtual double GetMinValue();
+ virtual double GetMaxValue();
+ virtual double GetSum();
+
+ // methods below are not implemented
+ virtual double GetSumInRange(double leftBound, double rightBound);
+ virtual double GetSumAboveBound(double bound);
+ virtual double GetSumBelowBound(double bound);
+ virtual double CalcUpperBound(double sum);
+ virtual double CalcLowerBound(double sum);
+ virtual double CalcUpperBoundSafe(double sum);
+ virtual double CalcLowerBoundSafe(double sum);
+
+ private:
+ void SortBins();
+ void UniquifyBins();
+ void CorrectShrinkSize();
+
+ void SortAndShrink(size_t intervals, bool final = false);
+
+ void SlowShrink(size_t intervals);
+ virtual void FastGreedyShrink(size_t intervals) = 0;
+ };
+
+ /////////////////////////
+ // TBlockWeightHistogram
+ /////////////////////////
+
+ class TBlockWeightHistogram: public TBlockHistogram {
+ public:
+ TBlockWeightHistogram(size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE);
+
+ virtual ~TBlockWeightHistogram() {
+ }
+
+ private:
+ virtual void FastGreedyShrink(size_t intervals) final;
+ };
+
+ ///////////////////////
+ // TBlockWardHistogram
+ ///////////////////////
+
+ class TBlockWardHistogram: public TBlockHistogram {
+ public:
+ TBlockWardHistogram(size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE);
+
+ virtual ~TBlockWardHistogram() {
+ }
+
+ private:
+ using TCumulative = std::pair<double, double>; // cumulative sum of (weights, weighted centers)
+ using TCumulatives = TVector<TCumulative>;
+
+ struct TSplitInfo {
+ double profit;
+
+ TCumulatives::const_iterator beg;
+ TCumulatives::const_iterator mid;
+ TCumulatives::const_iterator end;
+
+ bool operator<(const TSplitInfo& other) const {
+ return profit < other.profit;
+ }
+ };
+
+ private:
+ virtual void FastGreedyShrink(size_t intervals) final;
+
+ static bool CalcSplitInfo(const TCumulatives::const_iterator beg,
+ const TCumulatives::const_iterator end,
+ TSplitInfo& splitInfo);
+ };
+
+}