aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/histogram/adaptive/block_histogram.h
blob: 266bb2f2b2fad947aadbad943d5647334e346b2f (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#pragma once

#include "histogram.h"
#include "common.h"

#include <library/cpp/histogram/adaptive/protos/histo.pb.h>

#include <util/generic/ptr.h>
#include <util/generic/vector.h>
#include <utility>

namespace NKiwiAggr {
    ///////////////////
    // TBlockHistogram
    ///////////////////

    /**
     * Contrary to adaptive histogram, block histogram doesn't rebuild bins
     * after the addition of each point. Instead, it accumulates points and in case the amount
     * of points overflows specified limits, it shrinks all the points at once to produce histogram
     * Indeed, there exist two limits and two shrinkage operations:
     * 1) FastGreedyShrink is fast but coarse. It is used to shrink from upper limit to intermediate limit
     *    (override FastGreedyShrink to set specific behaviour)
     * 2) SlowShrink is slow, but produces finer histogram. It shrinks from the intermediate limit to the
     *    actual number of bins in a manner similar to that in adaptive histogram (set CalcQuality in constuctor)
     * While FastGreedyShrink is used most of the time, SlowShrink is mostly used for histogram finalization
     */
    class TBlockHistogram: private TNonCopyable, public IHistogram {
    protected:
        static const size_t SHRINK_MULTIPLIER = 2;
        static const size_t GREEDY_SHRINK_MULTIPLIER = 4;
        static const size_t DEFAULT_INTERVALS = 100;
        static const size_t DEFAULT_SHRINK_SIZE = DEFAULT_INTERVALS * (SHRINK_MULTIPLIER + GREEDY_SHRINK_MULTIPLIER);

        const EHistogramType Type;
        const TQualityFunction CalcQuality;

        size_t Intervals;
        size_t ShrinkSize;
        size_t PrevSize;

        ui64 Id;

        double Sum;
        double MinValue;
        double MaxValue;

        TVector<TWeightedValue> Bins;

    public:
        TBlockHistogram(EHistogramType type, TQualityFunction calcQuality,
                        size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE);

        virtual ~TBlockHistogram() {
        }

        virtual void Clear();

        virtual void Add(double value, double weight);
        virtual void Add(const THistoRec& histoRec);

        virtual void Merge(const THistogram& histo, double multiplier);
        virtual void Merge(const TVector<THistogram>& histogramsToMerge);
        virtual void Merge(TVector<IHistogramPtr> histogramsToMerge); // not implemented

        virtual void Multiply(double factor);

        virtual void FromProto(const THistogram& histo);
        virtual void ToProto(THistogram& histo);

        virtual void SetId(ui64 id);
        virtual ui64 GetId();
        virtual bool Empty();
        virtual double GetMinValue();
        virtual double GetMaxValue();
        virtual double GetSum();

        // methods below are not implemented
        virtual double GetSumInRange(double leftBound, double rightBound);
        virtual double GetSumAboveBound(double bound);
        virtual double GetSumBelowBound(double bound);
        virtual double CalcUpperBound(double sum);
        virtual double CalcLowerBound(double sum);
        virtual double CalcUpperBoundSafe(double sum);
        virtual double CalcLowerBoundSafe(double sum);

    private:
        void SortBins();
        void UniquifyBins();
        void CorrectShrinkSize();

        void SortAndShrink(size_t intervals, bool final = false);

        void SlowShrink(size_t intervals);
        virtual void FastGreedyShrink(size_t intervals) = 0;
    };

    /////////////////////////
    // TBlockWeightHistogram
    /////////////////////////

    class TBlockWeightHistogram: public TBlockHistogram {
    public:
        TBlockWeightHistogram(size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE);

        virtual ~TBlockWeightHistogram() {
        }

    private:
        virtual void FastGreedyShrink(size_t intervals) final;
    };

    ///////////////////////
    // TBlockWardHistogram
    ///////////////////////

    class TBlockWardHistogram: public TBlockHistogram {
    public:
        TBlockWardHistogram(size_t intervals, ui64 id = 0, size_t shrinkSize = DEFAULT_SHRINK_SIZE);

        virtual ~TBlockWardHistogram() {
        }

    private:
        using TCumulative = std::pair<double, double>; // cumulative sum of (weights, weighted centers)
        using TCumulatives = TVector<TCumulative>;

        struct TSplitInfo {
            double profit;

            TCumulatives::const_iterator beg;
            TCumulatives::const_iterator mid;
            TCumulatives::const_iterator end;

            bool operator<(const TSplitInfo& other) const {
                return profit < other.profit;
            }
        };

    private:
        virtual void FastGreedyShrink(size_t intervals) final;

        static bool CalcSplitInfo(const TCumulatives::const_iterator beg,
                                  const TCumulatives::const_iterator end,
                                  TSplitInfo& splitInfo);
    };

}