aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/hyperloglog/hyperloglog.cpp
blob: c162fad798e51c0d40b507d2184be8a2f2e7dcde (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#include "hyperloglog.h"

#include <util/generic/bitops.h>
#include <util/generic/yexception.h>
#include <util/stream/output.h>

#include <algorithm>
#include <array>
#include <cmath>
#include <functional>

namespace {
    using TLookup = std::array<double, 256>;

    struct TCorrection {
        TLookup Estimations;
        TLookup Biases;

        double GetBias(double e) const {
            for (size_t idx = 0;; ++idx) {
                const auto estr = Estimations[idx];
                if (estr >= e) {
                    if (idx == 0) {
                        return Biases[0];
                    }
                    const auto estl = Estimations[idx - 1];
                    const auto biasl = Biases[idx - 1];
                    const auto biasr = Biases[idx];
                    const auto de = estr - estl;
                    const auto db = biasr - biasl;
                    const auto scale = e - estl;
                    return biasl + scale * db / de;
                } else if (std::fabs(estr) < 1e-4) {
                    //limiter
                    return Biases[idx - 1];
                }
            }
        }
    };

    double EstimateBias(double e, unsigned precision) {
        static const TCorrection CORRECTIONS[1 + THyperLogLog::PRECISION_MAX - THyperLogLog::PRECISION_MIN] = {
#include "hyperloglog_corrections.inc" 
        };
        if (precision < THyperLogLog::PRECISION_MIN || precision > THyperLogLog::PRECISION_MAX) {
            return 0.;
        }

        return CORRECTIONS[precision - THyperLogLog::PRECISION_MIN].GetBias(e);
    }

    double GetThreshold(unsigned precision) {
        static const double THRESHOLD_DATA[1 + THyperLogLog::PRECISION_MAX - THyperLogLog::PRECISION_MIN] = {
            10,     // Precision  4 
            20,     // Precision  5 
            40,     // Precision  6 
            80,     // Precision  7 
            220,    // Precision  8 
            400,    // Precision  9 
            900,    // Precision 10 
            1800,   // Precision 11 
            3100,   // Precision 12 
            6500,   // Precision 13 
            11500,  // Precision 14 
            20000,  // Precision 15 
            50000,  // Precision 16 
            120000, // Precision 17 
            350000  // Precision 18 
        };
        if (precision < THyperLogLog::PRECISION_MIN || precision > THyperLogLog::PRECISION_MAX) {
            return 0.;
        }

        return THRESHOLD_DATA[precision - THyperLogLog::PRECISION_MIN];
    }

    double EmpiricAlpha(size_t m) {
        switch (m) {
            case 16: 
                return 0.673; 
            case 32: 
                return 0.697; 
            case 64: 
                return 0.709; 
            default: 
                return 0.7213 / (1.0 + 1.079 / m); 
        }
    }

    double RawEstimate(const ui8* counts, size_t size) {
        double sum = {};
        for (size_t i = 0; i < size; ++i) {
            sum += std::pow(2.0, -counts[i]);
        }
        return EmpiricAlpha(size) * size * size / sum;
    }

    double LinearCounting(size_t registers, size_t zeroed) {
        return std::log(double(registers) / zeroed) * registers;
    }
}

THyperLogLogBase::THyperLogLogBase(unsigned precision)
    : Precision(precision) {
    Y_ENSURE(precision >= PRECISION_MIN && precision <= PRECISION_MAX);
}

void THyperLogLogBase::Update(ui64 hash) {
    const unsigned subHashBits = 8 * sizeof(hash) - Precision;
    const auto subHash = hash & MaskLowerBits(subHashBits);
    const auto leadingZeroes = subHash ? (subHashBits - GetValueBitCount(subHash)) : subHashBits;
    const ui8 weight = static_cast<ui8>(leadingZeroes + 1); 

    const size_t reg = static_cast<size_t>(hash >> subHashBits);
    RegistersRef[reg] = std::max(RegistersRef[reg], weight);
}

void THyperLogLogBase::Merge(const THyperLogLogBase& rh) {
    Y_ENSURE(Precision == rh.Precision);

    std::transform(RegistersRef.begin(), RegistersRef.end(), rh.RegistersRef.begin(), RegistersRef.begin(), [](ui8 l, ui8 r) { return std::max(l, r); });
}

ui64 THyperLogLogBase::Estimate() const {
    const auto m = RegistersRef.size();
    const auto e = RawEstimate(RegistersRef.data(), m);

    const auto e_ = e <= 5 * m ? (e - EstimateBias(e, Precision)) : e;
    const auto v = std::count(RegistersRef.begin(), RegistersRef.end(), ui8(0));
    const auto h = v != 0 ? LinearCounting(m, v) : e_;
    return h <= GetThreshold(Precision) ? h : e_;
}

void THyperLogLogBase::Save(IOutputStream& out) const {
    out.Write(static_cast<char>(Precision));
    out.Write(RegistersRef.data(), RegistersRef.size() * sizeof(RegistersRef.front()));
}