summaryrefslogtreecommitdiffstats
path: root/yt/yt/core/misc/unittests/hyperloglog_ut.cpp
blob: 188b4ec24fa1c5b1dd5fdeaf5229cddc27b3cb10 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#include <yt/yt/core/test_framework/framework.h>

#include <yt/yt/core/misc/hyperloglog.h>
#include <yt/yt/core/misc/random.h>

#include <library/cpp/yt/farmhash/farm_hash.h>

namespace NYT {
namespace {

////////////////////////////////////////////////////////////////////////////////

class THyperLogLogTest
    : public ::testing::Test
    , public ::testing::WithParamInterface<std::tuple<int, int, int, int>>
{ };

std::pair<THyperLogLog<8>, int> GenerateHyperLogLog(
    TRandomGenerator& rng,
    int size,
    int targetCardinality)
{
    auto hll = THyperLogLog<8>();

    int cardinality = 1;
    ui64 n = 0;
    hll.Add(FarmFingerprint(n));
    for (int i = 0; i < size; i++) {
        if (static_cast<int>(rng.Generate<ui64>() % size) < targetCardinality) {
            cardinality++;
            n += 1 + (rng.Generate<ui32>() % 100);
        }

        hll.Add(FarmFingerprint(n));
    }

    return std::pair(hll, cardinality);
}

TEST_P(THyperLogLogTest, Random)
{
    auto param = GetParam();
    auto seed = std::get<0>(param);
    auto size = std::get<1>(param);
    auto targetCardinality = std::get<2>(param);
    auto iterations = std::get<3>(param);
    auto rng = TRandomGenerator(seed);

    auto error = 0.0;

    for (int i = 0; i < iterations; i++) {
        auto hll = GenerateHyperLogLog(
            rng,
            size,
            targetCardinality);
        auto err = ((double)hll.first.EstimateCardinality() - hll.second) / hll.second;
        error += err;
    }

    auto meanError = error / iterations;

    EXPECT_NEAR(meanError, 0, 0.05);
}

INSTANTIATE_TEST_SUITE_P(
    HyperLogLogTest,
    THyperLogLogTest,
    ::testing::Values(
        std::tuple<int, int, int, int>(123, 100000, 200, 10),
        std::tuple<int, int, int, int>(324, 100000, 1000, 10),
        std::tuple<int, int, int, int>(653, 100000, 5000, 10),
        std::tuple<int, int, int, int>(890, 100000, 10000, 10),
        std::tuple<int, int, int, int>(278, 100000, 50000, 10)));

////////////////////////////////////////////////////////////////////////////////

} // namespace
} // namespace NYT