aboutsummaryrefslogtreecommitdiffstats
path: root/yql/essentials/core/minsketch/count_min_sketch.cpp
blob: 4d4e6c879e7d551e9f30e5eeb706d906163eba5b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#include "count_min_sketch.h"

#include <util/system/compiler.h>

namespace NKikimr {

TCountMinSketch* TCountMinSketch::Create(ui64 width, ui64 depth) {
    auto size = StaticSize(width, depth);
    auto* data = ::malloc(size);
    auto* sketch = reinterpret_cast<TCountMinSketch*>(data);
    std::memset(sketch, 0, size);
    sketch->Width = width;
    sketch->Depth = depth;
    sketch->ElementCount = 0;
    return sketch;
}

TCountMinSketch* TCountMinSketch::FromString(const char* data, size_t size) {
    auto* from = reinterpret_cast<const TCountMinSketch*>(data);
    Y_ABORT_UNLESS(StaticSize(from->Width, from->Depth) == size);
    auto* dataDst = ::malloc(size);
    std::memcpy(dataDst, data, size);
    return reinterpret_cast<TCountMinSketch*>(dataDst);
}

void TCountMinSketch::operator delete(void* data) noexcept {
    ::free(data);
}

ui64 TCountMinSketch::Hash(const char* data, size_t size, size_t hashIndex) {
    // fnv1a
    ui64 hash = 14695981039346656037ULL + 31 * hashIndex;
    const unsigned char* ptr = (const unsigned char*)data;
    for (size_t i = 0; i < size; ++i, ++ptr) {
        hash = hash ^ (*ptr);
        hash = hash * 1099511628211ULL;
    }
    return hash;
}

void TCountMinSketch::Count(const char* data, size_t size) {
    ui32* start = Buckets();
    for (size_t d = 0; d < Depth; ++d, start += Width) {
        ui64 hash = Hash(data, size, d);
        ui32* bucket = start + hash % Width;
        if (Y_LIKELY(*bucket < std::numeric_limits<ui32>::max())) {
            ++*bucket;
        }
    }
    ++ElementCount;
}

ui32 TCountMinSketch::Probe(const char* data, size_t size) const {
    ui32 minValue = std::numeric_limits<ui32>::max();
    const ui32* start = Buckets();
    for (size_t d = 0; d < Depth; ++d, start += Width) {
        ui64 hash = Hash(data, size, d);
        const ui32* bucket = start + hash % Width;
        minValue = std::min(minValue, *bucket);
    }
    return minValue;
}

TCountMinSketch& TCountMinSketch::operator+=(const TCountMinSketch& rhs) {
    if (Width != rhs.Width || Depth != rhs.Depth) {
        return *this;
    }
    ui32* dst = Buckets();
    const ui32* src = rhs.Buckets();
    ui32* end = dst + Width * Depth;
    for (; dst != end; ++dst, ++src) {
        ui32 sum = *dst + *src;
        if (Y_UNLIKELY(sum < *dst)) {
            *dst = std::numeric_limits<ui32>::max();
        } else {
            *dst = sum;
        }
    }
    ElementCount += rhs.ElementCount;
    return *this;
}

} // namespace NKikimr