diff options
author | vitamin-ca <vitamin-ca@yandex-team.ru> | 2022-02-10 16:50:46 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:50:46 +0300 |
commit | 33975e98548306c90ccdc156bc436408a213be00 (patch) | |
tree | f3f70f93263e848986d3f52e04e4e9a980e224b0 /library/cpp/hyperloglog/hyperloglog.cpp | |
parent | a175286682787b2d1213734c5be7458aaf594c1c (diff) | |
download | ydb-33975e98548306c90ccdc156bc436408a213be00.tar.gz |
Restoring authorship annotation for <vitamin-ca@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/hyperloglog/hyperloglog.cpp')
-rw-r--r-- | library/cpp/hyperloglog/hyperloglog.cpp | 178 |
1 files changed, 89 insertions, 89 deletions
diff --git a/library/cpp/hyperloglog/hyperloglog.cpp b/library/cpp/hyperloglog/hyperloglog.cpp index ec8352abe1..df4637a7ee 100644 --- a/library/cpp/hyperloglog/hyperloglog.cpp +++ b/library/cpp/hyperloglog/hyperloglog.cpp @@ -1,55 +1,55 @@ #include "hyperloglog.h" - -#include <util/generic/bitops.h> -#include <util/generic/yexception.h> -#include <util/stream/output.h> - -#include <algorithm> -#include <array> -#include <cmath> -#include <functional> - -namespace { - using TLookup = std::array<double, 256>; - - struct TCorrection { - TLookup Estimations; - TLookup Biases; - - double GetBias(double e) const { - for (size_t idx = 0;; ++idx) { - const auto estr = Estimations[idx]; - if (estr >= e) { - if (idx == 0) { - return Biases[0]; - } - const auto estl = Estimations[idx - 1]; - const auto biasl = Biases[idx - 1]; - const auto biasr = Biases[idx]; - const auto de = estr - estl; - const auto db = biasr - biasl; - const auto scale = e - estl; - return biasl + scale * db / de; - } else if (std::fabs(estr) < 1e-4) { - //limiter - return Biases[idx - 1]; - } - } - } - }; - - double EstimateBias(double e, unsigned precision) { + +#include <util/generic/bitops.h> +#include <util/generic/yexception.h> +#include <util/stream/output.h> + +#include <algorithm> +#include <array> +#include <cmath> +#include <functional> + +namespace { + using TLookup = std::array<double, 256>; + + struct TCorrection { + TLookup Estimations; + TLookup Biases; + + double GetBias(double e) const { + for (size_t idx = 0;; ++idx) { + const auto estr = Estimations[idx]; + if (estr >= e) { + if (idx == 0) { + return Biases[0]; + } + const auto estl = Estimations[idx - 1]; + const auto biasl = Biases[idx - 1]; + const auto biasr = Biases[idx]; + const auto de = estr - estl; + const auto db = biasr - biasl; + const auto scale = e - estl; + return biasl + scale * db / de; + } else if (std::fabs(estr) < 1e-4) { + //limiter + return Biases[idx - 1]; + } + } + } + }; + + double EstimateBias(double e, unsigned precision) { static const TCorrection CORRECTIONS[1 + THyperLogLog::PRECISION_MAX - THyperLogLog::PRECISION_MIN] = { #include "hyperloglog_corrections.inc" - }; + }; if (precision < THyperLogLog::PRECISION_MIN || precision > THyperLogLog::PRECISION_MAX) { - return 0.; - } - + return 0.; + } + return CORRECTIONS[precision - THyperLogLog::PRECISION_MIN].GetBias(e); - } - - double GetThreshold(unsigned precision) { + } + + double GetThreshold(unsigned precision) { static const double THRESHOLD_DATA[1 + THyperLogLog::PRECISION_MAX - THyperLogLog::PRECISION_MIN] = { 10, // Precision 4 20, // Precision 5 @@ -66,16 +66,16 @@ namespace { 50000, // Precision 16 120000, // Precision 17 350000 // Precision 18 - }; + }; if (precision < THyperLogLog::PRECISION_MIN || precision > THyperLogLog::PRECISION_MAX) { - return 0.; - } - + return 0.; + } + return THRESHOLD_DATA[precision - THyperLogLog::PRECISION_MIN]; - } - - double EmpiricAlpha(size_t m) { - switch (m) { + } + + double EmpiricAlpha(size_t m) { + switch (m) { case 16: return 0.673; case 32: @@ -84,54 +84,54 @@ namespace { return 0.709; default: return 0.7213 / (1.0 + 1.079 / m); - } - } - + } + } + double RawEstimate(const ui8* counts, size_t size) { - double sum = {}; + double sum = {}; for (size_t i = 0; i < size; ++i) { sum += std::pow(2.0, -counts[i]); - } + } return EmpiricAlpha(size) * size * size / sum; - } - - double LinearCounting(size_t registers, size_t zeroed) { - return std::log(double(registers) / zeroed) * registers; - } -} - + } + + double LinearCounting(size_t registers, size_t zeroed) { + return std::log(double(registers) / zeroed) * registers; + } +} + THyperLogLogBase::THyperLogLogBase(unsigned precision) : Precision(precision) { - Y_ENSURE(precision >= PRECISION_MIN && precision <= PRECISION_MAX); -} - + Y_ENSURE(precision >= PRECISION_MIN && precision <= PRECISION_MAX); +} + void THyperLogLogBase::Update(ui64 hash) { - const unsigned subHashBits = 8 * sizeof(hash) - Precision; - const auto subHash = hash & MaskLowerBits(subHashBits); - const auto leadingZeroes = subHash ? (subHashBits - GetValueBitCount(subHash)) : subHashBits; + const unsigned subHashBits = 8 * sizeof(hash) - Precision; + const auto subHash = hash & MaskLowerBits(subHashBits); + const auto leadingZeroes = subHash ? (subHashBits - GetValueBitCount(subHash)) : subHashBits; const ui8 weight = static_cast<ui8>(leadingZeroes + 1); - - const size_t reg = static_cast<size_t>(hash >> subHashBits); + + const size_t reg = static_cast<size_t>(hash >> subHashBits); RegistersRef[reg] = std::max(RegistersRef[reg], weight); -} - +} + void THyperLogLogBase::Merge(const THyperLogLogBase& rh) { - Y_ENSURE(Precision == rh.Precision); - + Y_ENSURE(Precision == rh.Precision); + std::transform(RegistersRef.begin(), RegistersRef.end(), rh.RegistersRef.begin(), RegistersRef.begin(), [](ui8 l, ui8 r) { return std::max(l, r); }); -} - +} + ui64 THyperLogLogBase::Estimate() const { const auto m = RegistersRef.size(); const auto e = RawEstimate(RegistersRef.data(), m); - - const auto e_ = e <= 5 * m ? (e - EstimateBias(e, Precision)) : e; + + const auto e_ = e <= 5 * m ? (e - EstimateBias(e, Precision)) : e; const auto v = std::count(RegistersRef.begin(), RegistersRef.end(), ui8(0)); - const auto h = v != 0 ? LinearCounting(m, v) : e_; - return h <= GetThreshold(Precision) ? h : e_; -} - + const auto h = v != 0 ? LinearCounting(m, v) : e_; + return h <= GetThreshold(Precision) ? h : e_; +} + void THyperLogLogBase::Save(IOutputStream& out) const { - out.Write(static_cast<char>(Precision)); + out.Write(static_cast<char>(Precision)); out.Write(RegistersRef.data(), RegistersRef.size() * sizeof(RegistersRef.front())); -} +} |