aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/hyperloglog/hyperloglog.cpp
diff options
context:
space:
mode:
authorvitamin-ca <vitamin-ca@yandex-team.ru>2022-02-10 16:50:46 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:50:46 +0300
commit33975e98548306c90ccdc156bc436408a213be00 (patch)
treef3f70f93263e848986d3f52e04e4e9a980e224b0 /library/cpp/hyperloglog/hyperloglog.cpp
parenta175286682787b2d1213734c5be7458aaf594c1c (diff)
downloadydb-33975e98548306c90ccdc156bc436408a213be00.tar.gz
Restoring authorship annotation for <vitamin-ca@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/hyperloglog/hyperloglog.cpp')
-rw-r--r--library/cpp/hyperloglog/hyperloglog.cpp178
1 files changed, 89 insertions, 89 deletions
diff --git a/library/cpp/hyperloglog/hyperloglog.cpp b/library/cpp/hyperloglog/hyperloglog.cpp
index ec8352abe1..df4637a7ee 100644
--- a/library/cpp/hyperloglog/hyperloglog.cpp
+++ b/library/cpp/hyperloglog/hyperloglog.cpp
@@ -1,55 +1,55 @@
#include "hyperloglog.h"
-
-#include <util/generic/bitops.h>
-#include <util/generic/yexception.h>
-#include <util/stream/output.h>
-
-#include <algorithm>
-#include <array>
-#include <cmath>
-#include <functional>
-
-namespace {
- using TLookup = std::array<double, 256>;
-
- struct TCorrection {
- TLookup Estimations;
- TLookup Biases;
-
- double GetBias(double e) const {
- for (size_t idx = 0;; ++idx) {
- const auto estr = Estimations[idx];
- if (estr >= e) {
- if (idx == 0) {
- return Biases[0];
- }
- const auto estl = Estimations[idx - 1];
- const auto biasl = Biases[idx - 1];
- const auto biasr = Biases[idx];
- const auto de = estr - estl;
- const auto db = biasr - biasl;
- const auto scale = e - estl;
- return biasl + scale * db / de;
- } else if (std::fabs(estr) < 1e-4) {
- //limiter
- return Biases[idx - 1];
- }
- }
- }
- };
-
- double EstimateBias(double e, unsigned precision) {
+
+#include <util/generic/bitops.h>
+#include <util/generic/yexception.h>
+#include <util/stream/output.h>
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <functional>
+
+namespace {
+ using TLookup = std::array<double, 256>;
+
+ struct TCorrection {
+ TLookup Estimations;
+ TLookup Biases;
+
+ double GetBias(double e) const {
+ for (size_t idx = 0;; ++idx) {
+ const auto estr = Estimations[idx];
+ if (estr >= e) {
+ if (idx == 0) {
+ return Biases[0];
+ }
+ const auto estl = Estimations[idx - 1];
+ const auto biasl = Biases[idx - 1];
+ const auto biasr = Biases[idx];
+ const auto de = estr - estl;
+ const auto db = biasr - biasl;
+ const auto scale = e - estl;
+ return biasl + scale * db / de;
+ } else if (std::fabs(estr) < 1e-4) {
+ //limiter
+ return Biases[idx - 1];
+ }
+ }
+ }
+ };
+
+ double EstimateBias(double e, unsigned precision) {
static const TCorrection CORRECTIONS[1 + THyperLogLog::PRECISION_MAX - THyperLogLog::PRECISION_MIN] = {
#include "hyperloglog_corrections.inc"
- };
+ };
if (precision < THyperLogLog::PRECISION_MIN || precision > THyperLogLog::PRECISION_MAX) {
- return 0.;
- }
-
+ return 0.;
+ }
+
return CORRECTIONS[precision - THyperLogLog::PRECISION_MIN].GetBias(e);
- }
-
- double GetThreshold(unsigned precision) {
+ }
+
+ double GetThreshold(unsigned precision) {
static const double THRESHOLD_DATA[1 + THyperLogLog::PRECISION_MAX - THyperLogLog::PRECISION_MIN] = {
10, // Precision 4
20, // Precision 5
@@ -66,16 +66,16 @@ namespace {
50000, // Precision 16
120000, // Precision 17
350000 // Precision 18
- };
+ };
if (precision < THyperLogLog::PRECISION_MIN || precision > THyperLogLog::PRECISION_MAX) {
- return 0.;
- }
-
+ return 0.;
+ }
+
return THRESHOLD_DATA[precision - THyperLogLog::PRECISION_MIN];
- }
-
- double EmpiricAlpha(size_t m) {
- switch (m) {
+ }
+
+ double EmpiricAlpha(size_t m) {
+ switch (m) {
case 16:
return 0.673;
case 32:
@@ -84,54 +84,54 @@ namespace {
return 0.709;
default:
return 0.7213 / (1.0 + 1.079 / m);
- }
- }
-
+ }
+ }
+
double RawEstimate(const ui8* counts, size_t size) {
- double sum = {};
+ double sum = {};
for (size_t i = 0; i < size; ++i) {
sum += std::pow(2.0, -counts[i]);
- }
+ }
return EmpiricAlpha(size) * size * size / sum;
- }
-
- double LinearCounting(size_t registers, size_t zeroed) {
- return std::log(double(registers) / zeroed) * registers;
- }
-}
-
+ }
+
+ double LinearCounting(size_t registers, size_t zeroed) {
+ return std::log(double(registers) / zeroed) * registers;
+ }
+}
+
THyperLogLogBase::THyperLogLogBase(unsigned precision)
: Precision(precision) {
- Y_ENSURE(precision >= PRECISION_MIN && precision <= PRECISION_MAX);
-}
-
+ Y_ENSURE(precision >= PRECISION_MIN && precision <= PRECISION_MAX);
+}
+
void THyperLogLogBase::Update(ui64 hash) {
- const unsigned subHashBits = 8 * sizeof(hash) - Precision;
- const auto subHash = hash & MaskLowerBits(subHashBits);
- const auto leadingZeroes = subHash ? (subHashBits - GetValueBitCount(subHash)) : subHashBits;
+ const unsigned subHashBits = 8 * sizeof(hash) - Precision;
+ const auto subHash = hash & MaskLowerBits(subHashBits);
+ const auto leadingZeroes = subHash ? (subHashBits - GetValueBitCount(subHash)) : subHashBits;
const ui8 weight = static_cast<ui8>(leadingZeroes + 1);
-
- const size_t reg = static_cast<size_t>(hash >> subHashBits);
+
+ const size_t reg = static_cast<size_t>(hash >> subHashBits);
RegistersRef[reg] = std::max(RegistersRef[reg], weight);
-}
-
+}
+
void THyperLogLogBase::Merge(const THyperLogLogBase& rh) {
- Y_ENSURE(Precision == rh.Precision);
-
+ Y_ENSURE(Precision == rh.Precision);
+
std::transform(RegistersRef.begin(), RegistersRef.end(), rh.RegistersRef.begin(), RegistersRef.begin(), [](ui8 l, ui8 r) { return std::max(l, r); });
-}
-
+}
+
ui64 THyperLogLogBase::Estimate() const {
const auto m = RegistersRef.size();
const auto e = RawEstimate(RegistersRef.data(), m);
-
- const auto e_ = e <= 5 * m ? (e - EstimateBias(e, Precision)) : e;
+
+ const auto e_ = e <= 5 * m ? (e - EstimateBias(e, Precision)) : e;
const auto v = std::count(RegistersRef.begin(), RegistersRef.end(), ui8(0));
- const auto h = v != 0 ? LinearCounting(m, v) : e_;
- return h <= GetThreshold(Precision) ? h : e_;
-}
-
+ const auto h = v != 0 ? LinearCounting(m, v) : e_;
+ return h <= GetThreshold(Precision) ? h : e_;
+}
+
void THyperLogLogBase::Save(IOutputStream& out) const {
- out.Write(static_cast<char>(Precision));
+ out.Write(static_cast<char>(Precision));
out.Write(RegistersRef.data(), RegistersRef.size() * sizeof(RegistersRef.front()));
-}
+}