aboutsummaryrefslogtreecommitdiffstats
path: root/library
diff options
context:
space:
mode:
authorifsmirnov <ifsmirnov@yandex-team.com>2022-12-26 17:41:10 +0300
committerifsmirnov <ifsmirnov@yandex-team.com>2022-12-26 17:41:10 +0300
commit99cb30abce005e4f2073b737ca09b88da18c687f (patch)
treecaca972cc53bf008442714985b16d017609f3fd2 /library
parent3f10cf68f6146c9a0aa13a36ea8d7e05bc8f3725 (diff)
downloadydb-99cb30abce005e4f2073b737ca09b88da18c687f.tar.gz
Row digests for store compactor
Diffstat (limited to 'library')
-rw-r--r--library/cpp/tdigest/tdigest.cpp35
-rw-r--r--library/cpp/tdigest/tdigest.h10
2 files changed, 40 insertions, 5 deletions
diff --git a/library/cpp/tdigest/tdigest.cpp b/library/cpp/tdigest/tdigest.cpp
index 480425d2e2..3bd6d2d7e5 100644
--- a/library/cpp/tdigest/tdigest.cpp
+++ b/library/cpp/tdigest/tdigest.cpp
@@ -19,11 +19,11 @@ TDigest::TDigest(double delta, double k, double firstValue)
AddValue(firstValue);
}
-TDigest::TDigest(const TString& serializedDigest)
+TDigest::TDigest(TStringBuf serializedDigest)
: N(0)
{
NTDigest::TDigest digest;
- Y_VERIFY(digest.ParseFromString(serializedDigest));
+ Y_VERIFY(digest.ParseFromArray(serializedDigest.data(), serializedDigest.size()));
Delta = digest.GetDelta();
K = digest.GetK();
for (int i = 0; i < digest.centroids_size(); ++i) {
@@ -157,6 +157,33 @@ double TDigest::GetPercentile(double percentile) {
return Centroids.back().Mean;
}
+double TDigest::GetRank(double value) {
+ Compress();
+ if (Centroids.empty()) {
+ return 0.0;
+ }
+ if (value < Centroids.front().Mean) {
+ return 0.0;
+ }
+ if (value == Centroids.front().Mean) {
+ return Centroids.front().Count * 0.5 / N;
+ }
+ double sum = 0.0;
+ double prev_x = 0.0;
+ double prev_mean = Centroids.front().Mean;
+ for (const auto& C : Centroids) {
+ double current_x = sum + C.Count * 0.5;
+ if (value <= C.Mean) {
+ double k = (value - prev_mean) / (C.Mean - prev_mean);
+ return (prev_x + k * (current_x - prev_x)) / N;
+ }
+ sum += C.Count;
+ prev_mean = C.Mean;
+ prev_x = current_x;
+ }
+ return 1.0;
+}
+
TString TDigest::Serialize() {
Compress();
NTDigest::TDigest digest;
@@ -169,3 +196,7 @@ TString TDigest::Serialize() {
}
return digest.SerializeAsString();
}
+
+i64 TDigest::GetCount() const {
+ return std::llround(N);
+}
diff --git a/library/cpp/tdigest/tdigest.h b/library/cpp/tdigest/tdigest.h
index acec0a0264..715620258c 100644
--- a/library/cpp/tdigest/tdigest.h
+++ b/library/cpp/tdigest/tdigest.h
@@ -1,7 +1,6 @@
#pragma once
-#include <util/generic/map.h>
-#include <util/generic/list.h>
+#include <util/generic/string.h>
#include <util/generic/vector.h>
class TDigest {
@@ -50,13 +49,18 @@ protected:
public:
TDigest(double delta = 0.01, double k = 25);
TDigest(double delta, double k, double firstValue);
- TDigest(const TString& serializedDigest);
+ TDigest(TStringBuf serializedDigest);
TDigest(const TDigest* digest1, const TDigest* digest2); // merge
+
TString Serialize();
+
TDigest operator+(const TDigest& other);
TDigest& operator+=(const TDigest& other);
+
void AddValue(double value);
void Compress();
void Clear();
double GetPercentile(double percentile);
+ double GetRank(double value);
+ i64 GetCount() const;
};