aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp
diff options
context:
space:
mode:
authorvvvv <vvvv@yandex-team.com>2025-02-28 10:39:13 +0300
committervvvv <vvvv@yandex-team.com>2025-02-28 11:42:27 +0300
commita4ebae0970f4e2748cb954f4fd56b40b42841809 (patch)
treeea7df8ec396ad5916a42577c692c6eefdd6e78bf /library/cpp
parentb0a2365a3ba58c8b1c2ef256d2e061662b2b5900 (diff)
downloadydb-a4ebae0970f4e2748cb954f4fd56b40b42841809.tar.gz
YQL-19495 handle NaNs in TDigest
commit_hash:6ceaf9a8cc4d034c2829780bed37396d25f9056d
Diffstat (limited to 'library/cpp')
-rw-r--r--library/cpp/tdigest/tdigest.cpp44
-rw-r--r--library/cpp/tdigest/tdigest.h10
-rw-r--r--library/cpp/tdigest/tdigest.proto1
3 files changed, 44 insertions, 11 deletions
diff --git a/library/cpp/tdigest/tdigest.cpp b/library/cpp/tdigest/tdigest.cpp
index 145cef78e10..3d3772a9dee 100644
--- a/library/cpp/tdigest/tdigest.cpp
+++ b/library/cpp/tdigest/tdigest.cpp
@@ -3,45 +3,52 @@
#include <library/cpp/tdigest/tdigest.pb.h>
#include <cmath>
+#include <util/generic/yexception.h>
// TODO: rewrite to https://github.com/tdunning/t-digest/blob/master/src/main/java/com/tdunning/math/stats/MergingDigest.java
-TDigest::TDigest(double delta, double k)
+TDigest::TDigest(double delta, double k, bool supportsNaN)
: N(0)
, Delta(delta)
, K(k)
+ , SupportsNaN(supportsNaN)
{
}
-TDigest::TDigest(double delta, double k, double firstValue)
- : TDigest(delta, k)
+TDigest::TDigest(double delta, double k, double firstValue, bool supportsNaN)
+ : TDigest(delta, k, supportsNaN)
{
AddValue(firstValue);
}
-TDigest::TDigest(TStringBuf serializedDigest)
+TDigest::TDigest(TStringBuf serializedDigest, bool supportsNaN)
: N(0)
+ , SupportsNaN(supportsNaN)
{
NTDigest::TDigest digest;
Y_ABORT_UNLESS(digest.ParseFromArray(serializedDigest.data(), serializedDigest.size()));
Delta = digest.delta();
K = digest.k();
+ HasNaN = SupportsNaN && digest.nans();
for (int i = 0; i < digest.centroids_size(); ++i) {
const NTDigest::TDigest::TCentroid& centroid = digest.centroids(i);
Update(centroid.mean(), centroid.weight());
}
}
-TDigest::TDigest(const TDigest* digest1, const TDigest* digest2)
+TDigest::TDigest(const TDigest* digest1, const TDigest* digest2, bool supportsNaN)
: N(0)
, Delta(std::min(digest1->Delta, digest2->Delta))
, K(std::max(digest1->K, digest2->K))
+ , SupportsNaN(supportsNaN)
+ , HasNaN(supportsNaN && (digest1->HasNaN || digest2->HasNaN))
{
Add(*digest1);
Add(*digest2);
}
void TDigest::Add(const TDigest& otherDigest) {
+ Y_ENSURE(SupportsNaN == otherDigest.SupportsNaN);
for (auto& it : otherDigest.Centroids)
Update(it.Mean, it.Count);
for (auto& it : otherDigest.Unmerged)
@@ -49,7 +56,8 @@ void TDigest::Add(const TDigest& otherDigest) {
}
TDigest TDigest::operator+(const TDigest& other) {
- TDigest T(Delta, K);
+ Y_ENSURE(SupportsNaN == other.SupportsNaN);
+ TDigest T(Delta, K, SupportsNaN);
T.Add(*this);
T.Add(other);
return T;
@@ -92,6 +100,12 @@ void TDigest::MergeCentroid(TVector<TCentroid>& merged, double& sum, const TCent
}
void TDigest::Update(double x, double w) {
+ if (SupportsNaN) {
+ if (std::isnan(x)) {
+ HasNaN = true;
+ return;
+ }
+ }
AddCentroid(TCentroid(x, w));
if (Unmerged.size() >= K / Delta) {
Compress();
@@ -136,8 +150,17 @@ void TDigest::AddValue(double value) {
double TDigest::GetPercentile(double percentile) {
Compress();
- if (Centroids.empty())
+ if (Centroids.empty()) {
+ if (HasNaN) {
+ return std::numeric_limits<double>::quiet_NaN();
+ }
return 0.0;
+ }
+
+ if (HasNaN && percentile >= 1.0) {
+ return std::numeric_limits<double>::quiet_NaN();
+ }
+
// This algorithm uses C=1/2 with 0.5 optimized away
// See https://en.wikipedia.org/wiki/Percentile#First_Variant.2C
double x = percentile * N;
@@ -159,6 +182,9 @@ double TDigest::GetPercentile(double percentile) {
double TDigest::GetRank(double value) {
Compress();
+ if (SupportsNaN && std::isnan(value)) {
+ return 1.0;
+ }
if (Centroids.empty()) {
return 0.0;
}
@@ -189,6 +215,10 @@ TString TDigest::Serialize() {
NTDigest::TDigest digest;
digest.set_delta(Delta);
digest.set_k(K);
+ if (HasNaN) {
+ digest.set_nans(HasNaN);
+ }
+
for (const auto& it : Centroids) {
NTDigest::TDigest::TCentroid* centroid = digest.add_centroids();
centroid->set_mean(it.Mean);
diff --git a/library/cpp/tdigest/tdigest.h b/library/cpp/tdigest/tdigest.h
index 715620258c5..22c87e63c33 100644
--- a/library/cpp/tdigest/tdigest.h
+++ b/library/cpp/tdigest/tdigest.h
@@ -36,6 +36,8 @@ class TDigest {
double N;
double Delta;
double K;
+ bool SupportsNaN = false;
+ bool HasNaN = false;
void Add(const TDigest& otherDigest);
void AddCentroid(const TCentroid& centroid);
@@ -47,10 +49,10 @@ protected:
void Update(double x, double w = 1.0);
public:
- TDigest(double delta = 0.01, double k = 25);
- TDigest(double delta, double k, double firstValue);
- TDigest(TStringBuf serializedDigest);
- TDigest(const TDigest* digest1, const TDigest* digest2); // merge
+ TDigest(double delta = 0.01, double k = 25, bool supportsNaN = false);
+ TDigest(double delta, double k, double firstValue, bool supportsNaN = false);
+ TDigest(TStringBuf serializedDigest, bool supportsNaN = false);
+ TDigest(const TDigest* digest1, const TDigest* digest2, bool supportsNaN = false); // merge
TString Serialize();
diff --git a/library/cpp/tdigest/tdigest.proto b/library/cpp/tdigest/tdigest.proto
index 4a2db3e638c..abd8b821cc4 100644
--- a/library/cpp/tdigest/tdigest.proto
+++ b/library/cpp/tdigest/tdigest.proto
@@ -8,4 +8,5 @@ message TDigest {
optional double Weight = 2;
}
repeated TCentroid Centroids = 3;
+ optional bool Nans = 4;
}