diff options
author | vvvv <vvvv@yandex-team.com> | 2025-02-28 10:39:13 +0300 |
---|---|---|
committer | vvvv <vvvv@yandex-team.com> | 2025-02-28 11:42:27 +0300 |
commit | a4ebae0970f4e2748cb954f4fd56b40b42841809 (patch) | |
tree | ea7df8ec396ad5916a42577c692c6eefdd6e78bf /library/cpp/tdigest/tdigest.cpp | |
parent | b0a2365a3ba58c8b1c2ef256d2e061662b2b5900 (diff) | |
download | ydb-a4ebae0970f4e2748cb954f4fd56b40b42841809.tar.gz |
YQL-19495 handle NaNs in TDigest
commit_hash:6ceaf9a8cc4d034c2829780bed37396d25f9056d
Diffstat (limited to 'library/cpp/tdigest/tdigest.cpp')
-rw-r--r-- | library/cpp/tdigest/tdigest.cpp | 44 |
1 files changed, 37 insertions, 7 deletions
diff --git a/library/cpp/tdigest/tdigest.cpp b/library/cpp/tdigest/tdigest.cpp index 145cef78e1..3d3772a9de 100644 --- a/library/cpp/tdigest/tdigest.cpp +++ b/library/cpp/tdigest/tdigest.cpp @@ -3,45 +3,52 @@ #include <library/cpp/tdigest/tdigest.pb.h> #include <cmath> +#include <util/generic/yexception.h> // TODO: rewrite to https://github.com/tdunning/t-digest/blob/master/src/main/java/com/tdunning/math/stats/MergingDigest.java -TDigest::TDigest(double delta, double k) +TDigest::TDigest(double delta, double k, bool supportsNaN) : N(0) , Delta(delta) , K(k) + , SupportsNaN(supportsNaN) { } -TDigest::TDigest(double delta, double k, double firstValue) - : TDigest(delta, k) +TDigest::TDigest(double delta, double k, double firstValue, bool supportsNaN) + : TDigest(delta, k, supportsNaN) { AddValue(firstValue); } -TDigest::TDigest(TStringBuf serializedDigest) +TDigest::TDigest(TStringBuf serializedDigest, bool supportsNaN) : N(0) + , SupportsNaN(supportsNaN) { NTDigest::TDigest digest; Y_ABORT_UNLESS(digest.ParseFromArray(serializedDigest.data(), serializedDigest.size())); Delta = digest.delta(); K = digest.k(); + HasNaN = SupportsNaN && digest.nans(); for (int i = 0; i < digest.centroids_size(); ++i) { const NTDigest::TDigest::TCentroid& centroid = digest.centroids(i); Update(centroid.mean(), centroid.weight()); } } -TDigest::TDigest(const TDigest* digest1, const TDigest* digest2) +TDigest::TDigest(const TDigest* digest1, const TDigest* digest2, bool supportsNaN) : N(0) , Delta(std::min(digest1->Delta, digest2->Delta)) , K(std::max(digest1->K, digest2->K)) + , SupportsNaN(supportsNaN) + , HasNaN(supportsNaN && (digest1->HasNaN || digest2->HasNaN)) { Add(*digest1); Add(*digest2); } void TDigest::Add(const TDigest& otherDigest) { + Y_ENSURE(SupportsNaN == otherDigest.SupportsNaN); for (auto& it : otherDigest.Centroids) Update(it.Mean, it.Count); for (auto& it : otherDigest.Unmerged) @@ -49,7 +56,8 @@ void TDigest::Add(const TDigest& otherDigest) { } TDigest TDigest::operator+(const TDigest& other) { - TDigest T(Delta, K); + Y_ENSURE(SupportsNaN == other.SupportsNaN); + TDigest T(Delta, K, SupportsNaN); T.Add(*this); T.Add(other); return T; @@ -92,6 +100,12 @@ void TDigest::MergeCentroid(TVector<TCentroid>& merged, double& sum, const TCent } void TDigest::Update(double x, double w) { + if (SupportsNaN) { + if (std::isnan(x)) { + HasNaN = true; + return; + } + } AddCentroid(TCentroid(x, w)); if (Unmerged.size() >= K / Delta) { Compress(); @@ -136,8 +150,17 @@ void TDigest::AddValue(double value) { double TDigest::GetPercentile(double percentile) { Compress(); - if (Centroids.empty()) + if (Centroids.empty()) { + if (HasNaN) { + return std::numeric_limits<double>::quiet_NaN(); + } return 0.0; + } + + if (HasNaN && percentile >= 1.0) { + return std::numeric_limits<double>::quiet_NaN(); + } + // This algorithm uses C=1/2 with 0.5 optimized away // See https://en.wikipedia.org/wiki/Percentile#First_Variant.2C double x = percentile * N; @@ -159,6 +182,9 @@ double TDigest::GetPercentile(double percentile) { double TDigest::GetRank(double value) { Compress(); + if (SupportsNaN && std::isnan(value)) { + return 1.0; + } if (Centroids.empty()) { return 0.0; } @@ -189,6 +215,10 @@ TString TDigest::Serialize() { NTDigest::TDigest digest; digest.set_delta(Delta); digest.set_k(K); + if (HasNaN) { + digest.set_nans(HasNaN); + } + for (const auto& it : Centroids) { NTDigest::TDigest::TCentroid* centroid = digest.add_centroids(); centroid->set_mean(it.Mean); |