diff options
| author | vvvv <[email protected]> | 2025-02-28 10:39:13 +0300 | 
|---|---|---|
| committer | vvvv <[email protected]> | 2025-02-28 11:42:27 +0300 | 
| commit | a4ebae0970f4e2748cb954f4fd56b40b42841809 (patch) | |
| tree | ea7df8ec396ad5916a42577c692c6eefdd6e78bf /library/cpp | |
| parent | b0a2365a3ba58c8b1c2ef256d2e061662b2b5900 (diff) | |
YQL-19495 handle NaNs in TDigest
commit_hash:6ceaf9a8cc4d034c2829780bed37396d25f9056d
Diffstat (limited to 'library/cpp')
| -rw-r--r-- | library/cpp/tdigest/tdigest.cpp | 44 | ||||
| -rw-r--r-- | library/cpp/tdigest/tdigest.h | 10 | ||||
| -rw-r--r-- | library/cpp/tdigest/tdigest.proto | 1 | 
3 files changed, 44 insertions, 11 deletions
| diff --git a/library/cpp/tdigest/tdigest.cpp b/library/cpp/tdigest/tdigest.cpp index 145cef78e10..3d3772a9dee 100644 --- a/library/cpp/tdigest/tdigest.cpp +++ b/library/cpp/tdigest/tdigest.cpp @@ -3,45 +3,52 @@  #include <library/cpp/tdigest/tdigest.pb.h>  #include <cmath> +#include <util/generic/yexception.h>  // TODO: rewrite to https://github.com/tdunning/t-digest/blob/master/src/main/java/com/tdunning/math/stats/MergingDigest.java -TDigest::TDigest(double delta, double k) +TDigest::TDigest(double delta, double k, bool supportsNaN)      : N(0)      , Delta(delta)      , K(k) +    , SupportsNaN(supportsNaN)  {  } -TDigest::TDigest(double delta, double k, double firstValue) -    : TDigest(delta, k) +TDigest::TDigest(double delta, double k, double firstValue, bool supportsNaN) +    : TDigest(delta, k, supportsNaN)  {      AddValue(firstValue);  } -TDigest::TDigest(TStringBuf serializedDigest) +TDigest::TDigest(TStringBuf serializedDigest, bool supportsNaN)      : N(0) +    , SupportsNaN(supportsNaN)  {      NTDigest::TDigest digest;      Y_ABORT_UNLESS(digest.ParseFromArray(serializedDigest.data(), serializedDigest.size()));      Delta = digest.delta();      K = digest.k(); +    HasNaN = SupportsNaN && digest.nans();      for (int i = 0; i < digest.centroids_size(); ++i) {          const NTDigest::TDigest::TCentroid& centroid = digest.centroids(i);          Update(centroid.mean(), centroid.weight());      }  } -TDigest::TDigest(const TDigest* digest1, const TDigest* digest2) +TDigest::TDigest(const TDigest* digest1, const TDigest* digest2, bool supportsNaN)      : N(0)      , Delta(std::min(digest1->Delta, digest2->Delta))      , K(std::max(digest1->K, digest2->K)) +    , SupportsNaN(supportsNaN) +    , HasNaN(supportsNaN && (digest1->HasNaN || digest2->HasNaN))  {      Add(*digest1);      Add(*digest2);  }  void TDigest::Add(const TDigest& otherDigest) { +    Y_ENSURE(SupportsNaN == otherDigest.SupportsNaN);      for (auto& it : otherDigest.Centroids)          Update(it.Mean, it.Count);      for (auto& it : otherDigest.Unmerged) @@ -49,7 +56,8 @@ void TDigest::Add(const TDigest& otherDigest) {  }  TDigest TDigest::operator+(const TDigest& other) { -    TDigest T(Delta, K); +    Y_ENSURE(SupportsNaN == other.SupportsNaN); +    TDigest T(Delta, K, SupportsNaN);      T.Add(*this);      T.Add(other);      return T; @@ -92,6 +100,12 @@ void TDigest::MergeCentroid(TVector<TCentroid>& merged, double& sum, const TCent  }  void TDigest::Update(double x, double w) { +    if (SupportsNaN) { +        if (std::isnan(x)) { +            HasNaN = true; +            return; +        } +    }      AddCentroid(TCentroid(x, w));      if (Unmerged.size() >= K / Delta) {          Compress(); @@ -136,8 +150,17 @@ void TDigest::AddValue(double value) {  double TDigest::GetPercentile(double percentile) {      Compress(); -    if (Centroids.empty()) +    if (Centroids.empty()) { +        if (HasNaN) { +            return std::numeric_limits<double>::quiet_NaN(); +        }          return 0.0; +    } + +    if (HasNaN && percentile >= 1.0) { +        return std::numeric_limits<double>::quiet_NaN(); +    } +      // This algorithm uses C=1/2 with 0.5 optimized away      // See https://en.wikipedia.org/wiki/Percentile#First_Variant.2C      double x = percentile * N; @@ -159,6 +182,9 @@ double TDigest::GetPercentile(double percentile) {  double TDigest::GetRank(double value) {      Compress(); +    if (SupportsNaN && std::isnan(value)) { +        return 1.0; +    }      if (Centroids.empty()) {          return 0.0;      } @@ -189,6 +215,10 @@ TString TDigest::Serialize() {      NTDigest::TDigest digest;      digest.set_delta(Delta);      digest.set_k(K); +    if (HasNaN) { +        digest.set_nans(HasNaN); +    } +      for (const auto& it : Centroids) {          NTDigest::TDigest::TCentroid* centroid = digest.add_centroids();          centroid->set_mean(it.Mean); diff --git a/library/cpp/tdigest/tdigest.h b/library/cpp/tdigest/tdigest.h index 715620258c5..22c87e63c33 100644 --- a/library/cpp/tdigest/tdigest.h +++ b/library/cpp/tdigest/tdigest.h @@ -36,6 +36,8 @@ class TDigest {      double N;      double Delta;      double K; +    bool SupportsNaN = false; +    bool HasNaN = false;      void Add(const TDigest& otherDigest);      void AddCentroid(const TCentroid& centroid); @@ -47,10 +49,10 @@ protected:      void Update(double x, double w = 1.0);  public: -    TDigest(double delta = 0.01, double k = 25); -    TDigest(double delta, double k, double firstValue); -    TDigest(TStringBuf serializedDigest); -    TDigest(const TDigest* digest1, const TDigest* digest2); // merge +    TDigest(double delta = 0.01, double k = 25, bool supportsNaN = false); +    TDigest(double delta, double k, double firstValue, bool supportsNaN = false); +    TDigest(TStringBuf serializedDigest, bool supportsNaN = false); +    TDigest(const TDigest* digest1, const TDigest* digest2, bool supportsNaN = false); // merge      TString Serialize(); diff --git a/library/cpp/tdigest/tdigest.proto b/library/cpp/tdigest/tdigest.proto index 4a2db3e638c..abd8b821cc4 100644 --- a/library/cpp/tdigest/tdigest.proto +++ b/library/cpp/tdigest/tdigest.proto @@ -8,4 +8,5 @@ message TDigest {          optional double Weight = 2;      }      repeated TCentroid Centroids = 3; +    optional bool Nans = 4;  } | 
