diff options
author | vvvv <vvvv@yandex-team.com> | 2025-02-28 10:39:13 +0300 |
---|---|---|
committer | vvvv <vvvv@yandex-team.com> | 2025-02-28 11:42:27 +0300 |
commit | a4ebae0970f4e2748cb954f4fd56b40b42841809 (patch) | |
tree | ea7df8ec396ad5916a42577c692c6eefdd6e78bf | |
parent | b0a2365a3ba58c8b1c2ef256d2e061662b2b5900 (diff) | |
download | ydb-a4ebae0970f4e2748cb954f4fd56b40b42841809.tar.gz |
YQL-19495 handle NaNs in TDigest
commit_hash:6ceaf9a8cc4d034c2829780bed37396d25f9056d
-rw-r--r-- | library/cpp/tdigest/tdigest.cpp | 44 | ||||
-rw-r--r-- | library/cpp/tdigest/tdigest.h | 10 | ||||
-rw-r--r-- | library/cpp/tdigest/tdigest.proto | 1 | ||||
-rw-r--r-- | yql/essentials/udfs/common/stat/static/stat_udf.h | 9 | ||||
-rw-r--r-- | yql/essentials/udfs/common/stat/test/canondata/result.json | 7 | ||||
-rw-r--r-- | yql/essentials/udfs/common/stat/test/canondata/test.test_nan_/results.txt | 46 | ||||
-rw-r--r-- | yql/essentials/udfs/common/stat/test/cases/nan.sql | 2 | ||||
-rw-r--r-- | yql/essentials/udfs/common/stat/test/ya.make | 13 | ||||
-rw-r--r-- | yql/essentials/udfs/common/stat/ya.make | 1 |
9 files changed, 119 insertions, 14 deletions
diff --git a/library/cpp/tdigest/tdigest.cpp b/library/cpp/tdigest/tdigest.cpp index 145cef78e1..3d3772a9de 100644 --- a/library/cpp/tdigest/tdigest.cpp +++ b/library/cpp/tdigest/tdigest.cpp @@ -3,45 +3,52 @@ #include <library/cpp/tdigest/tdigest.pb.h> #include <cmath> +#include <util/generic/yexception.h> // TODO: rewrite to https://github.com/tdunning/t-digest/blob/master/src/main/java/com/tdunning/math/stats/MergingDigest.java -TDigest::TDigest(double delta, double k) +TDigest::TDigest(double delta, double k, bool supportsNaN) : N(0) , Delta(delta) , K(k) + , SupportsNaN(supportsNaN) { } -TDigest::TDigest(double delta, double k, double firstValue) - : TDigest(delta, k) +TDigest::TDigest(double delta, double k, double firstValue, bool supportsNaN) + : TDigest(delta, k, supportsNaN) { AddValue(firstValue); } -TDigest::TDigest(TStringBuf serializedDigest) +TDigest::TDigest(TStringBuf serializedDigest, bool supportsNaN) : N(0) + , SupportsNaN(supportsNaN) { NTDigest::TDigest digest; Y_ABORT_UNLESS(digest.ParseFromArray(serializedDigest.data(), serializedDigest.size())); Delta = digest.delta(); K = digest.k(); + HasNaN = SupportsNaN && digest.nans(); for (int i = 0; i < digest.centroids_size(); ++i) { const NTDigest::TDigest::TCentroid& centroid = digest.centroids(i); Update(centroid.mean(), centroid.weight()); } } -TDigest::TDigest(const TDigest* digest1, const TDigest* digest2) +TDigest::TDigest(const TDigest* digest1, const TDigest* digest2, bool supportsNaN) : N(0) , Delta(std::min(digest1->Delta, digest2->Delta)) , K(std::max(digest1->K, digest2->K)) + , SupportsNaN(supportsNaN) + , HasNaN(supportsNaN && (digest1->HasNaN || digest2->HasNaN)) { Add(*digest1); Add(*digest2); } void TDigest::Add(const TDigest& otherDigest) { + Y_ENSURE(SupportsNaN == otherDigest.SupportsNaN); for (auto& it : otherDigest.Centroids) Update(it.Mean, it.Count); for (auto& it : otherDigest.Unmerged) @@ -49,7 +56,8 @@ void TDigest::Add(const TDigest& otherDigest) { } TDigest TDigest::operator+(const TDigest& other) { - TDigest T(Delta, K); + Y_ENSURE(SupportsNaN == other.SupportsNaN); + TDigest T(Delta, K, SupportsNaN); T.Add(*this); T.Add(other); return T; @@ -92,6 +100,12 @@ void TDigest::MergeCentroid(TVector<TCentroid>& merged, double& sum, const TCent } void TDigest::Update(double x, double w) { + if (SupportsNaN) { + if (std::isnan(x)) { + HasNaN = true; + return; + } + } AddCentroid(TCentroid(x, w)); if (Unmerged.size() >= K / Delta) { Compress(); @@ -136,8 +150,17 @@ void TDigest::AddValue(double value) { double TDigest::GetPercentile(double percentile) { Compress(); - if (Centroids.empty()) + if (Centroids.empty()) { + if (HasNaN) { + return std::numeric_limits<double>::quiet_NaN(); + } return 0.0; + } + + if (HasNaN && percentile >= 1.0) { + return std::numeric_limits<double>::quiet_NaN(); + } + // This algorithm uses C=1/2 with 0.5 optimized away // See https://en.wikipedia.org/wiki/Percentile#First_Variant.2C double x = percentile * N; @@ -159,6 +182,9 @@ double TDigest::GetPercentile(double percentile) { double TDigest::GetRank(double value) { Compress(); + if (SupportsNaN && std::isnan(value)) { + return 1.0; + } if (Centroids.empty()) { return 0.0; } @@ -189,6 +215,10 @@ TString TDigest::Serialize() { NTDigest::TDigest digest; digest.set_delta(Delta); digest.set_k(K); + if (HasNaN) { + digest.set_nans(HasNaN); + } + for (const auto& it : Centroids) { NTDigest::TDigest::TCentroid* centroid = digest.add_centroids(); centroid->set_mean(it.Mean); diff --git a/library/cpp/tdigest/tdigest.h b/library/cpp/tdigest/tdigest.h index 715620258c..22c87e63c3 100644 --- a/library/cpp/tdigest/tdigest.h +++ b/library/cpp/tdigest/tdigest.h @@ -36,6 +36,8 @@ class TDigest { double N; double Delta; double K; + bool SupportsNaN = false; + bool HasNaN = false; void Add(const TDigest& otherDigest); void AddCentroid(const TCentroid& centroid); @@ -47,10 +49,10 @@ protected: void Update(double x, double w = 1.0); public: - TDigest(double delta = 0.01, double k = 25); - TDigest(double delta, double k, double firstValue); - TDigest(TStringBuf serializedDigest); - TDigest(const TDigest* digest1, const TDigest* digest2); // merge + TDigest(double delta = 0.01, double k = 25, bool supportsNaN = false); + TDigest(double delta, double k, double firstValue, bool supportsNaN = false); + TDigest(TStringBuf serializedDigest, bool supportsNaN = false); + TDigest(const TDigest* digest1, const TDigest* digest2, bool supportsNaN = false); // merge TString Serialize(); diff --git a/library/cpp/tdigest/tdigest.proto b/library/cpp/tdigest/tdigest.proto index 4a2db3e638..abd8b821cc 100644 --- a/library/cpp/tdigest/tdigest.proto +++ b/library/cpp/tdigest/tdigest.proto @@ -8,4 +8,5 @@ message TDigest { optional double Weight = 2; } repeated TCentroid Centroids = 3; + optional bool Nans = 4; } diff --git a/yql/essentials/udfs/common/stat/static/stat_udf.h b/yql/essentials/udfs/common/stat/static/stat_udf.h index f0c11a6812..36a1bad7c4 100644 --- a/yql/essentials/udfs/common/stat/static/stat_udf.h +++ b/yql/essentials/udfs/common/stat/static/stat_udf.h @@ -22,7 +22,7 @@ namespace { UdfTerminate((TStringBuilder() << GetPos() << " Invalid combination of delta/K values").data()); } - return TUnboxedValuePod(new TDigestResource(delta, K, args[0].Get<double>())); + return TUnboxedValuePod(new TDigestResource(delta, K, args[0].Get<double>(), true)); } SIMPLE_STRICT_UDF(TTDigest_AddValue, TResource<DigestResourceName>(TResource<DigestResourceName>, double)) { @@ -46,14 +46,17 @@ namespace { SIMPLE_UDF(TTDigest_Deserialize, TResource<DigestResourceName>(char*)) { Y_UNUSED(valueBuilder); - return TUnboxedValuePod(new TDigestResource(TString(args[0].AsStringRef()))); + return TUnboxedValuePod(new TDigestResource(TString(args[0].AsStringRef()), true)); } SIMPLE_STRICT_UDF(TTDigest_Merge, TResource<DigestResourceName>(TResource<DigestResourceName>, TResource<DigestResourceName>)) { Y_UNUSED(valueBuilder); TDigestResource::Validate(args[0]); TDigestResource::Validate(args[1]); - return TUnboxedValuePod(new TDigestResource(static_cast<TDigestResource*>(args[0].AsBoxed().Get())->Get(), static_cast<TDigestResource*>(args[1].AsBoxed().Get())->Get())); + return TUnboxedValuePod(new TDigestResource( + static_cast<TDigestResource*>(args[0].AsBoxed().Get())->Get(), + static_cast<TDigestResource*>(args[1].AsBoxed().Get())->Get(), + true)); } /* diff --git a/yql/essentials/udfs/common/stat/test/canondata/result.json b/yql/essentials/udfs/common/stat/test/canondata/result.json new file mode 100644 index 0000000000..44314e0309 --- /dev/null +++ b/yql/essentials/udfs/common/stat/test/canondata/result.json @@ -0,0 +1,7 @@ +{ + "test.test[nan]": [ + { + "uri": "file://test.test_nan_/results.txt" + } + ] +} diff --git a/yql/essentials/udfs/common/stat/test/canondata/test.test_nan_/results.txt b/yql/essentials/udfs/common/stat/test/canondata/test.test_nan_/results.txt new file mode 100644 index 0000000000..1ae063d52a --- /dev/null +++ b/yql/essentials/udfs/common/stat/test/canondata/test.test_nan_/results.txt @@ -0,0 +1,46 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "column0"; + [ + "OptionalType"; + [ + "DataType"; + "Double" + ] + ] + ]; + [ + "column1"; + [ + "OptionalType"; + [ + "DataType"; + "Double" + ] + ] + ] + ] + ] + ]; + "Data" = [ + [ + [ + "1.1" + ]; + [ + "nan" + ] + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/stat/test/cases/nan.sql b/yql/essentials/udfs/common/stat/test/cases/nan.sql new file mode 100644 index 0000000000..5ab4027b05 --- /dev/null +++ b/yql/essentials/udfs/common/stat/test/cases/nan.sql @@ -0,0 +1,2 @@ +select percentile(x,0.99),percentile(x,1.0) +from (values (double("nan")),(1.1),(0.5)) as a(x) diff --git a/yql/essentials/udfs/common/stat/test/ya.make b/yql/essentials/udfs/common/stat/test/ya.make new file mode 100644 index 0000000000..4a14f530f1 --- /dev/null +++ b/yql/essentials/udfs/common/stat/test/ya.make @@ -0,0 +1,13 @@ +YQL_UDF_TEST() + +DEPENDS(yql/essentials/udfs/common/stat) + +TIMEOUT(300) + +SIZE(MEDIUM) + +IF (SANITIZER_TYPE == "memory") + TAG(ya:not_autocheck) # YQL-15385 +ENDIF() + +END() diff --git a/yql/essentials/udfs/common/stat/ya.make b/yql/essentials/udfs/common/stat/ya.make index 8a5538b371..d1e622b444 100644 --- a/yql/essentials/udfs/common/stat/ya.make +++ b/yql/essentials/udfs/common/stat/ya.make @@ -18,6 +18,7 @@ YQL_UDF_CONTRIB(stat_udf) IF (NOT EXPORT_CMAKE) RECURSE_FOR_TESTS( + test ut ) ENDIF() |