about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author vvvv <vvvv@yandex-team.com> 2025-02-28 10:39:13 +0300
committer vvvv <vvvv@yandex-team.com> 2025-02-28 11:42:27 +0300
commit a4ebae0970f4e2748cb954f4fd56b40b42841809 (patch)
tree ea7df8ec396ad5916a42577c692c6eefdd6e78bf
parent b0a2365a3ba58c8b1c2ef256d2e061662b2b5900 (diff)
download ydb-a4ebae0970f4e2748cb954f4fd56b40b42841809.tar.gz
YQL-19495 handle NaNs in TDigest
commit_hash:6ceaf9a8cc4d034c2829780bed37396d25f9056d
-rw-r--r-- library/cpp/tdigest/tdigest.cpp 44
-rw-r--r-- library/cpp/tdigest/tdigest.h 10
-rw-r--r-- library/cpp/tdigest/tdigest.proto 1
-rw-r--r-- yql/essentials/udfs/common/stat/static/stat_udf.h 9
-rw-r--r-- yql/essentials/udfs/common/stat/test/canondata/result.json 7
-rw-r--r-- yql/essentials/udfs/common/stat/test/canondata/test.test_nan_/results.txt 46
-rw-r--r-- yql/essentials/udfs/common/stat/test/cases/nan.sql 2
-rw-r--r-- yql/essentials/udfs/common/stat/test/ya.make 13
-rw-r--r-- yql/essentials/udfs/common/stat/ya.make 1
9 files changed, 119 insertions, 14 deletions
diff --git a/library/cpp/tdigest/tdigest.cpp b/library/cpp/tdigest/tdigest.cpp
index 145cef78e1..3d3772a9de 100644
--- a/library/cpp/tdigest/tdigest.cpp
+++ b/library/cpp/tdigest/tdigest.cpp
@@ -3,45 +3,52 @@
#include <library/cpp/tdigest/tdigest.pb.h>
#include <cmath>
+#include <util/generic/yexception.h>
// TODO: rewrite to https://github.com/tdunning/t-digest/blob/master/src/main/java/com/tdunning/math/stats/MergingDigest.java
-TDigest::TDigest(double delta, double k)
+TDigest::TDigest(double delta, double k, bool supportsNaN)
: N(0)
, Delta(delta)
, K(k)
+ , SupportsNaN(supportsNaN)
{
}
-TDigest::TDigest(double delta, double k, double firstValue)
- : TDigest(delta, k)
+TDigest::TDigest(double delta, double k, double firstValue, bool supportsNaN)
+ : TDigest(delta, k, supportsNaN)
{
AddValue(firstValue);
}
-TDigest::TDigest(TStringBuf serializedDigest)
+TDigest::TDigest(TStringBuf serializedDigest, bool supportsNaN)
: N(0)
+ , SupportsNaN(supportsNaN)
{
NTDigest::TDigest digest;
Y_ABORT_UNLESS(digest.ParseFromArray(serializedDigest.data(), serializedDigest.size()));
Delta = digest.delta();
K = digest.k();
+ HasNaN = SupportsNaN && digest.nans();
for (int i = 0; i < digest.centroids_size(); ++i) {
const NTDigest::TDigest::TCentroid& centroid = digest.centroids(i);
Update(centroid.mean(), centroid.weight());
}
}
-TDigest::TDigest(const TDigest* digest1, const TDigest* digest2)
+TDigest::TDigest(const TDigest* digest1, const TDigest* digest2, bool supportsNaN)
: N(0)
, Delta(std::min(digest1->Delta, digest2->Delta))
, K(std::max(digest1->K, digest2->K))
+ , SupportsNaN(supportsNaN)
+ , HasNaN(supportsNaN && (digest1->HasNaN || digest2->HasNaN))
{
Add(*digest1);
Add(*digest2);
}
void TDigest::Add(const TDigest& otherDigest) {
+ Y_ENSURE(SupportsNaN == otherDigest.SupportsNaN);
for (auto& it : otherDigest.Centroids)
Update(it.Mean, it.Count);
for (auto& it : otherDigest.Unmerged)
@@ -49,7 +56,8 @@ void TDigest::Add(const TDigest& otherDigest) {
}
TDigest TDigest::operator+(const TDigest& other) {
- TDigest T(Delta, K);
+ Y_ENSURE(SupportsNaN == other.SupportsNaN);
+ TDigest T(Delta, K, SupportsNaN);
T.Add(*this);
T.Add(other);
return T;
@@ -92,6 +100,12 @@ void TDigest::MergeCentroid(TVector<TCentroid>& merged, double& sum, const TCent
}
void TDigest::Update(double x, double w) {
+ if (SupportsNaN) {
+ if (std::isnan(x)) {
+ HasNaN = true;
+ return;
+ }
+ }
AddCentroid(TCentroid(x, w));
if (Unmerged.size() >= K / Delta) {
Compress();
@@ -136,8 +150,17 @@ void TDigest::AddValue(double value) {
double TDigest::GetPercentile(double percentile) {
Compress();
- if (Centroids.empty())
+ if (Centroids.empty()) {
+ if (HasNaN) {
+ return std::numeric_limits<double>::quiet_NaN();
+ }
return 0.0;
+ }
+
+ if (HasNaN && percentile >= 1.0) {
+ return std::numeric_limits<double>::quiet_NaN();
+ }
+
// This algorithm uses C=1/2 with 0.5 optimized away
// See https://en.wikipedia.org/wiki/Percentile#First_Variant.2C
double x = percentile * N;
@@ -159,6 +182,9 @@ double TDigest::GetPercentile(double percentile) {
double TDigest::GetRank(double value) {
Compress();
+ if (SupportsNaN && std::isnan(value)) {
+ return 1.0;
+ }
if (Centroids.empty()) {
return 0.0;
}
@@ -189,6 +215,10 @@ TString TDigest::Serialize() {
NTDigest::TDigest digest;
digest.set_delta(Delta);
digest.set_k(K);
+ if (HasNaN) {
+ digest.set_nans(HasNaN);
+ }
+
for (const auto& it : Centroids) {
NTDigest::TDigest::TCentroid* centroid = digest.add_centroids();
centroid->set_mean(it.Mean);
diff --git a/library/cpp/tdigest/tdigest.h b/library/cpp/tdigest/tdigest.h
index 715620258c..22c87e63c3 100644
--- a/library/cpp/tdigest/tdigest.h
+++ b/library/cpp/tdigest/tdigest.h
@@ -36,6 +36,8 @@ class TDigest {
double N;
double Delta;
double K;
+ bool SupportsNaN = false;
+ bool HasNaN = false;
void Add(const TDigest& otherDigest);
void AddCentroid(const TCentroid& centroid);
@@ -47,10 +49,10 @@ protected:
void Update(double x, double w = 1.0);
public:
- TDigest(double delta = 0.01, double k = 25);
- TDigest(double delta, double k, double firstValue);
- TDigest(TStringBuf serializedDigest);
- TDigest(const TDigest* digest1, const TDigest* digest2); // merge
+ TDigest(double delta = 0.01, double k = 25, bool supportsNaN = false);
+ TDigest(double delta, double k, double firstValue, bool supportsNaN = false);
+ TDigest(TStringBuf serializedDigest, bool supportsNaN = false);
+ TDigest(const TDigest* digest1, const TDigest* digest2, bool supportsNaN = false); // merge
TString Serialize();
diff --git a/library/cpp/tdigest/tdigest.proto b/library/cpp/tdigest/tdigest.proto
index 4a2db3e638..abd8b821cc 100644
--- a/library/cpp/tdigest/tdigest.proto
+++ b/library/cpp/tdigest/tdigest.proto
@@ -8,4 +8,5 @@ message TDigest {
optional double Weight = 2;
}
repeated TCentroid Centroids = 3;
+ optional bool Nans = 4;
}
diff --git a/yql/essentials/udfs/common/stat/static/stat_udf.h b/yql/essentials/udfs/common/stat/static/stat_udf.h
index f0c11a6812..36a1bad7c4 100644
--- a/yql/essentials/udfs/common/stat/static/stat_udf.h
+++ b/yql/essentials/udfs/common/stat/static/stat_udf.h
@@ -22,7 +22,7 @@ namespace {
UdfTerminate((TStringBuilder() << GetPos() << " Invalid combination of delta/K values").data());
}
- return TUnboxedValuePod(new TDigestResource(delta, K, args[0].Get<double>()));
+ return TUnboxedValuePod(new TDigestResource(delta, K, args[0].Get<double>(), true));
}
SIMPLE_STRICT_UDF(TTDigest_AddValue, TResource<DigestResourceName>(TResource<DigestResourceName>, double)) {
@@ -46,14 +46,17 @@ namespace {
SIMPLE_UDF(TTDigest_Deserialize, TResource<DigestResourceName>(char*)) {
Y_UNUSED(valueBuilder);
- return TUnboxedValuePod(new TDigestResource(TString(args[0].AsStringRef())));
+ return TUnboxedValuePod(new TDigestResource(TString(args[0].AsStringRef()), true));
}
SIMPLE_STRICT_UDF(TTDigest_Merge, TResource<DigestResourceName>(TResource<DigestResourceName>, TResource<DigestResourceName>)) {
Y_UNUSED(valueBuilder);
TDigestResource::Validate(args[0]);
TDigestResource::Validate(args[1]);
- return TUnboxedValuePod(new TDigestResource(static_cast<TDigestResource*>(args[0].AsBoxed().Get())->Get(), static_cast<TDigestResource*>(args[1].AsBoxed().Get())->Get()));
+ return TUnboxedValuePod(new TDigestResource(
+ static_cast<TDigestResource*>(args[0].AsBoxed().Get())->Get(),
+ static_cast<TDigestResource*>(args[1].AsBoxed().Get())->Get(),
+ true));
}
/*
diff --git a/yql/essentials/udfs/common/stat/test/canondata/result.json b/yql/essentials/udfs/common/stat/test/canondata/result.json
new file mode 100644
index 0000000000..44314e0309
--- /dev/null
+++ b/yql/essentials/udfs/common/stat/test/canondata/result.json
@@ -0,0 +1,7 @@
+{
+ "test.test[nan]": [
+ {
+ "uri": "file://test.test_nan_/results.txt"
+ }
+ ]
+}
diff --git a/yql/essentials/udfs/common/stat/test/canondata/test.test_nan_/results.txt b/yql/essentials/udfs/common/stat/test/canondata/test.test_nan_/results.txt
new file mode 100644
index 0000000000..1ae063d52a
--- /dev/null
+++ b/yql/essentials/udfs/common/stat/test/canondata/test.test_nan_/results.txt
@@ -0,0 +1,46 @@
+[
+ {
+ "Write" = [
+ {
+ "Type" = [
+ "ListType";
+ [
+ "StructType";
+ [
+ [
+ "column0";
+ [
+ "OptionalType";
+ [
+ "DataType";
+ "Double"
+ ]
+ ]
+ ];
+ [
+ "column1";
+ [
+ "OptionalType";
+ [
+ "DataType";
+ "Double"
+ ]
+ ]
+ ]
+ ]
+ ]
+ ];
+ "Data" = [
+ [
+ [
+ "1.1"
+ ];
+ [
+ "nan"
+ ]
+ ]
+ ]
+ }
+ ]
+ }
+]
\ No newline at end of file
diff --git a/yql/essentials/udfs/common/stat/test/cases/nan.sql b/yql/essentials/udfs/common/stat/test/cases/nan.sql
new file mode 100644
index 0000000000..5ab4027b05
--- /dev/null
+++ b/yql/essentials/udfs/common/stat/test/cases/nan.sql
@@ -0,0 +1,2 @@
+select percentile(x,0.99),percentile(x,1.0)
+from (values (double("nan")),(1.1),(0.5)) as a(x)
diff --git a/yql/essentials/udfs/common/stat/test/ya.make b/yql/essentials/udfs/common/stat/test/ya.make
new file mode 100644
index 0000000000..4a14f530f1
--- /dev/null
+++ b/yql/essentials/udfs/common/stat/test/ya.make
@@ -0,0 +1,13 @@
+YQL_UDF_TEST()
+
+DEPENDS(yql/essentials/udfs/common/stat)
+
+TIMEOUT(300)
+
+SIZE(MEDIUM)
+
+IF (SANITIZER_TYPE == "memory")
+ TAG(ya:not_autocheck) # YQL-15385
+ENDIF()
+
+END()
diff --git a/yql/essentials/udfs/common/stat/ya.make b/yql/essentials/udfs/common/stat/ya.make
index 8a5538b371..d1e622b444 100644
--- a/yql/essentials/udfs/common/stat/ya.make
+++ b/yql/essentials/udfs/common/stat/ya.make
@@ -18,6 +18,7 @@ YQL_UDF_CONTRIB(stat_udf)
IF (NOT EXPORT_CMAKE)
RECURSE_FOR_TESTS(
+ test
ut
)
ENDIF()