aboutsummaryrefslogtreecommitdiffstats
path: root/yql/essentials/udfs/common/stat/stat_udf_ut.cpp
diff options
context:
space:
mode:
authorimunkin <imunkin@yandex-team.com>2024-11-08 10:00:23 +0300
committerimunkin <imunkin@yandex-team.com>2024-11-08 10:12:13 +0300
commita784a2f943d6e15caa6241e2e96d80aac6dbf375 (patch)
tree05f1e5366c916b988a8afb75bdab8ddeee0f6e6d /yql/essentials/udfs/common/stat/stat_udf_ut.cpp
parentd70137a7b530ccaa52834274913bbb5a3d1ca06e (diff)
downloadydb-a784a2f943d6e15caa6241e2e96d80aac6dbf375.tar.gz
Move yql/udfs/common/ to /yql/essentials YQL-19206
Except the following directories: * clickhouse/client * datetime * knn * roaring commit_hash:c7da95636144d28db109d6b17ddc762e9bacb59f
Diffstat (limited to 'yql/essentials/udfs/common/stat/stat_udf_ut.cpp')
-rw-r--r--yql/essentials/udfs/common/stat/stat_udf_ut.cpp363
1 files changed, 363 insertions, 0 deletions
diff --git a/yql/essentials/udfs/common/stat/stat_udf_ut.cpp b/yql/essentials/udfs/common/stat/stat_udf_ut.cpp
new file mode 100644
index 0000000000..2a033ff31e
--- /dev/null
+++ b/yql/essentials/udfs/common/stat/stat_udf_ut.cpp
@@ -0,0 +1,363 @@
+#include <library/cpp/testing/unittest/registar.h>
+#include <yql/essentials/minikql/mkql_function_registry.h>
+#include <yql/essentials/minikql/mkql_program_builder.h>
+#include <yql/essentials/minikql/computation/mkql_computation_node.h>
+#include <yql/essentials/minikql/comp_nodes/mkql_factories.h>
+#include <yql/essentials/minikql/invoke_builtins/mkql_builtins.h>
+#include <util/random/random.h>
+#include <util/system/sanitizers.h>
+#include <array>
+
+namespace NYql {
+using namespace NKikimr::NMiniKQL;
+
+ namespace NUdf {
+     // Factory for the Stat UDF module exercised by the tests below.
+     // Only declared here; the definition lives outside this file.
+     TUniquePtr<IUdfModule> CreateStatModule();
+ }
+
+ Y_UNIT_TEST_SUITE(TUDFStatTest) {
+ Y_UNIT_TEST(SimplePercentile) {
+     // Creates a digest with 0.0, feeds it 1..9 one value at a time and
+     // expects the 0.9 percentile to come out as 8.5.
+     auto registry = CreateFunctionRegistry(CreateBuiltinRegistry())->Clone();
+     auto rnd = CreateDeterministicRandomProvider(1);
+     auto clock = CreateDeterministicTimeProvider(10000000);
+     registry->AddModule("", "Stat", NUdf::CreateStatModule());
+     TScopedAlloc alloc(__LOCATION__);
+     TTypeEnvironment env(alloc);
+     TProgramBuilder builder(env, *registry);
+     auto createUdf = builder.Udf("Stat.TDigest_Create");
+     auto addValueUdf = builder.Udf("Stat.TDigest_AddValue");
+     auto percentileUdf = builder.Udf("Stat.TDigest_GetPercentile");
+
+     // Digest node, rebuilt on every AddValue application.
+     std::array<TRuntimeNode, 1> createArgs{{builder.NewDataLiteral(0.0)}};
+     TRuntimeNode digest = builder.Apply(createUdf, createArgs);
+
+     for (int sample = 1; sample <= 9; ++sample) {
+         std::array<TRuntimeNode, 2> addArgs{{digest, builder.NewDataLiteral(static_cast<double>(sample))}};
+         digest = builder.Apply(addValueUdf, addArgs);
+     }
+
+     std::array<TRuntimeNode, 2> percentileArgs{{digest, builder.NewDataLiteral(0.9)}};
+     TRuntimeNode program = builder.Apply(percentileUdf, percentileArgs);
+
+     // Compile the program into a computation graph and evaluate it.
+     TExploringNodeVisitor explorer;
+     explorer.Walk(program.GetNode(), env);
+     TComputationPatternOpts opts(alloc.Ref(), env, GetBuiltinFactory(), registry.Get(),
+                                  NUdf::EValidateMode::None, NUdf::EValidatePolicy::Fail, "", EGraphPerProcess::Multi);
+     auto pattern = MakeComputationPattern(explorer, program, {}, opts);
+     auto graph = pattern->Clone(opts.ToComputationOptions(*rnd, *clock));
+     auto result = graph->GetValue();
+     UNIT_ASSERT_DOUBLES_EQUAL(result.Get<double>(), 8.5, 0.001);
+ }
+
+ Y_UNIT_TEST(SimplePercentileSpecific) {
+     // Median (0.5 percentile) of a tiny, widely spread sample:
+     // the digest is created with 75 and then fed 800, 20 and 150.
+     auto mutableFunctionRegistry = CreateFunctionRegistry(CreateBuiltinRegistry())->Clone();
+     auto randomProvider = CreateDeterministicRandomProvider(1);
+     auto timeProvider = CreateDeterministicTimeProvider(1);
+     NUdf::TUniquePtr<NUdf::IUdfModule> module = NUdf::CreateStatModule();
+     mutableFunctionRegistry->AddModule("", "Stat", std::move(module));
+     TScopedAlloc alloc(__LOCATION__);
+     TTypeEnvironment env(alloc);
+     TProgramBuilder pgmBuilder(env, *mutableFunctionRegistry);
+     auto udfTDigest_Create = pgmBuilder.Udf("Stat.TDigest_Create");
+     auto udfTDigest_AddValue = pgmBuilder.Udf("Stat.TDigest_AddValue");
+     auto udfTDigest_GetPercentile = pgmBuilder.Udf("Stat.TDigest_GetPercentile");
+
+     // Smallest and largest value ever put into the digest (see the literals below).
+     const double minSample = 20.0;
+     const double maxSample = 800.0;
+
+     TRuntimeNode pgmDigest;
+     {
+         auto param1 = pgmBuilder.NewDataLiteral<double>(75.0);
+         TVector<TRuntimeNode> params = {param1};
+         pgmDigest = pgmBuilder.Apply(udfTDigest_Create, params);
+     }
+
+     TVector<double> vals = {800, 20, 150};
+     for (auto val : vals) {
+         auto param2 = pgmBuilder.NewDataLiteral(val);
+         TVector<TRuntimeNode> params = {pgmDigest, param2};
+         pgmDigest = pgmBuilder.Apply(udfTDigest_AddValue, params);
+     }
+
+     TRuntimeNode pgmReturn;
+     {
+         auto param2 = pgmBuilder.NewDataLiteral<double>(0.5);
+         TVector<TRuntimeNode> params = {pgmDigest, param2};
+         pgmReturn = pgmBuilder.Apply(udfTDigest_GetPercentile, params);
+     }
+
+     TExploringNodeVisitor explorer;
+     explorer.Walk(pgmReturn.GetNode(), env);
+     TComputationPatternOpts opts(alloc.Ref(), env, GetBuiltinFactory(), mutableFunctionRegistry.Get(),
+                                  NUdf::EValidateMode::None, NUdf::EValidatePolicy::Fail, "", EGraphPerProcess::Multi);
+     auto pattern = MakeComputationPattern(explorer, pgmReturn, {}, opts);
+     auto graph = pattern->Clone(opts.ToComputationOptions(*randomProvider, *timeProvider));
+     auto value = graph->GetValue();
+     const double median = value.Get<double>();
+     // Kept as a diagnostic so the actual estimate shows up in the test log.
+     Cerr << median << Endl;
+     // A percentile estimate can never leave the range of the values that were fed in.
+     // This also rejects NaN, because both comparisons are false for NaN.
+     // TODO(review): pin the exact expected median with UNIT_ASSERT_DOUBLES_EQUAL once it is
+     // confirmed against the TDigest interpolation rule; the previously commented-out
+     // expectation of 9.0 lies outside [20, 800] and cannot be right for this input.
+     UNIT_ASSERT(median >= minSample && median <= maxSample);
+ }
+
+ Y_UNIT_TEST(SerializedPercentile) {
+     // Same data as SimplePercentile (0.0 followed by 1..9), but the digest goes
+     // through Serialize -> Deserialize before the 0.9 percentile is queried.
+     auto registry = CreateFunctionRegistry(CreateBuiltinRegistry())->Clone();
+     auto rnd = CreateDeterministicRandomProvider(1);
+     auto clock = CreateDeterministicTimeProvider(1);
+     registry->AddModule("", "Stat", NUdf::CreateStatModule());
+     TScopedAlloc alloc(__LOCATION__);
+     TTypeEnvironment env(alloc);
+     TProgramBuilder builder(env, *registry);
+     auto createUdf = builder.Udf("Stat.TDigest_Create");
+     auto addValueUdf = builder.Udf("Stat.TDigest_AddValue");
+     auto percentileUdf = builder.Udf("Stat.TDigest_GetPercentile");
+     auto serializeUdf = builder.Udf("Stat.TDigest_Serialize");
+     auto deserializeUdf = builder.Udf("Stat.TDigest_Deserialize");
+
+     std::array<TRuntimeNode, 1> createArgs{{builder.NewDataLiteral(0.0)}};
+     TRuntimeNode digest = builder.Apply(createUdf, createArgs);
+
+     for (int sample = 1; sample <= 9; ++sample) {
+         std::array<TRuntimeNode, 2> addArgs{{digest, builder.NewDataLiteral(static_cast<double>(sample))}};
+         digest = builder.Apply(addValueUdf, addArgs);
+     }
+
+     // Round-trip the digest through its serialized form.
+     std::array<TRuntimeNode, 1> serializeArgs{{digest}};
+     TRuntimeNode serialized = builder.Apply(serializeUdf, serializeArgs);
+
+     std::array<TRuntimeNode, 1> deserializeArgs{{serialized}};
+     TRuntimeNode restored = builder.Apply(deserializeUdf, deserializeArgs);
+
+     std::array<TRuntimeNode, 2> percentileArgs{{restored, builder.NewDataLiteral(0.9)}};
+     TRuntimeNode program = builder.Apply(percentileUdf, percentileArgs);
+
+     // Compile the program into a computation graph and evaluate it.
+     TExploringNodeVisitor explorer;
+     explorer.Walk(program.GetNode(), env);
+     TComputationPatternOpts opts(alloc.Ref(), env, GetBuiltinFactory(), registry.Get(),
+                                  NUdf::EValidateMode::None, NUdf::EValidatePolicy::Fail, "", EGraphPerProcess::Multi);
+     auto pattern = MakeComputationPattern(explorer, program, {}, opts);
+     auto graph = pattern->Clone(opts.ToComputationOptions(*rnd, *clock));
+     auto result = graph->GetValue();
+     UNIT_ASSERT_DOUBLES_EQUAL(result.Get<double>(), 8.5, 0.001);
+ }
+
+ Y_UNIT_TEST(SerializedMergedPercentile) {
+     // Builds ten digests of ten values each (0.0..0.9, 1.0..1.9, ..., 9.0..9.9),
+     // serializes every one of them, then deserializes and merges them back into
+     // a single digest and expects its 0.9 percentile to be 8.95.
+     auto registry = CreateFunctionRegistry(CreateBuiltinRegistry())->Clone();
+     auto rnd = CreateDeterministicRandomProvider(1);
+     auto clock = CreateDeterministicTimeProvider(1);
+     registry->AddModule("", "Stat", NUdf::CreateStatModule());
+     TScopedAlloc alloc(__LOCATION__);
+     TTypeEnvironment env(alloc);
+     TProgramBuilder builder(env, *registry);
+     auto createUdf = builder.Udf("Stat.TDigest_Create");
+     auto addValueUdf = builder.Udf("Stat.TDigest_AddValue");
+     auto percentileUdf = builder.Udf("Stat.TDigest_GetPercentile");
+     auto serializeUdf = builder.Udf("Stat.TDigest_Serialize");
+     auto deserializeUdf = builder.Udf("Stat.TDigest_Deserialize");
+     auto mergeUdf = builder.Udf("Stat.TDigest_Merge");
+
+     TVector<TRuntimeNode> serializedParts;
+
+     for (int base = 0; base < 100; base += 10) {
+         // The first value of the decade creates the partial digest ...
+         std::array<TRuntimeNode, 1> createArgs{{builder.NewDataLiteral(static_cast<double>(base) / 10)}};
+         TRuntimeNode part = builder.Apply(createUdf, createArgs);
+
+         // ... and the remaining nine are added to it.
+         for (int n = base + 1; n < base + 10; ++n) {
+             std::array<TRuntimeNode, 2> addArgs{{part, builder.NewDataLiteral(static_cast<double>(n) / 10)}};
+             part = builder.Apply(addValueUdf, addArgs);
+         }
+
+         std::array<TRuntimeNode, 1> serializeArgs{{part}};
+         serializedParts.push_back(builder.Apply(serializeUdf, serializeArgs));
+     }
+
+     // Starts out empty; the first deserialized part becomes the accumulator,
+     // every following part is merged into it.
+     TRuntimeNode merged;
+     for (const auto& blob : serializedParts) {
+         std::array<TRuntimeNode, 1> deserializeArgs{{blob}};
+         TRuntimeNode restored = builder.Apply(deserializeUdf, deserializeArgs);
+         if (!merged) {
+             merged = restored;
+         } else {
+             std::array<TRuntimeNode, 2> mergeArgs{{merged, restored}};
+             merged = builder.Apply(mergeUdf, mergeArgs);
+         }
+     }
+
+     std::array<TRuntimeNode, 2> percentileArgs{{merged, builder.NewDataLiteral(0.9)}};
+     TRuntimeNode program = builder.Apply(percentileUdf, percentileArgs);
+
+     // Compile the program into a computation graph and evaluate it.
+     TExploringNodeVisitor explorer;
+     explorer.Walk(program.GetNode(), env);
+     TComputationPatternOpts opts(alloc.Ref(), env, GetBuiltinFactory(), registry.Get(),
+                                  NUdf::EValidateMode::None, NUdf::EValidatePolicy::Fail, "", EGraphPerProcess::Multi);
+     auto pattern = MakeComputationPattern(explorer, program, {}, opts);
+     auto graph = pattern->Clone(opts.ToComputationOptions(*rnd, *clock));
+     auto result = graph->GetValue();
+     UNIT_ASSERT_DOUBLES_EQUAL(result.Get<double>(), 8.95, 0.001);
+ }
+
+ // Draws one Pareto-distributed sample (scale 1, shape `a`) by inverse-transform
+ // sampling: 1 / U^(1/a) for a uniform U.
+ //
+ // U must be strictly positive: U == 0 makes pow() return 0 and the result +inf,
+ // which would poison the sample set used by the percentile check.
+ // NOTE(review): assumes RandomNumber<double>() yields values in [0, 1), so that
+ // 1 - U lies in (0, 1]; the complement of a uniform draw is uniform as well, so
+ // the distribution is unchanged. Confirm the range against util/random/random.h.
+ static double GetParetoRandomNumber(double a) {
+     const double uniform = 1.0 - RandomNumber<double>();
+     return 1 / pow(uniform, double(1) / a);
+ }
+
+ Y_UNIT_TEST(BigPercentile) {
+     // Folds 100000 samples from GetParetoRandomNumber(10) into a digest inside the
+     // program, asks for the 0.99 percentile and compares the empirical quantile of
+     // the returned value (computed on the sorted raw samples) with 0.99.
+     auto registry = CreateFunctionRegistry(CreateBuiltinRegistry())->Clone();
+     auto rnd = CreateDeterministicRandomProvider(1);
+     auto clock = CreateDeterministicTimeProvider(1);
+     registry->AddModule("", "Stat", NUdf::CreateStatModule());
+     TScopedAlloc alloc(__LOCATION__);
+     TTypeEnvironment env(alloc);
+     TProgramBuilder builder(env, *registry);
+     auto createUdf = builder.Udf("Stat.TDigest_Create");
+     auto addValueUdf = builder.Udf("Stat.TDigest_AddValue");
+     auto percentileUdf = builder.Udf("Stat.TDigest_GetPercentile");
+     const size_t sampleCount = 100000;
+     const double targetQuantile = 0.99;
+     const double tolerance = 0.0004; // at q=0.99 threshold is 4*delta*0.0099
+
+     // Every sample is kept twice: as a raw double for the reference computation
+     // and as a literal node for the program under test.
+     TVector<double> samples;
+     TVector<TRuntimeNode> sampleNodes;
+     samples.reserve(sampleCount);
+     sampleNodes.reserve(sampleCount);
+     while (samples.size() < sampleCount) {
+         const double sample = GetParetoRandomNumber(10);
+         samples.push_back(sample);
+         sampleNodes.push_back(builder.NewDataLiteral(sample));
+     }
+     TRuntimeNode sampleList = builder.AsList(sampleNodes);
+
+     // Fold1: the first list item creates the digest, the rest are added to it.
+     const auto initDigest = [&](TRuntimeNode item) {
+         std::array<TRuntimeNode, 1> args{{item}};
+         return builder.Apply(createUdf, args);
+     };
+     const auto addToDigest = [&](TRuntimeNode item, TRuntimeNode state) {
+         std::array<TRuntimeNode, 2> args{{state, item}};
+         return builder.Apply(addValueUdf, args);
+     };
+     auto digest = builder.Fold1(sampleList, initDigest, addToDigest);
+
+     const auto queryPercentile = [&](TRuntimeNode item) {
+         std::array<TRuntimeNode, 2> args{{item, builder.NewDataLiteral(targetQuantile)}};
+         return builder.Apply(percentileUdf, args);
+     };
+     TRuntimeNode program = builder.Map(digest, queryPercentile);
+
+     // Compile the program into a computation graph and evaluate it.
+     TExploringNodeVisitor explorer;
+     explorer.Walk(program.GetNode(), env);
+     TComputationPatternOpts opts(alloc.Ref(), env, GetBuiltinFactory(), registry.Get(),
+                                  NUdf::EValidateMode::None, NUdf::EValidatePolicy::Fail, "", EGraphPerProcess::Multi);
+     auto pattern = MakeComputationPattern(explorer, program, {}, opts);
+     auto graph = pattern->Clone(opts.ToComputationOptions(*rnd, *clock));
+     auto result = graph->GetValue();
+     UNIT_ASSERT(result);
+     const double estimate = result.Get<double>();
+
+     std::sort(samples.begin(), samples.end());
+     // Number of samples <= estimate, i.e. the 1-based rank of the last such sample.
+     const auto rank = std::upper_bound(samples.begin(), samples.end(), estimate) - samples.begin();
+     // See https://en.wikipedia.org/wiki/Percentile#First_Variant.2C
+     const double empiricalQuantile = (rank - 0.5) / double(samples.size());
+     UNIT_ASSERT_DOUBLES_EQUAL(empiricalQuantile, targetQuantile, tolerance);
+ }
+
+ Y_UNIT_TEST(CentroidPrecision) {
+     // Feeds 99998 copies of 50.0 followed by a single 1.0 and a single 100.0 into
+     // a digest and requires the 0.25 percentile to be exactly 50.0.
+     auto registry = CreateFunctionRegistry(CreateBuiltinRegistry())->Clone();
+     auto rnd = CreateDeterministicRandomProvider(1);
+     auto clock = CreateDeterministicTimeProvider(1);
+     registry->AddModule("", "Stat", NUdf::CreateStatModule());
+     TScopedAlloc alloc(__LOCATION__);
+     TTypeEnvironment env(alloc);
+     TProgramBuilder builder(env, *registry);
+     auto createUdf = builder.Udf("Stat.TDigest_Create");
+     auto addValueUdf = builder.Udf("Stat.TDigest_AddValue");
+     auto percentileUdf = builder.Udf("Stat.TDigest_GetPercentile");
+     const size_t totalCount = 100000;
+     const double targetQuantile = 0.25;
+     const double lowOutlier = 1.0;
+     const double highOutlier = 100.0;
+     const double commonValue = 50.0;
+
+     // All but two entries are the common value; the two outliers go last.
+     TVector<TRuntimeNode> nodes;
+     nodes.reserve(totalCount);
+     while (nodes.size() < totalCount - 2) {
+         nodes.push_back(builder.NewDataLiteral(commonValue));
+     }
+     nodes.push_back(builder.NewDataLiteral(lowOutlier));
+     nodes.push_back(builder.NewDataLiteral(highOutlier));
+     TRuntimeNode valueList = builder.AsList(nodes);
+
+     // Fold1: the first list item creates the digest, the rest are added to it.
+     const auto initDigest = [&](TRuntimeNode item) {
+         std::array<TRuntimeNode, 1> args{{item}};
+         return builder.Apply(createUdf, args);
+     };
+     const auto addToDigest = [&](TRuntimeNode item, TRuntimeNode state) {
+         std::array<TRuntimeNode, 2> args{{state, item}};
+         return builder.Apply(addValueUdf, args);
+     };
+     auto digest = builder.Fold1(valueList, initDigest, addToDigest);
+
+     const auto queryPercentile = [&](TRuntimeNode item) {
+         std::array<TRuntimeNode, 2> args{{item, builder.NewDataLiteral(targetQuantile)}};
+         return builder.Apply(percentileUdf, args);
+     };
+     TRuntimeNode program = builder.Map(digest, queryPercentile);
+
+     // Compile the program into a computation graph and evaluate it.
+     TExploringNodeVisitor explorer;
+     explorer.Walk(program.GetNode(), env);
+     TComputationPatternOpts opts(alloc.Ref(), env, GetBuiltinFactory(), registry.Get(),
+                                  NUdf::EValidateMode::None, NUdf::EValidatePolicy::Fail, "", EGraphPerProcess::Multi);
+     auto pattern = MakeComputationPattern(explorer, program, {}, opts);
+     auto graph = pattern->Clone(opts.ToComputationOptions(*rnd, *clock));
+     auto result = graph->GetValue();
+     UNIT_ASSERT(result);
+     const double estimate = result.Get<double>();
+     // Deliberately an exact comparison: the result must be the common value itself.
+     UNIT_ASSERT_EQUAL(estimate, commonValue);
+ }
+ }
+}