// path: root/contrib/clickhouse/src/AggregateFunctions/AggregateFunctionWelchTTest.cpp
// blob: 849f81279e72c69962100df79f7e357135134aff
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionTTest.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Moments.h>


/// Error codes referenced by the exceptions thrown in this file.
/// They are only declared here (`extern`); the definitions live in another translation unit.
/// NOTE(review): this namespace is opened at global scope, before `namespace DB` begins,
/// while every use of `ErrorCodes::...` below is inside `DB` — confirm which `ErrorCodes`
/// namespace those uses actually resolve to.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}

namespace DB
{
struct Settings;

namespace
{

/// State and final computations for the `welchTTest` aggregate function.
///
/// The accumulated fields (nx, ny, x1, y1, x2, y2) and the helpers getMeanX(), getMeanY()
/// and getStandardError() are inherited from TTestMoments<Float64>, which is not visible here.
/// The formulas below are sample variances only if x1/y1 are sums and x2/y2 are sums of
/// squares — presumably so; verify against TTestMoments.
struct WelchTTestData : public TTestMoments<Float64>
{
    static constexpr auto name = "welchTTest";

    /// Each sample needs more than one observation: the variance estimates below divide by (n - 1).
    bool hasEnoughObservations() const
    {
        return nx > 1 && ny > 1;
    }

    /// Degrees of freedom of the t-distribution:
    ///     (sx2 / nx + sy2 / ny)^2 / (sx2^2 / (nx^2 * (nx - 1)) + sy2^2 / (ny^2 * (ny - 1)))
    /// where sx2 and sy2 are the per-sample variance estimates.
    ///
    /// Throws Exception(BAD_ARGUMENTS) if the value is not a finite positive number.
    Float64 getDegreesOfFreedom() const
    {
        const Float64 mean_x = getMeanX();
        const Float64 mean_y = getMeanY();

        /// Variance estimates with (n - 1) in the denominator, expanded in terms of the accumulated moments.
        const Float64 sx2 = (x2 + nx * mean_x * mean_x - 2 * mean_x * x1) / (nx - 1);
        const Float64 sy2 = (y2 + ny * mean_y * mean_y - 2 * mean_y * y1) / (ny - 1);

        const Float64 numerator_sqrt = sx2 / nx + sy2 / ny;
        const Float64 numerator = numerator_sqrt * numerator_sqrt;

        const Float64 denominator_x = sx2 * sx2 / (nx * nx * (nx - 1));
        const Float64 denominator_y = sy2 * sy2 / (ny * ny * (ny - 1));

        const Float64 result = numerator / (denominator_x + denominator_y);

        /// `!std::isfinite` covers both infinity and NaN.
        if (result <= 0 || !std::isfinite(result))
            throw Exception(
                ErrorCodes::BAD_ARGUMENTS,
                /// Adjacent literals instead of a backslash continuation: a continuation inside
                /// a string literal would embed the next line's indentation into the message.
                "Cannot calculate p_value, because the t-distribution "
                "has inappropriate value of degrees of freedom (={}). It should be > 0", result);

        return result;
    }

    /// Returns {t-statistic, two-sided p-value}.
    /// If the t-statistic is not finite, both elements are NaN and the degrees of freedom
    /// are never computed (so no exception is thrown in that case).
    std::tuple<Float64, Float64> getResult() const
    {
        const Float64 mean_x = getMeanX();
        const Float64 mean_y = getMeanY();

        /// t-statistic
        const Float64 se = getStandardError();
        const Float64 t_stat = (mean_x - mean_y) / se;

        if (unlikely(!std::isfinite(t_stat)))
        {
            const Float64 nan = std::numeric_limits<Float64>::quiet_NaN();
            return {nan, nan};
        }

        auto students_t_distribution = boost::math::students_t_distribution<Float64>(getDegreesOfFreedom());

        /// Two-sided p-value: twice the CDF evaluated at the non-positive one of {t, -t}.
        const Float64 lower_tail_point = t_stat > 0 ? -t_stat : t_stat;
        const Float64 pvalue = 2 * boost::math::cdf<Float64>(students_t_distribution, lower_tail_point);

        return {t_stat, pvalue};
    }
};

/// Factory callback that builds the `welchTTest` aggregate function.
///
/// Validation happens in this order:
///   1. assertBinary(name, argument_types) — presumably enforces exactly two arguments; see FactoryHelpers.h.
///   2. At most one parameter is accepted, otherwise NUMBER_OF_ARGUMENTS_DOESNT_MATCH is thrown.
///   3. Both argument types must satisfy isNumber(), otherwise BAD_ARGUMENTS is thrown.
/// The Settings pointer is unused.
AggregateFunctionPtr createAggregateFunctionWelchTTest(
    const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
{
    assertBinary(name, argument_types);

    const bool too_many_parameters = parameters.size() > 1;
    if (too_many_parameters)
    {
        throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires zero or one parameter.", name);
    }

    /// Short-circuits exactly like the original `!a || !b`: the second type is only inspected if the first is numeric.
    const bool both_numeric = isNumber(argument_types[0]) && isNumber(argument_types[1]);
    if (!both_numeric)
    {
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Aggregate function {} only supports numerical types", name);
    }

    using Function = AggregateFunctionTTest<WelchTTestData>;
    return std::make_shared<Function>(argument_types, parameters);
}

}

/// Registers the creator above in the given factory under the name "welchTTest".
void registerAggregateFunctionWelchTTest(AggregateFunctionFactory & factory)
{
    constexpr auto function_name = "welchTTest";
    factory.registerFunction(function_name, createAggregateFunctionWelchTTest);
}

}