aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Common/StudentTTest.h
blob: 26c44ebc8c8c10ef957420763b7d7c4a57b5f8d3 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#pragma once

#include <array>
#include <string>
#include <map>

/**
 * About:
 * This is an implementation of the independent two-sample t-test.
 * Read about it at https://en.wikipedia.org/wiki/Student%27s_t-test (Equal or unequal sample sizes, equal variance)
 *
 * Usage:
 * It is used to assume, with some level of confidence, that two distributions don't differ.
 * Values are added with t_test.add(0/1, value); the two samples are then compared and reported with compareAndReport().
 */
class StudentTTest
{
private:
    /// Running accumulator for one sample: keeps only the element count, the sum
    /// and the sum of squares, so individual values are never stored.
    struct DistributionData
    {
        size_t size = 0;
        double sum = 0;
        double squares_sum = 0;

        /// Fold one more observation into the running totals.
        void add(double value)
        {
            squares_sum += value * value;
            sum += value;
            size += 1;
        }

        /// Arithmetic mean of the accumulated values.
        /// No guard for an empty sample: with size == 0 this evaluates 0.0 / 0.0.
        double avg() const
        {
            const double count = static_cast<double>(size);
            return sum / count;
        }

        /// Variance estimate with (size - 1) in the denominator, derived from the running sums.
        /// No guard for small samples: the divisor is zero when size == 1, and
        /// (size - 1) is computed in size_t, so it wraps around when size == 0.
        double var() const
        {
            const double count = static_cast<double>(size);
            const double sum_of_squared_deviations = squares_sum - (sum * sum / count);
            return sum_of_squared_deviations / static_cast<double>(size - 1);
        }

        /// Return the accumulator to its freshly-constructed (all zero) state.
        void clear()
        {
            squares_sum = 0;
            sum = 0;
            size = 0;
        }
    };

    /// One accumulator per sample being compared.
    std::array<DistributionData, 2> data {};

public:
    /// Declared here, defined outside this header.
    void clear();

    /// Adds `value` to the sample selected by `distribution`.
    /// NOTE(review): only two accumulators exist, so `distribution` is presumably
    /// expected to be 0 or 1 - confirm against the implementation.
    void add(size_t distribution, double value);

    /// Compares the two samples and returns a (flag, text) pair.
    /// `confidence_level_index` is documented as accepting values in [0, 5] and defaults to 5;
    /// the mapping from index to confidence level is not part of this header.
    /// NOTE(review): the meaning of the bool and the string is defined by the implementation - confirm there.
    /// TODO: Trash - no separation of concepts in code.
    std::pair<bool, std::string> compareAndReport(size_t confidence_level_index = 5) const;
};