contrib/clickhouse/src/AggregateFunctions/StatCommon.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120

#pragma once

#include <numeric>
#include <algorithm>
#include <utility>

#include <base/sort.h>

#include <Common/ArenaAllocator.h>

#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>


namespace DB
{
struct Settings;

namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
}

/// Tied values share the average of the ranks they occupy, so a rank may be fractional (e.g. 2.5); hence ranks are stored as Float64.
using RanksArray = std::vector<Float64>;

template <typename Values>
std::pair<RanksArray, Float64> computeRanksAndTieCorrection(const Values & values)
{
    const size_t size = values.size();
    /// Save initial positions, than sort indices according to the values.
    std::vector<size_t> indexes(size);
    std::iota(indexes.begin(), indexes.end(), 0);
    std::sort(indexes.begin(), indexes.end(),
        [&] (size_t lhs, size_t rhs) { return values[lhs] < values[rhs]; });

    size_t left = 0;
    Float64 tie_numenator = 0;
    RanksArray out(size);
    while (left < size)
    {
        size_t right = left;
        while (right < size && values[indexes[left]] == values[indexes[right]])
            ++right;
        auto adjusted = (left + right + 1.) / 2.;
        auto count_equal = right - left;

        /// Scipy implementation throws exception in this case too.
        if (count_equal == size)
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "All numbers in both samples are identical");

        tie_numenator += std::pow(count_equal, 3) - count_equal;
        for (size_t iter = left; iter < right; ++iter)
            out[indexes[iter]] = adjusted;
        left = right;
    }
    return {out, 1 - (tie_numenator / (std::pow(size, 3) - size))};
}


/** Accumulates two samples, `x` and `y`, together with their element counts.
  *
  * NaN values are never stored. Every operation that can grow the arrays takes the Arena
  * that is forwarded to the underlying PODArray.
  */
template <typename X, typename Y>
struct StatisticalSample
{
    using AllocatorXSample = MixedAlignedArenaAllocator<alignof(X), 4096>;
    using AllocatorYSample = MixedAlignedArenaAllocator<alignof(Y), 4096>;

    using SampleX = PODArray<X, 32, AllocatorXSample>;
    using SampleY = PODArray<Y, 32, AllocatorYSample>;

    SampleX x{};
    SampleY y{};
    size_t size_x{0};
    size_t size_y{0};

    /// Appends `value` to the first sample; a NaN is silently ignored.
    void addX(X value, Arena * arena)
    {
        if (!isNaN(value))
        {
            ++size_x;
            x.push_back(value, arena);
        }
    }

    /// Appends `value` to the second sample; a NaN is silently ignored.
    void addY(Y value, Arena * arena)
    {
        if (!isNaN(value))
        {
            ++size_y;
            y.push_back(value, arena);
        }
    }

    /// Appends both samples of `rhs` to the corresponding samples of this state.
    void merge(const StatisticalSample & rhs, Arena * arena)
    {
        size_x += rhs.size_x;
        size_y += rhs.size_y;

        x.insert(rhs.x.begin(), rhs.x.end(), arena);
        y.insert(rhs.y.begin(), rhs.y.end(), arena);
    }

    /// Serialized layout: VarUInt size_x, VarUInt size_y, then the raw bytes of x followed by the raw bytes of y.
    void write(WriteBuffer & buf) const
    {
        writeVarUInt(size_x, buf);
        writeVarUInt(size_y, buf);

        const char * x_bytes = reinterpret_cast<const char *>(x.data());
        const char * y_bytes = reinterpret_cast<const char *>(y.data());

        buf.write(x_bytes, size_x * sizeof(X));
        buf.write(y_bytes, size_y * sizeof(Y));
    }

    /// Reads the layout produced by write(): both counts first, then the element bytes straight into the resized arrays.
    void read(ReadBuffer & buf, Arena * arena)
    {
        readVarUInt(size_x, buf);
        readVarUInt(size_y, buf);

        /// Both arrays are resized before any element data is read, as the counts precede the data.
        x.resize(size_x, arena);
        y.resize(size_y, arena);

        buf.readStrict(reinterpret_cast<char *>(x.data()), size_x * sizeof(X));
        buf.readStrict(reinterpret_cast<char *>(y.data()), size_y * sizeof(Y));
    }
};

}