1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
|
#pragma once
#include <numeric>
#include <algorithm>
#include <utility>
#include <base/sort.h>
#include <Common/ArenaAllocator.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
/// Because ranks are adjusted, we have to store each of them in Float type.
using RanksArray = std::vector<Float64>;
template <typename Values>
std::pair<RanksArray, Float64> computeRanksAndTieCorrection(const Values & values)
{
const size_t size = values.size();
/// Save initial positions, than sort indices according to the values.
std::vector<size_t> indexes(size);
std::iota(indexes.begin(), indexes.end(), 0);
std::sort(indexes.begin(), indexes.end(),
[&] (size_t lhs, size_t rhs) { return values[lhs] < values[rhs]; });
size_t left = 0;
Float64 tie_numenator = 0;
RanksArray out(size);
while (left < size)
{
size_t right = left;
while (right < size && values[indexes[left]] == values[indexes[right]])
++right;
auto adjusted = (left + right + 1.) / 2.;
auto count_equal = right - left;
/// Scipy implementation throws exception in this case too.
if (count_equal == size)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "All numbers in both samples are identical");
tie_numenator += std::pow(count_equal, 3) - count_equal;
for (size_t iter = left; iter < right; ++iter)
out[indexes[iter]] = adjusted;
left = right;
}
return {out, 1 - (tie_numenator / (std::pow(size, 3) - size))};
}
template <typename X, typename Y>
struct StatisticalSample
{
using AllocatorXSample = MixedAlignedArenaAllocator<alignof(X), 4096>;
using SampleX = PODArray<X, 32, AllocatorXSample>;
using AllocatorYSample = MixedAlignedArenaAllocator<alignof(Y), 4096>;
using SampleY = PODArray<Y, 32, AllocatorYSample>;
SampleX x{};
SampleY y{};
size_t size_x{0};
size_t size_y{0};
void addX(X value, Arena * arena)
{
if (isNaN(value))
return;
++size_x;
x.push_back(value, arena);
}
void addY(Y value, Arena * arena)
{
if (isNaN(value))
return;
++size_y;
y.push_back(value, arena);
}
void merge(const StatisticalSample & rhs, Arena * arena)
{
size_x += rhs.size_x;
size_y += rhs.size_y;
x.insert(rhs.x.begin(), rhs.x.end(), arena);
y.insert(rhs.y.begin(), rhs.y.end(), arena);
}
void write(WriteBuffer & buf) const
{
writeVarUInt(size_x, buf);
writeVarUInt(size_y, buf);
buf.write(reinterpret_cast<const char *>(x.data()), size_x * sizeof(x[0]));
buf.write(reinterpret_cast<const char *>(y.data()), size_y * sizeof(y[0]));
}
void read(ReadBuffer & buf, Arena * arena)
{
readVarUInt(size_x, buf);
readVarUInt(size_y, buf);
x.resize(size_x, arena);
y.resize(size_y, arena);
buf.readStrict(reinterpret_cast<char *>(x.data()), size_x * sizeof(x[0]));
buf.readStrict(reinterpret_cast<char *>(y.data()), size_y * sizeof(y[0]));
}
};
}
|