1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
|
#pragma once
#include "yql_cost_function.h"
#include <yql/essentials/core/minsketch/count_min_sketch.h>
#include <yql/essentials/core/histogram/eq_width_histogram.h>
#include <yql/essentials/core/cbo/cbo_interesting_orderings.h>
#include <library/cpp/json/json_reader.h>
#include <util/generic/vector.h>
#include <util/generic/hash.h>
#include <util/generic/hash_set.h>
#include <util/generic/string.h>
#include <optional>
#include <iostream>
namespace NYql {
// Tag stored in TOptimizerStatistics::Type (which defaults to BaseTable).
// The numeric values below are exactly the ones the compiler would assign
// implicitly; they are only spelled out for readability.
// NOTE(review): the precise meaning of each tag is not visible in this header;
// the names suggest "plain table", "fact table after filtering" and
// "result of a many-to-many join" — confirm against the code that sets Type.
enum EStatisticsType : ui32 {
    BaseTable = 0,
    FilteredFactTable = 1,
    ManyManyJoin = 2,
};
// Tag stored in TOptimizerStatistics::StorageType (which defaults to NA).
// The numeric values below are exactly the ones the compiler would assign
// implicitly; they are only spelled out for readability.
// NOTE(review): NA presumably means "not applicable / unknown", the other two
// presumably distinguish row-oriented from column-oriented storage — confirm.
enum EStorageType : ui32 {
    NA = 0,
    RowStorage = 1,
    ColumnStorage = 2,
};
// Polymorphic base for provider-specific statistics. A provider derives from
// this struct to attach its own data, which is useful when deriving stats for
// higher-level operators in the plan. The virtual destructor makes it safe to
// destroy a derived object through a pointer to this base.
struct IProviderStatistics {
    virtual ~IProviderStatistics() = default;
};
// Statistics attached to a single column. A default-constructed object has
// both std::optional fields disengaged and both shared_ptr fields null.
struct TColumnStatistics {
// NOTE(review): presumably the number of distinct values in the column — confirm.
std::optional<double> NumUniqueVals;
// NOTE(review): held as a plain double, so presumably an estimate obtained
// from a HyperLogLog sketch rather than the sketch itself — confirm.
std::optional<double> HyperLogLog;
// Shared handle to a count-min sketch object; null until assigned.
std::shared_ptr<NKikimr::TCountMinSketch> CountMinSketch;
// Shared handle to an equal-width histogram estimator object; null until assigned.
std::shared_ptr<NKikimr::TEqWidthHistogramEstimator> EqWidthHistogramEstimator;
// NOTE(review): presumably the column's type name — confirm.
TString Type;
// User-provided empty default constructor; every member keeps its own default.
TColumnStatistics() {}
};
/**
 * Optimizer statistics for the current operator in the plan.
 *
 * Holds table-level figures (Nrows, Ncols, ByteSize, Selectivity), the key,
 * sort and shuffle columns, per-column statistics, the storage type,
 * provider-specific data, labels, aliases and ordering information.
 * Cost is stored here as well, because it is always updated together with
 * the statistics.
 */
struct TOptimizerStatistics {
    // Ref-counted holder for a list of key column names.
    struct TKeyColumns : public TSimpleRefCount<TKeyColumns> {
        TVector<TString> Data;

        TKeyColumns(TVector<TString> data) : Data(std::move(data)) {}
    };

    // Ref-counted holder for sort columns and their aliases; both vectors are
    // copied from the constructor arguments.
    // NOTE(review): Columns and Aliases are presumably parallel arrays — confirm.
    struct TSortColumns : public TSimpleRefCount<TSortColumns> {
        TVector<TString> Columns;
        TVector<TString> Aliases;

        TSortColumns(const TVector<TString>& cols, const TVector<TString>& aliases)
            : Columns(cols)
            , Aliases(aliases)
        {}
    };

    // Ref-counted map from a string key to that entry's TColumnStatistics.
    // NOTE(review): the key is presumably the column name — confirm.
    struct TColumnStatMap : public TSimpleRefCount<TColumnStatMap> {
        THashMap<TString,TColumnStatistics> Data;

        TColumnStatMap() {}
        TColumnStatMap(THashMap<TString,TColumnStatistics> data) : Data(std::move(data)) {}
    };

    // Ref-counted holder for the list of join columns stored in ShuffledByColumns.
    struct TShuffledByColumns : public TSimpleRefCount<TShuffledByColumns> {
        TVector<NDq::TJoinColumn> Data;

        TShuffledByColumns(TVector<NDq::TJoinColumn> data) : Data(std::move(data)) {}

        // Renders the columns as "Rel.Attr, Rel.Attr, ..." and returns an empty
        // string when Data is empty. The method only reads Data, so it is
        // const-qualified and can be called through const references/pointers.
        TString ToString() const {
            TString result;
            for (const auto& column: Data) {
                result.append(column.RelName).append(".").append(column.AttributeName).append(", ");
            }
            if (!result.empty()) {
                // Drop the trailing ", " separator (two characters).
                result.pop_back();
                result.pop_back();
            }
            return result;
        }
    };

    // Table-level figures.
    EStatisticsType Type = BaseTable;
    double Nrows = 0;
    int Ncols = 0;
    double ByteSize = 0;
    double Cost = 0;
    double Selectivity = 1.0;

    // Optional column descriptions; each pointer is null until assigned.
    TIntrusivePtr<TKeyColumns> KeyColumns;
    TIntrusivePtr<TColumnStatMap> ColumnStatistics;
    TIntrusivePtr<TShuffledByColumns> ShuffledByColumns;
    TIntrusivePtr<TSortColumns> SortColumns;

    EStorageType StorageType = EStorageType::NA;
    // Provider-specific payload (see IProviderStatistics); null when absent.
    std::shared_ptr<IProviderStatistics> Specific;
    std::shared_ptr<TVector<TString>> Labels = {};

    TString SourceTableName;
    TSimpleSharedPtr<THashSet<TString>> Aliases;
    TIntrusivePtr<NDq::TTableAliasMap> TableAliases;

    // Ordering information.
    NDq::TOrderingsStateMachine::TLogicalOrderings LogicalOrderings;
    std::optional<std::size_t> ShuffleOrderingIdx;

    // All copy/move operations are the compiler-generated member-wise ones.
    TOptimizerStatistics(TOptimizerStatistics&&) = default;
    TOptimizerStatistics& operator=(TOptimizerStatistics&&) = default;
    TOptimizerStatistics(const TOptimizerStatistics&) = default;
    TOptimizerStatistics& operator=(const TOptimizerStatistics&) = default;
    TOptimizerStatistics() = default;

    // Defined out of line. Only `type` is mandatory; the defaults of the other
    // parameters match the in-class member initializers above.
    TOptimizerStatistics(
        EStatisticsType type,
        double nrows = 0.0,
        int ncols = 0,
        double byteSize = 0.0,
        double cost = 0.0,
        TIntrusivePtr<TKeyColumns> keyColumns = {},
        TIntrusivePtr<TColumnStatMap> columnMap = {},
        EStorageType storageType = EStorageType::NA,
        std::shared_ptr<IProviderStatistics> specific = nullptr
    );

    // The following members are defined out of line; their exact semantics are
    // not visible in this header.
    TOptimizerStatistics& operator+=(const TOptimizerStatistics& other);
    bool Empty() const;

    friend std::ostream& operator<<(std::ostream& os, const TOptimizerStatistics& s);
    TString ToString() const;
};
// Declaration only; the definition is not part of this header.
// Takes existing statistics `s`, a table path and a shared JSON value `stats`,
// and returns a shared_ptr to a TOptimizerStatistics object.
// NOTE(review): presumably the result is a copy of `s` with fields replaced by
// the overrides that `stats` provides for `tablePath` — confirm against the
// definition, including what is returned when `stats` is null.
std::shared_ptr<TOptimizerStatistics> OverrideStatistics(const TOptimizerStatistics& s, const TStringBuf& tablePath, const std::shared_ptr<NJson::TJsonValue>& stats);
}
|