1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
|
#pragma once
#include <string>
#include <map>
#include <unordered_map>
#include <vector>
#include <memory>
#include <utility>
#include <mutex>
#include <Core/Block.h>
#include <Storages/StorageInMemoryMetadata.h>
#include <Storages/MergeTree/GinIndexStore.h>
#include <Storages/MergeTree/MergeTreeDataPartChecksum.h>
#include <Storages/SelectQueryInfo.h>
#include <Storages/MergeTree/MarkRange.h>
#include <Storages/MergeTree/IDataPartStorage.h>
#include <Interpreters/ExpressionActions.h>
#include <DataTypes/DataTypeLowCardinality.h>
/// Prefix put in front of an index name to build the index file name
/// (see IMergeTreeIndex::getFileName()).
constexpr const char * INDEX_FILE_PREFIX = "skp_idx_";
namespace DB
{
/// Error codes referenced by this header; the values themselves are defined elsewhere.
namespace ErrorCodes
{
    extern const int NOT_IMPLEMENTED;
}
/// Numeric version of a serialized index format. Zero is used as the "unknown" value.
using MergeTreeIndexVersion = uint8_t;

/// Format of a serialized index: its version together with the file extension.
///
/// Both members default to the "unknown" format ({0, ""}), so a default-initialized
/// object is well defined and converts to false. The struct remains an aggregate,
/// hence `MergeTreeIndexFormat{1, ".idx"}` and `return {1, ".idx"};` keep working.
struct MergeTreeIndexFormat
{
    MergeTreeIndexVersion version = 0;
    const char * extension = "";

    /// True when the format is known, i.e. the version is not zero.
    explicit operator bool() const { return version != 0; }
};
/// Holds index information about a single block of data.
/// A granule can be written to a WriteBuffer and restored from a ReadBuffer.
struct IMergeTreeIndexGranule
{
    virtual ~IMergeTreeIndexGranule() = default;

    /// Writes the granule to `ostr`. Serialization always uses the last version.
    virtual void serializeBinary(WriteBuffer & ostr) const = 0;

    /// Reads the granule from `istr`. `version` is the version of the index to deserialize:
    ///
    /// - 2 -- minmax index for proper Nullable support,
    /// - 1 -- everything else.
    ///
    /// The implementation is responsible for checking the version
    /// and must throw LOGICAL_ERROR if the version is not supported.
    ///
    /// See also:
    /// - IMergeTreeIndex::getSerializedFileExtension()
    /// - IMergeTreeIndex::getDeserializedFormat()
    /// - MergeTreeDataMergerMutator::collectFilesToSkip()
    /// - MergeTreeDataMergerMutator::collectFilesForRenames()
    virtual void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) = 0;

    virtual bool empty() const = 0;
};

using MergeTreeIndexGranulePtr = std::shared_ptr<IMergeTreeIndexGranule>;
using MergeTreeIndexGranules = std::vector<MergeTreeIndexGranulePtr>;
/// Accumulates index information about a single block of data.
struct IMergeTreeIndexAggregator
{
    virtual ~IMergeTreeIndexAggregator() = default;

    virtual bool empty() const = 0;

    /// NOTE(review): judging by the name, returns the granule accumulated so far
    /// and resets the aggregator -- confirm against the implementations.
    virtual MergeTreeIndexGranulePtr getGranuleAndReset() = 0;

    /// Feeds rows of `block` into the stored info.
    /// At most `limit` rows are read.
    /// When the update is finished, `*pos` holds the position of the first row that was not read.
    virtual void update(const Block & block, size_t * pos, size_t limit) = 0;
};

using MergeTreeIndexAggregatorPtr = std::shared_ptr<IMergeTreeIndexAggregator>;
using MergeTreeIndexAggregators = std::vector<MergeTreeIndexAggregatorPtr>;
/// A condition evaluated against index granules.
class IMergeTreeIndexCondition
{
public:
    virtual ~IMergeTreeIndexCondition() = default;

    /// Used to check whether this index is useful for the query.
    /// NOTE(review): by its name, `true` presumably means the condition cannot
    /// exclude anything -- confirm against the implementations.
    virtual bool alwaysUnknownOrTrue() const = 0;

    /// NOTE(review): by its name, tells whether the condition may hold for rows
    /// covered by `granule` -- confirm against the implementations.
    virtual bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr granule) const = 0;
};

using MergeTreeIndexConditionPtr = std::shared_ptr<IMergeTreeIndexCondition>;
using MergeTreeIndexConditions = std::vector<MergeTreeIndexConditionPtr>;
/// Forward declaration: the struct is defined further down in this file, but
/// IMergeTreeIndexMergedCondition::addIndex() already takes a MergeTreeIndexPtr.
struct IMergeTreeIndex;
/// Shared pointer to an immutable index object.
using MergeTreeIndexPtr = std::shared_ptr<const IMergeTreeIndex>;
/// IndexCondition that checks several indexes at the same time.
///
/// Indexes are supplied one by one through addIndex(); the granules of those indexes
/// are then passed together to mayBeTrueOnGranule().
class IMergeTreeIndexMergedCondition
{
public:
    /// `granularity_` is stored as is and exposed to derived classes.
    explicit IMergeTreeIndexMergedCondition(size_t granularity_)
        : granularity(granularity_)
    {
    }

    virtual ~IMergeTreeIndexMergedCondition() = default;

    /// Adds one more index to the set checked by this condition.
    virtual void addIndex(const MergeTreeIndexPtr & index) = 0;

    /// NOTE(review): by its name, `true` presumably means the merged condition cannot
    /// exclude anything -- confirm against the implementations.
    virtual bool alwaysUnknownOrTrue() const = 0;

    /// NOTE(review): by its name, tells whether the condition may hold given `granules`
    /// -- confirm the expected order/content of `granules` against the implementations.
    virtual bool mayBeTrueOnGranule(const MergeTreeIndexGranules & granules) const = 0;

protected:
    const size_t granularity;
};

using MergeTreeIndexMergedConditionPtr = std::shared_ptr<IMergeTreeIndexMergedCondition>;
/// A list of merged conditions. Elements are pointers: the interface is abstract,
/// so objects of it cannot be stored by value, and the sibling plural aliases
/// (granules, aggregators, conditions) are vectors of pointers as well.
using MergeTreeIndexMergedConditions = std::vector<MergeTreeIndexMergedConditionPtr>;
/// Base of a data skipping index. It is bound to one IndexDescription and creates
/// the granules, aggregators and conditions for that index.
struct IMergeTreeIndex
{
    explicit IMergeTreeIndex(const IndexDescription & index_)
        : index(index_)
    {
    }

    virtual ~IMergeTreeIndex() = default;

    /// File name of the index without extension: INDEX_FILE_PREFIX followed by the index name.
    String getFileName() const
    {
        return INDEX_FILE_PREFIX + index.name;
    }

    /// Granularity taken from the index description.
    size_t getGranularity() const
    {
        return index.granularity;
    }

    /// Not mergeable unless a derived index says otherwise.
    virtual bool isMergeable() const
    {
        return false;
    }

    /// Extension used when the index is serialized.
    /// Override it to introduce a new index format.
    ///
    /// NOTE: If getSerializedFileExtension() is overridden,
    /// getDeserializedFormat() should be overridden as well
    /// and must keep checking all the previous extensions
    /// (to avoid breaking backward compatibility).
    virtual const char * getSerializedFileExtension() const
    {
        return ".idx";
    }

    /// Format to use for deserialization.
    ///
    /// Returns a MergeTreeIndexFormat (version plus extension): version 1 with ".idx"
    /// if `relative_path_prefix + ".idx"` exists in `data_part_storage`,
    /// otherwise the unknown format (version 0, empty extension).
    virtual MergeTreeIndexFormat getDeserializedFormat(const IDataPartStorage & data_part_storage, const std::string & relative_path_prefix) const
    {
        if (!data_part_storage.exists(relative_path_prefix + ".idx"))
            return MergeTreeIndexFormat{0 /*unknown*/, ""};

        return MergeTreeIndexFormat{1, ".idx"};
    }

    /// Checks whether the column is in data skipping index.
    virtual bool mayBenefitFromIndexForIn(const ASTPtr & node) const = 0;

    virtual MergeTreeIndexGranulePtr createIndexGranule() const = 0;

    virtual MergeTreeIndexAggregatorPtr createIndexAggregator() const = 0;

    /// By default the store is ignored and createIndexAggregator() is used.
    virtual MergeTreeIndexAggregatorPtr createIndexAggregatorForPart(const GinIndexStorePtr & /*store*/) const
    {
        return createIndexAggregator();
    }

    virtual MergeTreeIndexConditionPtr createIndexCondition(
        const SelectQueryInfo & query_info, ContextPtr context) const = 0;

    /// Merged conditions are optional: the default implementation throws NOT_IMPLEMENTED.
    virtual MergeTreeIndexMergedConditionPtr createIndexMergedCondition(
        const SelectQueryInfo & /*query_info*/, StorageMetadataPtr /*storage_metadata*/) const
    {
        throw Exception(ErrorCodes::NOT_IMPLEMENTED,
            "MergedCondition is not implemented for index of type {}", index.type);
    }

    /// Columns required by the index expression.
    Names getColumnsRequiredForIndexCalc() const
    {
        return index.expression->getRequiredColumns();
    }

    /// The description is held by reference, so it has to outlive the index object.
    const IndexDescription & index;
};

using MergeTreeIndexPtr = std::shared_ptr<const IMergeTreeIndex>;
using MergeTreeIndices = std::vector<MergeTreeIndexPtr>;
/// Registry of index creators and validators keyed by index type name.
/// The class is non-copyable and its constructor is protected; objects are
/// reached through instance().
class MergeTreeIndexFactory : private boost::noncopyable
{
public:
    /// Access point to the factory (presumably one shared instance -- the definition is not in this header).
    static MergeTreeIndexFactory & instance();

    /// Builds an index object from its description.
    using Creator = std::function<MergeTreeIndexPtr(const IndexDescription & index)>;

    /// Checks an index description; receives the `attach` flag passed to validate().
    using Validator = std::function<void(const IndexDescription & index, bool attach)>;

    void validate(const IndexDescription & index, bool attach) const;

    MergeTreeIndexPtr get(const IndexDescription & index) const;

    MergeTreeIndices getMany(const std::vector<IndexDescription> & indices) const;

    void registerCreator(const std::string & index_type, Creator creator);

    void registerValidator(const std::string & index_type, Validator validator);

protected:
    MergeTreeIndexFactory();

private:
    using Creators = std::unordered_map<std::string, Creator>;
    using Validators = std::unordered_map<std::string, Validator>;

    Creators creators;
    Validators validators;
};
/// Creator / validator pairs of the individual index types.
/// Each pair matches MergeTreeIndexFactory::Creator and MergeTreeIndexFactory::Validator.

MergeTreeIndexPtr minmaxIndexCreator(const IndexDescription & index);
void minmaxIndexValidator(const IndexDescription & index, bool attach);

MergeTreeIndexPtr setIndexCreator(const IndexDescription & index);
void setIndexValidator(const IndexDescription & index, bool attach);

MergeTreeIndexPtr bloomFilterIndexCreator(const IndexDescription & index);
void bloomFilterIndexValidator(const IndexDescription & index, bool attach);

/// A second bloom filter pair; how it differs from the pair above is not visible from this header.
MergeTreeIndexPtr bloomFilterIndexCreatorNew(const IndexDescription & index);
void bloomFilterIndexValidatorNew(const IndexDescription & index, bool attach);

MergeTreeIndexPtr hypothesisIndexCreator(const IndexDescription & index);
void hypothesisIndexValidator(const IndexDescription & index, bool attach);

/// Declared only when ENABLE_ANNOY is defined.
#ifdef ENABLE_ANNOY
MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index);
void annoyIndexValidator(const IndexDescription & index, bool attach);
#endif

/// Declared only when ENABLE_USEARCH is defined.
#ifdef ENABLE_USEARCH
MergeTreeIndexPtr usearchIndexCreator(const IndexDescription & index);
void usearchIndexValidator(const IndexDescription & index, bool attach);
#endif

MergeTreeIndexPtr invertedIndexCreator(const IndexDescription & index);
void invertedIndexValidator(const IndexDescription & index, bool attach);
}
|