1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
|
#pragma once
#include <Columns/IColumn.h>
#include <unordered_map>
namespace DB
{
/// Polymorphic base class for auxiliary information attached to a chunk.
/// It declares no state of its own; the virtual destructor lets derived
/// objects be destroyed correctly through a pointer to ChunkInfo.
class ChunkInfo
{
public:
    ChunkInfo() = default;
    virtual ~ChunkInfo() = default;
};

/// Shared handle to an immutable ChunkInfo.
using ChunkInfoPtr = std::shared_ptr<const ChunkInfo>;
/**
* Chunk is a list of columns with the same length.
* Chunk stores the number of rows in a separate field and maintains the invariant that all columns have equal length.
*
* Chunk has move-only semantics. It is more lightweight than a block because it doesn't store names, types and index_by_name.
*
* Chunk can have an empty set of columns but a non-zero number of rows. It helps when only the number of rows is needed.
* Chunk can have columns with zero rows. It may happen, for example, if all rows were filtered.
* Chunk is empty only if it has zero rows and an empty list of columns.
*
* Any ChunkInfo may be attached to a chunk.
* It may be useful if additional info per chunk is needed. For example, bucket number for aggregated data.
**/
class Chunk
{
public:
    /// Creates an empty chunk: no columns, zero rows, no attached info.
    Chunk() = default;

    /// Chunk is move-only: both copy operations are deleted.
    Chunk(const Chunk & other) = delete;

    /// Takes over the columns and the attached info of `other` and copies its row count.
    /// The row count of `other` is reset to zero afterwards.
    Chunk(Chunk && other) noexcept
        : columns(std::move(other.columns))
        , num_rows(other.num_rows)
        , chunk_info(std::move(other.chunk_info))
    {
        other.num_rows = 0;
    }

    /// Constructors from a set of columns, a row count and, optionally, a chunk info.
    /// They are defined out of line.
    Chunk(Columns columns_, UInt64 num_rows_);
    Chunk(Columns columns_, UInt64 num_rows_, ChunkInfoPtr chunk_info_);
    Chunk(MutableColumns columns_, UInt64 num_rows_);
    Chunk(MutableColumns columns_, UInt64 num_rows_, ChunkInfoPtr chunk_info_);

    Chunk & operator=(const Chunk & other) = delete;

    /// Move assignment: takes over the columns and the attached info of `other`, copies its row count
    /// and resets the row count of `other` to zero.
    Chunk & operator=(Chunk && other) noexcept
    {
        /// Self-move must be a no-op. Without this guard the row count of the chunk would be
        /// reset to zero (and its columns moved from) although nothing was supposed to change.
        if (this == &other)
            return *this;

        columns = std::move(other.columns);
        chunk_info = std::move(other.chunk_info);
        num_rows = other.num_rows;
        other.num_rows = 0;
        return *this;
    }

    /// Returns a chunk by value. Defined out of line.
    /// NOTE(review): by the name this is the explicit replacement for the deleted copy operations - confirm.
    Chunk clone() const;

    /// Exchanges columns, attached info and row count with `other`.
    /// It only swaps the three members, which is not expected to throw, hence noexcept.
    void swap(Chunk & other) noexcept
    {
        columns.swap(other.columns);
        chunk_info.swap(other.chunk_info);
        std::swap(num_rows, other.num_rows);
    }

    /// Resets the chunk to the empty state: zero rows, no columns, no attached info.
    void clear()
    {
        num_rows = 0;
        columns.clear();
        chunk_info.reset();
    }

    /// Read-only access to the stored columns.
    const Columns & getColumns() const { return columns; }

    /// Setters taking a new set of columns together with a row count. Defined out of line.
    void setColumns(Columns columns_, UInt64 num_rows_);
    void setColumns(MutableColumns columns_, UInt64 num_rows_);

    /// Both return columns by value and are non-const. Defined out of line.
    /// NOTE(review): by the names they hand the columns over to the caller - confirm what state the chunk is left in.
    Columns detachColumns();
    MutableColumns mutateColumns();

    /** Get empty columns with the same types as in chunk. */
    MutableColumns cloneEmptyColumns() const;

    /// Attached chunk info; the pointer is null when nothing is attached.
    const ChunkInfoPtr & getChunkInfo() const { return chunk_info; }
    bool hasChunkInfo() const { return chunk_info != nullptr; }
    void setChunkInfo(ChunkInfoPtr chunk_info_) { chunk_info = std::move(chunk_info_); }

    /// The row count is stored separately from the columns, so it may be non-zero even without columns.
    UInt64 getNumRows() const { return num_rows; }
    UInt64 getNumColumns() const { return columns.size(); }
    bool hasRows() const { return num_rows > 0; }
    bool hasColumns() const { return !columns.empty(); }

    /// A chunk is empty only if it has neither rows nor columns.
    bool empty() const { return !hasRows() && !hasColumns(); }

    /// True for any non-empty chunk.
    explicit operator bool() const { return !empty(); }

    /// Column-level modifications. Defined out of line.
    void addColumn(ColumnPtr column);
    void addColumn(size_t position, ColumnPtr column);
    void erase(size_t position);

    /// Size statistics. Defined out of line.
    UInt64 bytes() const;
    UInt64 allocatedBytes() const;

    /// Returns a textual description of the chunk. Defined out of line.
    std::string dumpStructure() const;

    void append(const Chunk & chunk);
    void append(const Chunk & chunk, size_t from, size_t length); // append rows [from, from+length) of chunk

private:
    Columns columns;
    UInt64 num_rows = 0;
    ChunkInfoPtr chunk_info;

    /// Defined out of line.
    /// NOTE(review): presumably verifies that every column has exactly num_rows rows - confirm.
    void checkNumRowsIsConsistent();
};

using Chunks = std::vector<Chunk>;
/// Chunk info for AsyncInsert. It carries two kinds of information:
/// - the offsets of the different sub-chunks;
/// - the tokens of the different sub-chunks, which are assigned by setting `insert_deduplication_token`.
class AsyncInsertInfo : public ChunkInfo
{
public:
    std::vector<size_t> offsets;
    std::vector<String> tokens;

    /// Creates an info object with no offsets and no tokens.
    AsyncInsertInfo() = default;

    /// Creates an info object that stores copies of the given offsets and tokens.
    explicit AsyncInsertInfo(const std::vector<size_t> & offsets_, const std::vector<String> & tokens_)
        : offsets(offsets_)
        , tokens(tokens_)
    {
    }
};

/// Shared handle to a mutable AsyncInsertInfo.
using AsyncInsertInfoPtr = std::shared_ptr<AsyncInsertInfo>;
/// Chunk info that supports delayed defaults.
/// AddingDefaultsProcessor uses it to replace missing values with column defaults.
class ChunkMissingValues : public ChunkInfo
{
public:
    /// One bit per row of a single column.
    using RowsBitMask = std::vector<bool>;

    /// Number of columns that currently have a mask stored.
    size_t size() const { return rows_mask_by_column_id.size(); }

    /// True when no column has a mask stored.
    bool empty() const { return rows_mask_by_column_id.empty(); }

    /// Drops the masks of all columns.
    void clear() { rows_mask_by_column_id.clear(); }

    /// Per-column mask accessors; both are defined out of line.
    const RowsBitMask & getDefaultsBitmask(size_t column_idx) const;
    void setBit(size_t column_idx, size_t row_idx);

private:
    using RowsMaskByColumnId = std::unordered_map<size_t, RowsBitMask>;

    /// If rows_mask_by_column_id[column_id][row_id] is true, the related value in the Block
    /// should be replaced with the column default.
    /// It may contain fewer columns and rows than the related block.
    RowsMaskByColumnId rows_mask_by_column_id;
};
/// Converts all columns in the chunk to full serialization.
/// It's needed when you have to access the internals of a column,
/// or when you need to perform an operation on two columns
/// whose structure must be equal (e.g. compareAt).
/// Both functions take the chunk by non-const reference and return nothing.
/// NOTE(review): judging by the names, convertToFullIfConst handles constant columns and
/// convertToFullIfSparse handles sparse columns; the definitions are not in this header - confirm.
void convertToFullIfConst(Chunk & chunk);
void convertToFullIfSparse(Chunk & chunk);
/// Creates a chunk with the same columns as `chunk`, but makes them constants
/// with a default value and the specified number of rows (`num_rows`).
/// The source chunk is taken by const reference; the new chunk is returned by value.
Chunk cloneConstWithDefault(const Chunk & chunk, size_t num_rows);
}
|