aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Columns/ColumnLowCardinality.h
blob: 91bd5945fd912cffa7aa72b88f82ed67de37fea0 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
#pragma once
#include <Columns/IColumn.h>
#include <Columns/IColumnUnique.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include "ColumnsNumber.h"


namespace DB
{

namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
}

/**
 * How data is stored (in a nutshell):
 * we have a dictionary @e reverse_index in ColumnUnique that holds pairs (DataType, UIntXX) and a column
 * with UIntXX holding actual data indices.
 * To obtain the value's index, call #getOrFindIndex.
 * To operate on the data (so called indices column), call #getIndexes.
 *
 * @note The indices column always contains the default value (empty StringRef) with the first index.
 */
class ColumnLowCardinality final : public COWHelper<IColumn, ColumnLowCardinality>
{
    friend class COWHelper<IColumn, ColumnLowCardinality>;

    ColumnLowCardinality(MutableColumnPtr && column_unique, MutableColumnPtr && indexes, bool is_shared = false);
    ColumnLowCardinality(const ColumnLowCardinality & other) = default;

public:
    /** Create immutable column using immutable arguments. This arguments may be shared with other columns.
      * Use IColumn::mutate in order to make mutable column and mutate shared nested columns.
      */
    using Base = COWHelper<IColumn, ColumnLowCardinality>;
    static Ptr create(const ColumnPtr & column_unique_, const ColumnPtr & indexes_, bool is_shared = false)
    {
        return ColumnLowCardinality::create(column_unique_->assumeMutable(), indexes_->assumeMutable(), is_shared);
    }

    static MutablePtr create(MutableColumnPtr && column_unique, MutableColumnPtr && indexes, bool is_shared = false)
    {
        return Base::create(std::move(column_unique), std::move(indexes), is_shared);
    }

    std::string getName() const override { return "ColumnLowCardinality"; }
    const char * getFamilyName() const override { return "ColumnLowCardinality"; }
    TypeIndex getDataType() const override { return TypeIndex::LowCardinality; }

    ColumnPtr convertToFullColumn() const { return getDictionary().getNestedColumn()->index(getIndexes(), 0); }
    ColumnPtr convertToFullColumnIfLowCardinality() const override { return convertToFullColumn(); }

    MutableColumnPtr cloneResized(size_t size) const override;
    size_t size() const override { return getIndexes().size(); }

    Field operator[](size_t n) const override { return getDictionary()[getIndexes().getUInt(n)]; }
    void get(size_t n, Field & res) const override { getDictionary().get(getIndexes().getUInt(n), res); }

    StringRef getDataAt(size_t n) const override { return getDictionary().getDataAt(getIndexes().getUInt(n)); }

    bool isDefaultAt(size_t n) const override { return getDictionary().isDefaultAt(getIndexes().getUInt(n)); }
    UInt64 get64(size_t n) const override { return getDictionary().get64(getIndexes().getUInt(n)); }
    UInt64 getUInt(size_t n) const override { return getDictionary().getUInt(getIndexes().getUInt(n)); }
    Int64 getInt(size_t n) const override { return getDictionary().getInt(getIndexes().getUInt(n)); }
    Float64 getFloat64(size_t n) const override { return getDictionary().getFloat64(getIndexes().getUInt(n)); }
    Float32 getFloat32(size_t n) const override { return getDictionary().getFloat32(getIndexes().getUInt(n)); }
    bool getBool(size_t n) const override { return getDictionary().getBool(getIndexes().getUInt(n)); }
    bool isNullAt(size_t n) const override { return getDictionary().isNullAt(getIndexes().getUInt(n)); }
    ColumnPtr cut(size_t start, size_t length) const override
    {
        return ColumnLowCardinality::create(dictionary.getColumnUniquePtr(), getIndexes().cut(start, length));
    }

    void insert(const Field & x) override;
    void insertDefault() override;

    void insertFrom(const IColumn & src, size_t n) override;
    void insertFromFullColumn(const IColumn & src, size_t n);

    void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
    void insertRangeFromFullColumn(const IColumn & src, size_t start, size_t length);
    void insertRangeFromDictionaryEncodedColumn(const IColumn & keys, const IColumn & positions);

    void insertData(const char * pos, size_t length) override;

    void popBack(size_t n) override { idx.popBack(n); }

    StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin, const UInt8 *) const override;

    const char * deserializeAndInsertFromArena(const char * pos) override;

    const char * skipSerializedInArena(const char * pos) const override;

    void updateHashWithValue(size_t n, SipHash & hash) const override
    {
        return getDictionary().updateHashWithValue(getIndexes().getUInt(n), hash);
    }

    void updateWeakHash32(WeakHash32 & hash) const override;

    void updateHashFast(SipHash &) const override;

    ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override
    {
        return ColumnLowCardinality::create(dictionary.getColumnUniquePtr(), getIndexes().filter(filt, result_size_hint));
    }

    void expand(const Filter & mask, bool inverted) override
    {
        idx.getPositionsPtr()->expand(mask, inverted);
    }

    ColumnPtr permute(const Permutation & perm, size_t limit) const override
    {
        return ColumnLowCardinality::create(dictionary.getColumnUniquePtr(), getIndexes().permute(perm, limit));
    }

    ColumnPtr index(const IColumn & indexes_, size_t limit) const override
    {
        return ColumnLowCardinality::create(dictionary.getColumnUniquePtr(), getIndexes().index(indexes_, limit));
    }

    int compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const override;

    void compareColumn(const IColumn & rhs, size_t rhs_row_num,
                       PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
                       int direction, int nan_direction_hint) const override;

    int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator &) const override;

    bool hasEqualValues() const override;

    void getPermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
                        size_t limit, int nan_direction_hint, Permutation & res) const override;

    void updatePermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
                        size_t limit, int, IColumn::Permutation & res, EqualRanges & equal_ranges) const override;

    void getPermutationWithCollation(const Collator & collator, IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
                        size_t limit, int nan_direction_hint, Permutation & res) const override;

    void updatePermutationWithCollation(const Collator & collator, IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
                        size_t limit, int nan_direction_hint, Permutation & res, EqualRanges& equal_ranges) const override;

    ColumnPtr replicate(const Offsets & offsets) const override
    {
        return ColumnLowCardinality::create(dictionary.getColumnUniquePtr(), getIndexes().replicate(offsets));
    }

    std::vector<MutableColumnPtr> scatter(ColumnIndex num_columns, const Selector & selector) const override;

    void gather(ColumnGathererStream & gatherer_stream) override;

    void getExtremes(Field & min, Field & max) const override
    {
        return dictionary.getColumnUnique().getNestedColumn()->index(getIndexes(), 0)->getExtremes(min, max); /// TODO: optimize
    }

    void reserve(size_t n) override { idx.reserve(n); }

    /// Don't count the dictionary size as it can be shared between different blocks.
    size_t byteSize() const override { return idx.getPositions()->byteSize(); }

    size_t byteSizeAt(size_t n) const override { return getDictionary().byteSizeAt(getIndexes().getUInt(n)); }
    size_t allocatedBytes() const override { return idx.getPositions()->allocatedBytes() + getDictionary().allocatedBytes(); }

    void forEachSubcolumn(MutableColumnCallback callback) override
    {
        callback(idx.getPositionsPtr());

        /// Column doesn't own dictionary if it's shared.
        if (!dictionary.isShared())
            callback(dictionary.getColumnUniquePtr());
    }

    void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override
    {
        callback(*idx.getPositionsPtr());
        idx.getPositionsPtr()->forEachSubcolumnRecursively(callback);

        /// Column doesn't own dictionary if it's shared.
        if (!dictionary.isShared())
        {
            callback(*dictionary.getColumnUniquePtr());
            dictionary.getColumnUniquePtr()->forEachSubcolumnRecursively(callback);
        }
    }

    bool structureEquals(const IColumn & rhs) const override
    {
        if (const auto * rhs_low_cardinality = typeid_cast<const ColumnLowCardinality *>(&rhs))
            return idx.getPositions()->structureEquals(*rhs_low_cardinality->idx.getPositions())
                && dictionary.getColumnUnique().structureEquals(rhs_low_cardinality->dictionary.getColumnUnique());
        return false;
    }

    double getRatioOfDefaultRows(double sample_ratio) const override
    {
        return getIndexes().getRatioOfDefaultRows(sample_ratio);
    }

    UInt64 getNumberOfDefaultRows() const override
    {
        return getIndexes().getNumberOfDefaultRows();
    }

    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
    {
        return getIndexes().getIndicesOfNonDefaultRows(indices, from, limit);
    }

    bool valuesHaveFixedSize() const override { return getDictionary().valuesHaveFixedSize(); }
    bool isFixedAndContiguous() const override { return false; }
    size_t sizeOfValueIfFixed() const override { return getDictionary().sizeOfValueIfFixed(); }
    bool isNumeric() const override { return getDictionary().isNumeric(); }
    bool lowCardinality() const override { return true; }
    bool isCollationSupported() const override { return getDictionary().getNestedColumn()->isCollationSupported(); }

    /**
     * Checks if the dictionary column is Nullable(T).
     * So LC(Nullable(T)) would return true, LC(U) -- false.
     */
    bool nestedIsNullable() const { return isColumnNullable(*dictionary.getColumnUnique().getNestedColumn()); }
    bool nestedCanBeInsideNullable() const { return dictionary.getColumnUnique().getNestedColumn()->canBeInsideNullable(); }
    void nestedToNullable() { dictionary.getColumnUnique().nestedToNullable(); }
    void nestedRemoveNullable() { dictionary.getColumnUnique().nestedRemoveNullable(); }
    MutableColumnPtr cloneNullable() const;

    ColumnPtr cloneWithDefaultOnNull() const;

    const IColumnUnique & getDictionary() const { return dictionary.getColumnUnique(); }
    IColumnUnique & getDictionary() { return dictionary.getColumnUnique(); }
    const ColumnPtr & getDictionaryPtr() const { return dictionary.getColumnUniquePtr(); }
    ColumnPtr & getDictionaryPtr() { return dictionary.getColumnUniquePtr(); }
    /// IColumnUnique & getUnique() { return static_cast<IColumnUnique &>(*column_unique); }
    /// ColumnPtr getUniquePtr() const { return column_unique; }

    /// IColumn & getIndexes() { return *idx.getPositions(); }
    const IColumn & getIndexes() const { return *idx.getPositions(); }
    const ColumnPtr & getIndexesPtr() const { return idx.getPositions(); }
    size_t getSizeOfIndexType() const { return idx.getSizeOfIndexType(); }

    ALWAYS_INLINE size_t getIndexAt(size_t row) const
    {
        const IColumn * indexes = &getIndexes();

        switch (idx.getSizeOfIndexType())
        {
            case sizeof(UInt8): return assert_cast<const ColumnUInt8 *>(indexes)->getElement(row);
            case sizeof(UInt16): return assert_cast<const ColumnUInt16 *>(indexes)->getElement(row);
            case sizeof(UInt32): return assert_cast<const ColumnUInt32 *>(indexes)->getElement(row);
            case sizeof(UInt64): return assert_cast<const ColumnUInt64 *>(indexes)->getElement(row);
            default: throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected size of index type for low cardinality column.");
        }
    }

    ///void setIndexes(MutableColumnPtr && indexes_) { indexes = std::move(indexes_); }

    /// Set shared ColumnUnique for empty low cardinality column.
    void setSharedDictionary(const ColumnPtr & column_unique);
    bool isSharedDictionary() const { return dictionary.isShared(); }

    /// Create column with new dictionary from column part.
    /// Dictionary will have only keys that are mentioned in index.
    MutablePtr cutAndCompact(size_t start, size_t length) const;

    struct DictionaryEncodedColumn
    {
        ColumnPtr dictionary;
        ColumnPtr indexes;
    };

    DictionaryEncodedColumn getMinimalDictionaryEncodedColumn(UInt64 offset, UInt64 limit) const;

    ColumnPtr countKeys() const;

    bool containsNull() const;

    class Index
    {
    public:
        Index();
        Index(const Index & other) = default;
        explicit Index(MutableColumnPtr && positions_);
        explicit Index(ColumnPtr positions_);

        const ColumnPtr & getPositions() const { return positions; }
        WrappedPtr & getPositionsPtr() { return positions; }
        const WrappedPtr & getPositionsPtr() const { return positions; }
        size_t getPositionAt(size_t row) const;
        void insertPosition(UInt64 position);
        void insertPositionsRange(const IColumn & column, UInt64 offset, UInt64 limit);

        void popBack(size_t n) { positions->popBack(n); }
        void reserve(size_t n) { positions->reserve(n); }

        UInt64 getMaxPositionForCurrentType() const;

        static size_t getSizeOfIndexType(const IColumn & column, size_t hint);
        size_t getSizeOfIndexType() const { return size_of_type; }

        void checkSizeOfType();

        MutableColumnPtr detachPositions() { return IColumn::mutate(std::move(positions)); }
        void attachPositions(MutableColumnPtr positions_);

        void countKeys(ColumnUInt64::Container & counts) const;

        bool containsDefault() const;

        void updateWeakHash(WeakHash32 & hash, WeakHash32 & dict_hash) const;

    private:
        WrappedPtr positions;
        size_t size_of_type = 0;

        void updateSizeOfType() { size_of_type = getSizeOfIndexType(*positions, size_of_type); }
        void expandType();

        template <typename IndexType>
        typename ColumnVector<IndexType>::Container & getPositionsData();

        template <typename IndexType>
        const typename ColumnVector<IndexType>::Container & getPositionsData() const;

        template <typename IndexType>
        void convertPositions();

        template <typename Callback>
        static void callForType(Callback && callback, size_t size_of_type);
    };

private:
    class Dictionary
    {
    public:
        Dictionary(const Dictionary & other) = default;
        explicit Dictionary(MutableColumnPtr && column_unique, bool is_shared);
        explicit Dictionary(ColumnPtr column_unique, bool is_shared);

        const WrappedPtr & getColumnUniquePtr() const { return column_unique; }
        WrappedPtr & getColumnUniquePtr() { return column_unique; }

        const IColumnUnique & getColumnUnique() const { return static_cast<const IColumnUnique &>(*column_unique); }
        IColumnUnique & getColumnUnique() { return static_cast<IColumnUnique &>(*column_unique); }

        /// Dictionary may be shared for several mutable columns.
        /// Immutable columns may have the same column unique, which isn't necessarily shared dictionary.
        void setShared(const ColumnPtr & column_unique_);
        bool isShared() const { return shared; }

        /// Create new dictionary with only keys that are mentioned in positions.
        void compact(MutableColumnPtr & positions);

        static MutableColumnPtr compact(const IColumnUnique & column_unique, MutableColumnPtr & positions);

    private:
        WrappedPtr column_unique;
        bool shared = false;
    };

    Dictionary dictionary;
    Index idx;

    void compactInplace();
    void compactIfSharedDictionary();

    int compareAtImpl(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator * collator=nullptr) const;

    void getPermutationImpl(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability, size_t limit, int nan_direction_hint, Permutation & res, const Collator * collator = nullptr) const;
};

bool isColumnLowCardinalityNullable(const IColumn & column);


}