aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Common/CompactArray.h
blob: 289b78d7d9ac75eb6465fab4bb8ed749434b61d4 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
#pragma once

#include <IO/ReadBuffer.h>
#include <IO/WriteBuffer.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Core/Defines.h>

namespace DB
{

namespace ErrorCodes
{
    extern const int NO_AVAILABLE_DATA;
}


/** Compact array for data storage, size `content_width`, in bits, of which is
  * less than one byte. Instead of storing each value in a separate
  * bytes, which leads to a waste of 37.5% of the space for content_width = 5, CompactArray stores
  * adjacent `content_width`-bit values in the byte array, that is actually CompactArray
  * simulates an array of `content_width`-bit values.
  */
template <typename BucketIndex, UInt8 content_width, size_t bucket_count>
class CompactArray final
{
public:
    class Reader;
    class Locus;

    CompactArray() = default;

    UInt8 ALWAYS_INLINE operator[](BucketIndex bucket_index) const
    {
        Locus locus(bucket_index);

        if (locus.index_l == locus.index_r)
            return locus.read(bitset[locus.index_l]);
        else
            return locus.read(bitset[locus.index_l], bitset[locus.index_r]);
    }

    Locus ALWAYS_INLINE operator[](BucketIndex bucket_index)
    {
        Locus locus(bucket_index);

        locus.content_l = &bitset[locus.index_l];

        if (locus.index_l == locus.index_r)
            locus.content_r = locus.content_l;
        else
            locus.content_r = &bitset[locus.index_r];

        return locus;
    }

private:
    /// number of bytes in bitset
    static constexpr size_t BITSET_SIZE = (static_cast<size_t>(bucket_count) * content_width + 7) / 8;
    UInt8 bitset[BITSET_SIZE] = { 0 };
};

/** A class for sequentially reading cells from a compact array on a disk.
  */
template <typename BucketIndex, UInt8 content_width, size_t bucket_count>
class CompactArray<BucketIndex, content_width, bucket_count>::Reader final
{
public:
    explicit Reader(ReadBuffer & in_)
        : in(in_)
    {
    }

    Reader(const Reader &) = delete;
    Reader & operator=(const Reader &) = delete;

    bool next()
    {
        if (current_bucket_index == bucket_count)
        {
            is_eof = true;
            return false;
        }

        locus.init(current_bucket_index);

        if (current_bucket_index == 0)
        {
            in.readStrict(reinterpret_cast<char *>(&value_l), 1);
            ++read_count;
        }
        else
            value_l = value_r;

        if (locus.index_l != locus.index_r)
        {
            if (read_count == BITSET_SIZE)
                fits_in_byte = true;
            else
            {
                fits_in_byte = false;
                in.readStrict(reinterpret_cast<char *>(&value_r), 1);
                ++read_count;
            }
        }
        else
        {
            fits_in_byte = true;
            value_r = value_l;
        }

        ++current_bucket_index;

        return true;
    }

    /** Return the current cell number and the corresponding content.
      */
    inline std::pair<BucketIndex, UInt8> get() const
    {
        if ((current_bucket_index == 0) || is_eof)
            throw Exception(ErrorCodes::NO_AVAILABLE_DATA, "No available data.");

        if (fits_in_byte)
            return std::make_pair(current_bucket_index - 1, locus.read(value_l));
        else
            return std::make_pair(current_bucket_index - 1, locus.read(value_l, value_r));
    }

private:
    ReadBuffer & in;
    /// The physical location of the current cell.
    Locus locus{};
    /// The current position in the file as a cell number.
    BucketIndex current_bucket_index = 0;
    /// The number of bytes read.
    size_t read_count = 0;
    /// The content in the current position.
    UInt8 value_l = 0;
    UInt8 value_r = 0;
    ///
    bool is_eof = false;
    /// Does the cell fully fit into one byte?
    bool fits_in_byte = false;
};

/** TODO This code looks very suboptimal.
  *
  * The `Locus` structure contains the necessary information to find for each cell
  * the corresponding byte and offset, in bits, from the beginning of the cell. Since in general
  * case the size of one byte is not divisible by the size of one cell, cases possible
  * when one cell overlaps two bytes. Therefore, the `Locus` structure contains two
  * pairs (index, offset).
  */
template <typename BucketIndex, UInt8 content_width, size_t bucket_count>
class CompactArray<BucketIndex, content_width, bucket_count>::Locus final
{
    friend class CompactArray;
    friend class CompactArray::Reader;

public:
    ALWAYS_INLINE operator UInt8() const /// NOLINT
    {
        if (content_l == content_r)
            return read(*content_l);
        else
            return read(*content_l, *content_r);
    }

    Locus ALWAYS_INLINE & operator=(UInt8 content)
    {
        if ((index_l == index_r) || (index_l == (BITSET_SIZE - 1)))
        {
            /// The cell completely fits into one byte.
            *content_l &= ~(((1 << content_width) - 1) << offset_l);
            *content_l |= content << offset_l;
        }
        else
        {
            /// The cell overlaps two bytes.
            size_t left = 8 - offset_l;

            *content_l &= ~(((1 << left) - 1) << offset_l);
            *content_l |= (content & ((1 << left) - 1)) << offset_l;

            *content_r &= ~((1 << offset_r) - 1);
            *content_r |= content >> left;
        }

        return *this;
    }

private:
    Locus() = default;

    explicit Locus(BucketIndex bucket_index)
    {
        init(bucket_index);
    }

    void ALWAYS_INLINE init(BucketIndex bucket_index)
    {
        /// offset in bits to the leftmost bit
        size_t l = static_cast<size_t>(bucket_index) * content_width;

        /// offset of byte that contains the leftmost bit
        index_l = l / 8;

        /// offset in bits to the leftmost bit at that byte
        offset_l = l % 8;

        /// offset of byte that contains the rightmost bit
        index_r = (l + content_width - 1) / 8;

        /// offset in bits to the next to the rightmost bit at that byte; or zero if the rightmost bit is the rightmost bit in that byte.
        offset_r = (l + content_width) % 8;

        content_l = nullptr;
        content_r = nullptr;
    }

    UInt8 ALWAYS_INLINE read(UInt8 value_l) const
    {
        /// The cell completely fits into one byte.
        return (value_l >> offset_l) & ((1 << content_width) - 1);
    }

    UInt8 ALWAYS_INLINE read(UInt8 value_l, UInt8 value_r) const
    {
        /// The cell overlaps two bytes.
        return ((value_l >> offset_l) & ((1 << (8 - offset_l)) - 1))
            | ((value_r & ((1 << offset_r) - 1)) << (8 - offset_l));
    }

    size_t index_l;
    size_t offset_l;
    size_t index_r;
    size_t offset_r;

    UInt8 * content_l;
    UInt8 * content_r;

    /// Checks
    static_assert((content_width > 0) && (content_width < 8), "Invalid parameter value");
    static_assert(bucket_count <= (std::numeric_limits<size_t>::max() / content_width), "Invalid parameter value");
};

}