aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Formats/MarkInCompressedFile.cpp
blob: 41f6152dc13cb9a784fe1310a43323a2db1ecdaf (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#include <Formats/MarkInCompressedFile.h>

#include <Common/BitHelpers.h>

namespace DB
{

// Write a range of bits in a bit-packed array.
// The array must be overallocated by one element.
// The bit range must be pre-filled with zeros.
void writeBits(UInt64 * dest, size_t bit_offset, UInt64 value)
{
    size_t mod = bit_offset % 64;
    dest[bit_offset / 64] |= value << mod;
    if (mod)
        dest[bit_offset / 64 + 1] |= value >> (64 - mod);
}

// The array must be overallocated by one element.
UInt64 readBits(const UInt64 * src, size_t bit_offset, size_t num_bits)
{
    size_t mod = bit_offset % 64;
    UInt64 value = src[bit_offset / 64] >> mod;
    if (mod)
        value |= src[bit_offset / 64 + 1] << (64 - mod);
    return value & maskLowBits<UInt64>(num_bits);
}

MarksInCompressedFile::MarksInCompressedFile(const PlainArray & marks)
    : num_marks(marks.size()), blocks((marks.size() + MARKS_PER_BLOCK - 1) / MARKS_PER_BLOCK, BlockInfo{})
{
    if (num_marks == 0)
    {
        return;
    }

    // First pass: calculate layout of all blocks and total memory required.
    size_t packed_bits = 0;
    for (size_t block_idx = 0; block_idx < blocks.size(); ++block_idx)
    {
        BlockInfo & block = blocks[block_idx];
        block.bit_offset_in_packed_array = packed_bits;

        size_t max_x = 0;
        size_t max_y = 0;
        size_t num_marks_in_this_block = std::min(MARKS_PER_BLOCK, num_marks - block_idx * MARKS_PER_BLOCK);
        for (size_t i = 0; i < num_marks_in_this_block; ++i)
        {
            const auto & mark = marks[block_idx * MARKS_PER_BLOCK + i];
            block.min_x = std::min(block.min_x, mark.offset_in_compressed_file);
            max_x = std::max(max_x, mark.offset_in_compressed_file);
            block.min_y = std::min(block.min_y, mark.offset_in_decompressed_block);
            max_y = std::max(max_y, mark.offset_in_decompressed_block);

            block.trailing_zero_bits_in_y
                = std::min(block.trailing_zero_bits_in_y, static_cast<UInt8>(getTrailingZeroBits(mark.offset_in_decompressed_block)));
        }

        block.bits_for_x = sizeof(size_t) * 8 - getLeadingZeroBits(max_x - block.min_x);
        block.bits_for_y = sizeof(size_t) * 8 - getLeadingZeroBits((max_y - block.min_y) >> block.trailing_zero_bits_in_y);
        packed_bits += num_marks_in_this_block * (block.bits_for_x + block.bits_for_y);
    }

    // Overallocate by +1 element to let the bit packing/unpacking do less bounds checking.
    size_t packed_length = (packed_bits + 63) / 64 + 1;
    packed.reserve_exact(packed_length);
    packed.resize_fill(packed_length);

    // Second pass: write out the packed marks.
    for (size_t idx = 0; idx < num_marks; ++idx)
    {
        const auto & mark = marks[idx];
        auto [block, offset] = lookUpMark(idx);
        writeBits(packed.data(), offset, mark.offset_in_compressed_file - block->min_x);
        writeBits(
            packed.data(),
            offset + block->bits_for_x,
            (mark.offset_in_decompressed_block - block->min_y) >> block->trailing_zero_bits_in_y);
    }
}

MarkInCompressedFile MarksInCompressedFile::get(size_t idx) const
{
    auto [block, offset] = lookUpMark(idx);
    size_t x = block->min_x + readBits(packed.data(), offset, block->bits_for_x);
    size_t y = block->min_y + (readBits(packed.data(), offset + block->bits_for_x, block->bits_for_y) << block->trailing_zero_bits_in_y);
    return MarkInCompressedFile{.offset_in_compressed_file = x, .offset_in_decompressed_block = y};
}

std::tuple<const MarksInCompressedFile::BlockInfo *, size_t> MarksInCompressedFile::lookUpMark(size_t idx) const
{
    size_t block_idx = idx / MARKS_PER_BLOCK;
    const BlockInfo & block = blocks[block_idx];
    size_t offset = block.bit_offset_in_packed_array + (idx - block_idx * MARKS_PER_BLOCK) * (block.bits_for_x + block.bits_for_y);
    return {&block, offset};
}

size_t MarksInCompressedFile::approximateMemoryUsage() const
{
    return sizeof(*this) + blocks.size() * sizeof(blocks[0]) + packed.size() * sizeof(packed[0]);
}

}