path: root/contrib/clickhouse/src/Compression/LZ4_decompress_faster.h
#pragma once

#include <cstddef>
#include <cmath>
#include <algorithm>
#include <random>
#include <pcg_random.hpp>


namespace LZ4
{

/** There are many implementation details of the LZ4 decompression loop that affect performance.
  * For example: copy by 8 or by 16 (SSE2) bytes at once; use the shuffle (SSSE3) instruction to replicate the match or not.
  *
  * The optimal algorithm depends on:
  *
  * 1. CPU architecture.
  * (example: on Skylake it's almost always better to copy by 16 bytes and use shuffle,
  *  but on Westmere using shuffle is worse and copying by 16 bytes is better only for high compression ratios)
  *
  * 2. Data distribution.
  * (example: when the compression ratio is higher than 10.20,
  *  it's usually better to copy by 16 bytes rather than 8).
  *
  * It's very difficult to test all combinations on different CPUs and to choose the correct rule to select the best variant.
  * (Even if you do this, you have a high chance of over-optimizing for a specific CPU while degrading performance on another.)
  *
  * Instead, we choose the best algorithm at runtime by using performance statistics
  *  with something like the "Bayesian Bandits" method.
  */


/** Both buffers passed to the 'decompress' function must have
  *  at least this many extra bytes after the end of the data
  *  that are allowed to be read/written.
  * This value is a slight overestimate.
  */
static constexpr size_t ADDITIONAL_BYTES_AT_END_OF_BUFFER = 64;
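
/** Illustrative sketch of how a caller might size both buffers (not part of the library;
  *  the names 'compressed_size' and 'uncompressed_size' are hypothetical and assumed to be
  *  known from the block header):
  *
  *      std::vector<char> compressed(compressed_size + ADDITIONAL_BYTES_AT_END_OF_BUFFER);
  *      std::vector<char> decompressed(uncompressed_size + ADDITIONAL_BYTES_AT_END_OF_BUFFER);
  *
  * Only the first 'compressed_size' / 'uncompressed_size' bytes carry data; the trailing bytes
  *  exist solely so that the vectorized copy loops are allowed to read/write past the end.
  */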


/** When decompressing a uniform sequence of blocks (for example, blocks from one file),
  *  you can pass a single PerformanceStatistics object to subsequent invocations of the 'decompress' function.
  * It will accumulate statistics and use them as feedback to choose the best specialization of the algorithm at runtime.
  * One PerformanceStatistics object cannot be used concurrently from different threads.
  */
struct PerformanceStatistics
{
    struct Element
    {
        double count = 0;
        double sum = 0;

        double adjustedCount() const
        {
            return count - NUM_INVOCATIONS_TO_THROW_OFF;
        }

        double mean() const
        {
            return sum / adjustedCount();
        }

        /// For better convergence, we don't use a proper estimate of the stddev.
        /// We want to eventually separate two algorithms even when there is
        ///  no statistically significant difference between them.
        double sigma() const
        {
            return mean() / sqrt(adjustedCount());
        }

        void update(double seconds, double bytes)
        {
            ++count;

            if (count > NUM_INVOCATIONS_TO_THROW_OFF)
                sum += seconds / bytes;
        }

        double sample(pcg64 & stat_rng) const
        {
            /// If there is a variant with not enough statistics, always choose it.
            /// In that case, prefer the variant with the fewest invocations.

            if (adjustedCount() < 2)
                return adjustedCount() - 1;
            else
                return std::normal_distribution<>(mean(), sigma())(stat_rng);
        }
    };
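
    /// Illustrative arithmetic for Element::sample() (example numbers, not taken from the source):
    /// after 12 invocations with NUM_INVOCATIONS_TO_THROW_OFF = 2, adjustedCount() is 10;
    /// if the accumulated mean cost per byte is m, then sigma() = m / sqrt(10) ≈ 0.32 * m,
    /// so sample() draws from N(m, 0.32 * m). As more invocations accumulate, sigma shrinks,
    /// letting select() settle on the fastest variant while still occasionally exploring others.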

    /// Number of different algorithms to select from.
    static constexpr size_t NUM_ELEMENTS = 5;

    /// Cold invocations may be affected by additional memory latencies. Don't take the first invocations into account.
    static constexpr double NUM_INVOCATIONS_TO_THROW_OFF = 2;

    /// How to select method to run.
    /// -1 - automatically, based on statistics (default);
    /// >= 0 - always choose specified method (for performance testing);
    /// -2 - choose methods in round robin fashion (for performance testing).
    ssize_t choose_method = -1;

    Element data[NUM_ELEMENTS];

    /// It's OK that the generator is not seeded.
    pcg64 rng;

    /// To select between different algorithms, we use a kind of "bandits" algorithm:
    /// sample random values from the estimated normal distributions and choose the minimum.
    size_t select(size_t max_method = NUM_ELEMENTS)
    {
        if (choose_method < 0)
        {
            double samples[max_method];
            for (size_t i = 0; i < max_method; ++i)
                samples[i] = choose_method == -1
                    ? data[i].sample(rng)
                    : data[i].adjustedCount();

            return std::min_element(samples, samples + max_method) - samples;
        }
        else
            return choose_method;
    }

    PerformanceStatistics() = default;
    explicit PerformanceStatistics(ssize_t choose_method_) : choose_method(choose_method_) {}
};
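
/** A minimal sketch of the feedback loop implied by the interface above (illustrative only:
  *  the actual bookkeeping lives in the implementation, and 'Stopwatch' and 'runVariant'
  *  are hypothetical names):
  *
  *      size_t method = statistics.select();
  *      Stopwatch watch;
  *      runVariant(method, source, dest, source_size, dest_size);
  *      statistics.data[method].update(watch.elapsedSeconds(), dest_size);
  *
  * Variants with too little data receive artificially low (negative) samples from Element::sample(),
  *  so every variant is tried a couple of times before the normal-distribution sampling takes over.
  */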


/** This function dispatches to one of several implementations depending on the accumulated performance statistics.
  */
bool decompress(
    const char * const source, /// NOLINT
    char * const dest, /// NOLINT
    size_t source_size,
    size_t dest_size,
    PerformanceStatistics & statistics);
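
/** Example of the intended calling pattern (an illustrative sketch; 'blocks', 'block.compressed',
  *  'block.compressed_size' and 'block.uncompressed_size' are hypothetical, both buffers are
  *  assumed to be padded as described above, and the meaning of the return value is defined
  *  by the implementation):
  *
  *      PerformanceStatistics statistics;   /// one object reused for all blocks of one file
  *      for (const auto & block : blocks)
  *      {
  *          std::vector<char> dest(block.uncompressed_size + ADDITIONAL_BYTES_AT_END_OF_BUFFER);
  *          bool ok = decompress(block.compressed, dest.data(),
  *                               block.compressed_size, block.uncompressed_size, statistics);
  *      }
  */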


/** Obtain statistics about an LZ4 block that are useful for development.
  */
struct StreamStatistics
{
    size_t num_tokens = 0;
    size_t sum_literal_lengths = 0;
    size_t sum_match_lengths = 0;
    size_t sum_match_offsets = 0;
    size_t count_match_offset_less_8 = 0;
    size_t count_match_offset_less_16 = 0;
    size_t count_match_replicate_itself = 0;

    void literal(size_t length);
    void match(size_t length, size_t offset);

    void print() const;
};

void statistics(
    const char * const source, /// NOLINT
    char * const dest, /// NOLINT
    size_t dest_size,
    StreamStatistics & stat);
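
/** Example for development (a sketch; the padded 'source' and 'dest' buffers are assumed to be
  *  prepared in the same way as for 'decompress'):
  *
  *      StreamStatistics stream_stat;
  *      statistics(source, dest, dest_size, stream_stat);
  *      stream_stat.print();
  */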

}