aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Compression/CompressionFactoryAdditions.cpp
blob: 98e9e7480da1425f9aef1a75c9b59ff0e665d056 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
/**
 * This file contains some of the CompressionCodecFactory method definitions and
 * exists only because those methods depend on DataTypes.
 * They are not useful for fuzzers, so we keep them in a separate translation unit.
 */

#include <Compression/CompressionFactory.h>

#include <Parsers/ASTFunction.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/parseQuery.h>
#include <Parsers/queryToString.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeNested.h>
#include <DataTypes/DataTypeNullable.h>
#include <Common/Exception.h>


namespace DB
{

namespace ErrorCodes
{
    extern const int UNEXPECTED_AST_STRUCTURE;
    extern const int UNKNOWN_CODEC;
    extern const int BAD_ARGUMENTS;
    extern const int LOGICAL_ERROR;
}


void CompressionCodecFactory::validateCodec(
    const String & family_name, std::optional<int> level, bool sanity_check, bool allow_experimental_codecs, bool enable_deflate_qpl_codec) const
{
    if (family_name.empty())
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Compression codec name cannot be empty");

    if (level)
    {
        auto literal = std::make_shared<ASTLiteral>(static_cast<UInt64>(*level));
        validateCodecAndGetPreprocessedAST(makeASTFunction("CODEC", makeASTFunction(Poco::toUpper(family_name), literal)),
            {}, sanity_check, allow_experimental_codecs, enable_deflate_qpl_codec);
    }
    else
    {
        auto identifier = std::make_shared<ASTIdentifier>(Poco::toUpper(family_name));
        validateCodecAndGetPreprocessedAST(makeASTFunction("CODEC", identifier),
            {}, sanity_check, allow_experimental_codecs, enable_deflate_qpl_codec);
    }
}

namespace
{

bool innerDataTypeIsFloat(const DataTypePtr & type)
{
    if (isFloat(type))
        return true;
    if (const DataTypeNullable * type_nullable = typeid_cast<const DataTypeNullable *>(type.get()))
        return innerDataTypeIsFloat(type_nullable->getNestedType());
    if (const DataTypeArray * type_array = typeid_cast<const DataTypeArray *>(type.get()))
        return innerDataTypeIsFloat(type_array->getNestedType());
    if (const DataTypeTuple * type_tuple = typeid_cast<const DataTypeTuple *>(type.get()))
    {
        for (const auto & subtype : type_tuple->getElements())
            if (innerDataTypeIsFloat(subtype))
                return true;
        return false;
    }
    return false;
}

}

/// Validates a `CODEC(...)` AST and returns the AST that should be kept for the column.
///
/// `ast` must be a function; each of its arguments must be either an identifier
/// (codec without arguments) or a function (codec with arguments). Every element is
/// resolved to a codec: with `column_type` set, once per substream for which special
/// compression is allowed; without it, with no data type at all.
///
/// When `sanity_check` is set, combinations of codecs that make no sense are rejected.
/// `allow_experimental_codecs` and `enable_deflate_qpl_codec` gate the respective codecs.
///
/// Returns a new CODEC function built from the descriptions reported by the resolved
/// codecs, or the original `ast` when a codec resolved differently for different
/// substreams of `column_type`.
///
/// Throws:
///  - UNKNOWN_CODEC if `ast` is not a function;
///  - UNEXPECTED_AST_STRUCTURE if an element is neither an identifier nor a function;
///  - BAD_ARGUMENTS for a default codec with arguments, a disallowed codec or a failed sanity check;
///  - LOGICAL_ERROR if no substream of `column_type` produced a codec.
ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST(
    const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_deflate_qpl_codec) const
{
    const auto * codec_list = ast->as<ASTFunction>();
    if (!codec_list)
        throw Exception(ErrorCodes::UNKNOWN_CODEC, "Unknown codec family: {}", queryToString(ast));

    ASTPtr descriptions = std::make_shared<ASTExpressionList>();

    /// Facts about the chain collected while walking its elements.
    bool has_compression = false;
    bool has_none = false;
    bool substitution_allowed = true;
    std::optional<size_t> first_generic_pos;
    std::optional<size_t> first_delta_pos;
    std::optional<size_t> last_fp_time_series_pos;
    std::set<size_t> encryption_positions;

    const auto & elements = codec_list->arguments->children;
    const size_t elements_count = elements.size();

    for (size_t pos = 0; pos < elements_count; ++pos)
    {
        const ASTPtr & element = elements[pos];

        /// Split the element into a family name and (possibly absent) arguments.
        String family;
        ASTPtr arguments;
        if (const auto * as_identifier = element->as<ASTIdentifier>())
        {
            family = as_identifier->name();
        }
        else if (const auto * as_function = element->as<ASTFunction>())
        {
            family = as_function->name;
            arguments = as_function->arguments;
        }
        else
        {
            throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected AST element for compression codec");
        }

        CompressionCodecPtr codec;
        if (family == DEFAULT_CODEC_NAME)
        {
            /// The default codec is kept in the description under its alias and
            /// is represented here by the factory's current default codec.
            if (arguments)
                throw Exception(ErrorCodes::BAD_ARGUMENTS,
                    "{} codec cannot have any arguments, it's just an alias for codec specified in config.xml", DEFAULT_CODEC_NAME);

            codec = default_codec;
            descriptions->children.emplace_back(std::make_shared<ASTIdentifier>(DEFAULT_CODEC_NAME));
        }
        else
        {
            if (!column_type)
            {
                codec = getImpl(family, arguments, nullptr);
            }
            else
            {
                CompressionCodecPtr previous;
                ISerialization::StreamCallback on_substream = [&](const auto & path)
                {
                    assert(!path.empty());
                    if (!ISerialization::isSpecialCompressionAllowed(path))
                        return;

                    const auto & substream_type = path.back().data.type;
                    codec = getImpl(family, arguments, substream_type.get());

                    /// Different substreams (e.g. elements of a Tuple) may get differently
                    /// parametrized codecs, like Delta. Then a single substituted
                    /// description cannot represent all of them.
                    if (previous && previous->getHash() != codec->getHash())
                        substitution_allowed = false;
                    previous = codec;
                };

                column_type->getDefaultSerialization()->enumerateStreams(on_substream, column_type);

                if (!codec)
                    throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find any substream with data type for type {}. It's a bug", column_type->getName());
            }

            if (!allow_experimental_codecs && codec->isExperimental())
                throw Exception(ErrorCodes::BAD_ARGUMENTS,
                    "Codec {} is experimental and not meant to be used in production."
                    " You can enable it with the 'allow_experimental_codecs' setting.",
                    family);

            if (!enable_deflate_qpl_codec && codec->isDeflateQpl())
                throw Exception(ErrorCodes::BAD_ARGUMENTS,
                    "Codec {} is disabled by default."
                    " You can enable it with the 'enable_deflate_qpl_codec' setting.",
                    family);

            descriptions->children.emplace_back(codec->getCodecDesc());
        }

        if (codec->isCompression())
            has_compression = true;
        if (codec->isNone())
            has_none = true;

        if (codec->isGenericCompression() && !first_generic_pos)
            first_generic_pos = pos;

        if (codec->isDeltaCompression() && !first_delta_pos)
            first_delta_pos = pos;

        if (codec->isFloatingPointTimeSeriesCodec())
            last_fp_time_series_pos = pos;

        if (codec->isEncryption())
            encryption_positions.insert(pos);
    }

    String codec_description = queryToString(descriptions);

    if (sanity_check)
    {
        const size_t total = descriptions->children.size();

        /// NONE together with anything else is contradictory.
        if (total > 1 && has_none)
            throw Exception(ErrorCodes::BAD_ARGUMENTS,
                "It does not make sense to have codec NONE along with other compression codecs: {}. "
                "(Note: you can enable setting 'allow_suspicious_codecs' to skip this check).",
                codec_description);

        /// A chain must compress something, unless it is an explicit NONE
        /// or consists of encryption codecs only.
        if (!(has_compression || has_none) && encryption_positions.size() != total)
            throw Exception(ErrorCodes::BAD_ARGUMENTS,
                "Compression codec {} does not compress anything. "
                "You may want to add generic compression algorithm after other transformations, like: {}, LZ4. "
                "(Note: you can enable setting 'allow_suspicious_codecs' to skip this check).",
                codec_description, codec_description);

        /// Encryption codecs must form the tail of the chain: the smallest
        /// encryption position has to be where that tail would start.
        if (!encryption_positions.empty()
            && *encryption_positions.begin() != total - encryption_positions.size())
            throw Exception(ErrorCodes::BAD_ARGUMENTS,
                "The combination of compression codecs {} is meaningless, "
                "because it does not make sense to apply any non-post-processing codecs after "
                "post-processing ones. (Note: you can enable setting 'allow_suspicious_codecs' "
                "to skip this check).", codec_description);

        /// Floating-point time series codecs only make sense for columns that contain floats.
        if (last_fp_time_series_pos.has_value()
            && column_type && !innerDataTypeIsFloat(column_type))
            throw Exception(ErrorCodes::BAD_ARGUMENTS,
                "The combination of compression codecs {} is meaningless,"
                " because it does not make sense to apply a floating-point time series codec to non-floating-point columns"
                " (Note: you can enable setting 'allow_suspicious_codecs' to skip this check).", codec_description);

        /// A delta codec placed before a floating-point time series codec is redundant,
        /// since such codecs already do delta compression (or an equivalent) themselves.
        if (first_delta_pos.has_value() && last_fp_time_series_pos.has_value()
            && (*first_delta_pos < *last_fp_time_series_pos))
            throw Exception(ErrorCodes::BAD_ARGUMENTS,
                "The combination of compression codecs {} is meaningless,"
                " because floating point time series codecs do delta compression implicitly by themselves."
                " (Note: you can enable setting 'allow_suspicious_codecs' to skip this check).", codec_description);

        /// Generic compression must be the last codec before the trailing encryption codecs,
        /// which also means there can be only one of them.
        if (first_generic_pos
            && *first_generic_pos != total - 1 - encryption_positions.size())
            throw Exception(ErrorCodes::BAD_ARGUMENTS,
                "The combination of compression codecs {} is meaningless, "
                "because it does not make sense to apply any transformations after generic "
                "compression algorithm. (Note: you can enable setting 'allow_suspicious_codecs' "
                "to skip this check).", codec_description);
    }

    /// For types like Tuple(UInt32, UInt64) parameters of type-dependent codecs
    /// cannot be substituted: Delta(4) fits the first element and Delta(8) the second.
    /// In that case the description is returned as is. Otherwise the substituted form
    /// is preferred for readability and backward compatibility.
    if (!substitution_allowed)
        return ast;

    auto rebuilt = std::make_shared<ASTFunction>();
    rebuilt->name = "CODEC";
    rebuilt->arguments = descriptions;
    return rebuilt;
}


}