contrib/clickhouse/src/Formats/FormatSettings.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416

#pragma once

#include <Core/Names.h>
#include <Core/Defines.h>
#include <base/types.h>
#include <base/unit.h>


namespace DB
{

/**
  * Various tweaks for input/output formats. Text serialization/deserialization
  * of data types also depend on some of these settings. It is different from
  * FormatFactorySettings in that it has all necessary user-provided settings
  * combined with information from context etc, that we can use directly during
  * serialization. In contrast, FormatFactorySettings' job is to reflect the
  * changes made to user-visible format settings, such as when tweaking the
  * the format for File engine.
  * NOTE Parameters for unrelated formats and unrelated data types are collected
  * in this struct - it prevents modularity, but they are difficult to separate.
  */
struct FormatSettings
{
    /// Format will be used for streaming. Not every formats support it
    /// Option means that each chunk of data need to be formatted independently. Also each chunk will be flushed at the end of processing.
    bool enable_streaming = false;

    bool skip_unknown_fields = false;
    bool with_names_use_header = false;
    bool with_types_use_header = false;
    bool write_statistics = true;
    bool import_nested_json = false;
    bool null_as_default = true;
    bool decimal_trailing_zeros = false;
    bool defaults_for_omitted_fields = true;

    bool seekable_read = true;
    UInt64 max_rows_to_read_for_schema_inference = 25000;
    UInt64 max_bytes_to_read_for_schema_inference = 32 * 1024 * 1024;

    String column_names_for_schema_inference;
    String schema_inference_hints;

    bool try_infer_integers = false;
    bool try_infer_dates = false;
    bool try_infer_datetimes = false;

    enum class DateTimeInputFormat
    {
        Basic,        /// Default format for fast parsing: YYYY-MM-DD hh:mm:ss (ISO-8601 without fractional part and timezone) or NNNNNNNNNN unix timestamp.
        BestEffort,   /// Use sophisticated rules to parse whatever possible.
        BestEffortUS  /// Use sophisticated rules to parse American style: mm/dd/yyyy
    };

    DateTimeInputFormat date_time_input_format = DateTimeInputFormat::Basic;

    enum class DateTimeOutputFormat
    {
        Simple,
        ISO,
        UnixTimestamp
    };

    enum class EscapingRule
    {
        None,
        Escaped,
        Quoted,
        CSV,
        JSON,
        XML,
        Raw
    };

    bool schema_inference_make_columns_nullable = true;

    DateTimeOutputFormat date_time_output_format = DateTimeOutputFormat::Simple;

    enum class IntervalOutputFormat
    {
        Kusto,
        Numeric
    };

    struct
    {
        IntervalOutputFormat output_format = IntervalOutputFormat::Numeric;
    } interval;

    bool input_format_ipv4_default_on_conversion_error = false;
    bool input_format_ipv6_default_on_conversion_error = false;

    UInt64 input_allow_errors_num = 0;
    Float32 input_allow_errors_ratio = 0;

    UInt64 max_binary_string_size = 1_GiB;
    UInt64 max_binary_array_size = 1_GiB;
    UInt64 client_protocol_version = 0;

    UInt64 max_parser_depth = DBMS_DEFAULT_MAX_PARSER_DEPTH;

    size_t max_threads = 1;

    enum class ArrowCompression
    {
        NONE,
        LZ4_FRAME,
        ZSTD
    };

    struct
    {
        UInt64 row_group_size = 1000000;
        bool low_cardinality_as_dictionary = false;
        bool allow_missing_columns = false;
        bool skip_columns_with_unsupported_types_in_schema_inference = false;
        bool case_insensitive_column_matching = false;
        bool output_string_as_string = false;
        bool output_fixed_string_as_fixed_byte_array = true;
        ArrowCompression output_compression_method = ArrowCompression::NONE;
    } arrow;

    struct
    {
        String schema_registry_url;
        String output_codec;
        UInt64 output_sync_interval = 16 * 1024;
        bool allow_missing_fields = false;
        String string_column_pattern;
        UInt64 output_rows_in_file = 1;
    } avro;

    String bool_true_representation = "true";
    String bool_false_representation = "false";

    struct CSV
    {
        char delimiter = ',';
        bool allow_single_quotes = true;
        bool allow_double_quotes = true;
        bool empty_as_default = false;
        bool crlf_end_of_line = false;
        bool enum_as_number = false;
        bool arrays_as_nested_csv = false;
        String null_representation = "\\N";
        char tuple_delimiter = ',';
        bool use_best_effort_in_schema_inference = true;
        UInt64 skip_first_lines = 0;
        String custom_delimiter;
        bool try_detect_header = true;
        bool skip_trailing_empty_lines = false;
        bool trim_whitespaces = true;
        bool allow_whitespace_or_tab_as_delimiter = false;
        bool allow_variable_number_of_columns = false;
        bool use_default_on_bad_values = false;
    } csv;

    struct HiveText
    {
        char fields_delimiter = '\x01';
        char collection_items_delimiter = '\x02';
        char map_keys_delimiter = '\x03';
        Names input_field_names;
    } hive_text;

    struct Custom
    {
        std::string result_before_delimiter;
        std::string result_after_delimiter;
        std::string row_before_delimiter;
        std::string row_after_delimiter;
        std::string row_between_delimiter;
        std::string field_delimiter;
        EscapingRule escaping_rule = EscapingRule::Escaped;
        bool try_detect_header = true;
        bool skip_trailing_empty_lines = false;
        bool allow_variable_number_of_columns = false;
    } custom;

    struct
    {
        bool array_of_rows = false;
        bool quote_64bit_integers = true;
        bool quote_64bit_floats = false;
        bool quote_denormals = true;
        bool quote_decimals = false;
        bool escape_forward_slashes = true;
        bool read_named_tuples_as_objects = false;
        bool write_named_tuples_as_objects = false;
        bool defaults_for_missing_elements_in_named_tuple = false;
        bool ignore_unknown_keys_in_named_tuple = false;
        bool serialize_as_strings = false;
        bool read_bools_as_numbers = true;
        bool read_numbers_as_strings = true;
        bool read_objects_as_strings = true;
        bool try_infer_numbers_from_strings = false;
        bool validate_types_from_metadata = true;
        bool validate_utf8 = false;
        bool allow_object_type = false;
        bool valid_output_on_exception = false;
        bool compact_allow_variable_number_of_columns = false;
    } json;

    struct
    {
        String column_for_object_name;
    } json_object_each_row;

    enum class ParquetVersion
    {
        V1_0,
        V2_4,
        V2_6,
        V2_LATEST,
    };

    enum class ParquetCompression
    {
        NONE,
        SNAPPY,
        ZSTD,
        LZ4,
        GZIP,
        BROTLI,
    };

    struct
    {
        UInt64 row_group_rows = 1000000;
        UInt64 row_group_bytes = 512 * 1024 * 1024;
        bool allow_missing_columns = false;
        bool skip_columns_with_unsupported_types_in_schema_inference = false;
        bool case_insensitive_column_matching = false;
        bool filter_push_down = true;
        std::unordered_set<int> skip_row_groups = {};
        bool output_string_as_string = false;
        bool output_fixed_string_as_fixed_byte_array = true;
        bool preserve_order = false;
        bool use_custom_encoder = true;
        bool parallel_encoding = true;
        UInt64 max_block_size = 8192;
        ParquetVersion output_version;
        ParquetCompression output_compression_method = ParquetCompression::SNAPPY;
        bool output_compliant_nested_types = true;
        size_t data_page_size = 1024 * 1024;
        size_t write_batch_size = 1024;
        size_t local_read_min_bytes_for_seek = 8192;
    } parquet;

    struct Pretty
    {
        UInt64 max_rows = 10000;
        UInt64 max_column_pad_width = 250;
        UInt64 max_value_width = 10000;
        bool color = true;

        bool output_format_pretty_row_numbers = false;

        enum class Charset
        {
            UTF8,
            ASCII,
        };

        Charset charset = Charset::UTF8;
    } pretty;

    struct
    {
        bool input_flatten_google_wrappers = false;
        bool output_nullables_with_google_wrappers = false;
        /**
         * Some buffers (kafka / rabbit) split the rows internally using callback,
         * and always send one row per message, so we can push there formats
         * without framing / delimiters (like ProtobufSingle). In other cases,
         * we have to enforce exporting at most one row in the format output,
         * because Protobuf without delimiters is not generally useful.
         */
        bool allow_multiple_rows_without_delimiter = false;
        bool skip_fields_with_unsupported_types_in_schema_inference = false;
        bool use_autogenerated_schema = true;
    } protobuf;

    struct
    {
        uint32_t client_capabilities = 0;
        size_t max_packet_size = 0;
        uint8_t * sequence_id = nullptr; /// Not null if it's MySQLWire output format used to handle MySQL protocol connections.
    } mysql_wire;

    struct
    {
        std::string regexp;
        EscapingRule escaping_rule = EscapingRule::Raw;
        bool skip_unmatched = false;
    } regexp;

    struct
    {
        std::string format_schema;
        std::string format_schema_path;
        bool is_server = false;
        std::string output_format_schema;
    } schema;

    struct
    {
        String resultset_format;
        String row_format;
        String row_between_delimiter;
    } template_settings;

    struct
    {
        bool empty_as_default = false;
        bool crlf_end_of_line = false;
        String null_representation = "\\N";
        bool enum_as_number = false;
        bool use_best_effort_in_schema_inference = true;
        UInt64 skip_first_lines = 0;
        bool try_detect_header = true;
        bool skip_trailing_empty_lines = false;
        bool allow_variable_number_of_columns = false;
    } tsv;

    struct
    {
        bool interpret_expressions = true;
        bool deduce_templates_of_expressions = true;
        bool accurate_types_of_literals = true;
    } values;

    enum class ORCCompression
    {
        NONE,
        LZ4,
        SNAPPY,
        ZSTD,
        ZLIB,
    };

    struct
    {
        bool allow_missing_columns = false;
        int64_t row_batch_size = 100'000;
        bool skip_columns_with_unsupported_types_in_schema_inference = false;
        bool case_insensitive_column_matching = false;
        std::unordered_set<int> skip_stripes = {};
        bool output_string_as_string = false;
        ORCCompression output_compression_method = ORCCompression::NONE;
        bool use_fast_decoder = true;
    } orc;

    /// For capnProto format we should determine how to
    /// compare ClickHouse Enum and Enum from schema.
    enum class CapnProtoEnumComparingMode
    {
        BY_NAMES, // Names in enums should be the same, values can be different.
        BY_NAMES_CASE_INSENSITIVE, // Case-insensitive name comparison.
        BY_VALUES, // Values should be the same, names can be different.
    };

    struct CapnProto
    {
        CapnProtoEnumComparingMode enum_comparing_mode = CapnProtoEnumComparingMode::BY_VALUES;
        bool skip_fields_with_unsupported_types_in_schema_inference = false;
        bool use_autogenerated_schema = true;
    } capn_proto;

    enum class MsgPackUUIDRepresentation
    {
        STR, // Output UUID as a string of 36 characters.
        BIN, // Output UUID as 16-bytes binary.
        EXT, // Output UUID as ExtType = 2
    };

    struct
    {
        UInt64 number_of_columns = 0;
        MsgPackUUIDRepresentation output_uuid_representation = MsgPackUUIDRepresentation::EXT;
    } msgpack;

    struct MySQLDump
    {
        String table_name;
        bool map_column_names = true;
    } mysql_dump;

    struct
    {
        UInt64 max_batch_size = DEFAULT_BLOCK_SIZE;
        String table_name = "table";
        bool include_column_names = true;
        bool use_replace = false;
        bool quote_names = true;
    } sql_insert;

    struct
    {
        bool output_string_as_string;
        bool skip_fields_with_unsupported_types_in_schema_inference;
    } bson;

    struct
    {
        bool allow_types_conversion = true;
    } native;

    struct
    {
        bool valid_output_on_exception = false;
    } xml;
};

}