path: root/contrib/clickhouse/src/Processors/Formats/Impl/ParquetMetadataInputFormat.h
#pragma once
#include "clickhouse_config.h"
#if USE_PARQUET

#include <Processors/Formats/IInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <parquet/metadata.h>

namespace parquet::arrow { class FileReader; }

namespace arrow { class Buffer; class RecordBatchReader; }

namespace DB
{

/* Special format that always returns just one row with Parquet file metadata (see https://parquet.apache.org/docs/file-format/metadata/).
 * The result row has the following structure:
 * num_columns - the number of columns
 * num_rows - the total number of rows
 * num_row_groups - the total number of row groups
 * format_version - parquet format version, always 1.0 or 2.6
 * total_uncompressed_size - total uncompressed size of the data in bytes, calculated as the sum of total_uncompressed_size from all row groups
 * total_compressed_size - total compressed size of the data in bytes, calculated as the sum of total_compressed_size from all row groups
 * columns - the list of column metadata with the following structure:
 *     name - column name
 *     path - column path (differs from name for nested columns)
 *     max_definition_level - maximum definition level
 *     max_repetition_level - maximum repetition level
 *     physical_type - column physical type
 *     logical_type - column logical type
 *     compression - compression used for this column
 *     total_compressed_size - total compressed size of the column in bytes, calculated as the sum of the column's total_compressed_size from all row groups
 *     total_uncompressed_size - total uncompressed size of the column in bytes, calculated as the sum of the column's total_uncompressed_size from all row groups
 *     space_saved - percent of space saved by compression, calculated as (1 - total_compressed_size/total_uncompressed_size).
 *     encodings - the list of encodings used for this column
 * row_groups - the list of row group metadata with the following structure:
 *     num_columns - the number of columns in the row group
 *     num_rows - the number of rows in the row group
 *     total_uncompressed_size - total uncompressed size of the row group in bytes
 *     total_compressed_size - total compressed size of the row group in bytes
 *     columns - the list of column chunk metadata with the following structure:
 *         name - column name
 *         path - column path
 *         total_compressed_size - total compressed size of the column in the row group, in bytes
 *         total_uncompressed_size - total uncompressed size of the column in the row group, in bytes
 *         have_statistics - bool flag that indicates whether the column chunk metadata contains column statistics
 *         statistics - column chunk statistics (all fields are NULL if have_statistics = false) with the following structure:
 *             num_values - the number of non-null values in the column chunk
 *             null_count - the number of NULL values in the column chunk
 *             distinct_count - the number of distinct values in the column chunk
 *             min - the minimum value of the column chunk
 *             max - the maximum value of the column chunk
 * */
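
/* Usage sketch (illustrative only, not part of this header): the format is selected by name like
 * any other ClickHouse input format, e.g. via the file() table function; 'data.parquet' below is
 * a hypothetical file name:
 *
 *     SELECT num_columns, num_rows, num_row_groups, format_version
 *     FROM file('data.parquet', ParquetMetadata);
 *
 * space_saved example: a column with total_uncompressed_size = 1000 and total_compressed_size = 250
 * would report space_saved = (1 - 250/1000) = 75%.
 */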

class ParquetMetadataInputFormat : public IInputFormat
{
public:
    ParquetMetadataInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_);

    String getName() const override { return "ParquetMetadataInputFormat"; }

    void resetParser() override;

private:
    Chunk generate() override;

    void onCancel() override
    {
        is_stopped = 1;
    }

    void fillColumnsMetadata(const std::shared_ptr<parquet::FileMetaData> & metadata, MutableColumnPtr & column);
    void fillRowGroupsMetadata(const std::shared_ptr<parquet::FileMetaData> & metadata, MutableColumnPtr & column);
    void fillColumnChunksMetadata(const std::unique_ptr<parquet::RowGroupMetaData> & row_group_metadata, IColumn & column);
    void fillColumnStatistics(const std::shared_ptr<parquet::Statistics> & statistics, IColumn & column, int32_t type_length);

    const FormatSettings format_settings;
    bool done = false;
    std::atomic<int> is_stopped{0};
};
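
/* A minimal sketch (illustrative only, not part of this header) of the parquet-cpp metadata
 * traversal that the fill* helpers above rely on. The accessors are the standard ones declared
 * in <parquet/metadata.h>; the helper name sumColumnChunkSizes is hypothetical:
 *
 *     void sumColumnChunkSizes(const std::shared_ptr<parquet::FileMetaData> & metadata,
 *                              int column_index, int64_t & compressed, int64_t & uncompressed)
 *     {
 *         for (int i = 0; i < metadata->num_row_groups(); ++i)
 *         {
 *             std::unique_ptr<parquet::RowGroupMetaData> row_group = metadata->RowGroup(i);
 *             std::unique_ptr<parquet::ColumnChunkMetaData> chunk = row_group->ColumnChunk(column_index);
 *             compressed += chunk->total_compressed_size();
 *             uncompressed += chunk->total_uncompressed_size();
 *         }
 *     }
 */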

class ParquetMetadataSchemaReader : public ISchemaReader
{
public:
    ParquetMetadataSchemaReader(ReadBuffer & in_);

    NamesAndTypesList readSchema() override;
};

}

#endif