diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/apache/arrow/cpp/src/parquet/printer.cc | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/apache/arrow/cpp/src/parquet/printer.cc')
-rw-r--r-- | contrib/libs/apache/arrow/cpp/src/parquet/printer.cc | 297 |
1 files changed, 297 insertions, 0 deletions
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/printer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/printer.cc new file mode 100644 index 0000000000..dfd4bd802e --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/parquet/printer.cc @@ -0,0 +1,297 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/printer.h" + +#include <cstdint> +#include <cstdio> +#include <memory> +#include <ostream> +#include <string> +#include <vector> + +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/string.h" + +#include "parquet/column_scanner.h" +#include "parquet/exception.h" +#include "parquet/file_reader.h" +#include "parquet/metadata.h" +#include "parquet/schema.h" +#include "parquet/statistics.h" +#include "parquet/types.h" + +namespace parquet { + +class ColumnReader; + +// ---------------------------------------------------------------------- +// ParquetFilePrinter::DebugPrint + +// the fixed initial size is just for an example +#define COL_WIDTH 30 + +void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns, + bool print_values, bool format_dump, + bool print_key_value_metadata, const char* filename) { + const FileMetaData* file_metadata = fileReader->metadata().get(); + + stream << "File Name: " << filename << "\n"; + stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n"; + stream << "Created By: " << file_metadata->created_by() << "\n"; + stream << "Total rows: " << file_metadata->num_rows() << "\n"; + + if (print_key_value_metadata && file_metadata->key_value_metadata()) { + auto key_value_metadata = file_metadata->key_value_metadata(); + int64_t size_of_key_value_metadata = key_value_metadata->size(); + stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n"; + for (int64_t i = 0; i < size_of_key_value_metadata; i++) { + stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": " + << key_value_metadata->value(i) << "\n"; + } + } + + stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n"; + stream << "Number of Real Columns: " + << file_metadata->schema()->group_node()->field_count() << "\n"; + + if (selected_columns.size() == 0) { + for (int i = 0; i < file_metadata->num_columns(); i++) { + selected_columns.push_back(i); + } + } else { + for (auto i : selected_columns) { + if (i < 0 || i >= file_metadata->num_columns()) { + throw ParquetException("Selected column is out of range"); + } + } + } + + stream << "Number of Columns: " << file_metadata->num_columns() << "\n"; + stream << "Number of Selected Columns: " << selected_columns.size() << "\n"; + for (auto i : selected_columns) { + const ColumnDescriptor* descr = file_metadata->schema()->Column(i); + stream << "Column " << i << ": " << descr->path()->ToDotString() << " (" + << TypeToString(descr->physical_type()); + const auto& logical_type = descr->logical_type(); + if (!logical_type->is_none()) { + stream << " / " << logical_type->ToString(); + } + if (descr->converted_type() != ConvertedType::NONE) { + stream << " / " << ConvertedTypeToString(descr->converted_type()); + if (descr->converted_type() == ConvertedType::DECIMAL) { + stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")"; + } + } + stream << ")" << std::endl; + } + + for (int r = 0; r < file_metadata->num_row_groups(); ++r) { + stream << "--- Row Group: " << r << " ---\n"; + + auto group_reader = fileReader->RowGroup(r); + std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); + + stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n"; + stream << "--- Total Compressed Bytes: " << group_metadata->total_compressed_size() + << " ---\n"; + stream << "--- Rows: " << group_metadata->num_rows() << " ---\n"; + + // Print column metadata + for (auto i : selected_columns) { + auto column_chunk = group_metadata->ColumnChunk(i); + std::shared_ptr<Statistics> stats = column_chunk->statistics(); + + const ColumnDescriptor* descr = file_metadata->schema()->Column(i); + stream << "Column " << i << std::endl << " Values: " << column_chunk->num_values(); + if (column_chunk->is_stats_set()) { + std::string min = stats->EncodeMin(), max = stats->EncodeMax(); + stream << ", Null Values: " << stats->null_count() + << ", Distinct Values: " << stats->distinct_count() << std::endl + << " Max: " << FormatStatValue(descr->physical_type(), max) + << ", Min: " << FormatStatValue(descr->physical_type(), min); + } else { + stream << " Statistics Not Set"; + } + stream << std::endl + << " Compression: " + << ::arrow::internal::AsciiToUpper( + Codec::GetCodecAsString(column_chunk->compression())) + << ", Encodings:"; + for (auto encoding : column_chunk->encodings()) { + stream << " " << EncodingToString(encoding); + } + stream << std::endl + << " Uncompressed Size: " << column_chunk->total_uncompressed_size() + << ", Compressed Size: " << column_chunk->total_compressed_size() + << std::endl; + } + + if (!print_values) { + continue; + } + stream << "--- Values ---\n"; + + static constexpr int bufsize = COL_WIDTH + 1; + char buffer[bufsize]; + + // Create readers for selected columns and print contents + std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr); + int j = 0; + for (auto i : selected_columns) { + std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i); + // This is OK in this method as long as the RowGroupReader does not get + // deleted + auto& scanner = scanners[j++] = Scanner::Make(col_reader); + + if (format_dump) { + stream << "Column " << i << std::endl; + while (scanner->HasNext()) { + scanner->PrintNext(stream, 0, true); + stream << "\n"; + } + continue; + } + + snprintf(buffer, bufsize, "%-*s", COL_WIDTH, + file_metadata->schema()->Column(i)->name().c_str()); + stream << buffer << '|'; + } + if (format_dump) { + continue; + } + stream << "\n"; + + bool hasRow; + do { + hasRow = false; + for (auto scanner : scanners) { + if (scanner->HasNext()) { + hasRow = true; + scanner->PrintNext(stream, COL_WIDTH); + stream << '|'; + } + } + stream << "\n"; + } while (hasRow); + } +} + +void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns, + const char* filename) { + const FileMetaData* file_metadata = fileReader->metadata().get(); + stream << "{\n"; + stream << " \"FileName\": \"" << filename << "\",\n"; + stream << " \"Version\": \"" << ParquetVersionToString(file_metadata->version()) + << "\",\n"; + stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n"; + stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n"; + stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n"; + stream << " \"NumberOfRealColumns\": \"" + << file_metadata->schema()->group_node()->field_count() << "\",\n"; + stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n"; + + if (selected_columns.size() == 0) { + for (int i = 0; i < file_metadata->num_columns(); i++) { + selected_columns.push_back(i); + } + } else { + for (auto i : selected_columns) { + if (i < 0 || i >= file_metadata->num_columns()) { + throw ParquetException("Selected column is out of range"); + } + } + } + + stream << " \"Columns\": [\n"; + int c = 0; + for (auto i : selected_columns) { + const ColumnDescriptor* descr = file_metadata->schema()->Column(i); + stream << " { \"Id\": \"" << i << "\"," + << " \"Name\": \"" << descr->path()->ToDotString() << "\"," + << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\"," + << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type()) + << "\"," + << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }"; + c++; + if (c != static_cast<int>(selected_columns.size())) { + stream << ",\n"; + } + } + + stream << "\n ],\n \"RowGroups\": [\n"; + for (int r = 0; r < file_metadata->num_row_groups(); ++r) { + stream << " {\n \"Id\": \"" << r << "\", "; + + auto group_reader = fileReader->RowGroup(r); + std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); + + stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", "; + stream << " \"TotalCompressedBytes\": \"" << group_metadata->total_compressed_size() + << "\", "; + stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n"; + + // Print column metadata + stream << " \"ColumnChunks\": [\n"; + int c1 = 0; + for (auto i : selected_columns) { + auto column_chunk = group_metadata->ColumnChunk(i); + std::shared_ptr<Statistics> stats = column_chunk->statistics(); + + const ColumnDescriptor* descr = file_metadata->schema()->Column(i); + stream << " {\"Id\": \"" << i << "\", \"Values\": \"" + << column_chunk->num_values() << "\", " + << "\"StatsSet\": "; + if (column_chunk->is_stats_set()) { + stream << "\"True\", \"Stats\": {"; + std::string min = stats->EncodeMin(), max = stats->EncodeMax(); + stream << "\"NumNulls\": \"" << stats->null_count() << "\", " + << "\"DistinctValues\": \"" << stats->distinct_count() << "\", " + << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", " + << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min) + << "\" },"; + } else { + stream << "\"False\","; + } + stream << "\n \"Compression\": \"" + << ::arrow::internal::AsciiToUpper( + Codec::GetCodecAsString(column_chunk->compression())) + << "\", \"Encodings\": \""; + for (auto encoding : column_chunk->encodings()) { + stream << EncodingToString(encoding) << " "; + } + stream << "\", " + << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size() + << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size(); + + // end of a ColumnChunk + stream << "\" }"; + c1++; + if (c1 != static_cast<int>(selected_columns.size())) { + stream << ",\n"; + } + } + + stream << "\n ]\n }"; + if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) { + stream << ",\n"; + } + } + stream << "\n ]\n}\n"; +} + +} // namespace parquet |