#pragma clang system_header // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #pragma once #include #include #include #include #include #include #include #include #include "contrib/libs/apache/arrow_next/cpp/src/parquet/arrow/schema.h" #include "contrib/libs/apache/arrow_next/cpp/src/parquet/column_reader.h" #include "contrib/libs/apache/arrow_next/cpp/src/parquet/file_reader.h" #include "contrib/libs/apache/arrow_next/cpp/src/parquet/metadata.h" #include "contrib/libs/apache/arrow_next/cpp/src/parquet/platform.h" #include "contrib/libs/apache/arrow_next/cpp/src/parquet/schema.h" namespace arrow20 { class Array; class ChunkedArray; class DataType; class Field; class KeyValueMetadata; class Schema; } // namespace arrow20 using arrow20::Status; namespace parquet20 { class ArrowReaderProperties; namespace arrow20 { class ColumnReaderImpl; // ---------------------------------------------------------------------- // Iteration utilities // Abstraction to decouple row group iteration details from the ColumnReader, // so we can read only a single row group if we want class FileColumnIterator { public: explicit FileColumnIterator(int column_index, ParquetFileReader* reader, std::vector row_groups) : column_index_(column_index), reader_(reader), schema_(reader->metadata()->schema()), row_groups_(row_groups.begin(), row_groups.end()), row_group_index_(-1) {} virtual ~FileColumnIterator() {} std::unique_ptr<::parquet20::PageReader> NextChunk() { if (row_groups_.empty()) { return nullptr; } row_group_index_ = row_groups_.front(); auto row_group_reader = reader_->RowGroup(row_group_index_); row_groups_.pop_front(); return row_group_reader->GetColumnPageReader(column_index_); } const SchemaDescriptor* schema() const { return schema_; } const ColumnDescriptor* descr() const { return schema_->Column(column_index_); } std::shared_ptr metadata() const { return reader_->metadata(); } std::unique_ptr row_group_metadata() const { return metadata()->RowGroup(row_group_index_); } std::unique_ptr column_chunk_metadata() const { return row_group_metadata()->ColumnChunk(column_index_); } int column_index() const { return column_index_; } int row_group_index() const { return row_group_index_; } protected: int column_index_; ParquetFileReader* reader_; const SchemaDescriptor* schema_; std::deque row_groups_; int row_group_index_; }; using FileColumnIteratorFactory = std::function; struct ReaderContext { ParquetFileReader* reader; ::arrow20::MemoryPool* pool; FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; ArrowReaderProperties* reader_properties; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { return this->included_leaves->find(leaf_index) != this->included_leaves->end(); } return true; } }; Status TransferColumnData(::parquet20::internal::RecordReader* reader, std::unique_ptr<::parquet20::ColumnChunkMetaData> metadata, const std::shared_ptr<::arrow20::Field>& value_field, const ColumnDescriptor* descr, const ReaderContext* ctx, std::shared_ptr<::arrow20::ChunkedArray>* out); } // namespace arrow20 } // namespace parquet20