summaryrefslogtreecommitdiffstats
path: root/contrib/libs/apache/arrow_next/cpp/src/arrow/adapters/orc/adapter.h
blob: 7b5e9fa681aefff12a706acf1d6cfc4c9cbedf82 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#pragma clang system_header
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>
#include <vector>

#include "contrib/libs/apache/arrow_next/cpp/src/arrow/adapters/orc/options.h"
#include "contrib/libs/apache/arrow_next/cpp/src/arrow/io/interfaces.h"
#include "contrib/libs/apache/arrow_next/cpp/src/arrow/memory_pool.h"
#include "contrib/libs/apache/arrow_next/cpp/src/arrow/record_batch.h"
#include "contrib/libs/apache/arrow_next/cpp/src/arrow/status.h"
#include "contrib/libs/apache/arrow_next/cpp/src/arrow/type.h"
#include "contrib/libs/apache/arrow_next/cpp/src/arrow/type_fwd.h"
#include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/macros.h"
#include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/visibility.h"

namespace arrow20 {
namespace adapters {
namespace orc {

/// \brief Information about an ORC stripe
///
/// Every field defaults to zero, so a default-constructed instance never
/// exposes indeterminate values. The struct is still an aggregate (C++14 and
/// later), so positional brace initialization in declaration order keeps
/// working unchanged.
struct StripeInformation {
  /// \brief Offset of the stripe from the start of the file, in bytes
  int64_t offset = 0;
  /// \brief Length of the stripe, in bytes
  int64_t length = 0;
  /// \brief Number of rows in the stripe
  int64_t num_rows = 0;
  /// \brief Index of the first row of the stripe
  int64_t first_row_id = 0;
};

/// \class ORCFileReader
/// \brief Read an Arrow Table or RecordBatch from an ORC file.
///
/// The default constructor is private; instances are obtained through the
/// static Open() factory. The implementation is kept behind the
/// forward-declared nested class Impl.
class ARROW_EXPORT ORCFileReader {
 public:
  // Declared here without an inline body: Impl is an incomplete type in this
  // header.
  ~ORCFileReader();

  /// \brief Creates a new ORC reader
  ///
  /// \param[in] file the data source
  /// \param[in] pool a MemoryPool to use for buffer allocations
  /// \return the returned reader object
  static Result<std::unique_ptr<ORCFileReader>> Open(
      const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool);

  /// \brief Return the schema read from the ORC file
  ///
  /// \return the returned Schema object
  Result<std::shared_ptr<Schema>> ReadSchema();

  /// \brief Read the file as a Table
  ///
  /// The table will be composed of one record batch per stripe.
  ///
  /// \return the returned Table
  Result<std::shared_ptr<Table>> Read();

  /// \brief Read the file as a Table
  ///
  /// The table will be composed of one record batch per stripe.
  ///
  /// \param[in] schema the Table schema
  /// \return the returned Table
  Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema);

  /// \brief Read the file as a Table
  ///
  /// The table will be composed of one record batch per stripe.
  ///
  /// \param[in] include_indices the selected field indices to read
  /// \return the returned Table
  Result<std::shared_ptr<Table>> Read(const std::vector<int>& include_indices);

  /// \brief Read the file as a Table
  ///
  /// The table will be composed of one record batch per stripe.
  ///
  /// \param[in] include_names the selected field names to read
  /// \return the returned Table
  Result<std::shared_ptr<Table>> Read(const std::vector<std::string>& include_names);

  /// \brief Read the file as a Table
  ///
  /// The table will be composed of one record batch per stripe.
  ///
  /// \param[in] schema the Table schema
  /// \param[in] include_indices the selected field indices to read
  /// \return the returned Table
  Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema,
                                      const std::vector<int>& include_indices);

  /// \brief Read a single stripe as a RecordBatch
  ///
  /// \param[in] stripe the stripe index
  /// \return the returned RecordBatch
  Result<std::shared_ptr<RecordBatch>> ReadStripe(int64_t stripe);

  /// \brief Read a single stripe as a RecordBatch
  ///
  /// \param[in] stripe the stripe index
  /// \param[in] include_indices the selected field indices to read
  /// \return the returned RecordBatch
  Result<std::shared_ptr<RecordBatch>> ReadStripe(
      int64_t stripe, const std::vector<int>& include_indices);

  /// \brief Read a single stripe as a RecordBatch
  ///
  /// \param[in] stripe the stripe index
  /// \param[in] include_names the selected field names to read
  /// \return the returned RecordBatch
  Result<std::shared_ptr<RecordBatch>> ReadStripe(
      int64_t stripe, const std::vector<std::string>& include_names);

  /// \brief Seek to the designated row.
  ///
  /// Calling NextStripeReader() after a seek returns a stripe reader that
  /// starts from the designated row.
  ///
  /// \param[in] row_number the row number to seek to
  /// \return Status
  Status Seek(int64_t row_number);

  /// \brief Get a stripe level record batch iterator.
  ///
  /// Each record batch will have up to `batch_size` rows.
  /// NextStripeReader serves as a fine-grained alternative to ReadStripe
  /// which may cause OOM issues by loading the whole stripe into memory.
  ///
  /// Note this will only read rows for the current stripe, not the entire
  /// file.
  ///
  /// \param[in] batch_size the maximum number of rows in each record batch
  /// \return the returned stripe reader
  Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(int64_t batch_size);

  /// \brief Get a stripe level record batch iterator.
  ///
  /// Each record batch will have up to `batch_size` rows.
  /// NextStripeReader serves as a fine-grained alternative to ReadStripe
  /// which may cause OOM issues by loading the whole stripe into memory.
  ///
  /// Note this will only read rows for the current stripe, not the entire
  /// file.
  ///
  /// \param[in] batch_size the maximum number of rows in each record batch
  /// \param[in] include_indices the selected field indices to read
  /// \return the stripe reader
  Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
      int64_t batch_size, const std::vector<int>& include_indices);

  /// \brief Get a record batch iterator for the entire file.
  ///
  /// Each record batch will have up to `batch_size` rows.
  ///
  /// \param[in] batch_size the maximum number of rows in each record batch
  /// \param[in] include_names the selected field names to read, if not empty
  /// (otherwise all fields are read)
  /// \return the record batch iterator
  Result<std::shared_ptr<RecordBatchReader>> GetRecordBatchReader(
      int64_t batch_size, const std::vector<std::string>& include_names);

  /// \brief The number of stripes in the file
  int64_t NumberOfStripes();

  /// \brief The number of rows in the file
  int64_t NumberOfRows();

  /// \brief Get the StripeInformation for a single stripe.
  ///
  /// \param[in] stripe the stripe to describe (presumably a stripe index, as
  /// for ReadStripe -- confirm against the implementation)
  /// \return the StripeInformation of that stripe
  StripeInformation GetStripeInformation(int64_t stripe);

  /// \brief Get the format version of the file.
  ///         Currently known values are 0.11 and 0.12.
  ///
  /// \return The FileVersion of the ORC file.
  FileVersion GetFileVersion();

  /// \brief Get the software instance and version that wrote this file.
  ///
  /// \return a user-facing string that specifies the software version
  std::string GetSoftwareVersion();

  /// \brief Get the compression kind of the file.
  ///
  /// \return The kind of compression in the ORC file.
  Result<Compression::type> GetCompression();

  /// \brief Get the buffer size for the compression.
  ///
  /// \return Number of bytes to buffer for the compression codec.
  int64_t GetCompressionSize();

  /// \brief Get the number of rows per entry in the row index.
  ///
  /// \return the number of rows per entry in the row index, or 0 if there
  ///          is no row index.
  int64_t GetRowIndexStride();

  /// \brief Get ID of writer that generated the file.
  ///
  /// \return UNKNOWN_WRITER if the writer ID is undefined
  WriterId GetWriterId();

  /// \brief Get the writer id value when GetWriterId() returns an unknown writer.
  ///
  /// \return the integer value of the writer ID.
  int32_t GetWriterIdValue();

  /// \brief Get the version of the writer.
  ///
  /// \return the version of the writer.

  WriterVersion GetWriterVersion();

  /// \brief Get the number of stripe statistics in the file.
  ///
  /// \return the number of stripe statistics
  int64_t GetNumberOfStripeStatistics();

  /// \brief Get the length of the data stripes in the file.
  ///
  /// \return the number of bytes in stripes
  int64_t GetContentLength();

  /// \brief Get the length of the file stripe statistics.
  ///
  /// \return the number of compressed bytes in the file stripe statistics
  int64_t GetStripeStatisticsLength();

  /// \brief Get the length of the file footer.
  ///
  /// \return the number of compressed bytes in the file footer
  int64_t GetFileFooterLength();

  /// \brief Get the length of the file postscript.
  ///
  /// \return the number of bytes in the file postscript
  int64_t GetFilePostscriptLength();

  /// \brief Get the total length of the file.
  ///
  /// \return the number of bytes in the file
  int64_t GetFileLength();

  /// \brief Get the serialized file tail.
  ///         Useful if another reader of the same file wants to avoid re-reading
  ///         the file tail. See ReadOptions.SetSerializedFileTail().
  ///
  /// \return a string of bytes with the file tail
  std::string GetSerializedFileTail();

  /// \brief Return the metadata read from the ORC file
  ///
  /// \return A KeyValueMetadata object containing the ORC metadata
  Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();

 private:
  // Implementation class; only forward-declared in this header.
  class Impl;
  // Owning pointer to the implementation object.
  std::unique_ptr<Impl> impl_;
  // Private: readers are created through the static Open() factory.
  ORCFileReader();
};

/// \class ORCFileWriter
/// \brief Write an Arrow Table or RecordBatch to an ORC file.
class ARROW_EXPORT ORCFileWriter {
 public:
  ~ORCFileWriter();
  /// \brief Creates a new ORC writer.
  ///
  /// \param[in] output_stream a pointer to the io::OutputStream to write into
  /// \param[in] write_options the ORC writer options for Arrow
  /// \return the returned writer object
  static Result<std::unique_ptr<ORCFileWriter>> Open(
      io::OutputStream* output_stream,
      const WriteOptions& write_options = WriteOptions());

  /// \brief Write a table. This can be called multiple times.
  ///
  /// Tables passed in subsequent calls must match the schema of the table that was
  /// written first.
  ///
  /// \param[in] table the Arrow table from which data is extracted.
  /// \return Status
  Status Write(const Table& table);

  /// \brief Write a RecordBatch. This can be called multiple times.
  ///
  /// RecordBatches passed in subsequent calls must match the schema of the
  /// RecordBatch that was written first.
  ///
  /// \param[in] record_batch the Arrow RecordBatch from which data is extracted.
  /// \return Status
  Status Write(const RecordBatch& record_batch);

  /// \brief Close an ORC writer (orc::Writer)
  ///
  /// \return Status
  Status Close();

 private:
  class Impl;
  std::unique_ptr<Impl> impl_;

 private:
  ORCFileWriter();
};

}  // namespace orc
}  // namespace adapters
}  // namespace arrow20