aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/apache/orc/c++/src/Reader.hh
blob: a1367e4bd3780ee0d5c506cb5638dbb3d4ec9beb (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ORC_READER_IMPL_HH
#define ORC_READER_IMPL_HH

#include "orc/Exceptions.hh"
#include "orc/Int128.hh"
#include "orc/OrcFile.hh"
#include "orc/Reader.hh"

#include "ColumnReader.hh"
#include "RLE.hh"
#include "SchemaEvolution.hh"
#include "TypeImpl.hh"
#include "sargs/SargsApplier.hh"

namespace orc {

  static const uint64_t DIRECTORY_SIZE_GUESS = 16 * 1024;

  /**
   * WriterVersion Implementation
   */
  class WriterVersionImpl {
   private:
    WriterVersion version;

   public:
    // Known Versions with issues resolved
    // The static method below is to fix global constructors Clang warning
    static const WriterVersionImpl& VERSION_HIVE_8732();

    WriterVersionImpl(WriterVersion ver) : version(ver) {}

    bool compareGT(const WriterVersion other) const {
      return version > other;
    }
  };

  /**
   * State shared between Reader and Row Reader
   */
  struct FileContents {
    std::unique_ptr<InputStream> stream;
    std::unique_ptr<proto::PostScript> postscript;
    std::unique_ptr<proto::Footer> footer;
    std::unique_ptr<Type> schema;
    uint64_t blockSize;
    CompressionKind compression;
    MemoryPool* pool;
    std::ostream* errorStream;
    /// Decimal64 in ORCv2 uses RLE to store values. This flag indicates whether
    /// this new encoding is used.
    bool isDecimalAsLong;
    std::unique_ptr<proto::Metadata> metadata;
    ReaderMetrics* readerMetrics;
  };

  proto::StripeFooter getStripeFooter(const proto::StripeInformation& info,
                                      const FileContents& contents);

  class ReaderImpl;
  class Timezone;

  class ColumnSelector {
   private:
    std::map<std::string, uint64_t> nameIdMap;
    std::map<uint64_t, const Type*> idTypeMap;
    const FileContents* contents;
    std::vector<std::string> columns;

    // build map from type name and id, id to Type
    void buildTypeNameIdMap(const Type* type);
    std::string toDotColumnPath();

   public:
    // Select a field by name
    void updateSelectedByName(std::vector<bool>& selectedColumns, const std::string& name);
    // Select a field by id
    void updateSelectedByFieldId(std::vector<bool>& selectedColumns, uint64_t fieldId);
    // Select a type by id
    void updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId);
    // Select a type by id and read intent map.
    void updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId,
                                const RowReaderOptions::IdReadIntentMap& idReadIntentMap);

    // Select all of the recursive children of the given type.
    void selectChildren(std::vector<bool>& selectedColumns, const Type& type);
    // Select a type id of the given type.
    // This function may also select all of the recursive children of the given type
    // depending on the read intent of that type in idReadIntentMap.
    void selectChildren(std::vector<bool>& selectedColumns, const Type& type,
                        const RowReaderOptions::IdReadIntentMap& idReadIntentMap);

    // For each child of type, select it if one of its children
    // is selected.
    bool selectParents(std::vector<bool>& selectedColumns, const Type& type);

    /**
     * Constructor that selects columns.
     * @param contents of the file
     */
    ColumnSelector(const FileContents* contents);

    // Select the columns from the RowReaderoptions object
    void updateSelected(std::vector<bool>& selectedColumns, const RowReaderOptions& options);

    // Select the columns from the Readeroptions object
    void updateSelected(std::vector<bool>& selectedColumns, const ReaderOptions& options);
  };

  class RowReaderImpl : public RowReader {
   private:
    const Timezone& localTimezone;

    // contents
    std::shared_ptr<FileContents> contents;
    const bool throwOnHive11DecimalOverflow;
    const int32_t forcedScaleOnHive11Decimal;

    // inputs
    std::vector<bool> selectedColumns;

    // footer
    proto::Footer* footer;
    DataBuffer<uint64_t> firstRowOfStripe;
    mutable std::unique_ptr<Type> selectedSchema;
    bool skipBloomFilters;

    // reading state
    uint64_t previousRow;
    uint64_t firstStripe;
    uint64_t currentStripe;
    uint64_t lastStripe;  // the stripe AFTER the last one
    uint64_t processingStripe;
    uint64_t currentRowInStripe;
    uint64_t rowsInCurrentStripe;
    // number of row groups between first stripe and last stripe
    uint64_t numRowGroupsInStripeRange;
    proto::StripeInformation currentStripeInfo;
    proto::StripeFooter currentStripeFooter;
    std::unique_ptr<ColumnReader> reader;

    bool enableEncodedBlock;
    bool useTightNumericVector;
    bool throwOnSchemaEvolutionOverflow;
    // internal methods
    void startNextStripe();
    inline void markEndOfFile();

    // row index of current stripe with column id as the key
    std::unordered_map<uint64_t, proto::RowIndex> rowIndexes;
    std::map<uint32_t, BloomFilterIndex> bloomFilterIndex;
    std::shared_ptr<SearchArgument> sargs;
    std::unique_ptr<SargsApplier> sargsApplier;

    // desired timezone to return data of timestamp types.
    const Timezone& readerTimezone;

    // match read and file types
    SchemaEvolution schemaEvolution;

    // load stripe index if not done so
    void loadStripeIndex();

    // In case of PPD, batch size should be aware of row group boundaries.
    // If only a subset of row groups are selected then the next read should
    // stop at the end of selected range.
    static uint64_t computeBatchSize(uint64_t requestedSize, uint64_t currentRowInStripe,
                                     uint64_t rowsInCurrentStripe, uint64_t rowIndexStride,
                                     const std::vector<uint64_t>& nextSkippedRows);

    // Skip non-selected rows
    static uint64_t advanceToNextRowGroup(uint64_t currentRowInStripe, uint64_t rowsInCurrentStripe,
                                          uint64_t rowIndexStride,
                                          const std::vector<uint64_t>& nextSkippedRows);

    friend class TestRowReader_advanceToNextRowGroup_Test;
    friend class TestRowReader_computeBatchSize_Test;

    // whether the current stripe is initialized
    inline bool isCurrentStripeInited() const {
      return currentStripe == processingStripe;
    }

    /**
     * Seek to the start of a row group in the current stripe
     * @param rowGroupEntryId the row group id to seek to
     */
    void seekToRowGroup(uint32_t rowGroupEntryId);

    /**
     * Check if the file has bad bloom filters. We will skip using them in the
     * following reads.
     * @return true if it has.
     */
    bool hasBadBloomFilters();

   public:
    /**
     * Constructor that lets the user specify additional options.
     * @param contents of the file
     * @param options options for reading
     */
    RowReaderImpl(std::shared_ptr<FileContents> contents, const RowReaderOptions& options);

    // Select the columns from the options object
    const std::vector<bool> getSelectedColumns() const override;

    const Type& getSelectedType() const override;

    std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size) const override;

    bool next(ColumnVectorBatch& data) override;

    CompressionKind getCompression() const;

    uint64_t getCompressionSize() const;

    uint64_t getRowNumber() const override;

    void seekToRow(uint64_t rowNumber) override;

    const FileContents& getFileContents() const;
    bool getThrowOnHive11DecimalOverflow() const;
    bool getIsDecimalAsLong() const;
    int32_t getForcedScaleOnHive11Decimal() const;

    const SchemaEvolution* getSchemaEvolution() const {
      return &schemaEvolution;
    }
  };

  class ReaderImpl : public Reader {
   private:
    // FileContents
    std::shared_ptr<FileContents> contents;

    // inputs
    const ReaderOptions options;
    const uint64_t fileLength;
    const uint64_t postscriptLength;

    // footer
    proto::Footer* footer;
    uint64_t numberOfStripes;
    uint64_t getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns);

    // internal methods
    void readMetadata() const;
    void checkOrcVersion();
    void getRowIndexStatistics(
        const proto::StripeInformation& stripeInfo, uint64_t stripeIndex,
        const proto::StripeFooter& currentStripeFooter,
        std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const;

    // metadata
    mutable bool isMetadataLoaded;

   public:
    /**
     * Constructor that lets the user specify additional options.
     * @param contents of the file
     * @param options options for reading
     * @param fileLength the length of the file in bytes
     * @param postscriptLength the length of the postscript in bytes
     */
    ReaderImpl(std::shared_ptr<FileContents> contents, const ReaderOptions& options,
               uint64_t fileLength, uint64_t postscriptLength);

    const ReaderOptions& getReaderOptions() const;

    CompressionKind getCompression() const override;

    FileVersion getFormatVersion() const override;

    WriterId getWriterId() const override;

    uint32_t getWriterIdValue() const override;

    std::string getSoftwareVersion() const override;

    WriterVersion getWriterVersion() const override;

    uint64_t getNumberOfRows() const override;

    uint64_t getRowIndexStride() const override;

    std::list<std::string> getMetadataKeys() const override;

    std::string getMetadataValue(const std::string& key) const override;

    bool hasMetadataValue(const std::string& key) const override;

    uint64_t getCompressionSize() const override;

    uint64_t getNumberOfStripes() const override;

    std::unique_ptr<StripeInformation> getStripe(uint64_t) const override;

    uint64_t getNumberOfStripeStatistics() const override;

    const std::string& getStreamName() const override;

    std::unique_ptr<StripeStatistics> getStripeStatistics(uint64_t stripeIndex) const override;

    std::unique_ptr<RowReader> createRowReader() const override;

    std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options) const override;

    uint64_t getContentLength() const override;
    uint64_t getStripeStatisticsLength() const override;
    uint64_t getFileFooterLength() const override;
    uint64_t getFilePostscriptLength() const override;
    uint64_t getFileLength() const override;

    std::unique_ptr<Statistics> getStatistics() const override;

    std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId) const override;

    std::string getSerializedFileTail() const override;

    const Type& getType() const override;

    bool hasCorrectStatistics() const override;

    const ReaderMetrics* getReaderMetrics() const override {
      return contents->readerMetrics;
    }

    const proto::PostScript* getPostscript() const {
      return contents->postscript.get();
    }

    uint64_t getBlockSize() const {
      return contents->blockSize;
    }

    const proto::Footer* getFooter() const {
      return contents->footer.get();
    }

    const Type* getSchema() const {
      return contents->schema.get();
    }

    InputStream* getStream() const {
      return contents->stream.get();
    }

    uint64_t getMemoryUse(int stripeIx = -1) override;

    uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx = -1) override;

    uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx = -1) override;

    uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx = -1) override;

    std::map<uint32_t, BloomFilterIndex> getBloomFilters(
        uint32_t stripeIndex, const std::set<uint32_t>& included) const override;
  };
}  // namespace orc

#endif