diff options
author | robot-contrib <robot-contrib@yandex-team.com> | 2025-01-14 13:24:41 +0300 |
---|---|---|
committer | robot-contrib <robot-contrib@yandex-team.com> | 2025-01-14 13:52:04 +0300 |
commit | 9e771f1b1c96aedbfdd5ac897a61aa7af1fb1684 (patch) | |
tree | f68905ba70bf0ac0db3f6b06edc1034395def40a /contrib | |
parent | d04cf8fc2232c749af6ad9ffc0a8d235627db0aa (diff) | |
download | ydb-9e771f1b1c96aedbfdd5ac897a61aa7af1fb1684.tar.gz |
Update contrib/libs/apache/orc to 2.1.0
commit_hash:69caf27dc9a3b69957ea34c11fa5f7f2d2f6360a
Diffstat (limited to 'contrib')
72 files changed, 5458 insertions, 3854 deletions
diff --git a/contrib/libs/apache/orc/.yandex_meta/devtools.licenses.report b/contrib/libs/apache/orc/.yandex_meta/devtools.licenses.report index bd6f063606..a2e9c7ccd9 100644 --- a/contrib/libs/apache/orc/.yandex_meta/devtools.licenses.report +++ b/contrib/libs/apache/orc/.yandex_meta/devtools.licenses.report @@ -31,7 +31,7 @@ KEEP Apache-2.0 44dc743c95835a9e71d7b3cca63dcc7c BELONGS ya.make -FILE_INCLUDE NOTICE found in files: c++/include/orc/BloomFilter.hh at line 3, c++/include/orc/ColumnPrinter.hh at line 3, c++/include/orc/Common.hh at line 3, c++/include/orc/Exceptions.hh at line 3, c++/include/orc/Int128.hh at line 3, c++/include/orc/MemoryPool.hh at line 3, c++/include/orc/OrcFile.hh at line 3, c++/include/orc/Reader.hh at line 3, c++/include/orc/Statistics.hh at line 3, c++/include/orc/Type.hh at line 3, c++/include/orc/Vector.hh at line 3, c++/include/orc/Writer.hh at line 3, c++/include/orc/orc-config.hh at line 3, c++/include/orc/sargs/Literal.hh at line 3, c++/include/orc/sargs/SearchArgument.hh at line 3, c++/include/orc/sargs/TruthValue.hh at line 3, c++/src/Adaptor-linux.hh at line 3, c++/src/Adaptor.cc at line 3, c++/src/BlockBuffer.cc at line 3, c++/src/BlockBuffer.hh at line 3, c++/src/BloomFilter.cc at line 3, c++/src/BloomFilter.hh at line 3, c++/src/Bpacking.hh at line 3, c++/src/BpackingDefault.cc at line 3, c++/src/BpackingDefault.hh at line 3, c++/src/ByteRLE.cc at line 3, c++/src/ByteRLE.hh at line 3, c++/src/ColumnPrinter.cc at line 3, c++/src/ColumnReader.cc at line 3, c++/src/ColumnReader.hh at line 3, c++/src/ColumnWriter.cc at line 3, c++/src/ColumnWriter.hh at line 3, c++/src/Common.cc at line 3, c++/src/Compression.cc at line 3, c++/src/Compression.hh at line 3, c++/src/ConvertColumnReader.cc at line 3, c++/src/ConvertColumnReader.hh at line 3, c++/src/CpuInfoUtil.cc at line 3, c++/src/CpuInfoUtil.hh at line 3, c++/src/Dispatch.hh at line 3, c++/src/Exceptions.cc at line 3, c++/src/Int128.cc at line 3, c++/src/LzoDecompressor.hh at line 3, c++/src/MemoryPool.cc at line 3, c++/src/Murmur3.cc at line 3, c++/src/Murmur3.hh at line 3, c++/src/Options.hh at line 3, c++/src/OrcFile.cc at line 3, c++/src/RLE.cc at line 3, c++/src/RLE.hh at line 3, c++/src/RLEV2Util.hh at line 3, c++/src/RLEv1.cc at line 3, c++/src/RLEv1.hh at line 3, c++/src/RLEv2.hh at line 3, c++/src/Reader.cc at line 3, c++/src/Reader.hh at line 3, c++/src/RleDecoderV2.cc at line 3, c++/src/SchemaEvolution.cc at line 3, c++/src/SchemaEvolution.hh at line 3, c++/src/Statistics.cc at line 3, c++/src/Statistics.hh at line 3, c++/src/StripeStream.cc at line 3, c++/src/StripeStream.hh at line 3, c++/src/Timezone.cc at line 3, c++/src/Timezone.hh at line 3, c++/src/TypeImpl.cc at line 3, c++/src/TypeImpl.hh at line 3, c++/src/Utils.hh at line 3, c++/src/Vector.cc at line 3, c++/src/Writer.cc at line 3, c++/src/io/InputStream.cc at line 3, c++/src/io/InputStream.hh at line 3, c++/src/io/OutputStream.cc at line 3, c++/src/io/OutputStream.hh at line 3, c++/src/sargs/ExpressionTree.cc at line 3, c++/src/sargs/ExpressionTree.hh at line 3, c++/src/sargs/Literal.cc at line 3, c++/src/sargs/PredicateLeaf.cc at line 3, c++/src/sargs/PredicateLeaf.hh at line 3, c++/src/sargs/SargsApplier.cc at line 3, c++/src/sargs/SargsApplier.hh at line 3, c++/src/sargs/SearchArgument.cc at line 3, c++/src/sargs/SearchArgument.hh at line 3, c++/src/sargs/TruthValue.cc at line 3 +FILE_INCLUDE NOTICE found in files: c++/include/orc/BloomFilter.hh at line 3, c++/include/orc/ColumnPrinter.hh at line 3, c++/include/orc/Common.hh at line 3, c++/include/orc/Exceptions.hh at line 3, c++/include/orc/Int128.hh at line 3, c++/include/orc/MemoryPool.hh at line 3, c++/include/orc/OrcFile.hh at line 3, c++/include/orc/Reader.hh at line 3, c++/include/orc/Statistics.hh at line 3, c++/include/orc/Type.hh at line 3, c++/include/orc/Vector.hh at line 3, c++/include/orc/Writer.hh at line 3, c++/include/orc/orc-config.hh at line 3, c++/include/orc/sargs/Literal.hh at line 3, c++/include/orc/sargs/SearchArgument.hh at line 3, c++/include/orc/sargs/TruthValue.hh at line 3, c++/src/Adaptor-linux.hh at line 3, c++/src/Adaptor.cc at line 3, c++/src/BlockBuffer.cc at line 3, c++/src/BlockBuffer.hh at line 3, c++/src/BloomFilter.cc at line 3, c++/src/BloomFilter.hh at line 3, c++/src/Bpacking.hh at line 3, c++/src/BpackingDefault.cc at line 3, c++/src/BpackingDefault.hh at line 3, c++/src/ByteRLE.cc at line 3, c++/src/ByteRLE.hh at line 3, c++/src/ColumnPrinter.cc at line 3, c++/src/ColumnReader.cc at line 3, c++/src/ColumnReader.hh at line 3, c++/src/ColumnWriter.cc at line 3, c++/src/ColumnWriter.hh at line 3, c++/src/Common.cc at line 3, c++/src/Compression.cc at line 3, c++/src/Compression.hh at line 3, c++/src/ConvertColumnReader.cc at line 3, c++/src/ConvertColumnReader.hh at line 3, c++/src/CpuInfoUtil.cc at line 3, c++/src/CpuInfoUtil.hh at line 3, c++/src/Dispatch.hh at line 3, c++/src/Exceptions.cc at line 3, c++/src/Int128.cc at line 3, c++/src/LzoDecompressor.hh at line 3, c++/src/MemoryPool.cc at line 3, c++/src/Murmur3.cc at line 3, c++/src/Murmur3.hh at line 3, c++/src/Options.hh at line 3, c++/src/OrcFile.cc at line 3, c++/src/RLE.cc at line 3, c++/src/RLE.hh at line 3, c++/src/RLEV2Util.hh at line 3, c++/src/RLEv1.cc at line 3, c++/src/RLEv1.hh at line 3, c++/src/RLEv2.hh at line 3, c++/src/Reader.cc at line 3, c++/src/Reader.hh at line 3, c++/src/RleDecoderV2.cc at line 3, c++/src/SchemaEvolution.cc at line 3, c++/src/SchemaEvolution.hh at line 3, c++/src/Statistics.cc at line 3, c++/src/Statistics.hh at line 3, c++/src/StripeStream.cc at line 3, c++/src/StripeStream.hh at line 3, c++/src/Timezone.cc at line 3, c++/src/Timezone.hh at line 3, c++/src/TypeImpl.cc at line 3, c++/src/TypeImpl.hh at line 3, c++/src/Utils.hh at line 3, c++/src/Vector.cc at line 3, c++/src/Writer.cc at line 3, c++/src/io/Cache.cc at line 3, c++/src/io/Cache.hh at line 3, c++/src/io/InputStream.cc at line 3, c++/src/io/InputStream.hh at line 3, c++/src/io/OutputStream.cc at line 3, c++/src/io/OutputStream.hh at line 3, c++/src/sargs/ExpressionTree.cc at line 3, c++/src/sargs/ExpressionTree.hh at line 3, c++/src/sargs/Literal.cc at line 3, c++/src/sargs/PredicateLeaf.cc at line 3, c++/src/sargs/PredicateLeaf.hh at line 3, c++/src/sargs/SargsApplier.cc at line 3, c++/src/sargs/SargsApplier.hh at line 3, c++/src/sargs/SearchArgument.cc at line 3, c++/src/sargs/SearchArgument.hh at line 3, c++/src/sargs/TruthValue.cc at line 3 Note: matched license text is too long. Read it in the source files. Scancode info: Original SPDX id: Apache-2.0 @@ -109,6 +109,8 @@ FILE_INCLUDE NOTICE found in files: c++/include/orc/BloomFilter.hh at line 3, c+ c++/src/Utils.hh [2:16] c++/src/Vector.cc [2:16] c++/src/Writer.cc [2:16] + c++/src/io/Cache.cc [2:16] + c++/src/io/Cache.hh [2:16] c++/src/io/InputStream.cc [2:16] c++/src/io/InputStream.hh [2:16] c++/src/io/OutputStream.cc [2:16] diff --git a/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh b/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh index 328c0e84b6..dbdd49a65b 100644 --- a/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh +++ b/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh @@ -29,7 +29,6 @@ #include <vector> namespace orc { - class ColumnPrinter { protected: std::string& buffer; @@ -42,8 +41,13 @@ namespace orc { virtual void printRow(uint64_t rowId) = 0; // should be called once at the start of each batch of rows virtual void reset(const ColumnVectorBatch& batch); + struct Param { + bool printDecimalAsString = false; + bool printDecimalTrimTrailingZeros = false; + }; }; - std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string&, const Type* type); + std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string&, const Type* type, + ColumnPrinter::Param = {}); } // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Common.hh b/contrib/libs/apache/orc/c++/include/orc/Common.hh index e983280e46..d72ecc9f62 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Common.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Common.hh @@ -33,32 +33,32 @@ namespace orc { class FileVersion { private: - uint32_t majorVersion; - uint32_t minorVersion; + uint32_t majorVersion_; + uint32_t minorVersion_; public: static const FileVersion& v_0_11(); static const FileVersion& v_0_12(); static const FileVersion& UNSTABLE_PRE_2_0(); - FileVersion(uint32_t major, uint32_t minor) : majorVersion(major), minorVersion(minor) {} + FileVersion(uint32_t major, uint32_t minor) : majorVersion_(major), minorVersion_(minor) {} /** * Get major version */ uint32_t getMajor() const { - return this->majorVersion; + return this->majorVersion_; } /** * Get minor version */ uint32_t getMinor() const { - return this->minorVersion; + return this->minorVersion_; } bool operator==(const FileVersion& right) const { - return this->majorVersion == right.getMajor() && this->minorVersion == right.getMinor(); + return this->majorVersion_ == right.getMajor() && this->minorVersion_ == right.getMinor(); } bool operator!=(const FileVersion& right) const { diff --git a/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh b/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh index 0536dbd164..b19a00760c 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh @@ -28,8 +28,8 @@ namespace orc { class NotImplementedYet : public std::logic_error { public: - explicit NotImplementedYet(const std::string& what_arg); - explicit NotImplementedYet(const char* what_arg); + explicit NotImplementedYet(const std::string& whatArg); + explicit NotImplementedYet(const char* whatArg); ~NotImplementedYet() noexcept override; NotImplementedYet(const NotImplementedYet&); @@ -39,8 +39,8 @@ namespace orc { class ParseError : public std::runtime_error { public: - explicit ParseError(const std::string& what_arg); - explicit ParseError(const char* what_arg); + explicit ParseError(const std::string& whatArg); + explicit ParseError(const char* whatArg); ~ParseError() noexcept override; ParseError(const ParseError&); @@ -50,8 +50,8 @@ namespace orc { class InvalidArgument : public std::runtime_error { public: - explicit InvalidArgument(const std::string& what_arg); - explicit InvalidArgument(const char* what_arg); + explicit InvalidArgument(const std::string& whatArg); + explicit InvalidArgument(const char* whatArg); ~InvalidArgument() noexcept override; InvalidArgument(const InvalidArgument&); @@ -61,12 +61,24 @@ namespace orc { class SchemaEvolutionError : public std::logic_error { public: - explicit SchemaEvolutionError(const std::string& what_arg); - explicit SchemaEvolutionError(const char* what_arg); + explicit SchemaEvolutionError(const std::string& whatArg); + explicit SchemaEvolutionError(const char* whatArg); virtual ~SchemaEvolutionError() noexcept override; SchemaEvolutionError(const SchemaEvolutionError&); SchemaEvolutionError& operator=(const SchemaEvolutionError&) = delete; }; + + class CompressionError : public std::runtime_error { + public: + explicit CompressionError(const std::string& whatArg); + explicit CompressionError(const char* whatArg); + ~CompressionError() noexcept override; + CompressionError(const CompressionError&); + + private: + CompressionError& operator=(const CompressionError&); + }; + } // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Int128.hh b/contrib/libs/apache/orc/c++/include/orc/Int128.hh index bcb4a58e22..6954c771cf 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Int128.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Int128.hh @@ -37,8 +37,8 @@ namespace orc { class Int128 { public: Int128() { - highbits = 0; - lowbits = 0; + highbits_ = 0; + lowbits_ = 0; } /** @@ -46,11 +46,11 @@ namespace orc { */ Int128(int64_t right) { if (right >= 0) { - highbits = 0; - lowbits = static_cast<uint64_t>(right); + highbits_ = 0; + lowbits_ = static_cast<uint64_t>(right); } else { - highbits = -1; - lowbits = static_cast<uint64_t>(right); + highbits_ = -1; + lowbits_ = static_cast<uint64_t>(right); } } @@ -58,8 +58,8 @@ namespace orc { * Create from the twos complement representation. */ Int128(int64_t high, uint64_t low) { - highbits = high; - lowbits = low; + highbits_ = high; + lowbits_ = low; } /** @@ -78,16 +78,16 @@ namespace orc { static Int128 minimumValue(); Int128& negate() { - lowbits = ~lowbits + 1; - highbits = ~highbits; - if (lowbits == 0) { - highbits += 1; + lowbits_ = ~lowbits_ + 1; + highbits_ = ~highbits_; + if (lowbits_ == 0) { + highbits_ += 1; } return *this; } Int128& abs() { - if (highbits < 0) { + if (highbits_ < 0) { negate(); } return *this; @@ -100,8 +100,8 @@ namespace orc { } Int128& invert() { - lowbits = ~lowbits; - highbits = ~highbits; + lowbits_ = ~lowbits_; + highbits_ = ~highbits_; return *this; } @@ -111,12 +111,12 @@ namespace orc { * @return *this */ Int128& operator+=(const Int128& right) { - uint64_t sum = lowbits + right.lowbits; - highbits += right.highbits; - if (sum < lowbits) { - highbits += 1; + uint64_t sum = lowbits_ + right.lowbits_; + highbits_ += right.highbits_; + if (sum < lowbits_) { + highbits_ += 1; } - lowbits = sum; + lowbits_ = sum; return *this; } @@ -126,12 +126,12 @@ namespace orc { * @return *this */ Int128& operator-=(const Int128& right) { - uint64_t diff = lowbits - right.lowbits; - highbits -= right.highbits; - if (diff > lowbits) { - highbits -= 1; + uint64_t diff = lowbits_ - right.lowbits_; + highbits_ -= right.highbits_; + if (diff > lowbits_) { + highbits_ -= 1; } - lowbits = diff; + lowbits_ = diff; return *this; } @@ -162,8 +162,8 @@ namespace orc { * @return *this */ Int128& operator|=(const Int128& right) { - lowbits |= right.lowbits; - highbits |= right.highbits; + lowbits_ |= right.lowbits_; + highbits_ |= right.highbits_; return *this; } @@ -173,8 +173,8 @@ namespace orc { * @return *this */ Int128& operator&=(const Int128& right) { - lowbits &= right.lowbits; - highbits &= right.highbits; + lowbits_ &= right.lowbits_; + highbits_ &= right.highbits_; return *this; } @@ -196,15 +196,15 @@ namespace orc { Int128& operator<<=(uint32_t bits) { if (bits != 0) { if (bits < 64) { - highbits <<= bits; - highbits |= (lowbits >> (64 - bits)); - lowbits <<= bits; + highbits_ <<= bits; + highbits_ |= (lowbits_ >> (64 - bits)); + lowbits_ <<= bits; } else if (bits < 128) { - highbits = static_cast<int64_t>(lowbits) << (bits - 64); - lowbits = 0; + highbits_ = static_cast<int64_t>(lowbits_) << (bits - 64); + lowbits_ = 0; } else { - highbits = 0; - lowbits = 0; + highbits_ = 0; + lowbits_ = 0; } } return *this; @@ -217,74 +217,74 @@ namespace orc { Int128& operator>>=(uint32_t bits) { if (bits != 0) { if (bits < 64) { - lowbits >>= bits; - lowbits |= static_cast<uint64_t>(highbits << (64 - bits)); - highbits = static_cast<int64_t>(static_cast<uint64_t>(highbits) >> bits); + lowbits_ >>= bits; + lowbits_ |= static_cast<uint64_t>(highbits_ << (64 - bits)); + highbits_ = static_cast<int64_t>(static_cast<uint64_t>(highbits_) >> bits); } else if (bits < 128) { - lowbits = static_cast<uint64_t>(highbits >> (bits - 64)); - highbits = highbits >= 0 ? 0 : -1l; + lowbits_ = static_cast<uint64_t>(highbits_ >> (bits - 64)); + highbits_ = highbits_ >= 0 ? 0 : -1l; } else { - highbits = highbits >= 0 ? 0 : -1l; - lowbits = static_cast<uint64_t>(highbits); + highbits_ = highbits_ >= 0 ? 0 : -1l; + lowbits_ = static_cast<uint64_t>(highbits_); } } return *this; } bool operator==(const Int128& right) const { - return highbits == right.highbits && lowbits == right.lowbits; + return highbits_ == right.highbits_ && lowbits_ == right.lowbits_; } bool operator!=(const Int128& right) const { - return highbits != right.highbits || lowbits != right.lowbits; + return highbits_ != right.highbits_ || lowbits_ != right.lowbits_; } bool operator<(const Int128& right) const { - if (highbits == right.highbits) { - return lowbits < right.lowbits; + if (highbits_ == right.highbits_) { + return lowbits_ < right.lowbits_; } else { - return highbits < right.highbits; + return highbits_ < right.highbits_; } } bool operator<=(const Int128& right) const { - if (highbits == right.highbits) { - return lowbits <= right.lowbits; + if (highbits_ == right.highbits_) { + return lowbits_ <= right.lowbits_; } else { - return highbits <= right.highbits; + return highbits_ <= right.highbits_; } } bool operator>(const Int128& right) const { - if (highbits == right.highbits) { - return lowbits > right.lowbits; + if (highbits_ == right.highbits_) { + return lowbits_ > right.lowbits_; } else { - return highbits > right.highbits; + return highbits_ > right.highbits_; } } bool operator>=(const Int128& right) const { - if (highbits == right.highbits) { - return lowbits >= right.lowbits; + if (highbits_ == right.highbits_) { + return lowbits_ >= right.lowbits_; } else { - return highbits >= right.highbits; + return highbits_ >= right.highbits_; } } uint32_t hash() const { - return static_cast<uint32_t>(highbits >> 32) ^ static_cast<uint32_t>(highbits) ^ - static_cast<uint32_t>(lowbits >> 32) ^ static_cast<uint32_t>(lowbits); + return static_cast<uint32_t>(highbits_ >> 32) ^ static_cast<uint32_t>(highbits_) ^ + static_cast<uint32_t>(lowbits_ >> 32) ^ static_cast<uint32_t>(lowbits_); } /** * Does this value fit into a long? */ bool fitsInLong() const { - switch (highbits) { + switch (highbits_) { case 0: - return 0 == (lowbits & LONG_SIGN_BIT); + return 0 == (lowbits_ & LONG_SIGN_BIT); case -1: - return 0 != (lowbits & LONG_SIGN_BIT); + return 0 != (lowbits_ & LONG_SIGN_BIT); default: return false; } @@ -295,7 +295,7 @@ namespace orc { */ int64_t toLong() const { if (fitsInLong()) { - return static_cast<int64_t>(lowbits); + return static_cast<int64_t>(lowbits_); } throw std::range_error("Int128 too large to convert to long"); } @@ -331,14 +331,14 @@ namespace orc { * Get the high bits of the twos complement representation of the number. */ int64_t getHighBits() const { - return highbits; + return highbits_; } /** * Get the low bits of the twos complement representation of the number. */ uint64_t getLowBits() const { - return lowbits; + return lowbits_; } /** @@ -352,8 +352,8 @@ namespace orc { private: static const uint64_t LONG_SIGN_BIT = 0x8000000000000000u; - int64_t highbits; - uint64_t lowbits; + int64_t highbits_; + uint64_t lowbits_; }; /** diff --git a/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh b/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh index 6d999d3aa8..a914e5f260 100644 --- a/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh +++ b/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh @@ -36,50 +36,50 @@ namespace orc { template <class T> class DataBuffer { private: - MemoryPool& memoryPool; - T* buf; + MemoryPool& memoryPool_; + T* buf_; // current size - uint64_t currentSize; + uint64_t currentSize_; // maximal capacity (actual allocated memory) - uint64_t currentCapacity; + uint64_t currentCapacity_; // not implemented DataBuffer(DataBuffer& buffer); DataBuffer& operator=(DataBuffer& buffer); public: - DataBuffer(MemoryPool& pool, uint64_t _size = 0); + DataBuffer(MemoryPool& pool, uint64_t size = 0); DataBuffer(DataBuffer<T>&& buffer) noexcept; virtual ~DataBuffer(); T* data() { - return buf; + return buf_; } const T* data() const { - return buf; + return buf_; } uint64_t size() const { - return currentSize; + return currentSize_; } uint64_t capacity() const { - return currentCapacity; + return currentCapacity_; } const T& operator[](uint64_t i) const { - return buf[i]; + return buf_[i]; } T& operator[](uint64_t i) { - return buf[i]; + return buf_[i]; } - void reserve(uint64_t _size); - void resize(uint64_t _size); + void reserve(uint64_t size); + void resize(uint64_t size); void zeroOut(); }; diff --git a/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh b/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh index a9ad692d42..ea71567c5f 100644 --- a/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh +++ b/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh @@ -19,6 +19,7 @@ #ifndef ORC_FILE_HH #define ORC_FILE_HH +#include <future> #include <string> #include "orc/Reader.hh" @@ -59,6 +60,18 @@ namespace orc { virtual void read(void* buf, uint64_t length, uint64_t offset) = 0; /** + * Read data asynchronously into the buffer. The buffer is allocated by the caller. + * @param buf the buffer to read into + * @param length the number of bytes to read. + * @param offset the position in the stream to read from. + * @return a future that will be set when the read is complete. + */ + virtual std::future<void> readAsync(void* buf, uint64_t length, uint64_t offset) { + return std::async(std::launch::async, + [this, buf, length, offset] { this->read(buf, length, offset); }); + } + + /** * Get the name of the stream for error messages. */ virtual const std::string& getName() const = 0; diff --git a/contrib/libs/apache/orc/c++/include/orc/Reader.hh b/contrib/libs/apache/orc/c++/include/orc/Reader.hh index b631c2c6ea..b015b64910 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Reader.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Reader.hh @@ -40,6 +40,17 @@ namespace orc { struct ReaderOptionsPrivate; struct RowReaderOptionsPrivate; + struct CacheOptions { + // The maximum distance in bytes between two consecutive + // ranges; beyond this value, ranges are not combined + uint64_t holeSizeLimit = 8192; + + // The maximum size in bytes of a combined range; if + // combining two consecutive ranges would produce a range of a + // size greater than this, they are not combined + uint64_t rangeSizeLimit = 32 * 1024 * 1024; + }; + /** * Expose the reader metrics including the latency and * number of calls of the decompression/decoding/IO modules. @@ -59,15 +70,26 @@ namespace orc { std::atomic<uint64_t> IOBlockingLatencyUs{0}; std::atomic<uint64_t> SelectedRowGroupCount{0}; std::atomic<uint64_t> EvaluatedRowGroupCount{0}; + std::atomic<uint64_t> ReadRangeCacheHits{0}; + std::atomic<uint64_t> ReadRangeCacheMisses{0}; }; ReaderMetrics* getDefaultReaderMetrics(); + // Row group index of a single column in a stripe. + struct RowGroupIndex { + // Positions are represented as a two-dimensional array where the first + // dimension is row group index and the second dimension is the position + // list of the row group. The size of the second dimension should be equal + // among all row groups. + std::vector<std::vector<uint64_t>> positions; + }; + /** * Options for creating a Reader. */ class ReaderOptions { private: - std::unique_ptr<ReaderOptionsPrivate> privateBits; + std::unique_ptr<ReaderOptionsPrivate> privateBits_; public: ReaderOptions(); @@ -108,6 +130,11 @@ namespace orc { ReaderOptions& setReaderMetrics(ReaderMetrics* metrics); /** + * Set the cache options. + */ + ReaderOptions& setCacheOptions(const CacheOptions& cacheOptions); + + /** * Set the location of the tail as defined by the logical length of the * file. */ @@ -138,6 +165,11 @@ namespace orc { * Get the reader metrics. */ ReaderMetrics* getReaderMetrics() const; + + /** + * Set the cache options. + */ + const CacheOptions& getCacheOptions() const; }; /** @@ -145,7 +177,7 @@ namespace orc { */ class RowReaderOptions { private: - std::unique_ptr<RowReaderOptionsPrivate> privateBits; + std::unique_ptr<RowReaderOptionsPrivate> privateBits_; public: RowReaderOptions(); @@ -605,6 +637,33 @@ namespace orc { */ virtual std::map<uint32_t, BloomFilterIndex> getBloomFilters( uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0; + + /** + * Get row group index of all selected columns in the specified stripe + * @param stripeIndex index of the stripe to be read for row group index. + * @param included index of selected columns to return (if not specified, + * all columns will be returned). + * @return map of row group index keyed by its column index. + */ + virtual std::map<uint32_t, RowGroupIndex> getRowGroupIndex( + uint32_t stripeIndex, const std::set<uint32_t>& included = {}) const = 0; + + /** + * Trigger IO prefetch and cache the prefetched contents asynchronously. + * It is thread safe. Users should make sure requested stripes and columns + * are not overlapped, otherwise the overlapping part will be prefetched multiple time, + * which doesn't affect correctness but waste IO and memory resources. + * @param stripes the stripes to prefetch + * @param includeTypes the types to prefetch + */ + virtual void preBuffer(const std::vector<uint32_t>& stripes, + const std::list<uint64_t>& includeTypes) = 0; + + /** + * Release cached entries whose right boundary is less than or equal to the given boundary. + * @param boundary the boundary value to release cache entries + */ + virtual void releaseBuffer(uint64_t boundary) = 0; }; /** diff --git a/contrib/libs/apache/orc/c++/include/orc/Vector.hh b/contrib/libs/apache/orc/c++/include/orc/Vector.hh index 0dfe926965..663bef9cd7 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Vector.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Vector.hh @@ -57,6 +57,8 @@ namespace orc { bool hasNulls; // whether the vector batch is encoded bool isEncoded; + // whether the dictionary is decoded into vector batch + bool dictionaryDecoded; // custom memory pool MemoryPool& memoryPool; @@ -88,6 +90,14 @@ namespace orc { */ virtual bool hasVariableLength(); + /** + * Decode possible dictionary into vector batch. + */ + void decodeDictionary(); + + protected: + virtual void decodeDictionaryImpl() {} + private: ColumnVectorBatch(const ColumnVectorBatch&); ColumnVectorBatch& operator=(const ColumnVectorBatch&); @@ -248,6 +258,10 @@ namespace orc { ~EncodedStringVectorBatch() override; std::string toString() const override; void resize(uint64_t capacity) override; + + // Calculate data and length in StringVectorBatch from dictionary and index + void decodeDictionaryImpl() override; + std::shared_ptr<StringDictionary> dictionary; // index for dictionary entry @@ -264,6 +278,9 @@ namespace orc { bool hasVariableLength() override; std::vector<ColumnVectorBatch*> fields; + + protected: + void decodeDictionaryImpl() override; }; struct ListVectorBatch : public ColumnVectorBatch { @@ -283,6 +300,9 @@ namespace orc { // the concatenated elements std::unique_ptr<ColumnVectorBatch> elements; + + protected: + void decodeDictionaryImpl() override; }; struct MapVectorBatch : public ColumnVectorBatch { @@ -304,6 +324,9 @@ namespace orc { std::unique_ptr<ColumnVectorBatch> keys; // the concatenated elements std::unique_ptr<ColumnVectorBatch> elements; + + protected: + void decodeDictionaryImpl() override; }; struct UnionVectorBatch : public ColumnVectorBatch { @@ -327,6 +350,9 @@ namespace orc { // the sub-columns std::vector<ColumnVectorBatch*> children; + + protected: + void decodeDictionaryImpl() override; }; struct Decimal { diff --git a/contrib/libs/apache/orc/c++/include/orc/Writer.hh b/contrib/libs/apache/orc/c++/include/orc/Writer.hh index 047ee9ffc5..78f06739bc 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Writer.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Writer.hh @@ -55,7 +55,7 @@ namespace orc { */ class WriterOptions { private: - std::unique_ptr<WriterOptionsPrivate> privateBits; + std::unique_ptr<WriterOptionsPrivate> privateBits_; public: WriterOptions(); @@ -277,6 +277,32 @@ namespace orc { * @return if not set, return default value which is 1 MB. */ uint64_t getOutputBufferCapacity() const; + + /** + * Set the initial block size of original input buffer in the class CompressionStream. + * the input buffer is used to store raw data before compression, while the output buffer is + * dedicated to holding compressed data + */ + WriterOptions& setMemoryBlockSize(uint64_t capacity); + + /** + * Get the initial block size of original input buffer in the class CompressionStream. + * @return if not set, return default value which is 64 KB. + */ + uint64_t getMemoryBlockSize() const; + + /** + * Set whether the compression block should be aligned to row group boundary. + * The boolean type may not be aligned to row group boundary due to the + * requirement of the Boolean RLE encoder to pack input bits into bytes + */ + WriterOptions& setAlignBlockBoundToRowGroup(bool alignBlockBoundToRowGroup); + + /** + * Get if the compression block should be aligned to row group boundary. + * @return if not set, return default value which is false. + */ + bool getAlignBlockBoundToRowGroup() const; }; class Writer { diff --git a/contrib/libs/apache/orc/c++/include/orc/orc-config.hh b/contrib/libs/apache/orc/c++/include/orc/orc-config.hh index 5205a56af6..7bd4ac63b5 100644 --- a/contrib/libs/apache/orc/c++/include/orc/orc-config.hh +++ b/contrib/libs/apache/orc/c++/include/orc/orc-config.hh @@ -19,7 +19,7 @@ #ifndef ORC_CONFIG_HH #define ORC_CONFIG_HH -#define ORC_VERSION "2.0.3" +#define ORC_VERSION "2.1.0" #define ORC_CXX_HAS_CSTDINT diff --git a/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh b/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh index 9ce958302d..f7d37005a5 100644 --- a/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh +++ b/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh @@ -39,7 +39,7 @@ namespace orc { Timestamp(const Timestamp&) = default; Timestamp(Timestamp&&) = default; ~Timestamp() = default; - Timestamp(int64_t second_, int32_t nanos_) : second(second_), nanos(nanos_) { + Timestamp(int64_t second, int32_t nanos) : second(second), nanos(nanos) { // PASS } Timestamp& operator=(const Timestamp&) = default; @@ -130,15 +130,15 @@ namespace orc { * Check if a literal is null */ bool isNull() const { - return mIsNull; + return isNull_; } PredicateDataType getType() const { - return mType; + return type_; } std::string toString() const; size_t getHashCode() const { - return mHashCode; + return hashCode_; } private: @@ -158,13 +158,13 @@ namespace orc { }; private: - LiteralVal mValue; // data value for this literal if not null - PredicateDataType mType; // data type of the literal - size_t mSize; // size of mValue if it is Buffer - int32_t mPrecision; // precision of decimal type - int32_t mScale; // scale of decimal type - bool mIsNull; // whether this literal is null - size_t mHashCode; // precomputed hash code for the literal + LiteralVal value_; // data value for this literal if not null + PredicateDataType type_; // data type of the literal + size_t size_; // size of mValue if it is Buffer + int32_t precision_; // precision of decimal type + int32_t scale_; // scale of decimal type + bool isNull_; // whether this literal is null + size_t hashCode_; // precomputed hash code for the literal }; } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh b/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh index 286188e3a1..b10cc775ec 100644 --- a/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh +++ b/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh @@ -70,6 +70,7 @@ typedef SSIZE_T ssize_t; #define PRAGMA(TXT) _Pragma(#TXT) #if defined(_MSC_VER) + // Handles both cl.exe and clang-cl.exe compilers #define DIAGNOSTIC_IGNORE(XXX) __pragma(warning(disable : XXX)) #elif defined(__clang__) #define DIAGNOSTIC_IGNORE(XXX) PRAGMA(clang diagnostic ignored XXX) diff --git a/contrib/libs/apache/orc/c++/src/BlockBuffer.cc b/contrib/libs/apache/orc/c++/src/BlockBuffer.cc index 1f7843fad7..09bf078c85 100644 --- a/contrib/libs/apache/orc/c++/src/BlockBuffer.cc +++ b/contrib/libs/apache/orc/c++/src/BlockBuffer.cc @@ -24,56 +24,56 @@ namespace orc { - BlockBuffer::BlockBuffer(MemoryPool& pool, uint64_t _blockSize) - : memoryPool(pool), currentSize(0), currentCapacity(0), blockSize(_blockSize) { - if (blockSize == 0) { + BlockBuffer::BlockBuffer(MemoryPool& pool, uint64_t blockSize) + : memoryPool_(pool), currentSize_(0), currentCapacity_(0), blockSize_(blockSize) { + if (blockSize_ == 0) { throw std::logic_error("Block size cannot be zero"); } - reserve(blockSize); + reserve(blockSize_); } BlockBuffer::~BlockBuffer() { - for (size_t i = 0; i < blocks.size(); ++i) { - memoryPool.free(blocks[i]); + for (size_t i = 0; i < blocks_.size(); ++i) { + memoryPool_.free(blocks_[i]); } - blocks.clear(); - currentSize = currentCapacity = 0; + blocks_.clear(); + currentSize_ = currentCapacity_ = 0; } BlockBuffer::Block BlockBuffer::getBlock(uint64_t blockIndex) const { if (blockIndex >= getBlockNumber()) { throw std::out_of_range("Block index out of range"); } - return Block(blocks[blockIndex], std::min(currentSize - blockIndex * blockSize, blockSize)); + return Block(blocks_[blockIndex], std::min(currentSize_ - blockIndex * blockSize_, blockSize_)); } BlockBuffer::Block BlockBuffer::getNextBlock() { - if (currentSize < currentCapacity) { - Block emptyBlock(blocks[currentSize / blockSize] + currentSize % blockSize, - blockSize - currentSize % blockSize); - currentSize = (currentSize / blockSize + 1) * blockSize; + if (currentSize_ < currentCapacity_) { + Block emptyBlock(blocks_[currentSize_ / blockSize_] + currentSize_ % blockSize_, + blockSize_ - currentSize_ % blockSize_); + currentSize_ = (currentSize_ / blockSize_ + 1) * blockSize_; return emptyBlock; } else { - resize(currentSize + blockSize); - return Block(blocks.back(), blockSize); + resize(currentSize_ + blockSize_); + return Block(blocks_.back(), blockSize_); } } void BlockBuffer::resize(uint64_t size) { reserve(size); - if (currentCapacity >= size) { - currentSize = size; + if (currentCapacity_ >= size) { + currentSize_ = size; } else { throw std::logic_error("Block buffer resize error"); } } void BlockBuffer::reserve(uint64_t newCapacity) { - while (currentCapacity < newCapacity) { - char* newBlockPtr = memoryPool.malloc(blockSize); + while (currentCapacity_ < newCapacity) { + char* newBlockPtr = memoryPool_.malloc(blockSize_); if (newBlockPtr != nullptr) { - blocks.push_back(newBlockPtr); - currentCapacity += blockSize; + blocks_.push_back(newBlockPtr); + currentCapacity_ += blockSize_; } else { break; } @@ -81,7 +81,7 @@ namespace orc { } void BlockBuffer::writeTo(OutputStream* output, WriterMetrics* metrics) { - if (currentSize == 0) { + if (currentSize_ == 0) { return; } static uint64_t MAX_CHUNK_SIZE = 1024 * 1024 * 1024; @@ -92,12 +92,12 @@ namespace orc { uint64_t ioCount = 0; uint64_t blockNumber = getBlockNumber(); // if only exists one block, currentSize is equal to first block size - if (blockNumber == 1 && currentSize <= chunkSize) { + if (blockNumber == 1 && currentSize_ <= chunkSize) { Block block = getBlock(0); output->write(block.data, block.size); ++ioCount; } else { - char* chunk = memoryPool.malloc(chunkSize); + char* chunk = memoryPool_.malloc(chunkSize); uint64_t chunkOffset = 0; for (uint64_t i = 0; i < blockNumber; ++i) { Block block = getBlock(i); @@ -121,7 +121,7 @@ namespace orc { output->write(chunk, chunkOffset); ++ioCount; } - memoryPool.free(chunk); + memoryPool_.free(chunk); } if (metrics != nullptr) { diff --git a/contrib/libs/apache/orc/c++/src/BlockBuffer.hh b/contrib/libs/apache/orc/c++/src/BlockBuffer.hh index 0f5f78e3fe..6d265b0e32 100644 --- a/contrib/libs/apache/orc/c++/src/BlockBuffer.hh +++ b/contrib/libs/apache/orc/c++/src/BlockBuffer.hh @@ -34,15 +34,15 @@ namespace orc { */ class BlockBuffer { private: - MemoryPool& memoryPool; + MemoryPool& memoryPool_; // current buffer size - uint64_t currentSize; + uint64_t currentSize_; // maximal capacity (actual allocated memory) - uint64_t currentCapacity; + uint64_t currentCapacity_; // unit for buffer expansion - const uint64_t blockSize; + const uint64_t blockSize_; // pointers to the start of each block - std::vector<char*> blocks; + std::vector<char*> blocks_; // non-copy-constructible BlockBuffer(BlockBuffer& buffer) = delete; @@ -66,7 +66,7 @@ namespace orc { uint64_t size; Block() : data(nullptr), size(0) {} - Block(char* _data, uint64_t _size) : data(_data), size(_size) {} + Block(char* data, uint64_t size) : data(data), size(size) {} Block(const Block& block) = default; ~Block() = default; }; @@ -94,24 +94,26 @@ namespace orc { * Get the number of blocks that are fully or partially occupied */ uint64_t getBlockNumber() const { - return (currentSize + blockSize - 1) / blockSize; + return (currentSize_ + blockSize_ - 1) / blockSize_; } uint64_t size() const { - return currentSize; + return currentSize_; } uint64_t capacity() const { - return currentCapacity; + return currentCapacity_; } void resize(uint64_t size); + /** * Requests the BlockBuffer to contain at least newCapacity bytes. * Reallocation happens if there is need of more space. * @param newCapacity new capacity of BlockBuffer */ void reserve(uint64_t newCapacity); + /** * Write the BlockBuffer content into OutputStream * @param output the output stream to write to diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.cc b/contrib/libs/apache/orc/c++/src/BloomFilter.cc index 882c6f4252..887637223a 100644 --- a/contrib/libs/apache/orc/c++/src/BloomFilter.cc +++ b/contrib/libs/apache/orc/c++/src/BloomFilter.cc @@ -37,50 +37,50 @@ namespace orc { * Implementation of BitSet */ BitSet::BitSet(uint64_t numBits) { - mData.resize(static_cast<size_t>(ceil(static_cast<double>(numBits) / BITS_OF_LONG)), 0); + data_.resize(static_cast<size_t>(ceil(static_cast<double>(numBits) / BITS_OF_LONG)), 0); } BitSet::BitSet(const uint64_t* bits, uint64_t numBits) { // caller should make sure numBits is multiple of 64 - mData.resize(numBits >> SHIFT_6_BITS, 0); - memcpy(mData.data(), bits, numBits >> SHIFT_3_BITS); + data_.resize(numBits >> SHIFT_6_BITS, 0); + memcpy(data_.data(), bits, numBits >> SHIFT_3_BITS); } void BitSet::set(uint64_t index) { - mData[index >> SHIFT_6_BITS] |= (1ULL << (index % BITS_OF_LONG)); + data_[index >> SHIFT_6_BITS] |= (1ULL << (index % BITS_OF_LONG)); } bool BitSet::get(uint64_t index) { - return (mData[index >> SHIFT_6_BITS] & (1ULL << (index % BITS_OF_LONG))) != 0; + return (data_[index >> SHIFT_6_BITS] & (1ULL << (index % BITS_OF_LONG))) != 0; } uint64_t BitSet::bitSize() { - return mData.size() << SHIFT_6_BITS; + return data_.size() << SHIFT_6_BITS; } void BitSet::merge(const BitSet& other) { - if (mData.size() != other.mData.size()) { + if (data_.size() != other.data_.size()) { std::stringstream ss; - ss << "BitSet must be of equal length (" << mData.size() << " != " << other.mData.size() + ss << "BitSet must be of equal length (" << data_.size() << " != " << other.data_.size() << ")"; throw std::logic_error(ss.str()); } - for (size_t i = 0; i != mData.size(); i++) { - mData[i] |= other.mData[i]; + for (size_t i = 0; i != data_.size(); i++) { + data_[i] |= other.data_[i]; } } void BitSet::clear() { - memset(mData.data(), 0, sizeof(uint64_t) * mData.size()); + memset(data_.data(), 0, sizeof(uint64_t) * data_.size()); } const uint64_t* BitSet::getData() const { - return mData.data(); + return data_.data(); } bool BitSet::operator==(const BitSet& other) const { - return mData == other.mData; + return data_ == other.data_; } /** @@ -127,9 +127,9 @@ namespace orc { uint64_t nb = static_cast<uint64_t>(optimalNumOfBits(expectedEntries, fpp)); // make 'mNumBits' multiple of 64 - mNumBits = nb + (BITS_OF_LONG - (nb % BITS_OF_LONG)); - mNumHashFunctions = optimalNumOfHashFunctions(expectedEntries, mNumBits); - mBitSet.reset(new BitSet(mNumBits)); + numBits_ = nb + (BITS_OF_LONG - (nb % BITS_OF_LONG)); + numHashFunctions_ = optimalNumOfHashFunctions(expectedEntries, numBits_); + bitSet_.reset(new BitSet(numBits_)); } void BloomFilterImpl::addBytes(const char* data, int64_t length) { @@ -155,11 +155,11 @@ namespace orc { } uint64_t BloomFilterImpl::getBitSize() const { - return mBitSet->bitSize(); + return bitSet_->bitSize(); } int32_t BloomFilterImpl::getNumHashFunctions() const { - return mNumHashFunctions; + return numHashFunctions_; } DIAGNOSTIC_PUSH @@ -175,17 +175,17 @@ namespace orc { // caller should make sure input proto::BloomFilter is valid since // no check will be performed in the following constructor BloomFilterImpl::BloomFilterImpl(const proto::BloomFilter& bloomFilter) { - mNumHashFunctions = static_cast<int32_t>(bloomFilter.num_hash_functions()); + numHashFunctions_ = static_cast<int32_t>(bloomFilter.num_hash_functions()); const std::string& bitsetStr = bloomFilter.utf8bitset(); - mNumBits = bitsetStr.size() << SHIFT_3_BITS; - checkArgument(mNumBits % BITS_OF_LONG == 0, "numBits should be multiple of 64!"); + numBits_ = bitsetStr.size() << SHIFT_3_BITS; + checkArgument(numBits_ % BITS_OF_LONG == 0, "numBits should be multiple of 64!"); const uint64_t* bitset = reinterpret_cast<const uint64_t*>(bitsetStr.data()); if (isLittleEndian()) { - mBitSet.reset(new BitSet(bitset, mNumBits)); + bitSet_.reset(new BitSet(bitset, numBits_)); } else { - std::vector<uint64_t> longs(mNumBits >> SHIFT_6_BITS); + std::vector<uint64_t> longs(numBits_ >> SHIFT_6_BITS); for (size_t i = 0; i != longs.size(); ++i) { // convert little-endian to big-endian const uint64_t src = bitset[i]; @@ -195,7 +195,7 @@ namespace orc { } } - mBitSet.reset(new BitSet(longs.data(), mNumBits)); + bitSet_.reset(new BitSet(longs.data(), numBits_)); } } @@ -215,14 +215,14 @@ namespace orc { // So we cast hash64 to uint64_t here for an unsigned right shift. int32_t hash2 = static_cast<int32_t>(static_cast<uint64_t>(hash64) >> 32); - for (int32_t i = 1; i <= mNumHashFunctions; ++i) { + for (int32_t i = 1; i <= numHashFunctions_; ++i) { int32_t combinedHash = hash1 + i * hash2; // hashcode should be positive, flip all the bits if it's negative if (combinedHash < 0) { combinedHash = ~combinedHash; } - uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits; - mBitSet->set(pos); + uint64_t pos = static_cast<uint64_t>(combinedHash) % numBits_; + bitSet_->set(pos); } } @@ -232,14 +232,14 @@ namespace orc { // So we cast hash64 to uint64_t here for an unsigned right shift. int32_t hash2 = static_cast<int32_t>(static_cast<uint64_t>(hash64) >> 32); - for (int32_t i = 1; i <= mNumHashFunctions; ++i) { + for (int32_t i = 1; i <= numHashFunctions_; ++i) { int32_t combinedHash = hash1 + i * hash2; // hashcode should be positive, flip all the bits if it's negative if (combinedHash < 0) { combinedHash = ~combinedHash; } - uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits; - if (!mBitSet->get(pos)) { + uint64_t pos = static_cast<uint64_t>(combinedHash) % numBits_; + if (!bitSet_->get(pos)) { return false; } } @@ -247,33 +247,33 @@ namespace orc { } void BloomFilterImpl::merge(const BloomFilterImpl& other) { - if (mNumBits != other.mNumBits || mNumHashFunctions != other.mNumHashFunctions) { + if (numBits_ != other.numBits_ || numHashFunctions_ != other.numHashFunctions_) { std::stringstream ss; ss << "BloomFilters are not compatible for merging: " - << "this: numBits:" << mNumBits << ",numHashFunctions:" << mNumHashFunctions - << ", that: numBits:" << other.mNumBits << ",numHashFunctions:" << other.mNumHashFunctions; + << "this: numBits:" << numBits_ << ",numHashFunctions:" << numHashFunctions_ + << ", that: numBits:" << other.numBits_ << ",numHashFunctions:" << other.numHashFunctions_; throw std::logic_error(ss.str()); } - mBitSet->merge(*other.mBitSet); + bitSet_->merge(*other.bitSet_); } void BloomFilterImpl::reset() { - mBitSet->clear(); + bitSet_->clear(); } void BloomFilterImpl::serialize(proto::BloomFilter& bloomFilter) const { - bloomFilter.set_num_hash_functions(static_cast<uint32_t>(mNumHashFunctions)); + bloomFilter.set_num_hash_functions(static_cast<uint32_t>(numHashFunctions_)); // According to ORC standard, the encoding is a sequence of bytes with // a little endian encoding in the utf8bitset field. if (isLittleEndian()) { // bytes are already organized in little endian; thus no conversion needed - const char* bitset = reinterpret_cast<const char*>(mBitSet->getData()); + const char* bitset = reinterpret_cast<const char*>(bitSet_->getData()); bloomFilter.set_utf8bitset(bitset, sizeInBytes()); } else { std::vector<uint64_t> bitset(sizeInBytes() / sizeof(uint64_t), 0); - const uint64_t* longs = mBitSet->getData(); + const uint64_t* longs = bitSet_->getData(); for (size_t i = 0; i != bitset.size(); ++i) { uint64_t& dst = bitset[i]; const uint64_t src = longs[i]; @@ -287,8 +287,8 @@ namespace orc { } bool BloomFilterImpl::operator==(const BloomFilterImpl& other) const { - return mNumBits == other.mNumBits && mNumHashFunctions == other.mNumHashFunctions && - *mBitSet == *other.mBitSet; + return numBits_ == other.numBits_ && numHashFunctions_ == other.numHashFunctions_ && + *bitSet_ == *other.bitSet_; } BloomFilter::~BloomFilter() { diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.hh b/contrib/libs/apache/orc/c++/src/BloomFilter.hh index d72961a83c..ebc4a5ee04 100644 --- a/contrib/libs/apache/orc/c++/src/BloomFilter.hh +++ b/contrib/libs/apache/orc/c++/src/BloomFilter.hh @@ -90,7 +90,7 @@ namespace orc { bool operator==(const BitSet& other) const; private: - std::vector<uint64_t> mData; + std::vector<uint64_t> data_; }; /** @@ -174,9 +174,9 @@ namespace orc { private: static constexpr double DEFAULT_FPP = 0.05; - uint64_t mNumBits; - int32_t mNumHashFunctions; - std::unique_ptr<BitSet> mBitSet; + uint64_t numBits_; + int32_t numHashFunctions_; + std::unique_ptr<BitSet> bitSet_; }; struct BloomFilterUTF8Utils { diff --git a/contrib/libs/apache/orc/c++/src/BpackingDefault.cc b/contrib/libs/apache/orc/c++/src/BpackingDefault.cc index 5a80bc6fb1..401a217d35 100644 --- a/contrib/libs/apache/orc/c++/src/BpackingDefault.cc +++ b/contrib/libs/apache/orc/c++/src/BpackingDefault.cc @@ -22,7 +22,7 @@ namespace orc { - UnpackDefault::UnpackDefault(RleDecoderV2* dec) : decoder(dec) { + UnpackDefault::UnpackDefault(RleDecoderV2* dec) : decoder_(dec) { // PASS } @@ -34,17 +34,17 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8. - while (decoder->getBitsLeft() > 0 && curIdx < offset + len) { - decoder->setBitsLeft(decoder->getBitsLeft() - 4); - data[curIdx++] = (decoder->getCurByte() >> decoder->getBitsLeft()) & 15; + while (decoder_->getBitsLeft() > 0 && curIdx < offset + len) { + decoder_->setBitsLeft(decoder_->getBitsLeft() - 4); + data[curIdx++] = (decoder_->getCurByte() >> decoder_->getBitsLeft()) & 15; } if (curIdx == offset + len) return; // Exhaust the buffer uint64_t numGroups = (offset + len - curIdx) / 2; - numGroups = std::min(numGroups, static_cast<uint64_t>(decoder->bufLength())); + numGroups = std::min(numGroups, static_cast<uint64_t>(decoder_->bufLength())); // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart()); uint32_t localByte; for (uint64_t i = 0; i < numGroups; ++i) { localByte = *buffer++; @@ -52,12 +52,12 @@ namespace orc { data[curIdx + 1] = localByte & 15; curIdx += 2; } - decoder->setBufStart(reinterpret_cast<char*>(buffer)); + decoder_->setBufStart(reinterpret_cast<char*>(buffer)); if (curIdx == offset + len) return; // readByte() will update 'bufferStart' and 'bufferEnd' - decoder->setCurByte(decoder->readByte()); - decoder->setBitsLeft(8); + decoder_->setCurByte(decoder_->readByte()); + decoder_->setBitsLeft(8); } } @@ -65,18 +65,18 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = decoder->bufLength(); + int64_t bufferNum = decoder_->bufLength(); bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart()); for (int i = 0; i < bufferNum; ++i) { data[curIdx++] = *buffer++; } - decoder->setBufStart(reinterpret_cast<char*>(buffer)); + decoder_->setBufStart(reinterpret_cast<char*>(buffer)); if (curIdx == offset + len) return; // readByte() will update 'bufferStart' and 'bufferEnd'. - data[curIdx++] = decoder->readByte(); + data[curIdx++] = decoder_->readByte(); } } @@ -84,23 +84,23 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = decoder->bufLength() / 2; + int64_t bufferNum = decoder_->bufLength() / 2; bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); uint16_t b0, b1; // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart()); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast<uint16_t>(*buffer); b1 = static_cast<uint16_t>(*(buffer + 1)); buffer += 2; data[curIdx++] = (b0 << 8) | b1; } - decoder->setBufStart(reinterpret_cast<char*>(buffer)); + decoder_->setBufStart(reinterpret_cast<char*>(buffer)); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = decoder->readByte(); - b1 = decoder->readByte(); + b0 = decoder_->readByte(); + b1 = decoder_->readByte(); data[curIdx++] = (b0 << 8) | b1; } } @@ -109,11 +109,11 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = decoder->bufLength() / 3; + int64_t bufferNum = decoder_->bufLength() / 3; bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); uint32_t b0, b1, b2; // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart()); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast<uint32_t>(*buffer); b1 = static_cast<uint32_t>(*(buffer + 1)); @@ -122,13 +122,13 @@ namespace orc { data[curIdx++] = static_cast<int64_t>((b0 << 16) | (b1 << 8) | b2); } //////decoder->bufferStart += bufferNum * 3; - decoder->setBufStart(reinterpret_cast<char*>(buffer)); + decoder_->setBufStart(reinterpret_cast<char*>(buffer)); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = decoder->readByte(); - b1 = decoder->readByte(); - b2 = decoder->readByte(); + b0 = decoder_->readByte(); + b1 = decoder_->readByte(); + b2 = decoder_->readByte(); data[curIdx++] = static_cast<int64_t>((b0 << 16) | (b1 << 8) | b2); } } @@ -137,11 +137,11 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = decoder->bufLength() / 4; + int64_t bufferNum = decoder_->bufLength() / 4; bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); uint32_t b0, b1, b2, b3; // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart()); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast<uint32_t>(*buffer); b1 = static_cast<uint32_t>(*(buffer + 1)); @@ -150,14 +150,14 @@ namespace orc { buffer += 4; data[curIdx++] = static_cast<int64_t>((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); } - decoder->setBufStart(reinterpret_cast<char*>(buffer)); + decoder_->setBufStart(reinterpret_cast<char*>(buffer)); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = decoder->readByte(); - b1 = decoder->readByte(); - b2 = decoder->readByte(); - b3 = decoder->readByte(); + b0 = decoder_->readByte(); + b1 = decoder_->readByte(); + b2 = decoder_->readByte(); + b3 = decoder_->readByte(); data[curIdx++] = static_cast<int64_t>((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); } } @@ -166,11 +166,11 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = decoder->bufLength() / 5; + int64_t bufferNum = decoder_->bufLength() / 5; bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4; // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart()); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast<uint32_t>(*buffer); b1 = static_cast<uint32_t>(*(buffer + 1)); @@ -181,15 +181,15 @@ namespace orc { data[curIdx++] = static_cast<int64_t>((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); } - decoder->setBufStart(reinterpret_cast<char*>(buffer)); + decoder_->setBufStart(reinterpret_cast<char*>(buffer)); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = decoder->readByte(); - b1 = decoder->readByte(); - b2 = decoder->readByte(); - b3 = decoder->readByte(); - b4 = decoder->readByte(); + b0 = decoder_->readByte(); + b1 = decoder_->readByte(); + b2 = decoder_->readByte(); + b3 = decoder_->readByte(); + b4 = decoder_->readByte(); data[curIdx++] = static_cast<int64_t>((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); } } @@ -198,11 +198,11 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = decoder->bufLength() / 6; + int64_t bufferNum = decoder_->bufLength() / 6; bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4, b5; // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart()); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast<uint32_t>(*buffer); b1 = static_cast<uint32_t>(*(buffer + 1)); @@ -214,16 +214,16 @@ namespace orc { data[curIdx++] = static_cast<int64_t>((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); } - decoder->setBufStart(reinterpret_cast<char*>(buffer)); + decoder_->setBufStart(reinterpret_cast<char*>(buffer)); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = decoder->readByte(); - b1 = decoder->readByte(); - b2 = decoder->readByte(); - b3 = decoder->readByte(); - b4 = decoder->readByte(); - b5 = decoder->readByte(); + b0 = decoder_->readByte(); + b1 = decoder_->readByte(); + b2 = decoder_->readByte(); + b3 = decoder_->readByte(); + b4 = decoder_->readByte(); + b5 = decoder_->readByte(); data[curIdx++] = static_cast<int64_t>((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); } @@ -233,11 +233,11 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = decoder->bufLength() / 7; + int64_t bufferNum = decoder_->bufLength() / 7; bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4, b5, b6; // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart()); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast<uint32_t>(*buffer); b1 = static_cast<uint32_t>(*(buffer + 1)); @@ -250,17 +250,17 @@ namespace orc { data[curIdx++] = static_cast<int64_t>((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | (b4 << 16) | (b5 << 8) | b6); } - decoder->setBufStart(reinterpret_cast<char*>(buffer)); + decoder_->setBufStart(reinterpret_cast<char*>(buffer)); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = decoder->readByte(); - b1 = decoder->readByte(); - b2 = decoder->readByte(); - b3 = decoder->readByte(); - b4 = decoder->readByte(); - b5 = decoder->readByte(); - b6 = decoder->readByte(); + b0 = decoder_->readByte(); + b1 = decoder_->readByte(); + b2 = decoder_->readByte(); + b3 = decoder_->readByte(); + b4 = decoder_->readByte(); + b5 = decoder_->readByte(); + b6 = decoder_->readByte(); data[curIdx++] = static_cast<int64_t>((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | (b4 << 16) | (b5 << 8) | b6); } @@ -270,11 +270,11 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = decoder->bufLength() / 8; + int64_t bufferNum = decoder_->bufLength() / 8; bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4, b5, b6, b7; // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart()); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast<uint32_t>(*buffer); b1 = static_cast<uint32_t>(*(buffer + 1)); @@ -288,18 +288,18 @@ namespace orc { data[curIdx++] = static_cast<int64_t>((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); } - decoder->setBufStart(reinterpret_cast<char*>(buffer)); + decoder_->setBufStart(reinterpret_cast<char*>(buffer)); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = decoder->readByte(); - b1 = decoder->readByte(); - b2 = decoder->readByte(); - b3 = decoder->readByte(); - b4 = decoder->readByte(); - b5 = decoder->readByte(); - b6 = decoder->readByte(); - b7 = decoder->readByte(); + b0 = decoder_->readByte(); + b1 = decoder_->readByte(); + b2 = decoder_->readByte(); + b3 = decoder_->readByte(); + b4 = decoder_->readByte(); + b5 = decoder_->readByte(); + b6 = decoder_->readByte(); + b7 = decoder_->readByte(); data[curIdx++] = static_cast<int64_t>((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); } @@ -309,19 +309,19 @@ namespace orc { for (uint64_t i = offset; i < (offset + len); i++) { uint64_t result = 0; uint64_t bitsLeftToRead = fbs; - while (bitsLeftToRead > decoder->getBitsLeft()) { - result <<= decoder->getBitsLeft(); - result |= decoder->getCurByte() & ((1 << decoder->getBitsLeft()) - 1); - bitsLeftToRead -= decoder->getBitsLeft(); - decoder->setCurByte(decoder->readByte()); - decoder->setBitsLeft(8); + while (bitsLeftToRead > decoder_->getBitsLeft()) { + result <<= decoder_->getBitsLeft(); + result |= decoder_->getCurByte() & ((1 << decoder_->getBitsLeft()) - 1); + bitsLeftToRead -= decoder_->getBitsLeft(); + decoder_->setCurByte(decoder_->readByte()); + decoder_->setBitsLeft(8); } // handle the left over bits if (bitsLeftToRead > 0) { result <<= bitsLeftToRead; - decoder->setBitsLeft(decoder->getBitsLeft() - static_cast<uint32_t>(bitsLeftToRead)); - result |= (decoder->getCurByte() >> decoder->getBitsLeft()) & ((1 << bitsLeftToRead) - 1); + decoder_->setBitsLeft(decoder_->getBitsLeft() - static_cast<uint32_t>(bitsLeftToRead)); + result |= (decoder_->getCurByte() >> decoder_->getBitsLeft()) & ((1 << bitsLeftToRead) - 1); } data[i] = static_cast<int64_t>(result); } diff --git a/contrib/libs/apache/orc/c++/src/BpackingDefault.hh b/contrib/libs/apache/orc/c++/src/BpackingDefault.hh index 0a58234495..bbd7851260 100644 --- a/contrib/libs/apache/orc/c++/src/BpackingDefault.hh +++ b/contrib/libs/apache/orc/c++/src/BpackingDefault.hh @@ -45,7 +45,7 @@ namespace orc { void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); private: - RleDecoderV2* decoder; + RleDecoderV2* decoder_; }; class BitUnpackDefault : public BitUnpack { diff --git a/contrib/libs/apache/orc/c++/src/ByteRLE.cc b/contrib/libs/apache/orc/c++/src/ByteRLE.cc index b81d282e35..ded9f55a00 100644 --- a/contrib/libs/apache/orc/c++/src/ByteRLE.cc +++ b/contrib/libs/apache/orc/c++/src/ByteRLE.cc @@ -63,6 +63,8 @@ namespace orc { virtual void suppress() override; + virtual void finishEncode() override; + /** * Reset to initial state */ @@ -186,16 +188,17 @@ namespace orc { void ByteRleEncoderImpl::recordPosition(PositionRecorder* recorder) const { uint64_t flushedSize = outputStream->getSize(); - uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition); + uint64_t unusedBufferSize = static_cast<uint64_t>(bufferLength - bufferPosition); if (outputStream->isCompressed()) { // start of the compression chunk in the stream recorder->add(flushedSize); - // number of decompressed bytes that need to be consumed - recorder->add(unflushedSize); + // There are multiple blocks in the input buffer, but bufferPosition only records the + // effective length of the last block. We need rawInputBufferSize to record the total length + // of all variable blocks. + recorder->add(outputStream->getRawInputBufferSize() - unusedBufferSize); } else { - flushedSize -= static_cast<uint64_t>(bufferLength); // byte offset of the RLE run’s start location - recorder->add(flushedSize + unflushedSize); + recorder->add(flushedSize - unusedBufferSize); } recorder->add(static_cast<uint64_t>(numLiterals)); } @@ -215,6 +218,13 @@ namespace orc { reset(); } + void ByteRleEncoderImpl::finishEncode() { + writeValues(); + outputStream->BackUp(bufferLength - bufferPosition); + outputStream->finishStream(); + bufferLength = bufferPosition = 0; + } + std::unique_ptr<ByteRleEncoder> createByteRleEncoder( std::unique_ptr<BufferedOutputStream> output) { return std::make_unique<ByteRleEncoderImpl>(std::move(output)); @@ -244,14 +254,14 @@ namespace orc { virtual void suppress() override; private: - int bitsRemained; - char current; + int bitsRemained_; + char current_; }; BooleanRleEncoderImpl::BooleanRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output) : ByteRleEncoderImpl(std::move(output)) { - bitsRemained = 8; - current = static_cast<char>(0); + bitsRemained_ = 8; + current_ = static_cast<char>(0); } BooleanRleEncoderImpl::~BooleanRleEncoderImpl() { @@ -260,43 +270,43 @@ namespace orc { void BooleanRleEncoderImpl::add(const char* data, uint64_t numValues, const char* notNull) { for (uint64_t i = 0; i < numValues; ++i) { - if (bitsRemained == 0) { - write(current); - current = static_cast<char>(0); - bitsRemained = 8; + if (bitsRemained_ == 0) { + write(current_); + current_ = static_cast<char>(0); + bitsRemained_ = 8; } if (!notNull || notNull[i]) { if (!data || data[i]) { - current = static_cast<char>(current | (0x80 >> (8 - bitsRemained))); + current_ = static_cast<char>(current_ | (0x80 >> (8 - bitsRemained_))); } - --bitsRemained; + --bitsRemained_; } } - if (bitsRemained == 0) { - write(current); - current = static_cast<char>(0); - bitsRemained = 8; + if (bitsRemained_ == 0) { + write(current_); + current_ = static_cast<char>(0); + bitsRemained_ = 8; } } uint64_t BooleanRleEncoderImpl::flush() { - if (bitsRemained != 8) { - write(current); + if (bitsRemained_ != 8) { + write(current_); } - bitsRemained = 8; - current = static_cast<char>(0); + bitsRemained_ = 8; + current_ = static_cast<char>(0); return ByteRleEncoderImpl::flush(); } void BooleanRleEncoderImpl::recordPosition(PositionRecorder* recorder) const { ByteRleEncoderImpl::recordPosition(recorder); - recorder->add(static_cast<uint64_t>(8 - bitsRemained)); + recorder->add(static_cast<uint64_t>(8 - bitsRemained_)); } void BooleanRleEncoderImpl::suppress() { ByteRleEncoderImpl::suppress(); - bitsRemained = 8; - current = static_cast<char>(0); + bitsRemained_ = 8; + current_ = static_cast<char>(0); } std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder( @@ -386,8 +396,8 @@ namespace orc { } ByteRleDecoderImpl::ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input, - ReaderMetrics* _metrics) - : metrics(_metrics) { + ReaderMetrics* metrics) + : metrics(metrics) { inputStream = std::move(input); reset(); } @@ -526,8 +536,8 @@ namespace orc { }; BooleanRleDecoderImpl::BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input, - ReaderMetrics* _metrics) - : ByteRleDecoderImpl(std::move(input), _metrics) { + ReaderMetrics* metrics) + : ByteRleDecoderImpl(std::move(input), metrics) { remainingBits = 0; lastByte = 0; } diff --git a/contrib/libs/apache/orc/c++/src/ByteRLE.hh b/contrib/libs/apache/orc/c++/src/ByteRLE.hh index bd19f52ecc..bee064f666 100644 --- a/contrib/libs/apache/orc/c++/src/ByteRLE.hh +++ b/contrib/libs/apache/orc/c++/src/ByteRLE.hh @@ -59,6 +59,13 @@ namespace orc { * suppress the data and reset to initial state */ virtual void suppress() = 0; + + /** + * Finalize the encoding process. This function should be called after all data required for + * encoding has been added. It ensures that any remaining data is processed and the final state + * of the encoder is set. + */ + virtual void finishEncode() = 0; }; class ByteRleDecoder { diff --git a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc index 5297f80371..8b16ecbd09 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc +++ b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc @@ -17,6 +17,7 @@ */ #include "orc/ColumnPrinter.hh" +#include "orc/Int128.hh" #include "orc/orc-config.hh" #include "Adaptor.hh" @@ -35,7 +36,7 @@ namespace orc { class VoidColumnPrinter : public ColumnPrinter { public: - VoidColumnPrinter(std::string&); + VoidColumnPrinter(std::string&, ColumnPrinter::Param); ~VoidColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; @@ -43,10 +44,10 @@ namespace orc { class BooleanColumnPrinter : public ColumnPrinter { private: - const int64_t* data; + const int64_t* data_; public: - BooleanColumnPrinter(std::string&); + BooleanColumnPrinter(std::string&, ColumnPrinter::Param); ~BooleanColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; @@ -54,10 +55,10 @@ namespace orc { class LongColumnPrinter : public ColumnPrinter { private: - const int64_t* data; + const int64_t* data_; public: - LongColumnPrinter(std::string&); + LongColumnPrinter(std::string&, ColumnPrinter::Param); ~LongColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; @@ -65,11 +66,11 @@ namespace orc { class DoubleColumnPrinter : public ColumnPrinter { private: - const double* data; - const bool isFloat; + const double* data_; + const bool isFloat_; public: - DoubleColumnPrinter(std::string&, const Type& type); + DoubleColumnPrinter(std::string&, const Type& type, ColumnPrinter::Param); virtual ~DoubleColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; @@ -77,11 +78,11 @@ namespace orc { class TimestampColumnPrinter : public ColumnPrinter { private: - const int64_t* seconds; - const int64_t* nanoseconds; + const int64_t* seconds_; + const int64_t* nanoseconds_; public: - TimestampColumnPrinter(std::string&); + TimestampColumnPrinter(std::string&, ColumnPrinter::Param); ~TimestampColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; @@ -89,10 +90,10 @@ namespace orc { class DateColumnPrinter : public ColumnPrinter { private: - const int64_t* data; + const int64_t* data_; public: - DateColumnPrinter(std::string&); + DateColumnPrinter(std::string&, ColumnPrinter::Param); ~DateColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; @@ -100,11 +101,12 @@ namespace orc { class Decimal64ColumnPrinter : public ColumnPrinter { private: - const int64_t* data; - int32_t scale; + const int64_t* data_; + int32_t scale_; + ColumnPrinter::Param param_; public: - Decimal64ColumnPrinter(std::string&); + Decimal64ColumnPrinter(std::string&, ColumnPrinter::Param); ~Decimal64ColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; @@ -112,11 +114,12 @@ namespace orc { class Decimal128ColumnPrinter : public ColumnPrinter { private: - const Int128* data; - int32_t scale; + const Int128* data_; + int32_t scale_; + ColumnPrinter::Param param_; public: - Decimal128ColumnPrinter(std::string&); + Decimal128ColumnPrinter(std::string&, ColumnPrinter::Param); ~Decimal128ColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; @@ -124,11 +127,11 @@ namespace orc { class StringColumnPrinter : public ColumnPrinter { private: - const char* const* start; - const int64_t* length; + const char* const* start_; + const int64_t* length_; public: - StringColumnPrinter(std::string&); + StringColumnPrinter(std::string&, ColumnPrinter::Param); virtual ~StringColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; @@ -136,11 +139,11 @@ namespace orc { class BinaryColumnPrinter : public ColumnPrinter { private: - const char* const* start; - const int64_t* length; + const char* const* start_; + const int64_t* length_; public: - BinaryColumnPrinter(std::string&); + BinaryColumnPrinter(std::string&, ColumnPrinter::Param); virtual ~BinaryColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; @@ -148,11 +151,11 @@ namespace orc { class ListColumnPrinter : public ColumnPrinter { private: - const int64_t* offsets; - std::unique_ptr<ColumnPrinter> elementPrinter; + const int64_t* offsets_; + std::unique_ptr<ColumnPrinter> elementPrinter_; public: - ListColumnPrinter(std::string&, const Type& type); + ListColumnPrinter(std::string&, const Type& type, ColumnPrinter::Param); virtual ~ListColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; @@ -160,12 +163,12 @@ namespace orc { class MapColumnPrinter : public ColumnPrinter { private: - const int64_t* offsets; - std::unique_ptr<ColumnPrinter> keyPrinter; - std::unique_ptr<ColumnPrinter> elementPrinter; + const int64_t* offsets_; + std::unique_ptr<ColumnPrinter> keyPrinter_; + std::unique_ptr<ColumnPrinter> elementPrinter_; public: - MapColumnPrinter(std::string&, const Type& type); + MapColumnPrinter(std::string&, const Type& type, ColumnPrinter::Param); virtual ~MapColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; @@ -173,23 +176,23 @@ namespace orc { class UnionColumnPrinter : public ColumnPrinter { private: - const unsigned char* tags; - const uint64_t* offsets; - std::vector<std::unique_ptr<ColumnPrinter>> fieldPrinter; + const unsigned char* tags_; + const uint64_t* offsets_; + std::vector<std::unique_ptr<ColumnPrinter>> fieldPrinter_; public: - UnionColumnPrinter(std::string&, const Type& type); + UnionColumnPrinter(std::string&, const Type& type, ColumnPrinter::Param); void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; }; class StructColumnPrinter : public ColumnPrinter { private: - std::vector<std::unique_ptr<ColumnPrinter>> fieldPrinter; - std::vector<std::string> fieldNames; + std::vector<std::unique_ptr<ColumnPrinter>> fieldPrinter_; + std::vector<std::string> fieldNames_; public: - StructColumnPrinter(std::string&, const Type& type); + StructColumnPrinter(std::string&, const Type& type, ColumnPrinter::Param); void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; }; @@ -203,7 +206,7 @@ namespace orc { file.append(ptr, len); } - ColumnPrinter::ColumnPrinter(std::string& _buffer) : buffer(_buffer) { + ColumnPrinter::ColumnPrinter(std::string& buffer) : buffer(buffer) { notNull = nullptr; hasNulls = false; } @@ -221,69 +224,70 @@ namespace orc { } } - std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer, const Type* type) { + std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer, const Type* type, + ColumnPrinter::Param param) { std::unique_ptr<ColumnPrinter> result; if (type == nullptr) { - result = std::make_unique<VoidColumnPrinter>(buffer); + result = std::make_unique<VoidColumnPrinter>(buffer, param); } else { switch (static_cast<int64_t>(type->getKind())) { case BOOLEAN: - result = std::make_unique<BooleanColumnPrinter>(buffer); + result = std::make_unique<BooleanColumnPrinter>(buffer, param); break; case BYTE: case SHORT: case INT: case LONG: - result = std::make_unique<LongColumnPrinter>(buffer); + result = std::make_unique<LongColumnPrinter>(buffer, param); break; case FLOAT: case DOUBLE: - result = std::make_unique<DoubleColumnPrinter>(buffer, *type); + result = std::make_unique<DoubleColumnPrinter>(buffer, *type, param); break; case STRING: case VARCHAR: case CHAR: - result = std::make_unique<StringColumnPrinter>(buffer); + result = std::make_unique<StringColumnPrinter>(buffer, param); break; case BINARY: - result = std::make_unique<BinaryColumnPrinter>(buffer); + result = std::make_unique<BinaryColumnPrinter>(buffer, param); break; case TIMESTAMP: case TIMESTAMP_INSTANT: - result = std::make_unique<TimestampColumnPrinter>(buffer); + result = std::make_unique<TimestampColumnPrinter>(buffer, param); break; case LIST: - result = std::make_unique<ListColumnPrinter>(buffer, *type); + result = std::make_unique<ListColumnPrinter>(buffer, *type, param); break; case MAP: - result = std::make_unique<MapColumnPrinter>(buffer, *type); + result = std::make_unique<MapColumnPrinter>(buffer, *type, param); break; case STRUCT: - result = std::make_unique<StructColumnPrinter>(buffer, *type); + result = std::make_unique<StructColumnPrinter>(buffer, *type, param); break; case DECIMAL: if (type->getPrecision() == 0 || type->getPrecision() > 18) { - result = std::make_unique<Decimal128ColumnPrinter>(buffer); + result = std::make_unique<Decimal128ColumnPrinter>(buffer, param); } else { - result = std::make_unique<Decimal64ColumnPrinter>(buffer); + result = std::make_unique<Decimal64ColumnPrinter>(buffer, param); } break; case DATE: - result = std::make_unique<DateColumnPrinter>(buffer); + result = std::make_unique<DateColumnPrinter>(buffer, param); break; case UNION: - result = std::make_unique<UnionColumnPrinter>(buffer, *type); + result = std::make_unique<UnionColumnPrinter>(buffer, *type, param); break; default: @@ -293,7 +297,8 @@ namespace orc { return result; } - VoidColumnPrinter::VoidColumnPrinter(std::string& _buffer) : ColumnPrinter(_buffer) { + VoidColumnPrinter::VoidColumnPrinter(std::string& buffer, ColumnPrinter::Param) + : ColumnPrinter(buffer) { // PASS } @@ -305,33 +310,34 @@ namespace orc { writeString(buffer, "null"); } - LongColumnPrinter::LongColumnPrinter(std::string& _buffer) - : ColumnPrinter(_buffer), data(nullptr) { + LongColumnPrinter::LongColumnPrinter(std::string& buffer, ColumnPrinter::Param) + : ColumnPrinter(buffer), data_(nullptr) { // PASS } void LongColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); - data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); + data_ = dynamic_cast<const LongVectorBatch&>(batch).data.data(); } void LongColumnPrinter::printRow(uint64_t rowId) { if (hasNulls && !notNull[rowId]) { writeString(buffer, "null"); } else { - const auto numBuffer = std::to_string(static_cast<int64_t>(data[rowId])); + const auto numBuffer = std::to_string(static_cast<int64_t>(data_[rowId])); writeString(buffer, numBuffer.c_str()); } } - DoubleColumnPrinter::DoubleColumnPrinter(std::string& _buffer, const Type& type) - : ColumnPrinter(_buffer), data(nullptr), isFloat(type.getKind() == FLOAT) { + DoubleColumnPrinter::DoubleColumnPrinter(std::string& buffer, const Type& type, + ColumnPrinter::Param) + : ColumnPrinter(buffer), data_(nullptr), isFloat_(type.getKind() == FLOAT) { // PASS } void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); - data = dynamic_cast<const DoubleVectorBatch&>(batch).data.data(); + data_ = dynamic_cast<const DoubleVectorBatch&>(batch).data.data(); } void DoubleColumnPrinter::printRow(uint64_t rowId) { @@ -339,86 +345,76 @@ namespace orc { writeString(buffer, "null"); } else { char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), isFloat ? "%.7g" : "%.14g", data[rowId]); + snprintf(numBuffer, sizeof(numBuffer), isFloat_ ? "%.7g" : "%.14g", data_[rowId]); writeString(buffer, numBuffer); } } - Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& _buffer) - : ColumnPrinter(_buffer), data(nullptr), scale(0) { + Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& buffer, ColumnPrinter::Param param) + : ColumnPrinter(buffer), data_(nullptr), scale_(0), param_(param) { // PASS } void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); - data = dynamic_cast<const Decimal64VectorBatch&>(batch).values.data(); - scale = dynamic_cast<const Decimal64VectorBatch&>(batch).scale; + data_ = dynamic_cast<const Decimal64VectorBatch&>(batch).values.data(); + scale_ = dynamic_cast<const Decimal64VectorBatch&>(batch).scale; } - std::string toDecimalString(int64_t value, int32_t scale) { - std::stringstream buffer; - if (scale == 0) { - buffer << value; - return buffer.str(); - } - std::string sign = ""; - if (value < 0) { - sign = "-"; - value = -value; - } - buffer << value; - std::string str = buffer.str(); - int32_t len = static_cast<int32_t>(str.length()); - if (len > scale) { - return sign + str.substr(0, static_cast<size_t>(len - scale)) + "." + - str.substr(static_cast<size_t>(len - scale), static_cast<size_t>(scale)); - } else if (len == scale) { - return sign + "0." + str; - } else { - std::string result = sign + "0."; - for (int32_t i = 0; i < scale - len; ++i) { - result += "0"; - } - return result + str; - } + std::string toDecimalString(int64_t value, int32_t scale, bool trimTrailingZeros) { + return Int128(value).toDecimalString(scale, trimTrailingZeros); } void Decimal64ColumnPrinter::printRow(uint64_t rowId) { if (hasNulls && !notNull[rowId]) { writeString(buffer, "null"); } else { - writeString(buffer, toDecimalString(data[rowId], scale).c_str()); + bool trimTrailingZeros = param_.printDecimalTrimTrailingZeros; + if (param_.printDecimalAsString) { + writeChar(buffer, '"'); + writeString(buffer, toDecimalString(data_[rowId], scale_, trimTrailingZeros).c_str()); + writeChar(buffer, '"'); + } else { + writeString(buffer, toDecimalString(data_[rowId], scale_, trimTrailingZeros).c_str()); + } } } - Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& _buffer) - : ColumnPrinter(_buffer), data(nullptr), scale(0) { + Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& buffer, ColumnPrinter::Param param) + : ColumnPrinter(buffer), data_(nullptr), scale_(0), param_(param) { // PASS } void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); - data = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data(); - scale = dynamic_cast<const Decimal128VectorBatch&>(batch).scale; + data_ = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data(); + scale_ = dynamic_cast<const Decimal128VectorBatch&>(batch).scale; } void Decimal128ColumnPrinter::printRow(uint64_t rowId) { if (hasNulls && !notNull[rowId]) { writeString(buffer, "null"); } else { - writeString(buffer, data[rowId].toDecimalString(scale).c_str()); + bool trimTrailingZeros = param_.printDecimalTrimTrailingZeros; + if (param_.printDecimalAsString) { + writeChar(buffer, '"'); + writeString(buffer, data_[rowId].toDecimalString(scale_, trimTrailingZeros).c_str()); + writeChar(buffer, '"'); + } else { + writeString(buffer, data_[rowId].toDecimalString(scale_, trimTrailingZeros).c_str()); + } } } - StringColumnPrinter::StringColumnPrinter(std::string& _buffer) - : ColumnPrinter(_buffer), start(nullptr), length(nullptr) { + StringColumnPrinter::StringColumnPrinter(std::string& buffer, ColumnPrinter::Param) + : ColumnPrinter(buffer), start_(nullptr), length_(nullptr) { // PASS } void StringColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); - start = dynamic_cast<const StringVectorBatch&>(batch).data.data(); - length = dynamic_cast<const StringVectorBatch&>(batch).length.data(); + start_ = dynamic_cast<const StringVectorBatch&>(batch).data.data(); + length_ = dynamic_cast<const StringVectorBatch&>(batch).length.data(); } void StringColumnPrinter::printRow(uint64_t rowId) { @@ -426,8 +422,8 @@ namespace orc { writeString(buffer, "null"); } else { writeChar(buffer, '"'); - for (int64_t i = 0; i < length[rowId]; ++i) { - char ch = static_cast<char>(start[rowId][i]); + for (int64_t i = 0; i < length_[rowId]; ++i) { + char ch = static_cast<char>(start_[rowId][i]); switch (ch) { case '\\': writeString(buffer, "\\\\"); @@ -459,15 +455,16 @@ namespace orc { } } - ListColumnPrinter::ListColumnPrinter(std::string& _buffer, const Type& type) - : ColumnPrinter(_buffer), offsets(nullptr) { - elementPrinter = createColumnPrinter(buffer, type.getSubtype(0)); + ListColumnPrinter::ListColumnPrinter(std::string& buffer, const Type& type, + ColumnPrinter::Param param) + : ColumnPrinter(buffer), offsets_(nullptr) { + elementPrinter_ = createColumnPrinter(buffer, type.getSubtype(0), param); } void ListColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); - offsets = dynamic_cast<const ListVectorBatch&>(batch).offsets.data(); - elementPrinter->reset(*dynamic_cast<const ListVectorBatch&>(batch).elements); + offsets_ = dynamic_cast<const ListVectorBatch&>(batch).offsets.data(); + elementPrinter_->reset(*dynamic_cast<const ListVectorBatch&>(batch).elements); } void ListColumnPrinter::printRow(uint64_t rowId) { @@ -475,28 +472,29 @@ namespace orc { writeString(buffer, "null"); } else { writeChar(buffer, '['); - for (int64_t i = offsets[rowId]; i < offsets[rowId + 1]; ++i) { - if (i != offsets[rowId]) { + for (int64_t i = offsets_[rowId]; i < offsets_[rowId + 1]; ++i) { + if (i != offsets_[rowId]) { writeString(buffer, ", "); } - elementPrinter->printRow(static_cast<uint64_t>(i)); + elementPrinter_->printRow(static_cast<uint64_t>(i)); } writeChar(buffer, ']'); } } - MapColumnPrinter::MapColumnPrinter(std::string& _buffer, const Type& type) - : ColumnPrinter(_buffer), offsets(nullptr) { - keyPrinter = createColumnPrinter(buffer, type.getSubtype(0)); - elementPrinter = createColumnPrinter(buffer, type.getSubtype(1)); + MapColumnPrinter::MapColumnPrinter(std::string& buffer, const Type& type, + ColumnPrinter::Param param) + : ColumnPrinter(buffer), offsets_(nullptr) { + keyPrinter_ = createColumnPrinter(buffer, type.getSubtype(0), param); + elementPrinter_ = createColumnPrinter(buffer, type.getSubtype(1), param); } void MapColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); const MapVectorBatch& myBatch = dynamic_cast<const MapVectorBatch&>(batch); - offsets = myBatch.offsets.data(); - keyPrinter->reset(*myBatch.keys); - elementPrinter->reset(*myBatch.elements); + offsets_ = myBatch.offsets.data(); + keyPrinter_->reset(*myBatch.keys); + elementPrinter_->reset(*myBatch.elements); } void MapColumnPrinter::printRow(uint64_t rowId) { @@ -504,34 +502,35 @@ namespace orc { writeString(buffer, "null"); } else { writeChar(buffer, '['); - for (int64_t i = offsets[rowId]; i < offsets[rowId + 1]; ++i) { - if (i != offsets[rowId]) { + for (int64_t i = offsets_[rowId]; i < offsets_[rowId + 1]; ++i) { + if (i != offsets_[rowId]) { writeString(buffer, ", "); } writeString(buffer, "{\"key\": "); - keyPrinter->printRow(static_cast<uint64_t>(i)); + keyPrinter_->printRow(static_cast<uint64_t>(i)); writeString(buffer, ", \"value\": "); - elementPrinter->printRow(static_cast<uint64_t>(i)); + elementPrinter_->printRow(static_cast<uint64_t>(i)); writeChar(buffer, '}'); } writeChar(buffer, ']'); } } - UnionColumnPrinter::UnionColumnPrinter(std::string& _buffer, const Type& type) - : ColumnPrinter(_buffer), tags(nullptr), offsets(nullptr) { + UnionColumnPrinter::UnionColumnPrinter(std::string& buffer, const Type& type, + ColumnPrinter::Param param) + : ColumnPrinter(buffer), tags_(nullptr), offsets_(nullptr) { for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) { - fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i))); + fieldPrinter_.push_back(createColumnPrinter(buffer, type.getSubtype(i), param)); } } void UnionColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); const UnionVectorBatch& unionBatch = dynamic_cast<const UnionVectorBatch&>(batch); - tags = unionBatch.tags.data(); - offsets = unionBatch.offsets.data(); - for (size_t i = 0; i < fieldPrinter.size(); ++i) { - fieldPrinter[i]->reset(*(unionBatch.children[i])); + tags_ = unionBatch.tags.data(); + offsets_ = unionBatch.offsets.data(); + for (size_t i = 0; i < fieldPrinter_.size(); ++i) { + fieldPrinter_[i]->reset(*(unionBatch.children[i])); } } @@ -540,27 +539,28 @@ namespace orc { writeString(buffer, "null"); } else { writeString(buffer, "{\"tag\": "); - const auto numBuffer = std::to_string(static_cast<int64_t>(tags[rowId])); + const auto numBuffer = std::to_string(static_cast<int64_t>(tags_[rowId])); writeString(buffer, numBuffer.c_str()); writeString(buffer, ", \"value\": "); - fieldPrinter[tags[rowId]]->printRow(offsets[rowId]); + fieldPrinter_[tags_[rowId]]->printRow(offsets_[rowId]); writeChar(buffer, '}'); } } - StructColumnPrinter::StructColumnPrinter(std::string& _buffer, const Type& type) - : ColumnPrinter(_buffer) { + StructColumnPrinter::StructColumnPrinter(std::string& buffer, const Type& type, + ColumnPrinter::Param param) + : ColumnPrinter(buffer) { for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) { - fieldNames.push_back(type.getFieldName(i)); - fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i))); + fieldNames_.push_back(type.getFieldName(i)); + fieldPrinter_.push_back(createColumnPrinter(buffer, type.getSubtype(i), param)); } } void StructColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); const StructVectorBatch& structBatch = dynamic_cast<const StructVectorBatch&>(batch); - for (size_t i = 0; i < fieldPrinter.size(); ++i) { - fieldPrinter[i]->reset(*(structBatch.fields[i])); + for (size_t i = 0; i < fieldPrinter_.size(); ++i) { + fieldPrinter_[i]->reset(*(structBatch.fields[i])); } } @@ -569,21 +569,21 @@ namespace orc { writeString(buffer, "null"); } else { writeChar(buffer, '{'); - for (unsigned int i = 0; i < fieldPrinter.size(); ++i) { + for (unsigned int i = 0; i < fieldPrinter_.size(); ++i) { if (i != 0) { writeString(buffer, ", "); } writeChar(buffer, '"'); - writeString(buffer, fieldNames[i].c_str()); + writeString(buffer, fieldNames_[i].c_str()); writeString(buffer, "\": "); - fieldPrinter[i]->printRow(rowId); + fieldPrinter_[i]->printRow(rowId); } writeChar(buffer, '}'); } } - DateColumnPrinter::DateColumnPrinter(std::string& _buffer) - : ColumnPrinter(_buffer), data(nullptr) { + DateColumnPrinter::DateColumnPrinter(std::string& buffer, ColumnPrinter::Param) + : ColumnPrinter(buffer), data_(nullptr) { // PASS } @@ -591,7 +591,7 @@ namespace orc { if (hasNulls && !notNull[rowId]) { writeString(buffer, "null"); } else { - const time_t timeValue = data[rowId] * 24 * 60 * 60; + const time_t timeValue = data_[rowId] * 24 * 60 * 60; struct tm tmValue; gmtime_r(&timeValue, &tmValue); char timeBuffer[11]; @@ -604,11 +604,11 @@ namespace orc { void DateColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); - data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); + data_ = dynamic_cast<const LongVectorBatch&>(batch).data.data(); } - BooleanColumnPrinter::BooleanColumnPrinter(std::string& _buffer) - : ColumnPrinter(_buffer), data(nullptr) { + BooleanColumnPrinter::BooleanColumnPrinter(std::string& buffer, ColumnPrinter::Param) + : ColumnPrinter(buffer), data_(nullptr) { // PASS } @@ -616,17 +616,17 @@ namespace orc { if (hasNulls && !notNull[rowId]) { writeString(buffer, "null"); } else { - writeString(buffer, (data[rowId] ? "true" : "false")); + writeString(buffer, (data_[rowId] ? "true" : "false")); } } void BooleanColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); - data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); + data_ = dynamic_cast<const LongVectorBatch&>(batch).data.data(); } - BinaryColumnPrinter::BinaryColumnPrinter(std::string& _buffer) - : ColumnPrinter(_buffer), start(nullptr), length(nullptr) { + BinaryColumnPrinter::BinaryColumnPrinter(std::string& buffer, ColumnPrinter::Param) + : ColumnPrinter(buffer), start_(nullptr), length_(nullptr) { // PASS } @@ -635,11 +635,11 @@ namespace orc { writeString(buffer, "null"); } else { writeChar(buffer, '['); - for (int64_t i = 0; i < length[rowId]; ++i) { + for (int64_t i = 0; i < length_[rowId]; ++i) { if (i != 0) { writeString(buffer, ", "); } - const auto numBuffer = std::to_string(static_cast<int>(start[rowId][i]) & 0xff); + const auto numBuffer = std::to_string(static_cast<int>(start_[rowId][i]) & 0xff); writeString(buffer, numBuffer.c_str()); } writeChar(buffer, ']'); @@ -648,12 +648,12 @@ namespace orc { void BinaryColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); - start = dynamic_cast<const StringVectorBatch&>(batch).data.data(); - length = dynamic_cast<const StringVectorBatch&>(batch).length.data(); + start_ = dynamic_cast<const StringVectorBatch&>(batch).data.data(); + length_ = dynamic_cast<const StringVectorBatch&>(batch).length.data(); } - TimestampColumnPrinter::TimestampColumnPrinter(std::string& _buffer) - : ColumnPrinter(_buffer), seconds(nullptr), nanoseconds(nullptr) { + TimestampColumnPrinter::TimestampColumnPrinter(std::string& buffer, ColumnPrinter::Param) + : ColumnPrinter(buffer), seconds_(nullptr), nanoseconds_(nullptr) { // PASS } @@ -662,8 +662,8 @@ namespace orc { if (hasNulls && !notNull[rowId]) { writeString(buffer, "null"); } else { - int64_t nanos = nanoseconds[rowId]; - time_t secs = static_cast<time_t>(seconds[rowId]); + int64_t nanos = nanoseconds_[rowId]; + time_t secs = static_cast<time_t>(seconds_[rowId]); struct tm tmValue; gmtime_r(&secs, &tmValue); char timeBuffer[20]; @@ -694,7 +694,7 @@ namespace orc { void TimestampColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); const TimestampVectorBatch& ts = dynamic_cast<const TimestampVectorBatch&>(batch); - seconds = ts.data.data(); - nanoseconds = ts.nanoseconds.data(); + seconds_ = ts.data.data(); + nanoseconds_ = ts.nanoseconds.data(); } } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/ColumnReader.cc b/contrib/libs/apache/orc/c++/src/ColumnReader.cc index a6bbdabedc..af434c37ca 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnReader.cc +++ b/contrib/libs/apache/orc/c++/src/ColumnReader.cc @@ -138,7 +138,7 @@ namespace orc { template <typename BatchType> class BooleanColumnReader : public ColumnReader { private: - std::unique_ptr<orc::ByteRleDecoder> rle; + std::unique_ptr<orc::ByteRleDecoder> rle_; public: BooleanColumnReader(const Type& type, StripeStreams& stipe); @@ -157,7 +157,7 @@ namespace orc { std::unique_ptr<SeekableInputStream> stream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); if (stream == nullptr) throw ParseError("DATA stream not found in Boolean column"); - rle = createBooleanRleDecoder(std::move(stream), metrics); + rle_ = createBooleanRleDecoder(std::move(stream), metrics); } template <typename BatchType> @@ -168,7 +168,7 @@ namespace orc { template <typename BatchType> uint64_t BooleanColumnReader<BatchType>::skip(uint64_t numValues) { numValues = ColumnReader::skip(numValues); - rle->skip(numValues); + rle_->skip(numValues); return numValues; } @@ -180,8 +180,8 @@ namespace orc { // LongVectorBatch with long*. We cheat here in that case and use the long* // and then expand it in a second pass.. auto* ptr = dynamic_cast<BatchType&>(rowBatch).data.data(); - rle->next(reinterpret_cast<char*>(ptr), numValues, - rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr); + rle_->next(reinterpret_cast<char*>(ptr), numValues, + rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr); expandBytesToIntegers(ptr, numValues); } @@ -189,27 +189,27 @@ namespace orc { void BooleanColumnReader<BatchType>::seekToRowGroup( std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); + rle_->seek(positions.at(columnId)); } template <typename BatchType> class ByteColumnReader : public ColumnReader { private: - std::unique_ptr<orc::ByteRleDecoder> rle; + std::unique_ptr<orc::ByteRleDecoder> rle_; public: ByteColumnReader(const Type& type, StripeStreams& stripe) : ColumnReader(type, stripe) { std::unique_ptr<SeekableInputStream> stream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); if (stream == nullptr) throw ParseError("DATA stream not found in Byte column"); - rle = createByteRleDecoder(std::move(stream), metrics); + rle_ = createByteRleDecoder(std::move(stream), metrics); } ~ByteColumnReader() override = default; uint64_t skip(uint64_t numValues) override { numValues = ColumnReader::skip(numValues); - rle->skip(numValues); + rle_->skip(numValues); return numValues; } @@ -218,14 +218,14 @@ namespace orc { // Since the byte rle places the output in a char* instead of long*, // we cheat here and use the long* and then expand it in a second pass. auto* ptr = dynamic_cast<BatchType&>(rowBatch).data.data(); - rle->next(reinterpret_cast<char*>(ptr), numValues, - rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr); + rle_->next(reinterpret_cast<char*>(ptr), numValues, + rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr); expandBytesToIntegers(ptr, numValues); } void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override { ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); + rle_->seek(positions.at(columnId)); } }; @@ -267,12 +267,12 @@ namespace orc { class TimestampColumnReader : public ColumnReader { private: - std::unique_ptr<orc::RleDecoder> secondsRle; - std::unique_ptr<orc::RleDecoder> nanoRle; - const Timezone* writerTimezone; - const Timezone* readerTimezone; - const int64_t epochOffset; - const bool sameTimezone; + std::unique_ptr<orc::RleDecoder> secondsRle_; + std::unique_ptr<orc::RleDecoder> nanoRle_; + const Timezone* writerTimezone_; + const Timezone* readerTimezone_; + const int64_t epochOffset_; + const bool sameTimezone_; public: TimestampColumnReader(const Type& type, StripeStreams& stripe, bool isInstantType); @@ -288,18 +288,18 @@ namespace orc { TimestampColumnReader::TimestampColumnReader(const Type& type, StripeStreams& stripe, bool isInstantType) : ColumnReader(type, stripe), - writerTimezone(isInstantType ? &getTimezoneByName("GMT") : &stripe.getWriterTimezone()), - readerTimezone(isInstantType ? &getTimezoneByName("GMT") : &stripe.getReaderTimezone()), - epochOffset(writerTimezone->getEpoch()), - sameTimezone(writerTimezone == readerTimezone) { + writerTimezone_(isInstantType ? &getTimezoneByName("GMT") : &stripe.getWriterTimezone()), + readerTimezone_(isInstantType ? &getTimezoneByName("GMT") : &stripe.getReaderTimezone()), + epochOffset_(writerTimezone_->getEpoch()), + sameTimezone_(writerTimezone_ == readerTimezone_) { RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); std::unique_ptr<SeekableInputStream> stream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); if (stream == nullptr) throw ParseError("DATA stream not found in Timestamp column"); - secondsRle = createRleDecoder(std::move(stream), true, vers, memoryPool, metrics); + secondsRle_ = createRleDecoder(std::move(stream), true, vers, memoryPool, metrics); stream = stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true); if (stream == nullptr) throw ParseError("SECONDARY stream not found in Timestamp column"); - nanoRle = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics); + nanoRle_ = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics); } TimestampColumnReader::~TimestampColumnReader() { @@ -308,8 +308,8 @@ namespace orc { uint64_t TimestampColumnReader::skip(uint64_t numValues) { numValues = ColumnReader::skip(numValues); - secondsRle->skip(numValues); - nanoRle->skip(numValues); + secondsRle_->skip(numValues); + nanoRle_->skip(numValues); return numValues; } @@ -318,9 +318,9 @@ namespace orc { notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; TimestampVectorBatch& timestampBatch = dynamic_cast<TimestampVectorBatch&>(rowBatch); int64_t* secsBuffer = timestampBatch.data.data(); - secondsRle->next(secsBuffer, numValues, notNull); + secondsRle_->next(secsBuffer, numValues, notNull); int64_t* nanoBuffer = timestampBatch.nanoseconds.data(); - nanoRle->next(nanoBuffer, numValues, notNull); + nanoRle_->next(nanoBuffer, numValues, notNull); // Construct the values for (uint64_t i = 0; i < numValues; i++) { @@ -332,17 +332,17 @@ namespace orc { nanoBuffer[i] *= 10; } } - int64_t writerTime = secsBuffer[i] + epochOffset; - if (!sameTimezone) { + int64_t writerTime = secsBuffer[i] + epochOffset_; + if (!sameTimezone_) { // adjust timestamp value to same wall clock time if writer and reader // time zones have different rules, which is required for Apache Orc. - const auto& wv = writerTimezone->getVariant(writerTime); - const auto& rv = readerTimezone->getVariant(writerTime); + const auto& wv = writerTimezone_->getVariant(writerTime); + const auto& rv = readerTimezone_->getVariant(writerTime); if (!wv.hasSameTzRule(rv)) { // If the timezone adjustment moves the millis across a DST boundary, // we need to reevaluate the offsets. int64_t adjustedTime = writerTime + wv.gmtOffset - rv.gmtOffset; - const auto& adjustedReader = readerTimezone->getVariant(adjustedTime); + const auto& adjustedReader = readerTimezone_->getVariant(adjustedTime); writerTime = writerTime + wv.gmtOffset - adjustedReader.gmtOffset; } } @@ -357,8 +357,8 @@ namespace orc { void TimestampColumnReader::seekToRowGroup( std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); - secondsRle->seek(positions.at(columnId)); - nanoRle->seek(positions.at(columnId)); + secondsRle_->seek(positions.at(columnId)); + nanoRle_->seek(positions.at(columnId)); } template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType> @@ -374,39 +374,39 @@ namespace orc { void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override; private: - std::unique_ptr<SeekableInputStream> inputStream; - const uint64_t bytesPerValue = (columnKind == FLOAT) ? 4 : 8; - const char* bufferPointer; - const char* bufferEnd; + std::unique_ptr<SeekableInputStream> inputStream_; + const uint64_t bytesPerValue_ = (columnKind == FLOAT) ? 4 : 8; + const char* bufferPointer_; + const char* bufferEnd_; unsigned char readByte() { - if (bufferPointer == bufferEnd) { + if (bufferPointer_ == bufferEnd_) { int length; - if (!inputStream->Next(reinterpret_cast<const void**>(&bufferPointer), &length)) { + if (!inputStream_->Next(reinterpret_cast<const void**>(&bufferPointer_), &length)) { throw ParseError("bad read in DoubleColumnReader::next()"); } - bufferEnd = bufferPointer + length; + bufferEnd_ = bufferPointer_ + length; } - return static_cast<unsigned char>(*(bufferPointer++)); + return static_cast<unsigned char>(*(bufferPointer_++)); } template <typename FloatType> FloatType readDouble() { int64_t bits = 0; - if (bufferEnd - bufferPointer >= 8) { + if (bufferEnd_ - bufferPointer_ >= 8) { if (isLittleEndian) { - memcpy(&bits, bufferPointer, sizeof(bits)); + memcpy(&bits, bufferPointer_, sizeof(bits)); } else { - bits = static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[0])); - bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[1])) << 8; - bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[2])) << 16; - bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[3])) << 24; - bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[4])) << 32; - bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[5])) << 40; - bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[6])) << 48; - bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[7])) << 56; + bits = static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[0])); + bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[1])) << 8; + bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[2])) << 16; + bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[3])) << 24; + bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[4])) << 32; + bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[5])) << 40; + bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[6])) << 48; + bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[7])) << 56; } - bufferPointer += 8; + bufferPointer_ += 8; } else { for (uint64_t i = 0; i < 8; i++) { bits |= static_cast<int64_t>(readByte()) << (i * 8); @@ -419,16 +419,16 @@ namespace orc { template <typename FloatType> FloatType readFloat() { int32_t bits = 0; - if (bufferEnd - bufferPointer >= 4) { + if (bufferEnd_ - bufferPointer_ >= 4) { if (isLittleEndian) { - bits = *(reinterpret_cast<const int32_t*>(bufferPointer)); + bits = *(reinterpret_cast<const int32_t*>(bufferPointer_)); } else { - bits = static_cast<unsigned char>(bufferPointer[0]); - bits |= static_cast<unsigned char>(bufferPointer[1]) << 8; - bits |= static_cast<unsigned char>(bufferPointer[2]) << 16; - bits |= static_cast<unsigned char>(bufferPointer[3]) << 24; + bits = static_cast<unsigned char>(bufferPointer_[0]); + bits |= static_cast<unsigned char>(bufferPointer_[1]) << 8; + bits |= static_cast<unsigned char>(bufferPointer_[2]) << 16; + bits |= static_cast<unsigned char>(bufferPointer_[3]) << 24; } - bufferPointer += 4; + bufferPointer_ += 4; } else { for (uint64_t i = 0; i < 4; i++) { bits |= readByte() << (i * 8); @@ -445,9 +445,9 @@ namespace orc { template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType> DoubleColumnReader<columnKind, isLittleEndian, ValueType, BatchType>::DoubleColumnReader( const Type& type, StripeStreams& stripe) - : ColumnReader(type, stripe), bufferPointer(nullptr), bufferEnd(nullptr) { - inputStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (inputStream == nullptr) throw ParseError("DATA stream not found in Double column"); + : ColumnReader(type, stripe), bufferPointer_(nullptr), bufferEnd_(nullptr) { + inputStream_ = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (inputStream_ == nullptr) throw ParseError("DATA stream not found in Double column"); } template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType> @@ -455,19 +455,19 @@ namespace orc { uint64_t numValues) { numValues = ColumnReader::skip(numValues); - if (static_cast<size_t>(bufferEnd - bufferPointer) >= bytesPerValue * numValues) { - bufferPointer += bytesPerValue * numValues; + if (static_cast<size_t>(bufferEnd_ - bufferPointer_) >= bytesPerValue_ * numValues) { + bufferPointer_ += bytesPerValue_ * numValues; } else { size_t sizeToSkip = - bytesPerValue * numValues - static_cast<size_t>(bufferEnd - bufferPointer); + bytesPerValue_ * numValues - static_cast<size_t>(bufferEnd_ - bufferPointer_); const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max()); while (sizeToSkip != 0) { size_t step = sizeToSkip > cap ? cap : sizeToSkip; - inputStream->Skip(static_cast<int>(step)); + inputStream_->Skip(static_cast<int>(step)); sizeToSkip -= step; } - bufferEnd = nullptr; - bufferPointer = nullptr; + bufferEnd_ = nullptr; + bufferPointer_ = nullptr; } return numValues; @@ -506,12 +506,12 @@ namespace orc { // Only viable when the machine is little-endian. uint64_t bufferNum = 0; if (isLittleEndian) { - bufferNum = - std::min(numValues, static_cast<size_t>(bufferEnd - bufferPointer) / bytesPerValue); - uint64_t bufferBytes = bufferNum * bytesPerValue; + bufferNum = std::min(numValues, + static_cast<size_t>(bufferEnd_ - bufferPointer_) / bytesPerValue_); + uint64_t bufferBytes = bufferNum * bytesPerValue_; if (bufferBytes > 0) { - memcpy(outArray, bufferPointer, bufferBytes); - bufferPointer += bufferBytes; + memcpy(outArray, bufferPointer_, bufferBytes); + bufferPointer_ += bufferBytes; } } for (size_t i = bufferNum; i < numValues; ++i) { @@ -525,10 +525,10 @@ namespace orc { void DoubleColumnReader<columnKind, isLittleEndian, ValueType, BatchType>::seekToRowGroup( std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); - inputStream->seek(positions.at(columnId)); + inputStream_->seek(positions.at(columnId)); // clear buffer state after seek - bufferEnd = nullptr; - bufferPointer = nullptr; + bufferEnd_ = nullptr; + bufferPointer_ = nullptr; } void readFully(char* buffer, int64_t bufferSize, SeekableInputStream* stream) { @@ -549,8 +549,8 @@ namespace orc { class StringDictionaryColumnReader : public ColumnReader { private: - std::shared_ptr<StringDictionary> dictionary; - std::unique_ptr<RleDecoder> rle; + std::shared_ptr<StringDictionary> dictionary_; + std::unique_ptr<RleDecoder> rle_; public: StringDictionaryColumnReader(const Type& type, StripeStreams& stipe); @@ -567,7 +567,7 @@ namespace orc { StringDictionaryColumnReader::StringDictionaryColumnReader(const Type& type, StripeStreams& stripe) - : ColumnReader(type, stripe), dictionary(new StringDictionary(stripe.getMemoryPool())) { + : ColumnReader(type, stripe), dictionary_(new StringDictionary(stripe.getMemoryPool())) { RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId).kind()); uint32_t dictSize = stripe.getEncoding(columnId).dictionary_size(); std::unique_ptr<SeekableInputStream> stream = @@ -575,15 +575,15 @@ namespace orc { if (stream == nullptr) { throw ParseError("DATA stream not found in StringDictionaryColumn"); } - rle = createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics); + rle_ = createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics); stream = stripe.getStream(columnId, proto::Stream_Kind_LENGTH, false); if (dictSize > 0 && stream == nullptr) { throw ParseError("LENGTH stream not found in StringDictionaryColumn"); } std::unique_ptr<RleDecoder> lengthDecoder = createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics); - dictionary->dictionaryOffset.resize(dictSize + 1); - int64_t* lengthArray = dictionary->dictionaryOffset.data(); + dictionary_->dictionaryOffset.resize(dictSize + 1); + int64_t* lengthArray = dictionary_->dictionaryOffset.data(); lengthDecoder->next(lengthArray + 1, dictSize, nullptr); lengthArray[0] = 0; for (uint32_t i = 1; i < dictSize + 1; ++i) { @@ -593,13 +593,13 @@ namespace orc { lengthArray[i] += lengthArray[i - 1]; } int64_t blobSize = lengthArray[dictSize]; - dictionary->dictionaryBlob.resize(static_cast<uint64_t>(blobSize)); + dictionary_->dictionaryBlob.resize(static_cast<uint64_t>(blobSize)); std::unique_ptr<SeekableInputStream> blobStream = stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false); if (blobSize > 0 && blobStream == nullptr) { throw ParseError("DICTIONARY_DATA stream not found in StringDictionaryColumn"); } - readFully(dictionary->dictionaryBlob.data(), blobSize, blobStream.get()); + readFully(dictionary_->dictionaryBlob.data(), blobSize, blobStream.get()); } StringDictionaryColumnReader::~StringDictionaryColumnReader() { @@ -608,7 +608,7 @@ namespace orc { uint64_t StringDictionaryColumnReader::skip(uint64_t numValues) { numValues = ColumnReader::skip(numValues); - rle->skip(numValues); + rle_->skip(numValues); return numValues; } @@ -618,12 +618,12 @@ namespace orc { // update the notNull from the parent class notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch); - char* blob = dictionary->dictionaryBlob.data(); - int64_t* dictionaryOffsets = dictionary->dictionaryOffset.data(); + char* blob = dictionary_->dictionaryBlob.data(); + int64_t* dictionaryOffsets = dictionary_->dictionaryOffset.data(); char** outputStarts = byteBatch.data.data(); int64_t* outputLengths = byteBatch.length.data(); - rle->next(outputLengths, numValues, notNull); - uint64_t dictionaryCount = dictionary->dictionaryOffset.size() - 1; + rle_->next(outputLengths, numValues, notNull); + uint64_t dictionaryCount = dictionary_->dictionaryOffset.size() - 1; if (notNull) { for (uint64_t i = 0; i < numValues; ++i) { if (notNull[i]) { @@ -654,24 +654,24 @@ namespace orc { rowBatch.isEncoded = true; EncodedStringVectorBatch& batch = dynamic_cast<EncodedStringVectorBatch&>(rowBatch); - batch.dictionary = this->dictionary; + batch.dictionary = this->dictionary_; // Length buffer is reused to save dictionary entry ids - rle->next(batch.index.data(), numValues, notNull); + rle_->next(batch.index.data(), numValues, notNull); } void StringDictionaryColumnReader::seekToRowGroup( std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); + rle_->seek(positions.at(columnId)); } class StringDirectColumnReader : public ColumnReader { private: - std::unique_ptr<RleDecoder> lengthRle; - std::unique_ptr<SeekableInputStream> blobStream; - const char* lastBuffer; - size_t lastBufferLength; + std::unique_ptr<RleDecoder> lengthRle_; + std::unique_ptr<SeekableInputStream> blobStream_; + const char* lastBuffer_; + size_t lastBufferLength_; /** * Compute the total length of the values. @@ -699,11 +699,11 @@ namespace orc { std::unique_ptr<SeekableInputStream> stream = stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); if (stream == nullptr) throw ParseError("LENGTH stream not found in StringDirectColumn"); - lengthRle = createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics); - blobStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (blobStream == nullptr) throw ParseError("DATA stream not found in StringDirectColumn"); - lastBuffer = nullptr; - lastBufferLength = 0; + lengthRle_ = createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics); + blobStream_ = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (blobStream_ == nullptr) throw ParseError("DATA stream not found in StringDirectColumn"); + lastBuffer_ = nullptr; + lastBufferLength_ = 0; } StringDirectColumnReader::~StringDirectColumnReader() { @@ -719,25 +719,25 @@ namespace orc { // read the lengths, so we know haw many bytes to skip while (done < numValues) { uint64_t step = std::min(BUFFER_SIZE, static_cast<size_t>(numValues - done)); - lengthRle->next(buffer, step, nullptr); + lengthRle_->next(buffer, step, nullptr); totalBytes += computeSize(buffer, nullptr, step); done += step; } - if (totalBytes <= lastBufferLength) { + if (totalBytes <= lastBufferLength_) { // subtract the needed bytes from the ones left over - lastBufferLength -= totalBytes; - lastBuffer += totalBytes; + lastBufferLength_ -= totalBytes; + lastBuffer_ += totalBytes; } else { // move the stream forward after accounting for the buffered bytes - totalBytes -= lastBufferLength; + totalBytes -= lastBufferLength_; const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max()); while (totalBytes != 0) { size_t step = totalBytes > cap ? cap : totalBytes; - blobStream->Skip(static_cast<int>(step)); + blobStream_->Skip(static_cast<int>(step)); totalBytes -= step; } - lastBufferLength = 0; - lastBuffer = nullptr; + lastBufferLength_ = 0; + lastBuffer_ = nullptr; } return numValues; } @@ -769,7 +769,7 @@ namespace orc { int64_t* lengthPtr = byteBatch.length.data(); // read the length vector - lengthRle->next(lengthPtr, numValues, notNull); + lengthRle_->next(lengthPtr, numValues, notNull); // figure out the total length of data we need from the blob stream const size_t totalLength = computeSize(lengthPtr, notNull, numValues); @@ -779,23 +779,23 @@ namespace orc { size_t bytesBuffered = 0; byteBatch.blob.resize(totalLength); char* ptr = byteBatch.blob.data(); - while (bytesBuffered + lastBufferLength < totalLength) { - memcpy(ptr + bytesBuffered, lastBuffer, lastBufferLength); - bytesBuffered += lastBufferLength; + while (bytesBuffered + lastBufferLength_ < totalLength) { + memcpy(ptr + bytesBuffered, lastBuffer_, lastBufferLength_); + bytesBuffered += lastBufferLength_; const void* readBuffer; int readLength; - if (!blobStream->Next(&readBuffer, &readLength)) { + if (!blobStream_->Next(&readBuffer, &readLength)) { throw ParseError("failed to read in StringDirectColumnReader.next"); } - lastBuffer = static_cast<const char*>(readBuffer); - lastBufferLength = static_cast<size_t>(readLength); + lastBuffer_ = static_cast<const char*>(readBuffer); + lastBufferLength_ = static_cast<size_t>(readLength); } if (bytesBuffered < totalLength) { size_t moreBytes = totalLength - bytesBuffered; - memcpy(ptr + bytesBuffered, lastBuffer, moreBytes); - lastBuffer += moreBytes; - lastBufferLength -= moreBytes; + memcpy(ptr + bytesBuffered, lastBuffer_, moreBytes); + lastBuffer_ += moreBytes; + lastBufferLength_ -= moreBytes; } size_t filledSlots = 0; @@ -820,16 +820,16 @@ namespace orc { void StringDirectColumnReader::seekToRowGroup( std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); - blobStream->seek(positions.at(columnId)); - lengthRle->seek(positions.at(columnId)); + blobStream_->seek(positions.at(columnId)); + lengthRle_->seek(positions.at(columnId)); // clear buffer state after seek - lastBuffer = nullptr; - lastBufferLength = 0; + lastBuffer_ = nullptr; + lastBufferLength_ = 0; } class StructColumnReader : public ColumnReader { private: - std::vector<std::unique_ptr<ColumnReader>> children; + std::vector<std::unique_ptr<ColumnReader>> children_; public: StructColumnReader(const Type& type, StripeStreams& stripe, bool useTightNumericVector = false, @@ -859,7 +859,7 @@ namespace orc { for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) { const Type& child = *type.getSubtype(i); if (selectedColumns[static_cast<uint64_t>(child.getColumnId())]) { - children.push_back( + children_.push_back( buildReader(child, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow)); } } @@ -874,7 +874,7 @@ namespace orc { uint64_t StructColumnReader::skip(uint64_t numValues) { numValues = ColumnReader::skip(numValues); - for (auto& ptr : children) { + for (auto& ptr : children_) { ptr->skip(numValues); } return numValues; @@ -895,7 +895,7 @@ namespace orc { ColumnReader::next(rowBatch, numValues, notNull); uint64_t i = 0; notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - for (auto iter = children.begin(); iter != children.end(); ++iter, ++i) { + for (auto iter = children_.begin(); iter != children_.end(); ++iter, ++i) { if (encoded) { (*iter)->nextEncoded(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), numValues, notNull); @@ -909,15 +909,15 @@ namespace orc { std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); - for (auto& ptr : children) { + for (auto& ptr : children_) { ptr->seekToRowGroup(positions); } } class ListColumnReader : public ColumnReader { private: - std::unique_ptr<ColumnReader> child; - std::unique_ptr<RleDecoder> rle; + std::unique_ptr<ColumnReader> child_; + std::unique_ptr<RleDecoder> rle_; public: ListColumnReader(const Type& type, StripeStreams& stipe, bool useTightNumericVector = false, @@ -947,10 +947,11 @@ namespace orc { std::unique_ptr<SeekableInputStream> stream = stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); if (stream == nullptr) throw ParseError("LENGTH stream not found in List column"); - rle = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics); + rle_ = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics); const Type& childType = *type.getSubtype(0); if (selectedColumns[static_cast<uint64_t>(childType.getColumnId())]) { - child = buildReader(childType, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow); + child_ = + buildReader(childType, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow); } } @@ -960,7 +961,7 @@ namespace orc { uint64_t ListColumnReader::skip(uint64_t numValues) { numValues = ColumnReader::skip(numValues); - ColumnReader* childReader = child.get(); + ColumnReader* childReader = child_.get(); if (childReader) { const uint64_t BUFFER_SIZE = 1024; int64_t buffer[BUFFER_SIZE]; @@ -968,7 +969,7 @@ namespace orc { uint64_t lengthsRead = 0; while (lengthsRead < numValues) { uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); - rle->next(buffer, chunk, nullptr); + rle_->next(buffer, chunk, nullptr); for (size_t i = 0; i < chunk; ++i) { childrenElements += static_cast<size_t>(buffer[i]); } @@ -976,7 +977,7 @@ namespace orc { } childReader->skip(childrenElements); } else { - rle->skip(numValues); + rle_->skip(numValues); } return numValues; } @@ -997,7 +998,7 @@ namespace orc { ListVectorBatch& listBatch = dynamic_cast<ListVectorBatch&>(rowBatch); int64_t* offsets = listBatch.offsets.data(); notNull = listBatch.hasNulls ? listBatch.notNull.data() : nullptr; - rle->next(offsets, numValues, notNull); + rle_->next(offsets, numValues, notNull); uint64_t totalChildren = 0; if (notNull) { for (size_t i = 0; i < numValues; ++i) { @@ -1017,7 +1018,7 @@ namespace orc { } } offsets[numValues] = static_cast<int64_t>(totalChildren); - ColumnReader* childReader = child.get(); + ColumnReader* childReader = child_.get(); if (childReader) { if (encoded) { childReader->nextEncoded(*(listBatch.elements.get()), totalChildren, nullptr); @@ -1029,17 +1030,17 @@ namespace orc { void ListColumnReader::seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - if (child.get()) { - child->seekToRowGroup(positions); + rle_->seek(positions.at(columnId)); + if (child_.get()) { + child_->seekToRowGroup(positions); } } class MapColumnReader : public ColumnReader { private: - std::unique_ptr<ColumnReader> keyReader; - std::unique_ptr<ColumnReader> elementReader; - std::unique_ptr<RleDecoder> rle; + std::unique_ptr<ColumnReader> keyReader_; + std::unique_ptr<ColumnReader> elementReader_; + std::unique_ptr<RleDecoder> rle_; public: MapColumnReader(const Type& type, StripeStreams& stipe, bool useTightNumericVector = false, @@ -1068,15 +1069,15 @@ namespace orc { std::unique_ptr<SeekableInputStream> stream = stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); if (stream == nullptr) throw ParseError("LENGTH stream not found in Map column"); - rle = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics); + rle_ = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics); const Type& keyType = *type.getSubtype(0); if (selectedColumns[static_cast<uint64_t>(keyType.getColumnId())]) { - keyReader = + keyReader_ = buildReader(keyType, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow); } const Type& elementType = *type.getSubtype(1); if (selectedColumns[static_cast<uint64_t>(elementType.getColumnId())]) { - elementReader = + elementReader_ = buildReader(elementType, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow); } } @@ -1087,8 +1088,8 @@ namespace orc { uint64_t MapColumnReader::skip(uint64_t numValues) { numValues = ColumnReader::skip(numValues); - ColumnReader* rawKeyReader = keyReader.get(); - ColumnReader* rawElementReader = elementReader.get(); + ColumnReader* rawKeyReader = keyReader_.get(); + ColumnReader* rawElementReader = elementReader_.get(); if (rawKeyReader || rawElementReader) { const uint64_t BUFFER_SIZE = 1024; int64_t buffer[BUFFER_SIZE]; @@ -1096,7 +1097,7 @@ namespace orc { uint64_t lengthsRead = 0; while (lengthsRead < numValues) { uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); - rle->next(buffer, chunk, nullptr); + rle_->next(buffer, chunk, nullptr); for (size_t i = 0; i < chunk; ++i) { childrenElements += static_cast<size_t>(buffer[i]); } @@ -1109,7 +1110,7 @@ namespace orc { rawElementReader->skip(childrenElements); } } else { - rle->skip(numValues); + rle_->skip(numValues); } return numValues; } @@ -1130,7 +1131,7 @@ namespace orc { MapVectorBatch& mapBatch = dynamic_cast<MapVectorBatch&>(rowBatch); int64_t* offsets = mapBatch.offsets.data(); notNull = mapBatch.hasNulls ? mapBatch.notNull.data() : nullptr; - rle->next(offsets, numValues, notNull); + rle_->next(offsets, numValues, notNull); uint64_t totalChildren = 0; if (notNull) { for (size_t i = 0; i < numValues; ++i) { @@ -1150,7 +1151,7 @@ namespace orc { } } offsets[numValues] = static_cast<int64_t>(totalChildren); - ColumnReader* rawKeyReader = keyReader.get(); + ColumnReader* rawKeyReader = keyReader_.get(); if (rawKeyReader) { if (encoded) { rawKeyReader->nextEncoded(*(mapBatch.keys.get()), totalChildren, nullptr); @@ -1158,7 +1159,7 @@ namespace orc { rawKeyReader->next(*(mapBatch.keys.get()), totalChildren, nullptr); } } - ColumnReader* rawElementReader = elementReader.get(); + ColumnReader* rawElementReader = elementReader_.get(); if (rawElementReader) { if (encoded) { rawElementReader->nextEncoded(*(mapBatch.elements.get()), totalChildren, nullptr); @@ -1170,21 +1171,21 @@ namespace orc { void MapColumnReader::seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - if (keyReader.get()) { - keyReader->seekToRowGroup(positions); + rle_->seek(positions.at(columnId)); + if (keyReader_.get()) { + keyReader_->seekToRowGroup(positions); } - if (elementReader.get()) { - elementReader->seekToRowGroup(positions); + if (elementReader_.get()) { + elementReader_->seekToRowGroup(positions); } } class UnionColumnReader : public ColumnReader { private: - std::unique_ptr<ByteRleDecoder> rle; - std::vector<std::unique_ptr<ColumnReader>> childrenReader; - std::vector<int64_t> childrenCounts; - uint64_t numChildren; + std::unique_ptr<ByteRleDecoder> rle_; + std::vector<std::unique_ptr<ColumnReader>> childrenReader_; + std::vector<int64_t> childrenCounts_; + uint64_t numChildren_; public: UnionColumnReader(const Type& type, StripeStreams& stipe, bool useTightNumericVector = false, @@ -1207,20 +1208,20 @@ namespace orc { bool useTightNumericVector, bool throwOnSchemaEvolutionOverflow) : ColumnReader(type, stripe) { - numChildren = type.getSubtypeCount(); - childrenReader.resize(numChildren); - childrenCounts.resize(numChildren); + numChildren_ = type.getSubtypeCount(); + childrenReader_.resize(numChildren_); + childrenCounts_.resize(numChildren_); std::unique_ptr<SeekableInputStream> stream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); if (stream == nullptr) throw ParseError("LENGTH stream not found in Union column"); - rle = createByteRleDecoder(std::move(stream), metrics); + rle_ = createByteRleDecoder(std::move(stream), metrics); // figure out which types are selected const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); - for (unsigned int i = 0; i < numChildren; ++i) { + for (unsigned int i = 0; i < numChildren_; ++i) { const Type& child = *type.getSubtype(i); if (selectedColumns[static_cast<size_t>(child.getColumnId())]) { - childrenReader[i] = + childrenReader_[i] = buildReader(child, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow); } } @@ -1231,19 +1232,19 @@ namespace orc { const uint64_t BUFFER_SIZE = 1024; char buffer[BUFFER_SIZE]; uint64_t lengthsRead = 0; - int64_t* counts = childrenCounts.data(); - memset(counts, 0, sizeof(int64_t) * numChildren); + int64_t* counts = childrenCounts_.data(); + memset(counts, 0, sizeof(int64_t) * numChildren_); while (lengthsRead < numValues) { uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); - rle->next(buffer, chunk, nullptr); + rle_->next(buffer, chunk, nullptr); for (size_t i = 0; i < chunk; ++i) { counts[static_cast<size_t>(buffer[i])] += 1; } lengthsRead += chunk; } - for (size_t i = 0; i < numChildren; ++i) { - if (counts[i] != 0 && childrenReader[i] != nullptr) { - childrenReader[i]->skip(static_cast<uint64_t>(counts[i])); + for (size_t i = 0; i < numChildren_; ++i) { + if (counts[i] != 0 && childrenReader_[i] != nullptr) { + childrenReader_[i]->skip(static_cast<uint64_t>(counts[i])); } } return numValues; @@ -1264,11 +1265,11 @@ namespace orc { ColumnReader::next(rowBatch, numValues, notNull); UnionVectorBatch& unionBatch = dynamic_cast<UnionVectorBatch&>(rowBatch); uint64_t* offsets = unionBatch.offsets.data(); - int64_t* counts = childrenCounts.data(); - memset(counts, 0, sizeof(int64_t) * numChildren); + int64_t* counts = childrenCounts_.data(); + memset(counts, 0, sizeof(int64_t) * numChildren_); unsigned char* tags = unionBatch.tags.data(); notNull = unionBatch.hasNulls ? unionBatch.notNull.data() : nullptr; - rle->next(reinterpret_cast<char*>(tags), numValues, notNull); + rle_->next(reinterpret_cast<char*>(tags), numValues, notNull); // set the offsets for each row if (notNull) { for (size_t i = 0; i < numValues; ++i) { @@ -1282,14 +1283,14 @@ namespace orc { } } // read the right number of each child column - for (size_t i = 0; i < numChildren; ++i) { - if (childrenReader[i] != nullptr) { + for (size_t i = 0; i < numChildren_; ++i) { + if (childrenReader_[i] != nullptr) { if (encoded) { - childrenReader[i]->nextEncoded(*(unionBatch.children[i]), - static_cast<uint64_t>(counts[i]), nullptr); + childrenReader_[i]->nextEncoded(*(unionBatch.children[i]), + static_cast<uint64_t>(counts[i]), nullptr); } else { - childrenReader[i]->next(*(unionBatch.children[i]), static_cast<uint64_t>(counts[i]), - nullptr); + childrenReader_[i]->next(*(unionBatch.children[i]), static_cast<uint64_t>(counts[i]), + nullptr); } } } @@ -1298,10 +1299,10 @@ namespace orc { void UnionColumnReader::seekToRowGroup( std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - for (size_t i = 0; i < numChildren; ++i) { - if (childrenReader[i] != nullptr) { - childrenReader[i]->seekToRowGroup(positions); + rle_->seek(positions.at(columnId)); + for (size_t i = 0; i < numChildren_; ++i) { + if (childrenReader_[i] != nullptr) { + childrenReader_[i]->seekToRowGroup(positions); } } } @@ -1599,8 +1600,8 @@ namespace orc { class DecimalHive11ColumnReader : public Decimal64ColumnReader { private: - bool throwOnOverflow; - std::ostream* errorStream; + bool throwOnOverflow_; + std::ostream* errorStream_; /** * Read an Int128 from the stream and correct it to the desired scale. @@ -1649,8 +1650,8 @@ namespace orc { DecimalHive11ColumnReader::DecimalHive11ColumnReader(const Type& type, StripeStreams& stripe) : Decimal64ColumnReader(type, stripe) { scale = stripe.getForcedScaleOnHive11Decimal(); - throwOnOverflow = stripe.getThrowOnHive11DecimalOverflow(); - errorStream = stripe.getErrorStream(); + throwOnOverflow_ = stripe.getThrowOnHive11DecimalOverflow(); + errorStream_ = stripe.getErrorStream(); } DecimalHive11ColumnReader::~DecimalHive11ColumnReader() { @@ -1674,12 +1675,12 @@ namespace orc { for (size_t i = 0; i < numValues; ++i) { if (notNull[i]) { if (!readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]))) { - if (throwOnOverflow) { + if (throwOnOverflow_) { throw ParseError("Hive 0.11 decimal was more than 38 digits."); } else { - *errorStream << "Warning: " - << "Hive 0.11 decimal with more than 38 digits " - << "replaced by NULL.\n"; + *errorStream_ << "Warning: " + << "Hive 0.11 decimal with more than 38 digits " + << "replaced by NULL.\n"; notNull[i] = false; } } @@ -1688,12 +1689,12 @@ namespace orc { } else { for (size_t i = 0; i < numValues; ++i) { if (!readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]))) { - if (throwOnOverflow) { + if (throwOnOverflow_) { throw ParseError("Hive 0.11 decimal was more than 38 digits."); } else { - *errorStream << "Warning: " - << "Hive 0.11 decimal with more than 38 digits " - << "replaced by NULL.\n"; + *errorStream_ << "Warning: " + << "Hive 0.11 decimal with more than 38 digits " + << "replaced by NULL.\n"; batch.hasNulls = true; batch.notNull[i] = false; } diff --git a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc index f24be1f0b2..d31b1c65d4 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc +++ b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc @@ -24,6 +24,7 @@ #include "RLE.hh" #include "Statistics.hh" #include "Timezone.hh" +#include "Utils.hh" namespace orc { StreamsFactory::~StreamsFactory() { @@ -33,24 +34,25 @@ namespace orc { class StreamsFactoryImpl : public StreamsFactory { public: StreamsFactoryImpl(const WriterOptions& writerOptions, OutputStream* outputStream) - : options(writerOptions), outStream(outputStream) {} + : options_(writerOptions), outStream_(outputStream) {} virtual std::unique_ptr<BufferedOutputStream> createStream( proto::Stream_Kind kind) const override; private: - const WriterOptions& options; - OutputStream* outStream; + const WriterOptions& options_; + OutputStream* outStream_; }; std::unique_ptr<BufferedOutputStream> StreamsFactoryImpl::createStream(proto::Stream_Kind) const { // In the future, we can decide compression strategy and modifier // based on stream kind. But for now we just use the setting from // WriterOption - return createCompressor(options.getCompression(), outStream, options.getCompressionStrategy(), - // BufferedOutputStream initial capacity - options.getOutputBufferCapacity(), options.getCompressionBlockSize(), - *options.getMemoryPool(), options.getWriterMetrics()); + return createCompressor( + options_.getCompression(), outStream_, options_.getCompressionStrategy(), + // BufferedOutputStream initial capacity + options_.getOutputBufferCapacity(), options_.getCompressionBlockSize(), + options_.getMemoryBlockSize(), *options_.getMemoryPool(), options_.getWriterMetrics()); } std::unique_ptr<StreamsFactory> createStreamsFactory(const WriterOptions& options, @@ -252,6 +254,10 @@ namespace orc { // PASS } + void ColumnWriter::finishStreams() { + notNullEncoder->finishEncode(); + } + class StructColumnWriter : public ColumnWriter { public: StructColumnWriter(const Type& type, const StreamsFactory& factory, @@ -281,8 +287,10 @@ namespace orc { virtual void reset() override; + virtual void finishStreams() override; + private: - std::vector<std::unique_ptr<ColumnWriter>> children; + std::vector<std::unique_ptr<ColumnWriter>> children_; }; StructColumnWriter::StructColumnWriter(const Type& type, const StreamsFactory& factory, @@ -290,7 +298,7 @@ namespace orc { : ColumnWriter(type, factory, options) { for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) { const Type& child = *type.getSubtype(i); - children.push_back(buildWriter(child, factory, options)); + children_.push_back(buildWriter(child, factory, options)); } if (enableIndex) { @@ -307,8 +315,8 @@ namespace orc { ColumnWriter::add(rowBatch, offset, numValues, incomingMask); const char* notNull = structBatch->hasNulls ? structBatch->notNull.data() + offset : nullptr; - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->add(*structBatch->fields[i], offset, numValues, notNull); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->add(*structBatch->fields[i], offset, numValues, notNull); } // update stats @@ -330,22 +338,22 @@ namespace orc { void StructColumnWriter::flush(std::vector<proto::Stream>& streams) { ColumnWriter::flush(streams); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->flush(streams); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->flush(streams); } } void StructColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const { ColumnWriter::writeIndex(streams); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->writeIndex(streams); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->writeIndex(streams); } } uint64_t StructColumnWriter::getEstimatedSize() const { uint64_t size = ColumnWriter::getEstimatedSize(); - for (uint32_t i = 0; i < children.size(); ++i) { - size += children[i]->getEstimatedSize(); + for (uint32_t i = 0; i < children_.size(); ++i) { + size += children_[i]->getEstimatedSize(); } return size; } @@ -355,62 +363,69 @@ namespace orc { encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); encoding.set_dictionary_size(0); encodings.push_back(encoding); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getColumnEncoding(encodings); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->getColumnEncoding(encodings); } } void StructColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getStripeStatistics(stats); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getStripeStatistics(stats); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->getStripeStatistics(stats); } } void StructColumnWriter::mergeStripeStatsIntoFileStats() { ColumnWriter::mergeStripeStatsIntoFileStats(); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->mergeStripeStatsIntoFileStats(); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->mergeStripeStatsIntoFileStats(); } } void StructColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getFileStatistics(stats); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getFileStatistics(stats); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->getFileStatistics(stats); } } void StructColumnWriter::mergeRowGroupStatsIntoStripeStats() { ColumnWriter::mergeRowGroupStatsIntoStripeStats(); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->mergeRowGroupStatsIntoStripeStats(); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->mergeRowGroupStatsIntoStripeStats(); } } void StructColumnWriter::createRowIndexEntry() { ColumnWriter::createRowIndexEntry(); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->createRowIndexEntry(); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->createRowIndexEntry(); } } void StructColumnWriter::reset() { ColumnWriter::reset(); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->reset(); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->reset(); } } void StructColumnWriter::writeDictionary() { - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->writeDictionary(); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->writeDictionary(); + } + } + + void StructColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->finishStreams(); } } @@ -431,21 +446,23 @@ namespace orc { virtual void recordPosition() const override; + virtual void finishStreams() override; + protected: std::unique_ptr<RleEncoder> rleEncoder; private: - RleVersion rleVersion; + RleVersion rleVersion_; }; template <typename BatchType> IntegerColumnWriter<BatchType>::IntegerColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options) - : ColumnWriter(type, factory, options), rleVersion(options.getRleVersion()) { + : ColumnWriter(type, factory, options), rleVersion_(options.getRleVersion()) { std::unique_ptr<BufferedOutputStream> dataStream = factory.createStream(proto::Stream_Kind_DATA); - rleEncoder = createRleEncoder(std::move(dataStream), true, rleVersion, memPool, + rleEncoder = createRleEncoder(std::move(dataStream), true, rleVersion_, memPool, options.getAlignedBitpacking()); if (enableIndex) { @@ -512,7 +529,7 @@ namespace orc { void IntegerColumnWriter<BatchType>::getColumnEncoding( std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; - encoding.set_kind(RleVersionMapper(rleVersion)); + encoding.set_kind(RleVersionMapper(rleVersion_)); encoding.set_dictionary_size(0); if (enableBloomFilter) { encoding.set_bloom_encoding(BloomFilterVersion::UTF8); @@ -527,6 +544,12 @@ namespace orc { } template <typename BatchType> + void IntegerColumnWriter<BatchType>::finishStreams() { + ColumnWriter::finishStreams(); + rleEncoder->finishEncode(); + } + + template <typename BatchType> class ByteColumnWriter : public ColumnWriter { public: ByteColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); @@ -542,8 +565,10 @@ namespace orc { virtual void recordPosition() const override; + virtual void finishStreams() override; + private: - std::unique_ptr<ByteRleEncoder> byteRleEncoder; + std::unique_ptr<ByteRleEncoder> byteRleEncoder_; }; template <typename BatchType> @@ -552,7 +577,7 @@ namespace orc { : ColumnWriter(type, factory, options) { std::unique_ptr<BufferedOutputStream> dataStream = factory.createStream(proto::Stream_Kind_DATA); - byteRleEncoder = createByteRleEncoder(std::move(dataStream)); + byteRleEncoder_ = createByteRleEncoder(std::move(dataStream)); if (enableIndex) { recordPosition(); @@ -581,7 +606,7 @@ namespace orc { for (uint64_t i = 0; i < numValues; ++i) { byteData[i] = static_cast<char>(data[i]); } - byteRleEncoder->add(byteData, numValues, notNull); + byteRleEncoder_->add(byteData, numValues, notNull); uint64_t count = 0; for (uint64_t i = 0; i < numValues; ++i) { @@ -590,7 +615,7 @@ namespace orc { if (enableBloomFilter) { bloomFilter->addLong(data[i]); } - intStats->update(static_cast<int64_t>(byteData[i]), 1); + intStats->update(static_cast<int64_t>(static_cast<signed char>(byteData[i])), 1); } } intStats->increase(count); @@ -606,14 +631,14 @@ namespace orc { proto::Stream stream; stream.set_kind(proto::Stream_Kind_DATA); stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(byteRleEncoder->flush()); + stream.set_length(byteRleEncoder_->flush()); streams.push_back(stream); } template <typename BatchType> uint64_t ByteColumnWriter<BatchType>::getEstimatedSize() const { uint64_t size = ColumnWriter::getEstimatedSize(); - size += byteRleEncoder->getBufferSize(); + size += byteRleEncoder_->getBufferSize(); return size; } @@ -632,7 +657,13 @@ namespace orc { template <typename BatchType> void ByteColumnWriter<BatchType>::recordPosition() const { ColumnWriter::recordPosition(); - byteRleEncoder->recordPosition(rowIndexPosition.get()); + byteRleEncoder_->recordPosition(rowIndexPosition.get()); + } + + template <typename BatchType> + void ByteColumnWriter<BatchType>::finishStreams() { + ColumnWriter::finishStreams(); + byteRleEncoder_->finishEncode(); } template <typename BatchType> @@ -652,8 +683,10 @@ namespace orc { virtual void recordPosition() const override; + virtual void finishStreams() override; + private: - std::unique_ptr<ByteRleEncoder> rleEncoder; + std::unique_ptr<ByteRleEncoder> rleEncoder_; }; template <typename BatchType> @@ -663,7 +696,7 @@ namespace orc { : ColumnWriter(type, factory, options) { std::unique_ptr<BufferedOutputStream> dataStream = factory.createStream(proto::Stream_Kind_DATA); - rleEncoder = createBooleanRleEncoder(std::move(dataStream)); + rleEncoder_ = createBooleanRleEncoder(std::move(dataStream)); if (enableIndex) { recordPosition(); @@ -694,7 +727,7 @@ namespace orc { for (uint64_t i = 0; i < numValues; ++i) { byteData[i] = static_cast<char>(data[i]); } - rleEncoder->add(byteData, numValues, notNull); + rleEncoder_->add(byteData, numValues, notNull); uint64_t count = 0; for (uint64_t i = 0; i < numValues; ++i) { @@ -719,14 +752,14 @@ namespace orc { proto::Stream stream; stream.set_kind(proto::Stream_Kind_DATA); stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(rleEncoder->flush()); + stream.set_length(rleEncoder_->flush()); streams.push_back(stream); } template <typename BatchType> uint64_t BooleanColumnWriter<BatchType>::getEstimatedSize() const { uint64_t size = ColumnWriter::getEstimatedSize(); - size += rleEncoder->getBufferSize(); + size += rleEncoder_->getBufferSize(); return size; } @@ -745,7 +778,13 @@ namespace orc { template <typename BatchType> void BooleanColumnWriter<BatchType>::recordPosition() const { ColumnWriter::recordPosition(); - rleEncoder->recordPosition(rowIndexPosition.get()); + rleEncoder_->recordPosition(rowIndexPosition.get()); + } + + template <typename BatchType> + void BooleanColumnWriter<BatchType>::finishStreams() { + ColumnWriter::finishStreams(); + rleEncoder_->finishEncode(); } template <typename ValueType, typename BatchType> @@ -765,10 +804,12 @@ namespace orc { virtual void recordPosition() const override; + virtual void finishStreams() override; + private: - bool isFloat; - std::unique_ptr<AppendOnlyBufferedStream> dataStream; - DataBuffer<char> buffer; + bool isFloat_; + std::unique_ptr<AppendOnlyBufferedStream> dataStream_; + DataBuffer<char> buffer_; }; template <typename ValueType, typename BatchType> @@ -777,10 +818,10 @@ namespace orc { const WriterOptions& options, bool isFloatType) : ColumnWriter(type, factory, options), - isFloat(isFloatType), - buffer(*options.getMemoryPool()) { - dataStream.reset(new AppendOnlyBufferedStream(factory.createStream(proto::Stream_Kind_DATA))); - buffer.resize(isFloat ? 4 : 8); + isFloat_(isFloatType), + buffer_(*options.getMemoryPool()) { + dataStream_.reset(new AppendOnlyBufferedStream(factory.createStream(proto::Stream_Kind_DATA))); + buffer_.resize(isFloat_ ? 4 : 8); if (enableIndex) { recordPosition(); @@ -816,17 +857,17 @@ namespace orc { const ValueType* doubleData = dblBatch->data.data() + offset; const char* notNull = dblBatch->hasNulls ? dblBatch->notNull.data() + offset : nullptr; - size_t bytes = isFloat ? 4 : 8; - char* data = buffer.data(); + size_t bytes = isFloat_ ? 4 : 8; + char* data = buffer_.data(); uint64_t count = 0; for (uint64_t i = 0; i < numValues; ++i) { if (!notNull || notNull[i]) { - if (isFloat) { + if (isFloat_) { encodeFloatNum<float, int32_t>(static_cast<float>(doubleData[i]), data); } else { encodeFloatNum<double, int64_t>(static_cast<double>(doubleData[i]), data); } - dataStream->write(data, bytes); + dataStream_->write(data, bytes); ++count; if (enableBloomFilter) { bloomFilter->addDouble(static_cast<double>(doubleData[i])); @@ -847,14 +888,14 @@ namespace orc { proto::Stream stream; stream.set_kind(proto::Stream_Kind_DATA); stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(dataStream->flush()); + stream.set_length(dataStream_->flush()); streams.push_back(stream); } template <typename ValueType, typename BatchType> uint64_t FloatingColumnWriter<ValueType, BatchType>::getEstimatedSize() const { uint64_t size = ColumnWriter::getEstimatedSize(); - size += dataStream->getSize(); + size += dataStream_->getSize(); return size; } @@ -873,7 +914,13 @@ namespace orc { template <typename ValueType, typename BatchType> void FloatingColumnWriter<ValueType, BatchType>::recordPosition() const { ColumnWriter::recordPosition(); - dataStream->recordPosition(rowIndexPosition.get()); + dataStream_->recordPosition(rowIndexPosition.get()); + } + + template <typename ValueType, typename BatchType> + void FloatingColumnWriter<ValueType, BatchType>::finishStreams() { + ColumnWriter::finishStreams(); + dataStream_->finishStream(); } /** @@ -887,10 +934,17 @@ namespace orc { size_t length; }; - SortedStringDictionary() : totalLength(0) {} + struct DictEntryWithIndex { + DictEntryWithIndex(const char* str, size_t len, size_t index) + : entry(str, len), index(index) {} + DictEntry entry; + size_t index; + }; + + SortedStringDictionary() : totalLength_(0) {} // insert a new string into dictionary, return its insertion order - size_t insert(const char* data, size_t len); + size_t insert(const char* str, size_t len); // write dictionary data & length to output buffer void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const; @@ -911,7 +965,9 @@ namespace orc { private: struct LessThan { - bool operator()(const DictEntry& left, const DictEntry& right) const { + bool operator()(const DictEntryWithIndex& l, const DictEntryWithIndex& r) { + const auto& left = l.entry; + const auto& right = r.entry; int ret = memcmp(left.data, right.data, std::min(left.length, right.length)); if (ret != 0) { return ret < 0; @@ -920,29 +976,25 @@ namespace orc { } }; - std::map<DictEntry, size_t, LessThan> dict; - std::vector<std::vector<char>> data; - uint64_t totalLength; + mutable std::vector<DictEntryWithIndex> flatDict_; + std::unordered_map<std::string, size_t> keyToIndex_; + uint64_t totalLength_; // use friend class here to avoid being bothered by const function calls friend class StringColumnWriter; friend class CharColumnWriter; friend class VarCharColumnWriter; // store indexes of insertion order in the dictionary for not-null rows - std::vector<int64_t> idxInDictBuffer; + std::vector<int64_t> idxInDictBuffer_; }; // insert a new string into dictionary, return its insertion order size_t SortedStringDictionary::insert(const char* str, size_t len) { - auto ret = dict.insert({DictEntry(str, len), dict.size()}); + size_t index = flatDict_.size(); + auto ret = keyToIndex_.emplace(std::string(str, len), index); if (ret.second) { - // make a copy to internal storage - data.push_back(std::vector<char>(len)); - memcpy(data.back().data(), str, len); - // update dictionary entry to link pointer to internal storage - DictEntry* entry = const_cast<DictEntry*>(&(ret.first->first)); - entry->data = data.back().data(); - totalLength += len; + flatDict_.emplace_back(ret.first->first.data(), ret.first->first.size(), index); + totalLength_ += len; } return ret.first->second; } @@ -950,9 +1002,12 @@ namespace orc { // write dictionary data & length to output buffer void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const { - for (auto it = dict.cbegin(); it != dict.cend(); ++it) { - dataStream->write(it->first.data, it->first.length); - lengthEncoder->write(static_cast<int64_t>(it->first.length)); + std::sort(flatDict_.begin(), flatDict_.end(), LessThan()); + + for (const auto& entryWithIndex : flatDict_) { + const auto& entry = entryWithIndex.entry; + dataStream->write(entry.data, entry.length); + lengthEncoder->write(static_cast<int64_t>(entry.length)); } } @@ -968,10 +1023,9 @@ namespace orc { */ void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const { // iterate the dictionary to get mapping from insertion order to value order - std::vector<size_t> mapping(dict.size()); - size_t dictIdx = 0; - for (auto it = dict.cbegin(); it != dict.cend(); ++it) { - mapping[it->second] = dictIdx++; + std::vector<size_t> mapping(flatDict_.size()); + for (size_t i = 0; i < flatDict_.size(); ++i) { + mapping[flatDict_[i].index] = i; } // do the transformation @@ -983,26 +1037,31 @@ namespace orc { // get dict entries in insertion order void SortedStringDictionary::getEntriesInInsertionOrder( std::vector<const DictEntry*>& entries) const { - entries.resize(dict.size()); - for (auto it = dict.cbegin(); it != dict.cend(); ++it) { - entries[it->second] = &(it->first); + std::sort(flatDict_.begin(), flatDict_.end(), + [](const DictEntryWithIndex& left, const DictEntryWithIndex& right) { + return left.index < right.index; + }); + + entries.resize(flatDict_.size()); + for (size_t i = 0; i < flatDict_.size(); ++i) { + entries[i] = &(flatDict_[i].entry); } } // return count of entries size_t SortedStringDictionary::size() const { - return dict.size(); + return flatDict_.size(); } // return total length of strings in the dictioanry uint64_t SortedStringDictionary::length() const { - return totalLength; + return totalLength_; } void SortedStringDictionary::clear() { - totalLength = 0; - data.clear(); - dict.clear(); + totalLength_ = 0; + keyToIndex_.clear(); + flatDict_.clear(); } class StringColumnWriter : public ColumnWriter { @@ -1027,6 +1086,8 @@ namespace orc { virtual void reset() override; + virtual void finishStreams() override; + private: /** * dictionary related functions @@ -1123,7 +1184,7 @@ namespace orc { const size_t len = static_cast<size_t>(length[i]); if (useDictionary) { size_t index = dictionary.insert(data[i], len); - dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index)); + dictionary.idxInDictBuffer_.push_back(static_cast<int64_t>(index)); } else { directDataStream->write(data[i], len); } @@ -1184,7 +1245,7 @@ namespace orc { } else { size += dictionary.length(); size += dictionary.size() * sizeof(int32_t); - size += dictionary.idxInDictBuffer.size() * sizeof(int32_t); + size += dictionary.idxInDictBuffer_.size() * sizeof(int32_t); if (useCompression) { size /= 3; // estimated ratio is 3:1 } @@ -1215,15 +1276,23 @@ namespace orc { directLengthEncoder->recordPosition(rowIndexPosition.get()); } else { if (enableIndex) { - startOfRowGroups.push_back(dictionary.idxInDictBuffer.size()); + startOfRowGroups.push_back(dictionary.idxInDictBuffer_.size()); } } } + void StringColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + if (!useDictionary) { + directDataStream->finishStream(); + directLengthEncoder->finishEncode(); + } + } + bool StringColumnWriter::checkDictionaryKeyRatio() { if (!doneDictionaryCheck) { useDictionary = dictionary.size() <= - static_cast<size_t>(static_cast<double>(dictionary.idxInDictBuffer.size()) * + static_cast<size_t>(static_cast<double>(dictionary.idxInDictBuffer_.size()) * dictSizeThreshold); doneDictionaryCheck = true; } @@ -1244,7 +1313,7 @@ namespace orc { ColumnWriter::reset(); dictionary.clear(); - dictionary.idxInDictBuffer.resize(0); + dictionary.idxInDictBuffer_.resize(0); startOfRowGroups.clear(); startOfRowGroups.push_back(0); } @@ -1277,7 +1346,7 @@ namespace orc { dictStream.reset(nullptr); dictionary.clear(); - dictionary.idxInDictBuffer.clear(); + dictionary.idxInDictBuffer_.clear(); startOfRowGroups.clear(); } @@ -1295,10 +1364,10 @@ namespace orc { dictionary.flush(dictStream.get(), dictLengthEncoder.get()); // convert index from insertion order to dictionary order - dictionary.reorder(dictionary.idxInDictBuffer); + dictionary.reorder(dictionary.idxInDictBuffer_); // write data sequences - int64_t* data = dictionary.idxInDictBuffer.data(); + int64_t* data = dictionary.idxInDictBuffer_.data(); if (enableIndex) { size_t prevOffset = 0; for (size_t i = 0; i < startOfRowGroups.size(); ++i) { @@ -1319,10 +1388,10 @@ namespace orc { prevOffset = offset; } - dictDataEncoder->add(data + prevOffset, dictionary.idxInDictBuffer.size() - prevOffset, + dictDataEncoder->add(data + prevOffset, dictionary.idxInDictBuffer_.size() - prevOffset, nullptr); } else { - dictDataEncoder->add(data, dictionary.idxInDictBuffer.size(), nullptr); + dictDataEncoder->add(data, dictionary.idxInDictBuffer_.size(), nullptr); } } } @@ -1345,9 +1414,9 @@ namespace orc { // store each length of the data into a vector const SortedStringDictionary::DictEntry* dictEntry = nullptr; - for (uint64_t i = 0; i != dictionary.idxInDictBuffer.size(); ++i) { + for (uint64_t i = 0; i != dictionary.idxInDictBuffer_.size(); ++i) { // write one row data in direct encoding - dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer[i])]; + dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer_[i])]; directDataStream->write(dictEntry->data, dictEntry->length); directLengthEncoder->write(static_cast<int64_t>(dictEntry->length)); } @@ -1355,91 +1424,22 @@ namespace orc { deleteDictStreams(); } - struct Utf8Utils { - /** - * Counts how many utf-8 chars of the input data - */ - static uint64_t charLength(const char* data, uint64_t length) { - uint64_t chars = 0; - for (uint64_t i = 0; i < length; i++) { - if (isUtfStartByte(data[i])) { - chars++; - } - } - return chars; - } - - /** - * Return the number of bytes required to read at most maxCharLength - * characters in full from a utf-8 encoded byte array provided - * by data. This does not validate utf-8 data, but - * operates correctly on already valid utf-8 data. - * - * @param maxCharLength number of characters required - * @param data the bytes of UTF-8 - * @param length the length of data to truncate - */ - static uint64_t truncateBytesTo(uint64_t maxCharLength, const char* data, uint64_t length) { - uint64_t chars = 0; - if (length <= maxCharLength) { - return length; - } - for (uint64_t i = 0; i < length; i++) { - if (isUtfStartByte(data[i])) { - chars++; - } - if (chars > maxCharLength) { - return i; - } - } - // everything fits - return length; - } - - /** - * Checks if b is the first byte of a UTF-8 character. - */ - inline static bool isUtfStartByte(char b) { - return (b & 0xC0) != 0x80; - } - - /** - * Find the start of the last character that ends in the current string. - * @param text the bytes of the utf-8 - * @param from the first byte location - * @param until the last byte location - * @return the index of the last character - */ - static uint64_t findLastCharacter(const char* text, uint64_t from, uint64_t until) { - uint64_t posn = until; - /* we don't expect characters more than 5 bytes */ - while (posn >= from) { - if (isUtfStartByte(text[posn])) { - return posn; - } - posn -= 1; - } - /* beginning of a valid char not found */ - throw std::logic_error("Could not truncate string, beginning of a valid char not found"); - } - }; - class CharColumnWriter : public StringColumnWriter { public: CharColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options) : StringColumnWriter(type, factory, options), - maxLength(type.getMaximumLength()), - padBuffer(*options.getMemoryPool()) { + maxLength_(type.getMaximumLength()), + padBuffer_(*options.getMemoryPool()) { // utf-8 is currently 4 bytes long, but it could be up to 6 - padBuffer.resize(maxLength * 6); + padBuffer_.resize(maxLength_ * 6); } virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) override; private: - uint64_t maxLength; - DataBuffer<char> padBuffer; + uint64_t maxLength_; + DataBuffer<char> padBuffer_; }; void CharColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, @@ -1467,22 +1467,22 @@ namespace orc { const char* charData = nullptr; uint64_t originLength = static_cast<uint64_t>(length[i]); uint64_t charLength = Utf8Utils::charLength(data[i], originLength); - if (charLength >= maxLength) { + if (charLength >= maxLength_) { charData = data[i]; length[i] = - static_cast<int64_t>(Utf8Utils::truncateBytesTo(maxLength, data[i], originLength)); + static_cast<int64_t>(Utf8Utils::truncateBytesTo(maxLength_, data[i], originLength)); } else { - charData = padBuffer.data(); + charData = padBuffer_.data(); // the padding is exactly 1 byte per char - length[i] = length[i] + static_cast<int64_t>(maxLength - charLength); - memcpy(padBuffer.data(), data[i], originLength); - memset(padBuffer.data() + originLength, ' ', + length[i] = length[i] + static_cast<int64_t>(maxLength_ - charLength); + memcpy(padBuffer_.data(), data[i], originLength); + memset(padBuffer_.data() + originLength, ' ', static_cast<size_t>(length[i]) - originLength); } if (useDictionary) { size_t index = dictionary.insert(charData, static_cast<size_t>(length[i])); - dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index)); + dictionary.idxInDictBuffer_.push_back(static_cast<int64_t>(index)); } else { directDataStream->write(charData, static_cast<size_t>(length[i])); } @@ -1509,7 +1509,7 @@ namespace orc { public: VarCharColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options) - : StringColumnWriter(type, factory, options), maxLength(type.getMaximumLength()) { + : StringColumnWriter(type, factory, options), maxLength_(type.getMaximumLength()) { // PASS } @@ -1517,7 +1517,7 @@ namespace orc { const char* incomingMask) override; private: - uint64_t maxLength; + uint64_t maxLength_; }; void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, @@ -1543,12 +1543,12 @@ namespace orc { for (uint64_t i = 0; i < numValues; ++i) { if (!notNull || notNull[i]) { uint64_t itemLength = - Utf8Utils::truncateBytesTo(maxLength, data[i], static_cast<uint64_t>(length[i])); + Utf8Utils::truncateBytesTo(maxLength_, data[i], static_cast<uint64_t>(length[i])); length[i] = static_cast<int64_t>(itemLength); if (useDictionary) { size_t index = dictionary.insert(data[i], static_cast<size_t>(length[i])); - dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index)); + dictionary.idxInDictBuffer_.push_back(static_cast<int64_t>(index)); } else { directDataStream->write(data[i], static_cast<size_t>(length[i])); } @@ -1638,28 +1638,30 @@ namespace orc { virtual void recordPosition() const override; + virtual void finishStreams() override; + protected: std::unique_ptr<RleEncoder> secRleEncoder, nanoRleEncoder; private: - RleVersion rleVersion; - const Timezone* timezone; - const bool isUTC; + RleVersion rleVersion_; + const Timezone* timezone_; + const bool isUTC_; }; TimestampColumnWriter::TimestampColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options, bool isInstantType) : ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()), - timezone(isInstantType ? &getTimezoneByName("GMT") : &options.getTimezone()), - isUTC(isInstantType || options.getTimezoneName() == "GMT") { + rleVersion_(options.getRleVersion()), + timezone_(isInstantType ? &getTimezoneByName("GMT") : &options.getTimezone()), + isUTC_(isInstantType || options.getTimezoneName() == "GMT") { std::unique_ptr<BufferedOutputStream> dataStream = factory.createStream(proto::Stream_Kind_DATA); std::unique_ptr<BufferedOutputStream> secondaryStream = factory.createStream(proto::Stream_Kind_SECONDARY); - secRleEncoder = createRleEncoder(std::move(dataStream), true, rleVersion, memPool, + secRleEncoder = createRleEncoder(std::move(dataStream), true, rleVersion_, memPool, options.getAlignedBitpacking()); - nanoRleEncoder = createRleEncoder(std::move(secondaryStream), false, rleVersion, memPool, + nanoRleEncoder = createRleEncoder(std::move(secondaryStream), false, rleVersion_, memPool, options.getAlignedBitpacking()); if (enableIndex) { @@ -1712,8 +1714,8 @@ namespace orc { if (notNull == nullptr || notNull[i]) { // TimestampVectorBatch already stores data in UTC int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000; - if (!isUTC) { - millsUTC = timezone->convertToUTC(secs[i]) * 1000 + nanos[i] / 1000000; + if (!isUTC_) { + millsUTC = timezone_->convertToUTC(secs[i]) * 1000 + nanos[i] / 1000000; } ++count; if (enableBloomFilter) { @@ -1725,7 +1727,7 @@ namespace orc { secs[i] += 1; } - secs[i] -= timezone->getEpoch(); + secs[i] -= timezone_->getEpoch(); nanos[i] = formatNano(nanos[i]); } } @@ -1764,7 +1766,7 @@ namespace orc { void TimestampColumnWriter::getColumnEncoding( std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; - encoding.set_kind(RleVersionMapper(rleVersion)); + encoding.set_kind(RleVersionMapper(rleVersion_)); encoding.set_dictionary_size(0); if (enableBloomFilter) { encoding.set_bloom_encoding(BloomFilterVersion::UTF8); @@ -1778,6 +1780,12 @@ namespace orc { nanoRleEncoder->recordPosition(rowIndexPosition.get()); } + void TimestampColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + secRleEncoder->finishEncode(); + nanoRleEncoder->finishEncode(); + } + class DateColumnWriter : public IntegerColumnWriter<LongVectorBatch> { public: DateColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); @@ -1847,6 +1855,8 @@ namespace orc { virtual void recordPosition() const override; + virtual void finishStreams() override; + protected: RleVersion rleVersion; uint64_t precision; @@ -1855,7 +1865,7 @@ namespace orc { std::unique_ptr<RleEncoder> scaleEncoder; private: - char buffer[10]; + char buffer_[10]; }; Decimal64ColumnWriter::Decimal64ColumnWriter(const Type& type, const StreamsFactory& factory, @@ -1897,7 +1907,7 @@ namespace orc { for (uint64_t i = 0; i < numValues; ++i) { if (!notNull || notNull[i]) { int64_t val = zigZag(values[i]); - char* data = buffer; + char* data = buffer_; while (true) { if ((val & ~0x7f) == 0) { *(data++) = (static_cast<char>(val)); @@ -1908,7 +1918,7 @@ namespace orc { val = (static_cast<uint64_t>(val) >> 7); } } - valueStream->write(buffer, static_cast<size_t>(data - buffer)); + valueStream->write(buffer_, static_cast<size_t>(data - buffer_)); ++count; if (enableBloomFilter) { std::string decimal = Decimal(values[i], static_cast<int32_t>(scale)).toString(true); @@ -1965,6 +1975,12 @@ namespace orc { scaleEncoder->recordPosition(rowIndexPosition.get()); } + void Decimal64ColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + valueStream->finishStream(); + scaleEncoder->finishEncode(); + } + class Decimal64ColumnWriterV2 : public ColumnWriter { public: Decimal64ColumnWriterV2(const Type& type, const StreamsFactory& factory, @@ -1981,6 +1997,8 @@ namespace orc { virtual void recordPosition() const override; + virtual void finishStreams() override; + protected: uint64_t precision; uint64_t scale; @@ -2071,6 +2089,11 @@ namespace orc { valueEncoder->recordPosition(rowIndexPosition.get()); } + void Decimal64ColumnWriterV2::finishStreams() { + ColumnWriter::finishStreams(); + valueEncoder->finishEncode(); + } + class Decimal128ColumnWriter : public Decimal64ColumnWriter { public: Decimal128ColumnWriter(const Type& type, const StreamsFactory& factory, @@ -2080,7 +2103,7 @@ namespace orc { const char* incomingMask) override; private: - char buffer[20]; + char buffer_[20]; }; Decimal128ColumnWriter::Decimal128ColumnWriter(const Type& type, const StreamsFactory& factory, @@ -2126,7 +2149,7 @@ namespace orc { for (uint64_t i = 0; i < numValues; ++i) { if (!notNull || notNull[i]) { Int128 val = zigZagInt128(values[i]); - char* data = buffer; + char* data = buffer_; while (true) { if ((val & ~0x7f) == 0) { *(data++) = (static_cast<char>(val.getLowBits())); @@ -2136,7 +2159,7 @@ namespace orc { val >>= 7; } } - valueStream->write(buffer, static_cast<size_t>(data - buffer)); + valueStream->write(buffer_, static_cast<size_t>(data - buffer_)); ++count; if (enableBloomFilter) { @@ -2186,22 +2209,24 @@ namespace orc { virtual void reset() override; + virtual void finishStreams() override; + private: - std::unique_ptr<RleEncoder> lengthEncoder; - RleVersion rleVersion; - std::unique_ptr<ColumnWriter> child; + std::unique_ptr<RleEncoder> lengthEncoder_; + RleVersion rleVersion_; + std::unique_ptr<ColumnWriter> child_; }; ListColumnWriter::ListColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options) - : ColumnWriter(type, factory, options), rleVersion(options.getRleVersion()) { + : ColumnWriter(type, factory, options), rleVersion_(options.getRleVersion()) { std::unique_ptr<BufferedOutputStream> lengthStream = factory.createStream(proto::Stream_Kind_LENGTH); - lengthEncoder = createRleEncoder(std::move(lengthStream), false, rleVersion, memPool, - options.getAlignedBitpacking()); + lengthEncoder_ = createRleEncoder(std::move(lengthStream), false, rleVersion_, memPool, + options.getAlignedBitpacking()); if (type.getSubtypeCount() == 1) { - child = buildWriter(*type.getSubtype(0), factory, options); + child_ = buildWriter(*type.getSubtype(0), factory, options); } if (enableIndex) { @@ -2239,10 +2264,10 @@ namespace orc { } // unnecessary to deal with null as elements are packed together - if (child.get()) { - child->add(*listBatch->elements, elemOffset, totalNumValues, nullptr); + if (child_.get()) { + child_->add(*listBatch->elements, elemOffset, totalNumValues, nullptr); } - lengthEncoder->add(offsets, numValues, notNull); + lengthEncoder_->add(offsets, numValues, notNull); if (enableIndex) { if (!notNull) { @@ -2272,93 +2297,101 @@ namespace orc { proto::Stream stream; stream.set_kind(proto::Stream_Kind_LENGTH); stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(lengthEncoder->flush()); + stream.set_length(lengthEncoder_->flush()); streams.push_back(stream); - if (child.get()) { - child->flush(streams); + if (child_.get()) { + child_->flush(streams); } } void ListColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const { ColumnWriter::writeIndex(streams); - if (child.get()) { - child->writeIndex(streams); + if (child_.get()) { + child_->writeIndex(streams); } } uint64_t ListColumnWriter::getEstimatedSize() const { uint64_t size = ColumnWriter::getEstimatedSize(); - if (child.get()) { - size += lengthEncoder->getBufferSize(); - size += child->getEstimatedSize(); + if (child_.get()) { + size += lengthEncoder_->getBufferSize(); + size += child_->getEstimatedSize(); } return size; } void ListColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; - encoding.set_kind(RleVersionMapper(rleVersion)); + encoding.set_kind(RleVersionMapper(rleVersion_)); encoding.set_dictionary_size(0); if (enableBloomFilter) { encoding.set_bloom_encoding(BloomFilterVersion::UTF8); } encodings.push_back(encoding); - if (child.get()) { - child->getColumnEncoding(encodings); + if (child_.get()) { + child_->getColumnEncoding(encodings); } } void ListColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getStripeStatistics(stats); - if (child.get()) { - child->getStripeStatistics(stats); + if (child_.get()) { + child_->getStripeStatistics(stats); } } void ListColumnWriter::mergeStripeStatsIntoFileStats() { ColumnWriter::mergeStripeStatsIntoFileStats(); - if (child.get()) { - child->mergeStripeStatsIntoFileStats(); + if (child_.get()) { + child_->mergeStripeStatsIntoFileStats(); } } void ListColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getFileStatistics(stats); - if (child.get()) { - child->getFileStatistics(stats); + if (child_.get()) { + child_->getFileStatistics(stats); } } void ListColumnWriter::mergeRowGroupStatsIntoStripeStats() { ColumnWriter::mergeRowGroupStatsIntoStripeStats(); - if (child.get()) { - child->mergeRowGroupStatsIntoStripeStats(); + if (child_.get()) { + child_->mergeRowGroupStatsIntoStripeStats(); } } void ListColumnWriter::createRowIndexEntry() { ColumnWriter::createRowIndexEntry(); - if (child.get()) { - child->createRowIndexEntry(); + if (child_.get()) { + child_->createRowIndexEntry(); } } void ListColumnWriter::recordPosition() const { ColumnWriter::recordPosition(); - lengthEncoder->recordPosition(rowIndexPosition.get()); + lengthEncoder_->recordPosition(rowIndexPosition.get()); } void ListColumnWriter::reset() { ColumnWriter::reset(); - if (child) { - child->reset(); + if (child_) { + child_->reset(); } } void ListColumnWriter::writeDictionary() { - if (child) { - child->writeDictionary(); + if (child_) { + child_->writeDictionary(); + } + } + + void ListColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + lengthEncoder_->finishEncode(); + if (child_) { + child_->finishStreams(); } } @@ -2394,27 +2427,29 @@ namespace orc { virtual void reset() override; + virtual void finishStreams() override; + private: - std::unique_ptr<ColumnWriter> keyWriter; - std::unique_ptr<ColumnWriter> elemWriter; - std::unique_ptr<RleEncoder> lengthEncoder; - RleVersion rleVersion; + std::unique_ptr<ColumnWriter> keyWriter_; + std::unique_ptr<ColumnWriter> elemWriter_; + std::unique_ptr<RleEncoder> lengthEncoder_; + RleVersion rleVersion_; }; MapColumnWriter::MapColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options) - : ColumnWriter(type, factory, options), rleVersion(options.getRleVersion()) { + : ColumnWriter(type, factory, options), rleVersion_(options.getRleVersion()) { std::unique_ptr<BufferedOutputStream> lengthStream = factory.createStream(proto::Stream_Kind_LENGTH); - lengthEncoder = createRleEncoder(std::move(lengthStream), false, rleVersion, memPool, - options.getAlignedBitpacking()); + lengthEncoder_ = createRleEncoder(std::move(lengthStream), false, rleVersion_, memPool, + options.getAlignedBitpacking()); if (type.getSubtypeCount() > 0) { - keyWriter = buildWriter(*type.getSubtype(0), factory, options); + keyWriter_ = buildWriter(*type.getSubtype(0), factory, options); } if (type.getSubtypeCount() > 1) { - elemWriter = buildWriter(*type.getSubtype(1), factory, options); + elemWriter_ = buildWriter(*type.getSubtype(1), factory, options); } if (enableIndex) { @@ -2451,14 +2486,14 @@ namespace orc { offsets[i] = offsets[i + 1] - offsets[i]; } - lengthEncoder->add(offsets, numValues, notNull); + lengthEncoder_->add(offsets, numValues, notNull); // unnecessary to deal with null as keys and values are packed together - if (keyWriter.get()) { - keyWriter->add(*mapBatch->keys, elemOffset, totalNumValues, nullptr); + if (keyWriter_.get()) { + keyWriter_->add(*mapBatch->keys, elemOffset, totalNumValues, nullptr); } - if (elemWriter.get()) { - elemWriter->add(*mapBatch->elements, elemOffset, totalNumValues, nullptr); + if (elemWriter_.get()) { + elemWriter_->add(*mapBatch->elements, elemOffset, totalNumValues, nullptr); } if (enableIndex) { @@ -2489,126 +2524,137 @@ namespace orc { proto::Stream stream; stream.set_kind(proto::Stream_Kind_LENGTH); stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(lengthEncoder->flush()); + stream.set_length(lengthEncoder_->flush()); streams.push_back(stream); - if (keyWriter.get()) { - keyWriter->flush(streams); + if (keyWriter_.get()) { + keyWriter_->flush(streams); } - if (elemWriter.get()) { - elemWriter->flush(streams); + if (elemWriter_.get()) { + elemWriter_->flush(streams); } } void MapColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const { ColumnWriter::writeIndex(streams); - if (keyWriter.get()) { - keyWriter->writeIndex(streams); + if (keyWriter_.get()) { + keyWriter_->writeIndex(streams); } - if (elemWriter.get()) { - elemWriter->writeIndex(streams); + if (elemWriter_.get()) { + elemWriter_->writeIndex(streams); } } uint64_t MapColumnWriter::getEstimatedSize() const { uint64_t size = ColumnWriter::getEstimatedSize(); - size += lengthEncoder->getBufferSize(); - if (keyWriter.get()) { - size += keyWriter->getEstimatedSize(); + size += lengthEncoder_->getBufferSize(); + if (keyWriter_.get()) { + size += keyWriter_->getEstimatedSize(); } - if (elemWriter.get()) { - size += elemWriter->getEstimatedSize(); + if (elemWriter_.get()) { + size += elemWriter_->getEstimatedSize(); } return size; } void MapColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; - encoding.set_kind(RleVersionMapper(rleVersion)); + encoding.set_kind(RleVersionMapper(rleVersion_)); encoding.set_dictionary_size(0); if (enableBloomFilter) { encoding.set_bloom_encoding(BloomFilterVersion::UTF8); } encodings.push_back(encoding); - if (keyWriter.get()) { - keyWriter->getColumnEncoding(encodings); + if (keyWriter_.get()) { + keyWriter_->getColumnEncoding(encodings); } - if (elemWriter.get()) { - elemWriter->getColumnEncoding(encodings); + if (elemWriter_.get()) { + elemWriter_->getColumnEncoding(encodings); } } void MapColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getStripeStatistics(stats); - if (keyWriter.get()) { - keyWriter->getStripeStatistics(stats); + if (keyWriter_.get()) { + keyWriter_->getStripeStatistics(stats); } - if (elemWriter.get()) { - elemWriter->getStripeStatistics(stats); + if (elemWriter_.get()) { + elemWriter_->getStripeStatistics(stats); } } void MapColumnWriter::mergeStripeStatsIntoFileStats() { ColumnWriter::mergeStripeStatsIntoFileStats(); - if (keyWriter.get()) { - keyWriter->mergeStripeStatsIntoFileStats(); + if (keyWriter_.get()) { + keyWriter_->mergeStripeStatsIntoFileStats(); } - if (elemWriter.get()) { - elemWriter->mergeStripeStatsIntoFileStats(); + if (elemWriter_.get()) { + elemWriter_->mergeStripeStatsIntoFileStats(); } } void MapColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getFileStatistics(stats); - if (keyWriter.get()) { - keyWriter->getFileStatistics(stats); + if (keyWriter_.get()) { + keyWriter_->getFileStatistics(stats); } - if (elemWriter.get()) { - elemWriter->getFileStatistics(stats); + if (elemWriter_.get()) { + elemWriter_->getFileStatistics(stats); } } void MapColumnWriter::mergeRowGroupStatsIntoStripeStats() { ColumnWriter::mergeRowGroupStatsIntoStripeStats(); - if (keyWriter.get()) { - keyWriter->mergeRowGroupStatsIntoStripeStats(); + if (keyWriter_.get()) { + keyWriter_->mergeRowGroupStatsIntoStripeStats(); } - if (elemWriter.get()) { - elemWriter->mergeRowGroupStatsIntoStripeStats(); + if (elemWriter_.get()) { + elemWriter_->mergeRowGroupStatsIntoStripeStats(); } } void MapColumnWriter::createRowIndexEntry() { ColumnWriter::createRowIndexEntry(); - if (keyWriter.get()) { - keyWriter->createRowIndexEntry(); + if (keyWriter_.get()) { + keyWriter_->createRowIndexEntry(); } - if (elemWriter.get()) { - elemWriter->createRowIndexEntry(); + if (elemWriter_.get()) { + elemWriter_->createRowIndexEntry(); } } void MapColumnWriter::recordPosition() const { ColumnWriter::recordPosition(); - lengthEncoder->recordPosition(rowIndexPosition.get()); + lengthEncoder_->recordPosition(rowIndexPosition.get()); } void MapColumnWriter::reset() { ColumnWriter::reset(); - if (keyWriter) { - keyWriter->reset(); + if (keyWriter_) { + keyWriter_->reset(); } - if (elemWriter) { - elemWriter->reset(); + if (elemWriter_) { + elemWriter_->reset(); } } void MapColumnWriter::writeDictionary() { - if (keyWriter) { - keyWriter->writeDictionary(); + if (keyWriter_) { + keyWriter_->writeDictionary(); + } + if (elemWriter_) { + elemWriter_->writeDictionary(); + } + } + + void MapColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + lengthEncoder_->finishEncode(); + if (keyWriter_) { + keyWriter_->finishStreams(); } - if (elemWriter) { - elemWriter->writeDictionary(); + if (elemWriter_) { + elemWriter_->finishStreams(); } } @@ -2644,9 +2690,11 @@ namespace orc { virtual void reset() override; + virtual void finishStreams() override; + private: - std::unique_ptr<ByteRleEncoder> rleEncoder; - std::vector<std::unique_ptr<ColumnWriter>> children; + std::unique_ptr<ByteRleEncoder> rleEncoder_; + std::vector<std::unique_ptr<ColumnWriter>> children_; }; UnionColumnWriter::UnionColumnWriter(const Type& type, const StreamsFactory& factory, @@ -2654,10 +2702,10 @@ namespace orc { : ColumnWriter(type, factory, options) { std::unique_ptr<BufferedOutputStream> dataStream = factory.createStream(proto::Stream_Kind_DATA); - rleEncoder = createByteRleEncoder(std::move(dataStream)); + rleEncoder_ = createByteRleEncoder(std::move(dataStream)); for (uint64_t i = 0; i != type.getSubtypeCount(); ++i) { - children.push_back(buildWriter(*type.getSubtype(i), factory, options)); + children_.push_back(buildWriter(*type.getSubtype(i), factory, options)); } if (enableIndex) { @@ -2678,8 +2726,8 @@ namespace orc { unsigned char* tags = unionBatch->tags.data() + offset; uint64_t* offsets = unionBatch->offsets.data() + offset; - std::vector<int64_t> childOffset(children.size(), -1); - std::vector<uint64_t> childLength(children.size(), 0); + std::vector<int64_t> childOffset(children_.size(), -1); + std::vector<uint64_t> childLength(children_.size(), 0); for (uint64_t i = 0; i != numValues; ++i) { if (childOffset[tags[i]] == -1) { @@ -2688,12 +2736,12 @@ namespace orc { ++childLength[tags[i]]; } - rleEncoder->add(reinterpret_cast<char*>(tags), numValues, notNull); + rleEncoder_->add(reinterpret_cast<char*>(tags), numValues, notNull); - for (uint32_t i = 0; i < children.size(); ++i) { + for (uint32_t i = 0; i < children_.size(); ++i) { if (childLength[i] > 0) { - children[i]->add(*unionBatch->children[i], static_cast<uint64_t>(childOffset[i]), - childLength[i], nullptr); + children_[i]->add(*unionBatch->children[i], static_cast<uint64_t>(childOffset[i]), + childLength[i], nullptr); } } @@ -2725,26 +2773,26 @@ namespace orc { proto::Stream stream; stream.set_kind(proto::Stream_Kind_DATA); stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(rleEncoder->flush()); + stream.set_length(rleEncoder_->flush()); streams.push_back(stream); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->flush(streams); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->flush(streams); } } void UnionColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const { ColumnWriter::writeIndex(streams); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->writeIndex(streams); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->writeIndex(streams); } } uint64_t UnionColumnWriter::getEstimatedSize() const { uint64_t size = ColumnWriter::getEstimatedSize(); - size += rleEncoder->getBufferSize(); - for (uint32_t i = 0; i < children.size(); ++i) { - size += children[i]->getEstimatedSize(); + size += rleEncoder_->getBufferSize(); + for (uint32_t i = 0; i < children_.size(); ++i) { + size += children_[i]->getEstimatedSize(); } return size; } @@ -2757,61 +2805,69 @@ namespace orc { encoding.set_bloom_encoding(BloomFilterVersion::UTF8); } encodings.push_back(encoding); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getColumnEncoding(encodings); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->getColumnEncoding(encodings); } } void UnionColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getStripeStatistics(stats); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getStripeStatistics(stats); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->getStripeStatistics(stats); } } void UnionColumnWriter::mergeStripeStatsIntoFileStats() { ColumnWriter::mergeStripeStatsIntoFileStats(); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->mergeStripeStatsIntoFileStats(); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->mergeStripeStatsIntoFileStats(); } } void UnionColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getFileStatistics(stats); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getFileStatistics(stats); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->getFileStatistics(stats); } } void UnionColumnWriter::mergeRowGroupStatsIntoStripeStats() { ColumnWriter::mergeRowGroupStatsIntoStripeStats(); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->mergeRowGroupStatsIntoStripeStats(); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->mergeRowGroupStatsIntoStripeStats(); } } void UnionColumnWriter::createRowIndexEntry() { ColumnWriter::createRowIndexEntry(); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->createRowIndexEntry(); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->createRowIndexEntry(); } } void UnionColumnWriter::recordPosition() const { ColumnWriter::recordPosition(); - rleEncoder->recordPosition(rowIndexPosition.get()); + rleEncoder_->recordPosition(rowIndexPosition.get()); } void UnionColumnWriter::reset() { ColumnWriter::reset(); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->reset(); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->reset(); } } void UnionColumnWriter::writeDictionary() { - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->writeDictionary(); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->writeDictionary(); + } + } + + void UnionColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + rleEncoder_->finishEncode(); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->finishStreams(); } } diff --git a/contrib/libs/apache/orc/c++/src/ColumnWriter.hh b/contrib/libs/apache/orc/c++/src/ColumnWriter.hh index f21ffd6f83..1c5e15d707 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnWriter.hh +++ b/contrib/libs/apache/orc/c++/src/ColumnWriter.hh @@ -53,14 +53,14 @@ namespace orc { public: virtual ~RowIndexPositionRecorder() override; - RowIndexPositionRecorder(proto::RowIndexEntry& entry) : rowIndexEntry(entry) {} + RowIndexPositionRecorder(proto::RowIndexEntry& entry) : rowIndexEntry_(entry) {} virtual void add(uint64_t pos) override { - rowIndexEntry.add_positions(pos); + rowIndexEntry_.add_positions(pos); } private: - proto::RowIndexEntry& rowIndexEntry; + proto::RowIndexEntry& rowIndexEntry_; }; /** @@ -179,6 +179,18 @@ namespace orc { */ virtual void writeDictionary(); + /** + * Finalize the encoding and compressing process. This function should be + * called after all data required for encoding has been added. It ensures + * that any remaining data is processed and the final state of the streams + * is set. + * Note: boolean type cannot cut off the current byte if it is not filled + * with 8 bits, otherwise Boolean RLE may incorrectly read the unfilled + * trailing bits. In this case, the last byte will be the head of the next + * compression block. + */ + virtual void finishStreams(); + protected: /** * Utility function to translate ColumnStatistics into protobuf form and diff --git a/contrib/libs/apache/orc/c++/src/Common.cc b/contrib/libs/apache/orc/c++/src/Common.cc index cf2ff27ef1..52efa12d94 100644 --- a/contrib/libs/apache/orc/c++/src/Common.cc +++ b/contrib/libs/apache/orc/c++/src/Common.cc @@ -133,11 +133,11 @@ namespace orc { } std::string FileVersion::toString() const { - if (majorVersion == 1 && minorVersion == 9999) { + if (majorVersion_ == 1 && minorVersion_ == 9999) { return "UNSTABLE-PRE-2.0"; } std::stringstream ss; - ss << majorVersion << '.' << minorVersion; + ss << majorVersion_ << '.' << minorVersion_; return ss.str(); } diff --git a/contrib/libs/apache/orc/c++/src/Compression.cc b/contrib/libs/apache/orc/c++/src/Compression.cc index 94be774ab4..f373a75bff 100644 --- a/contrib/libs/apache/orc/c++/src/Compression.cc +++ b/contrib/libs/apache/orc/c++/src/Compression.cc @@ -52,19 +52,22 @@ namespace orc { class CompressionStreamBase : public BufferedOutputStream { public: CompressionStreamBase(OutputStream* outStream, int compressionLevel, uint64_t capacity, - uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics); + uint64_t compressionBlockSize, uint64_t memoryBlockSize, MemoryPool& pool, + WriterMetrics* metrics); virtual bool Next(void** data, int* size) override = 0; - virtual void BackUp(int count) override; + virtual void BackUp(int count) override = 0; virtual std::string getName() const override = 0; - virtual uint64_t flush() override; - virtual void suppress() override; + virtual uint64_t flush() override = 0; + virtual void suppress() override = 0; virtual bool isCompressed() const override { return true; } virtual uint64_t getSize() const override; + virtual uint64_t getRawInputBufferSize() const override = 0; + virtual void finishStream() override = 0; protected: void writeData(const unsigned char* data, int size); @@ -78,9 +81,6 @@ namespace orc { // ensure enough room for compression block header void ensureHeader(); - // Buffer to hold uncompressed data until user calls Next() - DataBuffer<unsigned char> rawInputBuffer; - // Compress level int level; @@ -99,46 +99,26 @@ namespace orc { // Compression block header pointer array static const uint32_t HEADER_SIZE = 3; std::array<char*, HEADER_SIZE> header; + + // Compression block size + uint64_t compressionBlockSize; }; CompressionStreamBase::CompressionStreamBase(OutputStream* outStream, int compressionLevel, - uint64_t capacity, uint64_t blockSize, - MemoryPool& pool, WriterMetrics* metrics) - : BufferedOutputStream(pool, outStream, capacity, blockSize, metrics), - rawInputBuffer(pool, blockSize), + uint64_t capacity, uint64_t compressionBlockSize, + uint64_t memoryBlockSize, MemoryPool& pool, + WriterMetrics* metrics) + : BufferedOutputStream(pool, outStream, capacity, memoryBlockSize, metrics), level(compressionLevel), outputBuffer(nullptr), bufferSize(0), outputPosition(0), - outputSize(0) { + outputSize(0), + compressionBlockSize(compressionBlockSize) { // init header pointer array header.fill(nullptr); } - void CompressionStreamBase::BackUp(int count) { - if (count > bufferSize) { - throw std::logic_error("Can't backup that much!"); - } - bufferSize -= count; - } - - uint64_t CompressionStreamBase::flush() { - void* data; - int size; - if (!Next(&data, &size)) { - throw std::runtime_error("Failed to flush compression buffer."); - } - BufferedOutputStream::BackUp(outputSize - outputPosition); - bufferSize = outputSize = outputPosition = 0; - return BufferedOutputStream::flush(); - } - - void CompressionStreamBase::suppress() { - outputBuffer = nullptr; - bufferSize = outputPosition = outputSize = 0; - BufferedOutputStream::suppress(); - } - uint64_t CompressionStreamBase::getSize() const { return BufferedOutputStream::getSize() - static_cast<uint64_t>(outputSize - outputPosition); } @@ -149,12 +129,12 @@ namespace orc { while (offset < size) { if (outputPosition == outputSize) { if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) { - throw std::runtime_error("Failed to get next output buffer from output stream."); + throw CompressionError("Failed to get next output buffer from output stream."); } outputPosition = 0; } else if (outputPosition > outputSize) { // for safety this will unlikely happen - throw std::logic_error("Write to an out-of-bound place during compression!"); + throw CompressionError("Write to an out-of-bound place during compression!"); } int currentSize = std::min(outputSize - outputPosition, size - offset); memcpy(outputBuffer + outputPosition, data + offset, static_cast<size_t>(currentSize)); @@ -168,7 +148,7 @@ namespace orc { for (uint32_t i = 0; i < HEADER_SIZE; ++i) { if (outputPosition >= outputSize) { if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) { - throw std::runtime_error("Failed to get next output buffer from output stream."); + throw CompressionError("Failed to get next output buffer from output stream."); } outputPosition = 0; } @@ -183,31 +163,74 @@ namespace orc { class CompressionStream : public CompressionStreamBase { public: CompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity, - uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics); + uint64_t compressionBlockSize, uint64_t memoryBlockSize, MemoryPool& pool, + WriterMetrics* metrics); virtual bool Next(void** data, int* size) override; virtual std::string getName() const override = 0; + virtual void BackUp(int count) override; + virtual void suppress() override; + virtual uint64_t flush() override; + uint64_t getRawInputBufferSize() const override { + return rawInputBuffer.size(); + } + virtual void finishStream() override { + compressInternal(); + BufferedOutputStream::finishStream(); + } protected: // return total compressed size virtual uint64_t doStreamingCompression() = 0; + + // Buffer to hold uncompressed data until user calls Next() + BlockBuffer rawInputBuffer; + + void compressInternal(); }; + void CompressionStream::BackUp(int count) { + uint64_t backup = static_cast<uint64_t>(count); + uint64_t currSize = rawInputBuffer.size(); + if (backup > currSize) { + throw CompressionError("Can't backup that much!"); + } + rawInputBuffer.resize(currSize - backup); + } + + uint64_t CompressionStream::flush() { + compressInternal(); + BufferedOutputStream::BackUp(outputSize - outputPosition); + rawInputBuffer.resize(0); + outputSize = outputPosition = 0; + return BufferedOutputStream::flush(); + } + + void CompressionStream::suppress() { + outputBuffer = nullptr; + outputPosition = outputSize = 0; + rawInputBuffer.resize(0); + BufferedOutputStream::suppress(); + } + CompressionStream::CompressionStream(OutputStream* outStream, int compressionLevel, - uint64_t capacity, uint64_t blockSize, MemoryPool& pool, + uint64_t capacity, uint64_t compressionBlockSize, + uint64_t memoryBlockSize, MemoryPool& pool, WriterMetrics* metrics) - : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics) { + : CompressionStreamBase(outStream, compressionLevel, capacity, compressionBlockSize, + memoryBlockSize, pool, metrics), + rawInputBuffer(pool, memoryBlockSize) { // PASS } - bool CompressionStream::Next(void** data, int* size) { - if (bufferSize != 0) { + void CompressionStream::compressInternal() { + if (rawInputBuffer.size() != 0) { ensureHeader(); uint64_t preSize = getSize(); uint64_t totalCompressedSize = doStreamingCompression(); - if (totalCompressedSize >= static_cast<unsigned long>(bufferSize)) { - writeHeader(static_cast<size_t>(bufferSize), true); + if (totalCompressedSize >= static_cast<unsigned long>(rawInputBuffer.size())) { + writeHeader(static_cast<size_t>(rawInputBuffer.size()), true); // reset output buffer outputBuffer = nullptr; outputPosition = outputSize = 0; @@ -215,23 +238,42 @@ namespace orc { BufferedOutputStream::BackUp(static_cast<int>(backup)); // copy raw input buffer into block buffer - writeData(rawInputBuffer.data(), bufferSize); + uint64_t blockNumber = rawInputBuffer.getBlockNumber(); + for (uint64_t i = 0; i < blockNumber; ++i) { + auto block = rawInputBuffer.getBlock(i); + writeData(reinterpret_cast<const unsigned char*>(block.data), block.size); + } } else { writeHeader(totalCompressedSize, false); } + rawInputBuffer.resize(0); } + } - *data = rawInputBuffer.data(); - *size = static_cast<int>(rawInputBuffer.size()); - bufferSize = *size; + bool CompressionStream::Next(void** data, int* size) { + if (rawInputBuffer.size() > compressionBlockSize) { + std::stringstream ss; + ss << "uncompressed data size " << rawInputBuffer.size() + << " is larger than compression block size " << compressionBlockSize; + throw CompressionError(ss.str()); + } + + // compress data in the rawInputBuffer when it is full + if (rawInputBuffer.size() == compressionBlockSize) { + compressInternal(); + } + auto block = rawInputBuffer.getNextBlock(); + *data = block.data; + *size = static_cast<int>(block.size); return true; } class ZlibCompressionStream : public CompressionStream { public: - ZlibCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity, - uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics); + ZlibCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t bufferCapacity, + uint64_t compressionBlockSize, uint64_t memoryBlockSize, MemoryPool& pool, + WriterMetrics* metrics); virtual ~ZlibCompressionStream() override { end(); @@ -245,47 +287,62 @@ namespace orc { private: void init(); void end(); - z_stream strm; + z_stream strm_; }; ZlibCompressionStream::ZlibCompressionStream(OutputStream* outStream, int compressionLevel, - uint64_t capacity, uint64_t blockSize, - MemoryPool& pool, WriterMetrics* metrics) - : CompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) { + uint64_t bufferCapacity, + uint64_t compressionBlockSize, + uint64_t memoryBlockSize, MemoryPool& pool, + WriterMetrics* metrics) + : CompressionStream(outStream, compressionLevel, bufferCapacity, compressionBlockSize, + memoryBlockSize, pool, metrics) { init(); } uint64_t ZlibCompressionStream::doStreamingCompression() { - if (deflateReset(&strm) != Z_OK) { - throw std::runtime_error("Failed to reset inflate."); + if (deflateReset(&strm_) != Z_OK) { + throw CompressionError("Failed to reset inflate."); } - strm.avail_in = static_cast<unsigned int>(bufferSize); - strm.next_in = rawInputBuffer.data(); + // iterate through all blocks + uint64_t blockId = 0; + bool finish = false; do { - if (outputPosition >= outputSize) { - if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) { - throw std::runtime_error("Failed to get next output buffer from output stream."); - } - outputPosition = 0; + if (blockId == rawInputBuffer.getBlockNumber()) { + finish = true; + strm_.avail_in = 0; + strm_.next_in = nullptr; + } else { + auto block = rawInputBuffer.getBlock(blockId++); + strm_.avail_in = static_cast<unsigned int>(block.size); + strm_.next_in = reinterpret_cast<unsigned char*>(block.data); } - strm.next_out = reinterpret_cast<unsigned char*>(outputBuffer + outputPosition); - strm.avail_out = static_cast<unsigned int>(outputSize - outputPosition); - int ret = deflate(&strm, Z_FINISH); - outputPosition = outputSize - static_cast<int>(strm.avail_out); + do { + if (outputPosition >= outputSize) { + if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) { + throw CompressionError("Failed to get next output buffer from output stream."); + } + outputPosition = 0; + } + strm_.next_out = reinterpret_cast<unsigned char*>(outputBuffer + outputPosition); + strm_.avail_out = static_cast<unsigned int>(outputSize - outputPosition); - if (ret == Z_STREAM_END) { - break; - } else if (ret == Z_OK) { - // needs more buffer so will continue the loop - } else { - throw std::runtime_error("Failed to deflate input data."); - } - } while (strm.avail_out == 0); + int ret = deflate(&strm_, finish ? Z_FINISH : Z_NO_FLUSH); + outputPosition = outputSize - static_cast<int>(strm_.avail_out); - return strm.total_out; + if (ret == Z_STREAM_END) { + break; + } else if (ret == Z_OK) { + // needs more buffer so will continue the loop + } else { + throw CompressionError("Failed to deflate input data."); + } + } while (strm_.avail_out == 0); + } while (!finish); + return strm_.total_out; } std::string ZlibCompressionStream::getName() const { @@ -299,18 +356,18 @@ namespace orc { #endif void ZlibCompressionStream::init() { - strm.zalloc = nullptr; - strm.zfree = nullptr; - strm.opaque = nullptr; - strm.next_in = nullptr; + strm_.zalloc = nullptr; + strm_.zfree = nullptr; + strm_.opaque = nullptr; + strm_.next_in = nullptr; - if (deflateInit2(&strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) { - throw std::runtime_error("Error while calling deflateInit2() for zlib."); + if (deflateInit2(&strm_, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) { + throw CompressionError("Error while calling deflateInit2() for zlib."); } } void ZlibCompressionStream::end() { - (void)deflateEnd(&strm); + (void)deflateEnd(&strm_); } DIAGNOSTIC_PUSH @@ -399,9 +456,9 @@ namespace orc { }; DecompressionStream::DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t bufferSize, MemoryPool& _pool, - ReaderMetrics* _metrics) - : pool(_pool), + size_t bufferSize, MemoryPool& pool, + ReaderMetrics* metrics) + : pool(pool), input(std::move(inStream)), outputDataBuffer(pool, bufferSize), state(DECOMPRESS_HEADER), @@ -416,7 +473,7 @@ namespace orc { headerPosition(0), inputBufferStartPosition(0), bytesReturned(0), - metrics(_metrics) {} + metrics(metrics) {} std::string DecompressionStream::getStreamName() const { return input->getName(); @@ -505,7 +562,7 @@ namespace orc { } else if (state == DECOMPRESS_START) { NextDecompress(data, size, availableSize); } else { - throw std::logic_error( + throw CompressionError( "Unknown compression state in " "DecompressionStream::Next"); } @@ -519,7 +576,7 @@ namespace orc { void DecompressionStream::BackUp(int count) { if (outputBuffer == nullptr || outputBufferLength != 0) { - throw std::logic_error("Backup without previous Next in " + getName()); + throw CompressionError("Backup without previous Next in " + getName()); } outputBuffer -= static_cast<size_t>(count); outputBufferLength = static_cast<size_t>(count); @@ -622,7 +679,7 @@ namespace orc { virtual void NextDecompress(const void** data, int* size, size_t availableSize) override; private: - z_stream zstream; + z_stream zstream_; }; DIAGNOSTIC_PUSH @@ -632,35 +689,39 @@ namespace orc { #endif ZlibDecompressionStream::ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t bufferSize, MemoryPool& _pool, - ReaderMetrics* _metrics) - : DecompressionStream(std::move(inStream), bufferSize, _pool, _metrics) { - zstream.next_in = nullptr; - zstream.avail_in = 0; - zstream.zalloc = nullptr; - zstream.zfree = nullptr; - zstream.opaque = nullptr; - zstream.next_out = reinterpret_cast<Bytef*>(outputDataBuffer.data()); - zstream.avail_out = static_cast<uInt>(outputDataBuffer.capacity()); - int64_t result = inflateInit2(&zstream, -15); + size_t bufferSize, MemoryPool& pool, + ReaderMetrics* metrics) + : DecompressionStream(std::move(inStream), bufferSize, pool, metrics) { + zstream_.next_in = nullptr; + zstream_.avail_in = 0; + zstream_.zalloc = nullptr; + zstream_.zfree = nullptr; + zstream_.opaque = nullptr; + zstream_.next_out = reinterpret_cast<Bytef*>(outputDataBuffer.data()); + zstream_.avail_out = static_cast<uInt>(outputDataBuffer.capacity()); + int64_t result = inflateInit2(&zstream_, -15); switch (result) { case Z_OK: break; case Z_MEM_ERROR: - throw std::logic_error("Memory error from inflateInit2"); + throw CompressionError( + "Memory error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2"); case Z_VERSION_ERROR: - throw std::logic_error("Version error from inflateInit2"); + throw CompressionError( + "Version error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2"); case Z_STREAM_ERROR: - throw std::logic_error("Stream error from inflateInit2"); + throw CompressionError( + "Stream error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2"); default: - throw std::logic_error("Unknown error from inflateInit2"); + throw CompressionError( + "Unknown error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2"); } } DIAGNOSTIC_POP ZlibDecompressionStream::~ZlibDecompressionStream() { - int64_t result = inflateEnd(&zstream); + int64_t result = inflateEnd(&zstream_); if (result != Z_OK) { // really can't throw in destructors std::cout << "Error in ~ZlibDecompressionStream() " << result << "\n"; @@ -668,19 +729,19 @@ namespace orc { } void ZlibDecompressionStream::NextDecompress(const void** data, int* size, size_t availableSize) { - zstream.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); - zstream.avail_in = static_cast<uInt>(availableSize); + zstream_.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); + zstream_.avail_in = static_cast<uInt>(availableSize); outputBuffer = outputDataBuffer.data(); - zstream.next_out = reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer)); - zstream.avail_out = static_cast<uInt>(outputDataBuffer.capacity()); - if (inflateReset(&zstream) != Z_OK) { - throw std::logic_error( + zstream_.next_out = reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer)); + zstream_.avail_out = static_cast<uInt>(outputDataBuffer.capacity()); + if (inflateReset(&zstream_) != Z_OK) { + throw CompressionError( "Bad inflateReset in " "ZlibDecompressionStream::NextDecompress"); } int64_t result; do { - result = inflate(&zstream, availableSize == remainingLength ? Z_FINISH : Z_SYNC_FLUSH); + result = inflate(&zstream_, availableSize == remainingLength ? Z_FINISH : Z_SYNC_FLUSH); switch (result) { case Z_OK: remainingLength -= availableSize; @@ -688,30 +749,30 @@ namespace orc { readBuffer(true); availableSize = std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), remainingLength); - zstream.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); - zstream.avail_in = static_cast<uInt>(availableSize); + zstream_.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); + zstream_.avail_in = static_cast<uInt>(availableSize); break; case Z_STREAM_END: break; case Z_BUF_ERROR: - throw std::logic_error( + throw CompressionError( "Buffer error in " "ZlibDecompressionStream::NextDecompress"); case Z_DATA_ERROR: - throw std::logic_error( + throw CompressionError( "Data error in " "ZlibDecompressionStream::NextDecompress"); case Z_STREAM_ERROR: - throw std::logic_error( + throw CompressionError( "Stream error in " "ZlibDecompressionStream::NextDecompress"); default: - throw std::logic_error( + throw CompressionError( "Unknown error in " "ZlibDecompressionStream::NextDecompress"); } } while (result != Z_STREAM_END); - *size = static_cast<int>(outputDataBuffer.capacity() - zstream.avail_out); + *size = static_cast<int>(outputDataBuffer.capacity() - zstream_.avail_out); *data = outputBuffer; outputBufferLength = 0; outputBuffer += *size; @@ -742,14 +803,14 @@ namespace orc { private: // may need to stitch together multiple input buffers; // to give snappy a contiguous block - DataBuffer<char> inputDataBuffer; + DataBuffer<char> inputDataBuffer_; }; BlockDecompressionStream::BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, MemoryPool& _pool, - ReaderMetrics* _metrics) - : DecompressionStream(std::move(inStream), blockSize, _pool, _metrics), - inputDataBuffer(pool, blockSize) {} + size_t blockSize, MemoryPool& pool, + ReaderMetrics* metrics) + : DecompressionStream(std::move(inStream), blockSize, pool, metrics), + inputDataBuffer_(pool, blockSize) {} void BlockDecompressionStream::NextDecompress(const void** data, int* size, size_t availableSize) { @@ -759,18 +820,18 @@ namespace orc { inputBuffer += availableSize; } else { // Did not read enough from input. - if (inputDataBuffer.capacity() < remainingLength) { - inputDataBuffer.resize(remainingLength); + if (inputDataBuffer_.capacity() < remainingLength) { + inputDataBuffer_.resize(remainingLength); } - ::memcpy(inputDataBuffer.data(), inputBuffer, availableSize); + ::memcpy(inputDataBuffer_.data(), inputBuffer, availableSize); inputBuffer += availableSize; - compressed = inputDataBuffer.data(); + compressed = inputDataBuffer_.data(); for (size_t pos = availableSize; pos < remainingLength;) { readBuffer(true); size_t avail = std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), remainingLength - pos); - ::memcpy(inputDataBuffer.data() + pos, inputBuffer, avail); + ::memcpy(inputDataBuffer_.data() + pos, inputBuffer, avail); pos += avail; inputBuffer += avail; } @@ -788,8 +849,8 @@ namespace orc { class SnappyDecompressionStream : public BlockDecompressionStream { public: SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize, - MemoryPool& _pool, ReaderMetrics* _metrics) - : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) { + MemoryPool& pool, ReaderMetrics* metrics) + : BlockDecompressionStream(std::move(inStream), blockSize, pool, metrics) { // PASS } @@ -804,18 +865,18 @@ namespace orc { size_t maxOutputLength) override; }; - uint64_t SnappyDecompressionStream::decompress(const char* _input, uint64_t length, char* output, + uint64_t SnappyDecompressionStream::decompress(const char* input, uint64_t length, char* output, size_t maxOutputLength) { size_t outLength; - if (!snappy::GetUncompressedLength(_input, length, &outLength)) { + if (!snappy::GetUncompressedLength(input, length, &outLength)) { throw ParseError("SnappyDecompressionStream choked on corrupt input"); } if (outLength > maxOutputLength) { - throw std::logic_error("Snappy length exceeds block size"); + throw CompressionError("Snappy length exceeds block size"); } - if (!snappy::RawUncompress(_input, length, output)) { + if (!snappy::RawUncompress(input, length, output)) { throw ParseError("SnappyDecompressionStream choked on corrupt input"); } return outLength; @@ -824,8 +885,8 @@ namespace orc { class LzoDecompressionStream : public BlockDecompressionStream { public: LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize, - MemoryPool& _pool, ReaderMetrics* _metrics) - : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) { + MemoryPool& pool, ReaderMetrics* metrics) + : BlockDecompressionStream(std::move(inStream), blockSize, pool, metrics) { // PASS } @@ -848,8 +909,8 @@ namespace orc { class Lz4DecompressionStream : public BlockDecompressionStream { public: Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize, - MemoryPool& _pool, ReaderMetrics* _metrics) - : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) { + MemoryPool& pool, ReaderMetrics* metrics) + : BlockDecompressionStream(std::move(inStream), blockSize, pool, metrics) { // PASS } @@ -881,14 +942,23 @@ namespace orc { public: BlockCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity, uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics) - : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics), - compressorBuffer(pool) { + : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, blockSize, pool, + metrics), + compressorBuffer(pool), + rawInputBuffer(pool, blockSize) { // PASS } virtual bool Next(void** data, int* size) override; virtual void suppress() override; + virtual void BackUp(int count) override; + virtual uint64_t flush() override; virtual std::string getName() const override = 0; + uint64_t getRawInputBufferSize() const override { + return bufferSize; + } + + virtual void finishStream() override; protected: // compresses a block and returns the compressed size @@ -900,8 +970,23 @@ namespace orc { // should allocate max possible compressed size DataBuffer<unsigned char> compressorBuffer; + + // Buffer to hold uncompressed data until user calls Next() + DataBuffer<unsigned char> rawInputBuffer; }; + void BlockCompressionStream::BackUp(int count) { + if (count > bufferSize) { + throw CompressionError("Can't backup that much!"); + } + bufferSize -= count; + } + + uint64_t BlockCompressionStream::flush() { + finishStream(); + return BufferedOutputStream::flush(); + } + bool BlockCompressionStream::Next(void** data, int* size) { if (bufferSize != 0) { ensureHeader(); @@ -935,7 +1020,19 @@ namespace orc { void BlockCompressionStream::suppress() { compressorBuffer.resize(0); - CompressionStreamBase::suppress(); + outputBuffer = nullptr; + bufferSize = outputPosition = outputSize = 0; + BufferedOutputStream::suppress(); + } + + void BlockCompressionStream::finishStream() { + void* data; + int size; + if (!Next(&data, &size)) { + throw CompressionError("Failed to flush compression buffer."); + } + BufferedOutputStream::BackUp(outputSize - outputPosition); + bufferSize = outputSize = outputPosition = 0; } /** @@ -967,30 +1064,30 @@ namespace orc { private: void init(); void end(); - LZ4_stream_t* state; + LZ4_stream_t* state_; }; uint64_t Lz4CompressionSteam::doBlockCompression() { int result = LZ4_compress_fast_extState( - static_cast<void*>(state), reinterpret_cast<const char*>(rawInputBuffer.data()), + static_cast<void*>(state_), reinterpret_cast<const char*>(rawInputBuffer.data()), reinterpret_cast<char*>(compressorBuffer.data()), bufferSize, static_cast<int>(compressorBuffer.size()), level); if (result == 0) { - throw std::runtime_error("Error during block compression using lz4."); + throw CompressionError("Error during block compression using lz4."); } return static_cast<uint64_t>(result); } void Lz4CompressionSteam::init() { - state = LZ4_createStream(); - if (!state) { - throw std::runtime_error("Error while allocating state for lz4."); + state_ = LZ4_createStream(); + if (!state_) { + throw CompressionError("Error while allocating state for lz4."); } } void Lz4CompressionSteam::end() { - (void)LZ4_freeStream(state); - state = nullptr; + (void)LZ4_freeStream(state_); + state_ = nullptr; } /** @@ -1055,11 +1152,11 @@ namespace orc { private: void init(); void end(); - ZSTD_CCtx* cctx; + ZSTD_CCtx* cctx_; }; uint64_t ZSTDCompressionStream::doBlockCompression() { - return ZSTD_compressCCtx(cctx, compressorBuffer.data(), compressorBuffer.size(), + return ZSTD_compressCCtx(cctx_, compressorBuffer.data(), compressorBuffer.size(), rawInputBuffer.data(), static_cast<size_t>(bufferSize), level); } @@ -1070,15 +1167,15 @@ namespace orc { #endif void ZSTDCompressionStream::init() { - cctx = ZSTD_createCCtx(); - if (!cctx) { - throw std::runtime_error("Error while calling ZSTD_createCCtx() for zstd."); + cctx_ = ZSTD_createCCtx(); + if (!cctx_) { + throw CompressionError("Error while calling ZSTD_createCCtx() for zstd."); } } void ZSTDCompressionStream::end() { - (void)ZSTD_freeCCtx(cctx); - cctx = nullptr; + (void)ZSTD_freeCCtx(cctx_); + cctx_ = nullptr; } DIAGNOSTIC_PUSH @@ -1089,8 +1186,8 @@ namespace orc { class ZSTDDecompressionStream : public BlockDecompressionStream { public: ZSTDDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize, - MemoryPool& _pool, ReaderMetrics* _metrics) - : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) { + MemoryPool& pool, ReaderMetrics* metrics) + : BlockDecompressionStream(std::move(inStream), blockSize, pool, metrics) { this->init(); } @@ -1111,13 +1208,13 @@ namespace orc { private: void init(); void end(); - ZSTD_DCtx* dctx; + ZSTD_DCtx* dctx_; }; uint64_t ZSTDDecompressionStream::decompress(const char* inputPtr, uint64_t length, char* output, size_t maxOutputLength) { return static_cast<uint64_t>( - ZSTD_decompressDCtx(dctx, output, maxOutputLength, inputPtr, length)); + ZSTD_decompressDCtx(dctx_, output, maxOutputLength, inputPtr, length)); } DIAGNOSTIC_PUSH @@ -1127,25 +1224,23 @@ namespace orc { #endif void ZSTDDecompressionStream::init() { - dctx = ZSTD_createDCtx(); - if (!dctx) { - throw std::runtime_error("Error while calling ZSTD_createDCtx() for zstd."); + dctx_ = ZSTD_createDCtx(); + if (!dctx_) { + throw CompressionError("Error while calling ZSTD_createDCtx() for zstd."); } } void ZSTDDecompressionStream::end() { - (void)ZSTD_freeDCtx(dctx); - dctx = nullptr; + (void)ZSTD_freeDCtx(dctx_); + dctx_ = nullptr; } DIAGNOSTIC_PUSH - std::unique_ptr<BufferedOutputStream> createCompressor(CompressionKind kind, - OutputStream* outStream, - CompressionStrategy strategy, - uint64_t bufferCapacity, - uint64_t compressionBlockSize, - MemoryPool& pool, WriterMetrics* metrics) { + std::unique_ptr<BufferedOutputStream> createCompressor( + CompressionKind kind, OutputStream* outStream, CompressionStrategy strategy, + uint64_t bufferCapacity, uint64_t compressionBlockSize, uint64_t memoryBlockSize, + MemoryPool& pool, WriterMetrics* metrics) { switch (static_cast<int64_t>(kind)) { case CompressionKind_NONE: { return std::make_unique<BufferedOutputStream>(pool, outStream, bufferCapacity, @@ -1154,8 +1249,8 @@ namespace orc { case CompressionKind_ZLIB: { int level = (strategy == CompressionStrategy_SPEED) ? Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION; - return std::make_unique<ZlibCompressionStream>(outStream, level, bufferCapacity, - compressionBlockSize, pool, metrics); + return std::make_unique<ZlibCompressionStream>( + outStream, level, bufferCapacity, compressionBlockSize, memoryBlockSize, pool, metrics); } case CompressionKind_ZSTD: { int level = (strategy == CompressionStrategy_SPEED) ? 1 : ZSTD_CLEVEL_DEFAULT; diff --git a/contrib/libs/apache/orc/c++/src/Compression.hh b/contrib/libs/apache/orc/c++/src/Compression.hh index 55b152dd63..24170c56b4 100644 --- a/contrib/libs/apache/orc/c++/src/Compression.hh +++ b/contrib/libs/apache/orc/c++/src/Compression.hh @@ -42,15 +42,16 @@ namespace orc { * @param outStream the output stream that is the underlying target * @param strategy compression strategy * @param bufferCapacity compression stream buffer total capacity - * @param compressionBlockSize compression buffer block size + * @param compressionBlockSize compression is triggered when the original input buffer size + * reaches this size + * @param memoryBlockSize the block size for original input buffer * @param pool the memory pool + * @param metrics the writer metrics */ - std::unique_ptr<BufferedOutputStream> createCompressor(CompressionKind kind, - OutputStream* outStream, - CompressionStrategy strategy, - uint64_t bufferCapacity, - uint64_t compressionBlockSize, - MemoryPool& pool, WriterMetrics* metrics); + std::unique_ptr<BufferedOutputStream> createCompressor( + CompressionKind kind, OutputStream* outStream, CompressionStrategy strategy, + uint64_t bufferCapacity, uint64_t compressionBlockSize, uint64_t memoryBlockSize, + MemoryPool& pool, WriterMetrics* metrics); } // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc b/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc index 459cafa1a0..a9003bc163 100644 --- a/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc +++ b/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc @@ -17,15 +17,18 @@ */ #include "ConvertColumnReader.hh" +#include "Utils.hh" + +#include <optional> namespace orc { // Assume that we are using tight numeric vector batch using BooleanVectorBatch = ByteVectorBatch; - ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType, - StripeStreams& stripe, bool _throwOnOverflow) - : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) { + ConvertColumnReader::ConvertColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ColumnReader(readType, stripe), readType(readType), throwOnOverflow(throwOnOverflow) { reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true, /*throwOnOverflow=*/false, /*convertToReadType*/ false); data = @@ -72,6 +75,23 @@ namespace orc { } } + static inline void handleParseFromStringError(ColumnVectorBatch& dstBatch, uint64_t idx, + bool shouldThrow, const std::string& typeName, + const std::string& str, + const std::string& expectedFormat = "") { + if (!shouldThrow) { + dstBatch.notNull.data()[idx] = 0; + dstBatch.hasNulls = true; + } else { + std::ostringstream ss; + ss << "Failed to parse " << typeName << " from string:" << str; + if (expectedFormat != "") { + ss << " the following format \"" << expectedFormat << "\" is expected"; + } + throw SchemaEvolutionError(ss.str()); + } + } + // return false if overflow template <typename ReadType> static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) { @@ -135,9 +155,9 @@ namespace orc { template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType> class NumericConvertColumnReader : public ConvertColumnReader { public: - NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe, - bool _throwOnOverflow) - : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {} + NumericConvertColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe, + bool throwOnOverflow) + : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {} void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { ConvertColumnReader::next(rowBatch, numValues, notNull); @@ -164,9 +184,9 @@ namespace orc { class NumericConvertColumnReader<FileTypeBatch, BooleanVectorBatch, bool> : public ConvertColumnReader { public: - NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe, - bool _throwOnOverflow) - : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {} + NumericConvertColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe, + bool throwOnOverflow) + : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {} void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { ConvertColumnReader::next(rowBatch, numValues, notNull); @@ -188,9 +208,9 @@ namespace orc { class ConvertToStringVariantColumnReader : public ConvertColumnReader { public: - ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType, - StripeStreams& stripe, bool _throwOnOverflow) - : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {} + ConvertToStringVariantColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {} void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; @@ -225,19 +245,19 @@ namespace orc { class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader { public: - BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType, - StripeStreams& stripe, bool _throwOnOverflow) - : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) { - trueValue = "TRUE"; - falseValue = "FALSE"; + BooleanToStringVariantColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertToStringVariantColumnReader(readType, fileType, stripe, throwOnOverflow) { + trueValue_ = "TRUE"; + falseValue_ = "FALSE"; if (readType.getKind() == CHAR || readType.getKind() == VARCHAR) { if (readType.getMaximumLength() < 5) { throw SchemaEvolutionError("Invalid maximum length for boolean type: " + std::to_string(readType.getMaximumLength())); } if (readType.getKind() == CHAR) { - trueValue.resize(readType.getMaximumLength(), ' '); - falseValue.resize(readType.getMaximumLength(), ' '); + trueValue_.resize(readType.getMaximumLength(), ' '); + falseValue_.resize(readType.getMaximumLength(), ' '); } } } @@ -245,8 +265,8 @@ namespace orc { uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override; private: - std::string trueValue; - std::string falseValue; + std::string trueValue_; + std::string falseValue_; }; uint64_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch, @@ -257,7 +277,7 @@ namespace orc { // cast the bool value to string for (uint64_t i = 0; i < numValues; ++i) { if (!rowBatch.hasNulls || rowBatch.notNull[i]) { - strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue); + strBuffer[i] = (srcBatch.data[i] ? trueValue_ : falseValue_); size += strBuffer[i].size(); } } @@ -267,9 +287,9 @@ namespace orc { template <typename FileTypeBatch> class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader { public: - NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType, - StripeStreams& stripe, bool _throwOnOverflow) - : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {} + NumericToStringVariantColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertToStringVariantColumnReader(readType, fileType, stripe, throwOnOverflow) {} uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override; }; @@ -321,13 +341,13 @@ namespace orc { template <typename FileTypeBatch, typename ReadTypeBatch, bool isFloatingFileType> class NumericToDecimalColumnReader : public ConvertColumnReader { public: - NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe, - bool _throwOnOverflow) - : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) { - precision = static_cast<int32_t>(readType.getPrecision()); - scale = static_cast<int32_t>(readType.getScale()); + NumericToDecimalColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe, + bool throwOnOverflow) + : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) { + precision_ = static_cast<int32_t>(readType.getPrecision()); + scale_ = static_cast<int32_t>(readType.getScale()); bool overflow = false; - upperBound = scaleUpInt128ByPowerOfTen(1, precision, overflow); + upperBound_ = scaleUpInt128ByPowerOfTen(1, precision_, overflow); } void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { @@ -335,8 +355,8 @@ namespace orc { const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get()); auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch); - dstBatch.precision = precision; - dstBatch.scale = scale; + dstBatch.precision = precision_; + dstBatch.scale = scale_; for (uint64_t i = 0; i < numValues; ++i) { if (!rowBatch.hasNulls || rowBatch.notNull[i]) { if constexpr (isFloatingFileType) { @@ -351,7 +371,7 @@ namespace orc { private: template <typename SrcType> void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, SrcType value) { - const auto result = convertDecimal(value, precision, scale); + const auto result = convertDecimal(value, precision_, scale_); Int128 i128 = result.second; if (result.first) { handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow); @@ -372,7 +392,7 @@ namespace orc { template <typename SrcType> void convertIntegerToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, SrcType value) { int fromScale = 0; - auto result = convertDecimal(value, fromScale, precision, scale); + auto result = convertDecimal(value, fromScale, precision_, scale_); if (result.first) { handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow); } else { @@ -388,24 +408,25 @@ namespace orc { } } - int32_t precision; - int32_t scale; - int64_t scaleMultiplier; - Int128 upperBound; + int32_t precision_; + int32_t scale_; + int64_t scaleMultiplier_; + Int128 upperBound_; }; class ConvertToTimestampColumnReader : public ConvertColumnReader { public: - ConvertToTimestampColumnReader(const Type& _readType, const Type& fileType, - StripeStreams& stripe, bool _throwOnOverflow) - : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow), - readerTimezone(readType.getKind() == TIMESTAMP_INSTANT ? &getTimezoneByName("GMT") - : &stripe.getReaderTimezone()), + ConvertToTimestampColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow), + isInstant(readType.getKind() == TIMESTAMP_INSTANT), + readerTimezone(isInstant ? &getTimezoneByName("GMT") : &stripe.getReaderTimezone()), needConvertTimezone(readerTimezone != &getTimezoneByName("GMT")) {} void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; protected: + const bool isInstant; const orc::Timezone* readerTimezone; const bool needConvertTimezone; }; @@ -419,9 +440,9 @@ namespace orc { template <typename FileTypeBatch> class NumericToTimestampColumnReader : public ConvertToTimestampColumnReader { public: - NumericToTimestampColumnReader(const Type& _readType, const Type& fileType, - StripeStreams& stripe, bool _throwOnOverflow) - : ConvertToTimestampColumnReader(_readType, fileType, stripe, _throwOnOverflow) {} + NumericToTimestampColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertToTimestampColumnReader(readType, fileType, stripe, throwOnOverflow) {} void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull); @@ -469,14 +490,14 @@ namespace orc { template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType> class DecimalToNumericColumnReader : public ConvertColumnReader { public: - DecimalToNumericColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe, - bool _throwOnOverflow) - : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) { - precision = fileType.getPrecision(); - scale = fileType.getScale(); - factor = 1; - for (int i = 0; i < scale; i++) { - factor *= 10; + DecimalToNumericColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe, + bool throwOnOverflow) + : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) { + precision_ = fileType.getPrecision(); + scale_ = fileType.getScale(); + factor_ = 1; + for (int i = 0; i < scale_; i++) { + factor_ *= 10; } } @@ -500,7 +521,7 @@ namespace orc { void convertDecimalToInteger(ReadTypeBatch& dstBatch, uint64_t idx, const FileTypeBatch& srcBatch) { using FileType = decltype(srcBatch.values[idx]); - Int128 result = scaleDownInt128ByPowerOfTen(srcBatch.values[idx], scale); + Int128 result = scaleDownInt128ByPowerOfTen(srcBatch.values[idx], scale_); if (!result.fitsInLong()) { handleOverflow<FileType, ReadType>(dstBatch, idx, throwOnOverflow); return; @@ -512,21 +533,21 @@ namespace orc { void convertDecimalToDouble(ReadTypeBatch& dstBatch, uint64_t idx, const FileTypeBatch& srcBatch) { double doubleValue = Int128(srcBatch.values[idx]).toDouble(); - dstBatch.data[idx] = static_cast<ReadType>(doubleValue) / static_cast<ReadType>(factor); + dstBatch.data[idx] = static_cast<ReadType>(doubleValue) / static_cast<ReadType>(factor_); } - int32_t precision; - int32_t scale; - int64_t factor; + int32_t precision_; + int32_t scale_; + int64_t factor_; }; template <typename FileTypeBatch> class DecimalToNumericColumnReader<FileTypeBatch, BooleanVectorBatch, bool> : public ConvertColumnReader { public: - DecimalToNumericColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe, - bool _throwOnOverflow) - : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {} + DecimalToNumericColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe, + bool throwOnOverflow) + : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {} void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { ConvertColumnReader::next(rowBatch, numValues, notNull); @@ -544,13 +565,13 @@ namespace orc { template <typename FileTypeBatch, typename ReadTypeBatch> class DecimalConvertColumnReader : public ConvertColumnReader { public: - DecimalConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe, - bool _throwOnOverflow) - : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) { - fromPrecision = fileType.getPrecision(); - fromScale = fileType.getScale(); - toPrecision = _readType.getPrecision(); - toScale = _readType.getScale(); + DecimalConvertColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe, + bool throwOnOverflow) + : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) { + fromPrecision_ = fileType.getPrecision(); + fromScale_ = fileType.getScale(); + toPrecision_ = readType.getPrecision(); + toScale_ = readType.getScale(); } void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { @@ -572,7 +593,7 @@ namespace orc { using ReadType = decltype(dstBatch.values[idx]); auto [overflows, resultI128] = - convertDecimal(srcBatch.values[idx], fromScale, toPrecision, toScale); + convertDecimal(srcBatch.values[idx], fromScale_, toPrecision_, toScale_); if (overflows) { handleOverflow<FileType, ReadType>(dstBatch, idx, throwOnOverflow); } @@ -587,10 +608,423 @@ namespace orc { } } - int32_t fromPrecision; - int32_t fromScale; - int32_t toPrecision; - int32_t toScale; + int32_t fromPrecision_; + int32_t fromScale_; + int32_t toPrecision_; + int32_t toScale_; + }; + + template <typename FileTypeBatch> + class DecimalToTimestampColumnReader : public ConvertToTimestampColumnReader { + public: + DecimalToTimestampColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertToTimestampColumnReader(readType, fileType, stripe, throwOnOverflow), + precision_(static_cast<int32_t>(fileType.getPrecision())), + scale_(static_cast<int32_t>(fileType.getScale())) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertColumnReader::next(rowBatch, numValues, notNull); + const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get()); + auto& dstBatch = *SafeCastBatchTo<TimestampVectorBatch*>(&rowBatch); + for (uint64_t i = 0; i < rowBatch.numElements; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + convertDecimalToTimestamp(dstBatch, i, srcBatch); + } + } + } + + private: + void convertDecimalToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx, + const FileTypeBatch& srcBatch) { + constexpr int SecondToNanoFactor = 9; + // Following constant comes from java.time.Instant + // '-1000000000-01-01T00:00Z' + constexpr int64_t MIN_EPOCH_SECONDS = -31557014167219200L; + // '1000000000-12-31T23:59:59.999999999Z' + constexpr int64_t MAX_EPOCH_SECONDS = 31556889864403199L; + // dummy variable, there's no risk of overflow + bool overflow = false; + + Int128 i128(srcBatch.values[idx]); + Int128 integerPortion = scaleDownInt128ByPowerOfTen(i128, scale_); + if (integerPortion < MIN_EPOCH_SECONDS || integerPortion > MAX_EPOCH_SECONDS) { + handleOverflow<Decimal, int64_t>(dstBatch, idx, throwOnOverflow); + return; + } + i128 -= scaleUpInt128ByPowerOfTen(integerPortion, scale_, overflow); + Int128 fractionPortion = std::move(i128); + if (scale_ < SecondToNanoFactor) { + fractionPortion = + scaleUpInt128ByPowerOfTen(fractionPortion, SecondToNanoFactor - scale_, overflow); + } else { + fractionPortion = scaleDownInt128ByPowerOfTen(fractionPortion, scale_ - SecondToNanoFactor); + } + if (fractionPortion < 0) { + fractionPortion += 1e9; + integerPortion -= 1; + } + // line 630 has guaranteed toLong() will not overflow + dstBatch.data[idx] = integerPortion.toLong(); + dstBatch.nanoseconds[idx] = fractionPortion.toLong(); + + if (needConvertTimezone) { + dstBatch.data[idx] = readerTimezone->convertFromUTC(dstBatch.data[idx]); + } + } + + const int32_t precision_; + const int32_t scale_; + }; + + template <typename FileTypeBatch> + class DecimalToStringVariantColumnReader : public ConvertToStringVariantColumnReader { + public: + DecimalToStringVariantColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertToStringVariantColumnReader(readType, fileType, stripe, throwOnOverflow), + scale_(fileType.getScale()) {} + + uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override { + uint64_t size = 0; + strBuffer.resize(numValues); + const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get()); + if (readType.getKind() == STRING) { + for (uint64_t i = 0; i < rowBatch.numElements; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + strBuffer[i] = Int128(srcBatch.values[i]).toDecimalString(scale_, true); + size += strBuffer[i].size(); + } + } + } else { + const auto maxLength = readType.getMaximumLength(); + for (uint64_t i = 0; i < rowBatch.numElements; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + strBuffer[i] = Int128(srcBatch.values[i]).toDecimalString(scale_, true); + } + if (strBuffer[i].size() > maxLength) { + strBuffer[i].resize(maxLength); + } + size += strBuffer[i].size(); + } + } + return size; + } + + private: + const int32_t scale_; + }; + + template <typename ReadTypeBatch, typename ReadType> + class StringVariantToNumericColumnReader : public ConvertColumnReader { + public: + StringVariantToNumericColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertColumnReader::next(rowBatch, numValues, notNull); + + const auto& srcBatch = *SafeCastBatchTo<const StringVectorBatch*>(data.get()); + auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch); + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + if constexpr (std::is_floating_point_v<ReadType>) { + convertToDouble(dstBatch, srcBatch, i); + } else { + convertToInteger(dstBatch, srcBatch, i); + } + } + } + } + + private: + void convertToInteger(ReadTypeBatch& dstBatch, const StringVectorBatch& srcBatch, + uint64_t idx) { + int64_t longValue = 0; + const std::string longStr(srcBatch.data[idx], srcBatch.length[idx]); + try { + longValue = std::stoll(longStr); + } catch (...) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Long", longStr); + return; + } + if constexpr (std::is_same_v<ReadType, bool>) { + dstBatch.data[idx] = longValue == 0 ? 0 : 1; + } else { + if (!downCastToInteger(dstBatch.data[idx], longValue)) { + handleOverflow<std::string, ReadType>(dstBatch, idx, throwOnOverflow); + } + } + } + + void convertToDouble(ReadTypeBatch& dstBatch, const StringVectorBatch& srcBatch, uint64_t idx) { + const std::string floatValue(srcBatch.data[idx], srcBatch.length[idx]); + try { + if constexpr (std::is_same_v<ReadType, float>) { + dstBatch.data[idx] = std::stof(floatValue); + } else { + dstBatch.data[idx] = std::stod(floatValue); + } + } catch (...) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, typeid(readType).name(), + floatValue); + } + } + }; + + class StringVariantConvertColumnReader : public ConvertToStringVariantColumnReader { + public: + StringVariantConvertColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertToStringVariantColumnReader(readType, fileType, stripe, throwOnOverflow) {} + + uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override { + uint64_t size = 0; + strBuffer.resize(numValues); + const auto& srcBatch = *SafeCastBatchTo<const StringVectorBatch*>(data.get()); + const auto maxLength = readType.getMaximumLength(); + if (readType.getKind() == STRING) { + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + strBuffer[i] = std::string(srcBatch.data[i], srcBatch.length[i]); + size += strBuffer[i].size(); + } + } + } else if (readType.getKind() == VARCHAR) { + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + const char* charData = srcBatch.data[i]; + uint64_t originLength = srcBatch.length[i]; + uint64_t itemLength = Utf8Utils::truncateBytesTo(maxLength, charData, originLength); + strBuffer[i] = std::string(charData, itemLength); + size += strBuffer[i].length(); + } + } + } else if (readType.getKind() == CHAR) { + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + const char* charData = srcBatch.data[i]; + uint64_t originLength = srcBatch.length[i]; + uint64_t charLength = Utf8Utils::charLength(charData, originLength); + auto itemLength = Utf8Utils::truncateBytesTo(maxLength, charData, originLength); + strBuffer[i] = std::string(srcBatch.data[i], itemLength); + // the padding is exactly 1 byte per char + if (charLength < maxLength) { + strBuffer[i].resize(itemLength + maxLength - charLength, ' '); + } + size += strBuffer[i].length(); + } + } + } else { + throw SchemaEvolutionError("Invalid type for numeric to string conversion: " + + readType.toString()); + } + return size; + } + }; + + class StringVariantToTimestampColumnReader : public ConvertToTimestampColumnReader { + public: + StringVariantToTimestampColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertToTimestampColumnReader(readType, fileType, stripe, throwOnOverflow) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull); + + const auto& srcBatch = *SafeCastBatchTo<const StringVectorBatch*>(data.get()); + auto& dstBatch = *SafeCastBatchTo<TimestampVectorBatch*>(&rowBatch); + + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + convertToTimestamp(dstBatch, i, std::string(srcBatch.data[i], srcBatch.length[i])); + } + } + } + + private: + // Algorithm: http://howardhinnant.github.io/date_algorithms.html + // The algorithm implements a proleptic Gregorian calendar. + int64_t daysFromProlepticGregorianCalendar(int32_t y, int32_t m, int32_t d) { + y -= m <= 2; + int32_t era = y / 400; + int32_t yoe = y - era * 400; // [0, 399] + int32_t doy = (153 * (m + (m > 2 ? -3 : 9)) + 2) / 5 + d - 1; // [0, 365] + int32_t doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; // [0, 146096] + return 1ll * era * 146097 + doe - 719468; + } + + std::optional<std::pair<int64_t, int64_t>> tryBestToParseFromString( + const std::string& timeStr) { + int32_t year, month, day, hour, min, sec, nanos = 0; + int32_t matched = std::sscanf(timeStr.c_str(), "%4d-%2d-%2d %2d:%2d:%2d.%d", &year, &month, + &day, &hour, &min, &sec, &nanos); + if (matched != 6 && matched != 7) { + return std::nullopt; + } + if (nanos) { + if (nanos < 0 || nanos >= 1e9) { + return std::nullopt; + } + while (nanos < static_cast<int64_t>(1e8)) { + nanos *= 10; + } + } + int64_t daysSinceEpoch = daysFromProlepticGregorianCalendar(year, month, day); + int64_t secondSinceEpoch = 60ll * (60 * (24L * daysSinceEpoch + hour) + min) + sec; + return std::make_optional(std::pair<int64_t, int64_t>{secondSinceEpoch, nanos}); + } + + void convertToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx, + const std::string& timeStr) { + // Expected timestamp_instant format string : yyyy-mm-dd hh:mm:ss[.xxx] timezone + // Eg. "2019-07-09 13:11:00 America/Los_Angeles" + // Expected timestamp format string : yyyy-mm-dd hh:mm:ss[.xxx] + // Eg. "2019-07-09 13:11:00" + static std::string expectedTimestampInstantFormat = "yyyy-mm-dd hh:mm:ss[.xxx] timezone"; + static std::string expectedTimestampFormat = "yyyy-mm-dd hh:mm:ss[.xxx]"; + auto timestamp = tryBestToParseFromString(timeStr); + if (!timestamp.has_value()) { + if (!isInstant) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp", timeStr, + expectedTimestampFormat); + return; + } + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr, + expectedTimestampInstantFormat); + return; + } + + auto& [second, nanos] = timestamp.value(); + + if (isInstant) { + size_t pos = 0; // get the name of timezone + pos = timeStr.find(' ', pos) + 1; + pos = timeStr.find(' ', pos); + if (pos == std::string::npos) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr, + expectedTimestampInstantFormat); + return; + } + pos += 1; + size_t subStrLength = timeStr.length() - pos; + try { + second = getTimezoneByName(timeStr.substr(pos, subStrLength)).convertFromUTC(second); + } catch (const TimezoneError&) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr, + expectedTimestampInstantFormat); + return; + } + } else { + if (needConvertTimezone) { + second = readerTimezone->convertFromUTC(second); + } + } + dstBatch.data[idx] = second; + dstBatch.nanoseconds[idx] = nanos; + } + }; + + template <typename ReadTypeBatch> + class StringVariantToDecimalColumnReader : public ConvertColumnReader { + public: + StringVariantToDecimalColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow), + precision_(static_cast<int32_t>(readType.getPrecision())), + scale_(static_cast<int32_t>(readType.getScale())) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertColumnReader::next(rowBatch, numValues, notNull); + + const auto& srcBatch = *SafeCastBatchTo<const StringVectorBatch*>(data.get()); + auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch); + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + convertToDecimal(dstBatch, i, std::string(srcBatch.data[i], srcBatch.length[i])); + } + } + } + + private: + void convertToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, const std::string& decimalStr) { + constexpr int32_t MAX_PRECISION_128 = 38; + int32_t fromPrecision = 0; + int32_t fromScale = 0; + uint32_t start = 0; + bool negative = false; + if (decimalStr.empty()) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr); + return; + } + auto dotPos = decimalStr.find('.'); + if (dotPos == std::string::npos) { + fromScale = 0; + fromPrecision = decimalStr.length(); + dotPos = decimalStr.length(); + } else { + if (dotPos + 1 == decimalStr.length()) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr); + return; + } + fromPrecision = decimalStr.length() - 1; + fromScale = decimalStr.length() - dotPos - 1; + } + if (decimalStr.front() == '-') { + negative = true; + start++; + fromPrecision--; + } + const std::string integerPortion = decimalStr.substr(start, dotPos - start); + if (dotPos == start || fromPrecision > MAX_PRECISION_128 || fromPrecision <= 0 || + !std::all_of(integerPortion.begin(), integerPortion.end(), ::isdigit)) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr); + return; + } + + Int128 i128; + try { + bool overflow = false; + i128 = Int128(integerPortion); + // overflow won't happen + i128 *= scaleUpInt128ByPowerOfTen(Int128(1), fromScale, overflow); + } catch (const std::exception& e) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr); + return; + } + if (dotPos + 1 < decimalStr.length()) { + const std::string fractionPortion = decimalStr.substr(dotPos + 1, fromScale); + if (!std::all_of(fractionPortion.begin(), fractionPortion.end(), ::isdigit)) { + handleOverflow<std::string, Int128>(dstBatch, idx, throwOnOverflow); + return; + } + i128 += Int128(fractionPortion); + } + + auto [overflow, result] = convertDecimal(i128, fromScale, precision_, scale_); + if (overflow) { + handleOverflow<std::string, Int128>(dstBatch, idx, throwOnOverflow); + return; + } + if (negative) { + result.negate(); + } + + if constexpr (std::is_same_v<ReadTypeBatch, Decimal128VectorBatch>) { + dstBatch.values[idx] = result; + } else { + if (!result.fitsInLong()) { + handleOverflow<std::string, decltype(dstBatch.values[idx])>(dstBatch, idx, + throwOnOverflow); + } else { + dstBatch.values[idx] = result.toLong(); + } + } + } + + const int32_t precision_; + const int32_t scale_; }; #define DEFINE_NUMERIC_CONVERT_READER(FROM, TO, TYPE) \ @@ -621,6 +1055,26 @@ namespace orc { using Decimal128##To##TO##ColumnReader = \ DecimalConvertColumnReader<Decimal128VectorBatch, TO##VectorBatch>; +#define DEFINE_DECIMAL_CONVERT_TO_TIMESTAMP_READER \ + using Decimal64ToTimestampColumnReader = DecimalToTimestampColumnReader<Decimal64VectorBatch>; \ + using Decimal128ToTimestampColumnReader = DecimalToTimestampColumnReader<Decimal128VectorBatch>; + +#define DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(TO) \ + using Decimal64To##TO##ColumnReader = DecimalToStringVariantColumnReader<Decimal64VectorBatch>; \ + using Decimal128To##TO##ColumnReader = DecimalToStringVariantColumnReader<Decimal128VectorBatch>; + +#define DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(FROM, TO, TYPE) \ + using FROM##To##TO##ColumnReader = StringVariantToNumericColumnReader<TO##VectorBatch, TYPE>; + +#define DEFINE_STRING_VARIANT_CONVERT_READER(FROM, TO) \ + using FROM##To##TO##ColumnReader = StringVariantConvertColumnReader; + +#define DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(FROM, TO) \ + using FROM##To##TO##ColumnReader = StringVariantToTimestampColumnReader; + +#define DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(FROM, TO) \ + using FROM##To##TO##ColumnReader = StringVariantToDecimalColumnReader<TO##VectorBatch>; + DEFINE_NUMERIC_CONVERT_READER(Boolean, Byte, int8_t) DEFINE_NUMERIC_CONVERT_READER(Boolean, Short, int16_t) DEFINE_NUMERIC_CONVERT_READER(Boolean, Int, int32_t) @@ -720,8 +1174,62 @@ namespace orc { DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER(Decimal64) DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER(Decimal128) + DEFINE_DECIMAL_CONVERT_TO_TIMESTAMP_READER + DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(String) + DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(Char) + DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(Varchar) + + // String variant to numeric + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Boolean, bool) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Byte, int8_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Short, int16_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Int, int32_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Long, int64_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Float, float) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Double, double) + + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Boolean, bool) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Byte, int8_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Short, int16_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Int, int32_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Long, int64_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Float, float) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Double, double) + + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Boolean, bool) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Byte, int8_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Short, int16_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Int, int32_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Long, int64_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Float, float) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Double, double) + + // String variant to string variant + DEFINE_STRING_VARIANT_CONVERT_READER(String, String) + DEFINE_STRING_VARIANT_CONVERT_READER(String, Char) + DEFINE_STRING_VARIANT_CONVERT_READER(String, Varchar) + DEFINE_STRING_VARIANT_CONVERT_READER(Char, Char) + DEFINE_STRING_VARIANT_CONVERT_READER(Char, String) + DEFINE_STRING_VARIANT_CONVERT_READER(Char, Varchar) + DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, String) + DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, Char) + DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, Varchar) + + // String variant to timestamp + DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(String, Timestamp) + DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(Char, Timestamp) + DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(Varchar, Timestamp) + + // String variant to decimal + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(String, Decimal64) + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(String, Decimal128) + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Char, Decimal64) + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Char, Decimal128) + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Varchar, Decimal64) + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Varchar, Decimal128) + #define CREATE_READER(NAME) \ - return std::make_unique<NAME>(_readType, fileType, stripe, throwOnOverflow); + return std::make_unique<NAME>(readType, fileType, stripe, throwOnOverflow); #define CASE_CREATE_READER(TYPE, CONVERT) \ case TYPE: \ @@ -744,7 +1252,7 @@ namespace orc { #define CASE_CREATE_DECIMAL_READER(FROM) \ case DECIMAL: { \ - if (isDecimal64(_readType)) { \ + if (isDecimal64(readType)) { \ CREATE_READER(FROM##ToDecimal64ColumnReader) \ } else { \ CREATE_READER(FROM##ToDecimal128ColumnReader) \ @@ -754,7 +1262,7 @@ namespace orc { #define CASE_EXCEPTION \ default: \ throw SchemaEvolutionError("Cannot convert from " + fileType.toString() + " to " + \ - _readType.toString()); + readType.toString()); std::unique_ptr<ColumnReader> buildConvertReader(const Type& fileType, StripeStreams& stripe, bool useTightNumericVector, @@ -764,11 +1272,11 @@ namespace orc { "SchemaEvolution only support tight vector, please create ColumnVectorBatch with " "option useTightNumericVector"); } - const auto& _readType = *stripe.getSchemaEvolution()->getReadType(fileType); + const auto& readType = *stripe.getSchemaEvolution()->getReadType(fileType); switch (fileType.getKind()) { case BOOLEAN: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BYTE, BooleanToByte) CASE_CREATE_READER(SHORT, BooleanToShort) CASE_CREATE_READER(INT, BooleanToInt) @@ -792,7 +1300,7 @@ namespace orc { } } case BYTE: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, ByteToBoolean) CASE_CREATE_READER(SHORT, ByteToShort) CASE_CREATE_READER(INT, ByteToInt) @@ -816,7 +1324,7 @@ namespace orc { } } case SHORT: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, ShortToBoolean) CASE_CREATE_READER(BYTE, ShortToByte) CASE_CREATE_READER(INT, ShortToInt) @@ -840,7 +1348,7 @@ namespace orc { } } case INT: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, IntToBoolean) CASE_CREATE_READER(BYTE, IntToByte) CASE_CREATE_READER(SHORT, IntToShort) @@ -864,7 +1372,7 @@ namespace orc { } } case LONG: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, LongToBoolean) CASE_CREATE_READER(BYTE, LongToByte) CASE_CREATE_READER(SHORT, LongToShort) @@ -888,7 +1396,7 @@ namespace orc { } } case FLOAT: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, FloatToBoolean) CASE_CREATE_READER(BYTE, FloatToByte) CASE_CREATE_READER(SHORT, FloatToShort) @@ -912,7 +1420,7 @@ namespace orc { } } case DOUBLE: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, DoubleToBoolean) CASE_CREATE_READER(BYTE, DoubleToByte) CASE_CREATE_READER(SHORT, DoubleToShort) @@ -935,15 +1443,8 @@ namespace orc { CASE_EXCEPTION } } - case STRING: - case BINARY: - case TIMESTAMP: - case LIST: - case MAP: - case STRUCT: - case UNION: case DECIMAL: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_FROM_DECIMAL_READER(BOOLEAN, Boolean) CASE_CREATE_FROM_DECIMAL_READER(BYTE, Byte) CASE_CREATE_FROM_DECIMAL_READER(SHORT, Short) @@ -951,26 +1452,26 @@ namespace orc { CASE_CREATE_FROM_DECIMAL_READER(LONG, Long) CASE_CREATE_FROM_DECIMAL_READER(FLOAT, Float) CASE_CREATE_FROM_DECIMAL_READER(DOUBLE, Double) + CASE_CREATE_FROM_DECIMAL_READER(STRING, String) + CASE_CREATE_FROM_DECIMAL_READER(CHAR, Char) + CASE_CREATE_FROM_DECIMAL_READER(VARCHAR, Varchar) + CASE_CREATE_FROM_DECIMAL_READER(TIMESTAMP, Timestamp) + CASE_CREATE_FROM_DECIMAL_READER(TIMESTAMP_INSTANT, Timestamp) case DECIMAL: { if (isDecimal64(fileType)) { - if (isDecimal64(_readType)) { + if (isDecimal64(readType)) { CREATE_READER(Decimal64ToDecimal64ColumnReader) } else { CREATE_READER(Decimal64ToDecimal128ColumnReader) } } else { - if (isDecimal64(_readType)) { + if (isDecimal64(readType)) { CREATE_READER(Decimal128ToDecimal64ColumnReader) } else { CREATE_READER(Decimal128ToDecimal128ColumnReader) } } } - case STRING: - case CHAR: - case VARCHAR: - case TIMESTAMP: - case TIMESTAMP_INSTANT: case BINARY: case LIST: case MAP: @@ -980,22 +1481,106 @@ namespace orc { CASE_EXCEPTION } } + case STRING: { + switch (readType.getKind()) { + CASE_CREATE_READER(BOOLEAN, StringToBoolean) + CASE_CREATE_READER(BYTE, StringToByte) + CASE_CREATE_READER(SHORT, StringToShort) + CASE_CREATE_READER(INT, StringToInt) + CASE_CREATE_READER(LONG, StringToLong) + CASE_CREATE_READER(FLOAT, StringToFloat) + CASE_CREATE_READER(DOUBLE, StringToDouble) + CASE_CREATE_READER(STRING, StringToString) + CASE_CREATE_READER(CHAR, StringToChar) + CASE_CREATE_READER(VARCHAR, StringToVarchar) + CASE_CREATE_READER(TIMESTAMP, StringToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, StringToTimestamp) + case DECIMAL: { + if (isDecimal64(readType)) { + CREATE_READER(StringToDecimal64ColumnReader) + } else { + CREATE_READER(StringToDecimal128ColumnReader) + } + } + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } + case CHAR: { + switch (readType.getKind()) { + CASE_CREATE_READER(BOOLEAN, CharToBoolean) + CASE_CREATE_READER(BYTE, CharToByte) + CASE_CREATE_READER(SHORT, CharToShort) + CASE_CREATE_READER(INT, CharToInt) + CASE_CREATE_READER(LONG, CharToLong) + CASE_CREATE_READER(FLOAT, CharToFloat) + CASE_CREATE_READER(DOUBLE, CharToDouble) + CASE_CREATE_READER(STRING, CharToString) + CASE_CREATE_READER(CHAR, CharToChar) + CASE_CREATE_READER(VARCHAR, CharToVarchar) + CASE_CREATE_READER(TIMESTAMP, CharToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, CharToTimestamp) + case DECIMAL: { + if (isDecimal64(readType)) { + CREATE_READER(CharToDecimal64ColumnReader) + } else { + CREATE_READER(CharToDecimal128ColumnReader) + } + } + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } + case VARCHAR: { + switch (readType.getKind()) { + CASE_CREATE_READER(BOOLEAN, VarcharToBoolean) + CASE_CREATE_READER(BYTE, VarcharToByte) + CASE_CREATE_READER(SHORT, VarcharToShort) + CASE_CREATE_READER(INT, VarcharToInt) + CASE_CREATE_READER(LONG, VarcharToLong) + CASE_CREATE_READER(FLOAT, VarcharToFloat) + CASE_CREATE_READER(DOUBLE, VarcharToDouble) + CASE_CREATE_READER(STRING, VarcharToString) + CASE_CREATE_READER(CHAR, VarcharToChar) + CASE_CREATE_READER(VARCHAR, VarcharToVarchar) + CASE_CREATE_READER(TIMESTAMP, VarcharToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, VarcharToTimestamp) + case DECIMAL: { + if (isDecimal64(readType)) { + CREATE_READER(VarcharToDecimal64ColumnReader) + } else { + CREATE_READER(VarcharToDecimal128ColumnReader) + } + } + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } + case BINARY: + case TIMESTAMP: + case LIST: + case MAP: + case STRUCT: + case UNION: case DATE: - case VARCHAR: - case CHAR: case TIMESTAMP_INSTANT: CASE_EXCEPTION } } -#undef DEFINE_NUMERIC_CONVERT_READER -#undef DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER -#undef DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER -#undef DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER -#undef DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER -#undef DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER -#undef CASE_CREATE_FROM_DECIMAL_READER -#undef CASE_CREATE_READER -#undef CASE_EXCEPTION - } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/CpuInfoUtil.cc b/contrib/libs/apache/orc/c++/src/CpuInfoUtil.cc index 7e6958deef..588f8dc96a 100644 --- a/contrib/libs/apache/orc/c++/src/CpuInfoUtil.cc +++ b/contrib/libs/apache/orc/c++/src/CpuInfoUtil.cc @@ -74,7 +74,7 @@ namespace orc { #if defined(_WIN32) //------------------------------ WINDOWS ------------------------------// - void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cache_sizes) { + void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cacheSizes) { PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = nullptr; PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer_position = nullptr; DWORD buffer_size = 0; @@ -108,8 +108,8 @@ namespace orc { if (RelationCache == buffer_position->Relationship) { PCACHE_DESCRIPTOR cache = &buffer_position->Cache; if (cache->Level >= 1 && cache->Level <= kCacheLevels) { - const int64_t current = (*cache_sizes)[cache->Level - 1]; - (*cache_sizes)[cache->Level - 1] = std::max<int64_t>(current, cache->Size); + const int64_t current = (*cacheSizes)[cache->Level - 1]; + (*cacheSizes)[cache->Level - 1] = std::max<int64_t>(current, cache->Size); } } offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); @@ -136,23 +136,22 @@ namespace orc { } #endif // MINGW - void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, - std::string* model_name) { + void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor, + std::string* modelName) { int register_EAX_id = 1; int highest_valid_id = 0; int highest_extended_valid_id = 0; std::bitset<32> features_ECX; - std::array<int, 4> cpu_info; + std::array<int, 4> cpuInfo; // Get highest valid id - __cpuid(cpu_info.data(), 0); - highest_valid_id = cpu_info[0]; + __cpuid(cpuInfo.data(), 0); + highest_valid_id = cpuInfo[0]; // HEX of "GenuineIntel": 47656E75 696E6549 6E74656C // HEX of "AuthenticAMD": 41757468 656E7469 63414D44 - if (cpu_info[1] == 0x756e6547 && cpu_info[3] == 0x49656e69 && cpu_info[2] == 0x6c65746e) { + if (cpuInfo[1] == 0x756e6547 && cpuInfo[3] == 0x49656e69 && cpuInfo[2] == 0x6c65746e) { *vendor = CpuInfo::Vendor::Intel; - } else if (cpu_info[1] == 0x68747541 && cpu_info[3] == 0x69746e65 && - cpu_info[2] == 0x444d4163) { + } else if (cpuInfo[1] == 0x68747541 && cpuInfo[3] == 0x69746e65 && cpuInfo[2] == 0x444d4163) { *vendor = CpuInfo::Vendor::AMD; } @@ -161,19 +160,19 @@ namespace orc { } // EAX=1: Processor Info and Feature Bits - __cpuidex(cpu_info.data(), register_EAX_id, 0); - features_ECX = cpu_info[2]; + __cpuidex(cpuInfo.data(), register_EAX_id, 0); + features_ECX = cpuInfo[2]; // Get highest extended id - __cpuid(cpu_info.data(), 0x80000000); - highest_extended_valid_id = cpu_info[0]; + __cpuid(cpuInfo.data(), 0x80000000); + highest_extended_valid_id = cpuInfo[0]; // Retrieve CPU model name if (highest_extended_valid_id >= static_cast<int>(0x80000004)) { - model_name->clear(); + modelName->clear(); for (int i = 0x80000002; i <= static_cast<int>(0x80000004); ++i) { - __cpuidex(cpu_info.data(), i, 0); - *model_name += std::string(reinterpret_cast<char*>(cpu_info.data()), sizeof(cpu_info)); + __cpuidex(cpuInfo.data(), i, 0); + *modelName += std::string(reinterpret_cast<char*>(cpuInfo.data()), sizeof(cpuInfo)); } } @@ -184,37 +183,37 @@ namespace orc { zmm_enabled = (xcr0 & 0xE0) == 0xE0; } - if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3; - if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1; - if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2; - if (features_ECX[23]) *hardware_flags |= CpuInfo::POPCNT; - if (features_ECX[28]) *hardware_flags |= CpuInfo::AVX; + if (features_ECX[9]) *hardwareFlags |= CpuInfo::SSSE3; + if (features_ECX[19]) *hardwareFlags |= CpuInfo::SSE4_1; + if (features_ECX[20]) *hardwareFlags |= CpuInfo::SSE4_2; + if (features_ECX[23]) *hardwareFlags |= CpuInfo::POPCNT; + if (features_ECX[28]) *hardwareFlags |= CpuInfo::AVX; // cpuid with EAX=7, ECX=0: Extended Features register_EAX_id = 7; if (highest_valid_id > register_EAX_id) { - __cpuidex(cpu_info.data(), register_EAX_id, 0); - std::bitset<32> features_EBX = cpu_info[1]; + __cpuidex(cpuInfo.data(), register_EAX_id, 0); + std::bitset<32> features_EBX = cpuInfo[1]; - if (features_EBX[3]) *hardware_flags |= CpuInfo::BMI1; - if (features_EBX[5]) *hardware_flags |= CpuInfo::AVX2; - if (features_EBX[8]) *hardware_flags |= CpuInfo::BMI2; + if (features_EBX[3]) *hardwareFlags |= CpuInfo::BMI1; + if (features_EBX[5]) *hardwareFlags |= CpuInfo::AVX2; + if (features_EBX[8]) *hardwareFlags |= CpuInfo::BMI2; if (zmm_enabled) { - if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F; - if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ; - if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD; - if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW; - if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL; + if (features_EBX[16]) *hardwareFlags |= CpuInfo::AVX512F; + if (features_EBX[17]) *hardwareFlags |= CpuInfo::AVX512DQ; + if (features_EBX[28]) *hardwareFlags |= CpuInfo::AVX512CD; + if (features_EBX[30]) *hardwareFlags |= CpuInfo::AVX512BW; + if (features_EBX[31]) *hardwareFlags |= CpuInfo::AVX512VL; } } } #elif defined(CPUINFO_ARCH_ARM) // Windows on Arm - void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, - std::string* model_name) { - *hardware_flags |= CpuInfo::ASIMD; - // TODO: vendor, model_name + void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor, + std::string* modelName) { + *hardwareFlags |= CpuInfo::ASIMD; + // TODO: vendor, modelName } #endif @@ -236,25 +235,25 @@ namespace orc { return std::nullopt; } - void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cache_sizes) { + void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cacheSizes) { static_assert(kCacheLevels >= 3, ""); auto c = IntegerSysCtlByName("hw.l1dcachesize"); if (c.has_value()) { - (*cache_sizes)[0] = *c; + (*cacheSizes)[0] = *c; } c = IntegerSysCtlByName("hw.l2cachesize"); if (c.has_value()) { - (*cache_sizes)[1] = *c; + (*cacheSizes)[1] = *c; } c = IntegerSysCtlByName("hw.l3cachesize"); if (c.has_value()) { - (*cache_sizes)[2] = *c; + (*cacheSizes)[2] = *c; } } - void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, - std::string* model_name) { - // hardware_flags + void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor, + std::string* modelName) { + // hardwareFlags struct SysCtlCpuFeature { const char* name; int64_t flag; @@ -280,13 +279,13 @@ namespace orc { for (const auto& feature : features) { auto v = IntegerSysCtlByName(feature.name); if (v.value_or(0)) { - *hardware_flags |= feature.flag; + *hardwareFlags |= feature.flag; } } - // TODO: vendor, model_name + // TODO: vendor, modelName *vendor = CpuInfo::Vendor::Unknown; - *model_name = "Unknown"; + *modelName = "Unknown"; } #else @@ -345,7 +344,7 @@ namespace orc { const struct { std::string name; int64_t flag; - } flag_mappings[] = { + } flagMappings[] = { #if defined(CPUINFO_ARCH_X86) {"ssse3", CpuInfo::SSSE3}, {"sse4_1", CpuInfo::SSE4_1}, @@ -364,22 +363,22 @@ namespace orc { {"asimd", CpuInfo::ASIMD}, #endif }; - const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]); + const int64_t num_flags = sizeof(flagMappings) / sizeof(flagMappings[0]); int64_t flags = 0; for (int i = 0; i < num_flags; ++i) { - if (values.find(flag_mappings[i].name) != std::string::npos) { - flags |= flag_mappings[i].flag; + if (values.find(flagMappings[i].name) != std::string::npos) { + flags |= flagMappings[i].flag; } } return flags; } - void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cache_sizes) { + void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cacheSizes) { for (int i = 0; i < kCacheLevels; ++i) { const int64_t cache_size = LinuxGetCacheSize(i); if (cache_size > 0) { - (*cache_sizes)[i] = cache_size; + (*cacheSizes)[i] = cache_size; } } } @@ -403,8 +402,8 @@ namespace orc { } // Read from /proc/cpuinfo - void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, - std::string* model_name) { + void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor, + std::string* modelName) { std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in); while (cpuinfo) { std::string line; @@ -414,9 +413,9 @@ namespace orc { const std::string name = TrimString(line.substr(0, colon - 1)); const std::string value = TrimString(line.substr(colon + 1, std::string::npos)); if (name.compare("flags") == 0 || name.compare("Features") == 0) { - *hardware_flags |= LinuxParseCpuFlags(value); + *hardwareFlags |= LinuxParseCpuFlags(value); } else if (name.compare("model name") == 0) { - *model_name = value; + *modelName = value; } else if (name.compare("vendor_id") == 0) { if (value.compare("GenuineIntel") == 0) { *vendor = CpuInfo::Vendor::Intel; @@ -433,7 +432,7 @@ namespace orc { #if defined(CPUINFO_ARCH_X86) //------------------------------ X86_64 ------------------------------// - bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { + bool ArchParseUserSimdLevel(const std::string& simdLevel, int64_t* hardwareFlags) { enum { USER_SIMD_NONE, USER_SIMD_AVX512, @@ -442,9 +441,9 @@ namespace orc { int level = USER_SIMD_MAX; // Parse the level - if (simd_level == "AVX512") { + if (simdLevel == "AVX512") { level = USER_SIMD_AVX512; - } else if (simd_level == "NONE") { + } else if (simdLevel == "NONE") { level = USER_SIMD_NONE; } else { return false; @@ -452,7 +451,7 @@ namespace orc { // Disable feature as the level if (level < USER_SIMD_AVX512) { - *hardware_flags &= ~CpuInfo::AVX512; + *hardwareFlags &= ~CpuInfo::AVX512; } return true; } @@ -469,9 +468,9 @@ namespace orc { #elif defined(CPUINFO_ARCH_ARM) //------------------------------ AARCH64 ------------------------------// - bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { - if (simd_level == "NONE") { - *hardware_flags &= ~CpuInfo::ASIMD; + bool ArchParseUserSimdLevel(const std::string& simdLevel, int64_t* hardwareFlags) { + if (simdLevel == "NONE") { + *hardwareFlags &= ~CpuInfo::ASIMD; return true; } return false; @@ -485,7 +484,7 @@ namespace orc { #else //------------------------------ PPC, ... ------------------------------// - bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { + bool ArchParseUserSimdLevel(const std::string& simdLevel, int64_t* hardwareFlags) { return true; } @@ -496,17 +495,17 @@ namespace orc { } // namespace struct CpuInfo::Impl { - int64_t hardware_flags = 0; + int64_t hardwareFlags = 0; int numCores = 0; - int64_t original_hardware_flags = 0; + int64_t originalHardwareFlags = 0; Vendor vendor = Vendor::Unknown; - std::string model_name = "Unknown"; - std::array<int64_t, kCacheLevels> cache_sizes{}; + std::string modelName = "Unknown"; + std::array<int64_t, kCacheLevels> cacheSizes{}; Impl() { - OsRetrieveCacheSize(&cache_sizes); - OsRetrieveCpuInfo(&hardware_flags, &vendor, &model_name); - original_hardware_flags = hardware_flags; + OsRetrieveCacheSize(&cacheSizes); + OsRetrieveCpuInfo(&hardwareFlags, &vendor, &modelName); + originalHardwareFlags = hardwareFlags; numCores = std::max(static_cast<int>(std::thread::hardware_concurrency()), 1); // parse user simd level @@ -514,7 +513,7 @@ namespace orc { std::string userSimdLevel = maybe_env_var == nullptr ? "NONE" : std::string(maybe_env_var); std::transform(userSimdLevel.begin(), userSimdLevel.end(), userSimdLevel.begin(), [](unsigned char c) { return std::toupper(c); }); - if (!ArchParseUserSimdLevel(userSimdLevel, &hardware_flags)) { + if (!ArchParseUserSimdLevel(userSimdLevel, &hardwareFlags)) { throw ParseError("Invalid value for ORC_USER_SIMD_LEVEL: " + userSimdLevel); } } @@ -530,8 +529,8 @@ namespace orc { #endif const CpuInfo* CpuInfo::getInstance() { - static CpuInfo cpu_info; - return &cpu_info; + static CpuInfo cpuInfo; + return &cpuInfo; } #ifdef __clang__ @@ -539,7 +538,7 @@ namespace orc { #endif int64_t CpuInfo::hardwareFlags() const { - return impl_->hardware_flags; + return impl_->hardwareFlags; } int CpuInfo::numCores() const { @@ -551,7 +550,7 @@ namespace orc { } const std::string& CpuInfo::modelName() const { - return impl_->model_name; + return impl_->modelName; } int64_t CpuInfo::cacheSize(CacheLevel level) const { @@ -564,18 +563,18 @@ namespace orc { static_assert(static_cast<int>(CacheLevel::L1) == 0, ""); const int i = static_cast<int>(level); - if (impl_->cache_sizes[i] > 0) return impl_->cache_sizes[i]; + if (impl_->cacheSizes[i] > 0) return impl_->cacheSizes[i]; if (i == 0) return kDefaultCacheSizes[0]; // l3 may be not available, return maximum of l2 or default size - return std::max(kDefaultCacheSizes[i], impl_->cache_sizes[i - 1]); + return std::max(kDefaultCacheSizes[i], impl_->cacheSizes[i - 1]); } bool CpuInfo::isSupported(int64_t flags) const { - return (impl_->hardware_flags & flags) == flags; + return (impl_->hardwareFlags & flags) == flags; } bool CpuInfo::isDetected(int64_t flags) const { - return (impl_->original_hardware_flags & flags) == flags; + return (impl_->originalHardwareFlags & flags) == flags; } void CpuInfo::verifyCpuRequirements() const { diff --git a/contrib/libs/apache/orc/c++/src/Exceptions.cc b/contrib/libs/apache/orc/c++/src/Exceptions.cc index 23703ff324..2ba1ab404c 100644 --- a/contrib/libs/apache/orc/c++/src/Exceptions.cc +++ b/contrib/libs/apache/orc/c++/src/Exceptions.cc @@ -20,11 +20,11 @@ namespace orc { - NotImplementedYet::NotImplementedYet(const std::string& what_arg) : logic_error(what_arg) { + NotImplementedYet::NotImplementedYet(const std::string& whatArg) : logic_error(whatArg) { // PASS } - NotImplementedYet::NotImplementedYet(const char* what_arg) : logic_error(what_arg) { + NotImplementedYet::NotImplementedYet(const char* whatArg) : logic_error(whatArg) { // PASS } @@ -36,11 +36,11 @@ namespace orc { // PASS } - ParseError::ParseError(const std::string& what_arg) : runtime_error(what_arg) { + ParseError::ParseError(const std::string& whatArg) : runtime_error(whatArg) { // PASS } - ParseError::ParseError(const char* what_arg) : runtime_error(what_arg) { + ParseError::ParseError(const char* whatArg) : runtime_error(whatArg) { // PASS } @@ -52,11 +52,11 @@ namespace orc { // PASS } - InvalidArgument::InvalidArgument(const std::string& what_arg) : runtime_error(what_arg) { + InvalidArgument::InvalidArgument(const std::string& whatArg) : runtime_error(whatArg) { // PASS } - InvalidArgument::InvalidArgument(const char* what_arg) : runtime_error(what_arg) { + InvalidArgument::InvalidArgument(const char* whatArg) : runtime_error(whatArg) { // PASS } @@ -68,11 +68,11 @@ namespace orc { // PASS } - SchemaEvolutionError::SchemaEvolutionError(const std::string& what_arg) : logic_error(what_arg) { + SchemaEvolutionError::SchemaEvolutionError(const std::string& whatArg) : logic_error(whatArg) { // PASS } - SchemaEvolutionError::SchemaEvolutionError(const char* what_arg) : logic_error(what_arg) { + SchemaEvolutionError::SchemaEvolutionError(const char* whatArg) : logic_error(whatArg) { // PASS } @@ -84,4 +84,20 @@ namespace orc { SchemaEvolutionError::~SchemaEvolutionError() noexcept { // PASS } + + CompressionError::CompressionError(const std::string& whatArg) : runtime_error(whatArg) { + // PASS + } + + CompressionError::CompressionError(const char* whatArg) : runtime_error(whatArg) { + // PASS + } + + CompressionError::CompressionError(const CompressionError& error) : runtime_error(error) { + // PASS + } + + CompressionError::~CompressionError() noexcept { + // PASS + } } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/Int128.cc b/contrib/libs/apache/orc/c++/src/Int128.cc index 434a8dda80..1e059fd4e2 100644 --- a/contrib/libs/apache/orc/c++/src/Int128.cc +++ b/contrib/libs/apache/orc/c++/src/Int128.cc @@ -35,8 +35,8 @@ namespace orc { } Int128::Int128(const std::string& str) { - lowbits = 0; - highbits = 0; + lowbits_ = 0; + highbits_ = 0; size_t length = str.length(); if (length > 0) { bool isNegative = str[0] == '-'; @@ -64,30 +64,30 @@ namespace orc { // Break the left and right numbers into 32 bit chunks // so that we can multiply them without overflow. - uint64_t L0 = static_cast<uint64_t>(highbits) >> 32; - uint64_t L1 = static_cast<uint64_t>(highbits) & INT_MASK; - uint64_t L2 = lowbits >> 32; - uint64_t L3 = lowbits & INT_MASK; - uint64_t R0 = static_cast<uint64_t>(right.highbits) >> 32; - uint64_t R1 = static_cast<uint64_t>(right.highbits) & INT_MASK; - uint64_t R2 = right.lowbits >> 32; - uint64_t R3 = right.lowbits & INT_MASK; + uint64_t L0 = static_cast<uint64_t>(highbits_) >> 32; + uint64_t L1 = static_cast<uint64_t>(highbits_) & INT_MASK; + uint64_t L2 = lowbits_ >> 32; + uint64_t L3 = lowbits_ & INT_MASK; + uint64_t R0 = static_cast<uint64_t>(right.highbits_) >> 32; + uint64_t R1 = static_cast<uint64_t>(right.highbits_) & INT_MASK; + uint64_t R2 = right.lowbits_ >> 32; + uint64_t R3 = right.lowbits_ & INT_MASK; uint64_t product = L3 * R3; - lowbits = product & INT_MASK; + lowbits_ = product & INT_MASK; uint64_t sum = product >> 32; product = L2 * R3; sum += product; - highbits = sum < product ? CARRY_BIT : 0; + highbits_ = sum < product ? CARRY_BIT : 0; product = L3 * R2; sum += product; if (sum < product) { - highbits += CARRY_BIT; + highbits_ += CARRY_BIT; } - lowbits += sum << 32; - highbits += static_cast<int64_t>(sum >> 32); - highbits += L1 * R3 + L2 * R2 + L3 * R1; - highbits += (L0 * R3 + L1 * R2 + L2 * R1 + L3 * R0) << 32; + lowbits_ += sum << 32; + highbits_ += static_cast<int64_t>(sum >> 32); + highbits_ += L1 * R3 + L2 * R2 + L3 * R1; + highbits_ += (L0 * R3 + L1 * R2 + L2 * R1 + L3 * R0) << 32; return *this; } @@ -103,16 +103,16 @@ namespace orc { int64_t Int128::fillInArray(uint32_t* array, bool& wasNegative) const { uint64_t high; uint64_t low; - if (highbits < 0) { - low = ~lowbits + 1; - high = static_cast<uint64_t>(~highbits); + if (highbits_ < 0) { + low = ~lowbits_ + 1; + high = static_cast<uint64_t>(~highbits_); if (low == 0) { high += 1; } wasNegative = true; } else { - low = lowbits; - high = static_cast<uint64_t>(highbits); + low = lowbits_; + high = static_cast<uint64_t>(highbits_); wasNegative = false; } if (high != 0) { @@ -430,8 +430,8 @@ namespace orc { std::string Int128::toHexString() const { std::stringstream buf; - buf << std::hex << "0x" << std::setw(16) << std::setfill('0') << highbits << std::setw(16) - << std::setfill('0') << lowbits; + buf << std::hex << "0x" << std::setw(16) << std::setfill('0') << highbits_ << std::setw(16) + << std::setfill('0') << lowbits_; return buf.str(); } @@ -439,7 +439,7 @@ namespace orc { if (fitsInLong()) { return static_cast<double>(toLong()); } - return static_cast<double>(lowbits) + std::ldexp(static_cast<double>(highbits), 64); + return static_cast<double>(lowbits_) + std::ldexp(static_cast<double>(highbits_), 64); } const static int32_t MAX_PRECISION_64 = 18; diff --git a/contrib/libs/apache/orc/c++/src/MemoryPool.cc b/contrib/libs/apache/orc/c++/src/MemoryPool.cc index 8c8837aa64..ed7fee7373 100644 --- a/contrib/libs/apache/orc/c++/src/MemoryPool.cc +++ b/contrib/libs/apache/orc/c++/src/MemoryPool.cc @@ -53,72 +53,72 @@ namespace orc { template <class T> DataBuffer<T>::DataBuffer(MemoryPool& pool, uint64_t newSize) - : memoryPool(pool), buf(nullptr), currentSize(0), currentCapacity(0) { + : memoryPool_(pool), buf_(nullptr), currentSize_(0), currentCapacity_(0) { reserve(newSize); - currentSize = newSize; + currentSize_ = newSize; } template <class T> DataBuffer<T>::DataBuffer(DataBuffer<T>&& buffer) noexcept - : memoryPool(buffer.memoryPool), - buf(buffer.buf), - currentSize(buffer.currentSize), - currentCapacity(buffer.currentCapacity) { - buffer.buf = nullptr; - buffer.currentSize = 0; - buffer.currentCapacity = 0; + : memoryPool_(buffer.memoryPool_), + buf_(buffer.buf_), + currentSize_(buffer.currentSize_), + currentCapacity_(buffer.currentCapacity_) { + buffer.buf_ = nullptr; + buffer.currentSize_ = 0; + buffer.currentCapacity_ = 0; } template <class T> DataBuffer<T>::~DataBuffer() { - for (uint64_t i = currentSize; i > 0; --i) { - (buf + i - 1)->~T(); + for (uint64_t i = currentSize_; i > 0; --i) { + (buf_ + i - 1)->~T(); } - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); + if (buf_) { + memoryPool_.free(reinterpret_cast<char*>(buf_)); } } template <class T> void DataBuffer<T>::resize(uint64_t newSize) { reserve(newSize); - if (currentSize > newSize) { - for (uint64_t i = currentSize; i > newSize; --i) { - (buf + i - 1)->~T(); + if (currentSize_ > newSize) { + for (uint64_t i = currentSize_; i > newSize; --i) { + (buf_ + i - 1)->~T(); } - } else if (newSize > currentSize) { - for (uint64_t i = currentSize; i < newSize; ++i) { - new (buf + i) T(); + } else if (newSize > currentSize_) { + for (uint64_t i = currentSize_; i < newSize; ++i) { + new (buf_ + i) T(); } } - currentSize = newSize; + currentSize_ = newSize; } template <class T> void DataBuffer<T>::reserve(uint64_t newCapacity) { - if (newCapacity > currentCapacity || !buf) { - if (buf) { - T* buf_old = buf; - buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity)); - memcpy(buf, buf_old, sizeof(T) * currentSize); - memoryPool.free(reinterpret_cast<char*>(buf_old)); + if (newCapacity > currentCapacity_ || !buf_) { + if (buf_) { + T* buf_old = buf_; + buf_ = reinterpret_cast<T*>(memoryPool_.malloc(sizeof(T) * newCapacity)); + memcpy(buf_, buf_old, sizeof(T) * currentSize_); + memoryPool_.free(reinterpret_cast<char*>(buf_old)); } else { - buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity)); + buf_ = reinterpret_cast<T*>(memoryPool_.malloc(sizeof(T) * newCapacity)); } - currentCapacity = newCapacity; + currentCapacity_ = newCapacity; } } template <class T> void DataBuffer<T>::zeroOut() { - memset(buf, 0, sizeof(T) * currentCapacity); + memset(buf_, 0, sizeof(T) * currentCapacity_); } // Specializations for Int128 template <> void DataBuffer<Int128>::zeroOut() { - for (uint64_t i = 0; i < currentCapacity; ++i) { - new (buf + i) Int128(); + for (uint64_t i = 0; i < currentCapacity_; ++i) { + new (buf_ + i) Int128(); } } @@ -126,180 +126,180 @@ namespace orc { template <> DataBuffer<char>::~DataBuffer() { - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); + if (buf_) { + memoryPool_.free(reinterpret_cast<char*>(buf_)); } } template <> void DataBuffer<char>::resize(uint64_t newSize) { reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, newSize - currentSize); + if (newSize > currentSize_) { + memset(buf_ + currentSize_, 0, newSize - currentSize_); } - currentSize = newSize; + currentSize_ = newSize; } // Specializations for char* template <> DataBuffer<char*>::~DataBuffer() { - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); + if (buf_) { + memoryPool_.free(reinterpret_cast<char*>(buf_)); } } template <> void DataBuffer<char*>::resize(uint64_t newSize) { reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(char*)); + if (newSize > currentSize_) { + memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(char*)); } - currentSize = newSize; + currentSize_ = newSize; } // Specializations for double template <> DataBuffer<double>::~DataBuffer() { - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); + if (buf_) { + memoryPool_.free(reinterpret_cast<char*>(buf_)); } } template <> void DataBuffer<double>::resize(uint64_t newSize) { reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(double)); + if (newSize > currentSize_) { + memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(double)); } - currentSize = newSize; + currentSize_ = newSize; } // Specializations for float template <> DataBuffer<float>::~DataBuffer() { - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); + if (buf_) { + memoryPool_.free(reinterpret_cast<char*>(buf_)); } } template <> void DataBuffer<float>::resize(uint64_t newSize) { reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(float)); + if (newSize > currentSize_) { + memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(float)); } - currentSize = newSize; + currentSize_ = newSize; } // Specializations for int64_t template <> DataBuffer<int64_t>::~DataBuffer() { - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); + if (buf_) { + memoryPool_.free(reinterpret_cast<char*>(buf_)); } } template <> void DataBuffer<int64_t>::resize(uint64_t newSize) { reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int64_t)); + if (newSize > currentSize_) { + memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(int64_t)); } - currentSize = newSize; + currentSize_ = newSize; } // Specializations for int32_t template <> DataBuffer<int32_t>::~DataBuffer() { - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); + if (buf_) { + memoryPool_.free(reinterpret_cast<char*>(buf_)); } } template <> void DataBuffer<int32_t>::resize(uint64_t newSize) { reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int32_t)); + if (newSize > currentSize_) { + memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(int32_t)); } - currentSize = newSize; + currentSize_ = newSize; } // Specializations for int16_t template <> DataBuffer<int16_t>::~DataBuffer() { - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); + if (buf_) { + memoryPool_.free(reinterpret_cast<char*>(buf_)); } } template <> void DataBuffer<int16_t>::resize(uint64_t newSize) { reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int16_t)); + if (newSize > currentSize_) { + memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(int16_t)); } - currentSize = newSize; + currentSize_ = newSize; } // Specializations for int8_t template <> DataBuffer<int8_t>::~DataBuffer() { - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); + if (buf_) { + memoryPool_.free(reinterpret_cast<char*>(buf_)); } } template <> void DataBuffer<int8_t>::resize(uint64_t newSize) { reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int8_t)); + if (newSize > currentSize_) { + memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(int8_t)); } - currentSize = newSize; + currentSize_ = newSize; } // Specializations for uint64_t template <> DataBuffer<uint64_t>::~DataBuffer() { - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); + if (buf_) { + memoryPool_.free(reinterpret_cast<char*>(buf_)); } } template <> void DataBuffer<uint64_t>::resize(uint64_t newSize) { reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(uint64_t)); + if (newSize > currentSize_) { + memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(uint64_t)); } - currentSize = newSize; + currentSize_ = newSize; } // Specializations for unsigned char template <> DataBuffer<unsigned char>::~DataBuffer() { - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); + if (buf_) { + memoryPool_.free(reinterpret_cast<char*>(buf_)); } } template <> void DataBuffer<unsigned char>::resize(uint64_t newSize) { reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, newSize - currentSize); + if (newSize > currentSize_) { + memset(buf_ + currentSize_, 0, newSize - currentSize_); } - currentSize = newSize; + currentSize_ = newSize; } #ifdef __clang__ diff --git a/contrib/libs/apache/orc/c++/src/Options.hh b/contrib/libs/apache/orc/c++/src/Options.hh index 51cd8efd64..0a4bd56d8f 100644 --- a/contrib/libs/apache/orc/c++/src/Options.hh +++ b/contrib/libs/apache/orc/c++/src/Options.hh @@ -23,6 +23,8 @@ #include "orc/OrcFile.hh" #include "orc/Reader.hh" +#include "io/Cache.hh" + #include <limits> namespace orc { @@ -43,6 +45,7 @@ namespace orc { MemoryPool* memoryPool; std::string serializedTail; ReaderMetrics* metrics; + CacheOptions cacheOptions; ReaderOptionsPrivate() { tailLocation = std::numeric_limits<uint64_t>::max(); @@ -52,23 +55,23 @@ namespace orc { } }; - ReaderOptions::ReaderOptions() : privateBits(std::make_unique<ReaderOptionsPrivate>()) { + ReaderOptions::ReaderOptions() : privateBits_(std::make_unique<ReaderOptionsPrivate>()) { // PASS } ReaderOptions::ReaderOptions(const ReaderOptions& rhs) - : privateBits(std::make_unique<ReaderOptionsPrivate>(*(rhs.privateBits.get()))) { + : privateBits_(std::make_unique<ReaderOptionsPrivate>(*(rhs.privateBits_.get()))) { // PASS } ReaderOptions::ReaderOptions(ReaderOptions& rhs) { // swap privateBits with rhs - privateBits.swap(rhs.privateBits); + privateBits_.swap(rhs.privateBits_); } ReaderOptions& ReaderOptions::operator=(const ReaderOptions& rhs) { if (this != &rhs) { - privateBits.reset(new ReaderOptionsPrivate(*(rhs.privateBits.get()))); + privateBits_.reset(new ReaderOptionsPrivate(*(rhs.privateBits_.get()))); } return *this; } @@ -78,48 +81,57 @@ namespace orc { } ReaderOptions& ReaderOptions::setMemoryPool(MemoryPool& pool) { - privateBits->memoryPool = &pool; + privateBits_->memoryPool = &pool; return *this; } MemoryPool* ReaderOptions::getMemoryPool() const { - return privateBits->memoryPool; + return privateBits_->memoryPool; } ReaderOptions& ReaderOptions::setReaderMetrics(ReaderMetrics* metrics) { - privateBits->metrics = metrics; + privateBits_->metrics = metrics; return *this; } ReaderMetrics* ReaderOptions::getReaderMetrics() const { - return privateBits->metrics; + return privateBits_->metrics; } ReaderOptions& ReaderOptions::setTailLocation(uint64_t offset) { - privateBits->tailLocation = offset; + privateBits_->tailLocation = offset; return *this; } uint64_t ReaderOptions::getTailLocation() const { - return privateBits->tailLocation; + return privateBits_->tailLocation; } ReaderOptions& ReaderOptions::setSerializedFileTail(const std::string& value) { - privateBits->serializedTail = value; + privateBits_->serializedTail = value; return *this; } std::string ReaderOptions::getSerializedFileTail() const { - return privateBits->serializedTail; + return privateBits_->serializedTail; } ReaderOptions& ReaderOptions::setErrorStream(std::ostream& stream) { - privateBits->errorStream = &stream; + privateBits_->errorStream = &stream; return *this; } std::ostream* ReaderOptions::getErrorStream() const { - return privateBits->errorStream; + return privateBits_->errorStream; + } + + ReaderOptions& ReaderOptions::setCacheOptions(const CacheOptions& cacheOptions) { + privateBits_->cacheOptions = cacheOptions; + return *this; + } + + const CacheOptions& ReaderOptions::getCacheOptions() const { + return privateBits_->cacheOptions; } /** @@ -155,23 +167,23 @@ namespace orc { } }; - RowReaderOptions::RowReaderOptions() : privateBits(std::make_unique<RowReaderOptionsPrivate>()) { + RowReaderOptions::RowReaderOptions() : privateBits_(std::make_unique<RowReaderOptionsPrivate>()) { // PASS } RowReaderOptions::RowReaderOptions(const RowReaderOptions& rhs) - : privateBits(std::make_unique<RowReaderOptionsPrivate>(*(rhs.privateBits.get()))) { + : privateBits_(std::make_unique<RowReaderOptionsPrivate>(*(rhs.privateBits_.get()))) { // PASS } RowReaderOptions::RowReaderOptions(RowReaderOptions& rhs) { // swap privateBits with rhs - privateBits.swap(rhs.privateBits); + privateBits_.swap(rhs.privateBits_); } RowReaderOptions& RowReaderOptions::operator=(const RowReaderOptions& rhs) { if (this != &rhs) { - privateBits.reset(new RowReaderOptionsPrivate(*(rhs.privateBits.get()))); + privateBits_.reset(new RowReaderOptionsPrivate(*(rhs.privateBits_.get()))); } return *this; } @@ -181,150 +193,150 @@ namespace orc { } RowReaderOptions& RowReaderOptions::include(const std::list<uint64_t>& include) { - privateBits->selection = ColumnSelection_FIELD_IDS; - privateBits->includedColumnIndexes.assign(include.begin(), include.end()); - privateBits->includedColumnNames.clear(); - privateBits->idReadIntentMap.clear(); + privateBits_->selection = ColumnSelection_FIELD_IDS; + privateBits_->includedColumnIndexes.assign(include.begin(), include.end()); + privateBits_->includedColumnNames.clear(); + privateBits_->idReadIntentMap.clear(); return *this; } RowReaderOptions& RowReaderOptions::include(const std::list<std::string>& include) { - privateBits->selection = ColumnSelection_NAMES; - privateBits->includedColumnNames.assign(include.begin(), include.end()); - privateBits->includedColumnIndexes.clear(); - privateBits->idReadIntentMap.clear(); + privateBits_->selection = ColumnSelection_NAMES; + privateBits_->includedColumnNames.assign(include.begin(), include.end()); + privateBits_->includedColumnIndexes.clear(); + privateBits_->idReadIntentMap.clear(); return *this; } RowReaderOptions& RowReaderOptions::includeTypes(const std::list<uint64_t>& types) { - privateBits->selection = ColumnSelection_TYPE_IDS; - privateBits->includedColumnIndexes.assign(types.begin(), types.end()); - privateBits->includedColumnNames.clear(); - privateBits->idReadIntentMap.clear(); + privateBits_->selection = ColumnSelection_TYPE_IDS; + privateBits_->includedColumnIndexes.assign(types.begin(), types.end()); + privateBits_->includedColumnNames.clear(); + privateBits_->idReadIntentMap.clear(); return *this; } RowReaderOptions& RowReaderOptions::includeTypesWithIntents( const IdReadIntentMap& idReadIntentMap) { - privateBits->selection = ColumnSelection_TYPE_IDS; - privateBits->includedColumnIndexes.clear(); - privateBits->idReadIntentMap.clear(); + privateBits_->selection = ColumnSelection_TYPE_IDS; + privateBits_->includedColumnIndexes.clear(); + privateBits_->idReadIntentMap.clear(); for (const auto& typeIntentPair : idReadIntentMap) { - privateBits->idReadIntentMap[typeIntentPair.first] = typeIntentPair.second; - privateBits->includedColumnIndexes.push_back(typeIntentPair.first); + privateBits_->idReadIntentMap[typeIntentPair.first] = typeIntentPair.second; + privateBits_->includedColumnIndexes.push_back(typeIntentPair.first); } - privateBits->includedColumnNames.clear(); + privateBits_->includedColumnNames.clear(); return *this; } RowReaderOptions& RowReaderOptions::range(uint64_t offset, uint64_t length) { - privateBits->dataStart = offset; - privateBits->dataLength = length; + privateBits_->dataStart = offset; + privateBits_->dataLength = length; return *this; } bool RowReaderOptions::getIndexesSet() const { - return privateBits->selection == ColumnSelection_FIELD_IDS; + return privateBits_->selection == ColumnSelection_FIELD_IDS; } bool RowReaderOptions::getTypeIdsSet() const { - return privateBits->selection == ColumnSelection_TYPE_IDS; + return privateBits_->selection == ColumnSelection_TYPE_IDS; } const std::list<uint64_t>& RowReaderOptions::getInclude() const { - return privateBits->includedColumnIndexes; + return privateBits_->includedColumnIndexes; } bool RowReaderOptions::getNamesSet() const { - return privateBits->selection == ColumnSelection_NAMES; + return privateBits_->selection == ColumnSelection_NAMES; } const std::list<std::string>& RowReaderOptions::getIncludeNames() const { - return privateBits->includedColumnNames; + return privateBits_->includedColumnNames; } uint64_t RowReaderOptions::getOffset() const { - return privateBits->dataStart; + return privateBits_->dataStart; } uint64_t RowReaderOptions::getLength() const { - return privateBits->dataLength; + return privateBits_->dataLength; } RowReaderOptions& RowReaderOptions::throwOnHive11DecimalOverflow(bool shouldThrow) { - privateBits->throwOnHive11DecimalOverflow = shouldThrow; + privateBits_->throwOnHive11DecimalOverflow = shouldThrow; return *this; } bool RowReaderOptions::getThrowOnHive11DecimalOverflow() const { - return privateBits->throwOnHive11DecimalOverflow; + return privateBits_->throwOnHive11DecimalOverflow; } RowReaderOptions& RowReaderOptions::throwOnSchemaEvolutionOverflow(bool shouldThrow) { - privateBits->throwOnSchemaEvolutionOverflow = shouldThrow; + privateBits_->throwOnSchemaEvolutionOverflow = shouldThrow; return *this; } bool RowReaderOptions::getThrowOnSchemaEvolutionOverflow() const { - return privateBits->throwOnSchemaEvolutionOverflow; + return privateBits_->throwOnSchemaEvolutionOverflow; } RowReaderOptions& RowReaderOptions::forcedScaleOnHive11Decimal(int32_t forcedScale) { - privateBits->forcedScaleOnHive11Decimal = forcedScale; + privateBits_->forcedScaleOnHive11Decimal = forcedScale; return *this; } int32_t RowReaderOptions::getForcedScaleOnHive11Decimal() const { - return privateBits->forcedScaleOnHive11Decimal; + return privateBits_->forcedScaleOnHive11Decimal; } bool RowReaderOptions::getEnableLazyDecoding() const { - return privateBits->enableLazyDecoding; + return privateBits_->enableLazyDecoding; } RowReaderOptions& RowReaderOptions::setEnableLazyDecoding(bool enable) { - privateBits->enableLazyDecoding = enable; + privateBits_->enableLazyDecoding = enable; return *this; } RowReaderOptions& RowReaderOptions::searchArgument(std::unique_ptr<SearchArgument> sargs) { - privateBits->sargs = std::move(sargs); + privateBits_->sargs = std::move(sargs); return *this; } std::shared_ptr<SearchArgument> RowReaderOptions::getSearchArgument() const { - return privateBits->sargs; + return privateBits_->sargs; } RowReaderOptions& RowReaderOptions::setTimezoneName(const std::string& zoneName) { - privateBits->readerTimezone = zoneName; + privateBits_->readerTimezone = zoneName; return *this; } const std::string& RowReaderOptions::getTimezoneName() const { - return privateBits->readerTimezone; + return privateBits_->readerTimezone; } const RowReaderOptions::IdReadIntentMap RowReaderOptions::getIdReadIntentMap() const { - return privateBits->idReadIntentMap; + return privateBits_->idReadIntentMap; } RowReaderOptions& RowReaderOptions::setUseTightNumericVector(bool useTightNumericVector) { - privateBits->useTightNumericVector = useTightNumericVector; + privateBits_->useTightNumericVector = useTightNumericVector; return *this; } bool RowReaderOptions::getUseTightNumericVector() const { - return privateBits->useTightNumericVector; + return privateBits_->useTightNumericVector; } RowReaderOptions& RowReaderOptions::setReadType(std::shared_ptr<Type> type) { - privateBits->readType = std::move(type); + privateBits_->readType = std::move(type); return *this; } std::shared_ptr<Type>& RowReaderOptions::getReadType() const { - return privateBits->readType; + return privateBits_->readType; } } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/OrcFile.cc b/contrib/libs/apache/orc/c++/src/OrcFile.cc index d4b6a86e2f..be86724329 100644 --- a/contrib/libs/apache/orc/c++/src/OrcFile.cc +++ b/contrib/libs/apache/orc/c++/src/OrcFile.cc @@ -49,29 +49,29 @@ namespace orc { class FileInputStream : public InputStream { private: - std::string filename; - int file; - uint64_t totalLength; - ReaderMetrics* metrics; + std::string filename_; + int file_; + uint64_t totalLength_; + ReaderMetrics* metrics_; public: - FileInputStream(std::string _filename, ReaderMetrics* _metrics) - : filename(_filename), metrics(_metrics) { - file = open(filename.c_str(), O_BINARY | O_RDONLY); - if (file == -1) { - throw ParseError("Can't open " + filename); + FileInputStream(std::string filename, ReaderMetrics* metrics) + : filename_(filename), metrics_(metrics) { + file_ = open(filename_.c_str(), O_BINARY | O_RDONLY); + if (file_ == -1) { + throw ParseError("Can't open " + filename_); } struct stat fileStat; - if (fstat(file, &fileStat) == -1) { - throw ParseError("Can't stat " + filename); + if (fstat(file_, &fileStat) == -1) { + throw ParseError("Can't stat " + filename_); } - totalLength = static_cast<uint64_t>(fileStat.st_size); + totalLength_ = static_cast<uint64_t>(fileStat.st_size); } ~FileInputStream() override; uint64_t getLength() const override { - return totalLength; + return totalLength_; } uint64_t getNaturalReadSize() const override { @@ -79,27 +79,27 @@ namespace orc { } void read(void* buf, uint64_t length, uint64_t offset) override { - SCOPED_STOPWATCH(metrics, IOBlockingLatencyUs, IOCount); + SCOPED_STOPWATCH(metrics_, IOBlockingLatencyUs, IOCount); if (!buf) { throw ParseError("Buffer is null"); } - ssize_t bytesRead = pread(file, buf, length, static_cast<off_t>(offset)); + ssize_t bytesRead = pread(file_, buf, length, static_cast<off_t>(offset)); if (bytesRead == -1) { - throw ParseError("Bad read of " + filename); + throw ParseError("Bad read of " + filename_); } if (static_cast<uint64_t>(bytesRead) != length) { - throw ParseError("Short read of " + filename); + throw ParseError("Short read of " + filename_); } } const std::string& getName() const override { - return filename; + return filename_; } }; FileInputStream::~FileInputStream() { - close(file); + close(file_); } std::unique_ptr<InputStream> readFile(const std::string& path, ReaderMetrics* metrics) { @@ -126,26 +126,26 @@ namespace orc { class FileOutputStream : public OutputStream { private: - std::string filename; - int file; - uint64_t bytesWritten; - bool closed; + std::string filename_; + int file_; + uint64_t bytesWritten_; + bool closed_; public: - FileOutputStream(std::string _filename) { - bytesWritten = 0; - filename = _filename; - closed = false; - file = open(filename.c_str(), O_BINARY | O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR); - if (file == -1) { - throw ParseError("Can't open " + filename); + FileOutputStream(std::string filename) { + bytesWritten_ = 0; + filename_ = filename; + closed_ = false; + file_ = open(filename_.c_str(), O_BINARY | O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR); + if (file_ == -1) { + throw ParseError("Can't open " + filename_); } } ~FileOutputStream() override; uint64_t getLength() const override { - return bytesWritten; + return bytesWritten_; } uint64_t getNaturalWriteSize() const override { @@ -153,41 +153,41 @@ namespace orc { } void write(const void* buf, size_t length) override { - if (closed) { + if (closed_) { throw std::logic_error("Cannot write to closed stream."); } - ssize_t bytesWrite = ::write(file, buf, length); + ssize_t bytesWrite = ::write(file_, buf, length); if (bytesWrite == -1) { - throw ParseError("Bad write of " + filename); + throw ParseError("Bad write of " + filename_); } if (static_cast<uint64_t>(bytesWrite) != length) { - throw ParseError("Short write of " + filename); + throw ParseError("Short write of " + filename_); } - bytesWritten += static_cast<uint64_t>(bytesWrite); + bytesWritten_ += static_cast<uint64_t>(bytesWrite); } const std::string& getName() const override { - return filename; + return filename_; } void close() override { - if (!closed) { - ::close(file); - closed = true; + if (!closed_) { + ::close(file_); + closed_ = true; } } void flush() override { - if (!closed) { - ::fsync(file); + if (!closed_) { + ::fsync(file_); } } }; FileOutputStream::~FileOutputStream() { - if (!closed) { - ::close(file); - closed = true; + if (!closed_) { + ::close(file_); + closed_ = true; } } diff --git a/contrib/libs/apache/orc/c++/src/RLE.cc b/contrib/libs/apache/orc/c++/src/RLE.cc index 89aca6a10e..cb831c80f7 100644 --- a/contrib/libs/apache/orc/c++/src/RLE.cc +++ b/contrib/libs/apache/orc/c++/src/RLE.cc @@ -108,15 +108,23 @@ namespace orc { void RleEncoder::recordPosition(PositionRecorder* recorder) const { uint64_t flushedSize = outputStream->getSize(); - uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition); + uint64_t unusedBufferSize = static_cast<uint64_t>(bufferLength - bufferPosition); if (outputStream->isCompressed()) { recorder->add(flushedSize); - recorder->add(unflushedSize); + // There are multiple blocks in the input buffer, but bufferPosition only records the + // effective length of the last block. We need rawInputBufferSize to record the total length + // of all variable blocks. + recorder->add(outputStream->getRawInputBufferSize() - unusedBufferSize); } else { - flushedSize -= static_cast<uint64_t>(bufferLength); - recorder->add(flushedSize + unflushedSize); + recorder->add(flushedSize - unusedBufferSize); } recorder->add(static_cast<uint64_t>(numLiterals)); } + void RleEncoder::finishEncode() { + outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition)); + outputStream->finishStream(); + bufferLength = bufferPosition = 0; + } + } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/RLE.hh b/contrib/libs/apache/orc/c++/src/RLE.hh index 51f9b6f58a..e46504e885 100644 --- a/contrib/libs/apache/orc/c++/src/RLE.hh +++ b/contrib/libs/apache/orc/c++/src/RLE.hh @@ -84,6 +84,13 @@ namespace orc { virtual void write(int64_t val) = 0; + /** + * Finalize the encoding process. This function should be called after all data required for + * encoding has been added. It ensures that any remaining data is processed and the final state + * of the encoder is set. + */ + virtual void finishEncode(); + protected: std::unique_ptr<BufferedOutputStream> outputStream; size_t bufferPosition; @@ -105,7 +112,7 @@ namespace orc { // must be non-inline! virtual ~RleDecoder(); - RleDecoder(ReaderMetrics* _metrics) : metrics(_metrics) { + RleDecoder(ReaderMetrics* metrics) : metrics(metrics) { // pass } diff --git a/contrib/libs/apache/orc/c++/src/RLEv1.cc b/contrib/libs/apache/orc/c++/src/RLEv1.cc index b221e8b8aa..72c555e610 100644 --- a/contrib/libs/apache/orc/c++/src/RLEv1.cc +++ b/contrib/libs/apache/orc/c++/src/RLEv1.cc @@ -38,9 +38,9 @@ namespace orc { RleEncoderV1::RleEncoderV1(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned) : RleEncoder(std::move(outStream), hasSigned) { literals = new int64_t[MAX_LITERAL_SIZE]; - delta = 0; - repeat = false; - tailRunLength = 0; + delta_ = 0; + repeat_ = false; + tailRunLength_ = 0; } RleEncoderV1::~RleEncoderV1() { @@ -49,9 +49,9 @@ namespace orc { void RleEncoderV1::writeValues() { if (numLiterals != 0) { - if (repeat) { + if (repeat_) { writeByte(static_cast<char>(static_cast<uint64_t>(numLiterals) - MINIMUM_REPEAT)); - writeByte(static_cast<char>(delta)); + writeByte(static_cast<char>(delta_)); if (isSigned) { writeVslong(literals[0]); } else { @@ -67,26 +67,24 @@ namespace orc { } } } - repeat = false; + repeat_ = false; numLiterals = 0; - tailRunLength = 0; + tailRunLength_ = 0; } } uint64_t RleEncoderV1::flush() { - writeValues(); - outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition)); + finishEncode(); uint64_t dataSize = outputStream->flush(); - bufferLength = bufferPosition = 0; return dataSize; } void RleEncoderV1::write(int64_t value) { if (numLiterals == 0) { literals[numLiterals++] = value; - tailRunLength = 1; - } else if (repeat) { - if (value == literals[0] + delta * static_cast<int64_t>(numLiterals)) { + tailRunLength_ = 1; + } else if (repeat_) { + if (value == literals[0] + delta_ * static_cast<int64_t>(numLiterals)) { numLiterals += 1; if (numLiterals == MAXIMUM_REPEAT) { writeValues(); @@ -94,36 +92,36 @@ namespace orc { } else { writeValues(); literals[numLiterals++] = value; - tailRunLength = 1; + tailRunLength_ = 1; } } else { - if (tailRunLength == 1) { - delta = value - literals[numLiterals - 1]; - if (delta < MIN_DELTA || delta > MAX_DELTA) { - tailRunLength = 1; + if (tailRunLength_ == 1) { + delta_ = value - literals[numLiterals - 1]; + if (delta_ < MIN_DELTA || delta_ > MAX_DELTA) { + tailRunLength_ = 1; } else { - tailRunLength = 2; + tailRunLength_ = 2; } - } else if (value == literals[numLiterals - 1] + delta) { - tailRunLength += 1; + } else if (value == literals[numLiterals - 1] + delta_) { + tailRunLength_ += 1; } else { - delta = value - literals[numLiterals - 1]; - if (delta < MIN_DELTA || delta > MAX_DELTA) { - tailRunLength = 1; + delta_ = value - literals[numLiterals - 1]; + if (delta_ < MIN_DELTA || delta_ > MAX_DELTA) { + tailRunLength_ = 1; } else { - tailRunLength = 2; + tailRunLength_ = 2; } } - if (tailRunLength == MINIMUM_REPEAT) { + if (tailRunLength_ == MINIMUM_REPEAT) { if (numLiterals + 1 == MINIMUM_REPEAT) { - repeat = true; + repeat_ = true; numLiterals += 1; } else { numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1); int64_t base = literals[numLiterals]; writeValues(); literals[0] = base; - repeat = true; + repeat_ = true; numLiterals = MINIMUM_REPEAT; } } else { @@ -135,18 +133,23 @@ namespace orc { } } + void RleEncoderV1::finishEncode() { + writeValues(); + RleEncoder::finishEncode(); + } + signed char RleDecoderV1::readByte() { SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs); - if (bufferStart == bufferEnd) { + if (bufferStart_ == bufferEnd_) { int bufferLength; const void* bufferPointer; - if (!inputStream->Next(&bufferPointer, &bufferLength)) { + if (!inputStream_->Next(&bufferPointer, &bufferLength)) { throw ParseError("bad read in readByte"); } - bufferStart = static_cast<const char*>(bufferPointer); - bufferEnd = bufferStart + bufferLength; + bufferStart_ = static_cast<const char*>(bufferPointer); + bufferEnd_ = bufferStart_ + bufferLength; } - return static_cast<signed char>(*(bufferStart++)); + return static_cast<signed char>(*(bufferStart_++)); } uint64_t RleDecoderV1::readLong() { @@ -177,34 +180,34 @@ namespace orc { void RleDecoderV1::readHeader() { signed char ch = readByte(); if (ch < 0) { - remainingValues = static_cast<uint64_t>(-ch); - repeating = false; + remainingValues_ = static_cast<uint64_t>(-ch); + repeating_ = false; } else { - remainingValues = static_cast<uint64_t>(ch) + MINIMUM_REPEAT; - repeating = true; - delta = readByte(); - value = isSigned ? unZigZag(readLong()) : static_cast<int64_t>(readLong()); + remainingValues_ = static_cast<uint64_t>(ch) + MINIMUM_REPEAT; + repeating_ = true; + delta_ = readByte(); + value_ = isSigned_ ? unZigZag(readLong()) : static_cast<int64_t>(readLong()); } } void RleDecoderV1::reset() { - remainingValues = 0; - value = 0; - bufferStart = nullptr; - bufferEnd = nullptr; - delta = 0; - repeating = false; + remainingValues_ = 0; + value_ = 0; + bufferStart_ = nullptr; + bufferEnd_ = nullptr; + delta_ = 0; + repeating_ = false; } RleDecoderV1::RleDecoderV1(std::unique_ptr<SeekableInputStream> input, bool hasSigned, - ReaderMetrics* _metrics) - : RleDecoder(_metrics), inputStream(std::move(input)), isSigned(hasSigned) { + ReaderMetrics* metrics) + : RleDecoder(metrics), inputStream_(std::move(input)), isSigned_(hasSigned) { reset(); } void RleDecoderV1::seek(PositionProvider& location) { // move the input stream - inputStream->seek(location); + inputStream_->seek(location); // reset the decoder status and lazily call readHeader() reset(); // skip ahead the given number of records @@ -213,14 +216,14 @@ namespace orc { void RleDecoderV1::skip(uint64_t numValues) { while (numValues > 0) { - if (remainingValues == 0) { + if (remainingValues_ == 0) { readHeader(); } - uint64_t count = std::min(numValues, remainingValues); - remainingValues -= count; + uint64_t count = std::min(numValues, remainingValues_); + remainingValues_ -= count; numValues -= count; - if (repeating) { - value += delta * static_cast<int64_t>(count); + if (repeating_) { + value_ += delta_ * static_cast<int64_t>(count); } else { skipLongs(count); } @@ -240,38 +243,38 @@ namespace orc { } while (position < numValues) { // If we are out of values, read more. - if (remainingValues == 0) { + if (remainingValues_ == 0) { readHeader(); } // How many do we read out of this block? - uint64_t count = std::min(numValues - position, remainingValues); + uint64_t count = std::min(numValues - position, remainingValues_); uint64_t consumed = 0; - if (repeating) { + if (repeating_) { if (notNull) { for (uint64_t i = 0; i < count; ++i) { if (notNull[position + i]) { - data[position + i] = static_cast<T>(value + static_cast<int64_t>(consumed) * delta); + data[position + i] = static_cast<T>(value_ + static_cast<int64_t>(consumed) * delta_); consumed += 1; } } } else { for (uint64_t i = 0; i < count; ++i) { - data[position + i] = static_cast<T>(value + static_cast<int64_t>(i) * delta); + data[position + i] = static_cast<T>(value_ + static_cast<int64_t>(i) * delta_); } consumed = count; } - value += static_cast<int64_t>(consumed) * delta; + value_ += static_cast<int64_t>(consumed) * delta_; } else { if (notNull) { for (uint64_t i = 0; i < count; ++i) { if (notNull[position + i]) { data[position + i] = - isSigned ? static_cast<T>(unZigZag(readLong())) : static_cast<T>(readLong()); + isSigned_ ? static_cast<T>(unZigZag(readLong())) : static_cast<T>(readLong()); ++consumed; } } } else { - if (isSigned) { + if (isSigned_) { for (uint64_t i = 0; i < count; ++i) { data[position + i] = static_cast<T>(unZigZag(readLong())); } @@ -283,7 +286,7 @@ namespace orc { consumed = count; } } - remainingValues -= consumed; + remainingValues_ -= consumed; position += count; // skipNulls() diff --git a/contrib/libs/apache/orc/c++/src/RLEv1.hh b/contrib/libs/apache/orc/c++/src/RLEv1.hh index fbe6b0f9c6..024b1e5e97 100644 --- a/contrib/libs/apache/orc/c++/src/RLEv1.hh +++ b/contrib/libs/apache/orc/c++/src/RLEv1.hh @@ -38,10 +38,12 @@ namespace orc { void write(int64_t val) override; + void finishEncode() override; + private: - int64_t delta; - bool repeat; - uint64_t tailRunLength; + int64_t delta_; + bool repeat_; + uint64_t tailRunLength_; void writeValues(); }; @@ -83,14 +85,14 @@ namespace orc { inline void reset(); - const std::unique_ptr<SeekableInputStream> inputStream; - const bool isSigned; - uint64_t remainingValues; - int64_t value; - const char* bufferStart; - const char* bufferEnd; - int64_t delta; - bool repeating; + const std::unique_ptr<SeekableInputStream> inputStream_; + const bool isSigned_; + uint64_t remainingValues_; + int64_t value_; + const char* bufferStart_; + const char* bufferEnd_; + int64_t delta_; + bool repeating_; }; } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/RLEv2.hh b/contrib/libs/apache/orc/c++/src/RLEv2.hh index 1cee59d0a6..8ceb7f125b 100644 --- a/contrib/libs/apache/orc/c++/src/RLEv2.hh +++ b/contrib/libs/apache/orc/c++/src/RLEv2.hh @@ -96,10 +96,10 @@ namespace orc { ~RleEncoderV2() override { delete[] literals; - delete[] gapVsPatchList; - delete[] zigzagLiterals; - delete[] baseRedLiterals; - delete[] adjDeltas; + delete[] gapVsPatchList_; + delete[] zigzagLiterals_; + delete[] baseRedLiterals_; + delete[] adjDeltas_; } /** * Flushing underlying BufferedOutputStream @@ -108,19 +108,21 @@ namespace orc { void write(int64_t val) override; + void finishEncode() override; + private: - const bool alignedBitPacking; - uint32_t fixedRunLength; - uint32_t variableRunLength; - int64_t prevDelta; - int32_t histgram[HIST_LEN]; + const bool alignedBitPacking_; + uint32_t fixedRunLength_; + uint32_t variableRunLength_; + int64_t prevDelta_; + int32_t histgram_[HIST_LEN]; // The four list below should actually belong to EncodingOption since it only holds temporal // values in write(int64_t val), it is move here for performance consideration. - int64_t* gapVsPatchList; - int64_t* zigzagLiterals; - int64_t* baseRedLiterals; - int64_t* adjDeltas; + int64_t* gapVsPatchList_; + int64_t* zigzagLiterals_; + int64_t* baseRedLiterals_; + int64_t* adjDeltas_; uint32_t getOpCode(EncodingType encoding); int64_t* prepareForDirectOrPatchedBase(EncodingOption& option); @@ -169,39 +171,39 @@ namespace orc { unsigned char readByte(); void setBufStart(const char* start) { - bufferStart = const_cast<char*>(start); + bufferStart_ = const_cast<char*>(start); } char* getBufStart() { - return bufferStart; + return bufferStart_; } void setBufEnd(const char* end) { - bufferEnd = const_cast<char*>(end); + bufferEnd_ = const_cast<char*>(end); } char* getBufEnd() { - return bufferEnd; + return bufferEnd_; } uint64_t bufLength() { - return bufferEnd - bufferStart; + return bufferEnd_ - bufferStart_; } void setBitsLeft(const uint32_t bits) { - bitsLeft = bits; + bitsLeft_ = bits; } void setCurByte(const uint32_t byte) { - curByte = byte; + curByte_ = byte; } uint32_t getBitsLeft() { - return bitsLeft; + return bitsLeft_; } uint32_t getCurByte() { - return curByte; + return curByte_; } /** @@ -225,8 +227,8 @@ namespace orc { int64_t* resPatch, uint64_t* patchIdx); void resetReadLongs() { - bitsLeft = 0; - curByte = 0; + bitsLeft_ = 0; + curByte_ = 0; } void resetRun() { @@ -249,17 +251,17 @@ namespace orc { template <typename T> uint64_t copyDataFromBuffer(T* data, uint64_t offset, uint64_t numValues, const char* notNull); - const std::unique_ptr<SeekableInputStream> inputStream; - const bool isSigned; - unsigned char firstByte; - char* bufferStart; - char* bufferEnd; - uint64_t runLength; // Length of the current run - uint64_t runRead; // Number of returned values of the current run - uint32_t bitsLeft; // Used by readLongs when bitSize < 8 - uint32_t curByte; // Used by anything that uses readLongs - DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE - DataBuffer<int64_t> literals; // Values of the current run + const std::unique_ptr<SeekableInputStream> inputStream_; + const bool isSigned_; + unsigned char firstByte_; + char* bufferStart_; + char* bufferEnd_; + uint64_t runLength_; // Length of the current run + uint64_t runRead_; // Number of returned values of the current run + uint32_t bitsLeft_; // Used by readLongs when bitSize < 8 + uint32_t curByte_; // Used by anything that uses readLongs + DataBuffer<int64_t> unpackedPatch_; // Used by PATCHED_BASE + DataBuffer<int64_t> literals_; // Values of the current run }; inline void RleDecoderV2::resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupByteLen) { @@ -268,20 +270,20 @@ namespace orc { const void* bufferPointer = nullptr; if (backupByteLen != 0) { - inputStream->BackUp(backupByteLen); + inputStream_->BackUp(backupByteLen); } if (len >= remainingLen && resetBuf) { - if (!inputStream->Next(&bufferPointer, &bufferLength)) { + if (!inputStream_->Next(&bufferPointer, &bufferLength)) { throw ParseError("bad read in RleDecoderV2::resetBufferStart"); } } if (bufferPointer == nullptr) { - bufferStart += len; + bufferStart_ += len; } else { - bufferStart = const_cast<char*>(static_cast<const char*>(bufferPointer)); - bufferEnd = bufferStart + bufferLength; + bufferStart_ = const_cast<char*>(static_cast<const char*>(bufferPointer)); + bufferEnd_ = bufferStart_ + bufferLength; } } } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/Reader.cc b/contrib/libs/apache/orc/c++/src/Reader.cc index 82e77e4705..f47c40ebbe 100644 --- a/contrib/libs/apache/orc/c++/src/Reader.cc +++ b/contrib/libs/apache/orc/c++/src/Reader.cc @@ -73,11 +73,11 @@ namespace orc { } std::string ColumnSelector::toDotColumnPath() { - if (columns.empty()) { + if (columns_.empty()) { return std::string(); } std::ostringstream columnStream; - std::copy(columns.begin(), columns.end(), + std::copy(columns_.begin(), columns_.end(), std::ostream_iterator<std::string>(columnStream, ".")); std::string columnPath = columnStream.str(); return columnPath.substr(0, columnPath.length() - 1); @@ -150,15 +150,15 @@ namespace orc { */ void ColumnSelector::buildTypeNameIdMap(const Type* type) { // map<type_id, Type*> - idTypeMap[type->getColumnId()] = type; + idTypeMap_[type->getColumnId()] = type; if (STRUCT == type->getKind()) { for (size_t i = 0; i < type->getSubtypeCount(); ++i) { const std::string& fieldName = type->getFieldName(i); - columns.push_back(fieldName); - nameIdMap[toDotColumnPath()] = type->getSubtype(i)->getColumnId(); + columns_.push_back(fieldName); + nameIdMap_[toDotColumnPath()] = type->getSubtype(i)->getColumnId(); buildTypeNameIdMap(type->getSubtype(i)); - columns.pop_back(); + columns_.pop_back(); } } else { // other non-primitive type @@ -170,13 +170,13 @@ namespace orc { void ColumnSelector::updateSelected(std::vector<bool>& selectedColumns, const RowReaderOptions& options) { - selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); - if (contents->schema->getKind() == STRUCT && options.getIndexesSet()) { + selectedColumns.assign(static_cast<size_t>(contents_->footer->types_size()), false); + if (contents_->schema->getKind() == STRUCT && options.getIndexesSet()) { for (std::list<uint64_t>::const_iterator field = options.getInclude().begin(); field != options.getInclude().end(); ++field) { updateSelectedByFieldId(selectedColumns, *field); } - } else if (contents->schema->getKind() == STRUCT && options.getNamesSet()) { + } else if (contents_->schema->getKind() == STRUCT && options.getNamesSet()) { for (std::list<std::string>::const_iterator field = options.getIncludeNames().begin(); field != options.getIncludeNames().end(); ++field) { updateSelectedByName(selectedColumns, *field); @@ -191,18 +191,18 @@ namespace orc { // default is to select all columns std::fill(selectedColumns.begin(), selectedColumns.end(), true); } - selectParents(selectedColumns, *contents->schema.get()); + selectParents(selectedColumns, *contents_->schema.get()); selectedColumns[0] = true; // column 0 is selected by default } void ColumnSelector::updateSelectedByFieldId(std::vector<bool>& selectedColumns, uint64_t fieldId) { - if (fieldId < contents->schema->getSubtypeCount()) { - selectChildren(selectedColumns, *contents->schema->getSubtype(fieldId)); + if (fieldId < contents_->schema->getSubtypeCount()) { + selectChildren(selectedColumns, *contents_->schema->getSubtype(fieldId)); } else { std::stringstream buffer; buffer << "Invalid column selected " << fieldId << " out of " - << contents->schema->getSubtypeCount(); + << contents_->schema->getSubtypeCount(); throw ParseError(buffer.str()); } } @@ -215,7 +215,7 @@ namespace orc { std::vector<bool>& selectedColumns, uint64_t typeId, const RowReaderOptions::IdReadIntentMap& idReadIntentMap) { if (typeId < selectedColumns.size()) { - const Type& type = *idTypeMap[typeId]; + const Type& type = *idTypeMap_[typeId]; selectChildren(selectedColumns, type, idReadIntentMap); } else { std::stringstream buffer; @@ -226,14 +226,14 @@ namespace orc { void ColumnSelector::updateSelectedByName(std::vector<bool>& selectedColumns, const std::string& fieldName) { - std::map<std::string, uint64_t>::const_iterator ite = nameIdMap.find(fieldName); - if (ite != nameIdMap.end()) { + std::map<std::string, uint64_t>::const_iterator ite = nameIdMap_.find(fieldName); + if (ite != nameIdMap_.end()) { updateSelectedByTypeId(selectedColumns, ite->second); } else { bool first = true; std::ostringstream ss; ss << "Invalid column selected " << fieldName << ". Valid names are "; - for (auto it = nameIdMap.begin(); it != nameIdMap.end(); ++it) { + for (auto it = nameIdMap_.begin(); it != nameIdMap_.end(); ++it) { if (!first) ss << ", "; ss << it->first; first = false; @@ -242,89 +242,88 @@ namespace orc { } } - ColumnSelector::ColumnSelector(const FileContents* _contents) : contents(_contents) { - buildTypeNameIdMap(contents->schema.get()); + ColumnSelector::ColumnSelector(const FileContents* contents) : contents_(contents) { + buildTypeNameIdMap(contents_->schema.get()); } - RowReaderImpl::RowReaderImpl(std::shared_ptr<FileContents> _contents, - const RowReaderOptions& opts) - : localTimezone(getLocalTimezone()), - contents(_contents), - throwOnHive11DecimalOverflow(opts.getThrowOnHive11DecimalOverflow()), - forcedScaleOnHive11Decimal(opts.getForcedScaleOnHive11Decimal()), - footer(contents->footer.get()), - firstRowOfStripe(*contents->pool, 0), - enableEncodedBlock(opts.getEnableLazyDecoding()), - readerTimezone(getTimezoneByName(opts.getTimezoneName())), - schemaEvolution(opts.getReadType(), contents->schema.get()) { + RowReaderImpl::RowReaderImpl(std::shared_ptr<FileContents> contents, const RowReaderOptions& opts) + : localTimezone_(getLocalTimezone()), + contents_(contents), + throwOnHive11DecimalOverflow_(opts.getThrowOnHive11DecimalOverflow()), + forcedScaleOnHive11Decimal_(opts.getForcedScaleOnHive11Decimal()), + footer_(contents_->footer.get()), + firstRowOfStripe_(*contents_->pool, 0), + enableEncodedBlock_(opts.getEnableLazyDecoding()), + readerTimezone_(getTimezoneByName(opts.getTimezoneName())), + schemaEvolution_(opts.getReadType(), contents_->schema.get()) { uint64_t numberOfStripes; - numberOfStripes = static_cast<uint64_t>(footer->stripes_size()); - currentStripe = numberOfStripes; - lastStripe = 0; - currentRowInStripe = 0; - rowsInCurrentStripe = 0; - numRowGroupsInStripeRange = 0; - useTightNumericVector = opts.getUseTightNumericVector(); - throwOnSchemaEvolutionOverflow = opts.getThrowOnSchemaEvolutionOverflow(); + numberOfStripes = static_cast<uint64_t>(footer_->stripes_size()); + currentStripe_ = numberOfStripes; + lastStripe_ = 0; + currentRowInStripe_ = 0; + rowsInCurrentStripe_ = 0; + numRowGroupsInStripeRange_ = 0; + useTightNumericVector_ = opts.getUseTightNumericVector(); + throwOnSchemaEvolutionOverflow_ = opts.getThrowOnSchemaEvolutionOverflow(); uint64_t rowTotal = 0; - firstRowOfStripe.resize(numberOfStripes); + firstRowOfStripe_.resize(numberOfStripes); for (size_t i = 0; i < numberOfStripes; ++i) { - firstRowOfStripe[i] = rowTotal; - proto::StripeInformation stripeInfo = footer->stripes(static_cast<int>(i)); + firstRowOfStripe_[i] = rowTotal; + proto::StripeInformation stripeInfo = footer_->stripes(static_cast<int>(i)); rowTotal += stripeInfo.number_of_rows(); bool isStripeInRange = stripeInfo.offset() >= opts.getOffset() && stripeInfo.offset() < opts.getOffset() + opts.getLength(); if (isStripeInRange) { - if (i < currentStripe) { - currentStripe = i; + if (i < currentStripe_) { + currentStripe_ = i; } - if (i >= lastStripe) { - lastStripe = i + 1; + if (i >= lastStripe_) { + lastStripe_ = i + 1; } - if (footer->row_index_stride() > 0) { - numRowGroupsInStripeRange += - (stripeInfo.number_of_rows() + footer->row_index_stride() - 1) / - footer->row_index_stride(); + if (footer_->row_index_stride() > 0) { + numRowGroupsInStripeRange_ += + (stripeInfo.number_of_rows() + footer_->row_index_stride() - 1) / + footer_->row_index_stride(); } } } - firstStripe = currentStripe; - processingStripe = lastStripe; + firstStripe_ = currentStripe_; + processingStripe_ = lastStripe_; - if (currentStripe == 0) { - previousRow = (std::numeric_limits<uint64_t>::max)(); - } else if (currentStripe == numberOfStripes) { - previousRow = footer->number_of_rows(); + if (currentStripe_ == 0) { + previousRow_ = (std::numeric_limits<uint64_t>::max)(); + } else if (currentStripe_ == numberOfStripes) { + previousRow_ = footer_->number_of_rows(); } else { - previousRow = firstRowOfStripe[firstStripe] - 1; + previousRow_ = firstRowOfStripe_[firstStripe_] - 1; } - ColumnSelector column_selector(contents.get()); - column_selector.updateSelected(selectedColumns, opts); + ColumnSelector column_selector(contents_.get()); + column_selector.updateSelected(selectedColumns_, opts); // prepare SargsApplier if SearchArgument is available - if (opts.getSearchArgument() && footer->row_index_stride() > 0) { - sargs = opts.getSearchArgument(); - sargsApplier.reset( - new SargsApplier(*contents->schema, sargs.get(), footer->row_index_stride(), - getWriterVersionImpl(_contents.get()), contents->readerMetrics)); + if (opts.getSearchArgument() && footer_->row_index_stride() > 0) { + sargs_ = opts.getSearchArgument(); + sargsApplier_.reset( + new SargsApplier(*contents_->schema, sargs_.get(), footer_->row_index_stride(), + getWriterVersionImpl(contents.get()), contents_->readerMetrics)); } - skipBloomFilters = hasBadBloomFilters(); + skipBloomFilters_ = hasBadBloomFilters(); } // Check if the file has inconsistent bloom filters. bool RowReaderImpl::hasBadBloomFilters() { // Only C++ writer in old releases could have bad bloom filters. - if (footer->writer() != ORC_CPP_WRITER) return false; + if (footer_->writer() != ORC_CPP_WRITER) return false; // 'softwareVersion' is added in 1.5.13, 1.6.11, and 1.7.0. // 1.6.x releases before 1.6.11 won't have it. On the other side, the C++ writer // supports writing bloom filters since 1.6.0. So files written by the C++ writer // and with 'softwareVersion' unset would have bad bloom filters. - if (!footer->has_software_version()) return true; + if (!footer_->has_software_version()) return true; - const std::string& fullVersion = footer->software_version(); + const std::string& fullVersion = footer_->software_version(); std::string version; // Deal with snapshot versions, e.g. 1.6.12-SNAPSHOT. if (fullVersion.find('-') != std::string::npos) { @@ -341,31 +340,31 @@ namespace orc { } CompressionKind RowReaderImpl::getCompression() const { - return contents->compression; + return contents_->compression; } uint64_t RowReaderImpl::getCompressionSize() const { - return contents->blockSize; + return contents_->blockSize; } const std::vector<bool> RowReaderImpl::getSelectedColumns() const { - return selectedColumns; + return selectedColumns_; } const Type& RowReaderImpl::getSelectedType() const { - if (selectedSchema.get() == nullptr) { - selectedSchema = buildSelectedType(contents->schema.get(), selectedColumns); + if (selectedSchema_.get() == nullptr) { + selectedSchema_ = buildSelectedType(contents_->schema.get(), selectedColumns_); } - return *(selectedSchema.get()); + return *(selectedSchema_.get()); } uint64_t RowReaderImpl::getRowNumber() const { - return previousRow; + return previousRow_; } void RowReaderImpl::seekToRow(uint64_t rowNumber) { // Empty file - if (lastStripe == 0) { + if (lastStripe_ == 0) { return; } @@ -375,53 +374,53 @@ namespace orc { // Implement this by setting previousRow to the number of rows in the file. // seeking past lastStripe - uint64_t num_stripes = static_cast<uint64_t>(footer->stripes_size()); - if ((lastStripe == num_stripes && rowNumber >= footer->number_of_rows()) || - (lastStripe < num_stripes && rowNumber >= firstRowOfStripe[lastStripe])) { - currentStripe = num_stripes; - previousRow = footer->number_of_rows(); + uint64_t num_stripes = static_cast<uint64_t>(footer_->stripes_size()); + if ((lastStripe_ == num_stripes && rowNumber >= footer_->number_of_rows()) || + (lastStripe_ < num_stripes && rowNumber >= firstRowOfStripe_[lastStripe_])) { + currentStripe_ = num_stripes; + previousRow_ = footer_->number_of_rows(); return; } uint64_t seekToStripe = 0; - while (seekToStripe + 1 < lastStripe && firstRowOfStripe[seekToStripe + 1] <= rowNumber) { + while (seekToStripe + 1 < lastStripe_ && firstRowOfStripe_[seekToStripe + 1] <= rowNumber) { seekToStripe++; } // seeking before the first stripe - if (seekToStripe < firstStripe) { - currentStripe = num_stripes; - previousRow = footer->number_of_rows(); + if (seekToStripe < firstStripe_) { + currentStripe_ = num_stripes; + previousRow_ = footer_->number_of_rows(); return; } - previousRow = rowNumber; - auto rowIndexStride = footer->row_index_stride(); - if (!isCurrentStripeInited() || currentStripe != seekToStripe || rowIndexStride == 0 || - currentStripeInfo.index_length() == 0) { + previousRow_ = rowNumber; + auto rowIndexStride = footer_->row_index_stride(); + if (!isCurrentStripeInited() || currentStripe_ != seekToStripe || rowIndexStride == 0 || + currentStripeInfo_.index_length() == 0) { // current stripe is not initialized or // target stripe is not current stripe or // current stripe doesn't have row indexes - currentStripe = seekToStripe; - currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe]; + currentStripe_ = seekToStripe; + currentRowInStripe_ = rowNumber - firstRowOfStripe_[currentStripe_]; startNextStripe(); - if (currentStripe >= lastStripe) { + if (currentStripe_ >= lastStripe_) { return; } } else { - currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe]; - if (sargsApplier) { + currentRowInStripe_ = rowNumber - firstRowOfStripe_[currentStripe_]; + if (sargsApplier_) { // advance to selected row group if predicate pushdown is enabled - currentRowInStripe = - advanceToNextRowGroup(currentRowInStripe, rowsInCurrentStripe, - footer->row_index_stride(), sargsApplier->getNextSkippedRows()); + currentRowInStripe_ = + advanceToNextRowGroup(currentRowInStripe_, rowsInCurrentStripe_, + footer_->row_index_stride(), sargsApplier_->getNextSkippedRows()); } } - uint64_t rowsToSkip = currentRowInStripe; + uint64_t rowsToSkip = currentRowInStripe_; // seek to the target row group if row indexes exists - if (rowIndexStride > 0 && currentStripeInfo.index_length() > 0) { - if (rowIndexes.empty()) { + if (rowIndexStride > 0 && currentStripeInfo_.index_length() > 0) { + if (rowIndexes_.empty()) { loadStripeIndex(); } // TODO(ORC-1175): process the failures of loadStripeIndex() call @@ -432,36 +431,36 @@ namespace orc { // 'reader' is reset in startNextStripe(). It could be nullptr if 'rowsToSkip' is 0, // e.g. when startNextStripe() skips all remaining rows of the file. if (rowsToSkip > 0) { - reader->skip(rowsToSkip); + reader_->skip(rowsToSkip); } } void RowReaderImpl::loadStripeIndex() { // reset all previous row indexes - rowIndexes.clear(); - bloomFilterIndex.clear(); + rowIndexes_.clear(); + bloomFilterIndex_.clear(); // obtain row indexes for selected columns - uint64_t offset = currentStripeInfo.offset(); - for (int i = 0; i < currentStripeFooter.streams_size(); ++i) { - const proto::Stream& pbStream = currentStripeFooter.streams(i); + uint64_t offset = currentStripeInfo_.offset(); + for (int i = 0; i < currentStripeFooter_.streams_size(); ++i) { + const proto::Stream& pbStream = currentStripeFooter_.streams(i); uint64_t colId = pbStream.column(); - if (selectedColumns[colId] && pbStream.has_kind() && + if (selectedColumns_[colId] && pbStream.has_kind() && (pbStream.kind() == proto::Stream_Kind_ROW_INDEX || pbStream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8)) { std::unique_ptr<SeekableInputStream> inStream = createDecompressor( getCompression(), std::unique_ptr<SeekableInputStream>(new SeekableFileInputStream( - contents->stream.get(), offset, pbStream.length(), *contents->pool)), - getCompressionSize(), *contents->pool, contents->readerMetrics); + contents_->stream.get(), offset, pbStream.length(), *contents_->pool)), + getCompressionSize(), *contents_->pool, contents_->readerMetrics); if (pbStream.kind() == proto::Stream_Kind_ROW_INDEX) { proto::RowIndex rowIndex; if (!rowIndex.ParseFromZeroCopyStream(inStream.get())) { throw ParseError("Failed to parse the row index"); } - rowIndexes[colId] = rowIndex; - } else if (!skipBloomFilters) { // Stream_Kind_BLOOM_FILTER_UTF8 + rowIndexes_[colId] = rowIndex; + } else if (!skipBloomFilters_) { // Stream_Kind_BLOOM_FILTER_UTF8 proto::BloomFilterIndex pbBFIndex; if (!pbBFIndex.ParseFromZeroCopyStream(inStream.get())) { throw ParseError("Failed to parse bloom filter index"); @@ -469,11 +468,11 @@ namespace orc { BloomFilterIndex bfIndex; for (int j = 0; j < pbBFIndex.bloom_filter_size(); j++) { bfIndex.entries.push_back(BloomFilterUTF8Utils::deserialize( - pbStream.kind(), currentStripeFooter.columns(static_cast<int>(pbStream.column())), + pbStream.kind(), currentStripeFooter_.columns(static_cast<int>(pbStream.column())), pbBFIndex.bloom_filter(j))); } // add bloom filters to result for one column - bloomFilterIndex[pbStream.column()] = bfIndex; + bloomFilterIndex_[pbStream.column()] = bfIndex; } } offset += pbStream.length(); @@ -486,7 +485,7 @@ namespace orc { // store position providers for selected colimns std::unordered_map<uint64_t, PositionProvider> positionProviders; - for (auto rowIndex = rowIndexes.cbegin(); rowIndex != rowIndexes.cend(); ++rowIndex) { + for (auto rowIndex = rowIndexes_.cbegin(); rowIndex != rowIndexes_.cend(); ++rowIndex) { uint64_t colId = rowIndex->first; const proto::RowIndexEntry& entry = rowIndex->second.entry(static_cast<int32_t>(rowGroupEntryId)); @@ -500,23 +499,23 @@ namespace orc { positionProviders.insert(std::make_pair(colId, PositionProvider(position))); } - reader->seekToRowGroup(positionProviders); + reader_->seekToRowGroup(positionProviders); } const FileContents& RowReaderImpl::getFileContents() const { - return *contents; + return *contents_; } bool RowReaderImpl::getThrowOnHive11DecimalOverflow() const { - return throwOnHive11DecimalOverflow; + return throwOnHive11DecimalOverflow_; } bool RowReaderImpl::getIsDecimalAsLong() const { - return contents->isDecimalAsLong; + return contents_->isDecimalAsLong; } int32_t RowReaderImpl::getForcedScaleOnHive11Decimal() const { - return forcedScaleOnHive11Decimal; + return forcedScaleOnHive11Decimal_; } proto::StripeFooter getStripeFooter(const proto::StripeInformation& info, @@ -542,29 +541,29 @@ namespace orc { return result; } - ReaderImpl::ReaderImpl(std::shared_ptr<FileContents> _contents, const ReaderOptions& opts, - uint64_t _fileLength, uint64_t _postscriptLength) - : contents(std::move(_contents)), - options(opts), - fileLength(_fileLength), - postscriptLength(_postscriptLength), - footer(contents->footer.get()) { - isMetadataLoaded = false; + ReaderImpl::ReaderImpl(std::shared_ptr<FileContents> contents, const ReaderOptions& opts, + uint64_t fileLength, uint64_t postscriptLength) + : contents_(std::move(contents)), + options_(opts), + fileLength_(fileLength), + postscriptLength_(postscriptLength), + footer_(contents_->footer.get()) { + isMetadataLoaded_ = false; checkOrcVersion(); - numberOfStripes = static_cast<uint64_t>(footer->stripes_size()); - contents->schema = convertType(footer->types(0), *footer); - contents->blockSize = getCompressionBlockSize(*contents->postscript); - contents->compression = convertCompressionKind(*contents->postscript); + numberOfStripes_ = static_cast<uint64_t>(footer_->stripes_size()); + contents_->schema = convertType(footer_->types(0), *footer_); + contents_->blockSize = getCompressionBlockSize(*contents_->postscript); + contents_->compression = convertCompressionKind(*contents_->postscript); } std::string ReaderImpl::getSerializedFileTail() const { proto::FileTail tail; proto::PostScript* mutable_ps = tail.mutable_postscript(); - mutable_ps->CopyFrom(*contents->postscript); + mutable_ps->CopyFrom(*contents_->postscript); proto::Footer* mutableFooter = tail.mutable_footer(); - mutableFooter->CopyFrom(*footer); - tail.set_file_length(fileLength); - tail.set_postscript_length(postscriptLength); + mutableFooter->CopyFrom(*footer_); + tail.set_file_length(fileLength_); + tail.set_postscript_length(postscriptLength_); TProtobufString result; if (!tail.SerializeToString(&result)) { throw ParseError("Failed to serialize file tail"); @@ -573,56 +572,56 @@ namespace orc { } const ReaderOptions& ReaderImpl::getReaderOptions() const { - return options; + return options_; } CompressionKind ReaderImpl::getCompression() const { - return contents->compression; + return contents_->compression; } uint64_t ReaderImpl::getCompressionSize() const { - return contents->blockSize; + return contents_->blockSize; } uint64_t ReaderImpl::getNumberOfStripes() const { - return numberOfStripes; + return numberOfStripes_; } uint64_t ReaderImpl::getNumberOfStripeStatistics() const { - if (!isMetadataLoaded) { + if (!isMetadataLoaded_) { readMetadata(); } - return contents->metadata == nullptr + return contents_->metadata == nullptr ? 0 - : static_cast<uint64_t>(contents->metadata->stripe_stats_size()); + : static_cast<uint64_t>(contents_->metadata->stripe_stats_size()); } std::unique_ptr<StripeInformation> ReaderImpl::getStripe(uint64_t stripeIndex) const { if (stripeIndex > getNumberOfStripes()) { throw std::logic_error("stripe index out of range"); } - proto::StripeInformation stripeInfo = footer->stripes(static_cast<int>(stripeIndex)); + proto::StripeInformation stripeInfo = footer_->stripes(static_cast<int>(stripeIndex)); return std::unique_ptr<StripeInformation>(new StripeInformationImpl( stripeInfo.offset(), stripeInfo.index_length(), stripeInfo.data_length(), - stripeInfo.footer_length(), stripeInfo.number_of_rows(), contents->stream.get(), - *contents->pool, contents->compression, contents->blockSize, contents->readerMetrics)); + stripeInfo.footer_length(), stripeInfo.number_of_rows(), contents_->stream.get(), + *contents_->pool, contents_->compression, contents_->blockSize, contents_->readerMetrics)); } FileVersion ReaderImpl::getFormatVersion() const { - if (contents->postscript->version_size() != 2) { + if (contents_->postscript->version_size() != 2) { return FileVersion::v_0_11(); } - return {contents->postscript->version(0), contents->postscript->version(1)}; + return {contents_->postscript->version(0), contents_->postscript->version(1)}; } uint64_t ReaderImpl::getNumberOfRows() const { - return footer->number_of_rows(); + return footer_->number_of_rows(); } WriterId ReaderImpl::getWriterId() const { - if (footer->has_writer()) { - uint32_t id = footer->writer(); + if (footer_->has_writer()) { + uint32_t id = footer_->writer(); if (id > WriterId::CUDF_WRITER) { return WriterId::UNKNOWN_WRITER; } else { @@ -633,8 +632,8 @@ namespace orc { } uint32_t ReaderImpl::getWriterIdValue() const { - if (footer->has_writer()) { - return footer->writer(); + if (footer_->has_writer()) { + return footer_->writer(); } else { return WriterId::ORC_JAVA_WRITER; } @@ -643,56 +642,56 @@ namespace orc { std::string ReaderImpl::getSoftwareVersion() const { std::ostringstream buffer; buffer << writerIdToString(getWriterIdValue()); - if (footer->has_software_version()) { - buffer << " " << footer->software_version(); + if (footer_->has_software_version()) { + buffer << " " << footer_->software_version(); } return buffer.str(); } WriterVersion ReaderImpl::getWriterVersion() const { - return getWriterVersionImpl(contents.get()); + return getWriterVersionImpl(contents_.get()); } uint64_t ReaderImpl::getContentLength() const { - return footer->content_length(); + return footer_->content_length(); } uint64_t ReaderImpl::getStripeStatisticsLength() const { - return contents->postscript->metadata_length(); + return contents_->postscript->metadata_length(); } uint64_t ReaderImpl::getFileFooterLength() const { - return contents->postscript->footer_length(); + return contents_->postscript->footer_length(); } uint64_t ReaderImpl::getFilePostscriptLength() const { - return postscriptLength; + return postscriptLength_; } uint64_t ReaderImpl::getFileLength() const { - return fileLength; + return fileLength_; } uint64_t ReaderImpl::getRowIndexStride() const { - return footer->row_index_stride(); + return footer_->row_index_stride(); } const std::string& ReaderImpl::getStreamName() const { - return contents->stream->getName(); + return contents_->stream->getName(); } std::list<std::string> ReaderImpl::getMetadataKeys() const { std::list<std::string> result; - for (int i = 0; i < footer->metadata_size(); ++i) { - result.push_back(footer->metadata(i).name()); + for (int i = 0; i < footer_->metadata_size(); ++i) { + result.push_back(footer_->metadata(i).name()); } return result; } std::string ReaderImpl::getMetadataValue(const std::string& key) const { - for (int i = 0; i < footer->metadata_size(); ++i) { - if (footer->metadata(i).name() == key) { - return footer->metadata(i).value(); + for (int i = 0; i < footer_->metadata_size(); ++i) { + if (footer_->metadata(i).name() == key) { + return footer_->metadata(i).value(); } } throw std::range_error("key not found"); @@ -719,10 +718,10 @@ namespace orc { throw ParseError(msg.str()); } std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(contents->compression, + createDecompressor(contents_->compression, std::unique_ptr<SeekableInputStream>(new SeekableFileInputStream( - contents->stream.get(), offset, length, *contents->pool)), - contents->blockSize, *(contents->pool), contents->readerMetrics); + contents_->stream.get(), offset, length, *contents_->pool)), + contents_->blockSize, *(contents_->pool), contents_->readerMetrics); proto::RowIndex rowIndex; if (!rowIndex.ParseFromZeroCopyStream(pbStream.get())) { @@ -740,8 +739,8 @@ namespace orc { } bool ReaderImpl::hasMetadataValue(const std::string& key) const { - for (int i = 0; i < footer->metadata_size(); ++i) { - if (footer->metadata(i).name() == key) { + for (int i = 0; i < footer_->metadata_size(); ++i) { + if (footer_->metadata(i).name() == key) { return true; } } @@ -749,22 +748,22 @@ namespace orc { } const Type& ReaderImpl::getType() const { - return *(contents->schema.get()); + return *(contents_->schema.get()); } std::unique_ptr<StripeStatistics> ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const { - if (!isMetadataLoaded) { + if (!isMetadataLoaded_) { readMetadata(); } - if (contents->metadata == nullptr) { + if (contents_->metadata == nullptr) { throw std::logic_error("No stripe statistics in file"); } size_t num_cols = static_cast<size_t>( - contents->metadata->stripe_stats(static_cast<int>(stripeIndex)).col_stats_size()); + contents_->metadata->stripe_stats(static_cast<int>(stripeIndex)).col_stats_size()); std::vector<std::vector<proto::ColumnStatistics>> indexStats(num_cols); - proto::StripeInformation currentStripeInfo = footer->stripes(static_cast<int>(stripeIndex)); - proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get()); + proto::StripeInformation currentStripeInfo = footer_->stripes(static_cast<int>(stripeIndex)); + proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents_.get()); getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats); @@ -773,47 +772,47 @@ namespace orc { : getLocalTimezone(); StatContext statContext(hasCorrectStatistics(), &writerTZ); return std::make_unique<StripeStatisticsImpl>( - contents->metadata->stripe_stats(static_cast<int>(stripeIndex)), indexStats, statContext); + contents_->metadata->stripe_stats(static_cast<int>(stripeIndex)), indexStats, statContext); } std::unique_ptr<Statistics> ReaderImpl::getStatistics() const { StatContext statContext(hasCorrectStatistics()); - return std::make_unique<StatisticsImpl>(*footer, statContext); + return std::make_unique<StatisticsImpl>(*footer_, statContext); } std::unique_ptr<ColumnStatistics> ReaderImpl::getColumnStatistics(uint32_t index) const { - if (index >= static_cast<uint64_t>(footer->statistics_size())) { + if (index >= static_cast<uint64_t>(footer_->statistics_size())) { throw std::logic_error("column index out of range"); } - proto::ColumnStatistics col = footer->statistics(static_cast<int32_t>(index)); + proto::ColumnStatistics col = footer_->statistics(static_cast<int32_t>(index)); StatContext statContext(hasCorrectStatistics()); return std::unique_ptr<ColumnStatistics>(convertColumnStatistics(col, statContext)); } void ReaderImpl::readMetadata() const { - uint64_t metadataSize = contents->postscript->metadata_length(); - uint64_t footerLength = contents->postscript->footer_length(); - if (fileLength < metadataSize + footerLength + postscriptLength + 1) { + uint64_t metadataSize = contents_->postscript->metadata_length(); + uint64_t footerLength = contents_->postscript->footer_length(); + if (fileLength_ < metadataSize + footerLength + postscriptLength_ + 1) { std::stringstream msg; - msg << "Invalid Metadata length: fileLength=" << fileLength + msg << "Invalid Metadata length: fileLength=" << fileLength_ << ", metadataLength=" << metadataSize << ", footerLength=" << footerLength - << ", postscriptLength=" << postscriptLength; + << ", postscriptLength=" << postscriptLength_; throw ParseError(msg.str()); } - uint64_t metadataStart = fileLength - metadataSize - footerLength - postscriptLength - 1; + uint64_t metadataStart = fileLength_ - metadataSize - footerLength - postscriptLength_ - 1; if (metadataSize != 0) { std::unique_ptr<SeekableInputStream> pbStream = createDecompressor( - contents->compression, - std::make_unique<SeekableFileInputStream>(contents->stream.get(), metadataStart, - metadataSize, *contents->pool), - contents->blockSize, *contents->pool, contents->readerMetrics); - contents->metadata.reset(new proto::Metadata()); - if (!contents->metadata->ParseFromZeroCopyStream(pbStream.get())) { + contents_->compression, + std::make_unique<SeekableFileInputStream>(contents_->stream.get(), metadataStart, + metadataSize, *contents_->pool), + contents_->blockSize, *contents_->pool, contents_->readerMetrics); + contents_->metadata.reset(new proto::Metadata()); + if (!contents_->metadata->ParseFromZeroCopyStream(pbStream.get())) { throw ParseError("Failed to parse the metadata"); } } - isMetadataLoaded = true; + isMetadataLoaded_ = true; } bool ReaderImpl::hasCorrectStatistics() const { @@ -823,9 +822,9 @@ namespace orc { void ReaderImpl::checkOrcVersion() { FileVersion version = getFormatVersion(); if (version != FileVersion(0, 11) && version != FileVersion(0, 12)) { - *(options.getErrorStream()) << "Warning: ORC file " << contents->stream->getName() - << " was written in an unknown format version " - << version.toString() << "\n"; + *(options_.getErrorStream()) + << "Warning: ORC file " << contents_->stream->getName() + << " was written in an unknown format version " << version.toString() << "\n"; } } @@ -835,11 +834,11 @@ namespace orc { } std::unique_ptr<RowReader> ReaderImpl::createRowReader(const RowReaderOptions& opts) const { - if (opts.getSearchArgument() && !isMetadataLoaded) { + if (opts.getSearchArgument() && !isMetadataLoaded_) { // load stripe statistics for PPD readMetadata(); } - return std::make_unique<RowReaderImpl>(contents, opts); + return std::make_unique<RowReaderImpl>(contents_, opts); } uint64_t maxStreamsForType(const proto::Type& type) { @@ -874,15 +873,15 @@ namespace orc { uint64_t ReaderImpl::getMemoryUse(int stripeIx) { std::vector<bool> selectedColumns; - selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), true); + selectedColumns.assign(static_cast<size_t>(contents_->footer->types_size()), true); return getMemoryUse(stripeIx, selectedColumns); } uint64_t ReaderImpl::getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx) { std::vector<bool> selectedColumns; - selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); - ColumnSelector column_selector(contents.get()); - if (contents->schema->getKind() == STRUCT && include.begin() != include.end()) { + selectedColumns.assign(static_cast<size_t>(contents_->footer->types_size()), false); + ColumnSelector column_selector(contents_.get()); + if (contents_->schema->getKind() == STRUCT && include.begin() != include.end()) { for (std::list<uint64_t>::const_iterator field = include.begin(); field != include.end(); ++field) { column_selector.updateSelectedByFieldId(selectedColumns, *field); @@ -891,16 +890,16 @@ namespace orc { // default is to select all columns std::fill(selectedColumns.begin(), selectedColumns.end(), true); } - column_selector.selectParents(selectedColumns, *contents->schema.get()); + column_selector.selectParents(selectedColumns, *contents_->schema.get()); selectedColumns[0] = true; // column 0 is selected by default return getMemoryUse(stripeIx, selectedColumns); } uint64_t ReaderImpl::getMemoryUseByName(const std::list<std::string>& names, int stripeIx) { std::vector<bool> selectedColumns; - selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); - ColumnSelector column_selector(contents.get()); - if (contents->schema->getKind() == STRUCT && names.begin() != names.end()) { + selectedColumns.assign(static_cast<size_t>(contents_->footer->types_size()), false); + ColumnSelector column_selector(contents_.get()); + if (contents_->schema->getKind() == STRUCT && names.begin() != names.end()) { for (std::list<std::string>::const_iterator field = names.begin(); field != names.end(); ++field) { column_selector.updateSelectedByName(selectedColumns, *field); @@ -909,15 +908,15 @@ namespace orc { // default is to select all columns std::fill(selectedColumns.begin(), selectedColumns.end(), true); } - column_selector.selectParents(selectedColumns, *contents->schema.get()); + column_selector.selectParents(selectedColumns, *contents_->schema.get()); selectedColumns[0] = true; // column 0 is selected by default return getMemoryUse(stripeIx, selectedColumns); } uint64_t ReaderImpl::getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx) { std::vector<bool> selectedColumns; - selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); - ColumnSelector column_selector(contents.get()); + selectedColumns.assign(static_cast<size_t>(contents_->footer->types_size()), false); + ColumnSelector column_selector(contents_.get()); if (include.begin() != include.end()) { for (std::list<uint64_t>::const_iterator field = include.begin(); field != include.end(); ++field) { @@ -927,7 +926,7 @@ namespace orc { // default is to select all columns std::fill(selectedColumns.begin(), selectedColumns.end(), true); } - column_selector.selectParents(selectedColumns, *contents->schema.get()); + column_selector.selectParents(selectedColumns, *contents_->schema.get()); selectedColumns[0] = true; // column 0 is selected by default return getMemoryUse(stripeIx, selectedColumns); } @@ -935,14 +934,14 @@ namespace orc { uint64_t ReaderImpl::getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns) { uint64_t maxDataLength = 0; - if (stripeIx >= 0 && stripeIx < footer->stripes_size()) { - uint64_t stripe = footer->stripes(stripeIx).data_length(); + if (stripeIx >= 0 && stripeIx < footer_->stripes_size()) { + uint64_t stripe = footer_->stripes(stripeIx).data_length(); if (maxDataLength < stripe) { maxDataLength = stripe; } } else { - for (int i = 0; i < footer->stripes_size(); i++) { - uint64_t stripe = footer->stripes(i).data_length(); + for (int i = 0; i < footer_->stripes_size(); i++) { + uint64_t stripe = footer_->stripes(i).data_length(); if (maxDataLength < stripe) { maxDataLength = stripe; } @@ -951,9 +950,9 @@ namespace orc { bool hasStringColumn = false; uint64_t nSelectedStreams = 0; - for (int i = 0; !hasStringColumn && i < footer->types_size(); i++) { + for (int i = 0; !hasStringColumn && i < footer_->types_size(); i++) { if (selectedColumns[static_cast<size_t>(i)]) { - const proto::Type& type = footer->types(i); + const proto::Type& type = footer_->types(i); nSelectedStreams += maxStreamsForType(type); switch (static_cast<int64_t>(type.kind())) { case proto::Type_Kind_CHAR: @@ -979,29 +978,29 @@ namespace orc { uint64_t memory = hasStringColumn ? 2 * maxDataLength : std::min(uint64_t(maxDataLength), - nSelectedStreams * contents->stream->getNaturalReadSize()); + nSelectedStreams * contents_->stream->getNaturalReadSize()); // Do we need even more memory to read the footer or the metadata? - if (memory < contents->postscript->footer_length() + DIRECTORY_SIZE_GUESS) { - memory = contents->postscript->footer_length() + DIRECTORY_SIZE_GUESS; + if (memory < contents_->postscript->footer_length() + DIRECTORY_SIZE_GUESS) { + memory = contents_->postscript->footer_length() + DIRECTORY_SIZE_GUESS; } - if (memory < contents->postscript->metadata_length()) { - memory = contents->postscript->metadata_length(); + if (memory < contents_->postscript->metadata_length()) { + memory = contents_->postscript->metadata_length(); } // Account for firstRowOfStripe. - memory += static_cast<uint64_t>(footer->stripes_size()) * sizeof(uint64_t); + memory += static_cast<uint64_t>(footer_->stripes_size()) * sizeof(uint64_t); // Decompressors need buffers for each stream uint64_t decompressorMemory = 0; - if (contents->compression != CompressionKind_NONE) { - for (int i = 0; i < footer->types_size(); i++) { + if (contents_->compression != CompressionKind_NONE) { + for (int i = 0; i < footer_->types_size(); i++) { if (selectedColumns[static_cast<size_t>(i)]) { - const proto::Type& type = footer->types(i); - decompressorMemory += maxStreamsForType(type) * contents->blockSize; + const proto::Type& type = footer_->types(i); + decompressorMemory += maxStreamsForType(type) * contents_->blockSize; } } - if (contents->compression == CompressionKind_SNAPPY) { + if (contents_->compression == CompressionKind_SNAPPY) { decompressorMemory *= 2; // Snappy decompressor uses a second buffer } } @@ -1011,101 +1010,104 @@ namespace orc { // Update fields to indicate we've reached the end of file void RowReaderImpl::markEndOfFile() { - currentStripe = lastStripe; - currentRowInStripe = 0; - rowsInCurrentStripe = 0; - if (lastStripe == 0) { + currentStripe_ = lastStripe_; + currentRowInStripe_ = 0; + rowsInCurrentStripe_ = 0; + if (lastStripe_ == 0) { // Empty file - previousRow = 0; + previousRow_ = 0; } else { - previousRow = firstRowOfStripe[lastStripe - 1] + - footer->stripes(static_cast<int>(lastStripe - 1)).number_of_rows(); + previousRow_ = firstRowOfStripe_[lastStripe_ - 1] + + footer_->stripes(static_cast<int>(lastStripe_ - 1)).number_of_rows(); } } void RowReaderImpl::startNextStripe() { - reader.reset(); // ColumnReaders use lots of memory; free old memory first - rowIndexes.clear(); - bloomFilterIndex.clear(); + reader_.reset(); // ColumnReaders use lots of memory; free old memory first + rowIndexes_.clear(); + bloomFilterIndex_.clear(); // evaluate file statistics if it exists - if (sargsApplier && !sargsApplier->evaluateFileStatistics(*footer, numRowGroupsInStripeRange)) { + if (sargsApplier_ && + !sargsApplier_->evaluateFileStatistics(*footer_, numRowGroupsInStripeRange_)) { // skip the entire file markEndOfFile(); return; } do { - currentStripeInfo = footer->stripes(static_cast<int>(currentStripe)); - uint64_t fileLength = contents->stream->getLength(); - if (currentStripeInfo.offset() + currentStripeInfo.index_length() + - currentStripeInfo.data_length() + currentStripeInfo.footer_length() >= + currentStripeInfo_ = footer_->stripes(static_cast<int>(currentStripe_)); + uint64_t fileLength = contents_->stream->getLength(); + if (currentStripeInfo_.offset() + currentStripeInfo_.index_length() + + currentStripeInfo_.data_length() + currentStripeInfo_.footer_length() >= fileLength) { std::stringstream msg; - msg << "Malformed StripeInformation at stripe index " << currentStripe + msg << "Malformed StripeInformation at stripe index " << currentStripe_ << ": fileLength=" << fileLength - << ", StripeInfo=(offset=" << currentStripeInfo.offset() - << ", indexLength=" << currentStripeInfo.index_length() - << ", dataLength=" << currentStripeInfo.data_length() - << ", footerLength=" << currentStripeInfo.footer_length() << ")"; + << ", StripeInfo=(offset=" << currentStripeInfo_.offset() + << ", indexLength=" << currentStripeInfo_.index_length() + << ", dataLength=" << currentStripeInfo_.data_length() + << ", footerLength=" << currentStripeInfo_.footer_length() << ")"; throw ParseError(msg.str()); } - currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get()); - rowsInCurrentStripe = currentStripeInfo.number_of_rows(); - processingStripe = currentStripe; - - if (sargsApplier) { - bool isStripeNeeded = true; - if (contents->metadata) { - const auto& currentStripeStats = - contents->metadata->stripe_stats(static_cast<int>(currentStripe)); - // skip this stripe after stats fail to satisfy sargs - uint64_t stripeRowGroupCount = - (rowsInCurrentStripe + footer->row_index_stride() - 1) / footer->row_index_stride(); - isStripeNeeded = - sargsApplier->evaluateStripeStatistics(currentStripeStats, stripeRowGroupCount); - } + rowsInCurrentStripe_ = currentStripeInfo_.number_of_rows(); + processingStripe_ = currentStripe_; + + bool isStripeNeeded = true; + // If PPD enabled and stripe stats existed, evaulate it first + if (sargsApplier_ && contents_->metadata) { + const auto& currentStripeStats = + contents_->metadata->stripe_stats(static_cast<int>(currentStripe_)); + // skip this stripe after stats fail to satisfy sargs + uint64_t stripeRowGroupCount = + (rowsInCurrentStripe_ + footer_->row_index_stride() - 1) / footer_->row_index_stride(); + isStripeNeeded = + sargsApplier_->evaluateStripeStatistics(currentStripeStats, stripeRowGroupCount); + } - if (isStripeNeeded) { + if (isStripeNeeded) { + currentStripeFooter_ = getStripeFooter(currentStripeInfo_, *contents_.get()); + if (sargsApplier_) { // read row group statistics and bloom filters of current stripe loadStripeIndex(); // select row groups to read in the current stripe - sargsApplier->pickRowGroups(rowsInCurrentStripe, rowIndexes, bloomFilterIndex); - if (sargsApplier->hasSelectedFrom(currentRowInStripe)) { + sargsApplier_->pickRowGroups(rowsInCurrentStripe_, rowIndexes_, bloomFilterIndex_); + if (sargsApplier_->hasSelectedFrom(currentRowInStripe_)) { // current stripe has at least one row group matching the predicate break; } isStripeNeeded = false; } - if (!isStripeNeeded) { - // advance to next stripe when current stripe has no matching rows - currentStripe += 1; - currentRowInStripe = 0; - } } - } while (sargsApplier && currentStripe < lastStripe); - if (currentStripe < lastStripe) { + if (!isStripeNeeded) { + // advance to next stripe when current stripe has no matching rows + currentStripe_ += 1; + currentRowInStripe_ = 0; + } + } while (sargsApplier_ && currentStripe_ < lastStripe_); + + if (currentStripe_ < lastStripe_) { // get writer timezone info from stripe footer to help understand timestamp values. const Timezone& writerTimezone = - currentStripeFooter.has_writer_timezone() - ? getTimezoneByName(currentStripeFooter.writer_timezone()) - : localTimezone; - StripeStreamsImpl stripeStreams(*this, currentStripe, currentStripeInfo, currentStripeFooter, - currentStripeInfo.offset(), *contents->stream, writerTimezone, - readerTimezone); - reader = buildReader(*contents->schema, stripeStreams, useTightNumericVector, - throwOnSchemaEvolutionOverflow, /*convertToReadType=*/true); - - if (sargsApplier) { + currentStripeFooter_.has_writer_timezone() + ? getTimezoneByName(currentStripeFooter_.writer_timezone()) + : localTimezone_; + StripeStreamsImpl stripeStreams(*this, currentStripe_, currentStripeInfo_, + currentStripeFooter_, currentStripeInfo_.offset(), + *contents_->stream, writerTimezone, readerTimezone_); + reader_ = buildReader(*contents_->schema, stripeStreams, useTightNumericVector_, + throwOnSchemaEvolutionOverflow_, /*convertToReadType=*/true); + + if (sargsApplier_) { // move to the 1st selected row group when PPD is enabled. - currentRowInStripe = - advanceToNextRowGroup(currentRowInStripe, rowsInCurrentStripe, - footer->row_index_stride(), sargsApplier->getNextSkippedRows()); - previousRow = firstRowOfStripe[currentStripe] + currentRowInStripe - 1; - if (currentRowInStripe > 0) { - seekToRowGroup(static_cast<uint32_t>(currentRowInStripe / footer->row_index_stride())); + currentRowInStripe_ = + advanceToNextRowGroup(currentRowInStripe_, rowsInCurrentStripe_, + footer_->row_index_stride(), sargsApplier_->getNextSkippedRows()); + previousRow_ = firstRowOfStripe_[currentStripe_] + currentRowInStripe_ - 1; + if (currentRowInStripe_ > 0) { + seekToRowGroup(static_cast<uint32_t>(currentRowInStripe_ / footer_->row_index_stride())); } } } else { @@ -1115,52 +1117,53 @@ namespace orc { } bool RowReaderImpl::next(ColumnVectorBatch& data) { - SCOPED_STOPWATCH(contents->readerMetrics, ReaderInclusiveLatencyUs, ReaderCall); - if (currentStripe >= lastStripe) { + SCOPED_STOPWATCH(contents_->readerMetrics, ReaderInclusiveLatencyUs, ReaderCall); + if (currentStripe_ >= lastStripe_) { data.numElements = 0; markEndOfFile(); return false; } - if (currentRowInStripe == 0) { + if (currentRowInStripe_ == 0) { startNextStripe(); } uint64_t rowsToRead = - std::min(static_cast<uint64_t>(data.capacity), rowsInCurrentStripe - currentRowInStripe); - if (sargsApplier && rowsToRead > 0) { - rowsToRead = computeBatchSize(rowsToRead, currentRowInStripe, rowsInCurrentStripe, - footer->row_index_stride(), sargsApplier->getNextSkippedRows()); + std::min(static_cast<uint64_t>(data.capacity), rowsInCurrentStripe_ - currentRowInStripe_); + if (sargsApplier_ && rowsToRead > 0) { + rowsToRead = + computeBatchSize(rowsToRead, currentRowInStripe_, rowsInCurrentStripe_, + footer_->row_index_stride(), sargsApplier_->getNextSkippedRows()); } data.numElements = rowsToRead; if (rowsToRead == 0) { markEndOfFile(); return false; } - if (enableEncodedBlock) { - reader->nextEncoded(data, rowsToRead, nullptr); + if (enableEncodedBlock_) { + reader_->nextEncoded(data, rowsToRead, nullptr); } else { - reader->next(data, rowsToRead, nullptr); + reader_->next(data, rowsToRead, nullptr); } // update row number - previousRow = firstRowOfStripe[currentStripe] + currentRowInStripe; - currentRowInStripe += rowsToRead; + previousRow_ = firstRowOfStripe_[currentStripe_] + currentRowInStripe_; + currentRowInStripe_ += rowsToRead; // check if we need to advance to next selected row group - if (sargsApplier) { + if (sargsApplier_) { uint64_t nextRowToRead = - advanceToNextRowGroup(currentRowInStripe, rowsInCurrentStripe, footer->row_index_stride(), - sargsApplier->getNextSkippedRows()); - if (currentRowInStripe != nextRowToRead) { + advanceToNextRowGroup(currentRowInStripe_, rowsInCurrentStripe_, + footer_->row_index_stride(), sargsApplier_->getNextSkippedRows()); + if (currentRowInStripe_ != nextRowToRead) { // it is guaranteed to be at start of a row group - currentRowInStripe = nextRowToRead; - if (currentRowInStripe < rowsInCurrentStripe) { - seekToRowGroup(static_cast<uint32_t>(currentRowInStripe / footer->row_index_stride())); + currentRowInStripe_ = nextRowToRead; + if (currentRowInStripe_ < rowsInCurrentStripe_) { + seekToRowGroup(static_cast<uint32_t>(currentRowInStripe_ / footer_->row_index_stride())); } } } - if (currentRowInStripe >= rowsInCurrentStripe) { - currentStripe += 1; - currentRowInStripe = 0; + if (currentRowInStripe_ >= rowsInCurrentStripe_) { + currentStripe_ += 1; + currentRowInStripe_ = 0; } return rowsToRead != 0; } @@ -1219,9 +1222,9 @@ namespace orc { std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch(uint64_t capacity) const { // If the read type is specified, then check that the selected schema matches the read type // on the first call to createRowBatch. - if (schemaEvolution.getReadType() && selectedSchema.get() == nullptr) { + if (schemaEvolution_.getReadType() && selectedSchema_.get() == nullptr) { auto fileSchema = &getSelectedType(); - auto readType = schemaEvolution.getReadType(); + auto readType = schemaEvolution_.getReadType(); std::set<uint64_t> readColumns, fileColumns; getColumnIds(readType, readColumns); getColumnIds(fileSchema, fileColumns); @@ -1233,9 +1236,9 @@ namespace orc { } } const Type& readType = - schemaEvolution.getReadType() ? *schemaEvolution.getReadType() : getSelectedType(); - return readType.createRowBatch(capacity, *contents->pool, enableEncodedBlock, - useTightNumericVector); + schemaEvolution_.getReadType() ? *schemaEvolution_.getReadType() : getSelectedType(); + return readType.createRowBatch(capacity, *contents_->pool, enableEncodedBlock_, + useTightNumericVector_); } void ensureOrcFooter(InputStream* stream, DataBuffer<char>* buffer, uint64_t postscriptLength) { @@ -1423,17 +1426,10 @@ namespace orc { uint32_t stripeIndex, const std::set<uint32_t>& included) const { std::map<uint32_t, BloomFilterIndex> ret; - // find stripe info - if (stripeIndex >= static_cast<uint32_t>(footer->stripes_size())) { - throw std::logic_error("Illegal stripe index: " + - to_string(static_cast<int64_t>(stripeIndex))); - } - const proto::StripeInformation currentStripeInfo = - footer->stripes(static_cast<int>(stripeIndex)); - const proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents); + uint64_t offset; + auto currentStripeFooter = loadCurrentStripeFooter(stripeIndex, offset); // iterate stripe footer to get stream of bloom_filter - uint64_t offset = static_cast<uint64_t>(currentStripeInfo.offset()); for (int i = 0; i < currentStripeFooter.streams_size(); i++) { const proto::Stream& stream = currentStripeFooter.streams(i); uint32_t column = static_cast<uint32_t>(stream.column()); @@ -1443,10 +1439,10 @@ namespace orc { if (stream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8 && (included.empty() || included.find(column) != included.end())) { std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(contents->compression, + createDecompressor(contents_->compression, std::make_unique<SeekableFileInputStream>( - contents->stream.get(), offset, length, *contents->pool), - contents->blockSize, *(contents->pool), contents->readerMetrics); + contents_->stream.get(), offset, length, *contents_->pool), + contents_->blockSize, *(contents_->pool), contents_->readerMetrics); proto::BloomFilterIndex pbBFIndex; if (!pbBFIndex.ParseFromZeroCopyStream(pbStream.get())) { @@ -1471,6 +1467,150 @@ namespace orc { return ret; } + proto::StripeFooter ReaderImpl::loadCurrentStripeFooter(uint32_t stripeIndex, + uint64_t& offset) const { + // find stripe info + if (stripeIndex >= static_cast<uint32_t>(footer_->stripes_size())) { + throw std::logic_error("Illegal stripe index: " + + to_string(static_cast<int64_t>(stripeIndex))); + } + const proto::StripeInformation currentStripeInfo = + footer_->stripes(static_cast<int>(stripeIndex)); + offset = static_cast<uint64_t>(currentStripeInfo.offset()); + return getStripeFooter(currentStripeInfo, *contents_); + } + + std::map<uint32_t, RowGroupIndex> ReaderImpl::getRowGroupIndex( + uint32_t stripeIndex, const std::set<uint32_t>& included) const { + std::map<uint32_t, RowGroupIndex> ret; + uint64_t offset; + auto currentStripeFooter = loadCurrentStripeFooter(stripeIndex, offset); + + // iterate stripe footer to get stream of row_index + for (int i = 0; i < currentStripeFooter.streams_size(); i++) { + const proto::Stream& stream = currentStripeFooter.streams(i); + uint32_t column = static_cast<uint32_t>(stream.column()); + uint64_t length = static_cast<uint64_t>(stream.length()); + RowGroupIndex& rowGroupIndex = ret[column]; + + if (stream.kind() == proto::Stream_Kind_ROW_INDEX && + (included.empty() || included.find(column) != included.end())) { + std::unique_ptr<SeekableInputStream> pbStream = + createDecompressor(contents_->compression, + std::make_unique<SeekableFileInputStream>( + contents_->stream.get(), offset, length, *contents_->pool), + contents_->blockSize, *(contents_->pool), contents_->readerMetrics); + + proto::RowIndex pbRowIndex; + if (!pbRowIndex.ParseFromZeroCopyStream(pbStream.get())) { + std::stringstream errMsgBuffer; + errMsgBuffer << "Failed to parse RowIndex at column " << column << " in stripe " + << stripeIndex; + throw ParseError(errMsgBuffer.str()); + } + + // add rowGroupIndex to result for one column + for (auto& rowIndexEntry : pbRowIndex.entry()) { + std::vector<uint64_t> posVector; + for (auto& position : rowIndexEntry.positions()) { + posVector.push_back(position); + } + rowGroupIndex.positions.push_back(posVector); + } + } + offset += length; + } + return ret; + } + + void ReaderImpl::releaseBuffer(uint64_t boundary) { + std::lock_guard<std::mutex> lock(contents_->readCacheMutex); + + if (contents_->readCache) { + contents_->readCache->evictEntriesBefore(boundary); + } + } + + void ReaderImpl::preBuffer(const std::vector<uint32_t>& stripes, + const std::list<uint64_t>& includeTypes) { + std::vector<uint32_t> newStripes; + for (auto stripe : stripes) { + if (stripe < static_cast<uint32_t>(footer_->stripes_size())) newStripes.push_back(stripe); + } + + std::list<uint64_t> newIncludeTypes; + for (auto type : includeTypes) { + if (type < static_cast<uint64_t>(footer_->types_size())) newIncludeTypes.push_back(type); + } + + if (newStripes.empty() || newIncludeTypes.empty()) { + return; + } + + orc::RowReaderOptions rowReaderOptions; + rowReaderOptions.includeTypes(newIncludeTypes); + ColumnSelector columnSelector(contents_.get()); + std::vector<bool> selectedColumns; + columnSelector.updateSelected(selectedColumns, rowReaderOptions); + + std::vector<ReadRange> ranges; + ranges.reserve(newIncludeTypes.size()); + for (auto stripe : newStripes) { + // get stripe information + const auto& stripeInfo = footer_->stripes(stripe); + uint64_t stripeFooterStart = + stripeInfo.offset() + stripeInfo.index_length() + stripeInfo.data_length(); + uint64_t stripeFooterLength = stripeInfo.footer_length(); + + // get stripe footer + std::unique_ptr<SeekableInputStream> pbStream = createDecompressor( + contents_->compression, + std::make_unique<SeekableFileInputStream>(contents_->stream.get(), stripeFooterStart, + stripeFooterLength, *contents_->pool), + contents_->blockSize, *contents_->pool, contents_->readerMetrics); + proto::StripeFooter stripeFooter; + if (!stripeFooter.ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError(std::string("bad StripeFooter from ") + pbStream->getName()); + } + + // traverse all streams in stripe footer, choose selected streams to prebuffer + uint64_t offset = stripeInfo.offset(); + for (int i = 0; i < stripeFooter.streams_size(); i++) { + const proto::Stream& stream = stripeFooter.streams(i); + if (offset + stream.length() > stripeFooterStart) { + std::stringstream msg; + msg << "Malformed stream meta at stream index " << i << " in stripe " << stripe + << ": streamOffset=" << offset << ", streamLength=" << stream.length() + << ", stripeOffset=" << stripeInfo.offset() + << ", stripeIndexLength=" << stripeInfo.index_length() + << ", stripeDataLength=" << stripeInfo.data_length(); + throw ParseError(msg.str()); + } + + if (stream.has_kind() && selectedColumns[stream.column()]) { + const auto& kind = stream.kind(); + if (kind == proto::Stream_Kind_DATA || kind == proto::Stream_Kind_DICTIONARY_DATA || + kind == proto::Stream_Kind_PRESENT || kind == proto::Stream_Kind_LENGTH || + kind == proto::Stream_Kind_SECONDARY) { + ranges.emplace_back(offset, stream.length()); + } + } + + offset += stream.length(); + } + + { + std::lock_guard<std::mutex> lock(contents_->readCacheMutex); + + if (!contents_->readCache) { + contents_->readCache = std::make_shared<ReadRangeCache>( + getStream(), options_.getCacheOptions(), contents_->pool, contents_->readerMetrics); + } + contents_->readCache->cache(std::move(ranges)); + } + } + } + RowReader::~RowReader() { // PASS } diff --git a/contrib/libs/apache/orc/c++/src/Reader.hh b/contrib/libs/apache/orc/c++/src/Reader.hh index a1367e4bd3..39ca739675 100644 --- a/contrib/libs/apache/orc/c++/src/Reader.hh +++ b/contrib/libs/apache/orc/c++/src/Reader.hh @@ -26,6 +26,8 @@ #include "ColumnReader.hh" #include "RLE.hh" +#include "io/Cache.hh" + #include "SchemaEvolution.hh" #include "TypeImpl.hh" #include "sargs/SargsApplier.hh" @@ -39,17 +41,17 @@ namespace orc { */ class WriterVersionImpl { private: - WriterVersion version; + WriterVersion version_; public: // Known Versions with issues resolved // The static method below is to fix global constructors Clang warning static const WriterVersionImpl& VERSION_HIVE_8732(); - WriterVersionImpl(WriterVersion ver) : version(ver) {} + WriterVersionImpl(WriterVersion ver) : version_(ver) {} bool compareGT(const WriterVersion other) const { - return version > other; + return version_ > other; } }; @@ -70,6 +72,11 @@ namespace orc { bool isDecimalAsLong; std::unique_ptr<proto::Metadata> metadata; ReaderMetrics* readerMetrics; + + // mutex to protect readCache_ from concurrent access + std::mutex readCacheMutex; + // cached io ranges. only valid when preBuffer is invoked. + std::shared_ptr<ReadRangeCache> readCache; }; proto::StripeFooter getStripeFooter(const proto::StripeInformation& info, @@ -80,10 +87,10 @@ namespace orc { class ColumnSelector { private: - std::map<std::string, uint64_t> nameIdMap; - std::map<uint64_t, const Type*> idTypeMap; - const FileContents* contents; - std::vector<std::string> columns; + std::map<std::string, uint64_t> nameIdMap_; + std::map<uint64_t, const Type*> idTypeMap_; + const FileContents* contents_; + std::vector<std::string> columns_; // build map from type name and id, id to Type void buildTypeNameIdMap(const Type* type); @@ -127,54 +134,54 @@ namespace orc { class RowReaderImpl : public RowReader { private: - const Timezone& localTimezone; + const Timezone& localTimezone_; // contents - std::shared_ptr<FileContents> contents; - const bool throwOnHive11DecimalOverflow; - const int32_t forcedScaleOnHive11Decimal; + std::shared_ptr<FileContents> contents_; + const bool throwOnHive11DecimalOverflow_; + const int32_t forcedScaleOnHive11Decimal_; // inputs - std::vector<bool> selectedColumns; + std::vector<bool> selectedColumns_; // footer - proto::Footer* footer; - DataBuffer<uint64_t> firstRowOfStripe; - mutable std::unique_ptr<Type> selectedSchema; - bool skipBloomFilters; + proto::Footer* footer_; + DataBuffer<uint64_t> firstRowOfStripe_; + mutable std::unique_ptr<Type> selectedSchema_; + bool skipBloomFilters_; // reading state - uint64_t previousRow; - uint64_t firstStripe; - uint64_t currentStripe; - uint64_t lastStripe; // the stripe AFTER the last one - uint64_t processingStripe; - uint64_t currentRowInStripe; - uint64_t rowsInCurrentStripe; + uint64_t previousRow_; + uint64_t firstStripe_; + uint64_t currentStripe_; + uint64_t lastStripe_; // the stripe AFTER the last one + uint64_t processingStripe_; + uint64_t currentRowInStripe_; + uint64_t rowsInCurrentStripe_; // number of row groups between first stripe and last stripe - uint64_t numRowGroupsInStripeRange; - proto::StripeInformation currentStripeInfo; - proto::StripeFooter currentStripeFooter; - std::unique_ptr<ColumnReader> reader; - - bool enableEncodedBlock; - bool useTightNumericVector; - bool throwOnSchemaEvolutionOverflow; + uint64_t numRowGroupsInStripeRange_; + proto::StripeInformation currentStripeInfo_; + proto::StripeFooter currentStripeFooter_; + std::unique_ptr<ColumnReader> reader_; + + bool enableEncodedBlock_; + bool useTightNumericVector_; + bool throwOnSchemaEvolutionOverflow_; // internal methods void startNextStripe(); inline void markEndOfFile(); // row index of current stripe with column id as the key - std::unordered_map<uint64_t, proto::RowIndex> rowIndexes; - std::map<uint32_t, BloomFilterIndex> bloomFilterIndex; - std::shared_ptr<SearchArgument> sargs; - std::unique_ptr<SargsApplier> sargsApplier; + std::unordered_map<uint64_t, proto::RowIndex> rowIndexes_; + std::map<uint32_t, BloomFilterIndex> bloomFilterIndex_; + std::shared_ptr<SearchArgument> sargs_; + std::unique_ptr<SargsApplier> sargsApplier_; // desired timezone to return data of timestamp types. - const Timezone& readerTimezone; + const Timezone& readerTimezone_; // match read and file types - SchemaEvolution schemaEvolution; + SchemaEvolution schemaEvolution_; // load stripe index if not done so void loadStripeIndex(); @@ -196,7 +203,7 @@ namespace orc { // whether the current stripe is initialized inline bool isCurrentStripeInited() const { - return currentStripe == processingStripe; + return currentStripe_ == processingStripe_; } /** @@ -243,35 +250,40 @@ namespace orc { int32_t getForcedScaleOnHive11Decimal() const; const SchemaEvolution* getSchemaEvolution() const { - return &schemaEvolution; + return &schemaEvolution_; + } + + std::shared_ptr<ReadRangeCache> getReadCache() const { + return contents_->readCache; } }; class ReaderImpl : public Reader { private: // FileContents - std::shared_ptr<FileContents> contents; + std::shared_ptr<FileContents> contents_; // inputs - const ReaderOptions options; - const uint64_t fileLength; - const uint64_t postscriptLength; + const ReaderOptions options_; + const uint64_t fileLength_; + const uint64_t postscriptLength_; // footer - proto::Footer* footer; - uint64_t numberOfStripes; + proto::Footer* footer_; + uint64_t numberOfStripes_; + uint64_t getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns); // internal methods void readMetadata() const; void checkOrcVersion(); - void getRowIndexStatistics( - const proto::StripeInformation& stripeInfo, uint64_t stripeIndex, - const proto::StripeFooter& currentStripeFooter, - std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const; + void getRowIndexStatistics(const proto::StripeInformation& stripeInfo, uint64_t stripeIndex, + const proto::StripeFooter& currentStripeFooter, + std::vector<std::vector<proto::ColumnStatistics>>* indexStats) const; + proto::StripeFooter loadCurrentStripeFooter(uint32_t stripeIndex, uint64_t& offset) const; // metadata - mutable bool isMetadataLoaded; + mutable bool isMetadataLoaded_; public: /** @@ -341,27 +353,27 @@ namespace orc { bool hasCorrectStatistics() const override; const ReaderMetrics* getReaderMetrics() const override { - return contents->readerMetrics; + return contents_->readerMetrics; } const proto::PostScript* getPostscript() const { - return contents->postscript.get(); + return contents_->postscript.get(); } uint64_t getBlockSize() const { - return contents->blockSize; + return contents_->blockSize; } const proto::Footer* getFooter() const { - return contents->footer.get(); + return contents_->footer.get(); } const Type* getSchema() const { - return contents->schema.get(); + return contents_->schema.get(); } InputStream* getStream() const { - return contents->stream.get(); + return contents_->stream.get(); } uint64_t getMemoryUse(int stripeIx = -1) override; @@ -374,6 +386,13 @@ namespace orc { std::map<uint32_t, BloomFilterIndex> getBloomFilters( uint32_t stripeIndex, const std::set<uint32_t>& included) const override; + + void preBuffer(const std::vector<uint32_t>& stripes, + const std::list<uint64_t>& includeTypes) override; + void releaseBuffer(uint64_t boundary) override; + + std::map<uint32_t, RowGroupIndex> getRowGroupIndex( + uint32_t stripeIndex, const std::set<uint32_t>& included) const override; }; } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc b/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc index ae05a70a36..95eec22ca7 100644 --- a/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc +++ b/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc @@ -31,17 +31,17 @@ namespace orc { unsigned char RleDecoderV2::readByte() { SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs); - if (bufferStart == bufferEnd) { + if (bufferStart_ == bufferEnd_) { int bufferLength; const void* bufferPointer; - if (!inputStream->Next(&bufferPointer, &bufferLength)) { + if (!inputStream_->Next(&bufferPointer, &bufferLength)) { throw ParseError("bad read in RleDecoderV2::readByte"); } - bufferStart = const_cast<char*>(static_cast<const char*>(bufferPointer)); - bufferEnd = bufferStart + bufferLength; + bufferStart_ = const_cast<char*>(static_cast<const char*>(bufferPointer)); + bufferEnd_ = bufferStart_ + bufferLength; } - unsigned char result = static_cast<unsigned char>(*bufferStart++); + unsigned char result = static_cast<unsigned char>(*bufferStart_++); return result; } @@ -89,29 +89,29 @@ namespace orc { return dispatch.func(this, data, offset, len, fbs); } - RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input, bool _isSigned, - MemoryPool& pool, ReaderMetrics* _metrics) - : RleDecoder(_metrics), - inputStream(std::move(input)), - isSigned(_isSigned), - firstByte(0), - bufferStart(nullptr), - bufferEnd(bufferStart), - runLength(0), - runRead(0), - bitsLeft(0), - curByte(0), - unpackedPatch(pool, 0), - literals(pool, MAX_LITERAL_SIZE) { + RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input, bool isSigned, + MemoryPool& pool, ReaderMetrics* metrics) + : RleDecoder(metrics), + inputStream_(std::move(input)), + isSigned_(isSigned), + firstByte_(0), + bufferStart_(nullptr), + bufferEnd_(bufferStart_), + runLength_(0), + runRead_(0), + bitsLeft_(0), + curByte_(0), + unpackedPatch_(pool, 0), + literals_(pool, MAX_LITERAL_SIZE) { // PASS } void RleDecoderV2::seek(PositionProvider& location) { // move the input stream - inputStream->seek(location); + inputStream_->seek(location); // clear state - bufferEnd = bufferStart = nullptr; - runRead = runLength = 0; + bufferEnd_ = bufferStart_ = nullptr; + runRead_ = runLength_ = 0; // skip ahead the given number of records skip(location.next()); } @@ -142,14 +142,14 @@ namespace orc { } } - if (runRead == runLength) { + if (runRead_ == runLength_) { resetRun(); - firstByte = readByte(); + firstByte_ = readByte(); } uint64_t offset = nRead, length = numValues - nRead; - EncodingType enc = static_cast<EncodingType>((firstByte >> 6) & 0x03); + EncodingType enc = static_cast<EncodingType>((firstByte_ >> 6) & 0x03); switch (static_cast<int64_t>(enc)) { case SHORT_REPEAT: nRead += nextShortRepeats(data, offset, length, notNull); @@ -184,37 +184,37 @@ namespace orc { template <typename T> uint64_t RleDecoderV2::nextShortRepeats(T* const data, uint64_t offset, uint64_t numValues, const char* const notNull) { - if (runRead == runLength) { + if (runRead_ == runLength_) { // extract the number of fixed bytes - uint64_t byteSize = (firstByte >> 3) & 0x07; + uint64_t byteSize = (firstByte_ >> 3) & 0x07; byteSize += 1; - runLength = firstByte & 0x07; + runLength_ = firstByte_ & 0x07; // run lengths values are stored only after MIN_REPEAT value is met - runLength += MIN_REPEAT; - runRead = 0; + runLength_ += MIN_REPEAT; + runRead_ = 0; // read the repeated value which is store using fixed bytes - literals[0] = readLongBE(byteSize); + literals_[0] = readLongBE(byteSize); - if (isSigned) { - literals[0] = unZigZag(static_cast<uint64_t>(literals[0])); + if (isSigned_) { + literals_[0] = unZigZag(static_cast<uint64_t>(literals_[0])); } } - uint64_t nRead = std::min(runLength - runRead, numValues); + uint64_t nRead = std::min(runLength_ - runRead_, numValues); if (notNull) { for (uint64_t pos = offset; pos < offset + nRead; ++pos) { if (notNull[pos]) { - data[pos] = static_cast<T>(literals[0]); - ++runRead; + data[pos] = static_cast<T>(literals_[0]); + ++runRead_; } } } else { for (uint64_t pos = offset; pos < offset + nRead; ++pos) { - data[pos] = static_cast<T>(literals[0]); - ++runRead; + data[pos] = static_cast<T>(literals_[0]); + ++runRead_; } } @@ -224,22 +224,22 @@ namespace orc { template <typename T> uint64_t RleDecoderV2::nextDirect(T* const data, uint64_t offset, uint64_t numValues, const char* const notNull) { - if (runRead == runLength) { + if (runRead_ == runLength_) { // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; + unsigned char fbo = (firstByte_ >> 1) & 0x1f; uint32_t bitSize = decodeBitWidth(fbo); // extract the run length - runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; - runLength |= readByte(); + runLength_ = static_cast<uint64_t>(firstByte_ & 0x01) << 8; + runLength_ |= readByte(); // runs are one off - runLength += 1; - runRead = 0; + runLength_ += 1; + runRead_ = 0; - readLongs(literals.data(), 0, runLength, bitSize); - if (isSigned) { - for (uint64_t i = 0; i < runLength; ++i) { - literals[i] = unZigZag(static_cast<uint64_t>(literals[i])); + readLongs(literals_.data(), 0, runLength_, bitSize); + if (isSigned_) { + for (uint64_t i = 0; i < runLength_; ++i) { + literals_[i] = unZigZag(static_cast<uint64_t>(literals_[i])); } } } @@ -250,8 +250,8 @@ namespace orc { void RleDecoderV2::adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, int64_t* resGap, int64_t* resPatch, uint64_t* patchIdx) { uint64_t idx = *patchIdx; - uint64_t gap = static_cast<uint64_t>(unpackedPatch[idx]) >> patchBitSize; - int64_t patch = unpackedPatch[idx] & patchMask; + uint64_t gap = static_cast<uint64_t>(unpackedPatch_[idx]) >> patchBitSize; + int64_t patch = unpackedPatch_[idx] & patchMask; int64_t actualGap = 0; // special case: gap is >255 then patch value will be 0. @@ -259,8 +259,8 @@ namespace orc { while (gap == 255 && patch == 0) { actualGap += 255; ++idx; - gap = static_cast<uint64_t>(unpackedPatch[idx]) >> patchBitSize; - patch = unpackedPatch[idx] & patchMask; + gap = static_cast<uint64_t>(unpackedPatch_[idx]) >> patchBitSize; + patch = unpackedPatch_[idx] & patchMask; } // add the left over gap actualGap += gap; @@ -273,17 +273,17 @@ namespace orc { template <typename T> uint64_t RleDecoderV2::nextPatched(T* const data, uint64_t offset, uint64_t numValues, const char* const notNull) { - if (runRead == runLength) { + if (runRead_ == runLength_) { // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; + unsigned char fbo = (firstByte_ >> 1) & 0x1f; uint32_t bitSize = decodeBitWidth(fbo); // extract the run length - runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; - runLength |= readByte(); + runLength_ = static_cast<uint64_t>(firstByte_ & 0x01) << 8; + runLength_ |= readByte(); // runs are one off - runLength += 1; - runRead = 0; + runLength_ += 1; + runRead_ = 0; // extract the number of bytes occupied by base uint64_t thirdByte = readByte(); @@ -316,12 +316,12 @@ namespace orc { base = -base; } - readLongs(literals.data(), 0, runLength, bitSize); + readLongs(literals_.data(), 0, runLength_, bitSize); // any remaining bits are thrown out resetReadLongs(); // TODO: something more efficient than resize - unpackedPatch.resize(pl); + unpackedPatch_.resize(pl); // TODO: Skip corrupt? // if ((patchBitSize + pgw) > 64 && !skipCorrupt) { if ((patchBitSize + pgw) > 64) { @@ -330,7 +330,7 @@ namespace orc { "(patchBitSize + pgw > 64)!"); } uint32_t cfb = getClosestFixedBits(patchBitSize + pgw); - readLongs(unpackedPatch.data(), 0, pl, cfb); + readLongs(unpackedPatch_.data(), 0, pl, cfb); // any remaining bits are thrown out resetReadLongs(); @@ -342,21 +342,21 @@ namespace orc { uint64_t patchIdx = 0; adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx); - for (uint64_t i = 0; i < runLength; ++i) { + for (uint64_t i = 0; i < runLength_; ++i) { if (static_cast<int64_t>(i) != gap) { // no patching required. add base to unpacked value to get final value - literals[i] += base; + literals_[i] += base; } else { // extract the patch value - int64_t patchedVal = literals[i] | (patch << bitSize); + int64_t patchedVal = literals_[i] | (patch << bitSize); // add base to patched value - literals[i] = base + patchedVal; + literals_[i] = base + patchedVal; // increment the patch to point to next entry in patch list ++patchIdx; - if (patchIdx < unpackedPatch.size()) { + if (patchIdx < unpackedPatch_.size()) { adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx); // next gap is relative to the current gap @@ -372,9 +372,9 @@ namespace orc { template <typename T> uint64_t RleDecoderV2::nextDelta(T* const data, uint64_t offset, uint64_t numValues, const char* const notNull) { - if (runRead == runLength) { + if (runRead_ == runLength_) { // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; + unsigned char fbo = (firstByte_ >> 1) & 0x1f; uint32_t bitSize; if (fbo != 0) { bitSize = decodeBitWidth(fbo); @@ -383,20 +383,20 @@ namespace orc { } // extract the run length - runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; - runLength |= readByte(); - ++runLength; // account for first value - runRead = 0; + runLength_ = static_cast<uint64_t>(firstByte_ & 0x01) << 8; + runLength_ |= readByte(); + ++runLength_; // account for first value + runRead_ = 0; int64_t prevValue; // read the first value stored as vint - if (isSigned) { + if (isSigned_) { prevValue = readVslong(); } else { prevValue = static_cast<int64_t>(readVulong()); } - literals[0] = prevValue; + literals_[0] = prevValue; // read the fixed delta value stored as vint (deltas can be negative even // if all number are positive) @@ -404,28 +404,28 @@ namespace orc { if (bitSize == 0) { // add fixed deltas to adjacent values - for (uint64_t i = 1; i < runLength; ++i) { - literals[i] = literals[i - 1] + deltaBase; + for (uint64_t i = 1; i < runLength_; ++i) { + literals_[i] = literals_[i - 1] + deltaBase; } } else { - prevValue = literals[1] = prevValue + deltaBase; - if (runLength < 2) { + prevValue = literals_[1] = prevValue + deltaBase; + if (runLength_ < 2) { std::stringstream ss; - ss << "Illegal run length for delta encoding: " << runLength; + ss << "Illegal run length for delta encoding: " << runLength_; throw ParseError(ss.str()); } // write the unpacked values, add it to previous value and store final // value to result buffer. if the delta base value is negative then it // is a decreasing sequence else an increasing sequence. // read deltas using the literals buffer. - readLongs(literals.data(), 2, runLength - 2, bitSize); + readLongs(literals_.data(), 2, runLength_ - 2, bitSize); if (deltaBase < 0) { - for (uint64_t i = 2; i < runLength; ++i) { - prevValue = literals[i] = prevValue - literals[i]; + for (uint64_t i = 2; i < runLength_; ++i) { + prevValue = literals_[i] = prevValue - literals_[i]; } } else { - for (uint64_t i = 2; i < runLength; ++i) { - prevValue = literals[i] = prevValue + literals[i]; + for (uint64_t i = 2; i < runLength_; ++i) { + prevValue = literals_[i] = prevValue + literals_[i]; } } } @@ -437,16 +437,16 @@ namespace orc { template <typename T> uint64_t RleDecoderV2::copyDataFromBuffer(T* data, uint64_t offset, uint64_t numValues, const char* notNull) { - uint64_t nRead = std::min(runLength - runRead, numValues); + uint64_t nRead = std::min(runLength_ - runRead_, numValues); if (notNull) { for (uint64_t i = offset; i < (offset + nRead); ++i) { if (notNull[i]) { - data[i] = static_cast<T>(literals[runRead++]); + data[i] = static_cast<T>(literals_[runRead_++]); } } } else { for (uint64_t i = offset; i < (offset + nRead); ++i) { - data[i] = static_cast<T>(literals[runRead++]); + data[i] = static_cast<T>(literals_[runRead_++]); } } return nRead; diff --git a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc index a75aeac2eb..1cda9ee91e 100644 --- a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc +++ b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc @@ -41,11 +41,11 @@ namespace orc { if (!reuseHist) { // histogram that store the encoded bit requirement for each values. // maximum number of bits that can encoded is 32 (refer FixedBitSizes) - memset(histgram, 0, FixedBitSizes::SIZE * sizeof(int32_t)); + memset(histgram_, 0, FixedBitSizes::SIZE * sizeof(int32_t)); // compute the histogram for (size_t i = offset; i < (offset + length); i++) { uint32_t idx = encodeBitWidth(findClosestNumBits(data[i])); - histgram[idx] += 1; + histgram_[idx] += 1; } } @@ -53,7 +53,7 @@ namespace orc { // return the bits required by pth percentile length for (int32_t i = HIST_LEN - 1; i >= 0; i--) { - perLen -= histgram[i]; + perLen -= histgram_[i]; if (perLen < 0) { return decodeBitWidth(static_cast<uint32_t>(i)); } @@ -64,13 +64,13 @@ namespace orc { RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, bool alignBitPacking) : RleEncoder(std::move(outStream), hasSigned), - alignedBitPacking(alignBitPacking), - prevDelta(0) { + alignedBitPacking_(alignBitPacking), + prevDelta_(0) { literals = new int64_t[MAX_LITERAL_SIZE]; - gapVsPatchList = new int64_t[MAX_LITERAL_SIZE]; - zigzagLiterals = hasSigned ? new int64_t[MAX_LITERAL_SIZE] : nullptr; - baseRedLiterals = new int64_t[MAX_LITERAL_SIZE]; - adjDeltas = new int64_t[MAX_LITERAL_SIZE]; + gapVsPatchList_ = new int64_t[MAX_LITERAL_SIZE]; + zigzagLiterals_ = hasSigned ? new int64_t[MAX_LITERAL_SIZE] : nullptr; + baseRedLiterals_ = new int64_t[MAX_LITERAL_SIZE]; + adjDeltas_ = new int64_t[MAX_LITERAL_SIZE]; } void RleEncoderV2::write(int64_t val) { @@ -80,39 +80,39 @@ namespace orc { } if (numLiterals == 1) { - prevDelta = val - literals[0]; + prevDelta_ = val - literals[0]; literals[numLiterals++] = val; if (val == literals[0]) { - fixedRunLength = 2; - variableRunLength = 0; + fixedRunLength_ = 2; + variableRunLength_ = 0; } else { - fixedRunLength = 0; - variableRunLength = 2; + fixedRunLength_ = 0; + variableRunLength_ = 2; } return; } int64_t currentDelta = val - literals[numLiterals - 1]; EncodingOption option = {}; - if (prevDelta == 0 && currentDelta == 0) { + if (prevDelta_ == 0 && currentDelta == 0) { // case 1: fixed delta run literals[numLiterals++] = val; - if (variableRunLength > 0) { + if (variableRunLength_ > 0) { // if variable run is non-zero then we are seeing repeating // values at the end of variable run in which case fixed Run // length is 2 - fixedRunLength = 2; + fixedRunLength_ = 2; } - fixedRunLength++; + fixedRunLength_++; // if fixed run met the minimum condition and if variable // run is non-zero then flush the variable run and shift the // tail fixed runs to start of the buffer - if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) { + if (fixedRunLength_ >= MIN_REPEAT && variableRunLength_ > 0) { numLiterals -= MIN_REPEAT; - variableRunLength -= (MIN_REPEAT - 1); + variableRunLength_ -= (MIN_REPEAT - 1); determineEncoding(option); writeValues(option); @@ -124,7 +124,7 @@ namespace orc { numLiterals = MIN_REPEAT; } - if (fixedRunLength == MAX_LITERAL_SIZE) { + if (fixedRunLength_ == MAX_LITERAL_SIZE) { option.encoding = DELTA; option.isFixedDelta = true; writeValues(option); @@ -137,8 +137,8 @@ namespace orc { // if fixed run length is non-zero and if it satisfies the // short repeat conditions then write the values as short repeats // else use delta encoding - if (fixedRunLength >= MIN_REPEAT) { - if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { + if (fixedRunLength_ >= MIN_REPEAT) { + if (fixedRunLength_ <= MAX_SHORT_REPEAT_LENGTH) { option.encoding = SHORT_REPEAT; } else { option.encoding = DELTA; @@ -149,20 +149,20 @@ namespace orc { // if fixed run length is <MIN_REPEAT and current value is // different from previous then treat it as variable run - if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) { - variableRunLength = fixedRunLength; - fixedRunLength = 0; + if (fixedRunLength_ > 0 && fixedRunLength_ < MIN_REPEAT && val != literals[numLiterals - 1]) { + variableRunLength_ = fixedRunLength_; + fixedRunLength_ = 0; } // after writing values re-initialize the variables if (numLiterals == 0) { initializeLiterals(val); } else { - prevDelta = val - literals[numLiterals - 1]; + prevDelta_ = val - literals[numLiterals - 1]; literals[numLiterals++] = val; - variableRunLength++; + variableRunLength_++; - if (variableRunLength == MAX_LITERAL_SIZE) { + if (variableRunLength_ == MAX_LITERAL_SIZE) { determineEncoding(option); writeValues(option); } @@ -172,7 +172,7 @@ namespace orc { void RleEncoderV2::computeZigZagLiterals(EncodingOption& option) { assert(isSigned); for (size_t i = 0; i < numLiterals; i++) { - zigzagLiterals[option.zigzagLiteralsCount++] = zigZag(literals[i]); + zigzagLiterals_[option.zigzagLiteralsCount++] = zigZag(literals[i]); } } @@ -207,7 +207,7 @@ namespace orc { for (size_t i = 0; i < numLiterals; i++) { // if value is above mask then create the patch and record the gap - if (baseRedLiterals[i] > mask) { + if (baseRedLiterals_[i] > mask) { size_t gap = i - prev; if (gap > maxGap) { maxGap = gap; @@ -219,12 +219,12 @@ namespace orc { gapIdx++; // extract the most significant bits that are over mask bits - int64_t patch = baseRedLiterals[i] >> option.brBits95p; + int64_t patch = baseRedLiterals_[i] >> option.brBits95p; patchList.push_back(patch); patchIdx++; // strip off the MSB to enable safe bit packing - baseRedLiterals[i] &= mask; + baseRedLiterals_[i] &= mask; } } @@ -268,13 +268,13 @@ namespace orc { int64_t g = gapList[gapIdx++]; int64_t p = patchList[patchIdx++]; while (g > 255) { - gapVsPatchList[option.gapVsPatchListCount++] = (255L << option.patchWidth); + gapVsPatchList_[option.gapVsPatchListCount++] = (255L << option.patchWidth); i++; g -= 255; } // store patch value in LSBs and gap in MSBs - gapVsPatchList[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p); + gapVsPatchList_[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p); } } @@ -287,7 +287,7 @@ namespace orc { if (isSigned) { computeZigZagLiterals(option); } - int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals; + int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals_ : literals; option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0); return currentZigzagLiterals; } @@ -318,7 +318,7 @@ namespace orc { int64_t initialDelta = literals[1] - literals[0]; int64_t currDelta = 0; int64_t deltaMax = 0; - adjDeltas[option.adjDeltasCount++] = initialDelta; + adjDeltas_[option.adjDeltasCount++] = initialDelta; for (size_t i = 1; i < numLiterals; i++) { const int64_t l1 = literals[i]; @@ -332,8 +332,8 @@ namespace orc { option.isFixedDelta &= (currDelta == initialDelta); if (i > 1) { - adjDeltas[option.adjDeltasCount++] = std::abs(currDelta); - deltaMax = std::max(deltaMax, adjDeltas[i - 1]); + adjDeltas_[option.adjDeltasCount++] = std::abs(currDelta); + deltaMax = std::max(deltaMax, adjDeltas_[i - 1]); } } @@ -407,15 +407,15 @@ namespace orc { // patching is done only on base reduced values. // remove base from literals for (size_t i = 0; i < numLiterals; i++) { - baseRedLiterals[option.baseRedLiteralsCount++] = (literals[i] - option.min); + baseRedLiterals_[option.baseRedLiteralsCount++] = (literals[i] - option.min); } // 95th percentile width is used to determine max allowed value // after which patching will be done - option.brBits95p = percentileBits(baseRedLiterals, 0, numLiterals, 0.95); + option.brBits95p = percentileBits(baseRedLiterals_, 0, numLiterals, 0.95); // 100th percentile is used to compute the max patch width - option.brBits100p = percentileBits(baseRedLiterals, 0, numLiterals, 1.0, true); + option.brBits100p = percentileBits(baseRedLiterals_, 0, numLiterals, 1.0, true); // after base reducing the values, if the difference in bits between // 95th percentile and 100th percentile value is zero then there @@ -440,31 +440,8 @@ namespace orc { } uint64_t RleEncoderV2::flush() { - if (numLiterals != 0) { - EncodingOption option = {}; - if (variableRunLength != 0) { - determineEncoding(option); - writeValues(option); - } else if (fixedRunLength != 0) { - if (fixedRunLength < MIN_REPEAT) { - variableRunLength = fixedRunLength; - fixedRunLength = 0; - determineEncoding(option); - writeValues(option); - } else if (fixedRunLength >= MIN_REPEAT && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { - option.encoding = SHORT_REPEAT; - writeValues(option); - } else { - option.encoding = DELTA; - option.isFixedDelta = true; - writeValues(option); - } - } - } - - outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition)); + finishEncode(); uint64_t dataSize = outputStream->flush(); - bufferLength = bufferPosition = 0; return dataSize; } @@ -488,7 +465,7 @@ namespace orc { } numLiterals = 0; - prevDelta = 0; + prevDelta_ = 0; } } @@ -506,8 +483,8 @@ namespace orc { uint32_t header = getOpCode(SHORT_REPEAT); - fixedRunLength -= MIN_REPEAT; - header |= fixedRunLength; + fixedRunLength_ -= MIN_REPEAT; + header |= fixedRunLength_; header |= ((numBytesRepeatVal - 1) << 3); writeByte(static_cast<char>(header)); @@ -517,40 +494,40 @@ namespace orc { writeByte(static_cast<char>(b)); } - fixedRunLength = 0; + fixedRunLength_ = 0; } void RleEncoderV2::writeDirectValues(EncodingOption& option) { // write the number of fixed bits required in next 5 bits uint32_t fb = option.zzBits100p; - if (alignedBitPacking) { + if (alignedBitPacking_) { fb = getClosestAlignedFixedBits(fb); } const uint32_t efb = encodeBitWidth(fb) << 1; // adjust variable run length - variableRunLength -= 1; + variableRunLength_ -= 1; // extract the 9th bit of run length - const uint32_t tailBits = (variableRunLength & 0x100) >> 8; + const uint32_t tailBits = (variableRunLength_ & 0x100) >> 8; // create first byte of the header const char headerFirstByte = static_cast<char>(getOpCode(DIRECT) | efb | tailBits); // second byte of the header stores the remaining 8 bits of runlength - const char headerSecondByte = static_cast<char>(variableRunLength & 0xff); + const char headerSecondByte = static_cast<char>(variableRunLength_ & 0xff); // write header writeByte(headerFirstByte); writeByte(headerSecondByte); // bit packing the zigzag encoded literals - int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals; + int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals_ : literals; writeInts(currentZigzagLiterals, 0, numLiterals, fb); // reset run length - variableRunLength = 0; + variableRunLength_ = 0; } void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) { @@ -565,16 +542,16 @@ namespace orc { const uint32_t efb = encodeBitWidth(option.brBits95p) << 1; // adjust variable run length, they are one off - variableRunLength -= 1; + variableRunLength_ -= 1; // extract the 9th bit of run length - const uint32_t tailBits = (variableRunLength & 0x100) >> 8; + const uint32_t tailBits = (variableRunLength_ & 0x100) >> 8; // create first byte of the header const char headerFirstByte = static_cast<char>(getOpCode(PATCHED_BASE) | efb | tailBits); // second byte of the header stores the remaining 8 bits of runlength - const char headerSecondByte = static_cast<char>(variableRunLength & 0xff); + const char headerSecondByte = static_cast<char>(variableRunLength_ & 0xff); // if the min value is negative toggle the sign const bool isNegative = (option.min < 0); @@ -618,15 +595,15 @@ namespace orc { // base reduced literals are bit packed uint32_t closestFixedBits = getClosestFixedBits(option.brBits95p); - writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits); + writeInts(baseRedLiterals_, 0, numLiterals, closestFixedBits); // write patch list closestFixedBits = getClosestFixedBits(option.patchGapWidth + option.patchWidth); - writeInts(gapVsPatchList, 0, option.patchLength, closestFixedBits); + writeInts(gapVsPatchList_, 0, option.patchLength, closestFixedBits); // reset run length - variableRunLength = 0; + variableRunLength_ = 0; } void RleEncoderV2::writeDeltaValues(EncodingOption& option) { @@ -634,7 +611,7 @@ namespace orc { uint32_t fb = option.bitsDeltaMax; uint32_t efb = 0; - if (alignedBitPacking) { + if (alignedBitPacking_) { fb = getClosestAlignedFixedBits(fb); } @@ -642,14 +619,14 @@ namespace orc { // if fixed run length is greater than threshold then it will be fixed // delta sequence with delta value 0 else fixed delta sequence with // non-zero delta value - if (fixedRunLength > MIN_REPEAT) { + if (fixedRunLength_ > MIN_REPEAT) { // ex. sequence: 2 2 2 2 2 2 2 2 - len = fixedRunLength - 1; - fixedRunLength = 0; + len = fixedRunLength_ - 1; + fixedRunLength_ = 0; } else { // ex. sequence: 4 6 8 10 12 14 16 - len = variableRunLength - 1; - variableRunLength = 0; + len = variableRunLength_ - 1; + variableRunLength_ = 0; } } else { // fixed width 0 is used for long repeating values. @@ -658,8 +635,8 @@ namespace orc { fb = 2; } efb = encodeBitWidth(fb) << 1; - len = variableRunLength - 1; - variableRunLength = 0; + len = variableRunLength_ - 1; + variableRunLength_ = 0; } // extract the 9th bit of run length @@ -687,13 +664,13 @@ namespace orc { writeVslong(option.fixedDelta); } else { // store the first value as delta value using zigzag encoding - writeVslong(adjDeltas[0]); + writeVslong(adjDeltas_[0]); // adjacent delta values are bit packed. The length of adjDeltas array is // always one less than the number of literals (delta difference for n // elements is n-1). We have already written one element, write the // remaining numLiterals - 2 elements here - writeInts(adjDeltas, 1, numLiterals - 2, fb); + writeInts(adjDeltas_, 1, numLiterals - 2, fb); } } @@ -776,7 +753,33 @@ namespace orc { void RleEncoderV2::initializeLiterals(int64_t val) { literals[numLiterals++] = val; - fixedRunLength = 1; - variableRunLength = 1; + fixedRunLength_ = 1; + variableRunLength_ = 1; + } + + void RleEncoderV2::finishEncode() { + if (numLiterals != 0) { + EncodingOption option = {}; + if (variableRunLength_ != 0) { + determineEncoding(option); + writeValues(option); + } else if (fixedRunLength_ != 0) { + if (fixedRunLength_ < MIN_REPEAT) { + variableRunLength_ = fixedRunLength_; + fixedRunLength_ = 0; + determineEncoding(option); + writeValues(option); + } else if (fixedRunLength_ >= MIN_REPEAT && fixedRunLength_ <= MAX_SHORT_REPEAT_LENGTH) { + option.encoding = SHORT_REPEAT; + writeValues(option); + } else { + option.encoding = DELTA; + option.isFixedDelta = true; + writeValues(option); + } + } + } + + RleEncoder::finishEncode(); } } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc b/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc index b8c4fd4048..7cf3b5c512 100644 --- a/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc +++ b/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc @@ -21,20 +21,20 @@ namespace orc { - SchemaEvolution::SchemaEvolution(const std::shared_ptr<Type>& _readType, const Type* fileType) - : readType(_readType) { - if (readType) { - buildConversion(readType.get(), fileType); + SchemaEvolution::SchemaEvolution(const std::shared_ptr<Type>& readType, const Type* fileType) + : readType_(readType) { + if (readType_) { + buildConversion(readType_.get(), fileType); } else { for (uint64_t i = 0; i <= fileType->getMaximumColumnId(); ++i) { - safePPDConversionMap.insert(i); + safePPDConversionMap_.insert(i); } } } const Type* SchemaEvolution::getReadType(const Type& fileType) const { - auto ret = readTypeMap.find(fileType.getColumnId()); - return ret == readTypeMap.cend() ? &fileType : ret->second; + auto ret = readTypeMap_.find(fileType.getColumnId()); + return ret == readTypeMap_.cend() ? &fileType : ret->second; } inline void invalidConversion(const Type* readType, const Type* fileType) { @@ -80,7 +80,7 @@ namespace orc { if (readType.getKind() == fileType.getKind()) { ret.isValid = true; if (fileType.getKind() == CHAR || fileType.getKind() == VARCHAR) { - ret.isValid = readType.getMaximumLength() == fileType.getMaximumLength(); + ret.needConvert = readType.getMaximumLength() != fileType.getMaximumLength(); } else if (fileType.getKind() == DECIMAL) { ret.needConvert = readType.getPrecision() != fileType.getPrecision() || readType.getScale() != fileType.getScale(); @@ -99,12 +99,17 @@ namespace orc { break; } case DECIMAL: { - ret.isValid = ret.needConvert = isNumeric(readType); + ret.isValid = ret.needConvert = + isNumeric(readType) || isStringVariant(readType) || isTimestamp(readType); break; } case STRING: case CHAR: - case VARCHAR: + case VARCHAR: { + ret.isValid = ret.needConvert = isStringVariant(readType) || isNumeric(readType) || + isTimestamp(readType) || isDecimal(readType); + break; + } case TIMESTAMP: case TIMESTAMP_INSTANT: case DATE: @@ -126,22 +131,22 @@ namespace orc { return ret; } - void SchemaEvolution::buildConversion(const Type* _readType, const Type* fileType) { + void SchemaEvolution::buildConversion(const Type* readType, const Type* fileType) { if (fileType == nullptr) { - throw SchemaEvolutionError("File does not have " + _readType->toString()); + throw SchemaEvolutionError("File does not have " + readType->toString()); } - auto [valid, convert] = checkConversion(*_readType, *fileType); + auto [valid, convert] = checkConversion(*readType, *fileType); if (!valid) { - invalidConversion(_readType, fileType); + invalidConversion(readType, fileType); } - readTypeMap.emplace(_readType->getColumnId(), convert ? _readType : fileType); + readTypeMap_.emplace(readType->getColumnId(), convert ? readType : fileType); // check whether PPD conversion is safe - buildSafePPDConversionMap(_readType, fileType); + buildSafePPDConversionMap(readType, fileType); - for (uint64_t i = 0; i < _readType->getSubtypeCount(); ++i) { - auto subType = _readType->getSubtype(i); + for (uint64_t i = 0; i < readType->getSubtypeCount(); ++i) { + auto subType = readType->getSubtype(i); if (subType) { // null subType means that this is a sub column of map/list type // and it does not exist in the file. simply skip it. @@ -164,20 +169,20 @@ namespace orc { return kind != STRUCT && kind != MAP && kind != LIST && kind != UNION; } - void SchemaEvolution::buildSafePPDConversionMap(const Type* _readType, const Type* fileType) { - if (_readType == nullptr || !isPrimitive(_readType) || fileType == nullptr || + void SchemaEvolution::buildSafePPDConversionMap(const Type* readType, const Type* fileType) { + if (readType == nullptr || !isPrimitive(readType) || fileType == nullptr || !isPrimitive(fileType)) { return; } bool isSafe = false; - if (_readType == fileType) { + if (readType == fileType) { // short cut for same type isSafe = true; - } else if (_readType->getKind() == DECIMAL && fileType->getKind() == DECIMAL) { + } else if (readType->getKind() == DECIMAL && fileType->getKind() == DECIMAL) { // for decimals alone do equality check to not mess up with precision change - if (fileType->getPrecision() == readType->getPrecision() && - fileType->getScale() == readType->getScale()) { + if (fileType->getPrecision() == readType_->getPrecision() && + fileType->getScale() == readType_->getScale()) { isSafe = true; } } else { @@ -195,32 +200,32 @@ namespace orc { // as ORC stores char with padded spaces in its internal index. switch (fileType->getKind()) { case BYTE: { - if (readType->getKind() == SHORT || readType->getKind() == INT || - readType->getKind() == LONG) { + if (readType_->getKind() == SHORT || readType_->getKind() == INT || + readType_->getKind() == LONG) { isSafe = true; } break; } case SHORT: { - if (readType->getKind() == INT || readType->getKind() == LONG) { + if (readType_->getKind() == INT || readType_->getKind() == LONG) { isSafe = true; } break; } case INT: { - if (readType->getKind() == LONG) { + if (readType_->getKind() == LONG) { isSafe = true; } break; } case STRING: { - if (readType->getKind() == VARCHAR) { + if (readType_->getKind() == VARCHAR) { isSafe = true; } break; } case VARCHAR: { - if (readType->getKind() == STRING) { + if (readType_->getKind() == STRING) { isSafe = true; } break; @@ -244,12 +249,12 @@ namespace orc { } if (isSafe) { - safePPDConversionMap.insert(fileType->getColumnId()); + safePPDConversionMap_.insert(fileType->getColumnId()); } } bool SchemaEvolution::isSafePPDConversion(uint64_t columnId) const { - return safePPDConversionMap.find(columnId) != safePPDConversionMap.cend(); + return safePPDConversionMap_.find(columnId) != safePPDConversionMap_.cend(); } } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/SchemaEvolution.hh b/contrib/libs/apache/orc/c++/src/SchemaEvolution.hh index ef9020eba4..c3deff7236 100644 --- a/contrib/libs/apache/orc/c++/src/SchemaEvolution.hh +++ b/contrib/libs/apache/orc/c++/src/SchemaEvolution.hh @@ -46,7 +46,7 @@ namespace orc { // return selected read type const Type* getReadType() const { - return readType.get(); + return readType_.get(); } private: @@ -54,9 +54,9 @@ namespace orc { void buildSafePPDConversionMap(const Type* readType, const Type* fileType); private: - const std::shared_ptr<Type> readType; - std::unordered_map<uint64_t, const Type*> readTypeMap; - std::unordered_set<uint64_t> safePPDConversionMap; + const std::shared_ptr<Type> readType_; + std::unordered_map<uint64_t, const Type*> readTypeMap_; + std::unordered_set<uint64_t> safePPDConversionMap_; }; } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/Statistics.cc b/contrib/libs/apache/orc/c++/src/Statistics.cc index 8ed29d0e7c..76fd736b27 100644 --- a/contrib/libs/apache/orc/c++/src/Statistics.cc +++ b/contrib/libs/apache/orc/c++/src/Statistics.cc @@ -52,18 +52,18 @@ namespace orc { StatisticsImpl::StatisticsImpl(const proto::StripeStatistics& stripeStats, const StatContext& statContext) { for (int i = 0; i < stripeStats.col_stats_size(); i++) { - colStats.push_back(convertColumnStatistics(stripeStats.col_stats(i), statContext)); + colStats_.push_back(convertColumnStatistics(stripeStats.col_stats(i), statContext)); } } StatisticsImpl::StatisticsImpl(const proto::Footer& footer, const StatContext& statContext) { for (int i = 0; i < footer.statistics_size(); i++) { - colStats.push_back(convertColumnStatistics(footer.statistics(i), statContext)); + colStats_.push_back(convertColumnStatistics(footer.statistics(i), statContext)); } } StatisticsImpl::~StatisticsImpl() { - for (std::vector<ColumnStatistics*>::iterator ptr = colStats.begin(); ptr != colStats.end(); + for (std::vector<ColumnStatistics*>::iterator ptr = colStats_.begin(); ptr != colStats_.end(); ++ptr) { delete *ptr; } @@ -85,11 +85,11 @@ namespace orc { const proto::StripeStatistics& stripeStats, std::vector<std::vector<proto::ColumnStatistics> >& indexStats, const StatContext& statContext) { - columnStats = std::make_unique<StatisticsImpl>(stripeStats, statContext); - rowIndexStats.resize(indexStats.size()); - for (size_t i = 0; i < rowIndexStats.size(); i++) { + columnStats_ = std::make_unique<StatisticsImpl>(stripeStats, statContext); + rowIndexStats_.resize(indexStats.size()); + for (size_t i = 0; i < rowIndexStats_.size(); i++) { for (size_t j = 0; j < indexStats[i].size(); j++) { - rowIndexStats[i].push_back(std::shared_ptr<const ColumnStatistics>( + rowIndexStats_[i].push_back(std::shared_ptr<const ColumnStatistics>( convertColumnStatistics(indexStats[i][j], statContext))); } } @@ -180,205 +180,205 @@ namespace orc { } ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& pb) { - _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + stats_.setNumberOfValues(pb.number_of_values()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); } BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { - _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + stats_.setNumberOfValues(pb.number_of_values()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (pb.has_binary_statistics() && statContext.correctStats) { - _stats.setHasTotalLength(pb.binary_statistics().has_sum()); - _stats.setTotalLength(static_cast<uint64_t>(pb.binary_statistics().sum())); + stats_.setHasTotalLength(pb.binary_statistics().has_sum()); + stats_.setTotalLength(static_cast<uint64_t>(pb.binary_statistics().sum())); } } BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { - _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + stats_.setNumberOfValues(pb.number_of_values()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (pb.has_bucket_statistics() && statContext.correctStats) { - _hasCount = true; - _trueCount = pb.bucket_statistics().count(0); + hasCount_ = true; + trueCount_ = pb.bucket_statistics().count(0); } else { - _hasCount = false; - _trueCount = 0; + hasCount_ = false; + trueCount_ = 0; } } DateColumnStatisticsImpl::DateColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { - _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + stats_.setNumberOfValues(pb.number_of_values()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_date_statistics() || !statContext.correctStats) { // hasMinimum_ is false by default; // hasMaximum_ is false by default; - _stats.setMinimum(0); - _stats.setMaximum(0); + stats_.setMinimum(0); + stats_.setMaximum(0); } else { - _stats.setHasMinimum(pb.date_statistics().has_minimum()); - _stats.setHasMaximum(pb.date_statistics().has_maximum()); - _stats.setMinimum(pb.date_statistics().minimum()); - _stats.setMaximum(pb.date_statistics().maximum()); + stats_.setHasMinimum(pb.date_statistics().has_minimum()); + stats_.setHasMaximum(pb.date_statistics().has_maximum()); + stats_.setMinimum(pb.date_statistics().minimum()); + stats_.setMaximum(pb.date_statistics().maximum()); } } DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { - _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + stats_.setNumberOfValues(pb.number_of_values()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (pb.has_decimal_statistics() && statContext.correctStats) { const proto::DecimalStatistics& stats = pb.decimal_statistics(); - _stats.setHasMinimum(stats.has_minimum()); - _stats.setHasMaximum(stats.has_maximum()); - _stats.setHasSum(stats.has_sum()); + stats_.setHasMinimum(stats.has_minimum()); + stats_.setHasMaximum(stats.has_maximum()); + stats_.setHasSum(stats.has_sum()); - _stats.setMinimum(Decimal(stats.minimum())); - _stats.setMaximum(Decimal(stats.maximum())); - _stats.setSum(Decimal(stats.sum())); + stats_.setMinimum(Decimal(stats.minimum())); + stats_.setMaximum(Decimal(stats.maximum())); + stats_.setSum(Decimal(stats.sum())); } } DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl(const proto::ColumnStatistics& pb) { - _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + stats_.setNumberOfValues(pb.number_of_values()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_double_statistics()) { - _stats.setMinimum(0); - _stats.setMaximum(0); - _stats.setSum(0); + stats_.setMinimum(0); + stats_.setMaximum(0); + stats_.setSum(0); } else { const proto::DoubleStatistics& stats = pb.double_statistics(); - _stats.setHasMinimum(stats.has_minimum()); - _stats.setHasMaximum(stats.has_maximum()); - _stats.setHasSum(stats.has_sum()); + stats_.setHasMinimum(stats.has_minimum()); + stats_.setHasMaximum(stats.has_maximum()); + stats_.setHasSum(stats.has_sum()); - _stats.setMinimum(stats.minimum()); - _stats.setMaximum(stats.maximum()); - _stats.setSum(stats.sum()); + stats_.setMinimum(stats.minimum()); + stats_.setMaximum(stats.maximum()); + stats_.setSum(stats.sum()); } } IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl(const proto::ColumnStatistics& pb) { - _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + stats_.setNumberOfValues(pb.number_of_values()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_int_statistics()) { - _stats.setMinimum(0); - _stats.setMaximum(0); - _stats.setSum(0); + stats_.setMinimum(0); + stats_.setMaximum(0); + stats_.setSum(0); } else { const proto::IntegerStatistics& stats = pb.int_statistics(); - _stats.setHasMinimum(stats.has_minimum()); - _stats.setHasMaximum(stats.has_maximum()); - _stats.setHasSum(stats.has_sum()); + stats_.setHasMinimum(stats.has_minimum()); + stats_.setHasMaximum(stats.has_maximum()); + stats_.setHasSum(stats.has_sum()); - _stats.setMinimum(stats.minimum()); - _stats.setMaximum(stats.maximum()); - _stats.setSum(stats.sum()); + stats_.setMinimum(stats.minimum()); + stats_.setMaximum(stats.maximum()); + stats_.setSum(stats.sum()); } } StringColumnStatisticsImpl::StringColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { - _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + stats_.setNumberOfValues(pb.number_of_values()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_string_statistics() || !statContext.correctStats) { - _stats.setTotalLength(0); + stats_.setTotalLength(0); } else { const proto::StringStatistics& stats = pb.string_statistics(); - _stats.setHasMinimum(stats.has_minimum()); - _stats.setHasMaximum(stats.has_maximum()); - _stats.setHasTotalLength(stats.has_sum()); + stats_.setHasMinimum(stats.has_minimum()); + stats_.setHasMaximum(stats.has_maximum()); + stats_.setHasTotalLength(stats.has_sum()); - _stats.setMinimum(stats.minimum()); - _stats.setMaximum(stats.maximum()); - _stats.setTotalLength(static_cast<uint64_t>(stats.sum())); + stats_.setMinimum(stats.minimum()); + stats_.setMaximum(stats.maximum()); + stats_.setTotalLength(static_cast<uint64_t>(stats.sum())); } } TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { - _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + stats_.setNumberOfValues(pb.number_of_values()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_timestamp_statistics() || !statContext.correctStats) { - _stats.setMinimum(0); - _stats.setMaximum(0); - _lowerBound = 0; - _upperBound = 0; - _minimumNanos = DEFAULT_MIN_NANOS; - _maximumNanos = DEFAULT_MAX_NANOS; + stats_.setMinimum(0); + stats_.setMaximum(0); + lowerBound_ = 0; + upperBound_ = 0; + minimumNanos_ = DEFAULT_MIN_NANOS; + maximumNanos_ = DEFAULT_MAX_NANOS; } else { const proto::TimestampStatistics& stats = pb.timestamp_statistics(); - _stats.setHasMinimum(stats.has_minimum_utc() || + stats_.setHasMinimum(stats.has_minimum_utc() || (stats.has_minimum() && (statContext.writerTimezone != nullptr))); - _stats.setHasMaximum(stats.has_maximum_utc() || + stats_.setHasMaximum(stats.has_maximum_utc() || (stats.has_maximum() && (statContext.writerTimezone != nullptr))); - _hasLowerBound = stats.has_minimum_utc() || stats.has_minimum(); - _hasUpperBound = stats.has_maximum_utc() || stats.has_maximum(); + hasLowerBound_ = stats.has_minimum_utc() || stats.has_minimum(); + hasUpperBound_ = stats.has_maximum_utc() || stats.has_maximum(); // to be consistent with java side, non-default minimum_nanos and maximum_nanos // are added by one in their serialized form. - _minimumNanos = stats.has_minimum_nanos() ? stats.minimum_nanos() - 1 : DEFAULT_MIN_NANOS; - _maximumNanos = stats.has_maximum_nanos() ? stats.maximum_nanos() - 1 : DEFAULT_MAX_NANOS; + minimumNanos_ = stats.has_minimum_nanos() ? stats.minimum_nanos() - 1 : DEFAULT_MIN_NANOS; + maximumNanos_ = stats.has_maximum_nanos() ? stats.maximum_nanos() - 1 : DEFAULT_MAX_NANOS; // Timestamp stats are stored in milliseconds if (stats.has_minimum_utc()) { int64_t minimum = stats.minimum_utc(); - _stats.setMinimum(minimum); - _lowerBound = minimum; + stats_.setMinimum(minimum); + lowerBound_ = minimum; } else if (statContext.writerTimezone) { int64_t writerTimeSec = stats.minimum() / 1000; // multiply the offset by 1000 to convert to millisecond int64_t minimum = stats.minimum() + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000; - _stats.setMinimum(minimum); - _lowerBound = minimum; + stats_.setMinimum(minimum); + lowerBound_ = minimum; } else { - _stats.setMinimum(0); + stats_.setMinimum(0); // subtract 1 day 1 hour (25 hours) in milliseconds to handle unknown // TZ and daylight savings - _lowerBound = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000); + lowerBound_ = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000); } // Timestamp stats are stored in milliseconds if (stats.has_maximum_utc()) { int64_t maximum = stats.maximum_utc(); - _stats.setMaximum(maximum); - _upperBound = maximum; + stats_.setMaximum(maximum); + upperBound_ = maximum; } else if (statContext.writerTimezone) { int64_t writerTimeSec = stats.maximum() / 1000; // multiply the offset by 1000 to convert to millisecond int64_t maximum = stats.maximum() + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000; - _stats.setMaximum(maximum); - _upperBound = maximum; + stats_.setMaximum(maximum); + upperBound_ = maximum; } else { - _stats.setMaximum(0); + stats_.setMaximum(0); // add 1 day 1 hour (25 hours) in milliseconds to handle unknown // TZ and daylight savings - _upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000); + upperBound_ = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000); } // Add 1 millisecond to account for microsecond precision of values - _upperBound += 1; + upperBound_ += 1; } } CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl( const proto::ColumnStatistics& pb) { - _stats.setNumberOfValues(pb.number_of_values()); - _stats.setHasNull(pb.has_null()); + stats_.setNumberOfValues(pb.number_of_values()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_collection_statistics()) { - _stats.setMinimum(0); - _stats.setMaximum(0); - _stats.setSum(0); + stats_.setMinimum(0); + stats_.setMaximum(0); + stats_.setSum(0); } else { const proto::CollectionStatistics& stats = pb.collection_statistics(); - _stats.setHasMinimum(stats.has_min_children()); - _stats.setHasMaximum(stats.has_max_children()); - _stats.setHasSum(stats.has_total_children()); + stats_.setHasMinimum(stats.has_min_children()); + stats_.setHasMaximum(stats.has_max_children()); + stats_.setHasSum(stats.has_total_children()); - _stats.setMinimum(stats.min_children()); - _stats.setMaximum(stats.max_children()); - _stats.setSum(stats.total_children()); + stats_.setMinimum(stats.min_children()); + stats_.setMaximum(stats.max_children()); + stats_.setSum(stats.total_children()); } } diff --git a/contrib/libs/apache/orc/c++/src/Statistics.hh b/contrib/libs/apache/orc/c++/src/Statistics.hh index e585bf971c..6f212c15cc 100644 --- a/contrib/libs/apache/orc/c++/src/Statistics.hh +++ b/contrib/libs/apache/orc/c++/src/Statistics.hh @@ -48,160 +48,160 @@ namespace orc { template <typename T> class InternalStatisticsImpl { private: - bool _hasNull; - bool _hasMinimum; - bool _hasMaximum; - bool _hasSum; - bool _hasTotalLength; - uint64_t _totalLength; - uint64_t _valueCount; - T _minimum; - T _maximum; - T _sum; + bool hasNull_; + bool hasMinimum_; + bool hasMaximum_; + bool hasSum_; + bool hasTotalLength_; + uint64_t totalLength_; + uint64_t valueCount_; + T minimum_; + T maximum_; + T sum_; public: InternalStatisticsImpl() { - _hasNull = false; - _hasMinimum = false; - _hasMaximum = false; - _hasSum = false; - _hasTotalLength = false; - _totalLength = 0; - _valueCount = 0; + hasNull_ = false; + hasMinimum_ = false; + hasMaximum_ = false; + hasSum_ = false; + hasTotalLength_ = false; + totalLength_ = 0; + valueCount_ = 0; } ~InternalStatisticsImpl() {} // GET / SET _totalLength bool hasTotalLength() const { - return _hasTotalLength; + return hasTotalLength_; } void setHasTotalLength(bool hasTotalLength) { - _hasTotalLength = hasTotalLength; + hasTotalLength_ = hasTotalLength; } uint64_t getTotalLength() const { - return _totalLength; + return totalLength_; } void setTotalLength(uint64_t totalLength) { - _totalLength = totalLength; + totalLength_ = totalLength; } // GET / SET _sum bool hasSum() const { - return _hasSum; + return hasSum_; } void setHasSum(bool hasSum) { - _hasSum = hasSum; + hasSum_ = hasSum; } T getSum() const { - return _sum; + return sum_; } void setSum(T sum) { - _sum = sum; + sum_ = sum; } // GET / SET _maximum bool hasMaximum() const { - return _hasMaximum; + return hasMaximum_; } const T& getMaximum() const { - return _maximum; + return maximum_; } void setHasMaximum(bool hasMax) { - _hasMaximum = hasMax; + hasMaximum_ = hasMax; } void setMaximum(T max) { - _maximum = max; + maximum_ = max; } // GET / SET _minimum bool hasMinimum() const { - return _hasMinimum; + return hasMinimum_; } void setHasMinimum(bool hasMin) { - _hasMinimum = hasMin; + hasMinimum_ = hasMin; } const T& getMinimum() const { - return _minimum; + return minimum_; } void setMinimum(T min) { - _minimum = min; + minimum_ = min; } // GET / SET _valueCount uint64_t getNumberOfValues() const { - return _valueCount; + return valueCount_; } void setNumberOfValues(uint64_t numValues) { - _valueCount = numValues; + valueCount_ = numValues; } // GET / SET _hasNullValue bool hasNull() const { - return _hasNull; + return hasNull_; } void setHasNull(bool hasNull) { - _hasNull = hasNull; + hasNull_ = hasNull; } void reset() { - _hasNull = false; - _hasMinimum = false; - _hasMaximum = false; - _hasSum = false; - _hasTotalLength = false; - _totalLength = 0; - _valueCount = 0; + hasNull_ = false; + hasMinimum_ = false; + hasMaximum_ = false; + hasSum_ = false; + hasTotalLength_ = false; + totalLength_ = 0; + valueCount_ = 0; } void updateMinMax(T value) { - if (!_hasMinimum) { - _hasMinimum = _hasMaximum = true; - _minimum = _maximum = value; - } else if (compare(value, _minimum)) { - _minimum = value; - } else if (compare(_maximum, value)) { - _maximum = value; + if (!hasMinimum_) { + hasMinimum_ = hasMaximum_ = true; + minimum_ = maximum_ = value; + } else if (compare(value, minimum_)) { + minimum_ = value; + } else if (compare(maximum_, value)) { + maximum_ = value; } } // sum is not merged here as we need to check overflow void merge(const InternalStatisticsImpl& other) { - _hasNull = _hasNull || other._hasNull; - _valueCount += other._valueCount; - - if (other._hasMinimum) { - if (!_hasMinimum) { - _hasMinimum = _hasMaximum = true; - _minimum = other._minimum; - _maximum = other._maximum; + hasNull_ = hasNull_ || other.hasNull_; + valueCount_ += other.valueCount_; + + if (other.hasMinimum_) { + if (!hasMinimum_) { + hasMinimum_ = hasMaximum_ = true; + minimum_ = other.minimum_; + maximum_ = other.maximum_; } else { // all template types should support operator< - if (compare(_maximum, other._maximum)) { - _maximum = other._maximum; + if (compare(maximum_, other.maximum_)) { + maximum_ = other.maximum_; } - if (compare(other._minimum, _minimum)) { - _minimum = other._minimum; + if (compare(other.minimum_, minimum_)) { + minimum_ = other.minimum_; } } } - _hasTotalLength = _hasTotalLength && other._hasTotalLength; - _totalLength += other._totalLength; + hasTotalLength_ = hasTotalLength_ && other.hasTotalLength_; + totalLength_ += other.totalLength_; } }; @@ -240,7 +240,7 @@ namespace orc { class ColumnStatisticsImpl : public ColumnStatistics, public MutableColumnStatistics { private: - InternalCharStatistics _stats; + InternalCharStatistics stats_; public: ColumnStatisticsImpl() { @@ -250,36 +250,36 @@ namespace orc { virtual ~ColumnStatisticsImpl() override; uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); + return stats_.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); + stats_.setNumberOfValues(value); } void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + stats_.setNumberOfValues(stats_.getNumberOfValues() + count); } bool hasNull() const override { - return _stats.hasNull(); + return stats_.hasNull(); } void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); + stats_.setHasNull(hasNull); } void merge(const MutableColumnStatistics& other) override { - _stats.merge(dynamic_cast<const ColumnStatisticsImpl&>(other)._stats); + stats_.merge(dynamic_cast<const ColumnStatisticsImpl&>(other).stats_); } void reset() override { - _stats.reset(); + stats_.reset(); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_has_null(_stats.hasNull()); - pbStats.set_number_of_values(_stats.getNumberOfValues()); + pbStats.set_has_null(stats_.hasNull()); + pbStats.set_number_of_values(stats_.getNumberOfValues()); } std::string toString() const override { @@ -292,7 +292,7 @@ namespace orc { class BinaryColumnStatisticsImpl : public BinaryColumnStatistics, public MutableColumnStatistics { private: - InternalCharStatistics _stats; + InternalCharStatistics stats_; public: BinaryColumnStatisticsImpl() { @@ -303,63 +303,63 @@ namespace orc { virtual ~BinaryColumnStatisticsImpl() override; uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); + return stats_.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); + stats_.setNumberOfValues(value); } void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + stats_.setNumberOfValues(stats_.getNumberOfValues() + count); } bool hasNull() const override { - return _stats.hasNull(); + return stats_.hasNull(); } void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); + stats_.setHasNull(hasNull); } bool hasTotalLength() const override { - return _stats.hasTotalLength(); + return stats_.hasTotalLength(); } uint64_t getTotalLength() const override { if (hasTotalLength()) { - return _stats.getTotalLength(); + return stats_.getTotalLength(); } else { throw ParseError("Total length is not defined."); } } void setTotalLength(uint64_t length) { - _stats.setHasTotalLength(true); - _stats.setTotalLength(length); + stats_.setHasTotalLength(true); + stats_.setTotalLength(length); } void update(size_t length) { - _stats.setTotalLength(_stats.getTotalLength() + length); + stats_.setTotalLength(stats_.getTotalLength() + length); } void merge(const MutableColumnStatistics& other) override { const BinaryColumnStatisticsImpl& binStats = dynamic_cast<const BinaryColumnStatisticsImpl&>(other); - _stats.merge(binStats._stats); + stats_.merge(binStats.stats_); } void reset() override { - _stats.reset(); + stats_.reset(); setTotalLength(0); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_has_null(_stats.hasNull()); - pbStats.set_number_of_values(_stats.getNumberOfValues()); + pbStats.set_has_null(stats_.hasNull()); + pbStats.set_number_of_values(stats_.getNumberOfValues()); proto::BinaryStatistics* binStats = pbStats.mutable_binary_statistics(); - binStats->set_sum(static_cast<int64_t>(_stats.getTotalLength())); + binStats->set_sum(static_cast<int64_t>(stats_.getTotalLength())); } std::string toString() const override { @@ -379,9 +379,9 @@ namespace orc { class BooleanColumnStatisticsImpl : public BooleanColumnStatistics, public MutableColumnStatistics { private: - InternalBooleanStatistics _stats; - bool _hasCount; - uint64_t _trueCount; + InternalBooleanStatistics stats_; + bool hasCount_; + uint64_t trueCount_; public: BooleanColumnStatisticsImpl() { @@ -392,33 +392,33 @@ namespace orc { virtual ~BooleanColumnStatisticsImpl() override; bool hasCount() const override { - return _hasCount; + return hasCount_; } void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - _hasCount = true; + stats_.setNumberOfValues(stats_.getNumberOfValues() + count); + hasCount_ = true; } uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); + return stats_.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); + stats_.setNumberOfValues(value); } bool hasNull() const override { - return _stats.hasNull(); + return stats_.hasNull(); } void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); + stats_.setHasNull(hasNull); } uint64_t getFalseCount() const override { if (hasCount()) { - return getNumberOfValues() - _trueCount; + return getNumberOfValues() - trueCount_; } else { throw ParseError("False count is not defined."); } @@ -426,43 +426,43 @@ namespace orc { uint64_t getTrueCount() const override { if (hasCount()) { - return _trueCount; + return trueCount_; } else { throw ParseError("True count is not defined."); } } void setTrueCount(uint64_t trueCount) { - _hasCount = true; - _trueCount = trueCount; + hasCount_ = true; + trueCount_ = trueCount; } void update(bool value, size_t repetitions) { if (value) { - _trueCount += repetitions; + trueCount_ += repetitions; } } void merge(const MutableColumnStatistics& other) override { const BooleanColumnStatisticsImpl& boolStats = dynamic_cast<const BooleanColumnStatisticsImpl&>(other); - _stats.merge(boolStats._stats); - _hasCount = _hasCount && boolStats._hasCount; - _trueCount += boolStats._trueCount; + stats_.merge(boolStats.stats_); + hasCount_ = hasCount_ && boolStats.hasCount_; + trueCount_ += boolStats.trueCount_; } void reset() override { - _stats.reset(); + stats_.reset(); setTrueCount(0); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_has_null(_stats.hasNull()); - pbStats.set_number_of_values(_stats.getNumberOfValues()); + pbStats.set_has_null(stats_.hasNull()); + pbStats.set_number_of_values(stats_.getNumberOfValues()); proto::BucketStatistics* bucketStats = pbStats.mutable_bucket_statistics(); - if (_hasCount) { - bucketStats->add_count(_trueCount); + if (hasCount_) { + bucketStats->add_count(trueCount_); } else { bucketStats->clear_count(); } @@ -485,7 +485,7 @@ namespace orc { class DateColumnStatisticsImpl : public DateColumnStatistics, public MutableColumnStatistics { private: - InternalDateStatistics _stats; + InternalDateStatistics stats_; public: DateColumnStatisticsImpl() { @@ -495,36 +495,36 @@ namespace orc { virtual ~DateColumnStatisticsImpl() override; bool hasMinimum() const override { - return _stats.hasMinimum(); + return stats_.hasMinimum(); } bool hasMaximum() const override { - return _stats.hasMaximum(); + return stats_.hasMaximum(); } void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + stats_.setNumberOfValues(stats_.getNumberOfValues() + count); } uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); + return stats_.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); + stats_.setNumberOfValues(value); } bool hasNull() const override { - return _stats.hasNull(); + return stats_.hasNull(); } void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); + stats_.setHasNull(hasNull); } int32_t getMinimum() const override { if (hasMinimum()) { - return _stats.getMinimum(); + return stats_.getMinimum(); } else { throw ParseError("Minimum is not defined."); } @@ -532,44 +532,44 @@ namespace orc { int32_t getMaximum() const override { if (hasMaximum()) { - return _stats.getMaximum(); + return stats_.getMaximum(); } else { throw ParseError("Maximum is not defined."); } } void setMinimum(int32_t minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); + stats_.setHasMinimum(true); + stats_.setMinimum(minimum); } void setMaximum(int32_t maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); + stats_.setHasMaximum(true); + stats_.setMaximum(maximum); } void update(int32_t value) { - _stats.updateMinMax(value); + stats_.updateMinMax(value); } void merge(const MutableColumnStatistics& other) override { const DateColumnStatisticsImpl& dateStats = dynamic_cast<const DateColumnStatisticsImpl&>(other); - _stats.merge(dateStats._stats); + stats_.merge(dateStats.stats_); } void reset() override { - _stats.reset(); + stats_.reset(); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_has_null(_stats.hasNull()); - pbStats.set_number_of_values(_stats.getNumberOfValues()); + pbStats.set_has_null(stats_.hasNull()); + pbStats.set_number_of_values(stats_.getNumberOfValues()); proto::DateStatistics* dateStatistics = pbStats.mutable_date_statistics(); - if (_stats.hasMinimum()) { - dateStatistics->set_maximum(_stats.getMaximum()); - dateStatistics->set_minimum(_stats.getMinimum()); + if (stats_.hasMinimum()) { + dateStatistics->set_maximum(stats_.getMaximum()); + dateStatistics->set_minimum(stats_.getMinimum()); } else { dateStatistics->clear_minimum(); dateStatistics->clear_maximum(); @@ -599,7 +599,7 @@ namespace orc { class DecimalColumnStatisticsImpl : public DecimalColumnStatistics, public MutableColumnStatistics { private: - InternalDecimalStatistics _stats; + InternalDecimalStatistics stats_; public: DecimalColumnStatisticsImpl() { @@ -610,40 +610,40 @@ namespace orc { virtual ~DecimalColumnStatisticsImpl() override; bool hasMinimum() const override { - return _stats.hasMinimum(); + return stats_.hasMinimum(); } bool hasMaximum() const override { - return _stats.hasMaximum(); + return stats_.hasMaximum(); } bool hasSum() const override { - return _stats.hasSum(); + return stats_.hasSum(); } void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + stats_.setNumberOfValues(stats_.getNumberOfValues() + count); } uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); + return stats_.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); + stats_.setNumberOfValues(value); } bool hasNull() const override { - return _stats.hasNull(); + return stats_.hasNull(); } void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); + stats_.setHasNull(hasNull); } Decimal getMinimum() const override { if (hasMinimum()) { - return _stats.getMinimum(); + return stats_.getMinimum(); } else { throw ParseError("Minimum is not defined."); } @@ -651,39 +651,39 @@ namespace orc { Decimal getMaximum() const override { if (hasMaximum()) { - return _stats.getMaximum(); + return stats_.getMaximum(); } else { throw ParseError("Maximum is not defined."); } } void setMinimum(Decimal minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); + stats_.setHasMinimum(true); + stats_.setMinimum(minimum); } void setMaximum(Decimal maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); + stats_.setHasMaximum(true); + stats_.setMaximum(maximum); } Decimal getSum() const override { if (hasSum()) { - return _stats.getSum(); + return stats_.getSum(); } else { throw ParseError("Sum is not defined."); } } void setSum(Decimal sum) { - _stats.setHasSum(true); - _stats.setSum(sum); + stats_.setHasSum(true); + stats_.setSum(sum); } void update(const Decimal& value) { - _stats.updateMinMax(value); + stats_.updateMinMax(value); - if (_stats.hasSum()) { + if (stats_.hasSum()) { updateSum(value); } } @@ -692,33 +692,33 @@ namespace orc { const DecimalColumnStatisticsImpl& decStats = dynamic_cast<const DecimalColumnStatisticsImpl&>(other); - _stats.merge(decStats._stats); + stats_.merge(decStats.stats_); - _stats.setHasSum(_stats.hasSum() && decStats.hasSum()); - if (_stats.hasSum()) { + stats_.setHasSum(stats_.hasSum() && decStats.hasSum()); + if (stats_.hasSum()) { updateSum(decStats.getSum()); } } void reset() override { - _stats.reset(); + stats_.reset(); setSum(Decimal()); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_has_null(_stats.hasNull()); - pbStats.set_number_of_values(_stats.getNumberOfValues()); + pbStats.set_has_null(stats_.hasNull()); + pbStats.set_number_of_values(stats_.getNumberOfValues()); proto::DecimalStatistics* decStats = pbStats.mutable_decimal_statistics(); - if (_stats.hasMinimum()) { - decStats->set_minimum(_stats.getMinimum().toString(true)); - decStats->set_maximum(_stats.getMaximum().toString(true)); + if (stats_.hasMinimum()) { + decStats->set_minimum(stats_.getMinimum().toString(true)); + decStats->set_maximum(stats_.getMaximum().toString(true)); } else { decStats->clear_minimum(); decStats->clear_maximum(); } - if (_stats.hasSum()) { - decStats->set_sum(_stats.getSum().toString(true)); + if (stats_.hasSum()) { + decStats->set_sum(stats_.getSum().toString(true)); } else { decStats->clear_sum(); } @@ -752,9 +752,9 @@ namespace orc { private: void updateSum(Decimal value) { - if (_stats.hasSum()) { + if (stats_.hasSum()) { bool overflow = false; - Decimal sum = _stats.getSum(); + Decimal sum = stats_.getSum(); if (sum.scale > value.scale) { value.value = scaleUpInt128ByPowerOfTen(value.value, sum.scale - value.scale, overflow); } else if (sum.scale < value.scale) { @@ -766,14 +766,14 @@ namespace orc { bool wasPositive = sum.value >= 0; sum.value += value.value; if ((value.value >= 0) == wasPositive) { - _stats.setHasSum((sum.value >= 0) == wasPositive); + stats_.setHasSum((sum.value >= 0) == wasPositive); } } else { - _stats.setHasSum(false); + stats_.setHasSum(false); } - if (_stats.hasSum()) { - _stats.setSum(sum); + if (stats_.hasSum()) { + stats_.setSum(sum); } } } @@ -781,7 +781,7 @@ namespace orc { class DoubleColumnStatisticsImpl : public DoubleColumnStatistics, public MutableColumnStatistics { private: - InternalDoubleStatistics _stats; + InternalDoubleStatistics stats_; public: DoubleColumnStatisticsImpl() { @@ -791,40 +791,40 @@ namespace orc { virtual ~DoubleColumnStatisticsImpl() override; bool hasMinimum() const override { - return _stats.hasMinimum(); + return stats_.hasMinimum(); } bool hasMaximum() const override { - return _stats.hasMaximum(); + return stats_.hasMaximum(); } bool hasSum() const override { - return _stats.hasSum(); + return stats_.hasSum(); } void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + stats_.setNumberOfValues(stats_.getNumberOfValues() + count); } uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); + return stats_.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); + stats_.setNumberOfValues(value); } bool hasNull() const override { - return _stats.hasNull(); + return stats_.hasNull(); } void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); + stats_.setHasNull(hasNull); } double getMinimum() const override { if (hasMinimum()) { - return _stats.getMinimum(); + return stats_.getMinimum(); } else { throw ParseError("Minimum is not defined."); } @@ -832,70 +832,70 @@ namespace orc { double getMaximum() const override { if (hasMaximum()) { - return _stats.getMaximum(); + return stats_.getMaximum(); } else { throw ParseError("Maximum is not defined."); } } void setMinimum(double minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); + stats_.setHasMinimum(true); + stats_.setMinimum(minimum); } void setMaximum(double maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); + stats_.setHasMaximum(true); + stats_.setMaximum(maximum); } double getSum() const override { if (hasSum()) { - return _stats.getSum(); + return stats_.getSum(); } else { throw ParseError("Sum is not defined."); } } void setSum(double sum) { - _stats.setHasSum(true); - _stats.setSum(sum); + stats_.setHasSum(true); + stats_.setSum(sum); } void update(double value) { - _stats.updateMinMax(value); - _stats.setSum(_stats.getSum() + value); + stats_.updateMinMax(value); + stats_.setSum(stats_.getSum() + value); } void merge(const MutableColumnStatistics& other) override { const DoubleColumnStatisticsImpl& doubleStats = dynamic_cast<const DoubleColumnStatisticsImpl&>(other); - _stats.merge(doubleStats._stats); + stats_.merge(doubleStats.stats_); - _stats.setHasSum(_stats.hasSum() && doubleStats.hasSum()); - if (_stats.hasSum()) { - _stats.setSum(_stats.getSum() + doubleStats.getSum()); + stats_.setHasSum(stats_.hasSum() && doubleStats.hasSum()); + if (stats_.hasSum()) { + stats_.setSum(stats_.getSum() + doubleStats.getSum()); } } void reset() override { - _stats.reset(); + stats_.reset(); setSum(0.0); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_has_null(_stats.hasNull()); - pbStats.set_number_of_values(_stats.getNumberOfValues()); + pbStats.set_has_null(stats_.hasNull()); + pbStats.set_number_of_values(stats_.getNumberOfValues()); proto::DoubleStatistics* doubleStats = pbStats.mutable_double_statistics(); - if (_stats.hasMinimum()) { - doubleStats->set_minimum(_stats.getMinimum()); - doubleStats->set_maximum(_stats.getMaximum()); + if (stats_.hasMinimum()) { + doubleStats->set_minimum(stats_.getMinimum()); + doubleStats->set_maximum(stats_.getMaximum()); } else { doubleStats->clear_minimum(); doubleStats->clear_maximum(); } - if (_stats.hasSum()) { - doubleStats->set_sum(_stats.getSum()); + if (stats_.hasSum()) { + doubleStats->set_sum(stats_.getSum()); } else { doubleStats->clear_sum(); } @@ -930,7 +930,7 @@ namespace orc { class IntegerColumnStatisticsImpl : public IntegerColumnStatistics, public MutableColumnStatistics { private: - InternalIntegerStatistics _stats; + InternalIntegerStatistics stats_; public: IntegerColumnStatisticsImpl() { @@ -940,40 +940,40 @@ namespace orc { virtual ~IntegerColumnStatisticsImpl() override; bool hasMinimum() const override { - return _stats.hasMinimum(); + return stats_.hasMinimum(); } bool hasMaximum() const override { - return _stats.hasMaximum(); + return stats_.hasMaximum(); } bool hasSum() const override { - return _stats.hasSum(); + return stats_.hasSum(); } void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + stats_.setNumberOfValues(stats_.getNumberOfValues() + count); } uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); + return stats_.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); + stats_.setNumberOfValues(value); } bool hasNull() const override { - return _stats.hasNull(); + return stats_.hasNull(); } void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); + stats_.setHasNull(hasNull); } int64_t getMinimum() const override { if (hasMinimum()) { - return _stats.getMinimum(); + return stats_.getMinimum(); } else { throw ParseError("Minimum is not defined."); } @@ -981,48 +981,48 @@ namespace orc { int64_t getMaximum() const override { if (hasMaximum()) { - return _stats.getMaximum(); + return stats_.getMaximum(); } else { throw ParseError("Maximum is not defined."); } } void setMinimum(int64_t minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); + stats_.setHasMinimum(true); + stats_.setMinimum(minimum); } void setMaximum(int64_t maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); + stats_.setHasMaximum(true); + stats_.setMaximum(maximum); } int64_t getSum() const override { if (hasSum()) { - return _stats.getSum(); + return stats_.getSum(); } else { throw ParseError("Sum is not defined."); } } void setSum(int64_t sum) { - _stats.setHasSum(true); - _stats.setSum(sum); + stats_.setHasSum(true); + stats_.setSum(sum); } void update(int64_t value, int repetitions) { - _stats.updateMinMax(value); + stats_.updateMinMax(value); - if (_stats.hasSum()) { + if (stats_.hasSum()) { if (repetitions > 1) { - _stats.setHasSum(multiplyExact(value, repetitions, &value)); + stats_.setHasSum(multiplyExact(value, repetitions, &value)); } - if (_stats.hasSum()) { - _stats.setHasSum(addExact(_stats.getSum(), value, &value)); + if (stats_.hasSum()) { + stats_.setHasSum(addExact(stats_.getSum(), value, &value)); - if (_stats.hasSum()) { - _stats.setSum(value); + if (stats_.hasSum()) { + stats_.setSum(value); } } } @@ -1032,38 +1032,38 @@ namespace orc { const IntegerColumnStatisticsImpl& intStats = dynamic_cast<const IntegerColumnStatisticsImpl&>(other); - _stats.merge(intStats._stats); + stats_.merge(intStats.stats_); // update sum and check overflow - _stats.setHasSum(_stats.hasSum() && intStats.hasSum()); - if (_stats.hasSum()) { + stats_.setHasSum(stats_.hasSum() && intStats.hasSum()); + if (stats_.hasSum()) { int64_t value; - _stats.setHasSum(addExact(_stats.getSum(), intStats.getSum(), &value)); - if (_stats.hasSum()) { - _stats.setSum(value); + stats_.setHasSum(addExact(stats_.getSum(), intStats.getSum(), &value)); + if (stats_.hasSum()) { + stats_.setSum(value); } } } void reset() override { - _stats.reset(); + stats_.reset(); setSum(0); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_has_null(_stats.hasNull()); - pbStats.set_number_of_values(_stats.getNumberOfValues()); + pbStats.set_has_null(stats_.hasNull()); + pbStats.set_number_of_values(stats_.getNumberOfValues()); proto::IntegerStatistics* intStats = pbStats.mutable_int_statistics(); - if (_stats.hasMinimum()) { - intStats->set_minimum(_stats.getMinimum()); - intStats->set_maximum(_stats.getMaximum()); + if (stats_.hasMinimum()) { + intStats->set_minimum(stats_.getMinimum()); + intStats->set_maximum(stats_.getMaximum()); } else { intStats->clear_minimum(); intStats->clear_maximum(); } - if (_stats.hasSum()) { - intStats->set_sum(_stats.getSum()); + if (stats_.hasSum()) { + intStats->set_sum(stats_.getSum()); } else { intStats->clear_sum(); } @@ -1097,7 +1097,7 @@ namespace orc { class StringColumnStatisticsImpl : public StringColumnStatistics, public MutableColumnStatistics { private: - InternalStringStatistics _stats; + InternalStringStatistics stats_; public: StringColumnStatisticsImpl() { @@ -1108,40 +1108,40 @@ namespace orc { virtual ~StringColumnStatisticsImpl() override; bool hasMinimum() const override { - return _stats.hasMinimum(); + return stats_.hasMinimum(); } bool hasMaximum() const override { - return _stats.hasMaximum(); + return stats_.hasMaximum(); } bool hasTotalLength() const override { - return _stats.hasTotalLength(); + return stats_.hasTotalLength(); } void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + stats_.setNumberOfValues(stats_.getNumberOfValues() + count); } uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); + return stats_.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); + stats_.setNumberOfValues(value); } bool hasNull() const override { - return _stats.hasNull(); + return stats_.hasNull(); } void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); + stats_.setHasNull(hasNull); } const std::string& getMinimum() const override { if (hasMinimum()) { - return _stats.getMinimum(); + return stats_.getMinimum(); } else { throw ParseError("Minimum is not defined."); } @@ -1149,59 +1149,59 @@ namespace orc { const std::string& getMaximum() const override { if (hasMaximum()) { - return _stats.getMaximum(); + return stats_.getMaximum(); } else { throw ParseError("Maximum is not defined."); } } void setMinimum(std::string minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); + stats_.setHasMinimum(true); + stats_.setMinimum(minimum); } void setMaximum(std::string maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); + stats_.setHasMaximum(true); + stats_.setMaximum(maximum); } uint64_t getTotalLength() const override { if (hasTotalLength()) { - return _stats.getTotalLength(); + return stats_.getTotalLength(); } else { throw ParseError("Total length is not defined."); } } void setTotalLength(uint64_t length) { - _stats.setHasTotalLength(true); - _stats.setTotalLength(length); + stats_.setHasTotalLength(true); + stats_.setTotalLength(length); } void update(const char* value, size_t length) { if (value != nullptr) { - if (!_stats.hasMinimum()) { + if (!stats_.hasMinimum()) { std::string tempStr(value, value + length); setMinimum(tempStr); setMaximum(tempStr); } else { // update min - int minCmp = strncmp(_stats.getMinimum().c_str(), value, - std::min(_stats.getMinimum().length(), length)); - if (minCmp > 0 || (minCmp == 0 && length < _stats.getMinimum().length())) { + int minCmp = strncmp(stats_.getMinimum().c_str(), value, + std::min(stats_.getMinimum().length(), length)); + if (minCmp > 0 || (minCmp == 0 && length < stats_.getMinimum().length())) { setMinimum(std::string(value, value + length)); } // update max - int maxCmp = strncmp(_stats.getMaximum().c_str(), value, - std::min(_stats.getMaximum().length(), length)); - if (maxCmp < 0 || (maxCmp == 0 && length > _stats.getMaximum().length())) { + int maxCmp = strncmp(stats_.getMaximum().c_str(), value, + std::min(stats_.getMaximum().length(), length)); + if (maxCmp < 0 || (maxCmp == 0 && length > stats_.getMaximum().length())) { setMaximum(std::string(value, value + length)); } } } - _stats.setTotalLength(_stats.getTotalLength() + length); + stats_.setTotalLength(stats_.getTotalLength() + length); } void update(std::string value) { @@ -1211,28 +1211,28 @@ namespace orc { void merge(const MutableColumnStatistics& other) override { const StringColumnStatisticsImpl& strStats = dynamic_cast<const StringColumnStatisticsImpl&>(other); - _stats.merge(strStats._stats); + stats_.merge(strStats.stats_); } void reset() override { - _stats.reset(); + stats_.reset(); setTotalLength(0); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_has_null(_stats.hasNull()); - pbStats.set_number_of_values(_stats.getNumberOfValues()); + pbStats.set_has_null(stats_.hasNull()); + pbStats.set_number_of_values(stats_.getNumberOfValues()); proto::StringStatistics* strStats = pbStats.mutable_string_statistics(); - if (_stats.hasMinimum()) { - strStats->set_minimum(_stats.getMinimum()); - strStats->set_maximum(_stats.getMaximum()); + if (stats_.hasMinimum()) { + strStats->set_minimum(stats_.getMinimum()); + strStats->set_maximum(stats_.getMaximum()); } else { strStats->clear_minimum(); strStats->clear_maximum(); } - if (_stats.hasTotalLength()) { - strStats->set_sum(static_cast<int64_t>(_stats.getTotalLength())); + if (stats_.hasTotalLength()) { + strStats->set_sum(static_cast<int64_t>(stats_.getTotalLength())); } else { strStats->clear_sum(); } @@ -1267,13 +1267,13 @@ namespace orc { class TimestampColumnStatisticsImpl : public TimestampColumnStatistics, public MutableColumnStatistics { private: - InternalIntegerStatistics _stats; - bool _hasLowerBound; - bool _hasUpperBound; - int64_t _lowerBound; - int64_t _upperBound; - int32_t _minimumNanos; // last 6 digits of nanosecond of minimum timestamp - int32_t _maximumNanos; // last 6 digits of nanosecond of maximum timestamp + InternalIntegerStatistics stats_; + bool hasLowerBound_; + bool hasUpperBound_; + int64_t lowerBound_; + int64_t upperBound_; + int32_t minimumNanos_; // last 6 digits of nanosecond of minimum timestamp + int32_t maximumNanos_; // last 6 digits of nanosecond of maximum timestamp static constexpr int32_t DEFAULT_MIN_NANOS = 0; static constexpr int32_t DEFAULT_MAX_NANOS = 999999; @@ -1286,36 +1286,36 @@ namespace orc { virtual ~TimestampColumnStatisticsImpl() override; bool hasMinimum() const override { - return _stats.hasMinimum(); + return stats_.hasMinimum(); } bool hasMaximum() const override { - return _stats.hasMaximum(); + return stats_.hasMaximum(); } uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); + return stats_.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); + stats_.setNumberOfValues(value); } void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + stats_.setNumberOfValues(stats_.getNumberOfValues() + count); } bool hasNull() const override { - return _stats.hasNull(); + return stats_.hasNull(); } void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); + stats_.setHasNull(hasNull); } int64_t getMinimum() const override { if (hasMinimum()) { - return _stats.getMinimum(); + return stats_.getMinimum(); } else { throw ParseError("Minimum is not defined."); } @@ -1323,46 +1323,46 @@ namespace orc { int64_t getMaximum() const override { if (hasMaximum()) { - return _stats.getMaximum(); + return stats_.getMaximum(); } else { throw ParseError("Maximum is not defined."); } } void setMinimum(int64_t minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); + stats_.setHasMinimum(true); + stats_.setMinimum(minimum); } void setMaximum(int64_t maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); + stats_.setHasMaximum(true); + stats_.setMaximum(maximum); } void update(int64_t value) { - _stats.updateMinMax(value); + stats_.updateMinMax(value); } void update(int64_t milli, int32_t nano) { - if (!_stats.hasMinimum()) { - _stats.setHasMinimum(true); - _stats.setHasMaximum(true); - _stats.setMinimum(milli); - _stats.setMaximum(milli); - _maximumNanos = _minimumNanos = nano; + if (!stats_.hasMinimum()) { + stats_.setHasMinimum(true); + stats_.setHasMaximum(true); + stats_.setMinimum(milli); + stats_.setMaximum(milli); + maximumNanos_ = minimumNanos_ = nano; } else { - if (milli <= _stats.getMinimum()) { - if (milli < _stats.getMinimum() || nano < _minimumNanos) { - _minimumNanos = nano; + if (milli <= stats_.getMinimum()) { + if (milli < stats_.getMinimum() || nano < minimumNanos_) { + minimumNanos_ = nano; } - _stats.setMinimum(milli); + stats_.setMinimum(milli); } - if (milli >= _stats.getMaximum()) { - if (milli > _stats.getMaximum() || nano > _maximumNanos) { - _maximumNanos = nano; + if (milli >= stats_.getMaximum()) { + if (milli > stats_.getMaximum() || nano > maximumNanos_) { + maximumNanos_ = nano; } - _stats.setMaximum(milli); + stats_.setMaximum(milli); } } } @@ -1371,55 +1371,55 @@ namespace orc { const TimestampColumnStatisticsImpl& tsStats = dynamic_cast<const TimestampColumnStatisticsImpl&>(other); - _stats.setHasNull(_stats.hasNull() || tsStats.hasNull()); - _stats.setNumberOfValues(_stats.getNumberOfValues() + tsStats.getNumberOfValues()); + stats_.setHasNull(stats_.hasNull() || tsStats.hasNull()); + stats_.setNumberOfValues(stats_.getNumberOfValues() + tsStats.getNumberOfValues()); if (tsStats.hasMinimum()) { - if (!_stats.hasMinimum()) { - _stats.setHasMinimum(true); - _stats.setHasMaximum(true); - _stats.setMinimum(tsStats.getMinimum()); - _stats.setMaximum(tsStats.getMaximum()); - _minimumNanos = tsStats.getMinimumNanos(); - _maximumNanos = tsStats.getMaximumNanos(); + if (!stats_.hasMinimum()) { + stats_.setHasMinimum(true); + stats_.setHasMaximum(true); + stats_.setMinimum(tsStats.getMinimum()); + stats_.setMaximum(tsStats.getMaximum()); + minimumNanos_ = tsStats.getMinimumNanos(); + maximumNanos_ = tsStats.getMaximumNanos(); } else { - if (tsStats.getMaximum() >= _stats.getMaximum()) { - if (tsStats.getMaximum() > _stats.getMaximum() || - tsStats.getMaximumNanos() > _maximumNanos) { - _maximumNanos = tsStats.getMaximumNanos(); + if (tsStats.getMaximum() >= stats_.getMaximum()) { + if (tsStats.getMaximum() > stats_.getMaximum() || + tsStats.getMaximumNanos() > maximumNanos_) { + maximumNanos_ = tsStats.getMaximumNanos(); } - _stats.setMaximum(tsStats.getMaximum()); + stats_.setMaximum(tsStats.getMaximum()); } - if (tsStats.getMinimum() <= _stats.getMinimum()) { - if (tsStats.getMinimum() < _stats.getMinimum() || - tsStats.getMinimumNanos() < _minimumNanos) { - _minimumNanos = tsStats.getMinimumNanos(); + if (tsStats.getMinimum() <= stats_.getMinimum()) { + if (tsStats.getMinimum() < stats_.getMinimum() || + tsStats.getMinimumNanos() < minimumNanos_) { + minimumNanos_ = tsStats.getMinimumNanos(); } - _stats.setMinimum(tsStats.getMinimum()); + stats_.setMinimum(tsStats.getMinimum()); } } } } void reset() override { - _stats.reset(); - _minimumNanos = DEFAULT_MIN_NANOS; - _maximumNanos = DEFAULT_MAX_NANOS; + stats_.reset(); + minimumNanos_ = DEFAULT_MIN_NANOS; + maximumNanos_ = DEFAULT_MAX_NANOS; } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_has_null(_stats.hasNull()); - pbStats.set_number_of_values(_stats.getNumberOfValues()); + pbStats.set_has_null(stats_.hasNull()); + pbStats.set_number_of_values(stats_.getNumberOfValues()); proto::TimestampStatistics* tsStats = pbStats.mutable_timestamp_statistics(); - if (_stats.hasMinimum()) { - tsStats->set_minimum_utc(_stats.getMinimum()); - tsStats->set_maximum_utc(_stats.getMaximum()); - if (_minimumNanos != DEFAULT_MIN_NANOS) { - tsStats->set_minimum_nanos(_minimumNanos + 1); + if (stats_.hasMinimum()) { + tsStats->set_minimum_utc(stats_.getMinimum()); + tsStats->set_maximum_utc(stats_.getMaximum()); + if (minimumNanos_ != DEFAULT_MIN_NANOS) { + tsStats->set_minimum_nanos(minimumNanos_ + 1); } - if (_maximumNanos != DEFAULT_MAX_NANOS) { - tsStats->set_maximum_nanos(_maximumNanos + 1); + if (maximumNanos_ != DEFAULT_MAX_NANOS) { + tsStats->set_maximum_nanos(maximumNanos_ + 1); } } else { tsStats->clear_minimum_utc(); @@ -1478,16 +1478,16 @@ namespace orc { } bool hasLowerBound() const override { - return _hasLowerBound; + return hasLowerBound_; } bool hasUpperBound() const override { - return _hasUpperBound; + return hasUpperBound_; } int64_t getLowerBound() const override { if (hasLowerBound()) { - return _lowerBound; + return lowerBound_; } else { throw ParseError("LowerBound is not defined."); } @@ -1495,7 +1495,7 @@ namespace orc { int64_t getUpperBound() const override { if (hasUpperBound()) { - return _upperBound; + return upperBound_; } else { throw ParseError("UpperBound is not defined."); } @@ -1503,7 +1503,7 @@ namespace orc { int32_t getMinimumNanos() const override { if (hasMinimum()) { - return _minimumNanos; + return minimumNanos_; } else { throw ParseError("Minimum is not defined."); } @@ -1511,7 +1511,7 @@ namespace orc { int32_t getMaximumNanos() const override { if (hasMaximum()) { - return _maximumNanos; + return maximumNanos_; } else { throw ParseError("Maximum is not defined."); } @@ -1521,7 +1521,7 @@ namespace orc { class CollectionColumnStatisticsImpl : public CollectionColumnStatistics, public MutableColumnStatistics { private: - InternalCollectionStatistics _stats; + InternalCollectionStatistics stats_; public: CollectionColumnStatisticsImpl() { @@ -1531,40 +1531,40 @@ namespace orc { virtual ~CollectionColumnStatisticsImpl() override; bool hasMinimumChildren() const override { - return _stats.hasMinimum(); + return stats_.hasMinimum(); } bool hasMaximumChildren() const override { - return _stats.hasMaximum(); + return stats_.hasMaximum(); } bool hasTotalChildren() const override { - return _stats.hasSum(); + return stats_.hasSum(); } void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + stats_.setNumberOfValues(stats_.getNumberOfValues() + count); } uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); + return stats_.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); + stats_.setNumberOfValues(value); } bool hasNull() const override { - return _stats.hasNull(); + return stats_.hasNull(); } void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); + stats_.setHasNull(hasNull); } uint64_t getMinimumChildren() const override { if (hasMinimumChildren()) { - return _stats.getMinimum(); + return stats_.getMinimum(); } else { throw ParseError("MinimumChildren is not defined."); } @@ -1572,7 +1572,7 @@ namespace orc { uint64_t getMaximumChildren() const override { if (hasMaximumChildren()) { - return _stats.getMaximum(); + return stats_.getMaximum(); } else { throw ParseError("MaximumChildren is not defined."); } @@ -1580,78 +1580,78 @@ namespace orc { uint64_t getTotalChildren() const override { if (hasTotalChildren()) { - return _stats.getSum(); + return stats_.getSum(); } else { throw ParseError("TotalChildren is not defined."); } } void setMinimumChildren(uint64_t minimum) override { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); + stats_.setHasMinimum(true); + stats_.setMinimum(minimum); } void setMaximumChildren(uint64_t maximum) override { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); + stats_.setHasMaximum(true); + stats_.setMaximum(maximum); } void setTotalChildren(uint64_t sum) override { - _stats.setHasSum(true); - _stats.setSum(sum); + stats_.setHasSum(true); + stats_.setSum(sum); } void setHasTotalChildren(bool hasSum) override { - _stats.setHasSum(hasSum); + stats_.setHasSum(hasSum); } void merge(const MutableColumnStatistics& other) override { const CollectionColumnStatisticsImpl& collectionStats = dynamic_cast<const CollectionColumnStatisticsImpl&>(other); - _stats.merge(collectionStats._stats); + stats_.merge(collectionStats.stats_); // hasSumValue here means no overflow - _stats.setHasSum(_stats.hasSum() && collectionStats.hasTotalChildren()); - if (_stats.hasSum()) { - uint64_t oldSum = _stats.getSum(); - _stats.setSum(_stats.getSum() + collectionStats.getTotalChildren()); - if (oldSum > _stats.getSum()) { - _stats.setHasSum(false); + stats_.setHasSum(stats_.hasSum() && collectionStats.hasTotalChildren()); + if (stats_.hasSum()) { + uint64_t oldSum = stats_.getSum(); + stats_.setSum(stats_.getSum() + collectionStats.getTotalChildren()); + if (oldSum > stats_.getSum()) { + stats_.setHasSum(false); } } } void reset() override { - _stats.reset(); + stats_.reset(); setTotalChildren(0); } void update(uint64_t value) { - _stats.updateMinMax(value); - if (_stats.hasSum()) { - uint64_t oldSum = _stats.getSum(); - _stats.setSum(_stats.getSum() + value); - if (oldSum > _stats.getSum()) { - _stats.setHasSum(false); + stats_.updateMinMax(value); + if (stats_.hasSum()) { + uint64_t oldSum = stats_.getSum(); + stats_.setSum(stats_.getSum() + value); + if (oldSum > stats_.getSum()) { + stats_.setHasSum(false); } } } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_has_null(_stats.hasNull()); - pbStats.set_number_of_values(_stats.getNumberOfValues()); + pbStats.set_has_null(stats_.hasNull()); + pbStats.set_number_of_values(stats_.getNumberOfValues()); proto::CollectionStatistics* collectionStats = pbStats.mutable_collection_statistics(); - if (_stats.hasMinimum()) { - collectionStats->set_min_children(_stats.getMinimum()); - collectionStats->set_max_children(_stats.getMaximum()); + if (stats_.hasMinimum()) { + collectionStats->set_min_children(stats_.getMinimum()); + collectionStats->set_max_children(stats_.getMaximum()); } else { collectionStats->clear_min_children(); collectionStats->clear_max_children(); } - if (_stats.hasSum()) { - collectionStats->set_total_children(_stats.getSum()); + if (stats_.hasSum()) { + collectionStats->set_total_children(stats_.getSum()); } else { collectionStats->clear_total_children(); } @@ -1688,7 +1688,7 @@ namespace orc { class StatisticsImpl : public Statistics { private: - std::vector<ColumnStatistics*> colStats; + std::vector<ColumnStatistics*> colStats_; // DELIBERATELY NOT IMPLEMENTED StatisticsImpl(const StatisticsImpl&); @@ -1700,20 +1700,20 @@ namespace orc { StatisticsImpl(const proto::Footer& footer, const StatContext& statContext); virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override { - return colStats[columnId]; + return colStats_[columnId]; } virtual ~StatisticsImpl() override; uint32_t getNumberOfColumns() const override { - return static_cast<uint32_t>(colStats.size()); + return static_cast<uint32_t>(colStats_.size()); } }; class StripeStatisticsImpl : public StripeStatistics { private: - std::unique_ptr<StatisticsImpl> columnStats; - std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > rowIndexStats; + std::unique_ptr<StatisticsImpl> columnStats_; + std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > rowIndexStats_; // DELIBERATELY NOT IMPLEMENTED StripeStatisticsImpl(const StripeStatisticsImpl&); @@ -1725,23 +1725,23 @@ namespace orc { const StatContext& statContext); virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override { - return columnStats->getColumnStatistics(columnId); + return columnStats_->getColumnStatistics(columnId); } uint32_t getNumberOfColumns() const override { - return columnStats->getNumberOfColumns(); + return columnStats_->getNumberOfColumns(); } virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId, uint32_t rowIndex) const override { // check id indices are valid - return rowIndexStats[columnId][rowIndex].get(); + return rowIndexStats_[columnId][rowIndex].get(); } virtual ~StripeStatisticsImpl() override; uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override { - return static_cast<uint32_t>(rowIndexStats[columnId].size()); + return static_cast<uint32_t>(rowIndexStats_[columnId].size()); } }; diff --git a/contrib/libs/apache/orc/c++/src/StripeStream.cc b/contrib/libs/apache/orc/c++/src/StripeStream.cc index 8507e95767..a5609f7629 100644 --- a/contrib/libs/apache/orc/c++/src/StripeStream.cc +++ b/contrib/libs/apache/orc/c++/src/StripeStream.cc @@ -19,25 +19,27 @@ #include "StripeStream.hh" #include "RLE.hh" #include "Reader.hh" +#include "io/Cache.hh" #include "orc/Exceptions.hh" #include "wrap/coded-stream-wrapper.h" namespace orc { - StripeStreamsImpl::StripeStreamsImpl(const RowReaderImpl& _reader, uint64_t _index, - const proto::StripeInformation& _stripeInfo, - const proto::StripeFooter& _footer, uint64_t _stripeStart, - InputStream& _input, const Timezone& _writerTimezone, - const Timezone& _readerTimezone) - : reader(_reader), - stripeInfo(_stripeInfo), - footer(_footer), - stripeIndex(_index), - stripeStart(_stripeStart), - input(_input), - writerTimezone(_writerTimezone), - readerTimezone(_readerTimezone) { + StripeStreamsImpl::StripeStreamsImpl(const RowReaderImpl& reader, uint64_t index, + const proto::StripeInformation& stripeInfo, + const proto::StripeFooter& footer, uint64_t stripeStart, + InputStream& input, const Timezone& writerTimezone, + const Timezone& readerTimezone) + : reader_(reader), + stripeInfo_(stripeInfo), + footer_(footer), + stripeIndex_(index), + stripeStart_(stripeStart), + input_(input), + writerTimezone_(writerTimezone), + readerTimezone_(readerTimezone), + readCache_(reader.getReadCache()) { // PASS } @@ -58,51 +60,65 @@ namespace orc { } const std::vector<bool> StripeStreamsImpl::getSelectedColumns() const { - return reader.getSelectedColumns(); + return reader_.getSelectedColumns(); } proto::ColumnEncoding StripeStreamsImpl::getEncoding(uint64_t columnId) const { - return footer.columns(static_cast<int>(columnId)); + return footer_.columns(static_cast<int>(columnId)); } const Timezone& StripeStreamsImpl::getWriterTimezone() const { - return writerTimezone; + return writerTimezone_; } const Timezone& StripeStreamsImpl::getReaderTimezone() const { - return readerTimezone; + return readerTimezone_; } std::ostream* StripeStreamsImpl::getErrorStream() const { - return reader.getFileContents().errorStream; + return reader_.getFileContents().errorStream; } std::unique_ptr<SeekableInputStream> StripeStreamsImpl::getStream(uint64_t columnId, proto::Stream_Kind kind, bool shouldStream) const { - uint64_t offset = stripeStart; - uint64_t dataEnd = stripeInfo.offset() + stripeInfo.index_length() + stripeInfo.data_length(); - MemoryPool* pool = reader.getFileContents().pool; - for (int i = 0; i < footer.streams_size(); ++i) { - const proto::Stream& stream = footer.streams(i); + uint64_t offset = stripeStart_; + uint64_t dataEnd = + stripeInfo_.offset() + stripeInfo_.index_length() + stripeInfo_.data_length(); + MemoryPool* pool = reader_.getFileContents().pool; + for (int i = 0; i < footer_.streams_size(); ++i) { + const proto::Stream& stream = footer_.streams(i); if (stream.has_kind() && stream.kind() == kind && stream.column() == static_cast<uint64_t>(columnId)) { uint64_t streamLength = stream.length(); - uint64_t myBlock = shouldStream ? input.getNaturalReadSize() : streamLength; if (offset + streamLength > dataEnd) { std::stringstream msg; - msg << "Malformed stream meta at stream index " << i << " in stripe " << stripeIndex + msg << "Malformed stream meta at stream index " << i << " in stripe " << stripeIndex_ << ": streamOffset=" << offset << ", streamLength=" << streamLength - << ", stripeOffset=" << stripeInfo.offset() - << ", stripeIndexLength=" << stripeInfo.index_length() - << ", stripeDataLength=" << stripeInfo.data_length(); + << ", stripeOffset=" << stripeInfo_.offset() + << ", stripeIndexLength=" << stripeInfo_.index_length() + << ", stripeDataLength=" << stripeInfo_.data_length(); throw ParseError(msg.str()); } - return createDecompressor(reader.getCompression(), - std::make_unique<SeekableFileInputStream>( - &input, offset, stream.length(), *pool, myBlock), - reader.getCompressionSize(), *pool, - reader.getFileContents().readerMetrics); + + BufferSlice slice; + if (readCache_) { + ReadRange range{offset, streamLength}; + slice = readCache_->read(range); + } + + uint64_t myBlock = shouldStream ? input_.getNaturalReadSize() : streamLength; + std::unique_ptr<SeekableInputStream> seekableInput; + if (slice.buffer) { + seekableInput = std::make_unique<SeekableArrayInputStream>( + slice.buffer->data() + slice.offset, slice.length); + } else { + seekableInput = std::make_unique<SeekableFileInputStream>(&input_, offset, streamLength, + *pool, myBlock); + } + return createDecompressor(reader_.getCompression(), std::move(seekableInput), + reader_.getCompressionSize(), *pool, + reader_.getFileContents().readerMetrics); } offset += stream.length(); } @@ -110,38 +126,38 @@ namespace orc { } MemoryPool& StripeStreamsImpl::getMemoryPool() const { - return *reader.getFileContents().pool; + return *reader_.getFileContents().pool; } ReaderMetrics* StripeStreamsImpl::getReaderMetrics() const { - return reader.getFileContents().readerMetrics; + return reader_.getFileContents().readerMetrics; } bool StripeStreamsImpl::getThrowOnHive11DecimalOverflow() const { - return reader.getThrowOnHive11DecimalOverflow(); + return reader_.getThrowOnHive11DecimalOverflow(); } bool StripeStreamsImpl::isDecimalAsLong() const { - return reader.getIsDecimalAsLong(); + return reader_.getIsDecimalAsLong(); } int32_t StripeStreamsImpl::getForcedScaleOnHive11Decimal() const { - return reader.getForcedScaleOnHive11Decimal(); + return reader_.getForcedScaleOnHive11Decimal(); } const SchemaEvolution* StripeStreamsImpl::getSchemaEvolution() const { - return reader.getSchemaEvolution(); + return reader_.getSchemaEvolution(); } void StripeInformationImpl::ensureStripeFooterLoaded() const { - if (stripeFooter.get() == nullptr) { - std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(compression, - std::make_unique<SeekableFileInputStream>( - stream, offset + indexLength + dataLength, footerLength, memory), - blockSize, memory, metrics); - stripeFooter = std::make_unique<proto::StripeFooter>(); - if (!stripeFooter->ParseFromZeroCopyStream(pbStream.get())) { + if (stripeFooter_.get() == nullptr) { + std::unique_ptr<SeekableInputStream> pbStream = createDecompressor( + compression_, + std::make_unique<SeekableFileInputStream>(stream_, offset_ + indexLength_ + dataLength_, + footerLength_, memory_), + blockSize_, memory_, metrics_); + stripeFooter_ = std::make_unique<proto::StripeFooter>(); + if (!stripeFooter_->ParseFromZeroCopyStream(pbStream.get())) { throw ParseError("Failed to parse the stripe footer"); } } @@ -150,12 +166,12 @@ namespace orc { std::unique_ptr<StreamInformation> StripeInformationImpl::getStreamInformation( uint64_t streamId) const { ensureStripeFooterLoaded(); - uint64_t streamOffset = offset; + uint64_t streamOffset = offset_; for (uint64_t s = 0; s < streamId; ++s) { - streamOffset += stripeFooter->streams(static_cast<int>(s)).length(); + streamOffset += stripeFooter_->streams(static_cast<int>(s)).length(); } return std::make_unique<StreamInformationImpl>( - streamOffset, stripeFooter->streams(static_cast<int>(streamId))); + streamOffset, stripeFooter_->streams(static_cast<int>(streamId))); } } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/StripeStream.hh b/contrib/libs/apache/orc/c++/src/StripeStream.hh index eae6ce0c31..2d26f8575e 100644 --- a/contrib/libs/apache/orc/c++/src/StripeStream.hh +++ b/contrib/libs/apache/orc/c++/src/StripeStream.hh @@ -30,6 +30,7 @@ namespace orc { class RowReaderImpl; + class ReadRangeCache; /** * StripeStream Implementation @@ -37,14 +38,15 @@ namespace orc { class StripeStreamsImpl : public StripeStreams { private: - const RowReaderImpl& reader; - const proto::StripeInformation& stripeInfo; - const proto::StripeFooter& footer; - const uint64_t stripeIndex; - const uint64_t stripeStart; - InputStream& input; - const Timezone& writerTimezone; - const Timezone& readerTimezone; + const RowReaderImpl& reader_; + const proto::StripeInformation& stripeInfo_; + const proto::StripeFooter& footer_; + const uint64_t stripeIndex_; + const uint64_t stripeStart_; + InputStream& input_; + const Timezone& writerTimezone_; + const Timezone& readerTimezone_; + std::shared_ptr<ReadRangeCache> readCache_; public: StripeStreamsImpl(const RowReaderImpl& reader, uint64_t index, @@ -87,36 +89,36 @@ namespace orc { class StreamInformationImpl : public StreamInformation { private: - StreamKind kind; - uint64_t column; - uint64_t offset; - uint64_t length; + StreamKind kind_; + uint64_t column_; + uint64_t offset_; + uint64_t length_; public: - StreamInformationImpl(uint64_t _offset, const proto::Stream& stream) - : kind(static_cast<StreamKind>(stream.kind())), - column(stream.column()), - offset(_offset), - length(stream.length()) { + StreamInformationImpl(uint64_t offset, const proto::Stream& stream) + : kind_(static_cast<StreamKind>(stream.kind())), + column_(stream.column()), + offset_(offset), + length_(stream.length()) { // PASS } ~StreamInformationImpl() override; StreamKind getKind() const override { - return kind; + return kind_; } uint64_t getColumnId() const override { - return column; + return column_; } uint64_t getOffset() const override { - return offset; + return offset_; } uint64_t getLength() const override { - return length; + return length_; } }; @@ -125,34 +127,34 @@ namespace orc { */ class StripeInformationImpl : public StripeInformation { - uint64_t offset; - uint64_t indexLength; - uint64_t dataLength; - uint64_t footerLength; - uint64_t numRows; - InputStream* stream; - MemoryPool& memory; - CompressionKind compression; - uint64_t blockSize; - mutable std::unique_ptr<proto::StripeFooter> stripeFooter; - ReaderMetrics* metrics; + uint64_t offset_; + uint64_t indexLength_; + uint64_t dataLength_; + uint64_t footerLength_; + uint64_t numRows_; + InputStream* stream_; + MemoryPool& memory_; + CompressionKind compression_; + uint64_t blockSize_; + mutable std::unique_ptr<proto::StripeFooter> stripeFooter_; + ReaderMetrics* metrics_; void ensureStripeFooterLoaded() const; public: - StripeInformationImpl(uint64_t _offset, uint64_t _indexLength, uint64_t _dataLength, - uint64_t _footerLength, uint64_t _numRows, InputStream* _stream, - MemoryPool& _memory, CompressionKind _compression, uint64_t _blockSize, - ReaderMetrics* _metrics) - : offset(_offset), - indexLength(_indexLength), - dataLength(_dataLength), - footerLength(_footerLength), - numRows(_numRows), - stream(_stream), - memory(_memory), - compression(_compression), - blockSize(_blockSize), - metrics(_metrics) { + StripeInformationImpl(uint64_t offset, uint64_t indexLength, uint64_t dataLength, + uint64_t footerLength, uint64_t numRows, InputStream* stream, + MemoryPool& memory, CompressionKind compression, uint64_t blockSize, + ReaderMetrics* metrics) + : offset_(offset), + indexLength_(indexLength), + dataLength_(dataLength), + footerLength_(footerLength), + numRows_(numRows), + stream_(stream), + memory_(memory), + compression_(compression), + blockSize_(blockSize), + metrics_(metrics) { // PASS } @@ -161,49 +163,50 @@ namespace orc { } uint64_t getOffset() const override { - return offset; + return offset_; } uint64_t getLength() const override { - return indexLength + dataLength + footerLength; + return indexLength_ + dataLength_ + footerLength_; } uint64_t getIndexLength() const override { - return indexLength; + return indexLength_; } uint64_t getDataLength() const override { - return dataLength; + return dataLength_; } uint64_t getFooterLength() const override { - return footerLength; + return footerLength_; } uint64_t getNumberOfRows() const override { - return numRows; + return numRows_; } uint64_t getNumberOfStreams() const override { ensureStripeFooterLoaded(); - return static_cast<uint64_t>(stripeFooter->streams_size()); + return static_cast<uint64_t>(stripeFooter_->streams_size()); } std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId) const override; ColumnEncodingKind getColumnEncoding(uint64_t colId) const override { ensureStripeFooterLoaded(); - return static_cast<ColumnEncodingKind>(stripeFooter->columns(static_cast<int>(colId)).kind()); + return static_cast<ColumnEncodingKind>( + stripeFooter_->columns(static_cast<int>(colId)).kind()); } uint64_t getDictionarySize(uint64_t colId) const override { ensureStripeFooterLoaded(); return static_cast<ColumnEncodingKind>( - stripeFooter->columns(static_cast<int>(colId)).dictionary_size()); + stripeFooter_->columns(static_cast<int>(colId)).dictionary_size()); } const std::string& getWriterTimezone() const override { ensureStripeFooterLoaded(); - return stripeFooter->writer_timezone(); + return stripeFooter_->writer_timezone(); } }; diff --git a/contrib/libs/apache/orc/c++/src/Timezone.cc b/contrib/libs/apache/orc/c++/src/Timezone.cc index 4c78a53a29..384f8ea99f 100644 --- a/contrib/libs/apache/orc/c++/src/Timezone.cc +++ b/contrib/libs/apache/orc/c++/src/Timezone.cc @@ -184,49 +184,49 @@ namespace orc { * day = J<day without 2/29>|<day with 2/29>|M<month>.<week>.<day of week> */ class FutureRuleImpl : public FutureRule { - std::string ruleString; - TimezoneVariant standard; - bool hasDst; - TimezoneVariant dst; - Transition start; - Transition end; + std::string ruleString_; + TimezoneVariant standard_; + bool hasDst_; + TimezoneVariant dst_; + Transition start_; + Transition end_; // expanded time_t offsets of transitions - std::vector<int64_t> offsets; + std::vector<int64_t> offsets_; // Is the epoch (1 Jan 1970 00:00) in standard time? // This code assumes that the transition dates fall in the same order // each year. Hopefully no timezone regions decide to move across the // equator, which is about what it would take. - bool startInStd; + bool startInStd_; void computeOffsets() { - if (!hasDst) { - startInStd = true; - offsets.resize(1); + if (!hasDst_) { + startInStd_ = true; + offsets_.resize(1); } else { // Insert a transition for the epoch and two per a year for the next // 400 years. We assume that the all even positions are in standard // time if and only if startInStd and the odd ones are the reverse. - offsets.resize(400 * 2 + 1); - startInStd = start.getTime(1970) < end.getTime(1970); + offsets_.resize(400 * 2 + 1); + startInStd_ = start_.getTime(1970) < end_.getTime(1970); int64_t base = 0; for (int64_t year = 1970; year < 1970 + 400; ++year) { - if (startInStd) { - offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] = - base + start.getTime(year) - standard.gmtOffset; - offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] = - base + end.getTime(year) - dst.gmtOffset; + if (startInStd_) { + offsets_[static_cast<uint64_t>(year - 1970) * 2 + 1] = + base + start_.getTime(year) - standard_.gmtOffset; + offsets_[static_cast<uint64_t>(year - 1970) * 2 + 2] = + base + end_.getTime(year) - dst_.gmtOffset; } else { - offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] = - base + end.getTime(year) - dst.gmtOffset; - offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] = - base + start.getTime(year) - standard.gmtOffset; + offsets_[static_cast<uint64_t>(year - 1970) * 2 + 1] = + base + end_.getTime(year) - dst_.gmtOffset; + offsets_[static_cast<uint64_t>(year - 1970) * 2 + 2] = + base + start_.getTime(year) - standard_.gmtOffset; } base += (isLeap(year) ? 366 : 365) * SECONDS_PER_DAY; } } - offsets[0] = 0; + offsets_[0] = 0; } public: @@ -247,34 +247,34 @@ namespace orc { } bool FutureRuleImpl::isDefined() const { - return ruleString.size() > 0; + return ruleString_.size() > 0; } const TimezoneVariant& FutureRuleImpl::getVariant(int64_t clk) const { - if (!hasDst) { - return standard; + if (!hasDst_) { + return standard_; } else { int64_t adjusted = clk % SECONDS_PER_400_YEARS; if (adjusted < 0) { adjusted += SECONDS_PER_400_YEARS; } - int64_t idx = binarySearch(offsets, adjusted); - if (startInStd == (idx % 2 == 0)) { - return standard; + int64_t idx = binarySearch(offsets_, adjusted); + if (startInStd_ == (idx % 2 == 0)) { + return standard_; } else { - return dst; + return dst_; } } } void FutureRuleImpl::print(std::ostream& out) const { if (isDefined()) { - out << " Future rule: " << ruleString << "\n"; - out << " standard " << standard.toString() << "\n"; - if (hasDst) { - out << " dst " << dst.toString() << "\n"; - out << " start " << start.toString() << "\n"; - out << " end " << end.toString() << "\n"; + out << " Future rule: " << ruleString_ << "\n"; + out << " standard " << standard_.toString() << "\n"; + if (hasDst_) { + out << " dst " << dst_.toString() << "\n"; + out << " start " << start_.toString() << "\n"; + out << " end " << end_.toString() << "\n"; } } } @@ -285,40 +285,40 @@ namespace orc { class FutureRuleParser { public: FutureRuleParser(const std::string& str, FutureRuleImpl* rule) - : ruleString(str), length(str.size()), position(0), output(*rule) { - output.ruleString = str; - if (position != length) { - parseName(output.standard.name); - output.standard.gmtOffset = -parseOffset(); - output.standard.isDst = false; - output.hasDst = position < length; - if (output.hasDst) { - parseName(output.dst.name); - output.dst.isDst = true; - if (ruleString[position] != ',') { - output.dst.gmtOffset = -parseOffset(); + : ruleString_(str), length_(str.size()), position_(0), output_(*rule) { + output_.ruleString_ = str; + if (position_ != length_) { + parseName(output_.standard_.name); + output_.standard_.gmtOffset = -parseOffset(); + output_.standard_.isDst = false; + output_.hasDst_ = position_ < length_; + if (output_.hasDst_) { + parseName(output_.dst_.name); + output_.dst_.isDst = true; + if (ruleString_[position_] != ',') { + output_.dst_.gmtOffset = -parseOffset(); } else { - output.dst.gmtOffset = output.standard.gmtOffset + 60 * 60; + output_.dst_.gmtOffset = output_.standard_.gmtOffset + 60 * 60; } - parseTransition(output.start); - parseTransition(output.end); + parseTransition(output_.start_); + parseTransition(output_.end_); } - if (position != length) { + if (position_ != length_) { throwError("Extra text"); } - output.computeOffsets(); + output_.computeOffsets(); } } private: - const std::string& ruleString; - size_t length; - size_t position; - FutureRuleImpl& output; + const std::string& ruleString_; + size_t length_; + size_t position_; + FutureRuleImpl& output_; void throwError(const char* msg) { std::stringstream buffer; - buffer << msg << " at " << position << " in '" << ruleString << "'"; + buffer << msg << " at " << position_ << " in '" << ruleString_ << "'"; throw TimezoneError(buffer.str()); } @@ -328,46 +328,46 @@ namespace orc { * and set the output string. */ void parseName(std::string& result) { - if (position == length) { + if (position_ == length_) { throwError("name required"); } - size_t start = position; - if (ruleString[position] == '<') { - while (position < length && ruleString[position] != '>') { - position += 1; + size_t start = position_; + if (ruleString_[position_] == '<') { + while (position_ < length_ && ruleString_[position_] != '>') { + position_ += 1; } - if (position == length) { + if (position_ == length_) { throwError("missing close '>'"); } - position += 1; + position_ += 1; } else { - while (position < length) { - char ch = ruleString[position]; + while (position_ < length_) { + char ch = ruleString_[position_]; if (isdigit(ch) || ch == '-' || ch == '+' || ch == ',') { break; } - position += 1; + position_ += 1; } } - if (position == start) { + if (position_ == start) { throwError("empty string not allowed"); } - result = ruleString.substr(start, position - start); + result = ruleString_.substr(start, position_ - start); } /** * Parse an integer of the form [0-9]+ and return it. */ int64_t parseNumber() { - if (position >= length) { + if (position_ >= length_) { throwError("missing number"); } int64_t result = 0; - while (position < length) { - char ch = ruleString[position]; + while (position_ < length_) { + char ch = ruleString_[position_]; if (isdigit(ch)) { result = result * 10 + (ch - '0'); - position += 1; + position_ += 1; } else { break; } @@ -383,17 +383,17 @@ namespace orc { int64_t parseOffset() { int64_t scale = 3600; bool isNegative = false; - if (position < length) { - char ch = ruleString[position]; + if (position_ < length_) { + char ch = ruleString_[position_]; isNegative = ch == '-'; if (ch == '-' || ch == '+') { - position += 1; + position_ += 1; } } int64_t result = parseNumber() * scale; - while (position < length && scale > 1 && ruleString[position] == ':') { + while (position_ < length_ && scale > 1 && ruleString_[position_] == ':') { scale /= 60; - position += 1; + position_ += 1; result += parseNumber() * scale; } if (isNegative) { @@ -407,35 +407,35 @@ namespace orc { * ,(J<number>|<number>|M<number>.<number>.<number>)(/<offset>)? */ void parseTransition(Transition& transition) { - if (length - position < 2 || ruleString[position] != ',') { + if (length_ - position_ < 2 || ruleString_[position_] != ',') { throwError("missing transition"); } - position += 1; - char ch = ruleString[position]; + position_ += 1; + char ch = ruleString_[position_]; if (ch == 'J') { transition.kind = TRANSITION_JULIAN; - position += 1; + position_ += 1; transition.day = parseNumber(); } else if (ch == 'M') { transition.kind = TRANSITION_MONTH; - position += 1; + position_ += 1; transition.month = parseNumber(); - if (position == length || ruleString[position] != '.') { + if (position_ == length_ || ruleString_[position_] != '.') { throwError("missing first ."); } - position += 1; + position_ += 1; transition.week = parseNumber(); - if (position == length || ruleString[position] != '.') { + if (position_ == length_ || ruleString_[position_] != '.') { throwError("missing second ."); } - position += 1; + position_ += 1; transition.day = parseNumber(); } else { transition.kind = TRANSITION_DAY; transition.day = parseNumber(); } - if (position < length && ruleString[position] == '/') { - position += 1; + if (position_ < length_ && ruleString_[position_] == '/') { + position_ += 1; transition.time = parseOffset(); } else { transition.time = 2 * 60 * 60; @@ -565,7 +565,7 @@ namespace orc { class TimezoneImpl : public Timezone { public: - TimezoneImpl(const std::string& _filename, const std::vector<unsigned char>& buffer); + TimezoneImpl(const std::string& filename, const std::vector<unsigned char>& buffer); virtual ~TimezoneImpl() override; /** @@ -576,11 +576,11 @@ namespace orc { void print(std::ostream&) const override; uint64_t getVersion() const override { - return version; + return version_; } int64_t getEpoch() const override { - return epoch; + return epoch_; } int64_t convertToUTC(int64_t clk) const override { @@ -599,31 +599,31 @@ namespace orc { void parseZoneFile(const unsigned char* ptr, uint64_t sectionOffset, uint64_t fileLength, const VersionParser& version); // filename - std::string filename; + std::string filename_; // the version of the file - uint64_t version; + uint64_t version_; // the list of variants for this timezone - std::vector<TimezoneVariant> variants; + std::vector<TimezoneVariant> variants_; // the list of the times where the local rules change - std::vector<int64_t> transitions; + std::vector<int64_t> transitions_; // the variant that starts at this transition. - std::vector<uint64_t> currentVariant; + std::vector<uint64_t> currentVariant_; // the variant before the first transition - uint64_t ancientVariant; + uint64_t ancientVariant_; // the rule for future times - std::shared_ptr<FutureRule> futureRule; + std::shared_ptr<FutureRule> futureRule_; // the last explicit transition after which we use the future rule - int64_t lastTransition; + int64_t lastTransition_; // The ORC epoch time in this timezone. - int64_t epoch; + int64_t epoch_; }; DIAGNOSTIC_PUSH @@ -639,8 +639,8 @@ namespace orc { // PASS } - TimezoneImpl::TimezoneImpl(const std::string& _filename, const std::vector<unsigned char>& buffer) - : filename(_filename) { + TimezoneImpl::TimezoneImpl(const std::string& filename, const std::vector<unsigned char>& buffer) + : filename_(filename) { parseZoneFile(&buffer[0], 0, buffer.size(), Version1Parser()); // Build the literal for the ORC epoch // 2015 Jan 1 00:00:00 @@ -653,7 +653,7 @@ namespace orc { epochStruct.tm_year = 2015 - 1900; epochStruct.tm_isdst = 0; time_t utcEpoch = timegm(&epochStruct); - epoch = utcEpoch - getVariant(utcEpoch).gmtOffset; + epoch_ = utcEpoch - getVariant(utcEpoch).gmtOffset; } std::string getTimezoneDirectory() { @@ -783,9 +783,9 @@ namespace orc { uint64_t variantCount, uint64_t nameOffset, uint64_t nameCount) { for (uint64_t variant = 0; variant < variantCount; ++variant) { - variants[variant].gmtOffset = + variants_[variant].gmtOffset = static_cast<int32_t>(decode32(ptr + variantOffset + 6 * variant)); - variants[variant].isDst = ptr[variantOffset + 6 * variant + 4] != 0; + variants_[variant].isDst = ptr[variantOffset + 6 * variant + 4] != 0; uint64_t nameStart = ptr[variantOffset + 6 * variant + 5]; if (nameStart >= nameCount) { std::stringstream buffer; @@ -793,7 +793,7 @@ namespace orc { << " >= " << nameCount; throw TimezoneError(buffer.str()); } - variants[variant].name = + variants_[variant].name = std::string(reinterpret_cast<const char*>(ptr) + nameOffset + nameStart); } } @@ -833,7 +833,7 @@ namespace orc { if (fileLength < headerOffset + 6 * 4 || strncmp(reinterpret_cast<const char*>(ptr) + magicOffset, "TZif", 4) != 0) { std::stringstream buffer; - buffer << "non-tzfile " << filename; + buffer << "non-tzfile " << filename_; throw TimezoneError(buffer.str()); } @@ -854,7 +854,7 @@ namespace orc { if (sectionLength > fileLength) { std::stringstream buffer; - buffer << "tzfile too short " << filename << " needs " << sectionLength << " and has " + buffer << "tzfile too short " << filename_ << " needs " << sectionLength << " and has " << fileLength; throw TimezoneError(buffer.str()); } @@ -864,82 +864,82 @@ namespace orc { parseZoneFile(ptr, sectionLength, fileLength, Version2Parser()); return; } - version = versionParser.getVersion(); - variants.resize(variantCount); - transitions.resize(timeCount); - currentVariant.resize(timeCount); + version_ = versionParser.getVersion(); + variants_.resize(variantCount); + transitions_.resize(timeCount); + currentVariant_.resize(timeCount); parseTimeVariants(ptr, variantOffset, variantCount, nameOffset, nameCount); bool foundAncient = false; for (uint64_t t = 0; t < timeCount; ++t) { - transitions[t] = versionParser.parseTime(ptr + timeOffset + t * versionParser.getTimeSize()); - currentVariant[t] = ptr[timeVariantOffset + t]; - if (currentVariant[t] >= variantCount) { + transitions_[t] = versionParser.parseTime(ptr + timeOffset + t * versionParser.getTimeSize()); + currentVariant_[t] = ptr[timeVariantOffset + t]; + if (currentVariant_[t] >= variantCount) { std::stringstream buffer; - buffer << "tzfile rule out of range " << filename << " references rule " - << currentVariant[t] << " of " << variantCount; + buffer << "tzfile rule out of range " << filename_ << " references rule " + << currentVariant_[t] << " of " << variantCount; throw TimezoneError(buffer.str()); } // find the oldest standard time and use that as the ancient value - if (!foundAncient && !variants[currentVariant[t]].isDst) { + if (!foundAncient && !variants_[currentVariant_[t]].isDst) { foundAncient = true; - ancientVariant = currentVariant[t]; + ancientVariant_ = currentVariant_[t]; } } if (!foundAncient) { - ancientVariant = 0; + ancientVariant_ = 0; } - futureRule = parseFutureRule( + futureRule_ = parseFutureRule( versionParser.parseFutureString(ptr, sectionLength, fileLength - sectionLength)); // find the lower bound for applying the future rule - if (futureRule->isDefined()) { + if (futureRule_->isDefined()) { if (timeCount > 0) { - lastTransition = transitions[timeCount - 1]; + lastTransition_ = transitions_[timeCount - 1]; } else { - lastTransition = INT64_MIN; + lastTransition_ = INT64_MIN; } } else { - lastTransition = INT64_MAX; + lastTransition_ = INT64_MAX; } } const TimezoneVariant& TimezoneImpl::getVariant(int64_t clk) const { // if it is after the last explicit entry in the table, // use the future rule to get an answer - if (clk > lastTransition) { - return futureRule->getVariant(clk); + if (clk > lastTransition_) { + return futureRule_->getVariant(clk); } else { - int64_t transition = binarySearch(transitions, clk); + int64_t transition = binarySearch(transitions_, clk); uint64_t idx; if (transition < 0) { - idx = ancientVariant; + idx = ancientVariant_; } else { - idx = currentVariant[static_cast<size_t>(transition)]; + idx = currentVariant_[static_cast<size_t>(transition)]; } - return variants[idx]; + return variants_[idx]; } } void TimezoneImpl::print(std::ostream& out) const { - out << "Timezone file: " << filename << "\n"; - out << " Version: " << version << "\n"; - futureRule->print(out); - for (uint64_t r = 0; r < variants.size(); ++r) { - out << " Variant " << r << ": " << variants[r].toString() << "\n"; + out << "Timezone file: " << filename_ << "\n"; + out << " Version: " << version_ << "\n"; + futureRule_->print(out); + for (uint64_t r = 0; r < variants_.size(); ++r) { + out << " Variant " << r << ": " << variants_[r].toString() << "\n"; } - for (uint64_t t = 0; t < transitions.size(); ++t) { + for (uint64_t t = 0; t < transitions_.size(); ++t) { tm timeStruct; tm* result = nullptr; char buffer[25]; if (sizeof(time_t) >= 8) { - time_t val = transitions[t]; + time_t val = transitions_[t]; result = gmtime_r(&val, &timeStruct); if (result) { strftime(buffer, sizeof(buffer), "%F %H:%M:%S", &timeStruct); } } - out << " Transition: " << (result == nullptr ? "null" : buffer) << " (" << transitions[t] - << ") -> " << variants[currentVariant[t]].name << "\n"; + out << " Transition: " << (result == nullptr ? "null" : buffer) << " (" << transitions_[t] + << ") -> " << variants_[currentVariant_[t]].name << "\n"; } } diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.cc b/contrib/libs/apache/orc/c++/src/TypeImpl.cc index c427a962b5..cbc7b82796 100644 --- a/contrib/libs/apache/orc/c++/src/TypeImpl.cc +++ b/contrib/libs/apache/orc/c++/src/TypeImpl.cc @@ -29,54 +29,54 @@ namespace orc { // PASS } - TypeImpl::TypeImpl(TypeKind _kind) { - parent = nullptr; - columnId = -1; - maximumColumnId = -1; - kind = _kind; - maxLength = 0; - precision = 0; - scale = 0; - subtypeCount = 0; - } - - TypeImpl::TypeImpl(TypeKind _kind, uint64_t _maxLength) { - parent = nullptr; - columnId = -1; - maximumColumnId = -1; - kind = _kind; - maxLength = _maxLength; - precision = 0; - scale = 0; - subtypeCount = 0; - } - - TypeImpl::TypeImpl(TypeKind _kind, uint64_t _precision, uint64_t _scale) { - parent = nullptr; - columnId = -1; - maximumColumnId = -1; - kind = _kind; - maxLength = 0; - precision = _precision; - scale = _scale; - subtypeCount = 0; + TypeImpl::TypeImpl(TypeKind kind) { + parent_ = nullptr; + columnId_ = -1; + maximumColumnId_ = -1; + kind_ = kind; + maxLength_ = 0; + precision_ = 0; + scale_ = 0; + subtypeCount_ = 0; + } + + TypeImpl::TypeImpl(TypeKind kind, uint64_t maxLength) { + parent_ = nullptr; + columnId_ = -1; + maximumColumnId_ = -1; + kind_ = kind; + maxLength_ = maxLength; + precision_ = 0; + scale_ = 0; + subtypeCount_ = 0; + } + + TypeImpl::TypeImpl(TypeKind kind, uint64_t precision, uint64_t scale) { + parent_ = nullptr; + columnId_ = -1; + maximumColumnId_ = -1; + kind_ = kind; + maxLength_ = 0; + precision_ = precision; + scale_ = scale; + subtypeCount_ = 0; } uint64_t TypeImpl::assignIds(uint64_t root) const { - columnId = static_cast<int64_t>(root); + columnId_ = static_cast<int64_t>(root); uint64_t current = root + 1; - for (uint64_t i = 0; i < subtypeCount; ++i) { - current = dynamic_cast<TypeImpl*>(subTypes[i].get())->assignIds(current); + for (uint64_t i = 0; i < subtypeCount_; ++i) { + current = dynamic_cast<TypeImpl*>(subTypes_[i].get())->assignIds(current); } - maximumColumnId = static_cast<int64_t>(current) - 1; + maximumColumnId_ = static_cast<int64_t>(current) - 1; return current; } void TypeImpl::ensureIdAssigned() const { - if (columnId == -1) { + if (columnId_ == -1) { const TypeImpl* root = this; - while (root->parent != nullptr) { - root = root->parent; + while (root->parent_ != nullptr) { + root = root->parent_; } root->assignIds(0); } @@ -84,94 +84,94 @@ namespace orc { uint64_t TypeImpl::getColumnId() const { ensureIdAssigned(); - return static_cast<uint64_t>(columnId); + return static_cast<uint64_t>(columnId_); } uint64_t TypeImpl::getMaximumColumnId() const { ensureIdAssigned(); - return static_cast<uint64_t>(maximumColumnId); + return static_cast<uint64_t>(maximumColumnId_); } TypeKind TypeImpl::getKind() const { - return kind; + return kind_; } uint64_t TypeImpl::getSubtypeCount() const { - return subtypeCount; + return subtypeCount_; } const Type* TypeImpl::getSubtype(uint64_t i) const { - return subTypes[i].get(); + return subTypes_[i].get(); } const std::string& TypeImpl::getFieldName(uint64_t i) const { - return fieldNames[i]; + return fieldNames_[i]; } uint64_t TypeImpl::getMaximumLength() const { - return maxLength; + return maxLength_; } uint64_t TypeImpl::getPrecision() const { - return precision; + return precision_; } uint64_t TypeImpl::getScale() const { - return scale; + return scale_; } Type& TypeImpl::setAttribute(const std::string& key, const std::string& value) { - attributes[key] = value; + attributes_[key] = value; return *this; } bool TypeImpl::hasAttributeKey(const std::string& key) const { - return attributes.find(key) != attributes.end(); + return attributes_.find(key) != attributes_.end(); } Type& TypeImpl::removeAttribute(const std::string& key) { - auto it = attributes.find(key); - if (it == attributes.end()) { + auto it = attributes_.find(key); + if (it == attributes_.end()) { throw std::range_error("Key not found: " + key); } - attributes.erase(it); + attributes_.erase(it); return *this; } std::vector<std::string> TypeImpl::getAttributeKeys() const { std::vector<std::string> ret; - ret.reserve(attributes.size()); - for (auto& attribute : attributes) { + ret.reserve(attributes_.size()); + for (auto& attribute : attributes_) { ret.push_back(attribute.first); } return ret; } std::string TypeImpl::getAttributeValue(const std::string& key) const { - auto it = attributes.find(key); - if (it == attributes.end()) { + auto it = attributes_.find(key); + if (it == attributes_.end()) { throw std::range_error("Key not found: " + key); } return it->second; } - void TypeImpl::setIds(uint64_t _columnId, uint64_t _maxColumnId) { - columnId = static_cast<int64_t>(_columnId); - maximumColumnId = static_cast<int64_t>(_maxColumnId); + void TypeImpl::setIds(uint64_t columnId, uint64_t maxColumnId) { + columnId_ = static_cast<int64_t>(columnId); + maximumColumnId_ = static_cast<int64_t>(maxColumnId); } void TypeImpl::addChildType(std::unique_ptr<Type> childType) { TypeImpl* child = dynamic_cast<TypeImpl*>(childType.get()); - subTypes.push_back(std::move(childType)); + subTypes_.push_back(std::move(childType)); if (child != nullptr) { - child->parent = this; + child->parent_ = this; } - subtypeCount += 1; + subtypeCount_ += 1; } Type* TypeImpl::addStructField(const std::string& fieldName, std::unique_ptr<Type> fieldType) { addChildType(std::move(fieldType)); - fieldNames.push_back(fieldName); + fieldNames_.push_back(fieldName); return this; } @@ -190,7 +190,7 @@ namespace orc { } std::string TypeImpl::toString() const { - switch (static_cast<int64_t>(kind)) { + switch (static_cast<int64_t>(kind_)) { case BOOLEAN: return "boolean"; case BYTE: @@ -214,20 +214,20 @@ namespace orc { case TIMESTAMP_INSTANT: return "timestamp with local time zone"; case LIST: - return "array<" + (subTypes[0] ? subTypes[0]->toString() : "void") + ">"; + return "array<" + (subTypes_[0] ? subTypes_[0]->toString() : "void") + ">"; case MAP: - return "map<" + (subTypes[0] ? subTypes[0]->toString() : "void") + "," + - (subTypes[1] ? subTypes[1]->toString() : "void") + ">"; + return "map<" + (subTypes_[0] ? subTypes_[0]->toString() : "void") + "," + + (subTypes_[1] ? subTypes_[1]->toString() : "void") + ">"; case STRUCT: { std::string result = "struct<"; - for (size_t i = 0; i < subTypes.size(); ++i) { + for (size_t i = 0; i < subTypes_.size(); ++i) { if (i != 0) { result += ","; } - if (isUnquotedFieldName(fieldNames[i])) { - result += fieldNames[i]; + if (isUnquotedFieldName(fieldNames_[i])) { + result += fieldNames_[i]; } else { - std::string name(fieldNames[i]); + std::string name(fieldNames_[i]); size_t pos = 0; while ((pos = name.find("`", pos)) != std::string::npos) { name.replace(pos, 1, "``"); @@ -238,37 +238,37 @@ namespace orc { result += "`"; } result += ":"; - result += subTypes[i]->toString(); + result += subTypes_[i]->toString(); } result += ">"; return result; } case UNION: { std::string result = "uniontype<"; - for (size_t i = 0; i < subTypes.size(); ++i) { + for (size_t i = 0; i < subTypes_.size(); ++i) { if (i != 0) { result += ","; } - result += subTypes[i]->toString(); + result += subTypes_[i]->toString(); } result += ">"; return result; } case DECIMAL: { std::stringstream result; - result << "decimal(" << precision << "," << scale << ")"; + result << "decimal(" << precision_ << "," << scale_ << ")"; return result.str(); } case DATE: return "date"; case VARCHAR: { std::stringstream result; - result << "varchar(" << maxLength << ")"; + result << "varchar(" << maxLength_ << ")"; return result.str(); } case CHAR: { std::stringstream result; - result << "char(" << maxLength << ")"; + result << "char(" << maxLength_ << ")"; return result.str(); } default: @@ -285,7 +285,7 @@ namespace orc { std::unique_ptr<ColumnVectorBatch> TypeImpl::createRowBatch(uint64_t capacity, MemoryPool& memoryPool, bool encoded, bool useTightNumericVector) const { - switch (static_cast<int64_t>(kind)) { + switch (static_cast<int64_t>(kind_)) { case BOOLEAN: if (useTightNumericVector) { return std::make_unique<ByteVectorBatch>(capacity, memoryPool); @@ -660,7 +660,8 @@ namespace orc { std::pair<std::string, size_t> nameRes = parseName(input, pos, end); pos = nameRes.second; if (input[pos] != ':') { - throw std::logic_error("Invalid struct type. No field name set."); + throw std::logic_error("Invalid struct type. Field name can not contain '" + + std::string(1, input[pos]) + "'."); } std::pair<std::unique_ptr<Type>, size_t> typeRes = TypeImpl::parseType(input, ++pos, end); result->addStructField(nameRes.first, std::move(typeRes.first)); diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.hh b/contrib/libs/apache/orc/c++/src/TypeImpl.hh index 6d0743793a..647d5a5d2c 100644 --- a/contrib/libs/apache/orc/c++/src/TypeImpl.hh +++ b/contrib/libs/apache/orc/c++/src/TypeImpl.hh @@ -30,17 +30,17 @@ namespace orc { class TypeImpl : public Type { private: - TypeImpl* parent; - mutable int64_t columnId; - mutable int64_t maximumColumnId; - TypeKind kind; - std::vector<std::unique_ptr<Type>> subTypes; - std::vector<std::string> fieldNames; - uint64_t subtypeCount; - uint64_t maxLength; - uint64_t precision; - uint64_t scale; - std::map<std::string, std::string> attributes; + TypeImpl* parent_; + mutable int64_t columnId_; + mutable int64_t maximumColumnId_; + TypeKind kind_; + std::vector<std::unique_ptr<Type>> subTypes_; + std::vector<std::string> fieldNames_; + uint64_t subtypeCount_; + uint64_t maxLength_; + uint64_t precision_; + uint64_t scale_; + std::map<std::string, std::string> attributes_; public: /** diff --git a/contrib/libs/apache/orc/c++/src/Utils.hh b/contrib/libs/apache/orc/c++/src/Utils.hh index 751c09b205..851d0af15c 100644 --- a/contrib/libs/apache/orc/c++/src/Utils.hh +++ b/contrib/libs/apache/orc/c++/src/Utils.hh @@ -21,38 +21,39 @@ #include <atomic> #include <chrono> +#include <stdexcept> namespace orc { class AutoStopwatch { - std::chrono::high_resolution_clock::time_point start; - std::atomic<uint64_t>* latencyUs; - std::atomic<uint64_t>* count; - bool minus; + std::chrono::high_resolution_clock::time_point start_; + std::atomic<uint64_t>* latencyUs_; + std::atomic<uint64_t>* count_; + bool minus_; public: - AutoStopwatch(std::atomic<uint64_t>* _latencyUs, std::atomic<uint64_t>* _count, - bool _minus = false) - : latencyUs(_latencyUs), count(_count), minus(_minus) { - if (latencyUs) { - start = std::chrono::high_resolution_clock::now(); + AutoStopwatch(std::atomic<uint64_t>* latencyUs, std::atomic<uint64_t>* count, + bool minus = false) + : latencyUs_(latencyUs), count_(count), minus_(minus) { + if (latencyUs_) { + start_ = std::chrono::high_resolution_clock::now(); } } ~AutoStopwatch() { - if (latencyUs) { + if (latencyUs_) { std::chrono::microseconds elapsedTime = std::chrono::duration_cast<std::chrono::microseconds>( - std::chrono::high_resolution_clock::now() - start); - if (!minus) { - latencyUs->fetch_add(static_cast<uint64_t>(elapsedTime.count())); + std::chrono::high_resolution_clock::now() - start_); + if (!minus_) { + latencyUs_->fetch_add(static_cast<uint64_t>(elapsedTime.count())); } else { - latencyUs->fetch_sub(static_cast<uint64_t>(elapsedTime.count())); + latencyUs_->fetch_sub(static_cast<uint64_t>(elapsedTime.count())); } } - if (count) { - count->fetch_add(1); + if (count_) { + count_->fetch_add(1); } } }; @@ -70,6 +71,75 @@ namespace orc { #define SCOPED_MINUS_STOPWATCH(METRICS_PTR, LATENCY_VAR) #endif + struct Utf8Utils { + /** + * Counts how many utf-8 chars of the input data + */ + static uint64_t charLength(const char* data, uint64_t length) { + uint64_t chars = 0; + for (uint64_t i = 0; i < length; i++) { + if (isUtfStartByte(data[i])) { + chars++; + } + } + return chars; + } + + /** + * Return the number of bytes required to read at most maxCharLength + * characters in full from a utf-8 encoded byte array provided + * by data. This does not validate utf-8 data, but + * operates correctly on already valid utf-8 data. + * + * @param maxCharLength number of characters required + * @param data the bytes of UTF-8 + * @param length the length of data to truncate + */ + static uint64_t truncateBytesTo(uint64_t maxCharLength, const char* data, uint64_t length) { + uint64_t chars = 0; + if (length <= maxCharLength) { + return length; + } + for (uint64_t i = 0; i < length; i++) { + if (isUtfStartByte(data[i])) { + chars++; + } + if (chars > maxCharLength) { + return i; + } + } + // everything fits + return length; + } + + /** + * Checks if b is the first byte of a UTF-8 character. + */ + inline static bool isUtfStartByte(char b) { + return (b & 0xC0) != 0x80; + } + + /** + * Find the start of the last character that ends in the current string. + * @param text the bytes of the utf-8 + * @param from the first byte location + * @param until the last byte location + * @return the index of the last character + */ + static uint64_t findLastCharacter(const char* text, uint64_t from, uint64_t until) { + uint64_t posn = until; + /* we don't expect characters more than 5 bytes */ + while (posn >= from) { + if (isUtfStartByte(text[posn])) { + return posn; + } + posn -= 1; + } + /* beginning of a valid char not found */ + throw std::logic_error("Could not truncate string, beginning of a valid char not found"); + } + }; + } // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/src/Vector.cc b/contrib/libs/apache/orc/c++/src/Vector.cc index b9e2854586..49f47aeb03 100644 --- a/contrib/libs/apache/orc/c++/src/Vector.cc +++ b/contrib/libs/apache/orc/c++/src/Vector.cc @@ -34,6 +34,7 @@ namespace orc { notNull(pool, cap), hasNulls(false), isEncoded(false), + dictionaryDecoded(false), memoryPool(pool) { std::memset(notNull.data(), 1, capacity); } @@ -61,13 +62,20 @@ namespace orc { return false; } + void ColumnVectorBatch::decodeDictionary() { + if (dictionaryDecoded) return; + + decodeDictionaryImpl(); + dictionaryDecoded = true; + } + StringDictionary::StringDictionary(MemoryPool& pool) : dictionaryBlob(pool), dictionaryOffset(pool) { // PASS } - EncodedStringVectorBatch::EncodedStringVectorBatch(uint64_t _capacity, MemoryPool& pool) - : StringVectorBatch(_capacity, pool), dictionary(), index(pool, _capacity) { + EncodedStringVectorBatch::EncodedStringVectorBatch(uint64_t capacity, MemoryPool& pool) + : StringVectorBatch(capacity, pool), dictionary(), index(pool, capacity) { // PASS } @@ -88,10 +96,21 @@ namespace orc { } } - StringVectorBatch::StringVectorBatch(uint64_t _capacity, MemoryPool& pool) - : ColumnVectorBatch(_capacity, pool), - data(pool, _capacity), - length(pool, _capacity), + void EncodedStringVectorBatch::decodeDictionaryImpl() { + size_t n = index.size(); + resize(n); + + for (size_t i = 0; i < n; ++i) { + if (!hasNulls || notNull[i]) { + dictionary->getValueByIndex(index[i], data[i], length[i]); + } + } + } + + StringVectorBatch::StringVectorBatch(uint64_t capacity, MemoryPool& pool) + : ColumnVectorBatch(capacity, pool), + data(pool, capacity), + length(pool, capacity), blob(pool) { // PASS } @@ -174,6 +193,12 @@ namespace orc { return false; } + void StructVectorBatch::decodeDictionaryImpl() { + for (const auto& field : fields) { + field->decodeDictionary(); + } + } + ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool) : ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) { offsets.zeroOut(); @@ -211,6 +236,10 @@ namespace orc { return true; } + void ListVectorBatch::decodeDictionaryImpl() { + elements->decodeDictionary(); + } + MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool) : ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) { offsets.zeroOut(); @@ -251,6 +280,16 @@ namespace orc { return true; } + void MapVectorBatch::decodeDictionaryImpl() { + if (keys) { + keys->decodeDictionary(); + } + + if (elements) { + elements->decodeDictionary(); + } + } + UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool) : ColumnVectorBatch(cap, pool), tags(pool, cap), offsets(pool, cap) { tags.zeroOut(); @@ -310,6 +349,12 @@ namespace orc { return false; } + void UnionVectorBatch::decodeDictionaryImpl() { + for (const auto& child : children) { + child->decodeDictionary(); + } + } + Decimal64VectorBatch::Decimal64VectorBatch(uint64_t cap, MemoryPool& pool) : ColumnVectorBatch(cap, pool), precision(0), @@ -383,7 +428,7 @@ namespace orc { readScales.capacity() * sizeof(int64_t)); } - Decimal::Decimal(const Int128& _value, int32_t _scale) : value(_value), scale(_scale) { + Decimal::Decimal(const Int128& value, int32_t scale) : value(value), scale(scale) { // PASS } @@ -408,8 +453,8 @@ namespace orc { return value.toDecimalString(scale, trimTrailingZeros); } - TimestampVectorBatch::TimestampVectorBatch(uint64_t _capacity, MemoryPool& pool) - : ColumnVectorBatch(_capacity, pool), data(pool, _capacity), nanoseconds(pool, _capacity) { + TimestampVectorBatch::TimestampVectorBatch(uint64_t capacity, MemoryPool& pool) + : ColumnVectorBatch(capacity, pool), data(pool, capacity), nanoseconds(pool, capacity) { // PASS } diff --git a/contrib/libs/apache/orc/c++/src/Writer.cc b/contrib/libs/apache/orc/c++/src/Writer.cc index 89eb3781cf..775e6d2452 100644 --- a/contrib/libs/apache/orc/c++/src/Writer.cc +++ b/contrib/libs/apache/orc/c++/src/Writer.cc @@ -46,6 +46,8 @@ namespace orc { WriterMetrics* metrics; bool useTightNumericVector; uint64_t outputBufferCapacity; + uint64_t memoryBlockSize; + bool alignBlockBoundToRowGroup; WriterOptionsPrivate() : fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12 stripeSize = 64 * 1024 * 1024; // 64M @@ -67,28 +69,30 @@ namespace orc { metrics = nullptr; useTightNumericVector = false; outputBufferCapacity = 1024 * 1024; + memoryBlockSize = 64 * 1024; // 64K + alignBlockBoundToRowGroup = false; } }; WriterOptions::WriterOptions() - : privateBits(std::unique_ptr<WriterOptionsPrivate>(new WriterOptionsPrivate())) { + : privateBits_(std::unique_ptr<WriterOptionsPrivate>(new WriterOptionsPrivate())) { // PASS } WriterOptions::WriterOptions(const WriterOptions& rhs) - : privateBits(std::unique_ptr<WriterOptionsPrivate>( - new WriterOptionsPrivate(*(rhs.privateBits.get())))) { + : privateBits_(std::unique_ptr<WriterOptionsPrivate>( + new WriterOptionsPrivate(*(rhs.privateBits_.get())))) { // PASS } WriterOptions::WriterOptions(WriterOptions& rhs) { // swap privateBits with rhs - privateBits.swap(rhs.privateBits); + privateBits_.swap(rhs.privateBits_); } WriterOptions& WriterOptions::operator=(const WriterOptions& rhs) { if (this != &rhs) { - privateBits.reset(new WriterOptionsPrivate(*(rhs.privateBits.get()))); + privateBits_.reset(new WriterOptionsPrivate(*(rhs.privateBits_.get()))); } return *this; } @@ -97,7 +101,7 @@ namespace orc { // PASS } RleVersion WriterOptions::getRleVersion() const { - if (privateBits->fileVersion == FileVersion::v_0_11()) { + if (privateBits_->fileVersion == FileVersion::v_0_11()) { return RleVersion_1; } @@ -105,186 +109,204 @@ namespace orc { } WriterOptions& WriterOptions::setStripeSize(uint64_t size) { - privateBits->stripeSize = size; + privateBits_->stripeSize = size; return *this; } uint64_t WriterOptions::getStripeSize() const { - return privateBits->stripeSize; + return privateBits_->stripeSize; } WriterOptions& WriterOptions::setCompressionBlockSize(uint64_t size) { if (size >= (1 << 23)) { throw std::invalid_argument("Compression block size cannot be greater or equal than 8M"); } - privateBits->compressionBlockSize = size; + privateBits_->compressionBlockSize = size; return *this; } uint64_t WriterOptions::getCompressionBlockSize() const { - return privateBits->compressionBlockSize; + return privateBits_->compressionBlockSize; } WriterOptions& WriterOptions::setRowIndexStride(uint64_t stride) { - privateBits->rowIndexStride = stride; - privateBits->enableIndex = (stride != 0); + privateBits_->rowIndexStride = stride; + privateBits_->enableIndex = (stride != 0); return *this; } uint64_t WriterOptions::getRowIndexStride() const { - return privateBits->rowIndexStride; + return privateBits_->rowIndexStride; } WriterOptions& WriterOptions::setDictionaryKeySizeThreshold(double val) { - privateBits->dictionaryKeySizeThreshold = val; + privateBits_->dictionaryKeySizeThreshold = val; return *this; } double WriterOptions::getDictionaryKeySizeThreshold() const { - return privateBits->dictionaryKeySizeThreshold; + return privateBits_->dictionaryKeySizeThreshold; } WriterOptions& WriterOptions::setFileVersion(const FileVersion& version) { // Only Hive_0_11 and Hive_0_12 version are supported currently if (version.getMajor() == 0 && (version.getMinor() == 11 || version.getMinor() == 12)) { - privateBits->fileVersion = version; + privateBits_->fileVersion = version; return *this; } if (version == FileVersion::UNSTABLE_PRE_2_0()) { - *privateBits->errorStream << "Warning: ORC files written in " - << FileVersion::UNSTABLE_PRE_2_0().toString() - << " will not be readable by other versions of the software." - << " It is only for developer testing.\n"; - privateBits->fileVersion = version; + *privateBits_->errorStream << "Warning: ORC files written in " + << FileVersion::UNSTABLE_PRE_2_0().toString() + << " will not be readable by other versions of the software." + << " It is only for developer testing.\n"; + privateBits_->fileVersion = version; return *this; } throw std::logic_error("Unsupported file version specified."); } FileVersion WriterOptions::getFileVersion() const { - return privateBits->fileVersion; + return privateBits_->fileVersion; } WriterOptions& WriterOptions::setCompression(CompressionKind comp) { - privateBits->compression = comp; + privateBits_->compression = comp; return *this; } CompressionKind WriterOptions::getCompression() const { - return privateBits->compression; + return privateBits_->compression; } WriterOptions& WriterOptions::setCompressionStrategy(CompressionStrategy strategy) { - privateBits->compressionStrategy = strategy; + privateBits_->compressionStrategy = strategy; return *this; } CompressionStrategy WriterOptions::getCompressionStrategy() const { - return privateBits->compressionStrategy; + return privateBits_->compressionStrategy; } bool WriterOptions::getAlignedBitpacking() const { - return privateBits->compressionStrategy == CompressionStrategy ::CompressionStrategy_SPEED; + return privateBits_->compressionStrategy == CompressionStrategy ::CompressionStrategy_SPEED; } WriterOptions& WriterOptions::setPaddingTolerance(double tolerance) { - privateBits->paddingTolerance = tolerance; + privateBits_->paddingTolerance = tolerance; return *this; } double WriterOptions::getPaddingTolerance() const { - return privateBits->paddingTolerance; + return privateBits_->paddingTolerance; } WriterOptions& WriterOptions::setMemoryPool(MemoryPool* memoryPool) { - privateBits->memoryPool = memoryPool; + privateBits_->memoryPool = memoryPool; return *this; } MemoryPool* WriterOptions::getMemoryPool() const { - return privateBits->memoryPool; + return privateBits_->memoryPool; } WriterOptions& WriterOptions::setErrorStream(std::ostream& errStream) { - privateBits->errorStream = &errStream; + privateBits_->errorStream = &errStream; return *this; } std::ostream* WriterOptions::getErrorStream() const { - return privateBits->errorStream; + return privateBits_->errorStream; } bool WriterOptions::getEnableIndex() const { - return privateBits->enableIndex; + return privateBits_->enableIndex; } bool WriterOptions::getEnableDictionary() const { - return privateBits->dictionaryKeySizeThreshold > 0.0; + return privateBits_->dictionaryKeySizeThreshold > 0.0; } WriterOptions& WriterOptions::setColumnsUseBloomFilter(const std::set<uint64_t>& columns) { - privateBits->columnsUseBloomFilter = columns; + privateBits_->columnsUseBloomFilter = columns; return *this; } bool WriterOptions::isColumnUseBloomFilter(uint64_t column) const { - return privateBits->columnsUseBloomFilter.find(column) != - privateBits->columnsUseBloomFilter.end(); + return privateBits_->columnsUseBloomFilter.find(column) != + privateBits_->columnsUseBloomFilter.end(); } WriterOptions& WriterOptions::setBloomFilterFPP(double fpp) { - privateBits->bloomFilterFalsePositiveProb = fpp; + privateBits_->bloomFilterFalsePositiveProb = fpp; return *this; } double WriterOptions::getBloomFilterFPP() const { - return privateBits->bloomFilterFalsePositiveProb; + return privateBits_->bloomFilterFalsePositiveProb; } // delibrately not provide setter to write bloom filter version because // we only support UTF8 for now. BloomFilterVersion WriterOptions::getBloomFilterVersion() const { - return privateBits->bloomFilterVersion; + return privateBits_->bloomFilterVersion; } const Timezone& WriterOptions::getTimezone() const { - return getTimezoneByName(privateBits->timezone); + return getTimezoneByName(privateBits_->timezone); } const std::string& WriterOptions::getTimezoneName() const { - return privateBits->timezone; + return privateBits_->timezone; } WriterOptions& WriterOptions::setTimezoneName(const std::string& zone) { - privateBits->timezone = zone; + privateBits_->timezone = zone; return *this; } WriterMetrics* WriterOptions::getWriterMetrics() const { - return privateBits->metrics; + return privateBits_->metrics; } WriterOptions& WriterOptions::setWriterMetrics(WriterMetrics* metrics) { - privateBits->metrics = metrics; + privateBits_->metrics = metrics; return *this; } WriterOptions& WriterOptions::setUseTightNumericVector(bool useTightNumericVector) { - privateBits->useTightNumericVector = useTightNumericVector; + privateBits_->useTightNumericVector = useTightNumericVector; return *this; } bool WriterOptions::getUseTightNumericVector() const { - return privateBits->useTightNumericVector; + return privateBits_->useTightNumericVector; } WriterOptions& WriterOptions::setOutputBufferCapacity(uint64_t capacity) { - privateBits->outputBufferCapacity = capacity; + privateBits_->outputBufferCapacity = capacity; return *this; } uint64_t WriterOptions::getOutputBufferCapacity() const { - return privateBits->outputBufferCapacity; + return privateBits_->outputBufferCapacity; + } + + WriterOptions& WriterOptions::setMemoryBlockSize(uint64_t capacity) { + privateBits_->memoryBlockSize = capacity; + return *this; + } + + uint64_t WriterOptions::getMemoryBlockSize() const { + return privateBits_->memoryBlockSize; + } + + WriterOptions& WriterOptions::setAlignBlockBoundToRowGroup(bool alignBlockBoundToRowGroup) { + privateBits_->alignBlockBoundToRowGroup = alignBlockBoundToRowGroup; + return *this; + } + + bool WriterOptions::getAlignBlockBoundToRowGroup() const { + return privateBits_->alignBlockBoundToRowGroup; } Writer::~Writer() { @@ -293,25 +315,25 @@ namespace orc { class WriterImpl : public Writer { private: - std::unique_ptr<ColumnWriter> columnWriter; - std::unique_ptr<BufferedOutputStream> compressionStream; - std::unique_ptr<BufferedOutputStream> bufferedStream; - std::unique_ptr<StreamsFactory> streamsFactory; - OutputStream* outStream; - WriterOptions options; - const Type& type; - uint64_t stripeRows, totalRows, indexRows; - uint64_t currentOffset; - proto::Footer fileFooter; - proto::PostScript postScript; - proto::StripeInformation stripeInfo; - proto::Metadata metadata; + std::unique_ptr<ColumnWriter> columnWriter_; + std::unique_ptr<BufferedOutputStream> compressionStream_; + std::unique_ptr<BufferedOutputStream> bufferedStream_; + std::unique_ptr<StreamsFactory> streamsFactory_; + OutputStream* outStream_; + WriterOptions options_; + const Type& type_; + uint64_t stripeRows_, totalRows_, indexRows_; + uint64_t currentOffset_; + proto::Footer fileFooter_; + proto::PostScript postScript_; + proto::StripeInformation stripeInfo_; + proto::Metadata metadata_; static const char* magicId; static const WriterId writerId; - bool useTightNumericVector; - int32_t stripesAtLastFlush; - uint64_t lastFlushOffset; + bool useTightNumericVector_; + int32_t stripesAtLastFlush_; + uint64_t lastFlushOffset_; public: WriterImpl(const Type& type, OutputStream* stream, const WriterOptions& options); @@ -342,93 +364,101 @@ namespace orc { const WriterId WriterImpl::writerId = WriterId::ORC_CPP_WRITER; WriterImpl::WriterImpl(const Type& t, OutputStream* stream, const WriterOptions& opts) - : outStream(stream), options(opts), type(t) { - streamsFactory = createStreamsFactory(options, outStream); - columnWriter = buildWriter(type, *streamsFactory, options); - stripeRows = totalRows = indexRows = 0; - currentOffset = 0; - stripesAtLastFlush = 0; - lastFlushOffset = 0; - - useTightNumericVector = opts.getUseTightNumericVector(); + : outStream_(stream), options_(opts), type_(t) { + streamsFactory_ = createStreamsFactory(options_, outStream_); + columnWriter_ = buildWriter(type_, *streamsFactory_, options_); + stripeRows_ = totalRows_ = indexRows_ = 0; + currentOffset_ = 0; + stripesAtLastFlush_ = 0; + lastFlushOffset_ = 0; + + useTightNumericVector_ = opts.getUseTightNumericVector(); + + if (options_.getCompressionBlockSize() % options_.getMemoryBlockSize() != 0) { + throw std::invalid_argument( + "Compression block size must be a multiple of memory block size."); + } // compression stream for stripe footer, file footer and metadata - compressionStream = - createCompressor(options.getCompression(), outStream, options.getCompressionStrategy(), - options.getOutputBufferCapacity(), options.getCompressionBlockSize(), - *options.getMemoryPool(), options.getWriterMetrics()); + compressionStream_ = createCompressor( + options_.getCompression(), outStream_, options_.getCompressionStrategy(), + options_.getOutputBufferCapacity(), options_.getCompressionBlockSize(), + options_.getMemoryBlockSize(), *options_.getMemoryPool(), options_.getWriterMetrics()); // uncompressed stream for post script - bufferedStream.reset(new BufferedOutputStream(*options.getMemoryPool(), outStream, - 1024, // buffer capacity: 1024 bytes - options.getCompressionBlockSize(), - options.getWriterMetrics())); + bufferedStream_.reset(new BufferedOutputStream(*options_.getMemoryPool(), outStream_, + 1024, // buffer capacity: 1024 bytes + options_.getCompressionBlockSize(), + options_.getWriterMetrics())); init(); } std::unique_ptr<ColumnVectorBatch> WriterImpl::createRowBatch(uint64_t size) const { - return type.createRowBatch(size, *options.getMemoryPool(), false, useTightNumericVector); + return type_.createRowBatch(size, *options_.getMemoryPool(), false, useTightNumericVector_); } void WriterImpl::add(ColumnVectorBatch& rowsToAdd) { - if (options.getEnableIndex()) { + if (options_.getEnableIndex()) { uint64_t pos = 0; uint64_t chunkSize = 0; - uint64_t rowIndexStride = options.getRowIndexStride(); + uint64_t rowIndexStride = options_.getRowIndexStride(); while (pos < rowsToAdd.numElements) { - chunkSize = std::min(rowsToAdd.numElements - pos, rowIndexStride - indexRows); - columnWriter->add(rowsToAdd, pos, chunkSize, nullptr); + chunkSize = std::min(rowsToAdd.numElements - pos, rowIndexStride - indexRows_); + columnWriter_->add(rowsToAdd, pos, chunkSize, nullptr); pos += chunkSize; - indexRows += chunkSize; - stripeRows += chunkSize; - - if (indexRows >= rowIndexStride) { - columnWriter->createRowIndexEntry(); - indexRows = 0; + indexRows_ += chunkSize; + stripeRows_ += chunkSize; + + if (indexRows_ >= rowIndexStride) { + if (options_.getAlignBlockBoundToRowGroup()) { + columnWriter_->finishStreams(); + } + columnWriter_->createRowIndexEntry(); + indexRows_ = 0; } } } else { - stripeRows += rowsToAdd.numElements; - columnWriter->add(rowsToAdd, 0, rowsToAdd.numElements, nullptr); + stripeRows_ += rowsToAdd.numElements; + columnWriter_->add(rowsToAdd, 0, rowsToAdd.numElements, nullptr); } - if (columnWriter->getEstimatedSize() >= options.getStripeSize()) { + if (columnWriter_->getEstimatedSize() >= options_.getStripeSize()) { writeStripe(); } } void WriterImpl::close() { - if (stripeRows > 0) { + if (stripeRows_ > 0) { writeStripe(); } writeMetadata(); writeFileFooter(); writePostscript(); - outStream->close(); + outStream_->close(); } uint64_t WriterImpl::writeIntermediateFooter() { - if (stripeRows > 0) { + if (stripeRows_ > 0) { writeStripe(); } - if (stripesAtLastFlush != fileFooter.stripes_size()) { + if (stripesAtLastFlush_ != fileFooter_.stripes_size()) { writeMetadata(); writeFileFooter(); writePostscript(); - stripesAtLastFlush = fileFooter.stripes_size(); - outStream->flush(); - lastFlushOffset = outStream->getLength(); - currentOffset = lastFlushOffset; + stripesAtLastFlush_ = fileFooter_.stripes_size(); + outStream_->flush(); + lastFlushOffset_ = outStream_->getLength(); + currentOffset_ = lastFlushOffset_; // init stripe now that we adjusted the currentOffset initStripe(); } - return lastFlushOffset; + return lastFlushOffset_; } void WriterImpl::addUserMetadata(const std::string& name, const std::string& value) { - proto::UserMetadataItem* userMetadataItem = fileFooter.add_metadata(); + proto::UserMetadataItem* userMetadataItem = fileFooter_.add_metadata(); userMetadataItem->set_name(name); userMetadataItem->set_value(value); } @@ -437,65 +467,65 @@ namespace orc { // Write file header const static size_t magicIdLength = strlen(WriterImpl::magicId); { - SCOPED_STOPWATCH(options.getWriterMetrics(), IOBlockingLatencyUs, IOCount); - outStream->write(WriterImpl::magicId, magicIdLength); + SCOPED_STOPWATCH(options_.getWriterMetrics(), IOBlockingLatencyUs, IOCount); + outStream_->write(WriterImpl::magicId, magicIdLength); } - currentOffset += magicIdLength; + currentOffset_ += magicIdLength; // Initialize file footer - fileFooter.set_header_length(currentOffset); - fileFooter.set_content_length(0); - fileFooter.set_number_of_rows(0); - fileFooter.set_row_index_stride(static_cast<uint32_t>(options.getRowIndexStride())); - fileFooter.set_writer(writerId); - fileFooter.set_software_version(ORC_VERSION); + fileFooter_.set_header_length(currentOffset_); + fileFooter_.set_content_length(0); + fileFooter_.set_number_of_rows(0); + fileFooter_.set_row_index_stride(static_cast<uint32_t>(options_.getRowIndexStride())); + fileFooter_.set_writer(writerId); + fileFooter_.set_software_version(ORC_VERSION); uint32_t index = 0; - buildFooterType(type, fileFooter, index); + buildFooterType(type_, fileFooter_, index); // Initialize post script - postScript.set_footer_length(0); - postScript.set_compression(WriterImpl::convertCompressionKind(options.getCompression())); - postScript.set_compression_block_size(options.getCompressionBlockSize()); + postScript_.set_footer_length(0); + postScript_.set_compression(WriterImpl::convertCompressionKind(options_.getCompression())); + postScript_.set_compression_block_size(options_.getCompressionBlockSize()); - postScript.add_version(options.getFileVersion().getMajor()); - postScript.add_version(options.getFileVersion().getMinor()); + postScript_.add_version(options_.getFileVersion().getMajor()); + postScript_.add_version(options_.getFileVersion().getMinor()); - postScript.set_writer_version(WriterVersion_ORC_135); - postScript.set_magic("ORC"); + postScript_.set_writer_version(WriterVersion_ORC_135); + postScript_.set_magic("ORC"); // Initialize first stripe initStripe(); } void WriterImpl::initStripe() { - stripeInfo.set_offset(currentOffset); - stripeInfo.set_index_length(0); - stripeInfo.set_data_length(0); - stripeInfo.set_footer_length(0); - stripeInfo.set_number_of_rows(0); + stripeInfo_.set_offset(currentOffset_); + stripeInfo_.set_index_length(0); + stripeInfo_.set_data_length(0); + stripeInfo_.set_footer_length(0); + stripeInfo_.set_number_of_rows(0); - stripeRows = indexRows = 0; + stripeRows_ = indexRows_ = 0; } void WriterImpl::writeStripe() { - if (options.getEnableIndex() && indexRows != 0) { - columnWriter->createRowIndexEntry(); - indexRows = 0; + if (options_.getEnableIndex() && indexRows_ != 0) { + columnWriter_->createRowIndexEntry(); + indexRows_ = 0; } else { - columnWriter->mergeRowGroupStatsIntoStripeStats(); + columnWriter_->mergeRowGroupStatsIntoStripeStats(); } // dictionary should be written before any stream is flushed - columnWriter->writeDictionary(); + columnWriter_->writeDictionary(); std::vector<proto::Stream> streams; // write ROW_INDEX streams - if (options.getEnableIndex()) { - columnWriter->writeIndex(streams); + if (options_.getEnableIndex()) { + columnWriter_->writeIndex(streams); } // write streams like PRESENT, DATA, etc. - columnWriter->flush(streams); + columnWriter_->flush(streams); // generate and write stripe footer proto::StripeFooter stripeFooter; @@ -504,28 +534,28 @@ namespace orc { } std::vector<proto::ColumnEncoding> encodings; - columnWriter->getColumnEncoding(encodings); + columnWriter_->getColumnEncoding(encodings); for (uint32_t i = 0; i < encodings.size(); ++i) { *stripeFooter.add_columns() = encodings[i]; } - stripeFooter.set_writer_timezone(options.getTimezoneName()); + stripeFooter.set_writer_timezone(options_.getTimezoneName()); // add stripe statistics to metadata - proto::StripeStatistics* stripeStats = metadata.add_stripe_stats(); + proto::StripeStatistics* stripeStats = metadata_.add_stripe_stats(); std::vector<proto::ColumnStatistics> colStats; - columnWriter->getStripeStatistics(colStats); + columnWriter_->getStripeStatistics(colStats); for (uint32_t i = 0; i != colStats.size(); ++i) { *stripeStats->add_col_stats() = colStats[i]; } // merge stripe stats into file stats and clear stripe stats - columnWriter->mergeStripeStatsIntoFileStats(); + columnWriter_->mergeStripeStatsIntoFileStats(); - if (!stripeFooter.SerializeToZeroCopyStream(compressionStream.get())) { + if (!stripeFooter.SerializeToZeroCopyStream(compressionStream_.get())) { throw std::logic_error("Failed to write stripe footer."); } - uint64_t footerLength = compressionStream->flush(); + uint64_t footerLength = compressionStream_->flush(); // calculate data length and index length uint64_t dataLength = 0; @@ -540,53 +570,53 @@ namespace orc { } // update stripe info - stripeInfo.set_index_length(indexLength); - stripeInfo.set_data_length(dataLength); - stripeInfo.set_footer_length(footerLength); - stripeInfo.set_number_of_rows(stripeRows); + stripeInfo_.set_index_length(indexLength); + stripeInfo_.set_data_length(dataLength); + stripeInfo_.set_footer_length(footerLength); + stripeInfo_.set_number_of_rows(stripeRows_); - *fileFooter.add_stripes() = stripeInfo; + *fileFooter_.add_stripes() = stripeInfo_; - currentOffset = currentOffset + indexLength + dataLength + footerLength; - totalRows += stripeRows; + currentOffset_ = currentOffset_ + indexLength + dataLength + footerLength; + totalRows_ += stripeRows_; - columnWriter->reset(); + columnWriter_->reset(); initStripe(); } void WriterImpl::writeMetadata() { - if (!metadata.SerializeToZeroCopyStream(compressionStream.get())) { + if (!metadata_.SerializeToZeroCopyStream(compressionStream_.get())) { throw std::logic_error("Failed to write metadata."); } - postScript.set_metadata_length(compressionStream.get()->flush()); + postScript_.set_metadata_length(compressionStream_.get()->flush()); } void WriterImpl::writeFileFooter() { - fileFooter.set_content_length(currentOffset - fileFooter.header_length()); - fileFooter.set_number_of_rows(totalRows); + fileFooter_.set_content_length(currentOffset_ - fileFooter_.header_length()); + fileFooter_.set_number_of_rows(totalRows_); // update file statistics std::vector<proto::ColumnStatistics> colStats; - columnWriter->getFileStatistics(colStats); - fileFooter.clear_statistics(); + columnWriter_->getFileStatistics(colStats); + fileFooter_.clear_statistics(); for (uint32_t i = 0; i != colStats.size(); ++i) { - *fileFooter.add_statistics() = colStats[i]; + *fileFooter_.add_statistics() = colStats[i]; } - if (!fileFooter.SerializeToZeroCopyStream(compressionStream.get())) { + if (!fileFooter_.SerializeToZeroCopyStream(compressionStream_.get())) { throw std::logic_error("Failed to write file footer."); } - postScript.set_footer_length(compressionStream->flush()); + postScript_.set_footer_length(compressionStream_->flush()); } void WriterImpl::writePostscript() { - if (!postScript.SerializeToZeroCopyStream(bufferedStream.get())) { + if (!postScript_.SerializeToZeroCopyStream(bufferedStream_.get())) { throw std::logic_error("Failed to write post script."); } - unsigned char psLength = static_cast<unsigned char>(bufferedStream->flush()); - SCOPED_STOPWATCH(options.getWriterMetrics(), IOBlockingLatencyUs, IOCount); - outStream->write(&psLength, sizeof(unsigned char)); + unsigned char psLength = static_cast<unsigned char>(bufferedStream_->flush()); + SCOPED_STOPWATCH(options_.getWriterMetrics(), IOBlockingLatencyUs, IOCount); + outStream_->write(&psLength, sizeof(unsigned char)); } void WriterImpl::buildFooterType(const Type& t, proto::Footer& footer, uint32_t& index) { diff --git a/contrib/libs/apache/orc/c++/src/io/Cache.cc b/contrib/libs/apache/orc/c++/src/io/Cache.cc new file mode 100644 index 0000000000..39f63fdd2b --- /dev/null +++ b/contrib/libs/apache/orc/c++/src/io/Cache.cc @@ -0,0 +1,171 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cassert> + +#include "Cache.hh" + +namespace orc { + + std::vector<ReadRange> ReadRangeCombiner::coalesce(std::vector<ReadRange> ranges) const { + if (ranges.empty()) { + return ranges; + } + + // Remove zero-sized ranges + auto end = std::remove_if(ranges.begin(), ranges.end(), + [](const ReadRange& range) { return range.length == 0; }); + // Sort in position order + std::sort(ranges.begin(), end, [](const ReadRange& a, const ReadRange& b) { + return a.offset != b.offset ? a.offset < b.offset : a.length > b.length; + }); + + // Remove ranges that overlap 100% + std::vector<ReadRange> uniqueRanges; + uniqueRanges.reserve(ranges.size()); + for (auto it = ranges.begin(); it != end; ++it) { + if (uniqueRanges.empty() || !uniqueRanges.back().contains(*it)) { + uniqueRanges.push_back(*it); + } + } + ranges = std::move(uniqueRanges); + + // Skip further processing if ranges is empty after removing zero-sized ranges. + if (ranges.empty()) { + return ranges; + } + +#ifndef NDEBUG + for (size_t i = 0; i < ranges.size() - 1; ++i) { + const auto& left = ranges[i]; + const auto& right = ranges[i + 1]; + assert(left.offset < right.offset); + assert(!left.contains(right)); + } +#endif + + std::vector<ReadRange> coalesced; + auto itr = ranges.begin(); + + // Start of the current coalesced range and end (exclusive) of previous range. + // Both are initialized with the start of first range which is a placeholder value. + uint64_t coalescedStart = itr->offset; + uint64_t coalescedEnd = coalescedStart + itr->length; + + for (++itr; itr < ranges.end(); ++itr) { + const uint64_t currentRangeStart = itr->offset; + const uint64_t currentRangeEnd = currentRangeStart + itr->length; + + assert(coalescedStart < coalescedEnd); + assert(currentRangeStart < currentRangeEnd); + + // At this point, the coalesced range is [coalesced_start, prev_range_end). + // Stop coalescing if: + // - coalesced range is too large, or + // - distance (hole/gap) between consecutive ranges is too large. + if ((currentRangeEnd - coalescedStart > rangeSizeLimit) || + (currentRangeStart > coalescedEnd + holeSizeLimit)) { + coalesced.push_back({coalescedStart, coalescedEnd - coalescedStart}); + coalescedStart = currentRangeStart; + } + + // Update the prev_range_end with the current range. + coalescedEnd = currentRangeEnd; + } + coalesced.push_back({coalescedStart, coalescedEnd - coalescedStart}); + + assert(coalesced.front().offset == ranges.front().offset); + assert(coalesced.back().offset + coalesced.back().length == + ranges.back().offset + ranges.back().length); + return coalesced; + } + + std::vector<ReadRange> ReadRangeCombiner::coalesceReadRanges(std::vector<ReadRange> ranges, + uint64_t holeSizeLimit, + uint64_t rangeSizeLimit) { + assert(rangeSizeLimit > holeSizeLimit); + + ReadRangeCombiner combiner{holeSizeLimit, rangeSizeLimit}; + return combiner.coalesce(std::move(ranges)); + } + + void ReadRangeCache::cache(std::vector<ReadRange> ranges) { + ranges = ReadRangeCombiner::coalesceReadRanges(std::move(ranges), options_.holeSizeLimit, + options_.rangeSizeLimit); + + std::vector<RangeCacheEntry> newEntries = makeCacheEntries(ranges); + // Add new entries, themselves ordered by offset + if (entries_.size() > 0) { + std::vector<RangeCacheEntry> merged(entries_.size() + newEntries.size()); + std::merge(entries_.begin(), entries_.end(), newEntries.begin(), newEntries.end(), + merged.begin()); + entries_ = std::move(merged); + } else { + entries_ = std::move(newEntries); + } + } + + BufferSlice ReadRangeCache::read(const ReadRange& range) { + if (range.length == 0) { + return {std::make_shared<Buffer>(*memoryPool_, 0), 0, 0}; + } + + const auto it = std::lower_bound(entries_.begin(), entries_.end(), range, + [](const RangeCacheEntry& entry, const ReadRange& range) { + return entry.range.offset + entry.range.length < + range.offset + range.length; + }); + + BufferSlice result{}; + bool hit_cache = false; + if (it != entries_.end() && it->range.contains(range)) { + hit_cache = it->future.valid(); + it->future.get(); + result = BufferSlice{it->buffer, range.offset - it->range.offset, range.length}; + } + + if (metrics_) { + if (hit_cache) + metrics_->ReadRangeCacheHits.fetch_add(1); + else + metrics_->ReadRangeCacheMisses.fetch_add(1); + } + return result; + } + + void ReadRangeCache::evictEntriesBefore(uint64_t boundary) { + auto it = std::lower_bound(entries_.begin(), entries_.end(), boundary, + [](const RangeCacheEntry& entry, uint64_t offset) { + return entry.range.offset + entry.range.length <= offset; + }); + entries_.erase(entries_.begin(), it); + } + + std::vector<RangeCacheEntry> ReadRangeCache::makeCacheEntries( + const std::vector<ReadRange>& ranges) const { + std::vector<RangeCacheEntry> newEntries; + newEntries.reserve(ranges.size()); + for (const auto& range : ranges) { + BufferPtr buffer = std::make_shared<Buffer>(*memoryPool_, range.length); + std::future<void> future = stream_->readAsync(buffer->data(), buffer->size(), range.offset); + newEntries.emplace_back(range, std::move(buffer), std::move(future)); + } + return newEntries; + } + +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/io/Cache.hh b/contrib/libs/apache/orc/c++/src/io/Cache.hh new file mode 100644 index 0000000000..7fc79718aa --- /dev/null +++ b/contrib/libs/apache/orc/c++/src/io/Cache.hh @@ -0,0 +1,122 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "orc/MemoryPool.hh" +#include "orc/OrcFile.hh" + +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <future> +#include <utility> +#include <vector> + +namespace orc { + + struct ReadRange { + uint64_t offset; + uint64_t length; + + ReadRange() = default; + ReadRange(uint64_t offset, uint64_t length) : offset(offset), length(length) {} + + friend bool operator==(const ReadRange& left, const ReadRange& right) { + return (left.offset == right.offset && left.length == right.length); + } + friend bool operator!=(const ReadRange& left, const ReadRange& right) { + return !(left == right); + } + + bool contains(const ReadRange& other) const { + return (offset <= other.offset && offset + length >= other.offset + other.length); + } + }; + + struct ReadRangeCombiner { + const uint64_t holeSizeLimit; + const uint64_t rangeSizeLimit; + + std::vector<ReadRange> coalesce(std::vector<ReadRange> ranges) const; + + static std::vector<ReadRange> coalesceReadRanges(std::vector<ReadRange> ranges, + uint64_t holeSizeLimit, + uint64_t rangeSizeLimit); + }; + + using Buffer = DataBuffer<char>; + using BufferPtr = std::shared_ptr<Buffer>; + + struct RangeCacheEntry { + ReadRange range; + BufferPtr buffer; + std::shared_future<void> future; // use shared_future in case of multiple get calls + + RangeCacheEntry() = default; + RangeCacheEntry(const ReadRange& range, BufferPtr buffer, std::future<void> future) + : range(range), buffer(std::move(buffer)), future(std::move(future).share()) {} + + friend bool operator<(const RangeCacheEntry& left, const RangeCacheEntry& right) { + return left.range.offset < right.range.offset; + } + }; + + struct BufferSlice { + BufferPtr buffer = nullptr; + uint64_t offset = 0; + uint64_t length = 0; + }; + + /// A read cache designed to hide IO latencies when reading. + class ReadRangeCache { + public: + /// Construct a read cache with given options + explicit ReadRangeCache(InputStream* stream, CacheOptions options, MemoryPool* memoryPool, + ReaderMetrics* metrics = nullptr) + : stream_(stream), + options_(std::move(options)), + memoryPool_(memoryPool), + metrics_(metrics) {} + + ~ReadRangeCache() = default; + + /// Cache the given ranges in the background. + /// + /// The caller must ensure that the ranges do not overlap with each other, + /// nor with previously cached ranges. Otherwise, behaviour will be undefined. + void cache(std::vector<ReadRange> ranges); + + /// Read a range previously given to Cache(). + BufferSlice read(const ReadRange& range); + + /// Evict cache entries with its range before given boundary. + void evictEntriesBefore(uint64_t boundary); + + private: + std::vector<RangeCacheEntry> makeCacheEntries(const std::vector<ReadRange>& ranges) const; + + InputStream* stream_; + CacheOptions options_; + // Ordered by offset (so as to find a matching region by binary search) + std::vector<RangeCacheEntry> entries_; + MemoryPool* memoryPool_; + ReaderMetrics* metrics_; + }; + +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.cc b/contrib/libs/apache/orc/c++/src/io/InputStream.cc index 3bf1781747..727d7b3278 100644 --- a/contrib/libs/apache/orc/c++/src/io/InputStream.cc +++ b/contrib/libs/apache/orc/c++/src/io/InputStream.cc @@ -39,17 +39,17 @@ namespace orc { } PositionProvider::PositionProvider(const std::list<uint64_t>& posns) { - position = posns.begin(); + position_ = posns.begin(); } uint64_t PositionProvider::next() { - uint64_t result = *position; - ++position; + uint64_t result = *position_; + ++position_; return result; } uint64_t PositionProvider::current() { - return *position; + return *position_; } SeekableInputStream::~SeekableInputStream() { @@ -62,26 +62,26 @@ namespace orc { SeekableArrayInputStream::SeekableArrayInputStream(const unsigned char* values, uint64_t size, uint64_t blkSize) - : data(reinterpret_cast<const char*>(values)) { - length = size; - position = 0; - blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize); + : data_(reinterpret_cast<const char*>(values)) { + length_ = size; + position_ = 0; + blockSize_ = blkSize == 0 ? length_ : static_cast<uint64_t>(blkSize); } SeekableArrayInputStream::SeekableArrayInputStream(const char* values, uint64_t size, uint64_t blkSize) - : data(values) { - length = size; - position = 0; - blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize); + : data_(values) { + length_ = size; + position_ = 0; + blockSize_ = blkSize == 0 ? length_ : static_cast<uint64_t>(blkSize); } bool SeekableArrayInputStream::Next(const void** buffer, int* size) { - uint64_t currentSize = std::min(length - position, blockSize); + uint64_t currentSize = std::min(length_ - position_, blockSize_); if (currentSize > 0) { - *buffer = data + position; + *buffer = data_ + position_; *size = static_cast<int>(currentSize); - position += currentSize; + position_ += currentSize; return true; } *size = 0; @@ -91,8 +91,8 @@ namespace orc { void SeekableArrayInputStream::BackUp(int count) { if (count >= 0) { uint64_t unsignedCount = static_cast<uint64_t>(count); - if (unsignedCount <= blockSize && unsignedCount <= position) { - position -= unsignedCount; + if (unsignedCount <= blockSize_ && unsignedCount <= position_) { + position_ -= unsignedCount; } else { throw std::logic_error("Can't backup that much!"); } @@ -102,27 +102,27 @@ namespace orc { bool SeekableArrayInputStream::Skip(int count) { if (count >= 0) { uint64_t unsignedCount = static_cast<uint64_t>(count); - if (unsignedCount + position <= length) { - position += unsignedCount; + if (unsignedCount + position_ <= length_) { + position_ += unsignedCount; return true; } else { - position = length; + position_ = length_; } } return false; } int64_t SeekableArrayInputStream::ByteCount() const { - return static_cast<google::protobuf::int64>(position); + return static_cast<google::protobuf::int64>(position_); } void SeekableArrayInputStream::seek(PositionProvider& seekPosition) { - position = seekPosition.next(); + position_ = seekPosition.next(); } std::string SeekableArrayInputStream::getName() const { std::ostringstream result; - result << "SeekableArrayInputStream " << position << " of " << length; + result << "SeekableArrayInputStream " << position_ << " of " << length_; return result.str(); } @@ -131,16 +131,16 @@ namespace orc { } SeekableFileInputStream::SeekableFileInputStream(InputStream* stream, uint64_t offset, - uint64_t byteCount, MemoryPool& _pool, - uint64_t _blockSize) - : pool(_pool), - input(stream), - start(offset), - length(byteCount), - blockSize(computeBlock(_blockSize, length)) { - position = 0; - buffer.reset(new DataBuffer<char>(pool)); - pushBack = 0; + uint64_t byteCount, MemoryPool& pool, + uint64_t blockSize) + : pool_(pool), + input_(stream), + start_(offset), + length_(byteCount), + blockSize_(computeBlock(blockSize, length_)) { + position_ = 0; + buffer_.reset(new DataBuffer<char>(pool_)); + pushBack_ = 0; } SeekableFileInputStream::~SeekableFileInputStream() { @@ -149,19 +149,19 @@ namespace orc { bool SeekableFileInputStream::Next(const void** data, int* size) { uint64_t bytesRead; - if (pushBack != 0) { - *data = buffer->data() + (buffer->size() - pushBack); - bytesRead = pushBack; + if (pushBack_ != 0) { + *data = buffer_->data() + (buffer_->size() - pushBack_); + bytesRead = pushBack_; } else { - bytesRead = std::min(length - position, blockSize); - buffer->resize(bytesRead); + bytesRead = std::min(length_ - position_, blockSize_); + buffer_->resize(bytesRead); if (bytesRead > 0) { - input->read(buffer->data(), bytesRead, start + position); - *data = static_cast<void*>(buffer->data()); + input_->read(buffer_->data(), bytesRead, start_ + position_); + *data = static_cast<void*>(buffer_->data()); } } - position += bytesRead; - pushBack = 0; + position_ += bytesRead; + pushBack_ = 0; *size = static_cast<int>(bytesRead); return bytesRead != 0; } @@ -171,14 +171,14 @@ namespace orc { throw std::logic_error("can't backup negative distances"); } uint64_t count = static_cast<uint64_t>(signedCount); - if (pushBack > 0) { + if (pushBack_ > 0) { throw std::logic_error("can't backup unless we just called Next"); } - if (count > blockSize || count > position) { + if (count > blockSize_ || count > position_) { throw std::logic_error("can't backup that far"); } - pushBack = static_cast<uint64_t>(count); - position -= pushBack; + pushBack_ = static_cast<uint64_t>(count); + position_ -= pushBack_; } bool SeekableFileInputStream::Skip(int signedCount) { @@ -186,27 +186,27 @@ namespace orc { return false; } uint64_t count = static_cast<uint64_t>(signedCount); - position = std::min(position + count, length); - pushBack = 0; - return position < length; + position_ = std::min(position_ + count, length_); + pushBack_ = 0; + return position_ < length_; } int64_t SeekableFileInputStream::ByteCount() const { - return static_cast<int64_t>(position); + return static_cast<int64_t>(position_); } void SeekableFileInputStream::seek(PositionProvider& location) { - position = location.next(); - if (position > length) { - position = length; + position_ = location.next(); + if (position_ > length_) { + position_ = length_; throw std::logic_error("seek too far"); } - pushBack = 0; + pushBack_ = 0; } std::string SeekableFileInputStream::getName() const { std::ostringstream result; - result << input->getName() << " from " << start << " for " << length; + result << input_->getName() << " from " << start_ << " for " << length_; return result.str(); } diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.hh b/contrib/libs/apache/orc/c++/src/io/InputStream.hh index 33c64f8809..8b251c9301 100644 --- a/contrib/libs/apache/orc/c++/src/io/InputStream.hh +++ b/contrib/libs/apache/orc/c++/src/io/InputStream.hh @@ -35,7 +35,7 @@ namespace orc { class PositionProvider { private: - std::list<uint64_t>::const_iterator position; + std::list<uint64_t>::const_iterator position_; public: PositionProvider(const std::list<uint64_t>& positions); @@ -60,14 +60,14 @@ namespace orc { */ class SeekableArrayInputStream : public SeekableInputStream { private: - const char* data; - uint64_t length; - uint64_t position; - uint64_t blockSize; + const char* data_; + uint64_t length_; + uint64_t position_; + uint64_t blockSize_; public: - SeekableArrayInputStream(const unsigned char* list, uint64_t length, uint64_t block_size = 0); - SeekableArrayInputStream(const char* list, uint64_t length, uint64_t block_size = 0); + SeekableArrayInputStream(const unsigned char* list, uint64_t length, uint64_t blockSize = 0); + SeekableArrayInputStream(const char* list, uint64_t length, uint64_t blockSize = 0); virtual ~SeekableArrayInputStream() override; virtual bool Next(const void** data, int* size) override; virtual void BackUp(int count) override; @@ -82,14 +82,14 @@ namespace orc { */ class SeekableFileInputStream : public SeekableInputStream { private: - MemoryPool& pool; - InputStream* const input; - const uint64_t start; - const uint64_t length; - const uint64_t blockSize; - std::unique_ptr<DataBuffer<char> > buffer; - uint64_t position; - uint64_t pushBack; + MemoryPool& pool_; + InputStream* const input_; + const uint64_t start_; + const uint64_t length_; + const uint64_t blockSize_; + std::unique_ptr<DataBuffer<char> > buffer_; + uint64_t position_; + uint64_t pushBack_; public: SeekableFileInputStream(InputStream* input, uint64_t offset, uint64_t byteCount, diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc index 7d9fb92206..4ca59dbe95 100644 --- a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc +++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc @@ -29,11 +29,11 @@ namespace orc { } BufferedOutputStream::BufferedOutputStream(MemoryPool& pool, OutputStream* outStream, - uint64_t capacity_, uint64_t blockSize_, - WriterMetrics* metrics_) - : outputStream(outStream), blockSize(blockSize_), metrics(metrics_) { - dataBuffer.reset(new BlockBuffer(pool, blockSize)); - dataBuffer->reserve(capacity_); + uint64_t capacity, uint64_t blockSize, + WriterMetrics* metrics) + : outputStream_(outStream), blockSize_(blockSize), metrics_(metrics) { + dataBuffer_.reset(new BlockBuffer(pool, blockSize_)); + dataBuffer_->reserve(capacity); } BufferedOutputStream::~BufferedOutputStream() { @@ -41,7 +41,7 @@ namespace orc { } bool BufferedOutputStream::Next(void** buffer, int* size) { - auto block = dataBuffer->getNextBlock(); + auto block = dataBuffer_->getNextBlock(); if (block.data == nullptr) { throw std::logic_error("Failed to get next buffer from block buffer."); } @@ -53,16 +53,20 @@ namespace orc { void BufferedOutputStream::BackUp(int count) { if (count >= 0) { uint64_t unsignedCount = static_cast<uint64_t>(count); - if (unsignedCount <= dataBuffer->size()) { - dataBuffer->resize(dataBuffer->size() - unsignedCount); + if (unsignedCount <= dataBuffer_->size()) { + dataBuffer_->resize(dataBuffer_->size() - unsignedCount); } else { throw std::logic_error("Can't backup that much!"); } } } + void BufferedOutputStream::finishStream() { + // PASS + } + int64_t BufferedOutputStream::ByteCount() const { - return static_cast<google::protobuf::int64>(dataBuffer->size()); + return static_cast<google::protobuf::int64>(dataBuffer_->size()); } bool BufferedOutputStream::WriteAliasedRaw(const void*, int) { @@ -75,70 +79,80 @@ namespace orc { std::string BufferedOutputStream::getName() const { std::ostringstream result; - result << "BufferedOutputStream " << dataBuffer->size() << " of " << dataBuffer->capacity(); + result << "BufferedOutputStream " << dataBuffer_->size() << " of " << dataBuffer_->capacity(); return result.str(); } uint64_t BufferedOutputStream::getSize() const { - return dataBuffer->size(); + return dataBuffer_->size(); } uint64_t BufferedOutputStream::flush() { - uint64_t dataSize = dataBuffer->size(); + uint64_t dataSize = dataBuffer_->size(); // flush data buffer into outputStream if (dataSize > 0) { - SCOPED_STOPWATCH(metrics, IOBlockingLatencyUs, IOCount); - dataBuffer->writeTo(outputStream, metrics); + SCOPED_STOPWATCH(metrics_, IOBlockingLatencyUs, IOCount); + dataBuffer_->writeTo(outputStream_, metrics_); } - dataBuffer->resize(0); + dataBuffer_->resize(0); return dataSize; } void BufferedOutputStream::suppress() { - dataBuffer->resize(0); + dataBuffer_->resize(0); + } + + uint64_t BufferedOutputStream::getRawInputBufferSize() const { + throw std::logic_error("getRawInputBufferSize is not supported."); } void AppendOnlyBufferedStream::write(const char* data, size_t size) { size_t dataOffset = 0; while (size > 0) { - if (bufferOffset == bufferLength) { - if (!outStream->Next(reinterpret_cast<void**>(&buffer), &bufferLength)) { + if (bufferOffset_ == bufferLength_) { + if (!outStream_->Next(reinterpret_cast<void**>(&buffer_), &bufferLength_)) { throw std::logic_error("Failed to allocate buffer."); } - bufferOffset = 0; + bufferOffset_ = 0; } - size_t len = std::min(static_cast<size_t>(bufferLength - bufferOffset), size); - memcpy(buffer + bufferOffset, data + dataOffset, len); - bufferOffset += static_cast<int>(len); + size_t len = std::min(static_cast<size_t>(bufferLength_ - bufferOffset_), size); + memcpy(buffer_ + bufferOffset_, data + dataOffset, len); + bufferOffset_ += static_cast<int>(len); dataOffset += len; size -= len; } } uint64_t AppendOnlyBufferedStream::getSize() const { - return outStream->getSize(); + return outStream_->getSize(); } uint64_t AppendOnlyBufferedStream::flush() { - outStream->BackUp(bufferLength - bufferOffset); - bufferOffset = bufferLength = 0; - buffer = nullptr; - return outStream->flush(); + finishStream(); + return outStream_->flush(); } void AppendOnlyBufferedStream::recordPosition(PositionRecorder* recorder) const { - uint64_t flushedSize = outStream->getSize(); - uint64_t unflushedSize = static_cast<uint64_t>(bufferOffset); - if (outStream->isCompressed()) { + uint64_t flushedSize = outStream_->getSize(); + uint64_t unusedBufferSize = static_cast<uint64_t>(bufferLength_ - bufferOffset_); + if (outStream_->isCompressed()) { // start of the compression chunk in the stream recorder->add(flushedSize); - // number of decompressed bytes that need to be consumed - recorder->add(unflushedSize); + // There are multiple blocks in the input buffer, but bufferPosition only records the + // effective length of the last block. We need rawInputBufferSize to record the total length + // of all variable blocks. + recorder->add(outStream_->getRawInputBufferSize() - unusedBufferSize); } else { - flushedSize -= static_cast<uint64_t>(bufferLength); // byte offset of the start location - recorder->add(flushedSize + unflushedSize); + recorder->add(flushedSize - unusedBufferSize); } } + void AppendOnlyBufferedStream::finishStream() { + outStream_->BackUp(bufferLength_ - bufferOffset_); + outStream_->finishStream(); + bufferOffset_ = bufferLength_ = 0; + buffer_ = nullptr; + } + } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh index d8bc21ce6d..b029818125 100644 --- a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh +++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh @@ -49,14 +49,14 @@ namespace orc { */ class BufferedOutputStream : public google::protobuf::io::ZeroCopyOutputStream { private: - OutputStream* outputStream; - std::unique_ptr<BlockBuffer> dataBuffer; - uint64_t blockSize; - WriterMetrics* metrics; + OutputStream* outputStream_; + std::unique_ptr<BlockBuffer> dataBuffer_; + uint64_t blockSize_; + WriterMetrics* metrics_; public: BufferedOutputStream(MemoryPool& pool, OutputStream* outStream, uint64_t capacity, - uint64_t block_size, WriterMetrics* metrics); + uint64_t blockSize, WriterMetrics* metrics); virtual ~BufferedOutputStream() override; virtual bool Next(void** data, int* size) override; @@ -69,10 +69,12 @@ namespace orc { virtual uint64_t getSize() const; virtual uint64_t flush(); virtual void suppress(); + virtual uint64_t getRawInputBufferSize() const; virtual bool isCompressed() const { return false; } + virtual void finishStream(); }; DIAGNOSTIC_POP @@ -84,20 +86,21 @@ namespace orc { */ class AppendOnlyBufferedStream { private: - std::unique_ptr<BufferedOutputStream> outStream; - char* buffer; - int bufferOffset, bufferLength; + std::unique_ptr<BufferedOutputStream> outStream_; + char* buffer_; + int bufferOffset_, bufferLength_; public: - AppendOnlyBufferedStream(std::unique_ptr<BufferedOutputStream> _outStream) - : outStream(std::move(_outStream)) { - buffer = nullptr; - bufferOffset = bufferLength = 0; + AppendOnlyBufferedStream(std::unique_ptr<BufferedOutputStream> outStream) + : outStream_(std::move(outStream)) { + buffer_ = nullptr; + bufferOffset_ = bufferLength_ = 0; } void write(const char* data, size_t size); uint64_t getSize() const; uint64_t flush(); + void finishStream(); void recordPosition(PositionRecorder* recorder) const; }; diff --git a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc index 9176c1f6c3..e49bca4b77 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc +++ b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc @@ -24,39 +24,39 @@ namespace orc { ExpressionTree::ExpressionTree(Operator op) - : mOperator(op), mLeaf(UNUSED_LEAF), mConstant(TruthValue::YES_NO_NULL) {} + : operator_(op), leaf_(UNUSED_LEAF), constant_(TruthValue::YES_NO_NULL) {} ExpressionTree::ExpressionTree(Operator op, std::initializer_list<TreeNode> children) - : mOperator(op), - mChildren(children.begin(), children.end()), - mLeaf(UNUSED_LEAF), - mConstant(TruthValue::YES_NO_NULL) { + : operator_(op), + children_(children.begin(), children.end()), + leaf_(UNUSED_LEAF), + constant_(TruthValue::YES_NO_NULL) { // PASS } ExpressionTree::ExpressionTree(size_t leaf) - : mOperator(Operator::LEAF), mChildren(), mLeaf(leaf), mConstant(TruthValue::YES_NO_NULL) { + : operator_(Operator::LEAF), children_(), leaf_(leaf), constant_(TruthValue::YES_NO_NULL) { // PASS } ExpressionTree::ExpressionTree(TruthValue constant) - : mOperator(Operator::CONSTANT), mChildren(), mLeaf(UNUSED_LEAF), mConstant(constant) { + : operator_(Operator::CONSTANT), children_(), leaf_(UNUSED_LEAF), constant_(constant) { // PASS } ExpressionTree::ExpressionTree(const ExpressionTree& other) - : mOperator(other.mOperator), mLeaf(other.mLeaf), mConstant(other.mConstant) { - for (TreeNode child : other.mChildren) { - mChildren.emplace_back(std::make_shared<ExpressionTree>(*child)); + : operator_(other.operator_), leaf_(other.leaf_), constant_(other.constant_) { + for (TreeNode child : other.children_) { + children_.emplace_back(std::make_shared<ExpressionTree>(*child)); } } ExpressionTree::Operator ExpressionTree::getOperator() const { - return mOperator; + return operator_; } const std::vector<TreeNode>& ExpressionTree::getChildren() const { - return mChildren; + return children_; } std::vector<TreeNode>& ExpressionTree::getChildren() { @@ -65,7 +65,7 @@ namespace orc { } const TreeNode ExpressionTree::getChild(size_t i) const { - return mChildren.at(i); + return children_.at(i); } TreeNode ExpressionTree::getChild(size_t i) { @@ -74,47 +74,47 @@ namespace orc { } TruthValue ExpressionTree::getConstant() const { - assert(mOperator == Operator::CONSTANT); - return mConstant; + assert(operator_ == Operator::CONSTANT); + return constant_; } size_t ExpressionTree::getLeaf() const { - assert(mOperator == Operator::LEAF); - return mLeaf; + assert(operator_ == Operator::LEAF); + return leaf_; } void ExpressionTree::setLeaf(size_t leaf) { - assert(mOperator == Operator::LEAF); - mLeaf = leaf; + assert(operator_ == Operator::LEAF); + leaf_ = leaf; } void ExpressionTree::addChild(TreeNode child) { - mChildren.push_back(child); + children_.push_back(child); } TruthValue ExpressionTree::evaluate(const std::vector<TruthValue>& leaves) const { TruthValue result; - switch (mOperator) { + switch (operator_) { case Operator::OR: { - result = mChildren.at(0)->evaluate(leaves); - for (size_t i = 1; i < mChildren.size() && !isNeeded(result); ++i) { - result = mChildren.at(i)->evaluate(leaves) || result; + result = children_.at(0)->evaluate(leaves); + for (size_t i = 1; i < children_.size() && !isNeeded(result); ++i) { + result = children_.at(i)->evaluate(leaves) || result; } return result; } case Operator::AND: { - result = mChildren.at(0)->evaluate(leaves); - for (size_t i = 1; i < mChildren.size() && isNeeded(result); ++i) { - result = mChildren.at(i)->evaluate(leaves) && result; + result = children_.at(0)->evaluate(leaves); + for (size_t i = 1; i < children_.size() && isNeeded(result); ++i) { + result = children_.at(i)->evaluate(leaves) && result; } return result; } case Operator::NOT: - return !mChildren.at(0)->evaluate(leaves); + return !children_.at(0)->evaluate(leaves); case Operator::LEAF: - return leaves[mLeaf]; + return leaves[leaf_]; case Operator::CONSTANT: - return mConstant; + return constant_; default: throw std::invalid_argument("Unknown operator!"); } @@ -143,29 +143,29 @@ namespace orc { std::string ExpressionTree::toString() const { std::ostringstream sstream; - switch (mOperator) { + switch (operator_) { case Operator::OR: sstream << "(or"; - for (const auto& child : mChildren) { + for (const auto& child : children_) { sstream << ' ' << child->toString(); } sstream << ')'; break; case Operator::AND: sstream << "(and"; - for (const auto& child : mChildren) { + for (const auto& child : children_) { sstream << ' ' << child->toString(); } sstream << ')'; break; case Operator::NOT: - sstream << "(not " << mChildren.at(0)->toString() << ')'; + sstream << "(not " << children_.at(0)->toString() << ')'; break; case Operator::LEAF: - sstream << "leaf-" << mLeaf; + sstream << "leaf-" << leaf_; break; case Operator::CONSTANT: - sstream << to_string(mConstant); + sstream << to_string(constant_); break; default: throw std::invalid_argument("unknown operator!"); diff --git a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh index 3e0b331a2d..0f801852f8 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh +++ b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh @@ -74,10 +74,10 @@ namespace orc { TruthValue evaluate(const std::vector<TruthValue>& leaves) const; private: - Operator mOperator; - std::vector<TreeNode> mChildren; - size_t mLeaf; - TruthValue mConstant; + Operator operator_; + std::vector<TreeNode> children_; + size_t leaf_; + TruthValue constant_; }; } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/sargs/Literal.cc b/contrib/libs/apache/orc/c++/src/sargs/Literal.cc index c0cdd62201..f36db79437 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/Literal.cc +++ b/contrib/libs/apache/orc/c++/src/sargs/Literal.cc @@ -26,196 +26,196 @@ namespace orc { Literal::Literal(PredicateDataType type) { - mType = type; - mValue.DecimalVal = 0; - mSize = 0; - mIsNull = true; - mPrecision = 0; - mScale = 0; - mHashCode = 0; + type_ = type; + value_.DecimalVal = 0; + size_ = 0; + isNull_ = true; + precision_ = 0; + scale_ = 0; + hashCode_ = 0; } Literal::Literal(int64_t val) { - mType = PredicateDataType::LONG; - mValue.IntVal = val; - mSize = sizeof(val); - mIsNull = false; - mPrecision = 0; - mScale = 0; - mHashCode = hashCode(); + type_ = PredicateDataType::LONG; + value_.IntVal = val; + size_ = sizeof(val); + isNull_ = false; + precision_ = 0; + scale_ = 0; + hashCode_ = hashCode(); } Literal::Literal(double val) { - mType = PredicateDataType::FLOAT; - mValue.DoubleVal = val; - mSize = sizeof(val); - mIsNull = false; - mPrecision = 0; - mScale = 0; - mHashCode = hashCode(); + type_ = PredicateDataType::FLOAT; + value_.DoubleVal = val; + size_ = sizeof(val); + isNull_ = false; + precision_ = 0; + scale_ = 0; + hashCode_ = hashCode(); } Literal::Literal(bool val) { - mType = PredicateDataType::BOOLEAN; - mValue.BooleanVal = val; - mSize = sizeof(val); - mIsNull = false; - mPrecision = 0; - mScale = 0; - mHashCode = hashCode(); + type_ = PredicateDataType::BOOLEAN; + value_.BooleanVal = val; + size_ = sizeof(val); + isNull_ = false; + precision_ = 0; + scale_ = 0; + hashCode_ = hashCode(); } Literal::Literal(PredicateDataType type, int64_t val) { if (type != PredicateDataType::DATE) { throw std::invalid_argument("only DATE is supported here!"); } - mType = type; - mValue.IntVal = val; - mSize = sizeof(val); - mIsNull = false; - mPrecision = 0; - mScale = 0; - mHashCode = hashCode(); + type_ = type; + value_.IntVal = val; + size_ = sizeof(val); + isNull_ = false; + precision_ = 0; + scale_ = 0; + hashCode_ = hashCode(); } Literal::Literal(const char* str, size_t size) { - mType = PredicateDataType::STRING; - mValue.Buffer = new char[size]; - memcpy(mValue.Buffer, str, size); - mSize = size; - mIsNull = false; - mPrecision = 0; - mScale = 0; - mHashCode = hashCode(); + type_ = PredicateDataType::STRING; + value_.Buffer = new char[size]; + memcpy(value_.Buffer, str, size); + size_ = size; + isNull_ = false; + precision_ = 0; + scale_ = 0; + hashCode_ = hashCode(); } Literal::Literal(Int128 val, int32_t precision, int32_t scale) { - mType = PredicateDataType::DECIMAL; - mValue.DecimalVal = val; - mPrecision = precision; - mScale = scale; - mSize = sizeof(Int128); - mIsNull = false; - mHashCode = hashCode(); + type_ = PredicateDataType::DECIMAL; + value_.DecimalVal = val; + precision_ = precision; + scale_ = scale; + size_ = sizeof(Int128); + isNull_ = false; + hashCode_ = hashCode(); } Literal::Literal(int64_t second, int32_t nanos) { - mType = PredicateDataType::TIMESTAMP; - mValue.TimeStampVal.second = second; - mValue.TimeStampVal.nanos = nanos; - mPrecision = 0; - mScale = 0; - mSize = sizeof(Timestamp); - mIsNull = false; - mHashCode = hashCode(); + type_ = PredicateDataType::TIMESTAMP; + value_.TimeStampVal.second = second; + value_.TimeStampVal.nanos = nanos; + precision_ = 0; + scale_ = 0; + size_ = sizeof(Timestamp); + isNull_ = false; + hashCode_ = hashCode(); } Literal::Literal(const Literal& r) - : mType(r.mType), mSize(r.mSize), mIsNull(r.mIsNull), mHashCode(r.mHashCode) { - if (mType == PredicateDataType::STRING) { - mValue.Buffer = new char[r.mSize]; - memcpy(mValue.Buffer, r.mValue.Buffer, r.mSize); - mPrecision = 0; - mScale = 0; - } else if (mType == PredicateDataType::DECIMAL) { - mPrecision = r.mPrecision; - mScale = r.mScale; - mValue = r.mValue; - } else if (mType == PredicateDataType::TIMESTAMP) { - mValue.TimeStampVal = r.mValue.TimeStampVal; + : type_(r.type_), size_(r.size_), isNull_(r.isNull_), hashCode_(r.hashCode_) { + if (type_ == PredicateDataType::STRING) { + value_.Buffer = new char[r.size_]; + memcpy(value_.Buffer, r.value_.Buffer, r.size_); + precision_ = 0; + scale_ = 0; + } else if (type_ == PredicateDataType::DECIMAL) { + precision_ = r.precision_; + scale_ = r.scale_; + value_ = r.value_; + } else if (type_ == PredicateDataType::TIMESTAMP) { + value_.TimeStampVal = r.value_.TimeStampVal; } else { - mValue = r.mValue; - mPrecision = 0; - mScale = 0; + value_ = r.value_; + precision_ = 0; + scale_ = 0; } } Literal::~Literal() { - if (mType == PredicateDataType::STRING && mValue.Buffer) { - delete[] mValue.Buffer; - mValue.Buffer = nullptr; + if (type_ == PredicateDataType::STRING && value_.Buffer) { + delete[] value_.Buffer; + value_.Buffer = nullptr; } } Literal& Literal::operator=(const Literal& r) { if (this != &r) { - if (mType == PredicateDataType::STRING && mValue.Buffer) { - delete[] mValue.Buffer; - mValue.Buffer = nullptr; + if (type_ == PredicateDataType::STRING && value_.Buffer) { + delete[] value_.Buffer; + value_.Buffer = nullptr; } - mType = r.mType; - mSize = r.mSize; - mIsNull = r.mIsNull; - mPrecision = r.mPrecision; - mScale = r.mScale; - if (mType == PredicateDataType::STRING) { - mValue.Buffer = new char[r.mSize]; - memcpy(mValue.Buffer, r.mValue.Buffer, r.mSize); - } else if (mType == PredicateDataType::TIMESTAMP) { - mValue.TimeStampVal = r.mValue.TimeStampVal; + type_ = r.type_; + size_ = r.size_; + isNull_ = r.isNull_; + precision_ = r.precision_; + scale_ = r.scale_; + if (type_ == PredicateDataType::STRING) { + value_.Buffer = new char[r.size_]; + memcpy(value_.Buffer, r.value_.Buffer, r.size_); + } else if (type_ == PredicateDataType::TIMESTAMP) { + value_.TimeStampVal = r.value_.TimeStampVal; } else { - mValue = r.mValue; + value_ = r.value_; } - mHashCode = r.mHashCode; + hashCode_ = r.hashCode_; } return *this; } std::string Literal::toString() const { - if (mIsNull) { + if (isNull_) { return "null"; } std::ostringstream sstream; - switch (mType) { + switch (type_) { case PredicateDataType::LONG: - sstream << mValue.IntVal; + sstream << value_.IntVal; break; case PredicateDataType::DATE: - sstream << mValue.DateVal; + sstream << value_.DateVal; break; case PredicateDataType::TIMESTAMP: - sstream << mValue.TimeStampVal.second << "." << mValue.TimeStampVal.nanos; + sstream << value_.TimeStampVal.second << "." << value_.TimeStampVal.nanos; break; case PredicateDataType::FLOAT: - sstream << mValue.DoubleVal; + sstream << value_.DoubleVal; break; case PredicateDataType::BOOLEAN: - sstream << (mValue.BooleanVal ? "true" : "false"); + sstream << (value_.BooleanVal ? "true" : "false"); break; case PredicateDataType::STRING: - sstream << std::string(mValue.Buffer, mSize); + sstream << std::string(value_.Buffer, size_); break; case PredicateDataType::DECIMAL: - sstream << mValue.DecimalVal.toDecimalString(mScale); + sstream << value_.DecimalVal.toDecimalString(scale_); break; } return sstream.str(); } size_t Literal::hashCode() const { - if (mIsNull) { + if (isNull_) { return 0; } - switch (mType) { + switch (type_) { case PredicateDataType::LONG: - return std::hash<int64_t>{}(mValue.IntVal); + return std::hash<int64_t>{}(value_.IntVal); case PredicateDataType::DATE: - return std::hash<int64_t>{}(mValue.DateVal); + return std::hash<int64_t>{}(value_.DateVal); case PredicateDataType::TIMESTAMP: - return std::hash<int64_t>{}(mValue.TimeStampVal.second) * 17 + - std::hash<int32_t>{}(mValue.TimeStampVal.nanos); + return std::hash<int64_t>{}(value_.TimeStampVal.second) * 17 + + std::hash<int32_t>{}(value_.TimeStampVal.nanos); case PredicateDataType::FLOAT: - return std::hash<double>{}(mValue.DoubleVal); + return std::hash<double>{}(value_.DoubleVal); case PredicateDataType::BOOLEAN: - return std::hash<bool>{}(mValue.BooleanVal); + return std::hash<bool>{}(value_.BooleanVal); case PredicateDataType::STRING: - return std::hash<std::string>{}(std::string(mValue.Buffer, mSize)); + return std::hash<std::string>{}(std::string(value_.Buffer, size_)); case PredicateDataType::DECIMAL: // current glibc does not support hash<int128_t> - return std::hash<int64_t>{}(mValue.IntVal); + return std::hash<int64_t>{}(value_.IntVal); default: return 0; } @@ -225,30 +225,30 @@ namespace orc { if (this == &r) { return true; } - if (mHashCode != r.mHashCode || mType != r.mType || mIsNull != r.mIsNull) { + if (hashCode_ != r.hashCode_ || type_ != r.type_ || isNull_ != r.isNull_) { return false; } - if (mIsNull) { + if (isNull_) { return true; } - switch (mType) { + switch (type_) { case PredicateDataType::LONG: - return mValue.IntVal == r.mValue.IntVal; + return value_.IntVal == r.value_.IntVal; case PredicateDataType::DATE: - return mValue.DateVal == r.mValue.DateVal; + return value_.DateVal == r.value_.DateVal; case PredicateDataType::TIMESTAMP: - return mValue.TimeStampVal == r.mValue.TimeStampVal; + return value_.TimeStampVal == r.value_.TimeStampVal; case PredicateDataType::FLOAT: - return std::fabs(mValue.DoubleVal - r.mValue.DoubleVal) < + return std::fabs(value_.DoubleVal - r.value_.DoubleVal) < std::numeric_limits<double>::epsilon(); case PredicateDataType::BOOLEAN: - return mValue.BooleanVal == r.mValue.BooleanVal; + return value_.BooleanVal == r.value_.BooleanVal; case PredicateDataType::STRING: - return mSize == r.mSize && memcmp(mValue.Buffer, r.mValue.Buffer, mSize) == 0; + return size_ == r.size_ && memcmp(value_.Buffer, r.value_.Buffer, size_) == 0; case PredicateDataType::DECIMAL: - return mValue.DecimalVal == r.mValue.DecimalVal; + return value_.DecimalVal == r.value_.DecimalVal; default: return true; } @@ -269,38 +269,38 @@ namespace orc { } int64_t Literal::getLong() const { - validate(mIsNull, mType, PredicateDataType::LONG); - return mValue.IntVal; + validate(isNull_, type_, PredicateDataType::LONG); + return value_.IntVal; } int64_t Literal::getDate() const { - validate(mIsNull, mType, PredicateDataType::DATE); - return mValue.DateVal; + validate(isNull_, type_, PredicateDataType::DATE); + return value_.DateVal; } Literal::Timestamp Literal::getTimestamp() const { - validate(mIsNull, mType, PredicateDataType::TIMESTAMP); - return mValue.TimeStampVal; + validate(isNull_, type_, PredicateDataType::TIMESTAMP); + return value_.TimeStampVal; } double Literal::getFloat() const { - validate(mIsNull, mType, PredicateDataType::FLOAT); - return mValue.DoubleVal; + validate(isNull_, type_, PredicateDataType::FLOAT); + return value_.DoubleVal; } std::string Literal::getString() const { - validate(mIsNull, mType, PredicateDataType::STRING); - return std::string(mValue.Buffer, mSize); + validate(isNull_, type_, PredicateDataType::STRING); + return std::string(value_.Buffer, size_); } bool Literal::getBool() const { - validate(mIsNull, mType, PredicateDataType::BOOLEAN); - return mValue.BooleanVal; + validate(isNull_, type_, PredicateDataType::BOOLEAN); + return value_.BooleanVal; } Decimal Literal::getDecimal() const { - validate(mIsNull, mType, PredicateDataType::DECIMAL); - return Decimal(mValue.DecimalVal, mScale); + validate(isNull_, type_, PredicateDataType::DECIMAL); + return Decimal(value_.DecimalVal, scale_); } } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc index 3c23e28beb..3ee58bfef5 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc +++ b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc @@ -30,77 +30,77 @@ namespace orc { PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName, Literal literal) - : mOperator(op), mType(type), mColumnName(colName), mHasColumnName(true), mColumnId(0) { - mLiterals.emplace_back(literal); - mHashCode = hashCode(); + : operator_(op), type_(type), columnName_(colName), hasColumnName_(true), columnId_(0) { + literals_.emplace_back(literal); + hashCode_ = hashCode(); validate(); } PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId, Literal literal) - : mOperator(op), mType(type), mHasColumnName(false), mColumnId(columnId) { - mLiterals.emplace_back(literal); - mHashCode = hashCode(); + : operator_(op), type_(type), hasColumnName_(false), columnId_(columnId) { + literals_.emplace_back(literal); + hashCode_ = hashCode(); validate(); } PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName, const std::initializer_list<Literal>& literals) - : mOperator(op), - mType(type), - mColumnName(colName), - mHasColumnName(true), - mLiterals(literals.begin(), literals.end()) { - mHashCode = hashCode(); + : operator_(op), + type_(type), + columnName_(colName), + hasColumnName_(true), + literals_(literals.begin(), literals.end()) { + hashCode_ = hashCode(); validate(); } PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId, const std::initializer_list<Literal>& literals) - : mOperator(op), - mType(type), - mHasColumnName(false), - mColumnId(columnId), - mLiterals(literals.begin(), literals.end()) { - mHashCode = hashCode(); + : operator_(op), + type_(type), + hasColumnName_(false), + columnId_(columnId), + literals_(literals.begin(), literals.end()) { + hashCode_ = hashCode(); validate(); } PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName, const std::vector<Literal>& literals) - : mOperator(op), - mType(type), - mColumnName(colName), - mHasColumnName(true), - mLiterals(literals.begin(), literals.end()) { - mHashCode = hashCode(); + : operator_(op), + type_(type), + columnName_(colName), + hasColumnName_(true), + literals_(literals.begin(), literals.end()) { + hashCode_ = hashCode(); validate(); } PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId, const std::vector<Literal>& literals) - : mOperator(op), - mType(type), - mHasColumnName(false), - mColumnId(columnId), - mLiterals(literals.begin(), literals.end()) { - mHashCode = hashCode(); + : operator_(op), + type_(type), + hasColumnName_(false), + columnId_(columnId), + literals_(literals.begin(), literals.end()) { + hashCode_ = hashCode(); validate(); } void PredicateLeaf::validateColumn() const { - if (mHasColumnName && mColumnName.empty()) { + if (hasColumnName_ && columnName_.empty()) { throw std::invalid_argument("column name should not be empty"); - } else if (!mHasColumnName && mColumnId == INVALID_COLUMN_ID) { + } else if (!hasColumnName_ && columnId_ == INVALID_COLUMN_ID) { throw std::invalid_argument("invalid column id"); } } void PredicateLeaf::validate() const { - switch (mOperator) { + switch (operator_) { case Operator::IS_NULL: validateColumn(); - if (!mLiterals.empty()) { + if (!literals_.empty()) { throw std::invalid_argument("No literal is required!"); } break; @@ -109,28 +109,28 @@ namespace orc { case Operator::LESS_THAN: case Operator::LESS_THAN_EQUALS: validateColumn(); - if (mLiterals.size() != 1) { + if (literals_.size() != 1) { throw std::invalid_argument("One literal is required!"); } - if (static_cast<int>(mLiterals.at(0).getType()) != static_cast<int>(mType)) { + if (static_cast<int>(literals_.at(0).getType()) != static_cast<int>(type_)) { throw std::invalid_argument("leaf and literal types do not match!"); } break; case Operator::IN: validateColumn(); - if (mLiterals.size() < 2) { + if (literals_.size() < 2) { throw std::invalid_argument("At least two literals are required!"); } - for (auto literal : mLiterals) { - if (static_cast<int>(literal.getType()) != static_cast<int>(mType)) { + for (auto literal : literals_) { + if (static_cast<int>(literal.getType()) != static_cast<int>(type_)) { throw std::invalid_argument("leaf and literal types do not match!"); } } break; case Operator::BETWEEN: validateColumn(); - for (auto literal : mLiterals) { - if (static_cast<int>(literal.getType()) != static_cast<int>(mType)) { + for (auto literal : literals_) { + if (static_cast<int>(literal.getType()) != static_cast<int>(type_)) { throw std::invalid_argument("leaf and literal types do not match!"); } } @@ -141,40 +141,40 @@ namespace orc { } PredicateLeaf::Operator PredicateLeaf::getOperator() const { - return mOperator; + return operator_; } PredicateDataType PredicateLeaf::getType() const { - return mType; + return type_; } bool PredicateLeaf::hasColumnName() const { - return mHasColumnName; + return hasColumnName_; } /** * Get the simple column name. */ const std::string& PredicateLeaf::getColumnName() const { - return mColumnName; + return columnName_; } uint64_t PredicateLeaf::getColumnId() const { - return mColumnId; + return columnId_; } /** * Get the literal half of the predicate leaf. */ Literal PredicateLeaf::getLiteral() const { - return mLiterals.at(0); + return literals_.at(0); } /** * For operators with multiple literals (IN and BETWEEN), get the literals. */ const std::vector<Literal>& PredicateLeaf::getLiteralList() const { - return mLiterals; + return literals_; } static std::string getLiteralString(const std::vector<Literal>& literals) { @@ -195,40 +195,40 @@ namespace orc { } std::string PredicateLeaf::columnDebugString() const { - if (mHasColumnName) return mColumnName; + if (hasColumnName_) return columnName_; std::ostringstream sstream; - sstream << "column(id=" << mColumnId << ')'; + sstream << "column(id=" << columnId_ << ')'; return sstream.str(); } std::string PredicateLeaf::toString() const { std::ostringstream sstream; sstream << '('; - switch (mOperator) { + switch (operator_) { case Operator::IS_NULL: sstream << columnDebugString() << " is null"; break; case Operator::EQUALS: - sstream << columnDebugString() << " = " << getLiteralString(mLiterals); + sstream << columnDebugString() << " = " << getLiteralString(literals_); break; case Operator::NULL_SAFE_EQUALS: - sstream << columnDebugString() << " null_safe_= " << getLiteralString(mLiterals); + sstream << columnDebugString() << " null_safe_= " << getLiteralString(literals_); break; case Operator::LESS_THAN: - sstream << columnDebugString() << " < " << getLiteralString(mLiterals); + sstream << columnDebugString() << " < " << getLiteralString(literals_); break; case Operator::LESS_THAN_EQUALS: - sstream << columnDebugString() << " <= " << getLiteralString(mLiterals); + sstream << columnDebugString() << " <= " << getLiteralString(literals_); break; case Operator::IN: - sstream << columnDebugString() << " in " << getLiteralsString(mLiterals); + sstream << columnDebugString() << " in " << getLiteralsString(literals_); break; case Operator::BETWEEN: - sstream << columnDebugString() << " between " << getLiteralsString(mLiterals); + sstream << columnDebugString() << " between " << getLiteralsString(literals_); break; default: sstream << "unknown operator, column: " << columnDebugString() - << ", literals: " << getLiteralsString(mLiterals); + << ", literals: " << getLiteralsString(literals_); } sstream << ')'; return sstream.str(); @@ -236,25 +236,25 @@ namespace orc { size_t PredicateLeaf::hashCode() const { size_t value = 0; - std::for_each(mLiterals.cbegin(), mLiterals.cend(), + std::for_each(literals_.cbegin(), literals_.cend(), [&](const Literal& literal) { value = value * 17 + literal.getHashCode(); }); auto colHash = - mHasColumnName ? std::hash<std::string>{}(mColumnName) : std::hash<uint64_t>{}(mColumnId); - return value * 103 * 101 * 3 * 17 + std::hash<int>{}(static_cast<int>(mOperator)) + - std::hash<int>{}(static_cast<int>(mType)) * 17 + colHash * 3 * 17; + hasColumnName_ ? std::hash<std::string>{}(columnName_) : std::hash<uint64_t>{}(columnId_); + return value * 103 * 101 * 3 * 17 + std::hash<int>{}(static_cast<int>(operator_)) + + std::hash<int>{}(static_cast<int>(type_)) * 17 + colHash * 3 * 17; } bool PredicateLeaf::operator==(const PredicateLeaf& r) const { if (this == &r) { return true; } - if (mHashCode != r.mHashCode || mType != r.mType || mOperator != r.mOperator || - mHasColumnName != r.mHasColumnName || mColumnName != r.mColumnName || - mColumnId != r.mColumnId || mLiterals.size() != r.mLiterals.size()) { + if (hashCode_ != r.hashCode_ || type_ != r.type_ || operator_ != r.operator_ || + hasColumnName_ != r.hasColumnName_ || columnName_ != r.columnName_ || + columnId_ != r.columnId_ || literals_.size() != r.literals_.size()) { return false; } - for (size_t i = 0; i != mLiterals.size(); ++i) { - if (mLiterals[i] != r.mLiterals[i]) { + for (size_t i = 0; i != literals_.size(); ++i) { + if (literals_[i] != r.literals_[i]) { return false; } } @@ -507,12 +507,12 @@ namespace orc { TruthValue PredicateLeaf::evaluatePredicateMinMax(const proto::ColumnStatistics& colStats) const { TruthValue result = TruthValue::YES_NO_NULL; - switch (mType) { + switch (type_) { case PredicateDataType::LONG: { if (colStats.has_int_statistics() && colStats.int_statistics().has_minimum() && colStats.int_statistics().has_maximum()) { const auto& stats = colStats.int_statistics(); - result = evaluatePredicateRange(mOperator, literal2Long(mLiterals), stats.minimum(), + result = evaluatePredicateRange(operator_, literal2Long(literals_), stats.minimum(), stats.maximum(), colStats.has_null()); } break; @@ -524,7 +524,7 @@ namespace orc { if (!std::isfinite(stats.sum())) { result = colStats.has_null() ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; } else { - result = evaluatePredicateRange(mOperator, literal2Double(mLiterals), stats.minimum(), + result = evaluatePredicateRange(operator_, literal2Double(literals_), stats.minimum(), stats.maximum(), colStats.has_null()); } } @@ -535,7 +535,7 @@ namespace orc { if (colStats.has_string_statistics() && colStats.string_statistics().has_minimum() && colStats.string_statistics().has_maximum()) { const auto& stats = colStats.string_statistics(); - result = evaluatePredicateRange(mOperator, literal2String(mLiterals), stats.minimum(), + result = evaluatePredicateRange(operator_, literal2String(literals_), stats.minimum(), stats.maximum(), colStats.has_null()); } break; @@ -544,7 +544,7 @@ namespace orc { if (colStats.has_date_statistics() && colStats.date_statistics().has_minimum() && colStats.date_statistics().has_maximum()) { const auto& stats = colStats.date_statistics(); - result = evaluatePredicateRange(mOperator, literal2Date(mLiterals), stats.minimum(), + result = evaluatePredicateRange(operator_, literal2Date(literals_), stats.minimum(), stats.maximum(), colStats.has_null()); } break; @@ -566,7 +566,7 @@ namespace orc { Literal::Timestamp maxTimestamp( stats.maximum_utc() / 1000, static_cast<int32_t>((stats.maximum_utc() % 1000) * 1000000) + maxNano); - result = evaluatePredicateRange(mOperator, literal2Timestamp(mLiterals), minTimestamp, + result = evaluatePredicateRange(operator_, literal2Timestamp(literals_), minTimestamp, maxTimestamp, colStats.has_null()); } break; @@ -575,7 +575,7 @@ namespace orc { if (colStats.has_decimal_statistics() && colStats.decimal_statistics().has_minimum() && colStats.decimal_statistics().has_maximum()) { const auto& stats = colStats.decimal_statistics(); - result = evaluatePredicateRange(mOperator, literal2Decimal(mLiterals), + result = evaluatePredicateRange(operator_, literal2Decimal(literals_), Decimal(stats.minimum()), Decimal(stats.maximum()), colStats.has_null()); } @@ -583,7 +583,7 @@ namespace orc { } case PredicateDataType::BOOLEAN: { if (colStats.has_bucket_statistics()) { - result = evaluateBoolPredicate(mOperator, mLiterals, colStats); + result = evaluateBoolPredicate(operator_, literals_, colStats); } break; } @@ -592,8 +592,8 @@ namespace orc { } // make sure null literal is respected for IN operator - if (mOperator == Operator::IN && colStats.has_null()) { - for (const auto& literal : mLiterals) { + if (operator_ == Operator::IN && colStats.has_null()) { + for (const auto& literal : literals_) { if (literal.isNull()) { result = TruthValue::YES_NO_NULL; break; @@ -664,18 +664,18 @@ namespace orc { } TruthValue PredicateLeaf::evaluatePredicateBloomFiter(const BloomFilter* bf, bool hasNull) const { - switch (mOperator) { + switch (operator_) { case Operator::NULL_SAFE_EQUALS: // null safe equals does not return *_NULL variant. // So set hasNull to false - return checkInBloomFilter(mOperator, mType, mLiterals.front(), bf, false); + return checkInBloomFilter(operator_, type_, literals_.front(), bf, false); case Operator::EQUALS: - return checkInBloomFilter(mOperator, mType, mLiterals.front(), bf, hasNull); + return checkInBloomFilter(operator_, type_, literals_.front(), bf, hasNull); case Operator::IN: - for (const auto& literal : mLiterals) { + for (const auto& literal : literals_) { // if at least one value in IN list exist in bloom filter, // qualify the row group/stripe - TruthValue result = checkInBloomFilter(mOperator, mType, literal, bf, hasNull); + TruthValue result = checkInBloomFilter(operator_, type_, literal, bf, hasNull); if (result == TruthValue::YES_NO_NULL || result == TruthValue::YES_NO) { return result; } @@ -695,7 +695,7 @@ namespace orc { const BloomFilter* bloomFilter) const { // files written before ORC-135 stores timestamp wrt to local timezone // causing issues with PPD. disable PPD for timestamp for all old files - if (mType == PredicateDataType::TIMESTAMP) { + if (type_ == PredicateDataType::TIMESTAMP) { if (writerVersion < WriterVersion::WriterVersion_ORC_135) { return TruthValue::YES_NO_NULL; } @@ -705,9 +705,9 @@ namespace orc { if (!colStats.has_has_null()) return TruthValue::YES_NO_NULL; bool allNull = colStats.has_null() && colStats.number_of_values() == 0; - if (mOperator == Operator::IS_NULL || - ((mOperator == Operator::EQUALS || mOperator == Operator::NULL_SAFE_EQUALS) && - mLiterals.at(0).isNull())) { + if (operator_ == Operator::IS_NULL || + ((operator_ == Operator::EQUALS || operator_ == Operator::NULL_SAFE_EQUALS) && + literals_.at(0).isNull())) { // IS_NULL operator does not need to check min/max stats and bloom filter return allNull ? TruthValue::YES : (colStats.has_null() ? TruthValue::YES_NO : TruthValue::NO); @@ -717,7 +717,7 @@ namespace orc { } TruthValue result = evaluatePredicateMinMax(colStats); - if (shouldEvaluateBloomFilter(mOperator, result, bloomFilter)) { + if (shouldEvaluateBloomFilter(operator_, result, bloomFilter)) { return evaluatePredicateBloomFiter(bloomFilter, colStats.has_null()); } else { return result; diff --git a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh index 21ed456155..81fd6d98b7 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh +++ b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh @@ -127,7 +127,7 @@ namespace orc { bool operator==(const PredicateLeaf& r) const; size_t getHashCode() const { - return mHashCode; + return hashCode_; } private: @@ -143,13 +143,13 @@ namespace orc { TruthValue evaluatePredicateBloomFiter(const BloomFilter* bloomFilter, bool hasNull) const; private: - Operator mOperator; - PredicateDataType mType; - std::string mColumnName; - bool mHasColumnName; - uint64_t mColumnId; - std::vector<Literal> mLiterals; - size_t mHashCode; + Operator operator_; + PredicateDataType type_; + std::string columnName_; + bool hasColumnName_; + uint64_t columnId_; + std::vector<Literal> literals_; + size_t hashCode_; }; struct PredicateLeafHash { diff --git a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc index 0e369bf453..b3085964d4 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc +++ b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc @@ -40,24 +40,24 @@ namespace orc { SargsApplier::SargsApplier(const Type& type, const SearchArgument* searchArgument, uint64_t rowIndexStride, WriterVersion writerVersion, ReaderMetrics* metrics, const SchemaEvolution* schemaEvolution) - : mType(type), - mSearchArgument(searchArgument), - mSchemaEvolution(schemaEvolution), - mRowIndexStride(rowIndexStride), - mWriterVersion(writerVersion), - mHasEvaluatedFileStats(false), - mFileStatsEvalResult(true), - mMetrics(metrics) { - const SearchArgumentImpl* sargs = dynamic_cast<const SearchArgumentImpl*>(mSearchArgument); + : type_(type), + searchArgument_(searchArgument), + schemaEvolution_(schemaEvolution), + rowIndexStride_(rowIndexStride), + writerVersion_(writerVersion), + hasEvaluatedFileStats_(false), + fileStatsEvalResult_(true), + metrics_(metrics) { + const SearchArgumentImpl* sargs = dynamic_cast<const SearchArgumentImpl*>(searchArgument_); // find the mapping from predicate leaves to columns const std::vector<PredicateLeaf>& leaves = sargs->getLeaves(); - mFilterColumns.resize(leaves.size(), INVALID_COLUMN_ID); - for (size_t i = 0; i != mFilterColumns.size(); ++i) { + filterColumns_.resize(leaves.size(), INVALID_COLUMN_ID); + for (size_t i = 0; i != filterColumns_.size(); ++i) { if (leaves[i].hasColumnName()) { - mFilterColumns[i] = findColumn(type, leaves[i].getColumnName()); + filterColumns_[i] = findColumn(type, leaves[i].getColumnName()); } else { - mFilterColumns[i] = leaves[i].getColumnId(); + filterColumns_[i] = leaves[i].getColumnId(); } } } @@ -66,30 +66,30 @@ namespace orc { const std::unordered_map<uint64_t, proto::RowIndex>& rowIndexes, const std::map<uint32_t, BloomFilterIndex>& bloomFilters) { // init state of each row group - uint64_t groupsInStripe = (rowsInStripe + mRowIndexStride - 1) / mRowIndexStride; - mNextSkippedRows.resize(groupsInStripe); - mTotalRowsInStripe = rowsInStripe; + uint64_t groupsInStripe = (rowsInStripe + rowIndexStride_ - 1) / rowIndexStride_; + nextSkippedRows_.resize(groupsInStripe); + totalRowsInStripe_ = rowsInStripe; // row indexes do not exist, simply read all rows if (rowIndexes.empty()) { return true; } - const auto& leaves = dynamic_cast<const SearchArgumentImpl*>(mSearchArgument)->getLeaves(); + const auto& leaves = dynamic_cast<const SearchArgumentImpl*>(searchArgument_)->getLeaves(); std::vector<TruthValue> leafValues(leaves.size(), TruthValue::YES_NO_NULL); - mHasSelected = false; - mHasSkipped = false; + hasSelected_ = false; + hasSkipped_ = false; uint64_t nextSkippedRowGroup = groupsInStripe; size_t rowGroup = groupsInStripe; do { --rowGroup; for (size_t pred = 0; pred != leaves.size(); ++pred) { - uint64_t columnIdx = mFilterColumns[pred]; + uint64_t columnIdx = filterColumns_[pred]; auto rowIndexIter = rowIndexes.find(columnIdx); if (columnIdx == INVALID_COLUMN_ID || rowIndexIter == rowIndexes.cend()) { // this column does not exist in current file leafValues[pred] = TruthValue::YES_NO_NULL; - } else if (mSchemaEvolution && !mSchemaEvolution->isSafePPDConversion(columnIdx)) { + } else if (schemaEvolution_ && !schemaEvolution_->isSafePPDConversion(columnIdx)) { // cannot evaluate predicate when ppd is not safe leafValues[pred] = TruthValue::YES_NO_NULL; } else { @@ -104,37 +104,37 @@ namespace orc { bloomFilter = iter->second.entries.at(rowGroup); } - leafValues[pred] = leaves[pred].evaluate(mWriterVersion, statistics, bloomFilter.get()); + leafValues[pred] = leaves[pred].evaluate(writerVersion_, statistics, bloomFilter.get()); } } - bool needed = isNeeded(mSearchArgument->evaluate(leafValues)); + bool needed = isNeeded(searchArgument_->evaluate(leafValues)); if (!needed) { - mNextSkippedRows[rowGroup] = 0; + nextSkippedRows_[rowGroup] = 0; nextSkippedRowGroup = rowGroup; } else { - mNextSkippedRows[rowGroup] = (nextSkippedRowGroup == groupsInStripe) + nextSkippedRows_[rowGroup] = (nextSkippedRowGroup == groupsInStripe) ? rowsInStripe - : (nextSkippedRowGroup * mRowIndexStride); + : (nextSkippedRowGroup * rowIndexStride_); } - mHasSelected |= needed; - mHasSkipped |= !needed; + hasSelected_ |= needed; + hasSkipped_ |= !needed; } while (rowGroup != 0); // update stats uint64_t selectedRGs = std::accumulate( - mNextSkippedRows.cbegin(), mNextSkippedRows.cend(), 0UL, + nextSkippedRows_.cbegin(), nextSkippedRows_.cend(), 0UL, [](uint64_t initVal, uint64_t rg) { return rg > 0 ? initVal + 1 : initVal; }); - if (mMetrics != nullptr) { - mMetrics->SelectedRowGroupCount.fetch_add(selectedRGs); - mMetrics->EvaluatedRowGroupCount.fetch_add(groupsInStripe); + if (metrics_ != nullptr) { + metrics_->SelectedRowGroupCount.fetch_add(selectedRGs); + metrics_->EvaluatedRowGroupCount.fetch_add(groupsInStripe); } - return mHasSelected; + return hasSelected_; } bool SargsApplier::evaluateColumnStatistics(const PbColumnStatistics& colStats) const { - const SearchArgumentImpl* sargs = dynamic_cast<const SearchArgumentImpl*>(mSearchArgument); + const SearchArgumentImpl* sargs = dynamic_cast<const SearchArgumentImpl*>(searchArgument_); if (sargs == nullptr) { throw InvalidArgument("Failed to cast to SearchArgumentImpl"); } @@ -143,14 +143,14 @@ namespace orc { std::vector<TruthValue> leafValues(leaves.size(), TruthValue::YES_NO_NULL); for (size_t pred = 0; pred != leaves.size(); ++pred) { - uint64_t columnId = mFilterColumns[pred]; + uint64_t columnId = filterColumns_[pred]; if (columnId != INVALID_COLUMN_ID && colStats.size() > static_cast<int>(columnId)) { - leafValues[pred] = leaves[pred].evaluate(mWriterVersion, + leafValues[pred] = leaves[pred].evaluate(writerVersion_, colStats.Get(static_cast<int>(columnId)), nullptr); } } - return isNeeded(mSearchArgument->evaluate(leafValues)); + return isNeeded(searchArgument_->evaluate(leafValues)); } bool SargsApplier::evaluateStripeStatistics(const proto::StripeStatistics& stripeStats, @@ -160,29 +160,29 @@ namespace orc { } bool ret = evaluateColumnStatistics(stripeStats.col_stats()); - if (mMetrics != nullptr) { - mMetrics->EvaluatedRowGroupCount.fetch_add(stripeRowGroupCount); + if (metrics_ != nullptr) { + metrics_->EvaluatedRowGroupCount.fetch_add(stripeRowGroupCount); } if (!ret) { // reset mNextSkippedRows when the current stripe does not satisfy the PPD - mNextSkippedRows.clear(); + nextSkippedRows_.clear(); } return ret; } bool SargsApplier::evaluateFileStatistics(const proto::Footer& footer, uint64_t numRowGroupsInStripeRange) { - if (!mHasEvaluatedFileStats) { + if (!hasEvaluatedFileStats_) { if (footer.statistics_size() == 0) { - mFileStatsEvalResult = true; + fileStatsEvalResult_ = true; } else { - mFileStatsEvalResult = evaluateColumnStatistics(footer.statistics()); - if (mMetrics != nullptr) { - mMetrics->EvaluatedRowGroupCount.fetch_add(numRowGroupsInStripeRange); + fileStatsEvalResult_ = evaluateColumnStatistics(footer.statistics()); + if (metrics_ != nullptr) { + metrics_->EvaluatedRowGroupCount.fetch_add(numRowGroupsInStripeRange); } } - mHasEvaluatedFileStats = true; + hasEvaluatedFileStats_ = true; } - return mFileStatsEvalResult; + return fileStatsEvalResult_; } } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh index 73703dcf6b..65c8dec83b 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh +++ b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh @@ -75,30 +75,30 @@ namespace orc { * Only valid after invoking pickRowGroups(). */ const std::vector<uint64_t>& getNextSkippedRows() const { - return mNextSkippedRows; + return nextSkippedRows_; } /** * Indicate whether any row group is selected in the last evaluation */ bool hasSelected() const { - return mHasSelected; + return hasSelected_; } /** * Indicate whether any row group is skipped in the last evaluation */ bool hasSkipped() const { - return mHasSkipped; + return hasSkipped_; } /** * Whether any row group from current row in the stripe matches PPD. */ bool hasSelectedFrom(uint64_t currentRowInStripe) const { - uint64_t rg = currentRowInStripe / mRowIndexStride; - for (; rg < mNextSkippedRows.size(); ++rg) { - if (mNextSkippedRows[rg]) { + uint64_t rg = currentRowInStripe / rowIndexStride_; + for (; rg < nextSkippedRows_.size(); ++rg) { + if (nextSkippedRows_[rg]) { return true; } } @@ -106,9 +106,9 @@ namespace orc { } std::pair<uint64_t, uint64_t> getStats() const { - if (mMetrics != nullptr) { - return std::make_pair(mMetrics->SelectedRowGroupCount.load(), - mMetrics->EvaluatedRowGroupCount.load()); + if (metrics_ != nullptr) { + return std::make_pair(metrics_->SelectedRowGroupCount.load(), + metrics_->EvaluatedRowGroupCount.load()); } else { return {0, 0}; } @@ -125,27 +125,27 @@ namespace orc { static uint64_t findColumn(const Type& type, const std::string& colName); private: - const Type& mType; - const SearchArgument* mSearchArgument; - const SchemaEvolution* mSchemaEvolution; - uint64_t mRowIndexStride; - WriterVersion mWriterVersion; + const Type& type_; + const SearchArgument* searchArgument_; + const SchemaEvolution* schemaEvolution_; + uint64_t rowIndexStride_; + WriterVersion writerVersion_; // column ids for each predicate leaf in the search argument - std::vector<uint64_t> mFilterColumns; + std::vector<uint64_t> filterColumns_; // Map from RowGroup index to the next skipped row of the selected range it // locates. If the RowGroup is not selected, set the value to 0. // Calculated in pickRowGroups(). - std::vector<uint64_t> mNextSkippedRows; - uint64_t mTotalRowsInStripe; - bool mHasSelected; - bool mHasSkipped; + std::vector<uint64_t> nextSkippedRows_; + uint64_t totalRowsInStripe_; + bool hasSelected_; + bool hasSkipped_; // store result of file stats evaluation - bool mHasEvaluatedFileStats; - bool mFileStatsEvalResult; + bool hasEvaluatedFileStats_; + bool fileStatsEvalResult_; // use the SelectedRowGroupCount and EvaluatedRowGroupCount to // keep stats of selected RGs and evaluated RGs - ReaderMetrics* mMetrics; + ReaderMetrics* metrics_; }; } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc index 806727f0a0..83d4af2435 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc +++ b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc @@ -30,23 +30,23 @@ namespace orc { } const std::vector<PredicateLeaf>& SearchArgumentImpl::getLeaves() const { - return mLeaves; + return leaves_; } const ExpressionTree* SearchArgumentImpl::getExpression() const { - return mExpressionTree.get(); + return expressionTree_.get(); } TruthValue SearchArgumentImpl::evaluate(const std::vector<TruthValue>& leaves) const { - return mExpressionTree == nullptr ? TruthValue::YES : mExpressionTree->evaluate(leaves); + return expressionTree_ == nullptr ? TruthValue::YES : expressionTree_->evaluate(leaves); } std::string SearchArgumentImpl::toString() const { std::ostringstream sstream; - for (size_t i = 0; i != mLeaves.size(); ++i) { - sstream << "leaf-" << i << " = " << mLeaves.at(i).toString() << ", "; + for (size_t i = 0; i != leaves_.size(); ++i) { + sstream << "leaf-" << i << " = " << leaves_.at(i).toString() << ", "; } - sstream << "expr = " << mExpressionTree->toString(); + sstream << "expr = " << expressionTree_->toString(); return sstream.str(); } @@ -55,14 +55,14 @@ namespace orc { } SearchArgumentBuilderImpl::SearchArgumentBuilderImpl() { - mRoot.reset(new ExpressionTree(ExpressionTree::Operator::AND)); - mCurrTree.push_back(mRoot); + root_.reset(new ExpressionTree(ExpressionTree::Operator::AND)); + currTree_.push_back(root_); } SearchArgumentBuilder& SearchArgumentBuilderImpl::start(ExpressionTree::Operator op) { TreeNode node = std::make_shared<ExpressionTree>(op); - mCurrTree.front()->addChild(node); - mCurrTree.push_front(node); + currTree_.front()->addChild(node); + currTree_.push_front(node); return *this; } @@ -79,9 +79,9 @@ namespace orc { } SearchArgumentBuilder& SearchArgumentBuilderImpl::end() { - TreeNode& current = mCurrTree.front(); + TreeNode& current = currTree_.front(); if (current->getChildren().empty()) { - throw std::invalid_argument("Cannot create expression " + mRoot->toString() + + throw std::invalid_argument("Cannot create expression " + root_->toString() + " with no children."); } if (current->getOperator() == ExpressionTree::Operator::NOT && @@ -89,13 +89,13 @@ namespace orc { throw std::invalid_argument("Can't create NOT expression " + current->toString() + " with more than 1 child."); } - mCurrTree.pop_front(); + currTree_.pop_front(); return *this; } size_t SearchArgumentBuilderImpl::addLeaf(PredicateLeaf leaf) { - size_t id = mLeaves.size(); - const auto& result = mLeaves.insert(std::make_pair(leaf, id)); + size_t id = leaves_.size(); + const auto& result = leaves_.insert(std::make_pair(leaf, id)); return result.first->second; } @@ -112,7 +112,7 @@ namespace orc { T column, PredicateDataType type, Literal literal) { - TreeNode parent = mCurrTree.front(); + TreeNode parent = currTree_.front(); if (isInvalidColumn(column)) { parent->addChild(std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL)); } else { @@ -181,7 +181,7 @@ namespace orc { template <typename T, typename CONTAINER> SearchArgumentBuilder& SearchArgumentBuilderImpl::addChildForIn(T column, PredicateDataType type, const CONTAINER& literals) { - TreeNode& parent = mCurrTree.front(); + TreeNode& parent = currTree_.front(); if (isInvalidColumn(column)) { parent->addChild(std::make_shared<ExpressionTree>((TruthValue::YES_NO_NULL))); } else { @@ -219,7 +219,7 @@ namespace orc { template <typename T> SearchArgumentBuilder& SearchArgumentBuilderImpl::addChildForIsNull(T column, PredicateDataType type) { - TreeNode& parent = mCurrTree.front(); + TreeNode& parent = currTree_.front(); if (isInvalidColumn(column)) { parent->addChild(std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL)); } else { @@ -244,7 +244,7 @@ namespace orc { PredicateDataType type, Literal lower, Literal upper) { - TreeNode& parent = mCurrTree.front(); + TreeNode& parent = currTree_.front(); if (isInvalidColumn(column)) { parent->addChild(std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL)); } else { @@ -267,7 +267,7 @@ namespace orc { } SearchArgumentBuilder& SearchArgumentBuilderImpl::literal(TruthValue truth) { - TreeNode& parent = mCurrTree.front(); + TreeNode& parent = currTree_.front(); parent->addChild(std::make_shared<ExpressionTree>(truth)); return *this; } @@ -555,34 +555,34 @@ namespace orc { } SearchArgumentImpl::SearchArgumentImpl(TreeNode root, const std::vector<PredicateLeaf>& leaves) - : mExpressionTree(root), mLeaves(leaves) { + : expressionTree_(root), leaves_(leaves) { // PASS } std::unique_ptr<SearchArgument> SearchArgumentBuilderImpl::build() { - if (mCurrTree.size() != 1) { - throw std::invalid_argument("Failed to end " + std::to_string(mCurrTree.size()) + + if (currTree_.size() != 1) { + throw std::invalid_argument("Failed to end " + std::to_string(currTree_.size()) + " operations."); } - mRoot = pushDownNot(mRoot); - mRoot = foldMaybe(mRoot); - mRoot = flatten(mRoot); - mRoot = convertToCNF(mRoot); - mRoot = flatten(mRoot); - std::vector<size_t> leafReorder(mLeaves.size(), UNUSED_LEAF); - size_t newLeafCount = compactLeaves(mRoot, 0, leafReorder.data()); - mRoot = rewriteLeaves(mRoot, leafReorder.data()); + root_ = pushDownNot(root_); + root_ = foldMaybe(root_); + root_ = flatten(root_); + root_ = convertToCNF(root_); + root_ = flatten(root_); + std::vector<size_t> leafReorder(leaves_.size(), UNUSED_LEAF); + size_t newLeafCount = compactLeaves(root_, 0, leafReorder.data()); + root_ = rewriteLeaves(root_, leafReorder.data()); std::vector<PredicateLeaf> leafList(newLeafCount, PredicateLeaf()); // build the new list - for (auto& leaf : mLeaves) { + for (auto& leaf : leaves_) { size_t newLoc = leafReorder[leaf.second]; if (newLoc != UNUSED_LEAF) { leafList[newLoc] = leaf.first; } } - return std::make_unique<SearchArgumentImpl>(mRoot, leafList); + return std::make_unique<SearchArgumentImpl>(root_, leafList); } std::unique_ptr<SearchArgumentBuilder> SearchArgumentFactory::newBuilder() { diff --git a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh index 4b74b28743..1963c993d6 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh +++ b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh @@ -66,8 +66,8 @@ namespace orc { std::string toString() const override; private: - std::shared_ptr<ExpressionTree> mExpressionTree; - std::vector<PredicateLeaf> mLeaves; + std::shared_ptr<ExpressionTree> expressionTree_; + std::vector<PredicateLeaf> leaves_; }; /** @@ -304,9 +304,9 @@ namespace orc { static TreeNode convertToCNF(TreeNode root); private: - std::deque<TreeNode> mCurrTree; - std::unordered_map<PredicateLeaf, size_t, PredicateLeafHash, PredicateLeafComparator> mLeaves; - std::shared_ptr<ExpressionTree> mRoot; + std::deque<TreeNode> currTree_; + std::unordered_map<PredicateLeaf, size_t, PredicateLeafHash, PredicateLeafComparator> leaves_; + std::shared_ptr<ExpressionTree> root_; }; } // namespace orc diff --git a/contrib/libs/apache/orc/ya.make b/contrib/libs/apache/orc/ya.make index 12617d59ab..b757fec915 100644 --- a/contrib/libs/apache/orc/ya.make +++ b/contrib/libs/apache/orc/ya.make @@ -6,9 +6,9 @@ LICENSE(Apache-2.0) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) -VERSION(2.0.3) +VERSION(2.1.0) -ORIGINAL_SOURCE(https://github.com/apache/orc/archive/rel/release-2.0.3.tar.gz) +ORIGINAL_SOURCE(https://github.com/apache/orc/archive/rel/release-2.1.0.tar.gz) PEERDIR( contrib/libs/apache/orc-format @@ -65,6 +65,7 @@ SRCS( c++/src/TypeImpl.cc c++/src/Vector.cc c++/src/Writer.cc + c++/src/io/Cache.cc c++/src/io/InputStream.cc c++/src/io/OutputStream.cc c++/src/sargs/ExpressionTree.cc |