aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs
diff options
context:
space:
mode:
authorrobot-contrib <robot-contrib@yandex-team.com>2025-01-14 13:24:41 +0300
committerrobot-contrib <robot-contrib@yandex-team.com>2025-01-14 13:52:04 +0300
commit9e771f1b1c96aedbfdd5ac897a61aa7af1fb1684 (patch)
treef68905ba70bf0ac0db3f6b06edc1034395def40a /contrib/libs
parentd04cf8fc2232c749af6ad9ffc0a8d235627db0aa (diff)
downloadydb-9e771f1b1c96aedbfdd5ac897a61aa7af1fb1684.tar.gz
Update contrib/libs/apache/orc to 2.1.0
commit_hash:69caf27dc9a3b69957ea34c11fa5f7f2d2f6360a
Diffstat (limited to 'contrib/libs')
-rw-r--r--contrib/libs/apache/orc/.yandex_meta/devtools.licenses.report4
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh8
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Common.hh12
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Exceptions.hh28
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Int128.hh134
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh26
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/OrcFile.hh13
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Reader.hh63
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Vector.hh26
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Writer.hh28
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/orc-config.hh2
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh22
-rw-r--r--contrib/libs/apache/orc/c++/src/Adaptor-linux.hh1
-rw-r--r--contrib/libs/apache/orc/c++/src/BlockBuffer.cc50
-rw-r--r--contrib/libs/apache/orc/c++/src/BlockBuffer.hh20
-rw-r--r--contrib/libs/apache/orc/c++/src/BloomFilter.cc80
-rw-r--r--contrib/libs/apache/orc/c++/src/BloomFilter.hh8
-rw-r--r--contrib/libs/apache/orc/c++/src/BpackingDefault.cc154
-rw-r--r--contrib/libs/apache/orc/c++/src/BpackingDefault.hh2
-rw-r--r--contrib/libs/apache/orc/c++/src/ByteRLE.cc70
-rw-r--r--contrib/libs/apache/orc/c++/src/ByteRLE.hh7
-rw-r--r--contrib/libs/apache/orc/c++/src/ColumnPrinter.cc326
-rw-r--r--contrib/libs/apache/orc/c++/src/ColumnReader.cc403
-rw-r--r--contrib/libs/apache/orc/c++/src/ColumnWriter.cc740
-rw-r--r--contrib/libs/apache/orc/c++/src/ColumnWriter.hh18
-rw-r--r--contrib/libs/apache/orc/c++/src/Common.cc4
-rw-r--r--contrib/libs/apache/orc/c++/src/Compression.cc449
-rw-r--r--contrib/libs/apache/orc/c++/src/Compression.hh15
-rw-r--r--contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc811
-rw-r--r--contrib/libs/apache/orc/c++/src/CpuInfoUtil.cc161
-rw-r--r--contrib/libs/apache/orc/c++/src/Exceptions.cc32
-rw-r--r--contrib/libs/apache/orc/c++/src/Int128.cc50
-rw-r--r--contrib/libs/apache/orc/c++/src/MemoryPool.cc162
-rw-r--r--contrib/libs/apache/orc/c++/src/Options.hh136
-rw-r--r--contrib/libs/apache/orc/c++/src/OrcFile.cc90
-rw-r--r--contrib/libs/apache/orc/c++/src/RLE.cc16
-rw-r--r--contrib/libs/apache/orc/c++/src/RLE.hh9
-rw-r--r--contrib/libs/apache/orc/c++/src/RLEv1.cc127
-rw-r--r--contrib/libs/apache/orc/c++/src/RLEv1.hh24
-rw-r--r--contrib/libs/apache/orc/c++/src/RLEv2.hh82
-rw-r--r--contrib/libs/apache/orc/c++/src/Reader.cc796
-rw-r--r--contrib/libs/apache/orc/c++/src/Reader.hh131
-rw-r--r--contrib/libs/apache/orc/c++/src/RleDecoderV2.cc174
-rw-r--r--contrib/libs/apache/orc/c++/src/RleEncoderV2.cc191
-rw-r--r--contrib/libs/apache/orc/c++/src/SchemaEvolution.cc69
-rw-r--r--contrib/libs/apache/orc/c++/src/SchemaEvolution.hh8
-rw-r--r--contrib/libs/apache/orc/c++/src/Statistics.cc208
-rw-r--r--contrib/libs/apache/orc/c++/src/Statistics.hh800
-rw-r--r--contrib/libs/apache/orc/c++/src/StripeStream.cc116
-rw-r--r--contrib/libs/apache/orc/c++/src/StripeStream.hh115
-rw-r--r--contrib/libs/apache/orc/c++/src/Timezone.cc284
-rw-r--r--contrib/libs/apache/orc/c++/src/TypeImpl.cc159
-rw-r--r--contrib/libs/apache/orc/c++/src/TypeImpl.hh22
-rw-r--r--contrib/libs/apache/orc/c++/src/Utils.hh102
-rw-r--r--contrib/libs/apache/orc/c++/src/Vector.cc63
-rw-r--r--contrib/libs/apache/orc/c++/src/Writer.cc362
-rw-r--r--contrib/libs/apache/orc/c++/src/io/Cache.cc171
-rw-r--r--contrib/libs/apache/orc/c++/src/io/Cache.hh122
-rw-r--r--contrib/libs/apache/orc/c++/src/io/InputStream.cc110
-rw-r--r--contrib/libs/apache/orc/c++/src/io/InputStream.hh30
-rw-r--r--contrib/libs/apache/orc/c++/src/io/OutputStream.cc82
-rw-r--r--contrib/libs/apache/orc/c++/src/io/OutputStream.hh27
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc72
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh8
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/Literal.cc268
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc174
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh16
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc94
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh44
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc66
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh10
-rw-r--r--contrib/libs/apache/orc/ya.make5
72 files changed, 5458 insertions, 3854 deletions
diff --git a/contrib/libs/apache/orc/.yandex_meta/devtools.licenses.report b/contrib/libs/apache/orc/.yandex_meta/devtools.licenses.report
index bd6f063606..a2e9c7ccd9 100644
--- a/contrib/libs/apache/orc/.yandex_meta/devtools.licenses.report
+++ b/contrib/libs/apache/orc/.yandex_meta/devtools.licenses.report
@@ -31,7 +31,7 @@
KEEP Apache-2.0 44dc743c95835a9e71d7b3cca63dcc7c
BELONGS ya.make
-FILE_INCLUDE NOTICE found in files: c++/include/orc/BloomFilter.hh at line 3, c++/include/orc/ColumnPrinter.hh at line 3, c++/include/orc/Common.hh at line 3, c++/include/orc/Exceptions.hh at line 3, c++/include/orc/Int128.hh at line 3, c++/include/orc/MemoryPool.hh at line 3, c++/include/orc/OrcFile.hh at line 3, c++/include/orc/Reader.hh at line 3, c++/include/orc/Statistics.hh at line 3, c++/include/orc/Type.hh at line 3, c++/include/orc/Vector.hh at line 3, c++/include/orc/Writer.hh at line 3, c++/include/orc/orc-config.hh at line 3, c++/include/orc/sargs/Literal.hh at line 3, c++/include/orc/sargs/SearchArgument.hh at line 3, c++/include/orc/sargs/TruthValue.hh at line 3, c++/src/Adaptor-linux.hh at line 3, c++/src/Adaptor.cc at line 3, c++/src/BlockBuffer.cc at line 3, c++/src/BlockBuffer.hh at line 3, c++/src/BloomFilter.cc at line 3, c++/src/BloomFilter.hh at line 3, c++/src/Bpacking.hh at line 3, c++/src/BpackingDefault.cc at line 3, c++/src/BpackingDefault.hh at line 3, c++/src/ByteRLE.cc at line 3, c++/src/ByteRLE.hh at line 3, c++/src/ColumnPrinter.cc at line 3, c++/src/ColumnReader.cc at line 3, c++/src/ColumnReader.hh at line 3, c++/src/ColumnWriter.cc at line 3, c++/src/ColumnWriter.hh at line 3, c++/src/Common.cc at line 3, c++/src/Compression.cc at line 3, c++/src/Compression.hh at line 3, c++/src/ConvertColumnReader.cc at line 3, c++/src/ConvertColumnReader.hh at line 3, c++/src/CpuInfoUtil.cc at line 3, c++/src/CpuInfoUtil.hh at line 3, c++/src/Dispatch.hh at line 3, c++/src/Exceptions.cc at line 3, c++/src/Int128.cc at line 3, c++/src/LzoDecompressor.hh at line 3, c++/src/MemoryPool.cc at line 3, c++/src/Murmur3.cc at line 3, c++/src/Murmur3.hh at line 3, c++/src/Options.hh at line 3, c++/src/OrcFile.cc at line 3, c++/src/RLE.cc at line 3, c++/src/RLE.hh at line 3, c++/src/RLEV2Util.hh at line 3, c++/src/RLEv1.cc at line 3, c++/src/RLEv1.hh at line 3, c++/src/RLEv2.hh at line 3, c++/src/Reader.cc at line 3, c++/src/Reader.hh at line 3, 
c++/src/RleDecoderV2.cc at line 3, c++/src/SchemaEvolution.cc at line 3, c++/src/SchemaEvolution.hh at line 3, c++/src/Statistics.cc at line 3, c++/src/Statistics.hh at line 3, c++/src/StripeStream.cc at line 3, c++/src/StripeStream.hh at line 3, c++/src/Timezone.cc at line 3, c++/src/Timezone.hh at line 3, c++/src/TypeImpl.cc at line 3, c++/src/TypeImpl.hh at line 3, c++/src/Utils.hh at line 3, c++/src/Vector.cc at line 3, c++/src/Writer.cc at line 3, c++/src/io/InputStream.cc at line 3, c++/src/io/InputStream.hh at line 3, c++/src/io/OutputStream.cc at line 3, c++/src/io/OutputStream.hh at line 3, c++/src/sargs/ExpressionTree.cc at line 3, c++/src/sargs/ExpressionTree.hh at line 3, c++/src/sargs/Literal.cc at line 3, c++/src/sargs/PredicateLeaf.cc at line 3, c++/src/sargs/PredicateLeaf.hh at line 3, c++/src/sargs/SargsApplier.cc at line 3, c++/src/sargs/SargsApplier.hh at line 3, c++/src/sargs/SearchArgument.cc at line 3, c++/src/sargs/SearchArgument.hh at line 3, c++/src/sargs/TruthValue.cc at line 3
+FILE_INCLUDE NOTICE found in files: c++/include/orc/BloomFilter.hh at line 3, c++/include/orc/ColumnPrinter.hh at line 3, c++/include/orc/Common.hh at line 3, c++/include/orc/Exceptions.hh at line 3, c++/include/orc/Int128.hh at line 3, c++/include/orc/MemoryPool.hh at line 3, c++/include/orc/OrcFile.hh at line 3, c++/include/orc/Reader.hh at line 3, c++/include/orc/Statistics.hh at line 3, c++/include/orc/Type.hh at line 3, c++/include/orc/Vector.hh at line 3, c++/include/orc/Writer.hh at line 3, c++/include/orc/orc-config.hh at line 3, c++/include/orc/sargs/Literal.hh at line 3, c++/include/orc/sargs/SearchArgument.hh at line 3, c++/include/orc/sargs/TruthValue.hh at line 3, c++/src/Adaptor-linux.hh at line 3, c++/src/Adaptor.cc at line 3, c++/src/BlockBuffer.cc at line 3, c++/src/BlockBuffer.hh at line 3, c++/src/BloomFilter.cc at line 3, c++/src/BloomFilter.hh at line 3, c++/src/Bpacking.hh at line 3, c++/src/BpackingDefault.cc at line 3, c++/src/BpackingDefault.hh at line 3, c++/src/ByteRLE.cc at line 3, c++/src/ByteRLE.hh at line 3, c++/src/ColumnPrinter.cc at line 3, c++/src/ColumnReader.cc at line 3, c++/src/ColumnReader.hh at line 3, c++/src/ColumnWriter.cc at line 3, c++/src/ColumnWriter.hh at line 3, c++/src/Common.cc at line 3, c++/src/Compression.cc at line 3, c++/src/Compression.hh at line 3, c++/src/ConvertColumnReader.cc at line 3, c++/src/ConvertColumnReader.hh at line 3, c++/src/CpuInfoUtil.cc at line 3, c++/src/CpuInfoUtil.hh at line 3, c++/src/Dispatch.hh at line 3, c++/src/Exceptions.cc at line 3, c++/src/Int128.cc at line 3, c++/src/LzoDecompressor.hh at line 3, c++/src/MemoryPool.cc at line 3, c++/src/Murmur3.cc at line 3, c++/src/Murmur3.hh at line 3, c++/src/Options.hh at line 3, c++/src/OrcFile.cc at line 3, c++/src/RLE.cc at line 3, c++/src/RLE.hh at line 3, c++/src/RLEV2Util.hh at line 3, c++/src/RLEv1.cc at line 3, c++/src/RLEv1.hh at line 3, c++/src/RLEv2.hh at line 3, c++/src/Reader.cc at line 3, c++/src/Reader.hh at line 3, 
c++/src/RleDecoderV2.cc at line 3, c++/src/SchemaEvolution.cc at line 3, c++/src/SchemaEvolution.hh at line 3, c++/src/Statistics.cc at line 3, c++/src/Statistics.hh at line 3, c++/src/StripeStream.cc at line 3, c++/src/StripeStream.hh at line 3, c++/src/Timezone.cc at line 3, c++/src/Timezone.hh at line 3, c++/src/TypeImpl.cc at line 3, c++/src/TypeImpl.hh at line 3, c++/src/Utils.hh at line 3, c++/src/Vector.cc at line 3, c++/src/Writer.cc at line 3, c++/src/io/Cache.cc at line 3, c++/src/io/Cache.hh at line 3, c++/src/io/InputStream.cc at line 3, c++/src/io/InputStream.hh at line 3, c++/src/io/OutputStream.cc at line 3, c++/src/io/OutputStream.hh at line 3, c++/src/sargs/ExpressionTree.cc at line 3, c++/src/sargs/ExpressionTree.hh at line 3, c++/src/sargs/Literal.cc at line 3, c++/src/sargs/PredicateLeaf.cc at line 3, c++/src/sargs/PredicateLeaf.hh at line 3, c++/src/sargs/SargsApplier.cc at line 3, c++/src/sargs/SargsApplier.hh at line 3, c++/src/sargs/SearchArgument.cc at line 3, c++/src/sargs/SearchArgument.hh at line 3, c++/src/sargs/TruthValue.cc at line 3
Note: matched license text is too long. Read it in the source files.
Scancode info:
Original SPDX id: Apache-2.0
@@ -109,6 +109,8 @@ FILE_INCLUDE NOTICE found in files: c++/include/orc/BloomFilter.hh at line 3, c+
c++/src/Utils.hh [2:16]
c++/src/Vector.cc [2:16]
c++/src/Writer.cc [2:16]
+ c++/src/io/Cache.cc [2:16]
+ c++/src/io/Cache.hh [2:16]
c++/src/io/InputStream.cc [2:16]
c++/src/io/InputStream.hh [2:16]
c++/src/io/OutputStream.cc [2:16]
diff --git a/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh b/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh
index 328c0e84b6..dbdd49a65b 100644
--- a/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh
@@ -29,7 +29,6 @@
#include <vector>
namespace orc {
-
class ColumnPrinter {
protected:
std::string& buffer;
@@ -42,8 +41,13 @@ namespace orc {
virtual void printRow(uint64_t rowId) = 0;
// should be called once at the start of each batch of rows
virtual void reset(const ColumnVectorBatch& batch);
+ struct Param {
+ bool printDecimalAsString = false;
+ bool printDecimalTrimTrailingZeros = false;
+ };
};
- std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string&, const Type* type);
+ std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string&, const Type* type,
+ ColumnPrinter::Param = {});
} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/include/orc/Common.hh b/contrib/libs/apache/orc/c++/include/orc/Common.hh
index e983280e46..d72ecc9f62 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Common.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Common.hh
@@ -33,32 +33,32 @@ namespace orc {
class FileVersion {
private:
- uint32_t majorVersion;
- uint32_t minorVersion;
+ uint32_t majorVersion_;
+ uint32_t minorVersion_;
public:
static const FileVersion& v_0_11();
static const FileVersion& v_0_12();
static const FileVersion& UNSTABLE_PRE_2_0();
- FileVersion(uint32_t major, uint32_t minor) : majorVersion(major), minorVersion(minor) {}
+ FileVersion(uint32_t major, uint32_t minor) : majorVersion_(major), minorVersion_(minor) {}
/**
* Get major version
*/
uint32_t getMajor() const {
- return this->majorVersion;
+ return this->majorVersion_;
}
/**
* Get minor version
*/
uint32_t getMinor() const {
- return this->minorVersion;
+ return this->minorVersion_;
}
bool operator==(const FileVersion& right) const {
- return this->majorVersion == right.getMajor() && this->minorVersion == right.getMinor();
+ return this->majorVersion_ == right.getMajor() && this->minorVersion_ == right.getMinor();
}
bool operator!=(const FileVersion& right) const {
diff --git a/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh b/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh
index 0536dbd164..b19a00760c 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh
@@ -28,8 +28,8 @@ namespace orc {
class NotImplementedYet : public std::logic_error {
public:
- explicit NotImplementedYet(const std::string& what_arg);
- explicit NotImplementedYet(const char* what_arg);
+ explicit NotImplementedYet(const std::string& whatArg);
+ explicit NotImplementedYet(const char* whatArg);
~NotImplementedYet() noexcept override;
NotImplementedYet(const NotImplementedYet&);
@@ -39,8 +39,8 @@ namespace orc {
class ParseError : public std::runtime_error {
public:
- explicit ParseError(const std::string& what_arg);
- explicit ParseError(const char* what_arg);
+ explicit ParseError(const std::string& whatArg);
+ explicit ParseError(const char* whatArg);
~ParseError() noexcept override;
ParseError(const ParseError&);
@@ -50,8 +50,8 @@ namespace orc {
class InvalidArgument : public std::runtime_error {
public:
- explicit InvalidArgument(const std::string& what_arg);
- explicit InvalidArgument(const char* what_arg);
+ explicit InvalidArgument(const std::string& whatArg);
+ explicit InvalidArgument(const char* whatArg);
~InvalidArgument() noexcept override;
InvalidArgument(const InvalidArgument&);
@@ -61,12 +61,24 @@ namespace orc {
class SchemaEvolutionError : public std::logic_error {
public:
- explicit SchemaEvolutionError(const std::string& what_arg);
- explicit SchemaEvolutionError(const char* what_arg);
+ explicit SchemaEvolutionError(const std::string& whatArg);
+ explicit SchemaEvolutionError(const char* whatArg);
virtual ~SchemaEvolutionError() noexcept override;
SchemaEvolutionError(const SchemaEvolutionError&);
SchemaEvolutionError& operator=(const SchemaEvolutionError&) = delete;
};
+
+ class CompressionError : public std::runtime_error {
+ public:
+ explicit CompressionError(const std::string& whatArg);
+ explicit CompressionError(const char* whatArg);
+ ~CompressionError() noexcept override;
+ CompressionError(const CompressionError&);
+
+ private:
+ CompressionError& operator=(const CompressionError&);
+ };
+
} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/include/orc/Int128.hh b/contrib/libs/apache/orc/c++/include/orc/Int128.hh
index bcb4a58e22..6954c771cf 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Int128.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Int128.hh
@@ -37,8 +37,8 @@ namespace orc {
class Int128 {
public:
Int128() {
- highbits = 0;
- lowbits = 0;
+ highbits_ = 0;
+ lowbits_ = 0;
}
/**
@@ -46,11 +46,11 @@ namespace orc {
*/
Int128(int64_t right) {
if (right >= 0) {
- highbits = 0;
- lowbits = static_cast<uint64_t>(right);
+ highbits_ = 0;
+ lowbits_ = static_cast<uint64_t>(right);
} else {
- highbits = -1;
- lowbits = static_cast<uint64_t>(right);
+ highbits_ = -1;
+ lowbits_ = static_cast<uint64_t>(right);
}
}
@@ -58,8 +58,8 @@ namespace orc {
* Create from the twos complement representation.
*/
Int128(int64_t high, uint64_t low) {
- highbits = high;
- lowbits = low;
+ highbits_ = high;
+ lowbits_ = low;
}
/**
@@ -78,16 +78,16 @@ namespace orc {
static Int128 minimumValue();
Int128& negate() {
- lowbits = ~lowbits + 1;
- highbits = ~highbits;
- if (lowbits == 0) {
- highbits += 1;
+ lowbits_ = ~lowbits_ + 1;
+ highbits_ = ~highbits_;
+ if (lowbits_ == 0) {
+ highbits_ += 1;
}
return *this;
}
Int128& abs() {
- if (highbits < 0) {
+ if (highbits_ < 0) {
negate();
}
return *this;
@@ -100,8 +100,8 @@ namespace orc {
}
Int128& invert() {
- lowbits = ~lowbits;
- highbits = ~highbits;
+ lowbits_ = ~lowbits_;
+ highbits_ = ~highbits_;
return *this;
}
@@ -111,12 +111,12 @@ namespace orc {
* @return *this
*/
Int128& operator+=(const Int128& right) {
- uint64_t sum = lowbits + right.lowbits;
- highbits += right.highbits;
- if (sum < lowbits) {
- highbits += 1;
+ uint64_t sum = lowbits_ + right.lowbits_;
+ highbits_ += right.highbits_;
+ if (sum < lowbits_) {
+ highbits_ += 1;
}
- lowbits = sum;
+ lowbits_ = sum;
return *this;
}
@@ -126,12 +126,12 @@ namespace orc {
* @return *this
*/
Int128& operator-=(const Int128& right) {
- uint64_t diff = lowbits - right.lowbits;
- highbits -= right.highbits;
- if (diff > lowbits) {
- highbits -= 1;
+ uint64_t diff = lowbits_ - right.lowbits_;
+ highbits_ -= right.highbits_;
+ if (diff > lowbits_) {
+ highbits_ -= 1;
}
- lowbits = diff;
+ lowbits_ = diff;
return *this;
}
@@ -162,8 +162,8 @@ namespace orc {
* @return *this
*/
Int128& operator|=(const Int128& right) {
- lowbits |= right.lowbits;
- highbits |= right.highbits;
+ lowbits_ |= right.lowbits_;
+ highbits_ |= right.highbits_;
return *this;
}
@@ -173,8 +173,8 @@ namespace orc {
* @return *this
*/
Int128& operator&=(const Int128& right) {
- lowbits &= right.lowbits;
- highbits &= right.highbits;
+ lowbits_ &= right.lowbits_;
+ highbits_ &= right.highbits_;
return *this;
}
@@ -196,15 +196,15 @@ namespace orc {
Int128& operator<<=(uint32_t bits) {
if (bits != 0) {
if (bits < 64) {
- highbits <<= bits;
- highbits |= (lowbits >> (64 - bits));
- lowbits <<= bits;
+ highbits_ <<= bits;
+ highbits_ |= (lowbits_ >> (64 - bits));
+ lowbits_ <<= bits;
} else if (bits < 128) {
- highbits = static_cast<int64_t>(lowbits) << (bits - 64);
- lowbits = 0;
+ highbits_ = static_cast<int64_t>(lowbits_) << (bits - 64);
+ lowbits_ = 0;
} else {
- highbits = 0;
- lowbits = 0;
+ highbits_ = 0;
+ lowbits_ = 0;
}
}
return *this;
@@ -217,74 +217,74 @@ namespace orc {
Int128& operator>>=(uint32_t bits) {
if (bits != 0) {
if (bits < 64) {
- lowbits >>= bits;
- lowbits |= static_cast<uint64_t>(highbits << (64 - bits));
- highbits = static_cast<int64_t>(static_cast<uint64_t>(highbits) >> bits);
+ lowbits_ >>= bits;
+ lowbits_ |= static_cast<uint64_t>(highbits_ << (64 - bits));
+ highbits_ = static_cast<int64_t>(static_cast<uint64_t>(highbits_) >> bits);
} else if (bits < 128) {
- lowbits = static_cast<uint64_t>(highbits >> (bits - 64));
- highbits = highbits >= 0 ? 0 : -1l;
+ lowbits_ = static_cast<uint64_t>(highbits_ >> (bits - 64));
+ highbits_ = highbits_ >= 0 ? 0 : -1l;
} else {
- highbits = highbits >= 0 ? 0 : -1l;
- lowbits = static_cast<uint64_t>(highbits);
+ highbits_ = highbits_ >= 0 ? 0 : -1l;
+ lowbits_ = static_cast<uint64_t>(highbits_);
}
}
return *this;
}
bool operator==(const Int128& right) const {
- return highbits == right.highbits && lowbits == right.lowbits;
+ return highbits_ == right.highbits_ && lowbits_ == right.lowbits_;
}
bool operator!=(const Int128& right) const {
- return highbits != right.highbits || lowbits != right.lowbits;
+ return highbits_ != right.highbits_ || lowbits_ != right.lowbits_;
}
bool operator<(const Int128& right) const {
- if (highbits == right.highbits) {
- return lowbits < right.lowbits;
+ if (highbits_ == right.highbits_) {
+ return lowbits_ < right.lowbits_;
} else {
- return highbits < right.highbits;
+ return highbits_ < right.highbits_;
}
}
bool operator<=(const Int128& right) const {
- if (highbits == right.highbits) {
- return lowbits <= right.lowbits;
+ if (highbits_ == right.highbits_) {
+ return lowbits_ <= right.lowbits_;
} else {
- return highbits <= right.highbits;
+ return highbits_ <= right.highbits_;
}
}
bool operator>(const Int128& right) const {
- if (highbits == right.highbits) {
- return lowbits > right.lowbits;
+ if (highbits_ == right.highbits_) {
+ return lowbits_ > right.lowbits_;
} else {
- return highbits > right.highbits;
+ return highbits_ > right.highbits_;
}
}
bool operator>=(const Int128& right) const {
- if (highbits == right.highbits) {
- return lowbits >= right.lowbits;
+ if (highbits_ == right.highbits_) {
+ return lowbits_ >= right.lowbits_;
} else {
- return highbits >= right.highbits;
+ return highbits_ >= right.highbits_;
}
}
uint32_t hash() const {
- return static_cast<uint32_t>(highbits >> 32) ^ static_cast<uint32_t>(highbits) ^
- static_cast<uint32_t>(lowbits >> 32) ^ static_cast<uint32_t>(lowbits);
+ return static_cast<uint32_t>(highbits_ >> 32) ^ static_cast<uint32_t>(highbits_) ^
+ static_cast<uint32_t>(lowbits_ >> 32) ^ static_cast<uint32_t>(lowbits_);
}
/**
* Does this value fit into a long?
*/
bool fitsInLong() const {
- switch (highbits) {
+ switch (highbits_) {
case 0:
- return 0 == (lowbits & LONG_SIGN_BIT);
+ return 0 == (lowbits_ & LONG_SIGN_BIT);
case -1:
- return 0 != (lowbits & LONG_SIGN_BIT);
+ return 0 != (lowbits_ & LONG_SIGN_BIT);
default:
return false;
}
@@ -295,7 +295,7 @@ namespace orc {
*/
int64_t toLong() const {
if (fitsInLong()) {
- return static_cast<int64_t>(lowbits);
+ return static_cast<int64_t>(lowbits_);
}
throw std::range_error("Int128 too large to convert to long");
}
@@ -331,14 +331,14 @@ namespace orc {
* Get the high bits of the twos complement representation of the number.
*/
int64_t getHighBits() const {
- return highbits;
+ return highbits_;
}
/**
* Get the low bits of the twos complement representation of the number.
*/
uint64_t getLowBits() const {
- return lowbits;
+ return lowbits_;
}
/**
@@ -352,8 +352,8 @@ namespace orc {
private:
static const uint64_t LONG_SIGN_BIT = 0x8000000000000000u;
- int64_t highbits;
- uint64_t lowbits;
+ int64_t highbits_;
+ uint64_t lowbits_;
};
/**
diff --git a/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh b/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh
index 6d999d3aa8..a914e5f260 100644
--- a/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh
@@ -36,50 +36,50 @@ namespace orc {
template <class T>
class DataBuffer {
private:
- MemoryPool& memoryPool;
- T* buf;
+ MemoryPool& memoryPool_;
+ T* buf_;
// current size
- uint64_t currentSize;
+ uint64_t currentSize_;
// maximal capacity (actual allocated memory)
- uint64_t currentCapacity;
+ uint64_t currentCapacity_;
// not implemented
DataBuffer(DataBuffer& buffer);
DataBuffer& operator=(DataBuffer& buffer);
public:
- DataBuffer(MemoryPool& pool, uint64_t _size = 0);
+ DataBuffer(MemoryPool& pool, uint64_t size = 0);
DataBuffer(DataBuffer<T>&& buffer) noexcept;
virtual ~DataBuffer();
T* data() {
- return buf;
+ return buf_;
}
const T* data() const {
- return buf;
+ return buf_;
}
uint64_t size() const {
- return currentSize;
+ return currentSize_;
}
uint64_t capacity() const {
- return currentCapacity;
+ return currentCapacity_;
}
const T& operator[](uint64_t i) const {
- return buf[i];
+ return buf_[i];
}
T& operator[](uint64_t i) {
- return buf[i];
+ return buf_[i];
}
- void reserve(uint64_t _size);
- void resize(uint64_t _size);
+ void reserve(uint64_t size);
+ void resize(uint64_t size);
void zeroOut();
};
diff --git a/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh b/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh
index a9ad692d42..ea71567c5f 100644
--- a/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh
@@ -19,6 +19,7 @@
#ifndef ORC_FILE_HH
#define ORC_FILE_HH
+#include <future>
#include <string>
#include "orc/Reader.hh"
@@ -59,6 +60,18 @@ namespace orc {
virtual void read(void* buf, uint64_t length, uint64_t offset) = 0;
/**
+ * Read data asynchronously into the buffer. The buffer is allocated by the caller.
+ * @param buf the buffer to read into
+ * @param length the number of bytes to read.
+ * @param offset the position in the stream to read from.
+ * @return a future that will be set when the read is complete.
+ */
+ virtual std::future<void> readAsync(void* buf, uint64_t length, uint64_t offset) {
+ return std::async(std::launch::async,
+ [this, buf, length, offset] { this->read(buf, length, offset); });
+ }
+
+ /**
* Get the name of the stream for error messages.
*/
virtual const std::string& getName() const = 0;
diff --git a/contrib/libs/apache/orc/c++/include/orc/Reader.hh b/contrib/libs/apache/orc/c++/include/orc/Reader.hh
index b631c2c6ea..b015b64910 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Reader.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Reader.hh
@@ -40,6 +40,17 @@ namespace orc {
struct ReaderOptionsPrivate;
struct RowReaderOptionsPrivate;
+ struct CacheOptions {
+ // The maximum distance in bytes between two consecutive
+ // ranges; beyond this value, ranges are not combined
+ uint64_t holeSizeLimit = 8192;
+
+ // The maximum size in bytes of a combined range; if
+ // combining two consecutive ranges would produce a range of a
+ // size greater than this, they are not combined
+ uint64_t rangeSizeLimit = 32 * 1024 * 1024;
+ };
+
/**
* Expose the reader metrics including the latency and
* number of calls of the decompression/decoding/IO modules.
@@ -59,15 +70,26 @@ namespace orc {
std::atomic<uint64_t> IOBlockingLatencyUs{0};
std::atomic<uint64_t> SelectedRowGroupCount{0};
std::atomic<uint64_t> EvaluatedRowGroupCount{0};
+ std::atomic<uint64_t> ReadRangeCacheHits{0};
+ std::atomic<uint64_t> ReadRangeCacheMisses{0};
};
ReaderMetrics* getDefaultReaderMetrics();
+ // Row group index of a single column in a stripe.
+ struct RowGroupIndex {
+ // Positions are represented as a two-dimensional array where the first
+ // dimension is row group index and the second dimension is the position
+ // list of the row group. The size of the second dimension should be equal
+ // among all row groups.
+ std::vector<std::vector<uint64_t>> positions;
+ };
+
/**
* Options for creating a Reader.
*/
class ReaderOptions {
private:
- std::unique_ptr<ReaderOptionsPrivate> privateBits;
+ std::unique_ptr<ReaderOptionsPrivate> privateBits_;
public:
ReaderOptions();
@@ -108,6 +130,11 @@ namespace orc {
ReaderOptions& setReaderMetrics(ReaderMetrics* metrics);
/**
+ * Set the cache options.
+ */
+ ReaderOptions& setCacheOptions(const CacheOptions& cacheOptions);
+
+ /**
* Set the location of the tail as defined by the logical length of the
* file.
*/
@@ -138,6 +165,11 @@ namespace orc {
* Get the reader metrics.
*/
ReaderMetrics* getReaderMetrics() const;
+
+ /**
+ * Get the cache options.
+ */
+ const CacheOptions& getCacheOptions() const;
};
/**
@@ -145,7 +177,7 @@ namespace orc {
*/
class RowReaderOptions {
private:
- std::unique_ptr<RowReaderOptionsPrivate> privateBits;
+ std::unique_ptr<RowReaderOptionsPrivate> privateBits_;
public:
RowReaderOptions();
@@ -605,6 +637,33 @@ namespace orc {
*/
virtual std::map<uint32_t, BloomFilterIndex> getBloomFilters(
uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0;
+
+ /**
+ * Get row group index of all selected columns in the specified stripe
+ * @param stripeIndex index of the stripe to be read for row group index.
+ * @param included index of selected columns to return (if not specified,
+ * all columns will be returned).
+ * @return map of row group index keyed by its column index.
+ */
+ virtual std::map<uint32_t, RowGroupIndex> getRowGroupIndex(
+ uint32_t stripeIndex, const std::set<uint32_t>& included = {}) const = 0;
+
+ /**
+ * Trigger IO prefetch and cache the prefetched contents asynchronously.
+ * It is thread safe. Users should make sure requested stripes and columns
+ * are not overlapped, otherwise the overlapping part will be prefetched multiple times,
+ * which doesn't affect correctness but wastes IO and memory resources.
+ * @param stripes the stripes to prefetch
+ * @param includeTypes the types to prefetch
+ */
+ virtual void preBuffer(const std::vector<uint32_t>& stripes,
+ const std::list<uint64_t>& includeTypes) = 0;
+
+ /**
+ * Release cached entries whose right boundary is less than or equal to the given boundary.
+ * @param boundary the boundary value to release cache entries
+ */
+ virtual void releaseBuffer(uint64_t boundary) = 0;
};
/**
diff --git a/contrib/libs/apache/orc/c++/include/orc/Vector.hh b/contrib/libs/apache/orc/c++/include/orc/Vector.hh
index 0dfe926965..663bef9cd7 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Vector.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Vector.hh
@@ -57,6 +57,8 @@ namespace orc {
bool hasNulls;
// whether the vector batch is encoded
bool isEncoded;
+ // whether the dictionary is decoded into vector batch
+ bool dictionaryDecoded;
// custom memory pool
MemoryPool& memoryPool;
@@ -88,6 +90,14 @@ namespace orc {
*/
virtual bool hasVariableLength();
+ /**
+ * Decode possible dictionary into vector batch.
+ */
+ void decodeDictionary();
+
+ protected:
+ virtual void decodeDictionaryImpl() {}
+
private:
ColumnVectorBatch(const ColumnVectorBatch&);
ColumnVectorBatch& operator=(const ColumnVectorBatch&);
@@ -248,6 +258,10 @@ namespace orc {
~EncodedStringVectorBatch() override;
std::string toString() const override;
void resize(uint64_t capacity) override;
+
+ // Calculate data and length in StringVectorBatch from dictionary and index
+ void decodeDictionaryImpl() override;
+
std::shared_ptr<StringDictionary> dictionary;
// index for dictionary entry
@@ -264,6 +278,9 @@ namespace orc {
bool hasVariableLength() override;
std::vector<ColumnVectorBatch*> fields;
+
+ protected:
+ void decodeDictionaryImpl() override;
};
struct ListVectorBatch : public ColumnVectorBatch {
@@ -283,6 +300,9 @@ namespace orc {
// the concatenated elements
std::unique_ptr<ColumnVectorBatch> elements;
+
+ protected:
+ void decodeDictionaryImpl() override;
};
struct MapVectorBatch : public ColumnVectorBatch {
@@ -304,6 +324,9 @@ namespace orc {
std::unique_ptr<ColumnVectorBatch> keys;
// the concatenated elements
std::unique_ptr<ColumnVectorBatch> elements;
+
+ protected:
+ void decodeDictionaryImpl() override;
};
struct UnionVectorBatch : public ColumnVectorBatch {
@@ -327,6 +350,9 @@ namespace orc {
// the sub-columns
std::vector<ColumnVectorBatch*> children;
+
+ protected:
+ void decodeDictionaryImpl() override;
};
struct Decimal {
diff --git a/contrib/libs/apache/orc/c++/include/orc/Writer.hh b/contrib/libs/apache/orc/c++/include/orc/Writer.hh
index 047ee9ffc5..78f06739bc 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Writer.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Writer.hh
@@ -55,7 +55,7 @@ namespace orc {
*/
class WriterOptions {
private:
- std::unique_ptr<WriterOptionsPrivate> privateBits;
+ std::unique_ptr<WriterOptionsPrivate> privateBits_;
public:
WriterOptions();
@@ -277,6 +277,32 @@ namespace orc {
* @return if not set, return default value which is 1 MB.
*/
uint64_t getOutputBufferCapacity() const;
+
+ /**
+ * Set the initial block size of the original input buffer in the class CompressionStream.
+ * The input buffer is used to store raw data before compression, while the output buffer is
+ * dedicated to holding compressed data.
+ */
+ WriterOptions& setMemoryBlockSize(uint64_t capacity);
+
+ /**
+ * Get the initial block size of original input buffer in the class CompressionStream.
+ * @return if not set, return default value which is 64 KB.
+ */
+ uint64_t getMemoryBlockSize() const;
+
+ /**
+ * Set whether the compression block should be aligned to row group boundary.
+ * The boolean type may not be aligned to row group boundary due to the
+ * requirement of the Boolean RLE encoder to pack input bits into bytes
+ */
+ WriterOptions& setAlignBlockBoundToRowGroup(bool alignBlockBoundToRowGroup);
+
+ /**
+ * Get if the compression block should be aligned to row group boundary.
+ * @return if not set, return default value which is false.
+ */
+ bool getAlignBlockBoundToRowGroup() const;
};
class Writer {
diff --git a/contrib/libs/apache/orc/c++/include/orc/orc-config.hh b/contrib/libs/apache/orc/c++/include/orc/orc-config.hh
index 5205a56af6..7bd4ac63b5 100644
--- a/contrib/libs/apache/orc/c++/include/orc/orc-config.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/orc-config.hh
@@ -19,7 +19,7 @@
#ifndef ORC_CONFIG_HH
#define ORC_CONFIG_HH
-#define ORC_VERSION "2.0.3"
+#define ORC_VERSION "2.1.0"
#define ORC_CXX_HAS_CSTDINT
diff --git a/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh b/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh
index 9ce958302d..f7d37005a5 100644
--- a/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh
@@ -39,7 +39,7 @@ namespace orc {
Timestamp(const Timestamp&) = default;
Timestamp(Timestamp&&) = default;
~Timestamp() = default;
- Timestamp(int64_t second_, int32_t nanos_) : second(second_), nanos(nanos_) {
+ Timestamp(int64_t second, int32_t nanos) : second(second), nanos(nanos) {
// PASS
}
Timestamp& operator=(const Timestamp&) = default;
@@ -130,15 +130,15 @@ namespace orc {
* Check if a literal is null
*/
bool isNull() const {
- return mIsNull;
+ return isNull_;
}
PredicateDataType getType() const {
- return mType;
+ return type_;
}
std::string toString() const;
size_t getHashCode() const {
- return mHashCode;
+ return hashCode_;
}
private:
@@ -158,13 +158,13 @@ namespace orc {
};
private:
- LiteralVal mValue; // data value for this literal if not null
- PredicateDataType mType; // data type of the literal
- size_t mSize; // size of mValue if it is Buffer
- int32_t mPrecision; // precision of decimal type
- int32_t mScale; // scale of decimal type
- bool mIsNull; // whether this literal is null
- size_t mHashCode; // precomputed hash code for the literal
+ LiteralVal value_; // data value for this literal if not null
+ PredicateDataType type_; // data type of the literal
+ size_t size_; // size of value_ if it is Buffer
+ int32_t precision_; // precision of decimal type
+ int32_t scale_; // scale of decimal type
+ bool isNull_; // whether this literal is null
+ size_t hashCode_; // precomputed hash code for the literal
};
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh b/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh
index 286188e3a1..b10cc775ec 100644
--- a/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh
+++ b/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh
@@ -70,6 +70,7 @@ typedef SSIZE_T ssize_t;
#define PRAGMA(TXT) _Pragma(#TXT)
#if defined(_MSC_VER)
+ // Handles both cl.exe and clang-cl.exe compilers
#define DIAGNOSTIC_IGNORE(XXX) __pragma(warning(disable : XXX))
#elif defined(__clang__)
#define DIAGNOSTIC_IGNORE(XXX) PRAGMA(clang diagnostic ignored XXX)
diff --git a/contrib/libs/apache/orc/c++/src/BlockBuffer.cc b/contrib/libs/apache/orc/c++/src/BlockBuffer.cc
index 1f7843fad7..09bf078c85 100644
--- a/contrib/libs/apache/orc/c++/src/BlockBuffer.cc
+++ b/contrib/libs/apache/orc/c++/src/BlockBuffer.cc
@@ -24,56 +24,56 @@
namespace orc {
- BlockBuffer::BlockBuffer(MemoryPool& pool, uint64_t _blockSize)
- : memoryPool(pool), currentSize(0), currentCapacity(0), blockSize(_blockSize) {
- if (blockSize == 0) {
+ BlockBuffer::BlockBuffer(MemoryPool& pool, uint64_t blockSize)
+ : memoryPool_(pool), currentSize_(0), currentCapacity_(0), blockSize_(blockSize) {
+ if (blockSize_ == 0) {
throw std::logic_error("Block size cannot be zero");
}
- reserve(blockSize);
+ reserve(blockSize_);
}
BlockBuffer::~BlockBuffer() {
- for (size_t i = 0; i < blocks.size(); ++i) {
- memoryPool.free(blocks[i]);
+ for (size_t i = 0; i < blocks_.size(); ++i) {
+ memoryPool_.free(blocks_[i]);
}
- blocks.clear();
- currentSize = currentCapacity = 0;
+ blocks_.clear();
+ currentSize_ = currentCapacity_ = 0;
}
BlockBuffer::Block BlockBuffer::getBlock(uint64_t blockIndex) const {
if (blockIndex >= getBlockNumber()) {
throw std::out_of_range("Block index out of range");
}
- return Block(blocks[blockIndex], std::min(currentSize - blockIndex * blockSize, blockSize));
+ return Block(blocks_[blockIndex], std::min(currentSize_ - blockIndex * blockSize_, blockSize_));
}
BlockBuffer::Block BlockBuffer::getNextBlock() {
- if (currentSize < currentCapacity) {
- Block emptyBlock(blocks[currentSize / blockSize] + currentSize % blockSize,
- blockSize - currentSize % blockSize);
- currentSize = (currentSize / blockSize + 1) * blockSize;
+ if (currentSize_ < currentCapacity_) {
+ Block emptyBlock(blocks_[currentSize_ / blockSize_] + currentSize_ % blockSize_,
+ blockSize_ - currentSize_ % blockSize_);
+ currentSize_ = (currentSize_ / blockSize_ + 1) * blockSize_;
return emptyBlock;
} else {
- resize(currentSize + blockSize);
- return Block(blocks.back(), blockSize);
+ resize(currentSize_ + blockSize_);
+ return Block(blocks_.back(), blockSize_);
}
}
void BlockBuffer::resize(uint64_t size) {
reserve(size);
- if (currentCapacity >= size) {
- currentSize = size;
+ if (currentCapacity_ >= size) {
+ currentSize_ = size;
} else {
throw std::logic_error("Block buffer resize error");
}
}
void BlockBuffer::reserve(uint64_t newCapacity) {
- while (currentCapacity < newCapacity) {
- char* newBlockPtr = memoryPool.malloc(blockSize);
+ while (currentCapacity_ < newCapacity) {
+ char* newBlockPtr = memoryPool_.malloc(blockSize_);
if (newBlockPtr != nullptr) {
- blocks.push_back(newBlockPtr);
- currentCapacity += blockSize;
+ blocks_.push_back(newBlockPtr);
+ currentCapacity_ += blockSize_;
} else {
break;
}
@@ -81,7 +81,7 @@ namespace orc {
}
void BlockBuffer::writeTo(OutputStream* output, WriterMetrics* metrics) {
- if (currentSize == 0) {
+ if (currentSize_ == 0) {
return;
}
static uint64_t MAX_CHUNK_SIZE = 1024 * 1024 * 1024;
@@ -92,12 +92,12 @@ namespace orc {
uint64_t ioCount = 0;
uint64_t blockNumber = getBlockNumber();
// if only exists one block, currentSize is equal to first block size
- if (blockNumber == 1 && currentSize <= chunkSize) {
+ if (blockNumber == 1 && currentSize_ <= chunkSize) {
Block block = getBlock(0);
output->write(block.data, block.size);
++ioCount;
} else {
- char* chunk = memoryPool.malloc(chunkSize);
+ char* chunk = memoryPool_.malloc(chunkSize);
uint64_t chunkOffset = 0;
for (uint64_t i = 0; i < blockNumber; ++i) {
Block block = getBlock(i);
@@ -121,7 +121,7 @@ namespace orc {
output->write(chunk, chunkOffset);
++ioCount;
}
- memoryPool.free(chunk);
+ memoryPool_.free(chunk);
}
if (metrics != nullptr) {
diff --git a/contrib/libs/apache/orc/c++/src/BlockBuffer.hh b/contrib/libs/apache/orc/c++/src/BlockBuffer.hh
index 0f5f78e3fe..6d265b0e32 100644
--- a/contrib/libs/apache/orc/c++/src/BlockBuffer.hh
+++ b/contrib/libs/apache/orc/c++/src/BlockBuffer.hh
@@ -34,15 +34,15 @@ namespace orc {
*/
class BlockBuffer {
private:
- MemoryPool& memoryPool;
+ MemoryPool& memoryPool_;
// current buffer size
- uint64_t currentSize;
+ uint64_t currentSize_;
// maximal capacity (actual allocated memory)
- uint64_t currentCapacity;
+ uint64_t currentCapacity_;
// unit for buffer expansion
- const uint64_t blockSize;
+ const uint64_t blockSize_;
// pointers to the start of each block
- std::vector<char*> blocks;
+ std::vector<char*> blocks_;
// non-copy-constructible
BlockBuffer(BlockBuffer& buffer) = delete;
@@ -66,7 +66,7 @@ namespace orc {
uint64_t size;
Block() : data(nullptr), size(0) {}
- Block(char* _data, uint64_t _size) : data(_data), size(_size) {}
+ Block(char* data, uint64_t size) : data(data), size(size) {}
Block(const Block& block) = default;
~Block() = default;
};
@@ -94,24 +94,26 @@ namespace orc {
* Get the number of blocks that are fully or partially occupied
*/
uint64_t getBlockNumber() const {
- return (currentSize + blockSize - 1) / blockSize;
+ return (currentSize_ + blockSize_ - 1) / blockSize_;
}
uint64_t size() const {
- return currentSize;
+ return currentSize_;
}
uint64_t capacity() const {
- return currentCapacity;
+ return currentCapacity_;
}
void resize(uint64_t size);
+
/**
* Requests the BlockBuffer to contain at least newCapacity bytes.
* Reallocation happens if there is need of more space.
* @param newCapacity new capacity of BlockBuffer
*/
void reserve(uint64_t newCapacity);
+
/**
* Write the BlockBuffer content into OutputStream
* @param output the output stream to write to
diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.cc b/contrib/libs/apache/orc/c++/src/BloomFilter.cc
index 882c6f4252..887637223a 100644
--- a/contrib/libs/apache/orc/c++/src/BloomFilter.cc
+++ b/contrib/libs/apache/orc/c++/src/BloomFilter.cc
@@ -37,50 +37,50 @@ namespace orc {
* Implementation of BitSet
*/
BitSet::BitSet(uint64_t numBits) {
- mData.resize(static_cast<size_t>(ceil(static_cast<double>(numBits) / BITS_OF_LONG)), 0);
+ data_.resize(static_cast<size_t>(ceil(static_cast<double>(numBits) / BITS_OF_LONG)), 0);
}
BitSet::BitSet(const uint64_t* bits, uint64_t numBits) {
// caller should make sure numBits is multiple of 64
- mData.resize(numBits >> SHIFT_6_BITS, 0);
- memcpy(mData.data(), bits, numBits >> SHIFT_3_BITS);
+ data_.resize(numBits >> SHIFT_6_BITS, 0);
+ memcpy(data_.data(), bits, numBits >> SHIFT_3_BITS);
}
void BitSet::set(uint64_t index) {
- mData[index >> SHIFT_6_BITS] |= (1ULL << (index % BITS_OF_LONG));
+ data_[index >> SHIFT_6_BITS] |= (1ULL << (index % BITS_OF_LONG));
}
bool BitSet::get(uint64_t index) {
- return (mData[index >> SHIFT_6_BITS] & (1ULL << (index % BITS_OF_LONG))) != 0;
+ return (data_[index >> SHIFT_6_BITS] & (1ULL << (index % BITS_OF_LONG))) != 0;
}
uint64_t BitSet::bitSize() {
- return mData.size() << SHIFT_6_BITS;
+ return data_.size() << SHIFT_6_BITS;
}
void BitSet::merge(const BitSet& other) {
- if (mData.size() != other.mData.size()) {
+ if (data_.size() != other.data_.size()) {
std::stringstream ss;
- ss << "BitSet must be of equal length (" << mData.size() << " != " << other.mData.size()
+ ss << "BitSet must be of equal length (" << data_.size() << " != " << other.data_.size()
<< ")";
throw std::logic_error(ss.str());
}
- for (size_t i = 0; i != mData.size(); i++) {
- mData[i] |= other.mData[i];
+ for (size_t i = 0; i != data_.size(); i++) {
+ data_[i] |= other.data_[i];
}
}
void BitSet::clear() {
- memset(mData.data(), 0, sizeof(uint64_t) * mData.size());
+ memset(data_.data(), 0, sizeof(uint64_t) * data_.size());
}
const uint64_t* BitSet::getData() const {
- return mData.data();
+ return data_.data();
}
bool BitSet::operator==(const BitSet& other) const {
- return mData == other.mData;
+ return data_ == other.data_;
}
/**
@@ -127,9 +127,9 @@ namespace orc {
uint64_t nb = static_cast<uint64_t>(optimalNumOfBits(expectedEntries, fpp));
// make 'mNumBits' multiple of 64
- mNumBits = nb + (BITS_OF_LONG - (nb % BITS_OF_LONG));
- mNumHashFunctions = optimalNumOfHashFunctions(expectedEntries, mNumBits);
- mBitSet.reset(new BitSet(mNumBits));
+ numBits_ = nb + (BITS_OF_LONG - (nb % BITS_OF_LONG));
+ numHashFunctions_ = optimalNumOfHashFunctions(expectedEntries, numBits_);
+ bitSet_.reset(new BitSet(numBits_));
}
void BloomFilterImpl::addBytes(const char* data, int64_t length) {
@@ -155,11 +155,11 @@ namespace orc {
}
uint64_t BloomFilterImpl::getBitSize() const {
- return mBitSet->bitSize();
+ return bitSet_->bitSize();
}
int32_t BloomFilterImpl::getNumHashFunctions() const {
- return mNumHashFunctions;
+ return numHashFunctions_;
}
DIAGNOSTIC_PUSH
@@ -175,17 +175,17 @@ namespace orc {
// caller should make sure input proto::BloomFilter is valid since
// no check will be performed in the following constructor
BloomFilterImpl::BloomFilterImpl(const proto::BloomFilter& bloomFilter) {
- mNumHashFunctions = static_cast<int32_t>(bloomFilter.num_hash_functions());
+ numHashFunctions_ = static_cast<int32_t>(bloomFilter.num_hash_functions());
const std::string& bitsetStr = bloomFilter.utf8bitset();
- mNumBits = bitsetStr.size() << SHIFT_3_BITS;
- checkArgument(mNumBits % BITS_OF_LONG == 0, "numBits should be multiple of 64!");
+ numBits_ = bitsetStr.size() << SHIFT_3_BITS;
+ checkArgument(numBits_ % BITS_OF_LONG == 0, "numBits should be multiple of 64!");
const uint64_t* bitset = reinterpret_cast<const uint64_t*>(bitsetStr.data());
if (isLittleEndian()) {
- mBitSet.reset(new BitSet(bitset, mNumBits));
+ bitSet_.reset(new BitSet(bitset, numBits_));
} else {
- std::vector<uint64_t> longs(mNumBits >> SHIFT_6_BITS);
+ std::vector<uint64_t> longs(numBits_ >> SHIFT_6_BITS);
for (size_t i = 0; i != longs.size(); ++i) {
// convert little-endian to big-endian
const uint64_t src = bitset[i];
@@ -195,7 +195,7 @@ namespace orc {
}
}
- mBitSet.reset(new BitSet(longs.data(), mNumBits));
+ bitSet_.reset(new BitSet(longs.data(), numBits_));
}
}
@@ -215,14 +215,14 @@ namespace orc {
// So we cast hash64 to uint64_t here for an unsigned right shift.
int32_t hash2 = static_cast<int32_t>(static_cast<uint64_t>(hash64) >> 32);
- for (int32_t i = 1; i <= mNumHashFunctions; ++i) {
+ for (int32_t i = 1; i <= numHashFunctions_; ++i) {
int32_t combinedHash = hash1 + i * hash2;
// hashcode should be positive, flip all the bits if it's negative
if (combinedHash < 0) {
combinedHash = ~combinedHash;
}
- uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits;
- mBitSet->set(pos);
+ uint64_t pos = static_cast<uint64_t>(combinedHash) % numBits_;
+ bitSet_->set(pos);
}
}
@@ -232,14 +232,14 @@ namespace orc {
// So we cast hash64 to uint64_t here for an unsigned right shift.
int32_t hash2 = static_cast<int32_t>(static_cast<uint64_t>(hash64) >> 32);
- for (int32_t i = 1; i <= mNumHashFunctions; ++i) {
+ for (int32_t i = 1; i <= numHashFunctions_; ++i) {
int32_t combinedHash = hash1 + i * hash2;
// hashcode should be positive, flip all the bits if it's negative
if (combinedHash < 0) {
combinedHash = ~combinedHash;
}
- uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits;
- if (!mBitSet->get(pos)) {
+ uint64_t pos = static_cast<uint64_t>(combinedHash) % numBits_;
+ if (!bitSet_->get(pos)) {
return false;
}
}
@@ -247,33 +247,33 @@ namespace orc {
}
void BloomFilterImpl::merge(const BloomFilterImpl& other) {
- if (mNumBits != other.mNumBits || mNumHashFunctions != other.mNumHashFunctions) {
+ if (numBits_ != other.numBits_ || numHashFunctions_ != other.numHashFunctions_) {
std::stringstream ss;
ss << "BloomFilters are not compatible for merging: "
- << "this: numBits:" << mNumBits << ",numHashFunctions:" << mNumHashFunctions
- << ", that: numBits:" << other.mNumBits << ",numHashFunctions:" << other.mNumHashFunctions;
+ << "this: numBits:" << numBits_ << ",numHashFunctions:" << numHashFunctions_
+ << ", that: numBits:" << other.numBits_ << ",numHashFunctions:" << other.numHashFunctions_;
throw std::logic_error(ss.str());
}
- mBitSet->merge(*other.mBitSet);
+ bitSet_->merge(*other.bitSet_);
}
void BloomFilterImpl::reset() {
- mBitSet->clear();
+ bitSet_->clear();
}
void BloomFilterImpl::serialize(proto::BloomFilter& bloomFilter) const {
- bloomFilter.set_num_hash_functions(static_cast<uint32_t>(mNumHashFunctions));
+ bloomFilter.set_num_hash_functions(static_cast<uint32_t>(numHashFunctions_));
// According to ORC standard, the encoding is a sequence of bytes with
// a little endian encoding in the utf8bitset field.
if (isLittleEndian()) {
// bytes are already organized in little endian; thus no conversion needed
- const char* bitset = reinterpret_cast<const char*>(mBitSet->getData());
+ const char* bitset = reinterpret_cast<const char*>(bitSet_->getData());
bloomFilter.set_utf8bitset(bitset, sizeInBytes());
} else {
std::vector<uint64_t> bitset(sizeInBytes() / sizeof(uint64_t), 0);
- const uint64_t* longs = mBitSet->getData();
+ const uint64_t* longs = bitSet_->getData();
for (size_t i = 0; i != bitset.size(); ++i) {
uint64_t& dst = bitset[i];
const uint64_t src = longs[i];
@@ -287,8 +287,8 @@ namespace orc {
}
bool BloomFilterImpl::operator==(const BloomFilterImpl& other) const {
- return mNumBits == other.mNumBits && mNumHashFunctions == other.mNumHashFunctions &&
- *mBitSet == *other.mBitSet;
+ return numBits_ == other.numBits_ && numHashFunctions_ == other.numHashFunctions_ &&
+ *bitSet_ == *other.bitSet_;
}
BloomFilter::~BloomFilter() {
diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.hh b/contrib/libs/apache/orc/c++/src/BloomFilter.hh
index d72961a83c..ebc4a5ee04 100644
--- a/contrib/libs/apache/orc/c++/src/BloomFilter.hh
+++ b/contrib/libs/apache/orc/c++/src/BloomFilter.hh
@@ -90,7 +90,7 @@ namespace orc {
bool operator==(const BitSet& other) const;
private:
- std::vector<uint64_t> mData;
+ std::vector<uint64_t> data_;
};
/**
@@ -174,9 +174,9 @@ namespace orc {
private:
static constexpr double DEFAULT_FPP = 0.05;
- uint64_t mNumBits;
- int32_t mNumHashFunctions;
- std::unique_ptr<BitSet> mBitSet;
+ uint64_t numBits_;
+ int32_t numHashFunctions_;
+ std::unique_ptr<BitSet> bitSet_;
};
struct BloomFilterUTF8Utils {
diff --git a/contrib/libs/apache/orc/c++/src/BpackingDefault.cc b/contrib/libs/apache/orc/c++/src/BpackingDefault.cc
index 5a80bc6fb1..401a217d35 100644
--- a/contrib/libs/apache/orc/c++/src/BpackingDefault.cc
+++ b/contrib/libs/apache/orc/c++/src/BpackingDefault.cc
@@ -22,7 +22,7 @@
namespace orc {
- UnpackDefault::UnpackDefault(RleDecoderV2* dec) : decoder(dec) {
+ UnpackDefault::UnpackDefault(RleDecoderV2* dec) : decoder_(dec) {
// PASS
}
@@ -34,17 +34,17 @@ namespace orc {
uint64_t curIdx = offset;
while (curIdx < offset + len) {
// Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8.
- while (decoder->getBitsLeft() > 0 && curIdx < offset + len) {
- decoder->setBitsLeft(decoder->getBitsLeft() - 4);
- data[curIdx++] = (decoder->getCurByte() >> decoder->getBitsLeft()) & 15;
+ while (decoder_->getBitsLeft() > 0 && curIdx < offset + len) {
+ decoder_->setBitsLeft(decoder_->getBitsLeft() - 4);
+ data[curIdx++] = (decoder_->getCurByte() >> decoder_->getBitsLeft()) & 15;
}
if (curIdx == offset + len) return;
// Exhaust the buffer
uint64_t numGroups = (offset + len - curIdx) / 2;
- numGroups = std::min(numGroups, static_cast<uint64_t>(decoder->bufLength()));
+ numGroups = std::min(numGroups, static_cast<uint64_t>(decoder_->bufLength()));
// Avoid updating 'bufferStart' inside the loop.
- auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+ auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart());
uint32_t localByte;
for (uint64_t i = 0; i < numGroups; ++i) {
localByte = *buffer++;
@@ -52,12 +52,12 @@ namespace orc {
data[curIdx + 1] = localByte & 15;
curIdx += 2;
}
- decoder->setBufStart(reinterpret_cast<char*>(buffer));
+ decoder_->setBufStart(reinterpret_cast<char*>(buffer));
if (curIdx == offset + len) return;
// readByte() will update 'bufferStart' and 'bufferEnd'
- decoder->setCurByte(decoder->readByte());
- decoder->setBitsLeft(8);
+ decoder_->setCurByte(decoder_->readByte());
+ decoder_->setBitsLeft(8);
}
}
@@ -65,18 +65,18 @@ namespace orc {
uint64_t curIdx = offset;
while (curIdx < offset + len) {
// Exhaust the buffer
- int64_t bufferNum = decoder->bufLength();
+ int64_t bufferNum = decoder_->bufLength();
bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
// Avoid updating 'bufferStart' inside the loop.
- auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+ auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart());
for (int i = 0; i < bufferNum; ++i) {
data[curIdx++] = *buffer++;
}
- decoder->setBufStart(reinterpret_cast<char*>(buffer));
+ decoder_->setBufStart(reinterpret_cast<char*>(buffer));
if (curIdx == offset + len) return;
// readByte() will update 'bufferStart' and 'bufferEnd'.
- data[curIdx++] = decoder->readByte();
+ data[curIdx++] = decoder_->readByte();
}
}
@@ -84,23 +84,23 @@ namespace orc {
uint64_t curIdx = offset;
while (curIdx < offset + len) {
// Exhaust the buffer
- int64_t bufferNum = decoder->bufLength() / 2;
+ int64_t bufferNum = decoder_->bufLength() / 2;
bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
uint16_t b0, b1;
// Avoid updating 'bufferStart' inside the loop.
- auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+ auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart());
for (int i = 0; i < bufferNum; ++i) {
b0 = static_cast<uint16_t>(*buffer);
b1 = static_cast<uint16_t>(*(buffer + 1));
buffer += 2;
data[curIdx++] = (b0 << 8) | b1;
}
- decoder->setBufStart(reinterpret_cast<char*>(buffer));
+ decoder_->setBufStart(reinterpret_cast<char*>(buffer));
if (curIdx == offset + len) return;
// One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
- b0 = decoder->readByte();
- b1 = decoder->readByte();
+ b0 = decoder_->readByte();
+ b1 = decoder_->readByte();
data[curIdx++] = (b0 << 8) | b1;
}
}
@@ -109,11 +109,11 @@ namespace orc {
uint64_t curIdx = offset;
while (curIdx < offset + len) {
// Exhaust the buffer
- int64_t bufferNum = decoder->bufLength() / 3;
+ int64_t bufferNum = decoder_->bufLength() / 3;
bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
uint32_t b0, b1, b2;
// Avoid updating 'bufferStart' inside the loop.
- auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+ auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart());
for (int i = 0; i < bufferNum; ++i) {
b0 = static_cast<uint32_t>(*buffer);
b1 = static_cast<uint32_t>(*(buffer + 1));
@@ -122,13 +122,13 @@ namespace orc {
data[curIdx++] = static_cast<int64_t>((b0 << 16) | (b1 << 8) | b2);
}
//////decoder->bufferStart += bufferNum * 3;
- decoder->setBufStart(reinterpret_cast<char*>(buffer));
+ decoder_->setBufStart(reinterpret_cast<char*>(buffer));
if (curIdx == offset + len) return;
// One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
- b0 = decoder->readByte();
- b1 = decoder->readByte();
- b2 = decoder->readByte();
+ b0 = decoder_->readByte();
+ b1 = decoder_->readByte();
+ b2 = decoder_->readByte();
data[curIdx++] = static_cast<int64_t>((b0 << 16) | (b1 << 8) | b2);
}
}
@@ -137,11 +137,11 @@ namespace orc {
uint64_t curIdx = offset;
while (curIdx < offset + len) {
// Exhaust the buffer
- int64_t bufferNum = decoder->bufLength() / 4;
+ int64_t bufferNum = decoder_->bufLength() / 4;
bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
uint32_t b0, b1, b2, b3;
// Avoid updating 'bufferStart' inside the loop.
- auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+ auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart());
for (int i = 0; i < bufferNum; ++i) {
b0 = static_cast<uint32_t>(*buffer);
b1 = static_cast<uint32_t>(*(buffer + 1));
@@ -150,14 +150,14 @@ namespace orc {
buffer += 4;
data[curIdx++] = static_cast<int64_t>((b0 << 24) | (b1 << 16) | (b2 << 8) | b3);
}
- decoder->setBufStart(reinterpret_cast<char*>(buffer));
+ decoder_->setBufStart(reinterpret_cast<char*>(buffer));
if (curIdx == offset + len) return;
// One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
- b0 = decoder->readByte();
- b1 = decoder->readByte();
- b2 = decoder->readByte();
- b3 = decoder->readByte();
+ b0 = decoder_->readByte();
+ b1 = decoder_->readByte();
+ b2 = decoder_->readByte();
+ b3 = decoder_->readByte();
data[curIdx++] = static_cast<int64_t>((b0 << 24) | (b1 << 16) | (b2 << 8) | b3);
}
}
@@ -166,11 +166,11 @@ namespace orc {
uint64_t curIdx = offset;
while (curIdx < offset + len) {
// Exhaust the buffer
- int64_t bufferNum = decoder->bufLength() / 5;
+ int64_t bufferNum = decoder_->bufLength() / 5;
bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
uint64_t b0, b1, b2, b3, b4;
// Avoid updating 'bufferStart' inside the loop.
- auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+ auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart());
for (int i = 0; i < bufferNum; ++i) {
b0 = static_cast<uint32_t>(*buffer);
b1 = static_cast<uint32_t>(*(buffer + 1));
@@ -181,15 +181,15 @@ namespace orc {
data[curIdx++] =
static_cast<int64_t>((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
}
- decoder->setBufStart(reinterpret_cast<char*>(buffer));
+ decoder_->setBufStart(reinterpret_cast<char*>(buffer));
if (curIdx == offset + len) return;
// One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
- b0 = decoder->readByte();
- b1 = decoder->readByte();
- b2 = decoder->readByte();
- b3 = decoder->readByte();
- b4 = decoder->readByte();
+ b0 = decoder_->readByte();
+ b1 = decoder_->readByte();
+ b2 = decoder_->readByte();
+ b3 = decoder_->readByte();
+ b4 = decoder_->readByte();
data[curIdx++] = static_cast<int64_t>((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
}
}
@@ -198,11 +198,11 @@ namespace orc {
uint64_t curIdx = offset;
while (curIdx < offset + len) {
// Exhaust the buffer
- int64_t bufferNum = decoder->bufLength() / 6;
+ int64_t bufferNum = decoder_->bufLength() / 6;
bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
uint64_t b0, b1, b2, b3, b4, b5;
// Avoid updating 'bufferStart' inside the loop.
- auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+ auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart());
for (int i = 0; i < bufferNum; ++i) {
b0 = static_cast<uint32_t>(*buffer);
b1 = static_cast<uint32_t>(*(buffer + 1));
@@ -214,16 +214,16 @@ namespace orc {
data[curIdx++] = static_cast<int64_t>((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) |
(b4 << 8) | b5);
}
- decoder->setBufStart(reinterpret_cast<char*>(buffer));
+ decoder_->setBufStart(reinterpret_cast<char*>(buffer));
if (curIdx == offset + len) return;
// One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
- b0 = decoder->readByte();
- b1 = decoder->readByte();
- b2 = decoder->readByte();
- b3 = decoder->readByte();
- b4 = decoder->readByte();
- b5 = decoder->readByte();
+ b0 = decoder_->readByte();
+ b1 = decoder_->readByte();
+ b2 = decoder_->readByte();
+ b3 = decoder_->readByte();
+ b4 = decoder_->readByte();
+ b5 = decoder_->readByte();
data[curIdx++] =
static_cast<int64_t>((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5);
}
@@ -233,11 +233,11 @@ namespace orc {
uint64_t curIdx = offset;
while (curIdx < offset + len) {
// Exhaust the buffer
- int64_t bufferNum = decoder->bufLength() / 7;
+ int64_t bufferNum = decoder_->bufLength() / 7;
bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
uint64_t b0, b1, b2, b3, b4, b5, b6;
// Avoid updating 'bufferStart' inside the loop.
- auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+ auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart());
for (int i = 0; i < bufferNum; ++i) {
b0 = static_cast<uint32_t>(*buffer);
b1 = static_cast<uint32_t>(*(buffer + 1));
@@ -250,17 +250,17 @@ namespace orc {
data[curIdx++] = static_cast<int64_t>((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) |
(b4 << 16) | (b5 << 8) | b6);
}
- decoder->setBufStart(reinterpret_cast<char*>(buffer));
+ decoder_->setBufStart(reinterpret_cast<char*>(buffer));
if (curIdx == offset + len) return;
// One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
- b0 = decoder->readByte();
- b1 = decoder->readByte();
- b2 = decoder->readByte();
- b3 = decoder->readByte();
- b4 = decoder->readByte();
- b5 = decoder->readByte();
- b6 = decoder->readByte();
+ b0 = decoder_->readByte();
+ b1 = decoder_->readByte();
+ b2 = decoder_->readByte();
+ b3 = decoder_->readByte();
+ b4 = decoder_->readByte();
+ b5 = decoder_->readByte();
+ b6 = decoder_->readByte();
data[curIdx++] = static_cast<int64_t>((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) |
(b4 << 16) | (b5 << 8) | b6);
}
@@ -270,11 +270,11 @@ namespace orc {
uint64_t curIdx = offset;
while (curIdx < offset + len) {
// Exhaust the buffer
- int64_t bufferNum = decoder->bufLength() / 8;
+ int64_t bufferNum = decoder_->bufLength() / 8;
bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
uint64_t b0, b1, b2, b3, b4, b5, b6, b7;
// Avoid updating 'bufferStart' inside the loop.
- auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+ auto* buffer = reinterpret_cast<unsigned char*>(decoder_->getBufStart());
for (int i = 0; i < bufferNum; ++i) {
b0 = static_cast<uint32_t>(*buffer);
b1 = static_cast<uint32_t>(*(buffer + 1));
@@ -288,18 +288,18 @@ namespace orc {
data[curIdx++] = static_cast<int64_t>((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) |
(b4 << 24) | (b5 << 16) | (b6 << 8) | b7);
}
- decoder->setBufStart(reinterpret_cast<char*>(buffer));
+ decoder_->setBufStart(reinterpret_cast<char*>(buffer));
if (curIdx == offset + len) return;
// One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
- b0 = decoder->readByte();
- b1 = decoder->readByte();
- b2 = decoder->readByte();
- b3 = decoder->readByte();
- b4 = decoder->readByte();
- b5 = decoder->readByte();
- b6 = decoder->readByte();
- b7 = decoder->readByte();
+ b0 = decoder_->readByte();
+ b1 = decoder_->readByte();
+ b2 = decoder_->readByte();
+ b3 = decoder_->readByte();
+ b4 = decoder_->readByte();
+ b5 = decoder_->readByte();
+ b6 = decoder_->readByte();
+ b7 = decoder_->readByte();
data[curIdx++] = static_cast<int64_t>((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) |
(b4 << 24) | (b5 << 16) | (b6 << 8) | b7);
}
@@ -309,19 +309,19 @@ namespace orc {
for (uint64_t i = offset; i < (offset + len); i++) {
uint64_t result = 0;
uint64_t bitsLeftToRead = fbs;
- while (bitsLeftToRead > decoder->getBitsLeft()) {
- result <<= decoder->getBitsLeft();
- result |= decoder->getCurByte() & ((1 << decoder->getBitsLeft()) - 1);
- bitsLeftToRead -= decoder->getBitsLeft();
- decoder->setCurByte(decoder->readByte());
- decoder->setBitsLeft(8);
+ while (bitsLeftToRead > decoder_->getBitsLeft()) {
+ result <<= decoder_->getBitsLeft();
+ result |= decoder_->getCurByte() & ((1 << decoder_->getBitsLeft()) - 1);
+ bitsLeftToRead -= decoder_->getBitsLeft();
+ decoder_->setCurByte(decoder_->readByte());
+ decoder_->setBitsLeft(8);
}
// handle the left over bits
if (bitsLeftToRead > 0) {
result <<= bitsLeftToRead;
- decoder->setBitsLeft(decoder->getBitsLeft() - static_cast<uint32_t>(bitsLeftToRead));
- result |= (decoder->getCurByte() >> decoder->getBitsLeft()) & ((1 << bitsLeftToRead) - 1);
+ decoder_->setBitsLeft(decoder_->getBitsLeft() - static_cast<uint32_t>(bitsLeftToRead));
+ result |= (decoder_->getCurByte() >> decoder_->getBitsLeft()) & ((1 << bitsLeftToRead) - 1);
}
data[i] = static_cast<int64_t>(result);
}
diff --git a/contrib/libs/apache/orc/c++/src/BpackingDefault.hh b/contrib/libs/apache/orc/c++/src/BpackingDefault.hh
index 0a58234495..bbd7851260 100644
--- a/contrib/libs/apache/orc/c++/src/BpackingDefault.hh
+++ b/contrib/libs/apache/orc/c++/src/BpackingDefault.hh
@@ -45,7 +45,7 @@ namespace orc {
void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs);
private:
- RleDecoderV2* decoder;
+ RleDecoderV2* decoder_;
};
class BitUnpackDefault : public BitUnpack {
diff --git a/contrib/libs/apache/orc/c++/src/ByteRLE.cc b/contrib/libs/apache/orc/c++/src/ByteRLE.cc
index b81d282e35..ded9f55a00 100644
--- a/contrib/libs/apache/orc/c++/src/ByteRLE.cc
+++ b/contrib/libs/apache/orc/c++/src/ByteRLE.cc
@@ -63,6 +63,8 @@ namespace orc {
virtual void suppress() override;
+ virtual void finishEncode() override;
+
/**
* Reset to initial state
*/
@@ -186,16 +188,17 @@ namespace orc {
void ByteRleEncoderImpl::recordPosition(PositionRecorder* recorder) const {
uint64_t flushedSize = outputStream->getSize();
- uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition);
+ uint64_t unusedBufferSize = static_cast<uint64_t>(bufferLength - bufferPosition);
if (outputStream->isCompressed()) {
// start of the compression chunk in the stream
recorder->add(flushedSize);
- // number of decompressed bytes that need to be consumed
- recorder->add(unflushedSize);
+ // There are multiple blocks in the input buffer, but bufferPosition only records the
+ // effective length of the last block. We need rawInputBufferSize to record the total length
+ // of all variable blocks.
+ recorder->add(outputStream->getRawInputBufferSize() - unusedBufferSize);
} else {
- flushedSize -= static_cast<uint64_t>(bufferLength);
// byte offset of the RLE run’s start location
- recorder->add(flushedSize + unflushedSize);
+ recorder->add(flushedSize - unusedBufferSize);
}
recorder->add(static_cast<uint64_t>(numLiterals));
}
@@ -215,6 +218,13 @@ namespace orc {
reset();
}
+ void ByteRleEncoderImpl::finishEncode() {
+ writeValues();
+ outputStream->BackUp(bufferLength - bufferPosition);
+ outputStream->finishStream();
+ bufferLength = bufferPosition = 0;
+ }
+
std::unique_ptr<ByteRleEncoder> createByteRleEncoder(
std::unique_ptr<BufferedOutputStream> output) {
return std::make_unique<ByteRleEncoderImpl>(std::move(output));
@@ -244,14 +254,14 @@ namespace orc {
virtual void suppress() override;
private:
- int bitsRemained;
- char current;
+ int bitsRemained_;
+ char current_;
};
BooleanRleEncoderImpl::BooleanRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output)
: ByteRleEncoderImpl(std::move(output)) {
- bitsRemained = 8;
- current = static_cast<char>(0);
+ bitsRemained_ = 8;
+ current_ = static_cast<char>(0);
}
BooleanRleEncoderImpl::~BooleanRleEncoderImpl() {
@@ -260,43 +270,43 @@ namespace orc {
void BooleanRleEncoderImpl::add(const char* data, uint64_t numValues, const char* notNull) {
for (uint64_t i = 0; i < numValues; ++i) {
- if (bitsRemained == 0) {
- write(current);
- current = static_cast<char>(0);
- bitsRemained = 8;
+ if (bitsRemained_ == 0) {
+ write(current_);
+ current_ = static_cast<char>(0);
+ bitsRemained_ = 8;
}
if (!notNull || notNull[i]) {
if (!data || data[i]) {
- current = static_cast<char>(current | (0x80 >> (8 - bitsRemained)));
+ current_ = static_cast<char>(current_ | (0x80 >> (8 - bitsRemained_)));
}
- --bitsRemained;
+ --bitsRemained_;
}
}
- if (bitsRemained == 0) {
- write(current);
- current = static_cast<char>(0);
- bitsRemained = 8;
+ if (bitsRemained_ == 0) {
+ write(current_);
+ current_ = static_cast<char>(0);
+ bitsRemained_ = 8;
}
}
uint64_t BooleanRleEncoderImpl::flush() {
- if (bitsRemained != 8) {
- write(current);
+ if (bitsRemained_ != 8) {
+ write(current_);
}
- bitsRemained = 8;
- current = static_cast<char>(0);
+ bitsRemained_ = 8;
+ current_ = static_cast<char>(0);
return ByteRleEncoderImpl::flush();
}
void BooleanRleEncoderImpl::recordPosition(PositionRecorder* recorder) const {
ByteRleEncoderImpl::recordPosition(recorder);
- recorder->add(static_cast<uint64_t>(8 - bitsRemained));
+ recorder->add(static_cast<uint64_t>(8 - bitsRemained_));
}
void BooleanRleEncoderImpl::suppress() {
ByteRleEncoderImpl::suppress();
- bitsRemained = 8;
- current = static_cast<char>(0);
+ bitsRemained_ = 8;
+ current_ = static_cast<char>(0);
}
std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder(
@@ -386,8 +396,8 @@ namespace orc {
}
ByteRleDecoderImpl::ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input,
- ReaderMetrics* _metrics)
- : metrics(_metrics) {
+ ReaderMetrics* metrics)
+ : metrics(metrics) {
inputStream = std::move(input);
reset();
}
@@ -526,8 +536,8 @@ namespace orc {
};
BooleanRleDecoderImpl::BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input,
- ReaderMetrics* _metrics)
- : ByteRleDecoderImpl(std::move(input), _metrics) {
+ ReaderMetrics* metrics)
+ : ByteRleDecoderImpl(std::move(input), metrics) {
remainingBits = 0;
lastByte = 0;
}
diff --git a/contrib/libs/apache/orc/c++/src/ByteRLE.hh b/contrib/libs/apache/orc/c++/src/ByteRLE.hh
index bd19f52ecc..bee064f666 100644
--- a/contrib/libs/apache/orc/c++/src/ByteRLE.hh
+++ b/contrib/libs/apache/orc/c++/src/ByteRLE.hh
@@ -59,6 +59,13 @@ namespace orc {
* suppress the data and reset to initial state
*/
virtual void suppress() = 0;
+
+ /**
+ * Finalize the encoding process. This function should be called after all data required for
+ * encoding has been added. It ensures that any remaining data is processed and the final state
+ * of the encoder is set.
+ */
+ virtual void finishEncode() = 0;
};
class ByteRleDecoder {
diff --git a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc
index 5297f80371..8b16ecbd09 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc
+++ b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc
@@ -17,6 +17,7 @@
*/
#include "orc/ColumnPrinter.hh"
+#include "orc/Int128.hh"
#include "orc/orc-config.hh"
#include "Adaptor.hh"
@@ -35,7 +36,7 @@ namespace orc {
class VoidColumnPrinter : public ColumnPrinter {
public:
- VoidColumnPrinter(std::string&);
+ VoidColumnPrinter(std::string&, ColumnPrinter::Param);
~VoidColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
@@ -43,10 +44,10 @@ namespace orc {
class BooleanColumnPrinter : public ColumnPrinter {
private:
- const int64_t* data;
+ const int64_t* data_;
public:
- BooleanColumnPrinter(std::string&);
+ BooleanColumnPrinter(std::string&, ColumnPrinter::Param);
~BooleanColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
@@ -54,10 +55,10 @@ namespace orc {
class LongColumnPrinter : public ColumnPrinter {
private:
- const int64_t* data;
+ const int64_t* data_;
public:
- LongColumnPrinter(std::string&);
+ LongColumnPrinter(std::string&, ColumnPrinter::Param);
~LongColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
@@ -65,11 +66,11 @@ namespace orc {
class DoubleColumnPrinter : public ColumnPrinter {
private:
- const double* data;
- const bool isFloat;
+ const double* data_;
+ const bool isFloat_;
public:
- DoubleColumnPrinter(std::string&, const Type& type);
+ DoubleColumnPrinter(std::string&, const Type& type, ColumnPrinter::Param);
virtual ~DoubleColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
@@ -77,11 +78,11 @@ namespace orc {
class TimestampColumnPrinter : public ColumnPrinter {
private:
- const int64_t* seconds;
- const int64_t* nanoseconds;
+ const int64_t* seconds_;
+ const int64_t* nanoseconds_;
public:
- TimestampColumnPrinter(std::string&);
+ TimestampColumnPrinter(std::string&, ColumnPrinter::Param);
~TimestampColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
@@ -89,10 +90,10 @@ namespace orc {
class DateColumnPrinter : public ColumnPrinter {
private:
- const int64_t* data;
+ const int64_t* data_;
public:
- DateColumnPrinter(std::string&);
+ DateColumnPrinter(std::string&, ColumnPrinter::Param);
~DateColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
@@ -100,11 +101,12 @@ namespace orc {
class Decimal64ColumnPrinter : public ColumnPrinter {
private:
- const int64_t* data;
- int32_t scale;
+ const int64_t* data_;
+ int32_t scale_;
+ ColumnPrinter::Param param_;
public:
- Decimal64ColumnPrinter(std::string&);
+ Decimal64ColumnPrinter(std::string&, ColumnPrinter::Param);
~Decimal64ColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
@@ -112,11 +114,12 @@ namespace orc {
class Decimal128ColumnPrinter : public ColumnPrinter {
private:
- const Int128* data;
- int32_t scale;
+ const Int128* data_;
+ int32_t scale_;
+ ColumnPrinter::Param param_;
public:
- Decimal128ColumnPrinter(std::string&);
+ Decimal128ColumnPrinter(std::string&, ColumnPrinter::Param);
~Decimal128ColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
@@ -124,11 +127,11 @@ namespace orc {
class StringColumnPrinter : public ColumnPrinter {
private:
- const char* const* start;
- const int64_t* length;
+ const char* const* start_;
+ const int64_t* length_;
public:
- StringColumnPrinter(std::string&);
+ StringColumnPrinter(std::string&, ColumnPrinter::Param);
virtual ~StringColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
@@ -136,11 +139,11 @@ namespace orc {
class BinaryColumnPrinter : public ColumnPrinter {
private:
- const char* const* start;
- const int64_t* length;
+ const char* const* start_;
+ const int64_t* length_;
public:
- BinaryColumnPrinter(std::string&);
+ BinaryColumnPrinter(std::string&, ColumnPrinter::Param);
virtual ~BinaryColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
@@ -148,11 +151,11 @@ namespace orc {
class ListColumnPrinter : public ColumnPrinter {
private:
- const int64_t* offsets;
- std::unique_ptr<ColumnPrinter> elementPrinter;
+ const int64_t* offsets_;
+ std::unique_ptr<ColumnPrinter> elementPrinter_;
public:
- ListColumnPrinter(std::string&, const Type& type);
+ ListColumnPrinter(std::string&, const Type& type, ColumnPrinter::Param);
virtual ~ListColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
@@ -160,12 +163,12 @@ namespace orc {
class MapColumnPrinter : public ColumnPrinter {
private:
- const int64_t* offsets;
- std::unique_ptr<ColumnPrinter> keyPrinter;
- std::unique_ptr<ColumnPrinter> elementPrinter;
+ const int64_t* offsets_;
+ std::unique_ptr<ColumnPrinter> keyPrinter_;
+ std::unique_ptr<ColumnPrinter> elementPrinter_;
public:
- MapColumnPrinter(std::string&, const Type& type);
+ MapColumnPrinter(std::string&, const Type& type, ColumnPrinter::Param);
virtual ~MapColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
@@ -173,23 +176,23 @@ namespace orc {
class UnionColumnPrinter : public ColumnPrinter {
private:
- const unsigned char* tags;
- const uint64_t* offsets;
- std::vector<std::unique_ptr<ColumnPrinter>> fieldPrinter;
+ const unsigned char* tags_;
+ const uint64_t* offsets_;
+ std::vector<std::unique_ptr<ColumnPrinter>> fieldPrinter_;
public:
- UnionColumnPrinter(std::string&, const Type& type);
+ UnionColumnPrinter(std::string&, const Type& type, ColumnPrinter::Param);
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
class StructColumnPrinter : public ColumnPrinter {
private:
- std::vector<std::unique_ptr<ColumnPrinter>> fieldPrinter;
- std::vector<std::string> fieldNames;
+ std::vector<std::unique_ptr<ColumnPrinter>> fieldPrinter_;
+ std::vector<std::string> fieldNames_;
public:
- StructColumnPrinter(std::string&, const Type& type);
+ StructColumnPrinter(std::string&, const Type& type, ColumnPrinter::Param);
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
@@ -203,7 +206,7 @@ namespace orc {
file.append(ptr, len);
}
- ColumnPrinter::ColumnPrinter(std::string& _buffer) : buffer(_buffer) {
+ ColumnPrinter::ColumnPrinter(std::string& buffer) : buffer(buffer) {
notNull = nullptr;
hasNulls = false;
}
@@ -221,69 +224,70 @@ namespace orc {
}
}
- std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer, const Type* type) {
+ std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer, const Type* type,
+ ColumnPrinter::Param param) {
std::unique_ptr<ColumnPrinter> result;
if (type == nullptr) {
- result = std::make_unique<VoidColumnPrinter>(buffer);
+ result = std::make_unique<VoidColumnPrinter>(buffer, param);
} else {
switch (static_cast<int64_t>(type->getKind())) {
case BOOLEAN:
- result = std::make_unique<BooleanColumnPrinter>(buffer);
+ result = std::make_unique<BooleanColumnPrinter>(buffer, param);
break;
case BYTE:
case SHORT:
case INT:
case LONG:
- result = std::make_unique<LongColumnPrinter>(buffer);
+ result = std::make_unique<LongColumnPrinter>(buffer, param);
break;
case FLOAT:
case DOUBLE:
- result = std::make_unique<DoubleColumnPrinter>(buffer, *type);
+ result = std::make_unique<DoubleColumnPrinter>(buffer, *type, param);
break;
case STRING:
case VARCHAR:
case CHAR:
- result = std::make_unique<StringColumnPrinter>(buffer);
+ result = std::make_unique<StringColumnPrinter>(buffer, param);
break;
case BINARY:
- result = std::make_unique<BinaryColumnPrinter>(buffer);
+ result = std::make_unique<BinaryColumnPrinter>(buffer, param);
break;
case TIMESTAMP:
case TIMESTAMP_INSTANT:
- result = std::make_unique<TimestampColumnPrinter>(buffer);
+ result = std::make_unique<TimestampColumnPrinter>(buffer, param);
break;
case LIST:
- result = std::make_unique<ListColumnPrinter>(buffer, *type);
+ result = std::make_unique<ListColumnPrinter>(buffer, *type, param);
break;
case MAP:
- result = std::make_unique<MapColumnPrinter>(buffer, *type);
+ result = std::make_unique<MapColumnPrinter>(buffer, *type, param);
break;
case STRUCT:
- result = std::make_unique<StructColumnPrinter>(buffer, *type);
+ result = std::make_unique<StructColumnPrinter>(buffer, *type, param);
break;
case DECIMAL:
if (type->getPrecision() == 0 || type->getPrecision() > 18) {
- result = std::make_unique<Decimal128ColumnPrinter>(buffer);
+ result = std::make_unique<Decimal128ColumnPrinter>(buffer, param);
} else {
- result = std::make_unique<Decimal64ColumnPrinter>(buffer);
+ result = std::make_unique<Decimal64ColumnPrinter>(buffer, param);
}
break;
case DATE:
- result = std::make_unique<DateColumnPrinter>(buffer);
+ result = std::make_unique<DateColumnPrinter>(buffer, param);
break;
case UNION:
- result = std::make_unique<UnionColumnPrinter>(buffer, *type);
+ result = std::make_unique<UnionColumnPrinter>(buffer, *type, param);
break;
default:
@@ -293,7 +297,8 @@ namespace orc {
return result;
}
- VoidColumnPrinter::VoidColumnPrinter(std::string& _buffer) : ColumnPrinter(_buffer) {
+ VoidColumnPrinter::VoidColumnPrinter(std::string& buffer, ColumnPrinter::Param)
+ : ColumnPrinter(buffer) {
// PASS
}
@@ -305,33 +310,34 @@ namespace orc {
writeString(buffer, "null");
}
- LongColumnPrinter::LongColumnPrinter(std::string& _buffer)
- : ColumnPrinter(_buffer), data(nullptr) {
+ LongColumnPrinter::LongColumnPrinter(std::string& buffer, ColumnPrinter::Param)
+ : ColumnPrinter(buffer), data_(nullptr) {
// PASS
}
void LongColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
- data = dynamic_cast<const LongVectorBatch&>(batch).data.data();
+ data_ = dynamic_cast<const LongVectorBatch&>(batch).data.data();
}
void LongColumnPrinter::printRow(uint64_t rowId) {
if (hasNulls && !notNull[rowId]) {
writeString(buffer, "null");
} else {
- const auto numBuffer = std::to_string(static_cast<int64_t>(data[rowId]));
+ const auto numBuffer = std::to_string(static_cast<int64_t>(data_[rowId]));
writeString(buffer, numBuffer.c_str());
}
}
- DoubleColumnPrinter::DoubleColumnPrinter(std::string& _buffer, const Type& type)
- : ColumnPrinter(_buffer), data(nullptr), isFloat(type.getKind() == FLOAT) {
+ DoubleColumnPrinter::DoubleColumnPrinter(std::string& buffer, const Type& type,
+ ColumnPrinter::Param)
+ : ColumnPrinter(buffer), data_(nullptr), isFloat_(type.getKind() == FLOAT) {
// PASS
}
void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
- data = dynamic_cast<const DoubleVectorBatch&>(batch).data.data();
+ data_ = dynamic_cast<const DoubleVectorBatch&>(batch).data.data();
}
void DoubleColumnPrinter::printRow(uint64_t rowId) {
@@ -339,86 +345,76 @@ namespace orc {
writeString(buffer, "null");
} else {
char numBuffer[64];
- snprintf(numBuffer, sizeof(numBuffer), isFloat ? "%.7g" : "%.14g", data[rowId]);
+ snprintf(numBuffer, sizeof(numBuffer), isFloat_ ? "%.7g" : "%.14g", data_[rowId]);
writeString(buffer, numBuffer);
}
}
- Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& _buffer)
- : ColumnPrinter(_buffer), data(nullptr), scale(0) {
+ Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& buffer, ColumnPrinter::Param param)
+ : ColumnPrinter(buffer), data_(nullptr), scale_(0), param_(param) {
// PASS
}
void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
- data = dynamic_cast<const Decimal64VectorBatch&>(batch).values.data();
- scale = dynamic_cast<const Decimal64VectorBatch&>(batch).scale;
+ data_ = dynamic_cast<const Decimal64VectorBatch&>(batch).values.data();
+ scale_ = dynamic_cast<const Decimal64VectorBatch&>(batch).scale;
}
- std::string toDecimalString(int64_t value, int32_t scale) {
- std::stringstream buffer;
- if (scale == 0) {
- buffer << value;
- return buffer.str();
- }
- std::string sign = "";
- if (value < 0) {
- sign = "-";
- value = -value;
- }
- buffer << value;
- std::string str = buffer.str();
- int32_t len = static_cast<int32_t>(str.length());
- if (len > scale) {
- return sign + str.substr(0, static_cast<size_t>(len - scale)) + "." +
- str.substr(static_cast<size_t>(len - scale), static_cast<size_t>(scale));
- } else if (len == scale) {
- return sign + "0." + str;
- } else {
- std::string result = sign + "0.";
- for (int32_t i = 0; i < scale - len; ++i) {
- result += "0";
- }
- return result + str;
- }
+ std::string toDecimalString(int64_t value, int32_t scale, bool trimTrailingZeros) {
+ return Int128(value).toDecimalString(scale, trimTrailingZeros);
}
void Decimal64ColumnPrinter::printRow(uint64_t rowId) {
if (hasNulls && !notNull[rowId]) {
writeString(buffer, "null");
} else {
- writeString(buffer, toDecimalString(data[rowId], scale).c_str());
+ bool trimTrailingZeros = param_.printDecimalTrimTrailingZeros;
+ if (param_.printDecimalAsString) {
+ writeChar(buffer, '"');
+ writeString(buffer, toDecimalString(data_[rowId], scale_, trimTrailingZeros).c_str());
+ writeChar(buffer, '"');
+ } else {
+ writeString(buffer, toDecimalString(data_[rowId], scale_, trimTrailingZeros).c_str());
+ }
}
}
- Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& _buffer)
- : ColumnPrinter(_buffer), data(nullptr), scale(0) {
+ Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& buffer, ColumnPrinter::Param param)
+ : ColumnPrinter(buffer), data_(nullptr), scale_(0), param_(param) {
// PASS
}
void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
- data = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data();
- scale = dynamic_cast<const Decimal128VectorBatch&>(batch).scale;
+ data_ = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data();
+ scale_ = dynamic_cast<const Decimal128VectorBatch&>(batch).scale;
}
void Decimal128ColumnPrinter::printRow(uint64_t rowId) {
if (hasNulls && !notNull[rowId]) {
writeString(buffer, "null");
} else {
- writeString(buffer, data[rowId].toDecimalString(scale).c_str());
+ bool trimTrailingZeros = param_.printDecimalTrimTrailingZeros;
+ if (param_.printDecimalAsString) {
+ writeChar(buffer, '"');
+ writeString(buffer, data_[rowId].toDecimalString(scale_, trimTrailingZeros).c_str());
+ writeChar(buffer, '"');
+ } else {
+ writeString(buffer, data_[rowId].toDecimalString(scale_, trimTrailingZeros).c_str());
+ }
}
}
- StringColumnPrinter::StringColumnPrinter(std::string& _buffer)
- : ColumnPrinter(_buffer), start(nullptr), length(nullptr) {
+ StringColumnPrinter::StringColumnPrinter(std::string& buffer, ColumnPrinter::Param)
+ : ColumnPrinter(buffer), start_(nullptr), length_(nullptr) {
// PASS
}
void StringColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
- start = dynamic_cast<const StringVectorBatch&>(batch).data.data();
- length = dynamic_cast<const StringVectorBatch&>(batch).length.data();
+ start_ = dynamic_cast<const StringVectorBatch&>(batch).data.data();
+ length_ = dynamic_cast<const StringVectorBatch&>(batch).length.data();
}
void StringColumnPrinter::printRow(uint64_t rowId) {
@@ -426,8 +422,8 @@ namespace orc {
writeString(buffer, "null");
} else {
writeChar(buffer, '"');
- for (int64_t i = 0; i < length[rowId]; ++i) {
- char ch = static_cast<char>(start[rowId][i]);
+ for (int64_t i = 0; i < length_[rowId]; ++i) {
+ char ch = static_cast<char>(start_[rowId][i]);
switch (ch) {
case '\\':
writeString(buffer, "\\\\");
@@ -459,15 +455,16 @@ namespace orc {
}
}
- ListColumnPrinter::ListColumnPrinter(std::string& _buffer, const Type& type)
- : ColumnPrinter(_buffer), offsets(nullptr) {
- elementPrinter = createColumnPrinter(buffer, type.getSubtype(0));
+ ListColumnPrinter::ListColumnPrinter(std::string& buffer, const Type& type,
+ ColumnPrinter::Param param)
+ : ColumnPrinter(buffer), offsets_(nullptr) {
+ elementPrinter_ = createColumnPrinter(buffer, type.getSubtype(0), param);
}
void ListColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
- offsets = dynamic_cast<const ListVectorBatch&>(batch).offsets.data();
- elementPrinter->reset(*dynamic_cast<const ListVectorBatch&>(batch).elements);
+ offsets_ = dynamic_cast<const ListVectorBatch&>(batch).offsets.data();
+ elementPrinter_->reset(*dynamic_cast<const ListVectorBatch&>(batch).elements);
}
void ListColumnPrinter::printRow(uint64_t rowId) {
@@ -475,28 +472,29 @@ namespace orc {
writeString(buffer, "null");
} else {
writeChar(buffer, '[');
- for (int64_t i = offsets[rowId]; i < offsets[rowId + 1]; ++i) {
- if (i != offsets[rowId]) {
+ for (int64_t i = offsets_[rowId]; i < offsets_[rowId + 1]; ++i) {
+ if (i != offsets_[rowId]) {
writeString(buffer, ", ");
}
- elementPrinter->printRow(static_cast<uint64_t>(i));
+ elementPrinter_->printRow(static_cast<uint64_t>(i));
}
writeChar(buffer, ']');
}
}
- MapColumnPrinter::MapColumnPrinter(std::string& _buffer, const Type& type)
- : ColumnPrinter(_buffer), offsets(nullptr) {
- keyPrinter = createColumnPrinter(buffer, type.getSubtype(0));
- elementPrinter = createColumnPrinter(buffer, type.getSubtype(1));
+ MapColumnPrinter::MapColumnPrinter(std::string& buffer, const Type& type,
+ ColumnPrinter::Param param)
+ : ColumnPrinter(buffer), offsets_(nullptr) {
+ keyPrinter_ = createColumnPrinter(buffer, type.getSubtype(0), param);
+ elementPrinter_ = createColumnPrinter(buffer, type.getSubtype(1), param);
}
void MapColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
const MapVectorBatch& myBatch = dynamic_cast<const MapVectorBatch&>(batch);
- offsets = myBatch.offsets.data();
- keyPrinter->reset(*myBatch.keys);
- elementPrinter->reset(*myBatch.elements);
+ offsets_ = myBatch.offsets.data();
+ keyPrinter_->reset(*myBatch.keys);
+ elementPrinter_->reset(*myBatch.elements);
}
void MapColumnPrinter::printRow(uint64_t rowId) {
@@ -504,34 +502,35 @@ namespace orc {
writeString(buffer, "null");
} else {
writeChar(buffer, '[');
- for (int64_t i = offsets[rowId]; i < offsets[rowId + 1]; ++i) {
- if (i != offsets[rowId]) {
+ for (int64_t i = offsets_[rowId]; i < offsets_[rowId + 1]; ++i) {
+ if (i != offsets_[rowId]) {
writeString(buffer, ", ");
}
writeString(buffer, "{\"key\": ");
- keyPrinter->printRow(static_cast<uint64_t>(i));
+ keyPrinter_->printRow(static_cast<uint64_t>(i));
writeString(buffer, ", \"value\": ");
- elementPrinter->printRow(static_cast<uint64_t>(i));
+ elementPrinter_->printRow(static_cast<uint64_t>(i));
writeChar(buffer, '}');
}
writeChar(buffer, ']');
}
}
- UnionColumnPrinter::UnionColumnPrinter(std::string& _buffer, const Type& type)
- : ColumnPrinter(_buffer), tags(nullptr), offsets(nullptr) {
+ UnionColumnPrinter::UnionColumnPrinter(std::string& buffer, const Type& type,
+ ColumnPrinter::Param param)
+ : ColumnPrinter(buffer), tags_(nullptr), offsets_(nullptr) {
for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
- fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i)));
+ fieldPrinter_.push_back(createColumnPrinter(buffer, type.getSubtype(i), param));
}
}
void UnionColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
const UnionVectorBatch& unionBatch = dynamic_cast<const UnionVectorBatch&>(batch);
- tags = unionBatch.tags.data();
- offsets = unionBatch.offsets.data();
- for (size_t i = 0; i < fieldPrinter.size(); ++i) {
- fieldPrinter[i]->reset(*(unionBatch.children[i]));
+ tags_ = unionBatch.tags.data();
+ offsets_ = unionBatch.offsets.data();
+ for (size_t i = 0; i < fieldPrinter_.size(); ++i) {
+ fieldPrinter_[i]->reset(*(unionBatch.children[i]));
}
}
@@ -540,27 +539,28 @@ namespace orc {
writeString(buffer, "null");
} else {
writeString(buffer, "{\"tag\": ");
- const auto numBuffer = std::to_string(static_cast<int64_t>(tags[rowId]));
+ const auto numBuffer = std::to_string(static_cast<int64_t>(tags_[rowId]));
writeString(buffer, numBuffer.c_str());
writeString(buffer, ", \"value\": ");
- fieldPrinter[tags[rowId]]->printRow(offsets[rowId]);
+ fieldPrinter_[tags_[rowId]]->printRow(offsets_[rowId]);
writeChar(buffer, '}');
}
}
- StructColumnPrinter::StructColumnPrinter(std::string& _buffer, const Type& type)
- : ColumnPrinter(_buffer) {
+ StructColumnPrinter::StructColumnPrinter(std::string& buffer, const Type& type,
+ ColumnPrinter::Param param)
+ : ColumnPrinter(buffer) {
for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
- fieldNames.push_back(type.getFieldName(i));
- fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i)));
+ fieldNames_.push_back(type.getFieldName(i));
+ fieldPrinter_.push_back(createColumnPrinter(buffer, type.getSubtype(i), param));
}
}
void StructColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
const StructVectorBatch& structBatch = dynamic_cast<const StructVectorBatch&>(batch);
- for (size_t i = 0; i < fieldPrinter.size(); ++i) {
- fieldPrinter[i]->reset(*(structBatch.fields[i]));
+ for (size_t i = 0; i < fieldPrinter_.size(); ++i) {
+ fieldPrinter_[i]->reset(*(structBatch.fields[i]));
}
}
@@ -569,21 +569,21 @@ namespace orc {
writeString(buffer, "null");
} else {
writeChar(buffer, '{');
- for (unsigned int i = 0; i < fieldPrinter.size(); ++i) {
+ for (unsigned int i = 0; i < fieldPrinter_.size(); ++i) {
if (i != 0) {
writeString(buffer, ", ");
}
writeChar(buffer, '"');
- writeString(buffer, fieldNames[i].c_str());
+ writeString(buffer, fieldNames_[i].c_str());
writeString(buffer, "\": ");
- fieldPrinter[i]->printRow(rowId);
+ fieldPrinter_[i]->printRow(rowId);
}
writeChar(buffer, '}');
}
}
- DateColumnPrinter::DateColumnPrinter(std::string& _buffer)
- : ColumnPrinter(_buffer), data(nullptr) {
+ DateColumnPrinter::DateColumnPrinter(std::string& buffer, ColumnPrinter::Param)
+ : ColumnPrinter(buffer), data_(nullptr) {
// PASS
}
@@ -591,7 +591,7 @@ namespace orc {
if (hasNulls && !notNull[rowId]) {
writeString(buffer, "null");
} else {
- const time_t timeValue = data[rowId] * 24 * 60 * 60;
+ const time_t timeValue = data_[rowId] * 24 * 60 * 60;
struct tm tmValue;
gmtime_r(&timeValue, &tmValue);
char timeBuffer[11];
@@ -604,11 +604,11 @@ namespace orc {
void DateColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
- data = dynamic_cast<const LongVectorBatch&>(batch).data.data();
+ data_ = dynamic_cast<const LongVectorBatch&>(batch).data.data();
}
- BooleanColumnPrinter::BooleanColumnPrinter(std::string& _buffer)
- : ColumnPrinter(_buffer), data(nullptr) {
+ BooleanColumnPrinter::BooleanColumnPrinter(std::string& buffer, ColumnPrinter::Param)
+ : ColumnPrinter(buffer), data_(nullptr) {
// PASS
}
@@ -616,17 +616,17 @@ namespace orc {
if (hasNulls && !notNull[rowId]) {
writeString(buffer, "null");
} else {
- writeString(buffer, (data[rowId] ? "true" : "false"));
+ writeString(buffer, (data_[rowId] ? "true" : "false"));
}
}
void BooleanColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
- data = dynamic_cast<const LongVectorBatch&>(batch).data.data();
+ data_ = dynamic_cast<const LongVectorBatch&>(batch).data.data();
}
- BinaryColumnPrinter::BinaryColumnPrinter(std::string& _buffer)
- : ColumnPrinter(_buffer), start(nullptr), length(nullptr) {
+ BinaryColumnPrinter::BinaryColumnPrinter(std::string& buffer, ColumnPrinter::Param)
+ : ColumnPrinter(buffer), start_(nullptr), length_(nullptr) {
// PASS
}
@@ -635,11 +635,11 @@ namespace orc {
writeString(buffer, "null");
} else {
writeChar(buffer, '[');
- for (int64_t i = 0; i < length[rowId]; ++i) {
+ for (int64_t i = 0; i < length_[rowId]; ++i) {
if (i != 0) {
writeString(buffer, ", ");
}
- const auto numBuffer = std::to_string(static_cast<int>(start[rowId][i]) & 0xff);
+ const auto numBuffer = std::to_string(static_cast<int>(start_[rowId][i]) & 0xff);
writeString(buffer, numBuffer.c_str());
}
writeChar(buffer, ']');
@@ -648,12 +648,12 @@ namespace orc {
void BinaryColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
- start = dynamic_cast<const StringVectorBatch&>(batch).data.data();
- length = dynamic_cast<const StringVectorBatch&>(batch).length.data();
+ start_ = dynamic_cast<const StringVectorBatch&>(batch).data.data();
+ length_ = dynamic_cast<const StringVectorBatch&>(batch).length.data();
}
- TimestampColumnPrinter::TimestampColumnPrinter(std::string& _buffer)
- : ColumnPrinter(_buffer), seconds(nullptr), nanoseconds(nullptr) {
+ TimestampColumnPrinter::TimestampColumnPrinter(std::string& buffer, ColumnPrinter::Param)
+ : ColumnPrinter(buffer), seconds_(nullptr), nanoseconds_(nullptr) {
// PASS
}
@@ -662,8 +662,8 @@ namespace orc {
if (hasNulls && !notNull[rowId]) {
writeString(buffer, "null");
} else {
- int64_t nanos = nanoseconds[rowId];
- time_t secs = static_cast<time_t>(seconds[rowId]);
+ int64_t nanos = nanoseconds_[rowId];
+ time_t secs = static_cast<time_t>(seconds_[rowId]);
struct tm tmValue;
gmtime_r(&secs, &tmValue);
char timeBuffer[20];
@@ -694,7 +694,7 @@ namespace orc {
void TimestampColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
const TimestampVectorBatch& ts = dynamic_cast<const TimestampVectorBatch&>(batch);
- seconds = ts.data.data();
- nanoseconds = ts.nanoseconds.data();
+ seconds_ = ts.data.data();
+ nanoseconds_ = ts.nanoseconds.data();
}
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/ColumnReader.cc b/contrib/libs/apache/orc/c++/src/ColumnReader.cc
index a6bbdabedc..af434c37ca 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnReader.cc
+++ b/contrib/libs/apache/orc/c++/src/ColumnReader.cc
@@ -138,7 +138,7 @@ namespace orc {
template <typename BatchType>
class BooleanColumnReader : public ColumnReader {
private:
- std::unique_ptr<orc::ByteRleDecoder> rle;
+ std::unique_ptr<orc::ByteRleDecoder> rle_;
public:
BooleanColumnReader(const Type& type, StripeStreams& stipe);
@@ -157,7 +157,7 @@ namespace orc {
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
if (stream == nullptr) throw ParseError("DATA stream not found in Boolean column");
- rle = createBooleanRleDecoder(std::move(stream), metrics);
+ rle_ = createBooleanRleDecoder(std::move(stream), metrics);
}
template <typename BatchType>
@@ -168,7 +168,7 @@ namespace orc {
template <typename BatchType>
uint64_t BooleanColumnReader<BatchType>::skip(uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
- rle->skip(numValues);
+ rle_->skip(numValues);
return numValues;
}
@@ -180,8 +180,8 @@ namespace orc {
// LongVectorBatch with long*. We cheat here in that case and use the long*
// and then expand it in a second pass..
auto* ptr = dynamic_cast<BatchType&>(rowBatch).data.data();
- rle->next(reinterpret_cast<char*>(ptr), numValues,
- rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
+ rle_->next(reinterpret_cast<char*>(ptr), numValues,
+ rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
expandBytesToIntegers(ptr, numValues);
}
@@ -189,27 +189,27 @@ namespace orc {
void BooleanColumnReader<BatchType>::seekToRowGroup(
std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
- rle->seek(positions.at(columnId));
+ rle_->seek(positions.at(columnId));
}
template <typename BatchType>
class ByteColumnReader : public ColumnReader {
private:
- std::unique_ptr<orc::ByteRleDecoder> rle;
+ std::unique_ptr<orc::ByteRleDecoder> rle_;
public:
ByteColumnReader(const Type& type, StripeStreams& stripe) : ColumnReader(type, stripe) {
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
if (stream == nullptr) throw ParseError("DATA stream not found in Byte column");
- rle = createByteRleDecoder(std::move(stream), metrics);
+ rle_ = createByteRleDecoder(std::move(stream), metrics);
}
~ByteColumnReader() override = default;
uint64_t skip(uint64_t numValues) override {
numValues = ColumnReader::skip(numValues);
- rle->skip(numValues);
+ rle_->skip(numValues);
return numValues;
}
@@ -218,14 +218,14 @@ namespace orc {
// Since the byte rle places the output in a char* instead of long*,
// we cheat here and use the long* and then expand it in a second pass.
auto* ptr = dynamic_cast<BatchType&>(rowBatch).data.data();
- rle->next(reinterpret_cast<char*>(ptr), numValues,
- rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
+ rle_->next(reinterpret_cast<char*>(ptr), numValues,
+ rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
expandBytesToIntegers(ptr, numValues);
}
void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override {
ColumnReader::seekToRowGroup(positions);
- rle->seek(positions.at(columnId));
+ rle_->seek(positions.at(columnId));
}
};
@@ -267,12 +267,12 @@ namespace orc {
class TimestampColumnReader : public ColumnReader {
private:
- std::unique_ptr<orc::RleDecoder> secondsRle;
- std::unique_ptr<orc::RleDecoder> nanoRle;
- const Timezone* writerTimezone;
- const Timezone* readerTimezone;
- const int64_t epochOffset;
- const bool sameTimezone;
+ std::unique_ptr<orc::RleDecoder> secondsRle_;
+ std::unique_ptr<orc::RleDecoder> nanoRle_;
+ const Timezone* writerTimezone_;
+ const Timezone* readerTimezone_;
+ const int64_t epochOffset_;
+ const bool sameTimezone_;
public:
TimestampColumnReader(const Type& type, StripeStreams& stripe, bool isInstantType);
@@ -288,18 +288,18 @@ namespace orc {
TimestampColumnReader::TimestampColumnReader(const Type& type, StripeStreams& stripe,
bool isInstantType)
: ColumnReader(type, stripe),
- writerTimezone(isInstantType ? &getTimezoneByName("GMT") : &stripe.getWriterTimezone()),
- readerTimezone(isInstantType ? &getTimezoneByName("GMT") : &stripe.getReaderTimezone()),
- epochOffset(writerTimezone->getEpoch()),
- sameTimezone(writerTimezone == readerTimezone) {
+ writerTimezone_(isInstantType ? &getTimezoneByName("GMT") : &stripe.getWriterTimezone()),
+ readerTimezone_(isInstantType ? &getTimezoneByName("GMT") : &stripe.getReaderTimezone()),
+ epochOffset_(writerTimezone_->getEpoch()),
+ sameTimezone_(writerTimezone_ == readerTimezone_) {
RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
if (stream == nullptr) throw ParseError("DATA stream not found in Timestamp column");
- secondsRle = createRleDecoder(std::move(stream), true, vers, memoryPool, metrics);
+ secondsRle_ = createRleDecoder(std::move(stream), true, vers, memoryPool, metrics);
stream = stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true);
if (stream == nullptr) throw ParseError("SECONDARY stream not found in Timestamp column");
- nanoRle = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics);
+ nanoRle_ = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics);
}
TimestampColumnReader::~TimestampColumnReader() {
@@ -308,8 +308,8 @@ namespace orc {
uint64_t TimestampColumnReader::skip(uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
- secondsRle->skip(numValues);
- nanoRle->skip(numValues);
+ secondsRle_->skip(numValues);
+ nanoRle_->skip(numValues);
return numValues;
}
@@ -318,9 +318,9 @@ namespace orc {
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
TimestampVectorBatch& timestampBatch = dynamic_cast<TimestampVectorBatch&>(rowBatch);
int64_t* secsBuffer = timestampBatch.data.data();
- secondsRle->next(secsBuffer, numValues, notNull);
+ secondsRle_->next(secsBuffer, numValues, notNull);
int64_t* nanoBuffer = timestampBatch.nanoseconds.data();
- nanoRle->next(nanoBuffer, numValues, notNull);
+ nanoRle_->next(nanoBuffer, numValues, notNull);
// Construct the values
for (uint64_t i = 0; i < numValues; i++) {
@@ -332,17 +332,17 @@ namespace orc {
nanoBuffer[i] *= 10;
}
}
- int64_t writerTime = secsBuffer[i] + epochOffset;
- if (!sameTimezone) {
+ int64_t writerTime = secsBuffer[i] + epochOffset_;
+ if (!sameTimezone_) {
// adjust timestamp value to same wall clock time if writer and reader
// time zones have different rules, which is required for Apache Orc.
- const auto& wv = writerTimezone->getVariant(writerTime);
- const auto& rv = readerTimezone->getVariant(writerTime);
+ const auto& wv = writerTimezone_->getVariant(writerTime);
+ const auto& rv = readerTimezone_->getVariant(writerTime);
if (!wv.hasSameTzRule(rv)) {
// If the timezone adjustment moves the millis across a DST boundary,
// we need to reevaluate the offsets.
int64_t adjustedTime = writerTime + wv.gmtOffset - rv.gmtOffset;
- const auto& adjustedReader = readerTimezone->getVariant(adjustedTime);
+ const auto& adjustedReader = readerTimezone_->getVariant(adjustedTime);
writerTime = writerTime + wv.gmtOffset - adjustedReader.gmtOffset;
}
}
@@ -357,8 +357,8 @@ namespace orc {
void TimestampColumnReader::seekToRowGroup(
std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
- secondsRle->seek(positions.at(columnId));
- nanoRle->seek(positions.at(columnId));
+ secondsRle_->seek(positions.at(columnId));
+ nanoRle_->seek(positions.at(columnId));
}
template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType>
@@ -374,39 +374,39 @@ namespace orc {
void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
private:
- std::unique_ptr<SeekableInputStream> inputStream;
- const uint64_t bytesPerValue = (columnKind == FLOAT) ? 4 : 8;
- const char* bufferPointer;
- const char* bufferEnd;
+ std::unique_ptr<SeekableInputStream> inputStream_;
+ const uint64_t bytesPerValue_ = (columnKind == FLOAT) ? 4 : 8;
+ const char* bufferPointer_;
+ const char* bufferEnd_;
unsigned char readByte() {
- if (bufferPointer == bufferEnd) {
+ if (bufferPointer_ == bufferEnd_) {
int length;
- if (!inputStream->Next(reinterpret_cast<const void**>(&bufferPointer), &length)) {
+ if (!inputStream_->Next(reinterpret_cast<const void**>(&bufferPointer_), &length)) {
throw ParseError("bad read in DoubleColumnReader::next()");
}
- bufferEnd = bufferPointer + length;
+ bufferEnd_ = bufferPointer_ + length;
}
- return static_cast<unsigned char>(*(bufferPointer++));
+ return static_cast<unsigned char>(*(bufferPointer_++));
}
template <typename FloatType>
FloatType readDouble() {
int64_t bits = 0;
- if (bufferEnd - bufferPointer >= 8) {
+ if (bufferEnd_ - bufferPointer_ >= 8) {
if (isLittleEndian) {
- memcpy(&bits, bufferPointer, sizeof(bits));
+ memcpy(&bits, bufferPointer_, sizeof(bits));
} else {
- bits = static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[0]));
- bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[1])) << 8;
- bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[2])) << 16;
- bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[3])) << 24;
- bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[4])) << 32;
- bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[5])) << 40;
- bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[6])) << 48;
- bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer[7])) << 56;
+ bits = static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[0]));
+ bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[1])) << 8;
+ bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[2])) << 16;
+ bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[3])) << 24;
+ bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[4])) << 32;
+ bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[5])) << 40;
+ bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[6])) << 48;
+ bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[7])) << 56;
}
- bufferPointer += 8;
+ bufferPointer_ += 8;
} else {
for (uint64_t i = 0; i < 8; i++) {
bits |= static_cast<int64_t>(readByte()) << (i * 8);
@@ -419,16 +419,16 @@ namespace orc {
template <typename FloatType>
FloatType readFloat() {
int32_t bits = 0;
- if (bufferEnd - bufferPointer >= 4) {
+ if (bufferEnd_ - bufferPointer_ >= 4) {
if (isLittleEndian) {
- bits = *(reinterpret_cast<const int32_t*>(bufferPointer));
+ bits = *(reinterpret_cast<const int32_t*>(bufferPointer_));
} else {
- bits = static_cast<unsigned char>(bufferPointer[0]);
- bits |= static_cast<unsigned char>(bufferPointer[1]) << 8;
- bits |= static_cast<unsigned char>(bufferPointer[2]) << 16;
- bits |= static_cast<unsigned char>(bufferPointer[3]) << 24;
+ bits = static_cast<unsigned char>(bufferPointer_[0]);
+ bits |= static_cast<unsigned char>(bufferPointer_[1]) << 8;
+ bits |= static_cast<unsigned char>(bufferPointer_[2]) << 16;
+ bits |= static_cast<unsigned char>(bufferPointer_[3]) << 24;
}
- bufferPointer += 4;
+ bufferPointer_ += 4;
} else {
for (uint64_t i = 0; i < 4; i++) {
bits |= readByte() << (i * 8);
@@ -445,9 +445,9 @@ namespace orc {
template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType>
DoubleColumnReader<columnKind, isLittleEndian, ValueType, BatchType>::DoubleColumnReader(
const Type& type, StripeStreams& stripe)
- : ColumnReader(type, stripe), bufferPointer(nullptr), bufferEnd(nullptr) {
- inputStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (inputStream == nullptr) throw ParseError("DATA stream not found in Double column");
+ : ColumnReader(type, stripe), bufferPointer_(nullptr), bufferEnd_(nullptr) {
+ inputStream_ = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+ if (inputStream_ == nullptr) throw ParseError("DATA stream not found in Double column");
}
template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType>
@@ -455,19 +455,19 @@ namespace orc {
uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
- if (static_cast<size_t>(bufferEnd - bufferPointer) >= bytesPerValue * numValues) {
- bufferPointer += bytesPerValue * numValues;
+ if (static_cast<size_t>(bufferEnd_ - bufferPointer_) >= bytesPerValue_ * numValues) {
+ bufferPointer_ += bytesPerValue_ * numValues;
} else {
size_t sizeToSkip =
- bytesPerValue * numValues - static_cast<size_t>(bufferEnd - bufferPointer);
+ bytesPerValue_ * numValues - static_cast<size_t>(bufferEnd_ - bufferPointer_);
const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max());
while (sizeToSkip != 0) {
size_t step = sizeToSkip > cap ? cap : sizeToSkip;
- inputStream->Skip(static_cast<int>(step));
+ inputStream_->Skip(static_cast<int>(step));
sizeToSkip -= step;
}
- bufferEnd = nullptr;
- bufferPointer = nullptr;
+ bufferEnd_ = nullptr;
+ bufferPointer_ = nullptr;
}
return numValues;
@@ -506,12 +506,12 @@ namespace orc {
// Only viable when the machine is little-endian.
uint64_t bufferNum = 0;
if (isLittleEndian) {
- bufferNum =
- std::min(numValues, static_cast<size_t>(bufferEnd - bufferPointer) / bytesPerValue);
- uint64_t bufferBytes = bufferNum * bytesPerValue;
+ bufferNum = std::min(numValues,
+ static_cast<size_t>(bufferEnd_ - bufferPointer_) / bytesPerValue_);
+ uint64_t bufferBytes = bufferNum * bytesPerValue_;
if (bufferBytes > 0) {
- memcpy(outArray, bufferPointer, bufferBytes);
- bufferPointer += bufferBytes;
+ memcpy(outArray, bufferPointer_, bufferBytes);
+ bufferPointer_ += bufferBytes;
}
}
for (size_t i = bufferNum; i < numValues; ++i) {
@@ -525,10 +525,10 @@ namespace orc {
void DoubleColumnReader<columnKind, isLittleEndian, ValueType, BatchType>::seekToRowGroup(
std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
- inputStream->seek(positions.at(columnId));
+ inputStream_->seek(positions.at(columnId));
// clear buffer state after seek
- bufferEnd = nullptr;
- bufferPointer = nullptr;
+ bufferEnd_ = nullptr;
+ bufferPointer_ = nullptr;
}
void readFully(char* buffer, int64_t bufferSize, SeekableInputStream* stream) {
@@ -549,8 +549,8 @@ namespace orc {
class StringDictionaryColumnReader : public ColumnReader {
private:
- std::shared_ptr<StringDictionary> dictionary;
- std::unique_ptr<RleDecoder> rle;
+ std::shared_ptr<StringDictionary> dictionary_;
+ std::unique_ptr<RleDecoder> rle_;
public:
StringDictionaryColumnReader(const Type& type, StripeStreams& stipe);
@@ -567,7 +567,7 @@ namespace orc {
StringDictionaryColumnReader::StringDictionaryColumnReader(const Type& type,
StripeStreams& stripe)
- : ColumnReader(type, stripe), dictionary(new StringDictionary(stripe.getMemoryPool())) {
+ : ColumnReader(type, stripe), dictionary_(new StringDictionary(stripe.getMemoryPool())) {
RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId).kind());
uint32_t dictSize = stripe.getEncoding(columnId).dictionary_size();
std::unique_ptr<SeekableInputStream> stream =
@@ -575,15 +575,15 @@ namespace orc {
if (stream == nullptr) {
throw ParseError("DATA stream not found in StringDictionaryColumn");
}
- rle = createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics);
+ rle_ = createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics);
stream = stripe.getStream(columnId, proto::Stream_Kind_LENGTH, false);
if (dictSize > 0 && stream == nullptr) {
throw ParseError("LENGTH stream not found in StringDictionaryColumn");
}
std::unique_ptr<RleDecoder> lengthDecoder =
createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics);
- dictionary->dictionaryOffset.resize(dictSize + 1);
- int64_t* lengthArray = dictionary->dictionaryOffset.data();
+ dictionary_->dictionaryOffset.resize(dictSize + 1);
+ int64_t* lengthArray = dictionary_->dictionaryOffset.data();
lengthDecoder->next(lengthArray + 1, dictSize, nullptr);
lengthArray[0] = 0;
for (uint32_t i = 1; i < dictSize + 1; ++i) {
@@ -593,13 +593,13 @@ namespace orc {
lengthArray[i] += lengthArray[i - 1];
}
int64_t blobSize = lengthArray[dictSize];
- dictionary->dictionaryBlob.resize(static_cast<uint64_t>(blobSize));
+ dictionary_->dictionaryBlob.resize(static_cast<uint64_t>(blobSize));
std::unique_ptr<SeekableInputStream> blobStream =
stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false);
if (blobSize > 0 && blobStream == nullptr) {
throw ParseError("DICTIONARY_DATA stream not found in StringDictionaryColumn");
}
- readFully(dictionary->dictionaryBlob.data(), blobSize, blobStream.get());
+ readFully(dictionary_->dictionaryBlob.data(), blobSize, blobStream.get());
}
StringDictionaryColumnReader::~StringDictionaryColumnReader() {
@@ -608,7 +608,7 @@ namespace orc {
uint64_t StringDictionaryColumnReader::skip(uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
- rle->skip(numValues);
+ rle_->skip(numValues);
return numValues;
}
@@ -618,12 +618,12 @@ namespace orc {
// update the notNull from the parent class
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch);
- char* blob = dictionary->dictionaryBlob.data();
- int64_t* dictionaryOffsets = dictionary->dictionaryOffset.data();
+ char* blob = dictionary_->dictionaryBlob.data();
+ int64_t* dictionaryOffsets = dictionary_->dictionaryOffset.data();
char** outputStarts = byteBatch.data.data();
int64_t* outputLengths = byteBatch.length.data();
- rle->next(outputLengths, numValues, notNull);
- uint64_t dictionaryCount = dictionary->dictionaryOffset.size() - 1;
+ rle_->next(outputLengths, numValues, notNull);
+ uint64_t dictionaryCount = dictionary_->dictionaryOffset.size() - 1;
if (notNull) {
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
@@ -654,24 +654,24 @@ namespace orc {
rowBatch.isEncoded = true;
EncodedStringVectorBatch& batch = dynamic_cast<EncodedStringVectorBatch&>(rowBatch);
- batch.dictionary = this->dictionary;
+ batch.dictionary = this->dictionary_;
// Length buffer is reused to save dictionary entry ids
- rle->next(batch.index.data(), numValues, notNull);
+ rle_->next(batch.index.data(), numValues, notNull);
}
void StringDictionaryColumnReader::seekToRowGroup(
std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
- rle->seek(positions.at(columnId));
+ rle_->seek(positions.at(columnId));
}
class StringDirectColumnReader : public ColumnReader {
private:
- std::unique_ptr<RleDecoder> lengthRle;
- std::unique_ptr<SeekableInputStream> blobStream;
- const char* lastBuffer;
- size_t lastBufferLength;
+ std::unique_ptr<RleDecoder> lengthRle_;
+ std::unique_ptr<SeekableInputStream> blobStream_;
+ const char* lastBuffer_;
+ size_t lastBufferLength_;
/**
* Compute the total length of the values.
@@ -699,11 +699,11 @@ namespace orc {
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
if (stream == nullptr) throw ParseError("LENGTH stream not found in StringDirectColumn");
- lengthRle = createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics);
- blobStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (blobStream == nullptr) throw ParseError("DATA stream not found in StringDirectColumn");
- lastBuffer = nullptr;
- lastBufferLength = 0;
+ lengthRle_ = createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics);
+ blobStream_ = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+ if (blobStream_ == nullptr) throw ParseError("DATA stream not found in StringDirectColumn");
+ lastBuffer_ = nullptr;
+ lastBufferLength_ = 0;
}
StringDirectColumnReader::~StringDirectColumnReader() {
@@ -719,25 +719,25 @@ namespace orc {
// read the lengths, so we know haw many bytes to skip
while (done < numValues) {
uint64_t step = std::min(BUFFER_SIZE, static_cast<size_t>(numValues - done));
- lengthRle->next(buffer, step, nullptr);
+ lengthRle_->next(buffer, step, nullptr);
totalBytes += computeSize(buffer, nullptr, step);
done += step;
}
- if (totalBytes <= lastBufferLength) {
+ if (totalBytes <= lastBufferLength_) {
// subtract the needed bytes from the ones left over
- lastBufferLength -= totalBytes;
- lastBuffer += totalBytes;
+ lastBufferLength_ -= totalBytes;
+ lastBuffer_ += totalBytes;
} else {
// move the stream forward after accounting for the buffered bytes
- totalBytes -= lastBufferLength;
+ totalBytes -= lastBufferLength_;
const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max());
while (totalBytes != 0) {
size_t step = totalBytes > cap ? cap : totalBytes;
- blobStream->Skip(static_cast<int>(step));
+ blobStream_->Skip(static_cast<int>(step));
totalBytes -= step;
}
- lastBufferLength = 0;
- lastBuffer = nullptr;
+ lastBufferLength_ = 0;
+ lastBuffer_ = nullptr;
}
return numValues;
}
@@ -769,7 +769,7 @@ namespace orc {
int64_t* lengthPtr = byteBatch.length.data();
// read the length vector
- lengthRle->next(lengthPtr, numValues, notNull);
+ lengthRle_->next(lengthPtr, numValues, notNull);
// figure out the total length of data we need from the blob stream
const size_t totalLength = computeSize(lengthPtr, notNull, numValues);
@@ -779,23 +779,23 @@ namespace orc {
size_t bytesBuffered = 0;
byteBatch.blob.resize(totalLength);
char* ptr = byteBatch.blob.data();
- while (bytesBuffered + lastBufferLength < totalLength) {
- memcpy(ptr + bytesBuffered, lastBuffer, lastBufferLength);
- bytesBuffered += lastBufferLength;
+ while (bytesBuffered + lastBufferLength_ < totalLength) {
+ memcpy(ptr + bytesBuffered, lastBuffer_, lastBufferLength_);
+ bytesBuffered += lastBufferLength_;
const void* readBuffer;
int readLength;
- if (!blobStream->Next(&readBuffer, &readLength)) {
+ if (!blobStream_->Next(&readBuffer, &readLength)) {
throw ParseError("failed to read in StringDirectColumnReader.next");
}
- lastBuffer = static_cast<const char*>(readBuffer);
- lastBufferLength = static_cast<size_t>(readLength);
+ lastBuffer_ = static_cast<const char*>(readBuffer);
+ lastBufferLength_ = static_cast<size_t>(readLength);
}
if (bytesBuffered < totalLength) {
size_t moreBytes = totalLength - bytesBuffered;
- memcpy(ptr + bytesBuffered, lastBuffer, moreBytes);
- lastBuffer += moreBytes;
- lastBufferLength -= moreBytes;
+ memcpy(ptr + bytesBuffered, lastBuffer_, moreBytes);
+ lastBuffer_ += moreBytes;
+ lastBufferLength_ -= moreBytes;
}
size_t filledSlots = 0;
@@ -820,16 +820,16 @@ namespace orc {
void StringDirectColumnReader::seekToRowGroup(
std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
- blobStream->seek(positions.at(columnId));
- lengthRle->seek(positions.at(columnId));
+ blobStream_->seek(positions.at(columnId));
+ lengthRle_->seek(positions.at(columnId));
// clear buffer state after seek
- lastBuffer = nullptr;
- lastBufferLength = 0;
+ lastBuffer_ = nullptr;
+ lastBufferLength_ = 0;
}
class StructColumnReader : public ColumnReader {
private:
- std::vector<std::unique_ptr<ColumnReader>> children;
+ std::vector<std::unique_ptr<ColumnReader>> children_;
public:
StructColumnReader(const Type& type, StripeStreams& stripe, bool useTightNumericVector = false,
@@ -859,7 +859,7 @@ namespace orc {
for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
const Type& child = *type.getSubtype(i);
if (selectedColumns[static_cast<uint64_t>(child.getColumnId())]) {
- children.push_back(
+ children_.push_back(
buildReader(child, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow));
}
}
@@ -874,7 +874,7 @@ namespace orc {
uint64_t StructColumnReader::skip(uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
- for (auto& ptr : children) {
+ for (auto& ptr : children_) {
ptr->skip(numValues);
}
return numValues;
@@ -895,7 +895,7 @@ namespace orc {
ColumnReader::next(rowBatch, numValues, notNull);
uint64_t i = 0;
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- for (auto iter = children.begin(); iter != children.end(); ++iter, ++i) {
+ for (auto iter = children_.begin(); iter != children_.end(); ++iter, ++i) {
if (encoded) {
(*iter)->nextEncoded(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), numValues,
notNull);
@@ -909,15 +909,15 @@ namespace orc {
std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
- for (auto& ptr : children) {
+ for (auto& ptr : children_) {
ptr->seekToRowGroup(positions);
}
}
class ListColumnReader : public ColumnReader {
private:
- std::unique_ptr<ColumnReader> child;
- std::unique_ptr<RleDecoder> rle;
+ std::unique_ptr<ColumnReader> child_;
+ std::unique_ptr<RleDecoder> rle_;
public:
ListColumnReader(const Type& type, StripeStreams& stipe, bool useTightNumericVector = false,
@@ -947,10 +947,11 @@ namespace orc {
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
if (stream == nullptr) throw ParseError("LENGTH stream not found in List column");
- rle = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics);
+ rle_ = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics);
const Type& childType = *type.getSubtype(0);
if (selectedColumns[static_cast<uint64_t>(childType.getColumnId())]) {
- child = buildReader(childType, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow);
+ child_ =
+ buildReader(childType, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow);
}
}
@@ -960,7 +961,7 @@ namespace orc {
uint64_t ListColumnReader::skip(uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
- ColumnReader* childReader = child.get();
+ ColumnReader* childReader = child_.get();
if (childReader) {
const uint64_t BUFFER_SIZE = 1024;
int64_t buffer[BUFFER_SIZE];
@@ -968,7 +969,7 @@ namespace orc {
uint64_t lengthsRead = 0;
while (lengthsRead < numValues) {
uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
- rle->next(buffer, chunk, nullptr);
+ rle_->next(buffer, chunk, nullptr);
for (size_t i = 0; i < chunk; ++i) {
childrenElements += static_cast<size_t>(buffer[i]);
}
@@ -976,7 +977,7 @@ namespace orc {
}
childReader->skip(childrenElements);
} else {
- rle->skip(numValues);
+ rle_->skip(numValues);
}
return numValues;
}
@@ -997,7 +998,7 @@ namespace orc {
ListVectorBatch& listBatch = dynamic_cast<ListVectorBatch&>(rowBatch);
int64_t* offsets = listBatch.offsets.data();
notNull = listBatch.hasNulls ? listBatch.notNull.data() : nullptr;
- rle->next(offsets, numValues, notNull);
+ rle_->next(offsets, numValues, notNull);
uint64_t totalChildren = 0;
if (notNull) {
for (size_t i = 0; i < numValues; ++i) {
@@ -1017,7 +1018,7 @@ namespace orc {
}
}
offsets[numValues] = static_cast<int64_t>(totalChildren);
- ColumnReader* childReader = child.get();
+ ColumnReader* childReader = child_.get();
if (childReader) {
if (encoded) {
childReader->nextEncoded(*(listBatch.elements.get()), totalChildren, nullptr);
@@ -1029,17 +1030,17 @@ namespace orc {
void ListColumnReader::seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
- rle->seek(positions.at(columnId));
- if (child.get()) {
- child->seekToRowGroup(positions);
+ rle_->seek(positions.at(columnId));
+ if (child_.get()) {
+ child_->seekToRowGroup(positions);
}
}
class MapColumnReader : public ColumnReader {
private:
- std::unique_ptr<ColumnReader> keyReader;
- std::unique_ptr<ColumnReader> elementReader;
- std::unique_ptr<RleDecoder> rle;
+ std::unique_ptr<ColumnReader> keyReader_;
+ std::unique_ptr<ColumnReader> elementReader_;
+ std::unique_ptr<RleDecoder> rle_;
public:
MapColumnReader(const Type& type, StripeStreams& stipe, bool useTightNumericVector = false,
@@ -1068,15 +1069,15 @@ namespace orc {
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
if (stream == nullptr) throw ParseError("LENGTH stream not found in Map column");
- rle = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics);
+ rle_ = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics);
const Type& keyType = *type.getSubtype(0);
if (selectedColumns[static_cast<uint64_t>(keyType.getColumnId())]) {
- keyReader =
+ keyReader_ =
buildReader(keyType, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow);
}
const Type& elementType = *type.getSubtype(1);
if (selectedColumns[static_cast<uint64_t>(elementType.getColumnId())]) {
- elementReader =
+ elementReader_ =
buildReader(elementType, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow);
}
}
@@ -1087,8 +1088,8 @@ namespace orc {
uint64_t MapColumnReader::skip(uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
- ColumnReader* rawKeyReader = keyReader.get();
- ColumnReader* rawElementReader = elementReader.get();
+ ColumnReader* rawKeyReader = keyReader_.get();
+ ColumnReader* rawElementReader = elementReader_.get();
if (rawKeyReader || rawElementReader) {
const uint64_t BUFFER_SIZE = 1024;
int64_t buffer[BUFFER_SIZE];
@@ -1096,7 +1097,7 @@ namespace orc {
uint64_t lengthsRead = 0;
while (lengthsRead < numValues) {
uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
- rle->next(buffer, chunk, nullptr);
+ rle_->next(buffer, chunk, nullptr);
for (size_t i = 0; i < chunk; ++i) {
childrenElements += static_cast<size_t>(buffer[i]);
}
@@ -1109,7 +1110,7 @@ namespace orc {
rawElementReader->skip(childrenElements);
}
} else {
- rle->skip(numValues);
+ rle_->skip(numValues);
}
return numValues;
}
@@ -1130,7 +1131,7 @@ namespace orc {
MapVectorBatch& mapBatch = dynamic_cast<MapVectorBatch&>(rowBatch);
int64_t* offsets = mapBatch.offsets.data();
notNull = mapBatch.hasNulls ? mapBatch.notNull.data() : nullptr;
- rle->next(offsets, numValues, notNull);
+ rle_->next(offsets, numValues, notNull);
uint64_t totalChildren = 0;
if (notNull) {
for (size_t i = 0; i < numValues; ++i) {
@@ -1150,7 +1151,7 @@ namespace orc {
}
}
offsets[numValues] = static_cast<int64_t>(totalChildren);
- ColumnReader* rawKeyReader = keyReader.get();
+ ColumnReader* rawKeyReader = keyReader_.get();
if (rawKeyReader) {
if (encoded) {
rawKeyReader->nextEncoded(*(mapBatch.keys.get()), totalChildren, nullptr);
@@ -1158,7 +1159,7 @@ namespace orc {
rawKeyReader->next(*(mapBatch.keys.get()), totalChildren, nullptr);
}
}
- ColumnReader* rawElementReader = elementReader.get();
+ ColumnReader* rawElementReader = elementReader_.get();
if (rawElementReader) {
if (encoded) {
rawElementReader->nextEncoded(*(mapBatch.elements.get()), totalChildren, nullptr);
@@ -1170,21 +1171,21 @@ namespace orc {
void MapColumnReader::seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
- rle->seek(positions.at(columnId));
- if (keyReader.get()) {
- keyReader->seekToRowGroup(positions);
+ rle_->seek(positions.at(columnId));
+ if (keyReader_.get()) {
+ keyReader_->seekToRowGroup(positions);
}
- if (elementReader.get()) {
- elementReader->seekToRowGroup(positions);
+ if (elementReader_.get()) {
+ elementReader_->seekToRowGroup(positions);
}
}
class UnionColumnReader : public ColumnReader {
private:
- std::unique_ptr<ByteRleDecoder> rle;
- std::vector<std::unique_ptr<ColumnReader>> childrenReader;
- std::vector<int64_t> childrenCounts;
- uint64_t numChildren;
+ std::unique_ptr<ByteRleDecoder> rle_;
+ std::vector<std::unique_ptr<ColumnReader>> childrenReader_;
+ std::vector<int64_t> childrenCounts_;
+ uint64_t numChildren_;
public:
UnionColumnReader(const Type& type, StripeStreams& stipe, bool useTightNumericVector = false,
@@ -1207,20 +1208,20 @@ namespace orc {
bool useTightNumericVector,
bool throwOnSchemaEvolutionOverflow)
: ColumnReader(type, stripe) {
- numChildren = type.getSubtypeCount();
- childrenReader.resize(numChildren);
- childrenCounts.resize(numChildren);
+ numChildren_ = type.getSubtypeCount();
+ childrenReader_.resize(numChildren_);
+ childrenCounts_.resize(numChildren_);
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
if (stream == nullptr) throw ParseError("LENGTH stream not found in Union column");
- rle = createByteRleDecoder(std::move(stream), metrics);
+ rle_ = createByteRleDecoder(std::move(stream), metrics);
// figure out which types are selected
const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
- for (unsigned int i = 0; i < numChildren; ++i) {
+ for (unsigned int i = 0; i < numChildren_; ++i) {
const Type& child = *type.getSubtype(i);
if (selectedColumns[static_cast<size_t>(child.getColumnId())]) {
- childrenReader[i] =
+ childrenReader_[i] =
buildReader(child, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow);
}
}
@@ -1231,19 +1232,19 @@ namespace orc {
const uint64_t BUFFER_SIZE = 1024;
char buffer[BUFFER_SIZE];
uint64_t lengthsRead = 0;
- int64_t* counts = childrenCounts.data();
- memset(counts, 0, sizeof(int64_t) * numChildren);
+ int64_t* counts = childrenCounts_.data();
+ memset(counts, 0, sizeof(int64_t) * numChildren_);
while (lengthsRead < numValues) {
uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
- rle->next(buffer, chunk, nullptr);
+ rle_->next(buffer, chunk, nullptr);
for (size_t i = 0; i < chunk; ++i) {
counts[static_cast<size_t>(buffer[i])] += 1;
}
lengthsRead += chunk;
}
- for (size_t i = 0; i < numChildren; ++i) {
- if (counts[i] != 0 && childrenReader[i] != nullptr) {
- childrenReader[i]->skip(static_cast<uint64_t>(counts[i]));
+ for (size_t i = 0; i < numChildren_; ++i) {
+ if (counts[i] != 0 && childrenReader_[i] != nullptr) {
+ childrenReader_[i]->skip(static_cast<uint64_t>(counts[i]));
}
}
return numValues;
@@ -1264,11 +1265,11 @@ namespace orc {
ColumnReader::next(rowBatch, numValues, notNull);
UnionVectorBatch& unionBatch = dynamic_cast<UnionVectorBatch&>(rowBatch);
uint64_t* offsets = unionBatch.offsets.data();
- int64_t* counts = childrenCounts.data();
- memset(counts, 0, sizeof(int64_t) * numChildren);
+ int64_t* counts = childrenCounts_.data();
+ memset(counts, 0, sizeof(int64_t) * numChildren_);
unsigned char* tags = unionBatch.tags.data();
notNull = unionBatch.hasNulls ? unionBatch.notNull.data() : nullptr;
- rle->next(reinterpret_cast<char*>(tags), numValues, notNull);
+ rle_->next(reinterpret_cast<char*>(tags), numValues, notNull);
// set the offsets for each row
if (notNull) {
for (size_t i = 0; i < numValues; ++i) {
@@ -1282,14 +1283,14 @@ namespace orc {
}
}
// read the right number of each child column
- for (size_t i = 0; i < numChildren; ++i) {
- if (childrenReader[i] != nullptr) {
+ for (size_t i = 0; i < numChildren_; ++i) {
+ if (childrenReader_[i] != nullptr) {
if (encoded) {
- childrenReader[i]->nextEncoded(*(unionBatch.children[i]),
- static_cast<uint64_t>(counts[i]), nullptr);
+ childrenReader_[i]->nextEncoded(*(unionBatch.children[i]),
+ static_cast<uint64_t>(counts[i]), nullptr);
} else {
- childrenReader[i]->next(*(unionBatch.children[i]), static_cast<uint64_t>(counts[i]),
- nullptr);
+ childrenReader_[i]->next(*(unionBatch.children[i]), static_cast<uint64_t>(counts[i]),
+ nullptr);
}
}
}
@@ -1298,10 +1299,10 @@ namespace orc {
void UnionColumnReader::seekToRowGroup(
std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
- rle->seek(positions.at(columnId));
- for (size_t i = 0; i < numChildren; ++i) {
- if (childrenReader[i] != nullptr) {
- childrenReader[i]->seekToRowGroup(positions);
+ rle_->seek(positions.at(columnId));
+ for (size_t i = 0; i < numChildren_; ++i) {
+ if (childrenReader_[i] != nullptr) {
+ childrenReader_[i]->seekToRowGroup(positions);
}
}
}
@@ -1599,8 +1600,8 @@ namespace orc {
class DecimalHive11ColumnReader : public Decimal64ColumnReader {
private:
- bool throwOnOverflow;
- std::ostream* errorStream;
+ bool throwOnOverflow_;
+ std::ostream* errorStream_;
/**
* Read an Int128 from the stream and correct it to the desired scale.
@@ -1649,8 +1650,8 @@ namespace orc {
DecimalHive11ColumnReader::DecimalHive11ColumnReader(const Type& type, StripeStreams& stripe)
: Decimal64ColumnReader(type, stripe) {
scale = stripe.getForcedScaleOnHive11Decimal();
- throwOnOverflow = stripe.getThrowOnHive11DecimalOverflow();
- errorStream = stripe.getErrorStream();
+ throwOnOverflow_ = stripe.getThrowOnHive11DecimalOverflow();
+ errorStream_ = stripe.getErrorStream();
}
DecimalHive11ColumnReader::~DecimalHive11ColumnReader() {
@@ -1674,12 +1675,12 @@ namespace orc {
for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
if (!readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]))) {
- if (throwOnOverflow) {
+ if (throwOnOverflow_) {
throw ParseError("Hive 0.11 decimal was more than 38 digits.");
} else {
- *errorStream << "Warning: "
- << "Hive 0.11 decimal with more than 38 digits "
- << "replaced by NULL.\n";
+ *errorStream_ << "Warning: "
+ << "Hive 0.11 decimal with more than 38 digits "
+ << "replaced by NULL.\n";
notNull[i] = false;
}
}
@@ -1688,12 +1689,12 @@ namespace orc {
} else {
for (size_t i = 0; i < numValues; ++i) {
if (!readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]))) {
- if (throwOnOverflow) {
+ if (throwOnOverflow_) {
throw ParseError("Hive 0.11 decimal was more than 38 digits.");
} else {
- *errorStream << "Warning: "
- << "Hive 0.11 decimal with more than 38 digits "
- << "replaced by NULL.\n";
+ *errorStream_ << "Warning: "
+ << "Hive 0.11 decimal with more than 38 digits "
+ << "replaced by NULL.\n";
batch.hasNulls = true;
batch.notNull[i] = false;
}
diff --git a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc
index f24be1f0b2..d31b1c65d4 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc
+++ b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc
@@ -24,6 +24,7 @@
#include "RLE.hh"
#include "Statistics.hh"
#include "Timezone.hh"
+#include "Utils.hh"
namespace orc {
StreamsFactory::~StreamsFactory() {
@@ -33,24 +34,25 @@ namespace orc {
class StreamsFactoryImpl : public StreamsFactory {
public:
StreamsFactoryImpl(const WriterOptions& writerOptions, OutputStream* outputStream)
- : options(writerOptions), outStream(outputStream) {}
+ : options_(writerOptions), outStream_(outputStream) {}
virtual std::unique_ptr<BufferedOutputStream> createStream(
proto::Stream_Kind kind) const override;
private:
- const WriterOptions& options;
- OutputStream* outStream;
+ const WriterOptions& options_;
+ OutputStream* outStream_;
};
std::unique_ptr<BufferedOutputStream> StreamsFactoryImpl::createStream(proto::Stream_Kind) const {
// In the future, we can decide compression strategy and modifier
// based on stream kind. But for now we just use the setting from
// WriterOptions
- return createCompressor(options.getCompression(), outStream, options.getCompressionStrategy(),
- // BufferedOutputStream initial capacity
- options.getOutputBufferCapacity(), options.getCompressionBlockSize(),
- *options.getMemoryPool(), options.getWriterMetrics());
+ return createCompressor(
+ options_.getCompression(), outStream_, options_.getCompressionStrategy(),
+ // BufferedOutputStream initial capacity
+ options_.getOutputBufferCapacity(), options_.getCompressionBlockSize(),
+ options_.getMemoryBlockSize(), *options_.getMemoryPool(), options_.getWriterMetrics());
}
std::unique_ptr<StreamsFactory> createStreamsFactory(const WriterOptions& options,
@@ -252,6 +254,10 @@ namespace orc {
// PASS
}
+ void ColumnWriter::finishStreams() {
+ notNullEncoder->finishEncode();
+ }
+
class StructColumnWriter : public ColumnWriter {
public:
StructColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -281,8 +287,10 @@ namespace orc {
virtual void reset() override;
+ virtual void finishStreams() override;
+
private:
- std::vector<std::unique_ptr<ColumnWriter>> children;
+ std::vector<std::unique_ptr<ColumnWriter>> children_;
};
StructColumnWriter::StructColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -290,7 +298,7 @@ namespace orc {
: ColumnWriter(type, factory, options) {
for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
const Type& child = *type.getSubtype(i);
- children.push_back(buildWriter(child, factory, options));
+ children_.push_back(buildWriter(child, factory, options));
}
if (enableIndex) {
@@ -307,8 +315,8 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
const char* notNull = structBatch->hasNulls ? structBatch->notNull.data() + offset : nullptr;
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->add(*structBatch->fields[i], offset, numValues, notNull);
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->add(*structBatch->fields[i], offset, numValues, notNull);
}
// update stats
@@ -330,22 +338,22 @@ namespace orc {
void StructColumnWriter::flush(std::vector<proto::Stream>& streams) {
ColumnWriter::flush(streams);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->flush(streams);
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->flush(streams);
}
}
void StructColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const {
ColumnWriter::writeIndex(streams);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->writeIndex(streams);
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->writeIndex(streams);
}
}
uint64_t StructColumnWriter::getEstimatedSize() const {
uint64_t size = ColumnWriter::getEstimatedSize();
- for (uint32_t i = 0; i < children.size(); ++i) {
- size += children[i]->getEstimatedSize();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ size += children_[i]->getEstimatedSize();
}
return size;
}
@@ -355,62 +363,69 @@ namespace orc {
encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
encoding.set_dictionary_size(0);
encodings.push_back(encoding);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->getColumnEncoding(encodings);
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->getColumnEncoding(encodings);
}
}
void StructColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getStripeStatistics(stats);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->getStripeStatistics(stats);
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->getStripeStatistics(stats);
}
}
void StructColumnWriter::mergeStripeStatsIntoFileStats() {
ColumnWriter::mergeStripeStatsIntoFileStats();
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->mergeStripeStatsIntoFileStats();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->mergeStripeStatsIntoFileStats();
}
}
void StructColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getFileStatistics(stats);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->getFileStatistics(stats);
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->getFileStatistics(stats);
}
}
void StructColumnWriter::mergeRowGroupStatsIntoStripeStats() {
ColumnWriter::mergeRowGroupStatsIntoStripeStats();
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->mergeRowGroupStatsIntoStripeStats();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->mergeRowGroupStatsIntoStripeStats();
}
}
void StructColumnWriter::createRowIndexEntry() {
ColumnWriter::createRowIndexEntry();
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->createRowIndexEntry();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->createRowIndexEntry();
}
}
void StructColumnWriter::reset() {
ColumnWriter::reset();
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->reset();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->reset();
}
}
void StructColumnWriter::writeDictionary() {
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->writeDictionary();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->writeDictionary();
+ }
+ }
+
+ void StructColumnWriter::finishStreams() {
+ ColumnWriter::finishStreams();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->finishStreams();
}
}
@@ -431,21 +446,23 @@ namespace orc {
virtual void recordPosition() const override;
+ virtual void finishStreams() override;
+
protected:
std::unique_ptr<RleEncoder> rleEncoder;
private:
- RleVersion rleVersion;
+ RleVersion rleVersion_;
};
template <typename BatchType>
IntegerColumnWriter<BatchType>::IntegerColumnWriter(const Type& type,
const StreamsFactory& factory,
const WriterOptions& options)
- : ColumnWriter(type, factory, options), rleVersion(options.getRleVersion()) {
+ : ColumnWriter(type, factory, options), rleVersion_(options.getRleVersion()) {
std::unique_ptr<BufferedOutputStream> dataStream =
factory.createStream(proto::Stream_Kind_DATA);
- rleEncoder = createRleEncoder(std::move(dataStream), true, rleVersion, memPool,
+ rleEncoder = createRleEncoder(std::move(dataStream), true, rleVersion_, memPool,
options.getAlignedBitpacking());
if (enableIndex) {
@@ -512,7 +529,7 @@ namespace orc {
void IntegerColumnWriter<BatchType>::getColumnEncoding(
std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
- encoding.set_kind(RleVersionMapper(rleVersion));
+ encoding.set_kind(RleVersionMapper(rleVersion_));
encoding.set_dictionary_size(0);
if (enableBloomFilter) {
encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
@@ -527,6 +544,12 @@ namespace orc {
}
template <typename BatchType>
+ void IntegerColumnWriter<BatchType>::finishStreams() {
+ ColumnWriter::finishStreams();
+ rleEncoder->finishEncode();
+ }
+
+ template <typename BatchType>
class ByteColumnWriter : public ColumnWriter {
public:
ByteColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
@@ -542,8 +565,10 @@ namespace orc {
virtual void recordPosition() const override;
+ virtual void finishStreams() override;
+
private:
- std::unique_ptr<ByteRleEncoder> byteRleEncoder;
+ std::unique_ptr<ByteRleEncoder> byteRleEncoder_;
};
template <typename BatchType>
@@ -552,7 +577,7 @@ namespace orc {
: ColumnWriter(type, factory, options) {
std::unique_ptr<BufferedOutputStream> dataStream =
factory.createStream(proto::Stream_Kind_DATA);
- byteRleEncoder = createByteRleEncoder(std::move(dataStream));
+ byteRleEncoder_ = createByteRleEncoder(std::move(dataStream));
if (enableIndex) {
recordPosition();
@@ -581,7 +606,7 @@ namespace orc {
for (uint64_t i = 0; i < numValues; ++i) {
byteData[i] = static_cast<char>(data[i]);
}
- byteRleEncoder->add(byteData, numValues, notNull);
+ byteRleEncoder_->add(byteData, numValues, notNull);
uint64_t count = 0;
for (uint64_t i = 0; i < numValues; ++i) {
@@ -590,7 +615,7 @@ namespace orc {
if (enableBloomFilter) {
bloomFilter->addLong(data[i]);
}
- intStats->update(static_cast<int64_t>(byteData[i]), 1);
+ intStats->update(static_cast<int64_t>(static_cast<signed char>(byteData[i])), 1);
}
}
intStats->increase(count);
@@ -606,14 +631,14 @@ namespace orc {
proto::Stream stream;
stream.set_kind(proto::Stream_Kind_DATA);
stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(byteRleEncoder->flush());
+ stream.set_length(byteRleEncoder_->flush());
streams.push_back(stream);
}
template <typename BatchType>
uint64_t ByteColumnWriter<BatchType>::getEstimatedSize() const {
uint64_t size = ColumnWriter::getEstimatedSize();
- size += byteRleEncoder->getBufferSize();
+ size += byteRleEncoder_->getBufferSize();
return size;
}
@@ -632,7 +657,13 @@ namespace orc {
template <typename BatchType>
void ByteColumnWriter<BatchType>::recordPosition() const {
ColumnWriter::recordPosition();
- byteRleEncoder->recordPosition(rowIndexPosition.get());
+ byteRleEncoder_->recordPosition(rowIndexPosition.get());
+ }
+
+ template <typename BatchType>
+ void ByteColumnWriter<BatchType>::finishStreams() {
+ ColumnWriter::finishStreams();
+ byteRleEncoder_->finishEncode();
}
template <typename BatchType>
@@ -652,8 +683,10 @@ namespace orc {
virtual void recordPosition() const override;
+ virtual void finishStreams() override;
+
private:
- std::unique_ptr<ByteRleEncoder> rleEncoder;
+ std::unique_ptr<ByteRleEncoder> rleEncoder_;
};
template <typename BatchType>
@@ -663,7 +696,7 @@ namespace orc {
: ColumnWriter(type, factory, options) {
std::unique_ptr<BufferedOutputStream> dataStream =
factory.createStream(proto::Stream_Kind_DATA);
- rleEncoder = createBooleanRleEncoder(std::move(dataStream));
+ rleEncoder_ = createBooleanRleEncoder(std::move(dataStream));
if (enableIndex) {
recordPosition();
@@ -694,7 +727,7 @@ namespace orc {
for (uint64_t i = 0; i < numValues; ++i) {
byteData[i] = static_cast<char>(data[i]);
}
- rleEncoder->add(byteData, numValues, notNull);
+ rleEncoder_->add(byteData, numValues, notNull);
uint64_t count = 0;
for (uint64_t i = 0; i < numValues; ++i) {
@@ -719,14 +752,14 @@ namespace orc {
proto::Stream stream;
stream.set_kind(proto::Stream_Kind_DATA);
stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(rleEncoder->flush());
+ stream.set_length(rleEncoder_->flush());
streams.push_back(stream);
}
template <typename BatchType>
uint64_t BooleanColumnWriter<BatchType>::getEstimatedSize() const {
uint64_t size = ColumnWriter::getEstimatedSize();
- size += rleEncoder->getBufferSize();
+ size += rleEncoder_->getBufferSize();
return size;
}
@@ -745,7 +778,13 @@ namespace orc {
template <typename BatchType>
void BooleanColumnWriter<BatchType>::recordPosition() const {
ColumnWriter::recordPosition();
- rleEncoder->recordPosition(rowIndexPosition.get());
+ rleEncoder_->recordPosition(rowIndexPosition.get());
+ }
+
+ template <typename BatchType>
+ void BooleanColumnWriter<BatchType>::finishStreams() {
+ ColumnWriter::finishStreams();
+ rleEncoder_->finishEncode();
}
template <typename ValueType, typename BatchType>
@@ -765,10 +804,12 @@ namespace orc {
virtual void recordPosition() const override;
+ virtual void finishStreams() override;
+
private:
- bool isFloat;
- std::unique_ptr<AppendOnlyBufferedStream> dataStream;
- DataBuffer<char> buffer;
+ bool isFloat_;
+ std::unique_ptr<AppendOnlyBufferedStream> dataStream_;
+ DataBuffer<char> buffer_;
};
template <typename ValueType, typename BatchType>
@@ -777,10 +818,10 @@ namespace orc {
const WriterOptions& options,
bool isFloatType)
: ColumnWriter(type, factory, options),
- isFloat(isFloatType),
- buffer(*options.getMemoryPool()) {
- dataStream.reset(new AppendOnlyBufferedStream(factory.createStream(proto::Stream_Kind_DATA)));
- buffer.resize(isFloat ? 4 : 8);
+ isFloat_(isFloatType),
+ buffer_(*options.getMemoryPool()) {
+ dataStream_.reset(new AppendOnlyBufferedStream(factory.createStream(proto::Stream_Kind_DATA)));
+ buffer_.resize(isFloat_ ? 4 : 8);
if (enableIndex) {
recordPosition();
@@ -816,17 +857,17 @@ namespace orc {
const ValueType* doubleData = dblBatch->data.data() + offset;
const char* notNull = dblBatch->hasNulls ? dblBatch->notNull.data() + offset : nullptr;
- size_t bytes = isFloat ? 4 : 8;
- char* data = buffer.data();
+ size_t bytes = isFloat_ ? 4 : 8;
+ char* data = buffer_.data();
uint64_t count = 0;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
- if (isFloat) {
+ if (isFloat_) {
encodeFloatNum<float, int32_t>(static_cast<float>(doubleData[i]), data);
} else {
encodeFloatNum<double, int64_t>(static_cast<double>(doubleData[i]), data);
}
- dataStream->write(data, bytes);
+ dataStream_->write(data, bytes);
++count;
if (enableBloomFilter) {
bloomFilter->addDouble(static_cast<double>(doubleData[i]));
@@ -847,14 +888,14 @@ namespace orc {
proto::Stream stream;
stream.set_kind(proto::Stream_Kind_DATA);
stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(dataStream->flush());
+ stream.set_length(dataStream_->flush());
streams.push_back(stream);
}
template <typename ValueType, typename BatchType>
uint64_t FloatingColumnWriter<ValueType, BatchType>::getEstimatedSize() const {
uint64_t size = ColumnWriter::getEstimatedSize();
- size += dataStream->getSize();
+ size += dataStream_->getSize();
return size;
}
@@ -873,7 +914,13 @@ namespace orc {
template <typename ValueType, typename BatchType>
void FloatingColumnWriter<ValueType, BatchType>::recordPosition() const {
ColumnWriter::recordPosition();
- dataStream->recordPosition(rowIndexPosition.get());
+ dataStream_->recordPosition(rowIndexPosition.get());
+ }
+
+ template <typename ValueType, typename BatchType>
+ void FloatingColumnWriter<ValueType, BatchType>::finishStreams() {
+ ColumnWriter::finishStreams();
+ dataStream_->finishStream();
}
/**
@@ -887,10 +934,17 @@ namespace orc {
size_t length;
};
- SortedStringDictionary() : totalLength(0) {}
+ struct DictEntryWithIndex {
+ DictEntryWithIndex(const char* str, size_t len, size_t index)
+ : entry(str, len), index(index) {}
+ DictEntry entry;
+ size_t index;
+ };
+
+ SortedStringDictionary() : totalLength_(0) {}
// insert a new string into dictionary, return its insertion order
- size_t insert(const char* data, size_t len);
+ size_t insert(const char* str, size_t len);
// write dictionary data & length to output buffer
void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const;
@@ -911,7 +965,9 @@ namespace orc {
private:
struct LessThan {
- bool operator()(const DictEntry& left, const DictEntry& right) const {
+ bool operator()(const DictEntryWithIndex& l, const DictEntryWithIndex& r) {
+ const auto& left = l.entry;
+ const auto& right = r.entry;
int ret = memcmp(left.data, right.data, std::min(left.length, right.length));
if (ret != 0) {
return ret < 0;
@@ -920,29 +976,25 @@ namespace orc {
}
};
- std::map<DictEntry, size_t, LessThan> dict;
- std::vector<std::vector<char>> data;
- uint64_t totalLength;
+ mutable std::vector<DictEntryWithIndex> flatDict_;
+ std::unordered_map<std::string, size_t> keyToIndex_;
+ uint64_t totalLength_;
// use friend class here to avoid being bothered by const function calls
friend class StringColumnWriter;
friend class CharColumnWriter;
friend class VarCharColumnWriter;
// store indexes of insertion order in the dictionary for not-null rows
- std::vector<int64_t> idxInDictBuffer;
+ std::vector<int64_t> idxInDictBuffer_;
};
// insert a new string into dictionary, return its insertion order
size_t SortedStringDictionary::insert(const char* str, size_t len) {
- auto ret = dict.insert({DictEntry(str, len), dict.size()});
+ size_t index = flatDict_.size();
+ auto ret = keyToIndex_.emplace(std::string(str, len), index);
if (ret.second) {
- // make a copy to internal storage
- data.push_back(std::vector<char>(len));
- memcpy(data.back().data(), str, len);
- // update dictionary entry to link pointer to internal storage
- DictEntry* entry = const_cast<DictEntry*>(&(ret.first->first));
- entry->data = data.back().data();
- totalLength += len;
+ flatDict_.emplace_back(ret.first->first.data(), ret.first->first.size(), index);
+ totalLength_ += len;
}
return ret.first->second;
}
@@ -950,9 +1002,12 @@ namespace orc {
// write dictionary data & length to output buffer
void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
RleEncoder* lengthEncoder) const {
- for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
- dataStream->write(it->first.data, it->first.length);
- lengthEncoder->write(static_cast<int64_t>(it->first.length));
+ std::sort(flatDict_.begin(), flatDict_.end(), LessThan());
+
+ for (const auto& entryWithIndex : flatDict_) {
+ const auto& entry = entryWithIndex.entry;
+ dataStream->write(entry.data, entry.length);
+ lengthEncoder->write(static_cast<int64_t>(entry.length));
}
}
@@ -968,10 +1023,9 @@ namespace orc {
*/
void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const {
// iterate the dictionary to get mapping from insertion order to value order
- std::vector<size_t> mapping(dict.size());
- size_t dictIdx = 0;
- for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
- mapping[it->second] = dictIdx++;
+ std::vector<size_t> mapping(flatDict_.size());
+ for (size_t i = 0; i < flatDict_.size(); ++i) {
+ mapping[flatDict_[i].index] = i;
}
// do the transformation
@@ -983,26 +1037,31 @@ namespace orc {
// get dict entries in insertion order
void SortedStringDictionary::getEntriesInInsertionOrder(
std::vector<const DictEntry*>& entries) const {
- entries.resize(dict.size());
- for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
- entries[it->second] = &(it->first);
+ std::sort(flatDict_.begin(), flatDict_.end(),
+ [](const DictEntryWithIndex& left, const DictEntryWithIndex& right) {
+ return left.index < right.index;
+ });
+
+ entries.resize(flatDict_.size());
+ for (size_t i = 0; i < flatDict_.size(); ++i) {
+ entries[i] = &(flatDict_[i].entry);
}
}
// return count of entries
size_t SortedStringDictionary::size() const {
- return dict.size();
+ return flatDict_.size();
}
// return total length of strings in the dictionary
uint64_t SortedStringDictionary::length() const {
- return totalLength;
+ return totalLength_;
}
void SortedStringDictionary::clear() {
- totalLength = 0;
- data.clear();
- dict.clear();
+ totalLength_ = 0;
+ keyToIndex_.clear();
+ flatDict_.clear();
}
class StringColumnWriter : public ColumnWriter {
@@ -1027,6 +1086,8 @@ namespace orc {
virtual void reset() override;
+ virtual void finishStreams() override;
+
private:
/**
* dictionary related functions
@@ -1123,7 +1184,7 @@ namespace orc {
const size_t len = static_cast<size_t>(length[i]);
if (useDictionary) {
size_t index = dictionary.insert(data[i], len);
- dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index));
+ dictionary.idxInDictBuffer_.push_back(static_cast<int64_t>(index));
} else {
directDataStream->write(data[i], len);
}
@@ -1184,7 +1245,7 @@ namespace orc {
} else {
size += dictionary.length();
size += dictionary.size() * sizeof(int32_t);
- size += dictionary.idxInDictBuffer.size() * sizeof(int32_t);
+ size += dictionary.idxInDictBuffer_.size() * sizeof(int32_t);
if (useCompression) {
size /= 3; // estimated ratio is 3:1
}
@@ -1215,15 +1276,23 @@ namespace orc {
directLengthEncoder->recordPosition(rowIndexPosition.get());
} else {
if (enableIndex) {
- startOfRowGroups.push_back(dictionary.idxInDictBuffer.size());
+ startOfRowGroups.push_back(dictionary.idxInDictBuffer_.size());
}
}
}
+ void StringColumnWriter::finishStreams() {
+ ColumnWriter::finishStreams();
+ if (!useDictionary) {
+ directDataStream->finishStream();
+ directLengthEncoder->finishEncode();
+ }
+ }
+
bool StringColumnWriter::checkDictionaryKeyRatio() {
if (!doneDictionaryCheck) {
useDictionary = dictionary.size() <=
- static_cast<size_t>(static_cast<double>(dictionary.idxInDictBuffer.size()) *
+ static_cast<size_t>(static_cast<double>(dictionary.idxInDictBuffer_.size()) *
dictSizeThreshold);
doneDictionaryCheck = true;
}
@@ -1244,7 +1313,7 @@ namespace orc {
ColumnWriter::reset();
dictionary.clear();
- dictionary.idxInDictBuffer.resize(0);
+ dictionary.idxInDictBuffer_.resize(0);
startOfRowGroups.clear();
startOfRowGroups.push_back(0);
}
@@ -1277,7 +1346,7 @@ namespace orc {
dictStream.reset(nullptr);
dictionary.clear();
- dictionary.idxInDictBuffer.clear();
+ dictionary.idxInDictBuffer_.clear();
startOfRowGroups.clear();
}
@@ -1295,10 +1364,10 @@ namespace orc {
dictionary.flush(dictStream.get(), dictLengthEncoder.get());
// convert index from insertion order to dictionary order
- dictionary.reorder(dictionary.idxInDictBuffer);
+ dictionary.reorder(dictionary.idxInDictBuffer_);
// write data sequences
- int64_t* data = dictionary.idxInDictBuffer.data();
+ int64_t* data = dictionary.idxInDictBuffer_.data();
if (enableIndex) {
size_t prevOffset = 0;
for (size_t i = 0; i < startOfRowGroups.size(); ++i) {
@@ -1319,10 +1388,10 @@ namespace orc {
prevOffset = offset;
}
- dictDataEncoder->add(data + prevOffset, dictionary.idxInDictBuffer.size() - prevOffset,
+ dictDataEncoder->add(data + prevOffset, dictionary.idxInDictBuffer_.size() - prevOffset,
nullptr);
} else {
- dictDataEncoder->add(data, dictionary.idxInDictBuffer.size(), nullptr);
+ dictDataEncoder->add(data, dictionary.idxInDictBuffer_.size(), nullptr);
}
}
}
@@ -1345,9 +1414,9 @@ namespace orc {
// store each length of the data into a vector
const SortedStringDictionary::DictEntry* dictEntry = nullptr;
- for (uint64_t i = 0; i != dictionary.idxInDictBuffer.size(); ++i) {
+ for (uint64_t i = 0; i != dictionary.idxInDictBuffer_.size(); ++i) {
// write one row data in direct encoding
- dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer[i])];
+ dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer_[i])];
directDataStream->write(dictEntry->data, dictEntry->length);
directLengthEncoder->write(static_cast<int64_t>(dictEntry->length));
}
@@ -1355,91 +1424,22 @@ namespace orc {
deleteDictStreams();
}
- struct Utf8Utils {
- /**
- * Counts how many utf-8 chars of the input data
- */
- static uint64_t charLength(const char* data, uint64_t length) {
- uint64_t chars = 0;
- for (uint64_t i = 0; i < length; i++) {
- if (isUtfStartByte(data[i])) {
- chars++;
- }
- }
- return chars;
- }
-
- /**
- * Return the number of bytes required to read at most maxCharLength
- * characters in full from a utf-8 encoded byte array provided
- * by data. This does not validate utf-8 data, but
- * operates correctly on already valid utf-8 data.
- *
- * @param maxCharLength number of characters required
- * @param data the bytes of UTF-8
- * @param length the length of data to truncate
- */
- static uint64_t truncateBytesTo(uint64_t maxCharLength, const char* data, uint64_t length) {
- uint64_t chars = 0;
- if (length <= maxCharLength) {
- return length;
- }
- for (uint64_t i = 0; i < length; i++) {
- if (isUtfStartByte(data[i])) {
- chars++;
- }
- if (chars > maxCharLength) {
- return i;
- }
- }
- // everything fits
- return length;
- }
-
- /**
- * Checks if b is the first byte of a UTF-8 character.
- */
- inline static bool isUtfStartByte(char b) {
- return (b & 0xC0) != 0x80;
- }
-
- /**
- * Find the start of the last character that ends in the current string.
- * @param text the bytes of the utf-8
- * @param from the first byte location
- * @param until the last byte location
- * @return the index of the last character
- */
- static uint64_t findLastCharacter(const char* text, uint64_t from, uint64_t until) {
- uint64_t posn = until;
- /* we don't expect characters more than 5 bytes */
- while (posn >= from) {
- if (isUtfStartByte(text[posn])) {
- return posn;
- }
- posn -= 1;
- }
- /* beginning of a valid char not found */
- throw std::logic_error("Could not truncate string, beginning of a valid char not found");
- }
- };
-
class CharColumnWriter : public StringColumnWriter {
public:
CharColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options)
: StringColumnWriter(type, factory, options),
- maxLength(type.getMaximumLength()),
- padBuffer(*options.getMemoryPool()) {
+ maxLength_(type.getMaximumLength()),
+ padBuffer_(*options.getMemoryPool()) {
// utf-8 is currently 4 bytes long, but it could be up to 6
- padBuffer.resize(maxLength * 6);
+ padBuffer_.resize(maxLength_ * 6);
}
virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
private:
- uint64_t maxLength;
- DataBuffer<char> padBuffer;
+ uint64_t maxLength_;
+ DataBuffer<char> padBuffer_;
};
void CharColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
@@ -1467,22 +1467,22 @@ namespace orc {
const char* charData = nullptr;
uint64_t originLength = static_cast<uint64_t>(length[i]);
uint64_t charLength = Utf8Utils::charLength(data[i], originLength);
- if (charLength >= maxLength) {
+ if (charLength >= maxLength_) {
charData = data[i];
length[i] =
- static_cast<int64_t>(Utf8Utils::truncateBytesTo(maxLength, data[i], originLength));
+ static_cast<int64_t>(Utf8Utils::truncateBytesTo(maxLength_, data[i], originLength));
} else {
- charData = padBuffer.data();
+ charData = padBuffer_.data();
// the padding is exactly 1 byte per char
- length[i] = length[i] + static_cast<int64_t>(maxLength - charLength);
- memcpy(padBuffer.data(), data[i], originLength);
- memset(padBuffer.data() + originLength, ' ',
+ length[i] = length[i] + static_cast<int64_t>(maxLength_ - charLength);
+ memcpy(padBuffer_.data(), data[i], originLength);
+ memset(padBuffer_.data() + originLength, ' ',
static_cast<size_t>(length[i]) - originLength);
}
if (useDictionary) {
size_t index = dictionary.insert(charData, static_cast<size_t>(length[i]));
- dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index));
+ dictionary.idxInDictBuffer_.push_back(static_cast<int64_t>(index));
} else {
directDataStream->write(charData, static_cast<size_t>(length[i]));
}
@@ -1509,7 +1509,7 @@ namespace orc {
public:
VarCharColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options)
- : StringColumnWriter(type, factory, options), maxLength(type.getMaximumLength()) {
+ : StringColumnWriter(type, factory, options), maxLength_(type.getMaximumLength()) {
// PASS
}
@@ -1517,7 +1517,7 @@ namespace orc {
const char* incomingMask) override;
private:
- uint64_t maxLength;
+ uint64_t maxLength_;
};
void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
@@ -1543,12 +1543,12 @@ namespace orc {
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
uint64_t itemLength =
- Utf8Utils::truncateBytesTo(maxLength, data[i], static_cast<uint64_t>(length[i]));
+ Utf8Utils::truncateBytesTo(maxLength_, data[i], static_cast<uint64_t>(length[i]));
length[i] = static_cast<int64_t>(itemLength);
if (useDictionary) {
size_t index = dictionary.insert(data[i], static_cast<size_t>(length[i]));
- dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index));
+ dictionary.idxInDictBuffer_.push_back(static_cast<int64_t>(index));
} else {
directDataStream->write(data[i], static_cast<size_t>(length[i]));
}
@@ -1638,28 +1638,30 @@ namespace orc {
virtual void recordPosition() const override;
+ virtual void finishStreams() override;
+
protected:
std::unique_ptr<RleEncoder> secRleEncoder, nanoRleEncoder;
private:
- RleVersion rleVersion;
- const Timezone* timezone;
- const bool isUTC;
+ RleVersion rleVersion_;
+ const Timezone* timezone_;
+ const bool isUTC_;
};
TimestampColumnWriter::TimestampColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options, bool isInstantType)
: ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()),
- timezone(isInstantType ? &getTimezoneByName("GMT") : &options.getTimezone()),
- isUTC(isInstantType || options.getTimezoneName() == "GMT") {
+ rleVersion_(options.getRleVersion()),
+ timezone_(isInstantType ? &getTimezoneByName("GMT") : &options.getTimezone()),
+ isUTC_(isInstantType || options.getTimezoneName() == "GMT") {
std::unique_ptr<BufferedOutputStream> dataStream =
factory.createStream(proto::Stream_Kind_DATA);
std::unique_ptr<BufferedOutputStream> secondaryStream =
factory.createStream(proto::Stream_Kind_SECONDARY);
- secRleEncoder = createRleEncoder(std::move(dataStream), true, rleVersion, memPool,
+ secRleEncoder = createRleEncoder(std::move(dataStream), true, rleVersion_, memPool,
options.getAlignedBitpacking());
- nanoRleEncoder = createRleEncoder(std::move(secondaryStream), false, rleVersion, memPool,
+ nanoRleEncoder = createRleEncoder(std::move(secondaryStream), false, rleVersion_, memPool,
options.getAlignedBitpacking());
if (enableIndex) {
@@ -1712,8 +1714,8 @@ namespace orc {
if (notNull == nullptr || notNull[i]) {
// TimestampVectorBatch already stores data in UTC
int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000;
- if (!isUTC) {
- millsUTC = timezone->convertToUTC(secs[i]) * 1000 + nanos[i] / 1000000;
+ if (!isUTC_) {
+ millsUTC = timezone_->convertToUTC(secs[i]) * 1000 + nanos[i] / 1000000;
}
++count;
if (enableBloomFilter) {
@@ -1725,7 +1727,7 @@ namespace orc {
secs[i] += 1;
}
- secs[i] -= timezone->getEpoch();
+ secs[i] -= timezone_->getEpoch();
nanos[i] = formatNano(nanos[i]);
}
}
@@ -1764,7 +1766,7 @@ namespace orc {
void TimestampColumnWriter::getColumnEncoding(
std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
- encoding.set_kind(RleVersionMapper(rleVersion));
+ encoding.set_kind(RleVersionMapper(rleVersion_));
encoding.set_dictionary_size(0);
if (enableBloomFilter) {
encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
@@ -1778,6 +1780,12 @@ namespace orc {
nanoRleEncoder->recordPosition(rowIndexPosition.get());
}
+ void TimestampColumnWriter::finishStreams() {
+ ColumnWriter::finishStreams();
+ secRleEncoder->finishEncode();
+ nanoRleEncoder->finishEncode();
+ }
+
class DateColumnWriter : public IntegerColumnWriter<LongVectorBatch> {
public:
DateColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
@@ -1847,6 +1855,8 @@ namespace orc {
virtual void recordPosition() const override;
+ virtual void finishStreams() override;
+
protected:
RleVersion rleVersion;
uint64_t precision;
@@ -1855,7 +1865,7 @@ namespace orc {
std::unique_ptr<RleEncoder> scaleEncoder;
private:
- char buffer[10];
+ char buffer_[10];
};
Decimal64ColumnWriter::Decimal64ColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -1897,7 +1907,7 @@ namespace orc {
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
int64_t val = zigZag(values[i]);
- char* data = buffer;
+ char* data = buffer_;
while (true) {
if ((val & ~0x7f) == 0) {
*(data++) = (static_cast<char>(val));
@@ -1908,7 +1918,7 @@ namespace orc {
val = (static_cast<uint64_t>(val) >> 7);
}
}
- valueStream->write(buffer, static_cast<size_t>(data - buffer));
+ valueStream->write(buffer_, static_cast<size_t>(data - buffer_));
++count;
if (enableBloomFilter) {
std::string decimal = Decimal(values[i], static_cast<int32_t>(scale)).toString(true);
@@ -1965,6 +1975,12 @@ namespace orc {
scaleEncoder->recordPosition(rowIndexPosition.get());
}
+ void Decimal64ColumnWriter::finishStreams() {
+ ColumnWriter::finishStreams();
+ valueStream->finishStream();
+ scaleEncoder->finishEncode();
+ }
+
class Decimal64ColumnWriterV2 : public ColumnWriter {
public:
Decimal64ColumnWriterV2(const Type& type, const StreamsFactory& factory,
@@ -1981,6 +1997,8 @@ namespace orc {
virtual void recordPosition() const override;
+ virtual void finishStreams() override;
+
protected:
uint64_t precision;
uint64_t scale;
@@ -2071,6 +2089,11 @@ namespace orc {
valueEncoder->recordPosition(rowIndexPosition.get());
}
+ void Decimal64ColumnWriterV2::finishStreams() {
+ ColumnWriter::finishStreams();
+ valueEncoder->finishEncode();
+ }
+
class Decimal128ColumnWriter : public Decimal64ColumnWriter {
public:
Decimal128ColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -2080,7 +2103,7 @@ namespace orc {
const char* incomingMask) override;
private:
- char buffer[20];
+ char buffer_[20];
};
Decimal128ColumnWriter::Decimal128ColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -2126,7 +2149,7 @@ namespace orc {
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
Int128 val = zigZagInt128(values[i]);
- char* data = buffer;
+ char* data = buffer_;
while (true) {
if ((val & ~0x7f) == 0) {
*(data++) = (static_cast<char>(val.getLowBits()));
@@ -2136,7 +2159,7 @@ namespace orc {
val >>= 7;
}
}
- valueStream->write(buffer, static_cast<size_t>(data - buffer));
+ valueStream->write(buffer_, static_cast<size_t>(data - buffer_));
++count;
if (enableBloomFilter) {
@@ -2186,22 +2209,24 @@ namespace orc {
virtual void reset() override;
+ virtual void finishStreams() override;
+
private:
- std::unique_ptr<RleEncoder> lengthEncoder;
- RleVersion rleVersion;
- std::unique_ptr<ColumnWriter> child;
+ std::unique_ptr<RleEncoder> lengthEncoder_;
+ RleVersion rleVersion_;
+ std::unique_ptr<ColumnWriter> child_;
};
ListColumnWriter::ListColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options)
- : ColumnWriter(type, factory, options), rleVersion(options.getRleVersion()) {
+ : ColumnWriter(type, factory, options), rleVersion_(options.getRleVersion()) {
std::unique_ptr<BufferedOutputStream> lengthStream =
factory.createStream(proto::Stream_Kind_LENGTH);
- lengthEncoder = createRleEncoder(std::move(lengthStream), false, rleVersion, memPool,
- options.getAlignedBitpacking());
+ lengthEncoder_ = createRleEncoder(std::move(lengthStream), false, rleVersion_, memPool,
+ options.getAlignedBitpacking());
if (type.getSubtypeCount() == 1) {
- child = buildWriter(*type.getSubtype(0), factory, options);
+ child_ = buildWriter(*type.getSubtype(0), factory, options);
}
if (enableIndex) {
@@ -2239,10 +2264,10 @@ namespace orc {
}
// unnecessary to deal with null as elements are packed together
- if (child.get()) {
- child->add(*listBatch->elements, elemOffset, totalNumValues, nullptr);
+ if (child_.get()) {
+ child_->add(*listBatch->elements, elemOffset, totalNumValues, nullptr);
}
- lengthEncoder->add(offsets, numValues, notNull);
+ lengthEncoder_->add(offsets, numValues, notNull);
if (enableIndex) {
if (!notNull) {
@@ -2272,93 +2297,101 @@ namespace orc {
proto::Stream stream;
stream.set_kind(proto::Stream_Kind_LENGTH);
stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(lengthEncoder->flush());
+ stream.set_length(lengthEncoder_->flush());
streams.push_back(stream);
- if (child.get()) {
- child->flush(streams);
+ if (child_.get()) {
+ child_->flush(streams);
}
}
void ListColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const {
ColumnWriter::writeIndex(streams);
- if (child.get()) {
- child->writeIndex(streams);
+ if (child_.get()) {
+ child_->writeIndex(streams);
}
}
uint64_t ListColumnWriter::getEstimatedSize() const {
uint64_t size = ColumnWriter::getEstimatedSize();
- if (child.get()) {
- size += lengthEncoder->getBufferSize();
- size += child->getEstimatedSize();
+ if (child_.get()) {
+ size += lengthEncoder_->getBufferSize();
+ size += child_->getEstimatedSize();
}
return size;
}
void ListColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
- encoding.set_kind(RleVersionMapper(rleVersion));
+ encoding.set_kind(RleVersionMapper(rleVersion_));
encoding.set_dictionary_size(0);
if (enableBloomFilter) {
encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
}
encodings.push_back(encoding);
- if (child.get()) {
- child->getColumnEncoding(encodings);
+ if (child_.get()) {
+ child_->getColumnEncoding(encodings);
}
}
void ListColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getStripeStatistics(stats);
- if (child.get()) {
- child->getStripeStatistics(stats);
+ if (child_.get()) {
+ child_->getStripeStatistics(stats);
}
}
void ListColumnWriter::mergeStripeStatsIntoFileStats() {
ColumnWriter::mergeStripeStatsIntoFileStats();
- if (child.get()) {
- child->mergeStripeStatsIntoFileStats();
+ if (child_.get()) {
+ child_->mergeStripeStatsIntoFileStats();
}
}
void ListColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getFileStatistics(stats);
- if (child.get()) {
- child->getFileStatistics(stats);
+ if (child_.get()) {
+ child_->getFileStatistics(stats);
}
}
void ListColumnWriter::mergeRowGroupStatsIntoStripeStats() {
ColumnWriter::mergeRowGroupStatsIntoStripeStats();
- if (child.get()) {
- child->mergeRowGroupStatsIntoStripeStats();
+ if (child_.get()) {
+ child_->mergeRowGroupStatsIntoStripeStats();
}
}
void ListColumnWriter::createRowIndexEntry() {
ColumnWriter::createRowIndexEntry();
- if (child.get()) {
- child->createRowIndexEntry();
+ if (child_.get()) {
+ child_->createRowIndexEntry();
}
}
void ListColumnWriter::recordPosition() const {
ColumnWriter::recordPosition();
- lengthEncoder->recordPosition(rowIndexPosition.get());
+ lengthEncoder_->recordPosition(rowIndexPosition.get());
}
void ListColumnWriter::reset() {
ColumnWriter::reset();
- if (child) {
- child->reset();
+ if (child_) {
+ child_->reset();
}
}
void ListColumnWriter::writeDictionary() {
- if (child) {
- child->writeDictionary();
+ if (child_) {
+ child_->writeDictionary();
+ }
+ }
+
+ void ListColumnWriter::finishStreams() {
+ ColumnWriter::finishStreams();
+ lengthEncoder_->finishEncode();
+ if (child_) {
+ child_->finishStreams();
}
}
@@ -2394,27 +2427,29 @@ namespace orc {
virtual void reset() override;
+ virtual void finishStreams() override;
+
private:
- std::unique_ptr<ColumnWriter> keyWriter;
- std::unique_ptr<ColumnWriter> elemWriter;
- std::unique_ptr<RleEncoder> lengthEncoder;
- RleVersion rleVersion;
+ std::unique_ptr<ColumnWriter> keyWriter_;
+ std::unique_ptr<ColumnWriter> elemWriter_;
+ std::unique_ptr<RleEncoder> lengthEncoder_;
+ RleVersion rleVersion_;
};
MapColumnWriter::MapColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options)
- : ColumnWriter(type, factory, options), rleVersion(options.getRleVersion()) {
+ : ColumnWriter(type, factory, options), rleVersion_(options.getRleVersion()) {
std::unique_ptr<BufferedOutputStream> lengthStream =
factory.createStream(proto::Stream_Kind_LENGTH);
- lengthEncoder = createRleEncoder(std::move(lengthStream), false, rleVersion, memPool,
- options.getAlignedBitpacking());
+ lengthEncoder_ = createRleEncoder(std::move(lengthStream), false, rleVersion_, memPool,
+ options.getAlignedBitpacking());
if (type.getSubtypeCount() > 0) {
- keyWriter = buildWriter(*type.getSubtype(0), factory, options);
+ keyWriter_ = buildWriter(*type.getSubtype(0), factory, options);
}
if (type.getSubtypeCount() > 1) {
- elemWriter = buildWriter(*type.getSubtype(1), factory, options);
+ elemWriter_ = buildWriter(*type.getSubtype(1), factory, options);
}
if (enableIndex) {
@@ -2451,14 +2486,14 @@ namespace orc {
offsets[i] = offsets[i + 1] - offsets[i];
}
- lengthEncoder->add(offsets, numValues, notNull);
+ lengthEncoder_->add(offsets, numValues, notNull);
// unnecessary to deal with null as keys and values are packed together
- if (keyWriter.get()) {
- keyWriter->add(*mapBatch->keys, elemOffset, totalNumValues, nullptr);
+ if (keyWriter_.get()) {
+ keyWriter_->add(*mapBatch->keys, elemOffset, totalNumValues, nullptr);
}
- if (elemWriter.get()) {
- elemWriter->add(*mapBatch->elements, elemOffset, totalNumValues, nullptr);
+ if (elemWriter_.get()) {
+ elemWriter_->add(*mapBatch->elements, elemOffset, totalNumValues, nullptr);
}
if (enableIndex) {
@@ -2489,126 +2524,137 @@ namespace orc {
proto::Stream stream;
stream.set_kind(proto::Stream_Kind_LENGTH);
stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(lengthEncoder->flush());
+ stream.set_length(lengthEncoder_->flush());
streams.push_back(stream);
- if (keyWriter.get()) {
- keyWriter->flush(streams);
+ if (keyWriter_.get()) {
+ keyWriter_->flush(streams);
}
- if (elemWriter.get()) {
- elemWriter->flush(streams);
+ if (elemWriter_.get()) {
+ elemWriter_->flush(streams);
}
}
void MapColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const {
ColumnWriter::writeIndex(streams);
- if (keyWriter.get()) {
- keyWriter->writeIndex(streams);
+ if (keyWriter_.get()) {
+ keyWriter_->writeIndex(streams);
}
- if (elemWriter.get()) {
- elemWriter->writeIndex(streams);
+ if (elemWriter_.get()) {
+ elemWriter_->writeIndex(streams);
}
}
uint64_t MapColumnWriter::getEstimatedSize() const {
uint64_t size = ColumnWriter::getEstimatedSize();
- size += lengthEncoder->getBufferSize();
- if (keyWriter.get()) {
- size += keyWriter->getEstimatedSize();
+ size += lengthEncoder_->getBufferSize();
+ if (keyWriter_.get()) {
+ size += keyWriter_->getEstimatedSize();
}
- if (elemWriter.get()) {
- size += elemWriter->getEstimatedSize();
+ if (elemWriter_.get()) {
+ size += elemWriter_->getEstimatedSize();
}
return size;
}
void MapColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
- encoding.set_kind(RleVersionMapper(rleVersion));
+ encoding.set_kind(RleVersionMapper(rleVersion_));
encoding.set_dictionary_size(0);
if (enableBloomFilter) {
encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
}
encodings.push_back(encoding);
- if (keyWriter.get()) {
- keyWriter->getColumnEncoding(encodings);
+ if (keyWriter_.get()) {
+ keyWriter_->getColumnEncoding(encodings);
}
- if (elemWriter.get()) {
- elemWriter->getColumnEncoding(encodings);
+ if (elemWriter_.get()) {
+ elemWriter_->getColumnEncoding(encodings);
}
}
void MapColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getStripeStatistics(stats);
- if (keyWriter.get()) {
- keyWriter->getStripeStatistics(stats);
+ if (keyWriter_.get()) {
+ keyWriter_->getStripeStatistics(stats);
}
- if (elemWriter.get()) {
- elemWriter->getStripeStatistics(stats);
+ if (elemWriter_.get()) {
+ elemWriter_->getStripeStatistics(stats);
}
}
void MapColumnWriter::mergeStripeStatsIntoFileStats() {
ColumnWriter::mergeStripeStatsIntoFileStats();
- if (keyWriter.get()) {
- keyWriter->mergeStripeStatsIntoFileStats();
+ if (keyWriter_.get()) {
+ keyWriter_->mergeStripeStatsIntoFileStats();
}
- if (elemWriter.get()) {
- elemWriter->mergeStripeStatsIntoFileStats();
+ if (elemWriter_.get()) {
+ elemWriter_->mergeStripeStatsIntoFileStats();
}
}
void MapColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getFileStatistics(stats);
- if (keyWriter.get()) {
- keyWriter->getFileStatistics(stats);
+ if (keyWriter_.get()) {
+ keyWriter_->getFileStatistics(stats);
}
- if (elemWriter.get()) {
- elemWriter->getFileStatistics(stats);
+ if (elemWriter_.get()) {
+ elemWriter_->getFileStatistics(stats);
}
}
void MapColumnWriter::mergeRowGroupStatsIntoStripeStats() {
ColumnWriter::mergeRowGroupStatsIntoStripeStats();
- if (keyWriter.get()) {
- keyWriter->mergeRowGroupStatsIntoStripeStats();
+ if (keyWriter_.get()) {
+ keyWriter_->mergeRowGroupStatsIntoStripeStats();
}
- if (elemWriter.get()) {
- elemWriter->mergeRowGroupStatsIntoStripeStats();
+ if (elemWriter_.get()) {
+ elemWriter_->mergeRowGroupStatsIntoStripeStats();
}
}
void MapColumnWriter::createRowIndexEntry() {
ColumnWriter::createRowIndexEntry();
- if (keyWriter.get()) {
- keyWriter->createRowIndexEntry();
+ if (keyWriter_.get()) {
+ keyWriter_->createRowIndexEntry();
}
- if (elemWriter.get()) {
- elemWriter->createRowIndexEntry();
+ if (elemWriter_.get()) {
+ elemWriter_->createRowIndexEntry();
}
}
void MapColumnWriter::recordPosition() const {
ColumnWriter::recordPosition();
- lengthEncoder->recordPosition(rowIndexPosition.get());
+ lengthEncoder_->recordPosition(rowIndexPosition.get());
}
void MapColumnWriter::reset() {
ColumnWriter::reset();
- if (keyWriter) {
- keyWriter->reset();
+ if (keyWriter_) {
+ keyWriter_->reset();
}
- if (elemWriter) {
- elemWriter->reset();
+ if (elemWriter_) {
+ elemWriter_->reset();
}
}
void MapColumnWriter::writeDictionary() {
- if (keyWriter) {
- keyWriter->writeDictionary();
+ if (keyWriter_) {
+ keyWriter_->writeDictionary();
+ }
+ if (elemWriter_) {
+ elemWriter_->writeDictionary();
+ }
+ }
+
+ void MapColumnWriter::finishStreams() {
+ ColumnWriter::finishStreams();
+ lengthEncoder_->finishEncode();
+ if (keyWriter_) {
+ keyWriter_->finishStreams();
}
- if (elemWriter) {
- elemWriter->writeDictionary();
+ if (elemWriter_) {
+ elemWriter_->finishStreams();
}
}
@@ -2644,9 +2690,11 @@ namespace orc {
virtual void reset() override;
+ virtual void finishStreams() override;
+
private:
- std::unique_ptr<ByteRleEncoder> rleEncoder;
- std::vector<std::unique_ptr<ColumnWriter>> children;
+ std::unique_ptr<ByteRleEncoder> rleEncoder_;
+ std::vector<std::unique_ptr<ColumnWriter>> children_;
};
UnionColumnWriter::UnionColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -2654,10 +2702,10 @@ namespace orc {
: ColumnWriter(type, factory, options) {
std::unique_ptr<BufferedOutputStream> dataStream =
factory.createStream(proto::Stream_Kind_DATA);
- rleEncoder = createByteRleEncoder(std::move(dataStream));
+ rleEncoder_ = createByteRleEncoder(std::move(dataStream));
for (uint64_t i = 0; i != type.getSubtypeCount(); ++i) {
- children.push_back(buildWriter(*type.getSubtype(i), factory, options));
+ children_.push_back(buildWriter(*type.getSubtype(i), factory, options));
}
if (enableIndex) {
@@ -2678,8 +2726,8 @@ namespace orc {
unsigned char* tags = unionBatch->tags.data() + offset;
uint64_t* offsets = unionBatch->offsets.data() + offset;
- std::vector<int64_t> childOffset(children.size(), -1);
- std::vector<uint64_t> childLength(children.size(), 0);
+ std::vector<int64_t> childOffset(children_.size(), -1);
+ std::vector<uint64_t> childLength(children_.size(), 0);
for (uint64_t i = 0; i != numValues; ++i) {
if (childOffset[tags[i]] == -1) {
@@ -2688,12 +2736,12 @@ namespace orc {
++childLength[tags[i]];
}
- rleEncoder->add(reinterpret_cast<char*>(tags), numValues, notNull);
+ rleEncoder_->add(reinterpret_cast<char*>(tags), numValues, notNull);
- for (uint32_t i = 0; i < children.size(); ++i) {
+ for (uint32_t i = 0; i < children_.size(); ++i) {
if (childLength[i] > 0) {
- children[i]->add(*unionBatch->children[i], static_cast<uint64_t>(childOffset[i]),
- childLength[i], nullptr);
+ children_[i]->add(*unionBatch->children[i], static_cast<uint64_t>(childOffset[i]),
+ childLength[i], nullptr);
}
}
@@ -2725,26 +2773,26 @@ namespace orc {
proto::Stream stream;
stream.set_kind(proto::Stream_Kind_DATA);
stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(rleEncoder->flush());
+ stream.set_length(rleEncoder_->flush());
streams.push_back(stream);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->flush(streams);
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->flush(streams);
}
}
void UnionColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const {
ColumnWriter::writeIndex(streams);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->writeIndex(streams);
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->writeIndex(streams);
}
}
uint64_t UnionColumnWriter::getEstimatedSize() const {
uint64_t size = ColumnWriter::getEstimatedSize();
- size += rleEncoder->getBufferSize();
- for (uint32_t i = 0; i < children.size(); ++i) {
- size += children[i]->getEstimatedSize();
+ size += rleEncoder_->getBufferSize();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ size += children_[i]->getEstimatedSize();
}
return size;
}
@@ -2757,61 +2805,69 @@ namespace orc {
encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
}
encodings.push_back(encoding);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->getColumnEncoding(encodings);
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->getColumnEncoding(encodings);
}
}
void UnionColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getStripeStatistics(stats);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->getStripeStatistics(stats);
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->getStripeStatistics(stats);
}
}
void UnionColumnWriter::mergeStripeStatsIntoFileStats() {
ColumnWriter::mergeStripeStatsIntoFileStats();
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->mergeStripeStatsIntoFileStats();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->mergeStripeStatsIntoFileStats();
}
}
void UnionColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getFileStatistics(stats);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->getFileStatistics(stats);
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->getFileStatistics(stats);
}
}
void UnionColumnWriter::mergeRowGroupStatsIntoStripeStats() {
ColumnWriter::mergeRowGroupStatsIntoStripeStats();
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->mergeRowGroupStatsIntoStripeStats();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->mergeRowGroupStatsIntoStripeStats();
}
}
void UnionColumnWriter::createRowIndexEntry() {
ColumnWriter::createRowIndexEntry();
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->createRowIndexEntry();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->createRowIndexEntry();
}
}
void UnionColumnWriter::recordPosition() const {
ColumnWriter::recordPosition();
- rleEncoder->recordPosition(rowIndexPosition.get());
+ rleEncoder_->recordPosition(rowIndexPosition.get());
}
void UnionColumnWriter::reset() {
ColumnWriter::reset();
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->reset();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->reset();
}
}
void UnionColumnWriter::writeDictionary() {
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->writeDictionary();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->writeDictionary();
+ }
+ }
+
+ void UnionColumnWriter::finishStreams() {
+ ColumnWriter::finishStreams();
+ rleEncoder_->finishEncode();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->finishStreams();
}
}
diff --git a/contrib/libs/apache/orc/c++/src/ColumnWriter.hh b/contrib/libs/apache/orc/c++/src/ColumnWriter.hh
index f21ffd6f83..1c5e15d707 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnWriter.hh
+++ b/contrib/libs/apache/orc/c++/src/ColumnWriter.hh
@@ -53,14 +53,14 @@ namespace orc {
public:
virtual ~RowIndexPositionRecorder() override;
- RowIndexPositionRecorder(proto::RowIndexEntry& entry) : rowIndexEntry(entry) {}
+ RowIndexPositionRecorder(proto::RowIndexEntry& entry) : rowIndexEntry_(entry) {}
virtual void add(uint64_t pos) override {
- rowIndexEntry.add_positions(pos);
+ rowIndexEntry_.add_positions(pos);
}
private:
- proto::RowIndexEntry& rowIndexEntry;
+ proto::RowIndexEntry& rowIndexEntry_;
};
/**
@@ -179,6 +179,18 @@ namespace orc {
*/
virtual void writeDictionary();
+ /**
+ * Finalize the encoding and compressing process. This function should be
+ * called after all data required for encoding has been added. It ensures
+ * that any remaining data is processed and the final state of the streams
+ * is set.
+ * Note: boolean type cannot cut off the current byte if it is not filled
+ * with 8 bits, otherwise Boolean RLE may incorrectly read the unfilled
+ * trailing bits. In this case, the last byte will be the head of the next
+ * compression block.
+ */
+ virtual void finishStreams();
+
protected:
/**
* Utility function to translate ColumnStatistics into protobuf form and
diff --git a/contrib/libs/apache/orc/c++/src/Common.cc b/contrib/libs/apache/orc/c++/src/Common.cc
index cf2ff27ef1..52efa12d94 100644
--- a/contrib/libs/apache/orc/c++/src/Common.cc
+++ b/contrib/libs/apache/orc/c++/src/Common.cc
@@ -133,11 +133,11 @@ namespace orc {
}
std::string FileVersion::toString() const {
- if (majorVersion == 1 && minorVersion == 9999) {
+ if (majorVersion_ == 1 && minorVersion_ == 9999) {
return "UNSTABLE-PRE-2.0";
}
std::stringstream ss;
- ss << majorVersion << '.' << minorVersion;
+ ss << majorVersion_ << '.' << minorVersion_;
return ss.str();
}
diff --git a/contrib/libs/apache/orc/c++/src/Compression.cc b/contrib/libs/apache/orc/c++/src/Compression.cc
index 94be774ab4..f373a75bff 100644
--- a/contrib/libs/apache/orc/c++/src/Compression.cc
+++ b/contrib/libs/apache/orc/c++/src/Compression.cc
@@ -52,19 +52,22 @@ namespace orc {
class CompressionStreamBase : public BufferedOutputStream {
public:
CompressionStreamBase(OutputStream* outStream, int compressionLevel, uint64_t capacity,
- uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics);
+ uint64_t compressionBlockSize, uint64_t memoryBlockSize, MemoryPool& pool,
+ WriterMetrics* metrics);
virtual bool Next(void** data, int* size) override = 0;
- virtual void BackUp(int count) override;
+ virtual void BackUp(int count) override = 0;
virtual std::string getName() const override = 0;
- virtual uint64_t flush() override;
- virtual void suppress() override;
+ virtual uint64_t flush() override = 0;
+ virtual void suppress() override = 0;
virtual bool isCompressed() const override {
return true;
}
virtual uint64_t getSize() const override;
+ virtual uint64_t getRawInputBufferSize() const override = 0;
+ virtual void finishStream() override = 0;
protected:
void writeData(const unsigned char* data, int size);
@@ -78,9 +81,6 @@ namespace orc {
// ensure enough room for compression block header
void ensureHeader();
- // Buffer to hold uncompressed data until user calls Next()
- DataBuffer<unsigned char> rawInputBuffer;
-
// Compress level
int level;
@@ -99,46 +99,26 @@ namespace orc {
// Compression block header pointer array
static const uint32_t HEADER_SIZE = 3;
std::array<char*, HEADER_SIZE> header;
+
+ // Compression block size
+ uint64_t compressionBlockSize;
};
CompressionStreamBase::CompressionStreamBase(OutputStream* outStream, int compressionLevel,
- uint64_t capacity, uint64_t blockSize,
- MemoryPool& pool, WriterMetrics* metrics)
- : BufferedOutputStream(pool, outStream, capacity, blockSize, metrics),
- rawInputBuffer(pool, blockSize),
+ uint64_t capacity, uint64_t compressionBlockSize,
+ uint64_t memoryBlockSize, MemoryPool& pool,
+ WriterMetrics* metrics)
+ : BufferedOutputStream(pool, outStream, capacity, memoryBlockSize, metrics),
level(compressionLevel),
outputBuffer(nullptr),
bufferSize(0),
outputPosition(0),
- outputSize(0) {
+ outputSize(0),
+ compressionBlockSize(compressionBlockSize) {
// init header pointer array
header.fill(nullptr);
}
- void CompressionStreamBase::BackUp(int count) {
- if (count > bufferSize) {
- throw std::logic_error("Can't backup that much!");
- }
- bufferSize -= count;
- }
-
- uint64_t CompressionStreamBase::flush() {
- void* data;
- int size;
- if (!Next(&data, &size)) {
- throw std::runtime_error("Failed to flush compression buffer.");
- }
- BufferedOutputStream::BackUp(outputSize - outputPosition);
- bufferSize = outputSize = outputPosition = 0;
- return BufferedOutputStream::flush();
- }
-
- void CompressionStreamBase::suppress() {
- outputBuffer = nullptr;
- bufferSize = outputPosition = outputSize = 0;
- BufferedOutputStream::suppress();
- }
-
uint64_t CompressionStreamBase::getSize() const {
return BufferedOutputStream::getSize() - static_cast<uint64_t>(outputSize - outputPosition);
}
@@ -149,12 +129,12 @@ namespace orc {
while (offset < size) {
if (outputPosition == outputSize) {
if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) {
- throw std::runtime_error("Failed to get next output buffer from output stream.");
+ throw CompressionError("Failed to get next output buffer from output stream.");
}
outputPosition = 0;
} else if (outputPosition > outputSize) {
// for safety this will unlikely happen
- throw std::logic_error("Write to an out-of-bound place during compression!");
+ throw CompressionError("Write to an out-of-bound place during compression!");
}
int currentSize = std::min(outputSize - outputPosition, size - offset);
memcpy(outputBuffer + outputPosition, data + offset, static_cast<size_t>(currentSize));
@@ -168,7 +148,7 @@ namespace orc {
for (uint32_t i = 0; i < HEADER_SIZE; ++i) {
if (outputPosition >= outputSize) {
if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) {
- throw std::runtime_error("Failed to get next output buffer from output stream.");
+ throw CompressionError("Failed to get next output buffer from output stream.");
}
outputPosition = 0;
}
@@ -183,31 +163,74 @@ namespace orc {
class CompressionStream : public CompressionStreamBase {
public:
CompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
- uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics);
+ uint64_t compressionBlockSize, uint64_t memoryBlockSize, MemoryPool& pool,
+ WriterMetrics* metrics);
virtual bool Next(void** data, int* size) override;
virtual std::string getName() const override = 0;
+ virtual void BackUp(int count) override;
+ virtual void suppress() override;
+ virtual uint64_t flush() override;
+ uint64_t getRawInputBufferSize() const override {
+ return rawInputBuffer.size();
+ }
+ virtual void finishStream() override {
+ compressInternal();
+ BufferedOutputStream::finishStream();
+ }
protected:
// return total compressed size
virtual uint64_t doStreamingCompression() = 0;
+
+ // Buffer to hold uncompressed data until user calls Next()
+ BlockBuffer rawInputBuffer;
+
+ void compressInternal();
};
+ void CompressionStream::BackUp(int count) {
+ uint64_t backup = static_cast<uint64_t>(count);
+ uint64_t currSize = rawInputBuffer.size();
+ if (backup > currSize) {
+ throw CompressionError("Can't backup that much!");
+ }
+ rawInputBuffer.resize(currSize - backup);
+ }
+
+ uint64_t CompressionStream::flush() {
+ compressInternal();
+ BufferedOutputStream::BackUp(outputSize - outputPosition);
+ rawInputBuffer.resize(0);
+ outputSize = outputPosition = 0;
+ return BufferedOutputStream::flush();
+ }
+
+ void CompressionStream::suppress() {
+ outputBuffer = nullptr;
+ outputPosition = outputSize = 0;
+ rawInputBuffer.resize(0);
+ BufferedOutputStream::suppress();
+ }
+
CompressionStream::CompressionStream(OutputStream* outStream, int compressionLevel,
- uint64_t capacity, uint64_t blockSize, MemoryPool& pool,
+ uint64_t capacity, uint64_t compressionBlockSize,
+ uint64_t memoryBlockSize, MemoryPool& pool,
WriterMetrics* metrics)
- : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics) {
+ : CompressionStreamBase(outStream, compressionLevel, capacity, compressionBlockSize,
+ memoryBlockSize, pool, metrics),
+ rawInputBuffer(pool, memoryBlockSize) {
// PASS
}
- bool CompressionStream::Next(void** data, int* size) {
- if (bufferSize != 0) {
+ void CompressionStream::compressInternal() {
+ if (rawInputBuffer.size() != 0) {
ensureHeader();
uint64_t preSize = getSize();
uint64_t totalCompressedSize = doStreamingCompression();
- if (totalCompressedSize >= static_cast<unsigned long>(bufferSize)) {
- writeHeader(static_cast<size_t>(bufferSize), true);
+ if (totalCompressedSize >= static_cast<unsigned long>(rawInputBuffer.size())) {
+ writeHeader(static_cast<size_t>(rawInputBuffer.size()), true);
// reset output buffer
outputBuffer = nullptr;
outputPosition = outputSize = 0;
@@ -215,23 +238,42 @@ namespace orc {
BufferedOutputStream::BackUp(static_cast<int>(backup));
// copy raw input buffer into block buffer
- writeData(rawInputBuffer.data(), bufferSize);
+ uint64_t blockNumber = rawInputBuffer.getBlockNumber();
+ for (uint64_t i = 0; i < blockNumber; ++i) {
+ auto block = rawInputBuffer.getBlock(i);
+ writeData(reinterpret_cast<const unsigned char*>(block.data), block.size);
+ }
} else {
writeHeader(totalCompressedSize, false);
}
+ rawInputBuffer.resize(0);
}
+ }
- *data = rawInputBuffer.data();
- *size = static_cast<int>(rawInputBuffer.size());
- bufferSize = *size;
+ bool CompressionStream::Next(void** data, int* size) {
+ if (rawInputBuffer.size() > compressionBlockSize) {
+ std::stringstream ss;
+ ss << "uncompressed data size " << rawInputBuffer.size()
+ << " is larger than compression block size " << compressionBlockSize;
+ throw CompressionError(ss.str());
+ }
+
+ // compress data in the rawInputBuffer when it is full
+ if (rawInputBuffer.size() == compressionBlockSize) {
+ compressInternal();
+ }
+ auto block = rawInputBuffer.getNextBlock();
+ *data = block.data;
+ *size = static_cast<int>(block.size);
return true;
}
class ZlibCompressionStream : public CompressionStream {
public:
- ZlibCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
- uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics);
+ ZlibCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t bufferCapacity,
+ uint64_t compressionBlockSize, uint64_t memoryBlockSize, MemoryPool& pool,
+ WriterMetrics* metrics);
virtual ~ZlibCompressionStream() override {
end();
@@ -245,47 +287,62 @@ namespace orc {
private:
void init();
void end();
- z_stream strm;
+ z_stream strm_;
};
ZlibCompressionStream::ZlibCompressionStream(OutputStream* outStream, int compressionLevel,
- uint64_t capacity, uint64_t blockSize,
- MemoryPool& pool, WriterMetrics* metrics)
- : CompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) {
+ uint64_t bufferCapacity,
+ uint64_t compressionBlockSize,
+ uint64_t memoryBlockSize, MemoryPool& pool,
+ WriterMetrics* metrics)
+ : CompressionStream(outStream, compressionLevel, bufferCapacity, compressionBlockSize,
+ memoryBlockSize, pool, metrics) {
init();
}
uint64_t ZlibCompressionStream::doStreamingCompression() {
- if (deflateReset(&strm) != Z_OK) {
- throw std::runtime_error("Failed to reset inflate.");
+ if (deflateReset(&strm_) != Z_OK) {
+ throw CompressionError("Failed to reset inflate.");
}
- strm.avail_in = static_cast<unsigned int>(bufferSize);
- strm.next_in = rawInputBuffer.data();
+ // iterate through all blocks
+ uint64_t blockId = 0;
+ bool finish = false;
do {
- if (outputPosition >= outputSize) {
- if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) {
- throw std::runtime_error("Failed to get next output buffer from output stream.");
- }
- outputPosition = 0;
+ if (blockId == rawInputBuffer.getBlockNumber()) {
+ finish = true;
+ strm_.avail_in = 0;
+ strm_.next_in = nullptr;
+ } else {
+ auto block = rawInputBuffer.getBlock(blockId++);
+ strm_.avail_in = static_cast<unsigned int>(block.size);
+ strm_.next_in = reinterpret_cast<unsigned char*>(block.data);
}
- strm.next_out = reinterpret_cast<unsigned char*>(outputBuffer + outputPosition);
- strm.avail_out = static_cast<unsigned int>(outputSize - outputPosition);
- int ret = deflate(&strm, Z_FINISH);
- outputPosition = outputSize - static_cast<int>(strm.avail_out);
+ do {
+ if (outputPosition >= outputSize) {
+ if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) {
+ throw CompressionError("Failed to get next output buffer from output stream.");
+ }
+ outputPosition = 0;
+ }
+ strm_.next_out = reinterpret_cast<unsigned char*>(outputBuffer + outputPosition);
+ strm_.avail_out = static_cast<unsigned int>(outputSize - outputPosition);
- if (ret == Z_STREAM_END) {
- break;
- } else if (ret == Z_OK) {
- // needs more buffer so will continue the loop
- } else {
- throw std::runtime_error("Failed to deflate input data.");
- }
- } while (strm.avail_out == 0);
+ int ret = deflate(&strm_, finish ? Z_FINISH : Z_NO_FLUSH);
+ outputPosition = outputSize - static_cast<int>(strm_.avail_out);
- return strm.total_out;
+ if (ret == Z_STREAM_END) {
+ break;
+ } else if (ret == Z_OK) {
+ // needs more buffer so will continue the loop
+ } else {
+ throw CompressionError("Failed to deflate input data.");
+ }
+ } while (strm_.avail_out == 0);
+ } while (!finish);
+ return strm_.total_out;
}
std::string ZlibCompressionStream::getName() const {
@@ -299,18 +356,18 @@ namespace orc {
#endif
void ZlibCompressionStream::init() {
- strm.zalloc = nullptr;
- strm.zfree = nullptr;
- strm.opaque = nullptr;
- strm.next_in = nullptr;
+ strm_.zalloc = nullptr;
+ strm_.zfree = nullptr;
+ strm_.opaque = nullptr;
+ strm_.next_in = nullptr;
- if (deflateInit2(&strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) {
- throw std::runtime_error("Error while calling deflateInit2() for zlib.");
+ if (deflateInit2(&strm_, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) {
+ throw CompressionError("Error while calling deflateInit2() for zlib.");
}
}
void ZlibCompressionStream::end() {
- (void)deflateEnd(&strm);
+ (void)deflateEnd(&strm_);
}
DIAGNOSTIC_PUSH
@@ -399,9 +456,9 @@ namespace orc {
};
DecompressionStream::DecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t bufferSize, MemoryPool& _pool,
- ReaderMetrics* _metrics)
- : pool(_pool),
+ size_t bufferSize, MemoryPool& pool,
+ ReaderMetrics* metrics)
+ : pool(pool),
input(std::move(inStream)),
outputDataBuffer(pool, bufferSize),
state(DECOMPRESS_HEADER),
@@ -416,7 +473,7 @@ namespace orc {
headerPosition(0),
inputBufferStartPosition(0),
bytesReturned(0),
- metrics(_metrics) {}
+ metrics(metrics) {}
std::string DecompressionStream::getStreamName() const {
return input->getName();
@@ -505,7 +562,7 @@ namespace orc {
} else if (state == DECOMPRESS_START) {
NextDecompress(data, size, availableSize);
} else {
- throw std::logic_error(
+ throw CompressionError(
"Unknown compression state in "
"DecompressionStream::Next");
}
@@ -519,7 +576,7 @@ namespace orc {
void DecompressionStream::BackUp(int count) {
if (outputBuffer == nullptr || outputBufferLength != 0) {
- throw std::logic_error("Backup without previous Next in " + getName());
+ throw CompressionError("Backup without previous Next in " + getName());
}
outputBuffer -= static_cast<size_t>(count);
outputBufferLength = static_cast<size_t>(count);
@@ -622,7 +679,7 @@ namespace orc {
virtual void NextDecompress(const void** data, int* size, size_t availableSize) override;
private:
- z_stream zstream;
+ z_stream zstream_;
};
DIAGNOSTIC_PUSH
@@ -632,35 +689,39 @@ namespace orc {
#endif
ZlibDecompressionStream::ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t bufferSize, MemoryPool& _pool,
- ReaderMetrics* _metrics)
- : DecompressionStream(std::move(inStream), bufferSize, _pool, _metrics) {
- zstream.next_in = nullptr;
- zstream.avail_in = 0;
- zstream.zalloc = nullptr;
- zstream.zfree = nullptr;
- zstream.opaque = nullptr;
- zstream.next_out = reinterpret_cast<Bytef*>(outputDataBuffer.data());
- zstream.avail_out = static_cast<uInt>(outputDataBuffer.capacity());
- int64_t result = inflateInit2(&zstream, -15);
+ size_t bufferSize, MemoryPool& pool,
+ ReaderMetrics* metrics)
+ : DecompressionStream(std::move(inStream), bufferSize, pool, metrics) {
+ zstream_.next_in = nullptr;
+ zstream_.avail_in = 0;
+ zstream_.zalloc = nullptr;
+ zstream_.zfree = nullptr;
+ zstream_.opaque = nullptr;
+ zstream_.next_out = reinterpret_cast<Bytef*>(outputDataBuffer.data());
+ zstream_.avail_out = static_cast<uInt>(outputDataBuffer.capacity());
+ int64_t result = inflateInit2(&zstream_, -15);
switch (result) {
case Z_OK:
break;
case Z_MEM_ERROR:
- throw std::logic_error("Memory error from inflateInit2");
+ throw CompressionError(
+ "Memory error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2");
case Z_VERSION_ERROR:
- throw std::logic_error("Version error from inflateInit2");
+ throw CompressionError(
+ "Version error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2");
case Z_STREAM_ERROR:
- throw std::logic_error("Stream error from inflateInit2");
+ throw CompressionError(
+ "Stream error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2");
default:
- throw std::logic_error("Unknown error from inflateInit2");
+ throw CompressionError(
+ "Unknown error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2");
}
}
DIAGNOSTIC_POP
ZlibDecompressionStream::~ZlibDecompressionStream() {
- int64_t result = inflateEnd(&zstream);
+ int64_t result = inflateEnd(&zstream_);
if (result != Z_OK) {
// really can't throw in destructors
std::cout << "Error in ~ZlibDecompressionStream() " << result << "\n";
@@ -668,19 +729,19 @@ namespace orc {
}
void ZlibDecompressionStream::NextDecompress(const void** data, int* size, size_t availableSize) {
- zstream.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
- zstream.avail_in = static_cast<uInt>(availableSize);
+ zstream_.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
+ zstream_.avail_in = static_cast<uInt>(availableSize);
outputBuffer = outputDataBuffer.data();
- zstream.next_out = reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer));
- zstream.avail_out = static_cast<uInt>(outputDataBuffer.capacity());
- if (inflateReset(&zstream) != Z_OK) {
- throw std::logic_error(
+ zstream_.next_out = reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer));
+ zstream_.avail_out = static_cast<uInt>(outputDataBuffer.capacity());
+ if (inflateReset(&zstream_) != Z_OK) {
+ throw CompressionError(
"Bad inflateReset in "
"ZlibDecompressionStream::NextDecompress");
}
int64_t result;
do {
- result = inflate(&zstream, availableSize == remainingLength ? Z_FINISH : Z_SYNC_FLUSH);
+ result = inflate(&zstream_, availableSize == remainingLength ? Z_FINISH : Z_SYNC_FLUSH);
switch (result) {
case Z_OK:
remainingLength -= availableSize;
@@ -688,30 +749,30 @@ namespace orc {
readBuffer(true);
availableSize =
std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), remainingLength);
- zstream.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
- zstream.avail_in = static_cast<uInt>(availableSize);
+ zstream_.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
+ zstream_.avail_in = static_cast<uInt>(availableSize);
break;
case Z_STREAM_END:
break;
case Z_BUF_ERROR:
- throw std::logic_error(
+ throw CompressionError(
"Buffer error in "
"ZlibDecompressionStream::NextDecompress");
case Z_DATA_ERROR:
- throw std::logic_error(
+ throw CompressionError(
"Data error in "
"ZlibDecompressionStream::NextDecompress");
case Z_STREAM_ERROR:
- throw std::logic_error(
+ throw CompressionError(
"Stream error in "
"ZlibDecompressionStream::NextDecompress");
default:
- throw std::logic_error(
+ throw CompressionError(
"Unknown error in "
"ZlibDecompressionStream::NextDecompress");
}
} while (result != Z_STREAM_END);
- *size = static_cast<int>(outputDataBuffer.capacity() - zstream.avail_out);
+ *size = static_cast<int>(outputDataBuffer.capacity() - zstream_.avail_out);
*data = outputBuffer;
outputBufferLength = 0;
outputBuffer += *size;
@@ -742,14 +803,14 @@ namespace orc {
private:
// may need to stitch together multiple input buffers;
// to give snappy a contiguous block
- DataBuffer<char> inputDataBuffer;
+ DataBuffer<char> inputDataBuffer_;
};
BlockDecompressionStream::BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize, MemoryPool& _pool,
- ReaderMetrics* _metrics)
- : DecompressionStream(std::move(inStream), blockSize, _pool, _metrics),
- inputDataBuffer(pool, blockSize) {}
+ size_t blockSize, MemoryPool& pool,
+ ReaderMetrics* metrics)
+ : DecompressionStream(std::move(inStream), blockSize, pool, metrics),
+ inputDataBuffer_(pool, blockSize) {}
void BlockDecompressionStream::NextDecompress(const void** data, int* size,
size_t availableSize) {
@@ -759,18 +820,18 @@ namespace orc {
inputBuffer += availableSize;
} else {
// Did not read enough from input.
- if (inputDataBuffer.capacity() < remainingLength) {
- inputDataBuffer.resize(remainingLength);
+ if (inputDataBuffer_.capacity() < remainingLength) {
+ inputDataBuffer_.resize(remainingLength);
}
- ::memcpy(inputDataBuffer.data(), inputBuffer, availableSize);
+ ::memcpy(inputDataBuffer_.data(), inputBuffer, availableSize);
inputBuffer += availableSize;
- compressed = inputDataBuffer.data();
+ compressed = inputDataBuffer_.data();
for (size_t pos = availableSize; pos < remainingLength;) {
readBuffer(true);
size_t avail =
std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), remainingLength - pos);
- ::memcpy(inputDataBuffer.data() + pos, inputBuffer, avail);
+ ::memcpy(inputDataBuffer_.data() + pos, inputBuffer, avail);
pos += avail;
inputBuffer += avail;
}
@@ -788,8 +849,8 @@ namespace orc {
class SnappyDecompressionStream : public BlockDecompressionStream {
public:
SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
- MemoryPool& _pool, ReaderMetrics* _metrics)
- : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) {
+ MemoryPool& pool, ReaderMetrics* metrics)
+ : BlockDecompressionStream(std::move(inStream), blockSize, pool, metrics) {
// PASS
}
@@ -804,18 +865,18 @@ namespace orc {
size_t maxOutputLength) override;
};
- uint64_t SnappyDecompressionStream::decompress(const char* _input, uint64_t length, char* output,
+ uint64_t SnappyDecompressionStream::decompress(const char* input, uint64_t length, char* output,
size_t maxOutputLength) {
size_t outLength;
- if (!snappy::GetUncompressedLength(_input, length, &outLength)) {
+ if (!snappy::GetUncompressedLength(input, length, &outLength)) {
throw ParseError("SnappyDecompressionStream choked on corrupt input");
}
if (outLength > maxOutputLength) {
- throw std::logic_error("Snappy length exceeds block size");
+ throw CompressionError("Snappy length exceeds block size");
}
- if (!snappy::RawUncompress(_input, length, output)) {
+ if (!snappy::RawUncompress(input, length, output)) {
throw ParseError("SnappyDecompressionStream choked on corrupt input");
}
return outLength;
@@ -824,8 +885,8 @@ namespace orc {
class LzoDecompressionStream : public BlockDecompressionStream {
public:
LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
- MemoryPool& _pool, ReaderMetrics* _metrics)
- : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) {
+ MemoryPool& pool, ReaderMetrics* metrics)
+ : BlockDecompressionStream(std::move(inStream), blockSize, pool, metrics) {
// PASS
}
@@ -848,8 +909,8 @@ namespace orc {
class Lz4DecompressionStream : public BlockDecompressionStream {
public:
Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
- MemoryPool& _pool, ReaderMetrics* _metrics)
- : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) {
+ MemoryPool& pool, ReaderMetrics* metrics)
+ : BlockDecompressionStream(std::move(inStream), blockSize, pool, metrics) {
// PASS
}
@@ -881,14 +942,23 @@ namespace orc {
public:
BlockCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics)
- : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics),
- compressorBuffer(pool) {
+ : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, blockSize, pool,
+ metrics),
+ compressorBuffer(pool),
+ rawInputBuffer(pool, blockSize) {
// PASS
}
virtual bool Next(void** data, int* size) override;
virtual void suppress() override;
+ virtual void BackUp(int count) override;
+ virtual uint64_t flush() override;
virtual std::string getName() const override = 0;
+ uint64_t getRawInputBufferSize() const override {
+ return bufferSize;
+ }
+
+ virtual void finishStream() override;
protected:
// compresses a block and returns the compressed size
@@ -900,8 +970,23 @@ namespace orc {
// should allocate max possible compressed size
DataBuffer<unsigned char> compressorBuffer;
+
+ // Buffer to hold uncompressed data until user calls Next()
+ DataBuffer<unsigned char> rawInputBuffer;
};
+ void BlockCompressionStream::BackUp(int count) {
+ if (count > bufferSize) {
+ throw CompressionError("Can't backup that much!");
+ }
+ bufferSize -= count;
+ }
+
+ uint64_t BlockCompressionStream::flush() {
+ finishStream();
+ return BufferedOutputStream::flush();
+ }
+
bool BlockCompressionStream::Next(void** data, int* size) {
if (bufferSize != 0) {
ensureHeader();
@@ -935,7 +1020,19 @@ namespace orc {
void BlockCompressionStream::suppress() {
compressorBuffer.resize(0);
- CompressionStreamBase::suppress();
+ outputBuffer = nullptr;
+ bufferSize = outputPosition = outputSize = 0;
+ BufferedOutputStream::suppress();
+ }
+
+ void BlockCompressionStream::finishStream() {
+ void* data;
+ int size;
+ if (!Next(&data, &size)) {
+ throw CompressionError("Failed to flush compression buffer.");
+ }
+ BufferedOutputStream::BackUp(outputSize - outputPosition);
+ bufferSize = outputSize = outputPosition = 0;
}
/**
@@ -967,30 +1064,30 @@ namespace orc {
private:
void init();
void end();
- LZ4_stream_t* state;
+ LZ4_stream_t* state_;
};
uint64_t Lz4CompressionSteam::doBlockCompression() {
int result = LZ4_compress_fast_extState(
- static_cast<void*>(state), reinterpret_cast<const char*>(rawInputBuffer.data()),
+ static_cast<void*>(state_), reinterpret_cast<const char*>(rawInputBuffer.data()),
reinterpret_cast<char*>(compressorBuffer.data()), bufferSize,
static_cast<int>(compressorBuffer.size()), level);
if (result == 0) {
- throw std::runtime_error("Error during block compression using lz4.");
+ throw CompressionError("Error during block compression using lz4.");
}
return static_cast<uint64_t>(result);
}
void Lz4CompressionSteam::init() {
- state = LZ4_createStream();
- if (!state) {
- throw std::runtime_error("Error while allocating state for lz4.");
+ state_ = LZ4_createStream();
+ if (!state_) {
+ throw CompressionError("Error while allocating state for lz4.");
}
}
void Lz4CompressionSteam::end() {
- (void)LZ4_freeStream(state);
- state = nullptr;
+ (void)LZ4_freeStream(state_);
+ state_ = nullptr;
}
/**
@@ -1055,11 +1152,11 @@ namespace orc {
private:
void init();
void end();
- ZSTD_CCtx* cctx;
+ ZSTD_CCtx* cctx_;
};
uint64_t ZSTDCompressionStream::doBlockCompression() {
- return ZSTD_compressCCtx(cctx, compressorBuffer.data(), compressorBuffer.size(),
+ return ZSTD_compressCCtx(cctx_, compressorBuffer.data(), compressorBuffer.size(),
rawInputBuffer.data(), static_cast<size_t>(bufferSize), level);
}
@@ -1070,15 +1167,15 @@ namespace orc {
#endif
void ZSTDCompressionStream::init() {
- cctx = ZSTD_createCCtx();
- if (!cctx) {
- throw std::runtime_error("Error while calling ZSTD_createCCtx() for zstd.");
+ cctx_ = ZSTD_createCCtx();
+ if (!cctx_) {
+ throw CompressionError("Error while calling ZSTD_createCCtx() for zstd.");
}
}
void ZSTDCompressionStream::end() {
- (void)ZSTD_freeCCtx(cctx);
- cctx = nullptr;
+ (void)ZSTD_freeCCtx(cctx_);
+ cctx_ = nullptr;
}
DIAGNOSTIC_PUSH
@@ -1089,8 +1186,8 @@ namespace orc {
class ZSTDDecompressionStream : public BlockDecompressionStream {
public:
ZSTDDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
- MemoryPool& _pool, ReaderMetrics* _metrics)
- : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) {
+ MemoryPool& pool, ReaderMetrics* metrics)
+ : BlockDecompressionStream(std::move(inStream), blockSize, pool, metrics) {
this->init();
}
@@ -1111,13 +1208,13 @@ namespace orc {
private:
void init();
void end();
- ZSTD_DCtx* dctx;
+ ZSTD_DCtx* dctx_;
};
uint64_t ZSTDDecompressionStream::decompress(const char* inputPtr, uint64_t length, char* output,
size_t maxOutputLength) {
return static_cast<uint64_t>(
- ZSTD_decompressDCtx(dctx, output, maxOutputLength, inputPtr, length));
+ ZSTD_decompressDCtx(dctx_, output, maxOutputLength, inputPtr, length));
}
DIAGNOSTIC_PUSH
@@ -1127,25 +1224,23 @@ namespace orc {
#endif
void ZSTDDecompressionStream::init() {
- dctx = ZSTD_createDCtx();
- if (!dctx) {
- throw std::runtime_error("Error while calling ZSTD_createDCtx() for zstd.");
+ dctx_ = ZSTD_createDCtx();
+ if (!dctx_) {
+ throw CompressionError("Error while calling ZSTD_createDCtx() for zstd.");
}
}
void ZSTDDecompressionStream::end() {
- (void)ZSTD_freeDCtx(dctx);
- dctx = nullptr;
+ (void)ZSTD_freeDCtx(dctx_);
+ dctx_ = nullptr;
}
DIAGNOSTIC_PUSH
- std::unique_ptr<BufferedOutputStream> createCompressor(CompressionKind kind,
- OutputStream* outStream,
- CompressionStrategy strategy,
- uint64_t bufferCapacity,
- uint64_t compressionBlockSize,
- MemoryPool& pool, WriterMetrics* metrics) {
+ std::unique_ptr<BufferedOutputStream> createCompressor(
+ CompressionKind kind, OutputStream* outStream, CompressionStrategy strategy,
+ uint64_t bufferCapacity, uint64_t compressionBlockSize, uint64_t memoryBlockSize,
+ MemoryPool& pool, WriterMetrics* metrics) {
switch (static_cast<int64_t>(kind)) {
case CompressionKind_NONE: {
return std::make_unique<BufferedOutputStream>(pool, outStream, bufferCapacity,
@@ -1154,8 +1249,8 @@ namespace orc {
case CompressionKind_ZLIB: {
int level =
(strategy == CompressionStrategy_SPEED) ? Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION;
- return std::make_unique<ZlibCompressionStream>(outStream, level, bufferCapacity,
- compressionBlockSize, pool, metrics);
+ return std::make_unique<ZlibCompressionStream>(
+ outStream, level, bufferCapacity, compressionBlockSize, memoryBlockSize, pool, metrics);
}
case CompressionKind_ZSTD: {
int level = (strategy == CompressionStrategy_SPEED) ? 1 : ZSTD_CLEVEL_DEFAULT;
diff --git a/contrib/libs/apache/orc/c++/src/Compression.hh b/contrib/libs/apache/orc/c++/src/Compression.hh
index 55b152dd63..24170c56b4 100644
--- a/contrib/libs/apache/orc/c++/src/Compression.hh
+++ b/contrib/libs/apache/orc/c++/src/Compression.hh
@@ -42,15 +42,16 @@ namespace orc {
* @param outStream the output stream that is the underlying target
* @param strategy compression strategy
* @param bufferCapacity compression stream buffer total capacity
- * @param compressionBlockSize compression buffer block size
+ * @param compressionBlockSize compression is triggered when the original input buffer size
+ * reaches this size
+ * @param memoryBlockSize the block size for original input buffer
* @param pool the memory pool
+ * @param metrics the writer metrics
*/
- std::unique_ptr<BufferedOutputStream> createCompressor(CompressionKind kind,
- OutputStream* outStream,
- CompressionStrategy strategy,
- uint64_t bufferCapacity,
- uint64_t compressionBlockSize,
- MemoryPool& pool, WriterMetrics* metrics);
+ std::unique_ptr<BufferedOutputStream> createCompressor(
+ CompressionKind kind, OutputStream* outStream, CompressionStrategy strategy,
+ uint64_t bufferCapacity, uint64_t compressionBlockSize, uint64_t memoryBlockSize,
+ MemoryPool& pool, WriterMetrics* metrics);
} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc b/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc
index 459cafa1a0..a9003bc163 100644
--- a/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc
+++ b/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc
@@ -17,15 +17,18 @@
*/
#include "ConvertColumnReader.hh"
+#include "Utils.hh"
+
+#include <optional>
namespace orc {
// Assume that we are using tight numeric vector batch
using BooleanVectorBatch = ByteVectorBatch;
- ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
- StripeStreams& stripe, bool _throwOnOverflow)
- : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) {
+ ConvertColumnReader::ConvertColumnReader(const Type& readType, const Type& fileType,
+ StripeStreams& stripe, bool throwOnOverflow)
+ : ColumnReader(readType, stripe), readType(readType), throwOnOverflow(throwOnOverflow) {
reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true,
/*throwOnOverflow=*/false, /*convertToReadType*/ false);
data =
@@ -72,6 +75,23 @@ namespace orc {
}
}
+ static inline void handleParseFromStringError(ColumnVectorBatch& dstBatch, uint64_t idx,
+ bool shouldThrow, const std::string& typeName,
+ const std::string& str,
+ const std::string& expectedFormat = "") {
+ if (!shouldThrow) {
+ dstBatch.notNull.data()[idx] = 0;
+ dstBatch.hasNulls = true;
+ } else {
+ std::ostringstream ss;
+ ss << "Failed to parse " << typeName << " from string:" << str;
+ if (expectedFormat != "") {
+ ss << " the following format \"" << expectedFormat << "\" is expected";
+ }
+ throw SchemaEvolutionError(ss.str());
+ }
+ }
+
// return false if overflow
template <typename ReadType>
static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
@@ -135,9 +155,9 @@ namespace orc {
template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType>
class NumericConvertColumnReader : public ConvertColumnReader {
public:
- NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
- bool _throwOnOverflow)
- : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+ NumericConvertColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe,
+ bool throwOnOverflow)
+ : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {}
void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
ConvertColumnReader::next(rowBatch, numValues, notNull);
@@ -164,9 +184,9 @@ namespace orc {
class NumericConvertColumnReader<FileTypeBatch, BooleanVectorBatch, bool>
: public ConvertColumnReader {
public:
- NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
- bool _throwOnOverflow)
- : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+ NumericConvertColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe,
+ bool throwOnOverflow)
+ : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {}
void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
ConvertColumnReader::next(rowBatch, numValues, notNull);
@@ -188,9 +208,9 @@ namespace orc {
class ConvertToStringVariantColumnReader : public ConvertColumnReader {
public:
- ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
- StripeStreams& stripe, bool _throwOnOverflow)
- : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+ ConvertToStringVariantColumnReader(const Type& readType, const Type& fileType,
+ StripeStreams& stripe, bool throwOnOverflow)
+ : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {}
void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
@@ -225,19 +245,19 @@ namespace orc {
class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
public:
- BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
- StripeStreams& stripe, bool _throwOnOverflow)
- : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
- trueValue = "TRUE";
- falseValue = "FALSE";
+ BooleanToStringVariantColumnReader(const Type& readType, const Type& fileType,
+ StripeStreams& stripe, bool throwOnOverflow)
+ : ConvertToStringVariantColumnReader(readType, fileType, stripe, throwOnOverflow) {
+ trueValue_ = "TRUE";
+ falseValue_ = "FALSE";
if (readType.getKind() == CHAR || readType.getKind() == VARCHAR) {
if (readType.getMaximumLength() < 5) {
throw SchemaEvolutionError("Invalid maximum length for boolean type: " +
std::to_string(readType.getMaximumLength()));
}
if (readType.getKind() == CHAR) {
- trueValue.resize(readType.getMaximumLength(), ' ');
- falseValue.resize(readType.getMaximumLength(), ' ');
+ trueValue_.resize(readType.getMaximumLength(), ' ');
+ falseValue_.resize(readType.getMaximumLength(), ' ');
}
}
}
@@ -245,8 +265,8 @@ namespace orc {
uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
private:
- std::string trueValue;
- std::string falseValue;
+ std::string trueValue_;
+ std::string falseValue_;
};
uint64_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
@@ -257,7 +277,7 @@ namespace orc {
// cast the bool value to string
for (uint64_t i = 0; i < numValues; ++i) {
if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
- strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+ strBuffer[i] = (srcBatch.data[i] ? trueValue_ : falseValue_);
size += strBuffer[i].size();
}
}
@@ -267,9 +287,9 @@ namespace orc {
template <typename FileTypeBatch>
class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
public:
- NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
- StripeStreams& stripe, bool _throwOnOverflow)
- : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+ NumericToStringVariantColumnReader(const Type& readType, const Type& fileType,
+ StripeStreams& stripe, bool throwOnOverflow)
+ : ConvertToStringVariantColumnReader(readType, fileType, stripe, throwOnOverflow) {}
uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
};
@@ -321,13 +341,13 @@ namespace orc {
template <typename FileTypeBatch, typename ReadTypeBatch, bool isFloatingFileType>
class NumericToDecimalColumnReader : public ConvertColumnReader {
public:
- NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
- bool _throwOnOverflow)
- : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
- precision = static_cast<int32_t>(readType.getPrecision());
- scale = static_cast<int32_t>(readType.getScale());
+ NumericToDecimalColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe,
+ bool throwOnOverflow)
+ : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {
+ precision_ = static_cast<int32_t>(readType.getPrecision());
+ scale_ = static_cast<int32_t>(readType.getScale());
bool overflow = false;
- upperBound = scaleUpInt128ByPowerOfTen(1, precision, overflow);
+ upperBound_ = scaleUpInt128ByPowerOfTen(1, precision_, overflow);
}
void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
@@ -335,8 +355,8 @@ namespace orc {
const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
- dstBatch.precision = precision;
- dstBatch.scale = scale;
+ dstBatch.precision = precision_;
+ dstBatch.scale = scale_;
for (uint64_t i = 0; i < numValues; ++i) {
if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
if constexpr (isFloatingFileType) {
@@ -351,7 +371,7 @@ namespace orc {
private:
template <typename SrcType>
void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, SrcType value) {
- const auto result = convertDecimal(value, precision, scale);
+ const auto result = convertDecimal(value, precision_, scale_);
Int128 i128 = result.second;
if (result.first) {
handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
@@ -372,7 +392,7 @@ namespace orc {
template <typename SrcType>
void convertIntegerToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, SrcType value) {
int fromScale = 0;
- auto result = convertDecimal(value, fromScale, precision, scale);
+ auto result = convertDecimal(value, fromScale, precision_, scale_);
if (result.first) {
handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
} else {
@@ -388,24 +408,25 @@ namespace orc {
}
}
- int32_t precision;
- int32_t scale;
- int64_t scaleMultiplier;
- Int128 upperBound;
+ int32_t precision_;
+ int32_t scale_;
+ int64_t scaleMultiplier_;
+ Int128 upperBound_;
};
class ConvertToTimestampColumnReader : public ConvertColumnReader {
public:
- ConvertToTimestampColumnReader(const Type& _readType, const Type& fileType,
- StripeStreams& stripe, bool _throwOnOverflow)
- : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow),
- readerTimezone(readType.getKind() == TIMESTAMP_INSTANT ? &getTimezoneByName("GMT")
- : &stripe.getReaderTimezone()),
+ ConvertToTimestampColumnReader(const Type& readType, const Type& fileType,
+ StripeStreams& stripe, bool throwOnOverflow)
+ : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow),
+ isInstant(readType.getKind() == TIMESTAMP_INSTANT),
+ readerTimezone(isInstant ? &getTimezoneByName("GMT") : &stripe.getReaderTimezone()),
needConvertTimezone(readerTimezone != &getTimezoneByName("GMT")) {}
void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
protected:
+ const bool isInstant;
const orc::Timezone* readerTimezone;
const bool needConvertTimezone;
};
@@ -419,9 +440,9 @@ namespace orc {
template <typename FileTypeBatch>
class NumericToTimestampColumnReader : public ConvertToTimestampColumnReader {
public:
- NumericToTimestampColumnReader(const Type& _readType, const Type& fileType,
- StripeStreams& stripe, bool _throwOnOverflow)
- : ConvertToTimestampColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+ NumericToTimestampColumnReader(const Type& readType, const Type& fileType,
+ StripeStreams& stripe, bool throwOnOverflow)
+ : ConvertToTimestampColumnReader(readType, fileType, stripe, throwOnOverflow) {}
void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull);
@@ -469,14 +490,14 @@ namespace orc {
template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType>
class DecimalToNumericColumnReader : public ConvertColumnReader {
public:
- DecimalToNumericColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
- bool _throwOnOverflow)
- : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
- precision = fileType.getPrecision();
- scale = fileType.getScale();
- factor = 1;
- for (int i = 0; i < scale; i++) {
- factor *= 10;
+ DecimalToNumericColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe,
+ bool throwOnOverflow)
+ : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {
+ precision_ = fileType.getPrecision();
+ scale_ = fileType.getScale();
+ factor_ = 1;
+ for (int i = 0; i < scale_; i++) {
+ factor_ *= 10;
}
}
@@ -500,7 +521,7 @@ namespace orc {
void convertDecimalToInteger(ReadTypeBatch& dstBatch, uint64_t idx,
const FileTypeBatch& srcBatch) {
using FileType = decltype(srcBatch.values[idx]);
- Int128 result = scaleDownInt128ByPowerOfTen(srcBatch.values[idx], scale);
+ Int128 result = scaleDownInt128ByPowerOfTen(srcBatch.values[idx], scale_);
if (!result.fitsInLong()) {
handleOverflow<FileType, ReadType>(dstBatch, idx, throwOnOverflow);
return;
@@ -512,21 +533,21 @@ namespace orc {
void convertDecimalToDouble(ReadTypeBatch& dstBatch, uint64_t idx,
const FileTypeBatch& srcBatch) {
double doubleValue = Int128(srcBatch.values[idx]).toDouble();
- dstBatch.data[idx] = static_cast<ReadType>(doubleValue) / static_cast<ReadType>(factor);
+ dstBatch.data[idx] = static_cast<ReadType>(doubleValue) / static_cast<ReadType>(factor_);
}
- int32_t precision;
- int32_t scale;
- int64_t factor;
+ int32_t precision_;
+ int32_t scale_;
+ int64_t factor_;
};
template <typename FileTypeBatch>
class DecimalToNumericColumnReader<FileTypeBatch, BooleanVectorBatch, bool>
: public ConvertColumnReader {
public:
- DecimalToNumericColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
- bool _throwOnOverflow)
- : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+ DecimalToNumericColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe,
+ bool throwOnOverflow)
+ : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {}
void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
ConvertColumnReader::next(rowBatch, numValues, notNull);
@@ -544,13 +565,13 @@ namespace orc {
template <typename FileTypeBatch, typename ReadTypeBatch>
class DecimalConvertColumnReader : public ConvertColumnReader {
public:
- DecimalConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
- bool _throwOnOverflow)
- : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
- fromPrecision = fileType.getPrecision();
- fromScale = fileType.getScale();
- toPrecision = _readType.getPrecision();
- toScale = _readType.getScale();
+ DecimalConvertColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe,
+ bool throwOnOverflow)
+ : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {
+ fromPrecision_ = fileType.getPrecision();
+ fromScale_ = fileType.getScale();
+ toPrecision_ = readType.getPrecision();
+ toScale_ = readType.getScale();
}
void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
@@ -572,7 +593,7 @@ namespace orc {
using ReadType = decltype(dstBatch.values[idx]);
auto [overflows, resultI128] =
- convertDecimal(srcBatch.values[idx], fromScale, toPrecision, toScale);
+ convertDecimal(srcBatch.values[idx], fromScale_, toPrecision_, toScale_);
if (overflows) {
handleOverflow<FileType, ReadType>(dstBatch, idx, throwOnOverflow);
}
@@ -587,10 +608,423 @@ namespace orc {
}
}
- int32_t fromPrecision;
- int32_t fromScale;
- int32_t toPrecision;
- int32_t toScale;
+ int32_t fromPrecision_;
+ int32_t fromScale_;
+ int32_t toPrecision_;
+ int32_t toScale_;
+ };
+
+ // Converts a DECIMAL column read from the file into timestamp values. The decimal is taken as
+ // a number of seconds: its integral part becomes TimestampVectorBatch::data and its fractional
+ // part, rescaled to nine digits, becomes TimestampVectorBatch::nanoseconds.
+ template <typename FileTypeBatch>
+ class DecimalToTimestampColumnReader : public ConvertToTimestampColumnReader {
+  public:
+   // precision and scale are taken from fileType, i.e. the decimal type stored in the file.
+   DecimalToTimestampColumnReader(const Type& readType, const Type& fileType,
+                                  StripeStreams& stripe, bool throwOnOverflow)
+       : ConvertToTimestampColumnReader(readType, fileType, stripe, throwOnOverflow),
+         precision_(static_cast<int32_t>(fileType.getPrecision())),
+         scale_(static_cast<int32_t>(fileType.getScale())) {}
+
+   // Reads the next numValues decimals through the base reader, then converts every non-null
+   // row of rowBatch in place.
+   void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+     ConvertColumnReader::next(rowBatch, numValues, notNull);
+     const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+     auto& dstBatch = *SafeCastBatchTo<TimestampVectorBatch*>(&rowBatch);
+     for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+       if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+         convertDecimalToTimestamp(dstBatch, i, srcBatch);
+       }
+     }
+   }
+
+  private:
+   // Converts srcBatch.values[idx] into dstBatch.data[idx] / dstBatch.nanoseconds[idx].
+   // Values whose integral part lies outside [MIN_EPOCH_SECONDS, MAX_EPOCH_SECONDS] are handed
+   // to handleOverflow.
+   void convertDecimalToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx,
+                                  const FileTypeBatch& srcBatch) {
+     constexpr int SecondToNanoFactor = 9;
+     // Integer constant: avoids routing the adjustment through a double literal (1e9).
+     constexpr int64_t NanosPerSecond = 1000000000;
+     // Following constant comes from java.time.Instant
+     // '-1000000000-01-01T00:00Z'
+     constexpr int64_t MIN_EPOCH_SECONDS = -31557014167219200L;
+     // '1000000000-12-31T23:59:59.999999999Z'
+     constexpr int64_t MAX_EPOCH_SECONDS = 31556889864403199L;
+     // dummy variable, there's no risk of overflow
+     bool overflow = false;
+
+     Int128 i128(srcBatch.values[idx]);
+     Int128 integerPortion = scaleDownInt128ByPowerOfTen(i128, scale_);
+     if (integerPortion < MIN_EPOCH_SECONDS || integerPortion > MAX_EPOCH_SECONDS) {
+       handleOverflow<Decimal, int64_t>(dstBatch, idx, throwOnOverflow);
+       return;
+     }
+     // What is left after removing the integral seconds is the fraction, still at scale_ digits.
+     i128 -= scaleUpInt128ByPowerOfTen(integerPortion, scale_, overflow);
+     Int128 fractionPortion = i128;
+     // Rescale the fraction to exactly nine digits (nanoseconds).
+     if (scale_ < SecondToNanoFactor) {
+       fractionPortion =
+           scaleUpInt128ByPowerOfTen(fractionPortion, SecondToNanoFactor - scale_, overflow);
+     } else {
+       fractionPortion = scaleDownInt128ByPowerOfTen(fractionPortion, scale_ - SecondToNanoFactor);
+     }
+     // Negative decimals give a negative fraction: borrow one second so nanoseconds are >= 0.
+     if (fractionPortion < 0) {
+       fractionPortion += NanosPerSecond;
+       integerPortion -= 1;
+     }
+     // the epoch-seconds range check above guarantees toLong() will not overflow
+     dstBatch.data[idx] = integerPortion.toLong();
+     dstBatch.nanoseconds[idx] = fractionPortion.toLong();
+
+     if (needConvertTimezone) {
+       dstBatch.data[idx] = readerTimezone->convertFromUTC(dstBatch.data[idx]);
+     }
+   }
+
+   const int32_t precision_;
+   const int32_t scale_;
+ };
+
+ // Converts a DECIMAL column read from the file into STRING / CHAR / VARCHAR text using
+ // Int128::toDecimalString with the file type's scale.
+ template <typename FileTypeBatch>
+ class DecimalToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+  public:
+   DecimalToStringVariantColumnReader(const Type& readType, const Type& fileType,
+                                      StripeStreams& stripe, bool throwOnOverflow)
+       : ConvertToStringVariantColumnReader(readType, fileType, stripe, throwOnOverflow),
+         scale_(static_cast<int32_t>(fileType.getScale())) {}
+
+   // Fills strBuffer[0, numValues) with the textual form of every non-null decimal and returns
+   // the total number of bytes of the converted strings. For read types other than STRING the
+   // text is cut to the read type's maximum length.
+   uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override {
+     uint64_t size = 0;
+     strBuffer.resize(numValues);
+     const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+     const bool hasLengthLimit = readType.getKind() != STRING;
+     const auto maxLength = readType.getMaximumLength();
+     // Iterate over numValues, the same bound strBuffer was sized with.
+     for (uint64_t i = 0; i < numValues; ++i) {
+       // Null rows are skipped entirely so that strings left in strBuffer by an earlier batch
+       // are neither truncated nor counted in the returned size.
+       if (rowBatch.hasNulls && !rowBatch.notNull[i]) {
+         continue;
+       }
+       auto& converted = strBuffer[i];
+       converted = Int128(srcBatch.values[i]).toDecimalString(scale_, true);
+       if (hasLengthLimit && converted.size() > maxLength) {
+         converted.resize(maxLength);
+       }
+       size += converted.size();
+     }
+     return size;
+   }
+
+  private:
+   const int32_t scale_;
+ };
+
+ // Converts STRING / CHAR / VARCHAR column values into a boolean, integer or floating point
+ // column. ReadType selects the conversion: floating point types are parsed with
+ // std::stof / std::stod, everything else with std::stoll.
+ template <typename ReadTypeBatch, typename ReadType>
+ class StringVariantToNumericColumnReader : public ConvertColumnReader {
+  public:
+   StringVariantToNumericColumnReader(const Type& readType, const Type& fileType,
+                                      StripeStreams& stripe, bool throwOnOverflow)
+       : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {}
+
+   // Reads the next numValues strings through the base reader and converts each non-null row.
+   void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+     ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+     const auto& srcBatch = *SafeCastBatchTo<const StringVectorBatch*>(data.get());
+     auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+     for (uint64_t i = 0; i < numValues; ++i) {
+       if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+         if constexpr (std::is_floating_point_v<ReadType>) {
+           convertToDouble(dstBatch, srcBatch, i);
+         } else {
+           convertToInteger(dstBatch, srcBatch, i);
+         }
+       }
+     }
+   }
+
+  private:
+   // Parses row idx as a 64-bit integer. For bool targets any non-zero value maps to 1; for
+   // other targets a value that does not fit ReadType goes to handleOverflow. A std::stoll
+   // failure goes to handleParseFromStringError.
+   void convertToInteger(ReadTypeBatch& dstBatch, const StringVectorBatch& srcBatch,
+                         uint64_t idx) {
+     int64_t longValue = 0;
+     const std::string longStr(srcBatch.data[idx], srcBatch.length[idx]);
+     try {
+       longValue = std::stoll(longStr);
+     } catch (...) {
+       handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Long", longStr);
+       return;
+     }
+     if constexpr (std::is_same_v<ReadType, bool>) {
+       dstBatch.data[idx] = longValue == 0 ? 0 : 1;
+     } else {
+       if (!downCastToInteger(dstBatch.data[idx], longValue)) {
+         handleOverflow<std::string, ReadType>(dstBatch, idx, throwOnOverflow);
+       }
+     }
+   }
+
+   // Parses row idx as float or double depending on ReadType. A std::stof / std::stod failure
+   // goes to handleParseFromStringError.
+   void convertToDouble(ReadTypeBatch& dstBatch, const StringVectorBatch& srcBatch, uint64_t idx) {
+     // Name the target type explicitly; typeid(readType).name() would only yield the
+     // implementation-defined name of class Type, which says nothing about the target.
+     constexpr const char* targetTypeName = std::is_same_v<ReadType, float> ? "Float" : "Double";
+     const std::string floatValue(srcBatch.data[idx], srcBatch.length[idx]);
+     try {
+       if constexpr (std::is_same_v<ReadType, float>) {
+         dstBatch.data[idx] = std::stof(floatValue);
+       } else {
+         dstBatch.data[idx] = std::stod(floatValue);
+       }
+     } catch (...) {
+       handleParseFromStringError(dstBatch, idx, throwOnOverflow, targetTypeName, floatValue);
+     }
+   }
+ };
+
+ // Converts between the string variants STRING / CHAR / VARCHAR. STRING targets copy the
+ // source bytes, VARCHAR targets are truncated to the read type's maximum length, and CHAR
+ // targets are truncated and then right-padded with spaces.
+ class StringVariantConvertColumnReader : public ConvertToStringVariantColumnReader {
+  public:
+   StringVariantConvertColumnReader(const Type& readType, const Type& fileType,
+                                    StripeStreams& stripe, bool throwOnOverflow)
+       : ConvertToStringVariantColumnReader(readType, fileType, stripe, throwOnOverflow) {}
+
+   // Fills strBuffer[0, numValues) with the converted text of every non-null row and returns
+   // the total number of bytes of the converted strings.
+   // Throws SchemaEvolutionError when the read type is not STRING, VARCHAR or CHAR.
+   uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override {
+     uint64_t size = 0;
+     strBuffer.resize(numValues);
+     const auto& srcBatch = *SafeCastBatchTo<const StringVectorBatch*>(data.get());
+     const auto maxLength = readType.getMaximumLength();
+     const auto targetKind = readType.getKind();
+     if (targetKind == STRING) {
+       for (uint64_t i = 0; i < numValues; ++i) {
+         if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+           strBuffer[i] = std::string(srcBatch.data[i], srcBatch.length[i]);
+           size += strBuffer[i].size();
+         }
+       }
+     } else if (targetKind == VARCHAR) {
+       for (uint64_t i = 0; i < numValues; ++i) {
+         if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+           const char* charData = srcBatch.data[i];
+           uint64_t originLength = srcBatch.length[i];
+           uint64_t itemLength = Utf8Utils::truncateBytesTo(maxLength, charData, originLength);
+           strBuffer[i] = std::string(charData, itemLength);
+           size += strBuffer[i].length();
+         }
+       }
+     } else if (targetKind == CHAR) {
+       for (uint64_t i = 0; i < numValues; ++i) {
+         if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+           const char* charData = srcBatch.data[i];
+           uint64_t originLength = srcBatch.length[i];
+           uint64_t charLength = Utf8Utils::charLength(charData, originLength);
+           auto itemLength = Utf8Utils::truncateBytesTo(maxLength, charData, originLength);
+           strBuffer[i] = std::string(charData, itemLength);
+           // the padding is exactly 1 byte per char
+           if (charLength < maxLength) {
+             strBuffer[i].resize(itemLength + maxLength - charLength, ' ');
+           }
+           size += strBuffer[i].length();
+         }
+       }
+     } else {
+       // This reader converts string variants, so the message names that conversion.
+       throw SchemaEvolutionError("Invalid type for string to string conversion: " +
+                                  readType.toString());
+     }
+     return size;
+   }
+ };
+
+ // Converts STRING / CHAR / VARCHAR column values into TIMESTAMP or TIMESTAMP_INSTANT values.
+ // Accepted text is "yyyy-mm-dd hh:mm:ss[.fraction]", followed by " timezone" when the read
+ // type is TIMESTAMP_INSTANT. Values that cannot be parsed go to handleParseFromStringError.
+ class StringVariantToTimestampColumnReader : public ConvertToTimestampColumnReader {
+  public:
+   StringVariantToTimestampColumnReader(const Type& readType, const Type& fileType,
+                                        StripeStreams& stripe, bool throwOnOverflow)
+       : ConvertToTimestampColumnReader(readType, fileType, stripe, throwOnOverflow) {}
+
+   // Reads the next numValues strings through the base reader and converts each non-null row.
+   void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+     ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull);
+
+     const auto& srcBatch = *SafeCastBatchTo<const StringVectorBatch*>(data.get());
+     auto& dstBatch = *SafeCastBatchTo<TimestampVectorBatch*>(&rowBatch);
+
+     for (uint64_t i = 0; i < numValues; ++i) {
+       if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+         convertToTimestamp(dstBatch, i, std::string(srcBatch.data[i], srcBatch.length[i]));
+       }
+     }
+   }
+
+  private:
+   // Algorithm: http://howardhinnant.github.io/date_algorithms.html
+   // The algorithm implements a proleptic Gregorian calendar.
+   // Returns the number of days between 1970-01-01 and the given civil date.
+   int64_t daysFromProlepticGregorianCalendar(int32_t y, int32_t m, int32_t d) {
+     y -= m <= 2;
+     // Floor division by 400: a plain y / 400 truncates toward zero and would assign years
+     // before 0 to the wrong era, breaking the [0, 399] range of yoe below.
+     int32_t era = (y >= 0 ? y : y - 399) / 400;
+     int32_t yoe = y - era * 400;                                   // [0, 399]
+     int32_t doy = (153 * (m + (m > 2 ? -3 : 9)) + 2) / 5 + d - 1;  // [0, 365]
+     int32_t doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;           // [0, 146096]
+     return 1ll * era * 146097 + doe - 719468;
+   }
+
+   // Parses "yyyy-mm-dd hh:mm:ss[.fraction]" from the start of timeStr.
+   // Returns {seconds since epoch, nanoseconds}, or std::nullopt when the six date/time fields
+   // cannot be read, the fraction is negative, or it has more than nine digits.
+   std::optional<std::pair<int64_t, int64_t>> tryBestToParseFromString(
+       const std::string& timeStr) {
+     int32_t year = 0, month = 0, day = 0, hour = 0, min = 0, sec = 0, nanos = 0;
+     // Offsets of the text consumed by the fraction's %d; only meaningful when matched == 7.
+     int fractionBegin = 0;
+     int fractionEnd = 0;
+     int32_t matched =
+         std::sscanf(timeStr.c_str(), "%4d-%2d-%2d %2d:%2d:%2d.%n%d%n", &year, &month, &day,
+                     &hour, &min, &sec, &fractionBegin, &nanos, &fractionEnd);
+     if (matched != 6 && matched != 7) {
+       return std::nullopt;
+     }
+     if (matched == 7 && nanos != 0) {
+       if (nanos < 0 || nanos >= 1000000000) {
+         return std::nullopt;
+       }
+       // Scale by the number of digits actually written, not by the magnitude of the parsed
+       // value: ".05" must become 50000000 ns, whereas magnitude-based scaling loses the
+       // leading zero and yields 500000000 ns.
+       int32_t digitCount = 0;
+       for (int pos = fractionBegin; pos < fractionEnd; ++pos) {
+         const char c = timeStr[static_cast<size_t>(pos)];
+         if (c >= '0' && c <= '9') {
+           ++digitCount;
+         }
+       }
+       if (digitCount > 9) {
+         return std::nullopt;
+       }
+       for (; digitCount < 9; ++digitCount) {
+         nanos *= 10;
+       }
+     }
+     int64_t daysSinceEpoch = daysFromProlepticGregorianCalendar(year, month, day);
+     int64_t secondSinceEpoch = 60ll * (60 * (24L * daysSinceEpoch + hour) + min) + sec;
+     return std::make_optional(std::pair<int64_t, int64_t>{secondSinceEpoch, nanos});
+   }
+
+   // Parses timeStr and stores the result in dstBatch.data[idx] / dstBatch.nanoseconds[idx].
+   // For TIMESTAMP_INSTANT the timezone name after the second space is looked up with
+   // getTimezoneByName; otherwise the reader timezone is applied when needConvertTimezone is set.
+   void convertToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx,
+                           const std::string& timeStr) {
+     // Expected timestamp_instant format string : yyyy-mm-dd hh:mm:ss[.xxx] timezone
+     //  Eg. "2019-07-09 13:11:00 America/Los_Angeles"
+     // Expected timestamp format string         : yyyy-mm-dd hh:mm:ss[.xxx]
+     //  Eg. "2019-07-09 13:11:00"
+     static const std::string expectedTimestampInstantFormat =
+         "yyyy-mm-dd hh:mm:ss[.xxx] timezone";
+     static const std::string expectedTimestampFormat = "yyyy-mm-dd hh:mm:ss[.xxx]";
+     auto timestamp = tryBestToParseFromString(timeStr);
+     if (!timestamp.has_value()) {
+       if (!isInstant) {
+         handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp", timeStr,
+                                    expectedTimestampFormat);
+         return;
+       }
+       handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr,
+                                  expectedTimestampInstantFormat);
+       return;
+     }
+
+     auto& [second, nanos] = timestamp.value();
+
+     if (isInstant) {
+       size_t pos = 0;  // get the name of timezone
+       // Skip past the space between date and time, then find the one before the timezone.
+       pos = timeStr.find(' ', pos) + 1;
+       pos = timeStr.find(' ', pos);
+       if (pos == std::string::npos) {
+         handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr,
+                                    expectedTimestampInstantFormat);
+         return;
+       }
+       pos += 1;
+       size_t subStrLength = timeStr.length() - pos;
+       try {
+         second = getTimezoneByName(timeStr.substr(pos, subStrLength)).convertFromUTC(second);
+       } catch (const TimezoneError&) {
+         handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr,
+                                    expectedTimestampInstantFormat);
+         return;
+       }
+     } else {
+       if (needConvertTimezone) {
+         second = readerTimezone->convertFromUTC(second);
+       }
+     }
+     dstBatch.data[idx] = second;
+     dstBatch.nanoseconds[idx] = nanos;
+   }
+ };
+
+ // Converts STRING / CHAR / VARCHAR column values into a Decimal64 or Decimal128 column using
+ // the precision and scale of the read type. Accepted text is an optional leading '-', one or
+ // more digits and an optional '.' followed by one or more digits.
+ template <typename ReadTypeBatch>
+ class StringVariantToDecimalColumnReader : public ConvertColumnReader {
+  public:
+   StringVariantToDecimalColumnReader(const Type& readType, const Type& fileType,
+                                      StripeStreams& stripe, bool throwOnOverflow)
+       : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow),
+         precision_(static_cast<int32_t>(readType.getPrecision())),
+         scale_(static_cast<int32_t>(readType.getScale())) {}
+
+   // Reads the next numValues strings through the base reader and converts each non-null row.
+   void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+     ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+     const auto& srcBatch = *SafeCastBatchTo<const StringVectorBatch*>(data.get());
+     auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+     for (uint64_t i = 0; i < numValues; ++i) {
+       if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+         convertToDecimal(dstBatch, i, std::string(srcBatch.data[i], srcBatch.length[i]));
+       }
+     }
+   }
+
+  private:
+   // Parses decimalStr and stores the value, rescaled to (precision_, scale_), in
+   // dstBatch.values[idx]. Malformed text goes to handleParseFromStringError; a well-formed
+   // value that does not fit the target goes to handleOverflow.
+   void convertToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, const std::string& decimalStr) {
+     constexpr int32_t MAX_PRECISION_128 = 38;
+     // Plain range test instead of ::isdigit, whose behavior is undefined for negative char
+     // values (bytes >= 0x80 on platforms where char is signed).
+     const auto isAsciiDigit = [](char c) { return c >= '0' && c <= '9'; };
+     int32_t fromPrecision = 0;
+     int32_t fromScale = 0;
+     uint32_t start = 0;
+     bool negative = false;
+     if (decimalStr.empty()) {
+       handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr);
+       return;
+     }
+     auto dotPos = decimalStr.find('.');
+     if (dotPos == std::string::npos) {
+       fromScale = 0;
+       fromPrecision = decimalStr.length();
+       dotPos = decimalStr.length();
+     } else {
+       // A trailing '.' with no fraction digits is rejected.
+       if (dotPos + 1 == decimalStr.length()) {
+         handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr);
+         return;
+       }
+       fromPrecision = decimalStr.length() - 1;
+       fromScale = decimalStr.length() - dotPos - 1;
+     }
+     if (decimalStr.front() == '-') {
+       negative = true;
+       start++;
+       fromPrecision--;
+     }
+     const std::string integerPortion = decimalStr.substr(start, dotPos - start);
+     if (dotPos == start || fromPrecision > MAX_PRECISION_128 || fromPrecision <= 0 ||
+         !std::all_of(integerPortion.begin(), integerPortion.end(), isAsciiDigit)) {
+       handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr);
+       return;
+     }
+     // Empty when the input has no '.'; otherwise everything after the '.'.
+     const std::string fractionPortion =
+         fromScale > 0 ? decimalStr.substr(dotPos + 1, fromScale) : std::string();
+     if (!std::all_of(fractionPortion.begin(), fractionPortion.end(), isAsciiDigit)) {
+       // A non-digit in the fraction is malformed input, not an overflow.
+       handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr);
+       return;
+     }
+
+     Int128 i128;
+     try {
+       bool overflow = false;
+       i128 = Int128(integerPortion);
+       // overflow won't happen
+       i128 *= scaleUpInt128ByPowerOfTen(Int128(1), fromScale, overflow);
+       if (!fractionPortion.empty()) {
+         i128 += Int128(fractionPortion);
+       }
+     } catch (const std::exception&) {
+       handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr);
+       return;
+     }
+
+     auto [overflow, result] = convertDecimal(i128, fromScale, precision_, scale_);
+     if (overflow) {
+       handleOverflow<std::string, Int128>(dstBatch, idx, throwOnOverflow);
+       return;
+     }
+     // The digits were parsed without the sign; apply it after rescaling.
+     if (negative) {
+       result.negate();
+     }
+
+     if constexpr (std::is_same_v<ReadTypeBatch, Decimal128VectorBatch>) {
+       dstBatch.values[idx] = result;
+     } else {
+       if (!result.fitsInLong()) {
+         handleOverflow<std::string, decltype(dstBatch.values[idx])>(dstBatch, idx,
+                                                                     throwOnOverflow);
+       } else {
+         dstBatch.values[idx] = result.toLong();
+       }
+     }
+   }
+
+   const int32_t precision_;
+   const int32_t scale_;
};
#define DEFINE_NUMERIC_CONVERT_READER(FROM, TO, TYPE) \
@@ -621,6 +1055,26 @@ namespace orc {
using Decimal128##To##TO##ColumnReader = \
DecimalConvertColumnReader<Decimal128VectorBatch, TO##VectorBatch>;
+#define DEFINE_DECIMAL_CONVERT_TO_TIMESTAMP_READER \
+ using Decimal64ToTimestampColumnReader = DecimalToTimestampColumnReader<Decimal64VectorBatch>; \
+ using Decimal128ToTimestampColumnReader = DecimalToTimestampColumnReader<Decimal128VectorBatch>;
+
+#define DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(TO) \
+ using Decimal64To##TO##ColumnReader = DecimalToStringVariantColumnReader<Decimal64VectorBatch>; \
+ using Decimal128To##TO##ColumnReader = DecimalToStringVariantColumnReader<Decimal128VectorBatch>;
+
+#define DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(FROM, TO, TYPE) \
+ using FROM##To##TO##ColumnReader = StringVariantToNumericColumnReader<TO##VectorBatch, TYPE>;
+
+#define DEFINE_STRING_VARIANT_CONVERT_READER(FROM, TO) \
+ using FROM##To##TO##ColumnReader = StringVariantConvertColumnReader;
+
+#define DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(FROM, TO) \
+ using FROM##To##TO##ColumnReader = StringVariantToTimestampColumnReader;
+
+#define DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(FROM, TO) \
+ using FROM##To##TO##ColumnReader = StringVariantToDecimalColumnReader<TO##VectorBatch>;
+
DEFINE_NUMERIC_CONVERT_READER(Boolean, Byte, int8_t)
DEFINE_NUMERIC_CONVERT_READER(Boolean, Short, int16_t)
DEFINE_NUMERIC_CONVERT_READER(Boolean, Int, int32_t)
@@ -720,8 +1174,62 @@ namespace orc {
DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER(Decimal64)
DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER(Decimal128)
+ DEFINE_DECIMAL_CONVERT_TO_TIMESTAMP_READER
+ DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(String)
+ DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(Char)
+ DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(Varchar)
+
+ // String variant to numeric
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Boolean, bool)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Byte, int8_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Short, int16_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Int, int32_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Long, int64_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Float, float)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Double, double)
+
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Boolean, bool)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Byte, int8_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Short, int16_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Int, int32_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Long, int64_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Float, float)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Double, double)
+
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Boolean, bool)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Byte, int8_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Short, int16_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Int, int32_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Long, int64_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Float, float)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Double, double)
+
+ // String variant to string variant
+ DEFINE_STRING_VARIANT_CONVERT_READER(String, String)
+ DEFINE_STRING_VARIANT_CONVERT_READER(String, Char)
+ DEFINE_STRING_VARIANT_CONVERT_READER(String, Varchar)
+ DEFINE_STRING_VARIANT_CONVERT_READER(Char, Char)
+ DEFINE_STRING_VARIANT_CONVERT_READER(Char, String)
+ DEFINE_STRING_VARIANT_CONVERT_READER(Char, Varchar)
+ DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, String)
+ DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, Char)
+ DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, Varchar)
+
+ // String variant to timestamp
+ DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(String, Timestamp)
+ DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(Char, Timestamp)
+ DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(Varchar, Timestamp)
+
+ // String variant to decimal
+ DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(String, Decimal64)
+ DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(String, Decimal128)
+ DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Char, Decimal64)
+ DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Char, Decimal128)
+ DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Varchar, Decimal64)
+ DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Varchar, Decimal128)
+
#define CREATE_READER(NAME) \
- return std::make_unique<NAME>(_readType, fileType, stripe, throwOnOverflow);
+ return std::make_unique<NAME>(readType, fileType, stripe, throwOnOverflow);
#define CASE_CREATE_READER(TYPE, CONVERT) \
case TYPE: \
@@ -744,7 +1252,7 @@ namespace orc {
#define CASE_CREATE_DECIMAL_READER(FROM) \
case DECIMAL: { \
- if (isDecimal64(_readType)) { \
+ if (isDecimal64(readType)) { \
CREATE_READER(FROM##ToDecimal64ColumnReader) \
} else { \
CREATE_READER(FROM##ToDecimal128ColumnReader) \
@@ -754,7 +1262,7 @@ namespace orc {
#define CASE_EXCEPTION \
default: \
throw SchemaEvolutionError("Cannot convert from " + fileType.toString() + " to " + \
- _readType.toString());
+ readType.toString());
std::unique_ptr<ColumnReader> buildConvertReader(const Type& fileType, StripeStreams& stripe,
bool useTightNumericVector,
@@ -764,11 +1272,11 @@ namespace orc {
"SchemaEvolution only support tight vector, please create ColumnVectorBatch with "
"option useTightNumericVector");
}
- const auto& _readType = *stripe.getSchemaEvolution()->getReadType(fileType);
+ const auto& readType = *stripe.getSchemaEvolution()->getReadType(fileType);
switch (fileType.getKind()) {
case BOOLEAN: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_READER(BYTE, BooleanToByte)
CASE_CREATE_READER(SHORT, BooleanToShort)
CASE_CREATE_READER(INT, BooleanToInt)
@@ -792,7 +1300,7 @@ namespace orc {
}
}
case BYTE: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, ByteToBoolean)
CASE_CREATE_READER(SHORT, ByteToShort)
CASE_CREATE_READER(INT, ByteToInt)
@@ -816,7 +1324,7 @@ namespace orc {
}
}
case SHORT: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, ShortToBoolean)
CASE_CREATE_READER(BYTE, ShortToByte)
CASE_CREATE_READER(INT, ShortToInt)
@@ -840,7 +1348,7 @@ namespace orc {
}
}
case INT: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, IntToBoolean)
CASE_CREATE_READER(BYTE, IntToByte)
CASE_CREATE_READER(SHORT, IntToShort)
@@ -864,7 +1372,7 @@ namespace orc {
}
}
case LONG: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, LongToBoolean)
CASE_CREATE_READER(BYTE, LongToByte)
CASE_CREATE_READER(SHORT, LongToShort)
@@ -888,7 +1396,7 @@ namespace orc {
}
}
case FLOAT: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, FloatToBoolean)
CASE_CREATE_READER(BYTE, FloatToByte)
CASE_CREATE_READER(SHORT, FloatToShort)
@@ -912,7 +1420,7 @@ namespace orc {
}
}
case DOUBLE: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, DoubleToBoolean)
CASE_CREATE_READER(BYTE, DoubleToByte)
CASE_CREATE_READER(SHORT, DoubleToShort)
@@ -935,15 +1443,8 @@ namespace orc {
CASE_EXCEPTION
}
}
- case STRING:
- case BINARY:
- case TIMESTAMP:
- case LIST:
- case MAP:
- case STRUCT:
- case UNION:
case DECIMAL: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_FROM_DECIMAL_READER(BOOLEAN, Boolean)
CASE_CREATE_FROM_DECIMAL_READER(BYTE, Byte)
CASE_CREATE_FROM_DECIMAL_READER(SHORT, Short)
@@ -951,26 +1452,26 @@ namespace orc {
CASE_CREATE_FROM_DECIMAL_READER(LONG, Long)
CASE_CREATE_FROM_DECIMAL_READER(FLOAT, Float)
CASE_CREATE_FROM_DECIMAL_READER(DOUBLE, Double)
+ CASE_CREATE_FROM_DECIMAL_READER(STRING, String)
+ CASE_CREATE_FROM_DECIMAL_READER(CHAR, Char)
+ CASE_CREATE_FROM_DECIMAL_READER(VARCHAR, Varchar)
+ CASE_CREATE_FROM_DECIMAL_READER(TIMESTAMP, Timestamp)
+ CASE_CREATE_FROM_DECIMAL_READER(TIMESTAMP_INSTANT, Timestamp)
case DECIMAL: {
if (isDecimal64(fileType)) {
- if (isDecimal64(_readType)) {
+ if (isDecimal64(readType)) {
CREATE_READER(Decimal64ToDecimal64ColumnReader)
} else {
CREATE_READER(Decimal64ToDecimal128ColumnReader)
}
} else {
- if (isDecimal64(_readType)) {
+ if (isDecimal64(readType)) {
CREATE_READER(Decimal128ToDecimal64ColumnReader)
} else {
CREATE_READER(Decimal128ToDecimal128ColumnReader)
}
}
}
- case STRING:
- case CHAR:
- case VARCHAR:
- case TIMESTAMP:
- case TIMESTAMP_INSTANT:
case BINARY:
case LIST:
case MAP:
@@ -980,22 +1481,106 @@ namespace orc {
CASE_EXCEPTION
}
}
+ case STRING: {
+ switch (readType.getKind()) {
+ CASE_CREATE_READER(BOOLEAN, StringToBoolean)
+ CASE_CREATE_READER(BYTE, StringToByte)
+ CASE_CREATE_READER(SHORT, StringToShort)
+ CASE_CREATE_READER(INT, StringToInt)
+ CASE_CREATE_READER(LONG, StringToLong)
+ CASE_CREATE_READER(FLOAT, StringToFloat)
+ CASE_CREATE_READER(DOUBLE, StringToDouble)
+ CASE_CREATE_READER(STRING, StringToString)
+ CASE_CREATE_READER(CHAR, StringToChar)
+ CASE_CREATE_READER(VARCHAR, StringToVarchar)
+ CASE_CREATE_READER(TIMESTAMP, StringToTimestamp)
+ CASE_CREATE_READER(TIMESTAMP_INSTANT, StringToTimestamp)
+ case DECIMAL: {
+ if (isDecimal64(readType)) {
+ CREATE_READER(StringToDecimal64ColumnReader)
+ } else {
+ CREATE_READER(StringToDecimal128ColumnReader)
+ }
+ }
+ case BINARY:
+ case LIST:
+ case MAP:
+ case STRUCT:
+ case UNION:
+ case DATE:
+ CASE_EXCEPTION
+ }
+ }
+ case CHAR: {
+ switch (readType.getKind()) {
+ CASE_CREATE_READER(BOOLEAN, CharToBoolean)
+ CASE_CREATE_READER(BYTE, CharToByte)
+ CASE_CREATE_READER(SHORT, CharToShort)
+ CASE_CREATE_READER(INT, CharToInt)
+ CASE_CREATE_READER(LONG, CharToLong)
+ CASE_CREATE_READER(FLOAT, CharToFloat)
+ CASE_CREATE_READER(DOUBLE, CharToDouble)
+ CASE_CREATE_READER(STRING, CharToString)
+ CASE_CREATE_READER(CHAR, CharToChar)
+ CASE_CREATE_READER(VARCHAR, CharToVarchar)
+ CASE_CREATE_READER(TIMESTAMP, CharToTimestamp)
+ CASE_CREATE_READER(TIMESTAMP_INSTANT, CharToTimestamp)
+ case DECIMAL: {
+ if (isDecimal64(readType)) {
+ CREATE_READER(CharToDecimal64ColumnReader)
+ } else {
+ CREATE_READER(CharToDecimal128ColumnReader)
+ }
+ }
+ case BINARY:
+ case LIST:
+ case MAP:
+ case STRUCT:
+ case UNION:
+ case DATE:
+ CASE_EXCEPTION
+ }
+ }
+ case VARCHAR: {
+ switch (readType.getKind()) {
+ CASE_CREATE_READER(BOOLEAN, VarcharToBoolean)
+ CASE_CREATE_READER(BYTE, VarcharToByte)
+ CASE_CREATE_READER(SHORT, VarcharToShort)
+ CASE_CREATE_READER(INT, VarcharToInt)
+ CASE_CREATE_READER(LONG, VarcharToLong)
+ CASE_CREATE_READER(FLOAT, VarcharToFloat)
+ CASE_CREATE_READER(DOUBLE, VarcharToDouble)
+ CASE_CREATE_READER(STRING, VarcharToString)
+ CASE_CREATE_READER(CHAR, VarcharToChar)
+ CASE_CREATE_READER(VARCHAR, VarcharToVarchar)
+ CASE_CREATE_READER(TIMESTAMP, VarcharToTimestamp)
+ CASE_CREATE_READER(TIMESTAMP_INSTANT, VarcharToTimestamp)
+ case DECIMAL: {
+ if (isDecimal64(readType)) {
+ CREATE_READER(VarcharToDecimal64ColumnReader)
+ } else {
+ CREATE_READER(VarcharToDecimal128ColumnReader)
+ }
+ }
+ case BINARY:
+ case LIST:
+ case MAP:
+ case STRUCT:
+ case UNION:
+ case DATE:
+ CASE_EXCEPTION
+ }
+ }
+ case BINARY:
+ case TIMESTAMP:
+ case LIST:
+ case MAP:
+ case STRUCT:
+ case UNION:
case DATE:
- case VARCHAR:
- case CHAR:
case TIMESTAMP_INSTANT:
CASE_EXCEPTION
}
}
-#undef DEFINE_NUMERIC_CONVERT_READER
-#undef DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER
-#undef DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER
-#undef DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER
-#undef DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER
-#undef DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER
-#undef CASE_CREATE_FROM_DECIMAL_READER
-#undef CASE_CREATE_READER
-#undef CASE_EXCEPTION
-
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/CpuInfoUtil.cc b/contrib/libs/apache/orc/c++/src/CpuInfoUtil.cc
index 7e6958deef..588f8dc96a 100644
--- a/contrib/libs/apache/orc/c++/src/CpuInfoUtil.cc
+++ b/contrib/libs/apache/orc/c++/src/CpuInfoUtil.cc
@@ -74,7 +74,7 @@ namespace orc {
#if defined(_WIN32)
//------------------------------ WINDOWS ------------------------------//
- void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cache_sizes) {
+ void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cacheSizes) {
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = nullptr;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer_position = nullptr;
DWORD buffer_size = 0;
@@ -108,8 +108,8 @@ namespace orc {
if (RelationCache == buffer_position->Relationship) {
PCACHE_DESCRIPTOR cache = &buffer_position->Cache;
if (cache->Level >= 1 && cache->Level <= kCacheLevels) {
- const int64_t current = (*cache_sizes)[cache->Level - 1];
- (*cache_sizes)[cache->Level - 1] = std::max<int64_t>(current, cache->Size);
+ const int64_t current = (*cacheSizes)[cache->Level - 1];
+ (*cacheSizes)[cache->Level - 1] = std::max<int64_t>(current, cache->Size);
}
}
offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
@@ -136,23 +136,22 @@ namespace orc {
}
#endif // MINGW
- void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor,
- std::string* model_name) {
+ void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor,
+ std::string* modelName) {
int register_EAX_id = 1;
int highest_valid_id = 0;
int highest_extended_valid_id = 0;
std::bitset<32> features_ECX;
- std::array<int, 4> cpu_info;
+ std::array<int, 4> cpuInfo;
// Get highest valid id
- __cpuid(cpu_info.data(), 0);
- highest_valid_id = cpu_info[0];
+ __cpuid(cpuInfo.data(), 0);
+ highest_valid_id = cpuInfo[0];
// HEX of "GenuineIntel": 47656E75 696E6549 6E74656C
// HEX of "AuthenticAMD": 41757468 656E7469 63414D44
- if (cpu_info[1] == 0x756e6547 && cpu_info[3] == 0x49656e69 && cpu_info[2] == 0x6c65746e) {
+ if (cpuInfo[1] == 0x756e6547 && cpuInfo[3] == 0x49656e69 && cpuInfo[2] == 0x6c65746e) {
*vendor = CpuInfo::Vendor::Intel;
- } else if (cpu_info[1] == 0x68747541 && cpu_info[3] == 0x69746e65 &&
- cpu_info[2] == 0x444d4163) {
+ } else if (cpuInfo[1] == 0x68747541 && cpuInfo[3] == 0x69746e65 && cpuInfo[2] == 0x444d4163) {
*vendor = CpuInfo::Vendor::AMD;
}
@@ -161,19 +160,19 @@ namespace orc {
}
// EAX=1: Processor Info and Feature Bits
- __cpuidex(cpu_info.data(), register_EAX_id, 0);
- features_ECX = cpu_info[2];
+ __cpuidex(cpuInfo.data(), register_EAX_id, 0);
+ features_ECX = cpuInfo[2];
// Get highest extended id
- __cpuid(cpu_info.data(), 0x80000000);
- highest_extended_valid_id = cpu_info[0];
+ __cpuid(cpuInfo.data(), 0x80000000);
+ highest_extended_valid_id = cpuInfo[0];
// Retrieve CPU model name
if (highest_extended_valid_id >= static_cast<int>(0x80000004)) {
- model_name->clear();
+ modelName->clear();
for (int i = 0x80000002; i <= static_cast<int>(0x80000004); ++i) {
- __cpuidex(cpu_info.data(), i, 0);
- *model_name += std::string(reinterpret_cast<char*>(cpu_info.data()), sizeof(cpu_info));
+ __cpuidex(cpuInfo.data(), i, 0);
+ *modelName += std::string(reinterpret_cast<char*>(cpuInfo.data()), sizeof(cpuInfo));
}
}
@@ -184,37 +183,37 @@ namespace orc {
zmm_enabled = (xcr0 & 0xE0) == 0xE0;
}
- if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3;
- if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1;
- if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2;
- if (features_ECX[23]) *hardware_flags |= CpuInfo::POPCNT;
- if (features_ECX[28]) *hardware_flags |= CpuInfo::AVX;
+ if (features_ECX[9]) *hardwareFlags |= CpuInfo::SSSE3;
+ if (features_ECX[19]) *hardwareFlags |= CpuInfo::SSE4_1;
+ if (features_ECX[20]) *hardwareFlags |= CpuInfo::SSE4_2;
+ if (features_ECX[23]) *hardwareFlags |= CpuInfo::POPCNT;
+ if (features_ECX[28]) *hardwareFlags |= CpuInfo::AVX;
// cpuid with EAX=7, ECX=0: Extended Features
register_EAX_id = 7;
if (highest_valid_id > register_EAX_id) {
- __cpuidex(cpu_info.data(), register_EAX_id, 0);
- std::bitset<32> features_EBX = cpu_info[1];
+ __cpuidex(cpuInfo.data(), register_EAX_id, 0);
+ std::bitset<32> features_EBX = cpuInfo[1];
- if (features_EBX[3]) *hardware_flags |= CpuInfo::BMI1;
- if (features_EBX[5]) *hardware_flags |= CpuInfo::AVX2;
- if (features_EBX[8]) *hardware_flags |= CpuInfo::BMI2;
+ if (features_EBX[3]) *hardwareFlags |= CpuInfo::BMI1;
+ if (features_EBX[5]) *hardwareFlags |= CpuInfo::AVX2;
+ if (features_EBX[8]) *hardwareFlags |= CpuInfo::BMI2;
if (zmm_enabled) {
- if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F;
- if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ;
- if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD;
- if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW;
- if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL;
+ if (features_EBX[16]) *hardwareFlags |= CpuInfo::AVX512F;
+ if (features_EBX[17]) *hardwareFlags |= CpuInfo::AVX512DQ;
+ if (features_EBX[28]) *hardwareFlags |= CpuInfo::AVX512CD;
+ if (features_EBX[30]) *hardwareFlags |= CpuInfo::AVX512BW;
+ if (features_EBX[31]) *hardwareFlags |= CpuInfo::AVX512VL;
}
}
}
#elif defined(CPUINFO_ARCH_ARM)
// Windows on Arm
- void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor,
- std::string* model_name) {
- *hardware_flags |= CpuInfo::ASIMD;
- // TODO: vendor, model_name
+ void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor,
+ std::string* modelName) {
+ *hardwareFlags |= CpuInfo::ASIMD;
+ // TODO: vendor, modelName
}
#endif
@@ -236,25 +235,25 @@ namespace orc {
return std::nullopt;
}
- void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cache_sizes) {
+ void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cacheSizes) {
static_assert(kCacheLevels >= 3, "");
auto c = IntegerSysCtlByName("hw.l1dcachesize");
if (c.has_value()) {
- (*cache_sizes)[0] = *c;
+ (*cacheSizes)[0] = *c;
}
c = IntegerSysCtlByName("hw.l2cachesize");
if (c.has_value()) {
- (*cache_sizes)[1] = *c;
+ (*cacheSizes)[1] = *c;
}
c = IntegerSysCtlByName("hw.l3cachesize");
if (c.has_value()) {
- (*cache_sizes)[2] = *c;
+ (*cacheSizes)[2] = *c;
}
}
- void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor,
- std::string* model_name) {
- // hardware_flags
+ void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor,
+ std::string* modelName) {
+ // hardwareFlags
struct SysCtlCpuFeature {
const char* name;
int64_t flag;
@@ -280,13 +279,13 @@ namespace orc {
for (const auto& feature : features) {
auto v = IntegerSysCtlByName(feature.name);
if (v.value_or(0)) {
- *hardware_flags |= feature.flag;
+ *hardwareFlags |= feature.flag;
}
}
- // TODO: vendor, model_name
+ // TODO: vendor, modelName
*vendor = CpuInfo::Vendor::Unknown;
- *model_name = "Unknown";
+ *modelName = "Unknown";
}
#else
@@ -345,7 +344,7 @@ namespace orc {
const struct {
std::string name;
int64_t flag;
- } flag_mappings[] = {
+ } flagMappings[] = {
#if defined(CPUINFO_ARCH_X86)
{"ssse3", CpuInfo::SSSE3},
{"sse4_1", CpuInfo::SSE4_1},
@@ -364,22 +363,22 @@ namespace orc {
{"asimd", CpuInfo::ASIMD},
#endif
};
- const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
+ const int64_t num_flags = sizeof(flagMappings) / sizeof(flagMappings[0]);
int64_t flags = 0;
for (int i = 0; i < num_flags; ++i) {
- if (values.find(flag_mappings[i].name) != std::string::npos) {
- flags |= flag_mappings[i].flag;
+ if (values.find(flagMappings[i].name) != std::string::npos) {
+ flags |= flagMappings[i].flag;
}
}
return flags;
}
- void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cache_sizes) {
+ void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cacheSizes) {
for (int i = 0; i < kCacheLevels; ++i) {
const int64_t cache_size = LinuxGetCacheSize(i);
if (cache_size > 0) {
- (*cache_sizes)[i] = cache_size;
+ (*cacheSizes)[i] = cache_size;
}
}
}
@@ -403,8 +402,8 @@ namespace orc {
}
// Read from /proc/cpuinfo
- void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor,
- std::string* model_name) {
+ void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor,
+ std::string* modelName) {
std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in);
while (cpuinfo) {
std::string line;
@@ -414,9 +413,9 @@ namespace orc {
const std::string name = TrimString(line.substr(0, colon - 1));
const std::string value = TrimString(line.substr(colon + 1, std::string::npos));
if (name.compare("flags") == 0 || name.compare("Features") == 0) {
- *hardware_flags |= LinuxParseCpuFlags(value);
+ *hardwareFlags |= LinuxParseCpuFlags(value);
} else if (name.compare("model name") == 0) {
- *model_name = value;
+ *modelName = value;
} else if (name.compare("vendor_id") == 0) {
if (value.compare("GenuineIntel") == 0) {
*vendor = CpuInfo::Vendor::Intel;
@@ -433,7 +432,7 @@ namespace orc {
#if defined(CPUINFO_ARCH_X86)
//------------------------------ X86_64 ------------------------------//
- bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) {
+ bool ArchParseUserSimdLevel(const std::string& simdLevel, int64_t* hardwareFlags) {
enum {
USER_SIMD_NONE,
USER_SIMD_AVX512,
@@ -442,9 +441,9 @@ namespace orc {
int level = USER_SIMD_MAX;
// Parse the level
- if (simd_level == "AVX512") {
+ if (simdLevel == "AVX512") {
level = USER_SIMD_AVX512;
- } else if (simd_level == "NONE") {
+ } else if (simdLevel == "NONE") {
level = USER_SIMD_NONE;
} else {
return false;
@@ -452,7 +451,7 @@ namespace orc {
// Disable feature as the level
if (level < USER_SIMD_AVX512) {
- *hardware_flags &= ~CpuInfo::AVX512;
+ *hardwareFlags &= ~CpuInfo::AVX512;
}
return true;
}
@@ -469,9 +468,9 @@ namespace orc {
#elif defined(CPUINFO_ARCH_ARM)
//------------------------------ AARCH64 ------------------------------//
- bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) {
- if (simd_level == "NONE") {
- *hardware_flags &= ~CpuInfo::ASIMD;
+ bool ArchParseUserSimdLevel(const std::string& simdLevel, int64_t* hardwareFlags) {
+ if (simdLevel == "NONE") {
+ *hardwareFlags &= ~CpuInfo::ASIMD;
return true;
}
return false;
@@ -485,7 +484,7 @@ namespace orc {
#else
//------------------------------ PPC, ... ------------------------------//
- bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) {
+ bool ArchParseUserSimdLevel(const std::string& simdLevel, int64_t* hardwareFlags) {
return true;
}
@@ -496,17 +495,17 @@ namespace orc {
} // namespace
struct CpuInfo::Impl {
- int64_t hardware_flags = 0;
+ int64_t hardwareFlags = 0;
int numCores = 0;
- int64_t original_hardware_flags = 0;
+ int64_t originalHardwareFlags = 0;
Vendor vendor = Vendor::Unknown;
- std::string model_name = "Unknown";
- std::array<int64_t, kCacheLevels> cache_sizes{};
+ std::string modelName = "Unknown";
+ std::array<int64_t, kCacheLevels> cacheSizes{};
Impl() {
- OsRetrieveCacheSize(&cache_sizes);
- OsRetrieveCpuInfo(&hardware_flags, &vendor, &model_name);
- original_hardware_flags = hardware_flags;
+ OsRetrieveCacheSize(&cacheSizes);
+ OsRetrieveCpuInfo(&hardwareFlags, &vendor, &modelName);
+ originalHardwareFlags = hardwareFlags;
numCores = std::max(static_cast<int>(std::thread::hardware_concurrency()), 1);
// parse user simd level
@@ -514,7 +513,7 @@ namespace orc {
std::string userSimdLevel = maybe_env_var == nullptr ? "NONE" : std::string(maybe_env_var);
std::transform(userSimdLevel.begin(), userSimdLevel.end(), userSimdLevel.begin(),
[](unsigned char c) { return std::toupper(c); });
- if (!ArchParseUserSimdLevel(userSimdLevel, &hardware_flags)) {
+ if (!ArchParseUserSimdLevel(userSimdLevel, &hardwareFlags)) {
throw ParseError("Invalid value for ORC_USER_SIMD_LEVEL: " + userSimdLevel);
}
}
@@ -530,8 +529,8 @@ namespace orc {
#endif
const CpuInfo* CpuInfo::getInstance() {
- static CpuInfo cpu_info;
- return &cpu_info;
+ static CpuInfo cpuInfo;
+ return &cpuInfo;
}
#ifdef __clang__
@@ -539,7 +538,7 @@ namespace orc {
#endif
int64_t CpuInfo::hardwareFlags() const {
- return impl_->hardware_flags;
+ return impl_->hardwareFlags;
}
int CpuInfo::numCores() const {
@@ -551,7 +550,7 @@ namespace orc {
}
const std::string& CpuInfo::modelName() const {
- return impl_->model_name;
+ return impl_->modelName;
}
int64_t CpuInfo::cacheSize(CacheLevel level) const {
@@ -564,18 +563,18 @@ namespace orc {
static_assert(static_cast<int>(CacheLevel::L1) == 0, "");
const int i = static_cast<int>(level);
- if (impl_->cache_sizes[i] > 0) return impl_->cache_sizes[i];
+ if (impl_->cacheSizes[i] > 0) return impl_->cacheSizes[i];
if (i == 0) return kDefaultCacheSizes[0];
// l3 may be not available, return maximum of l2 or default size
- return std::max(kDefaultCacheSizes[i], impl_->cache_sizes[i - 1]);
+ return std::max(kDefaultCacheSizes[i], impl_->cacheSizes[i - 1]);
}
bool CpuInfo::isSupported(int64_t flags) const {
- return (impl_->hardware_flags & flags) == flags;
+ return (impl_->hardwareFlags & flags) == flags;
}
bool CpuInfo::isDetected(int64_t flags) const {
- return (impl_->original_hardware_flags & flags) == flags;
+ return (impl_->originalHardwareFlags & flags) == flags;
}
void CpuInfo::verifyCpuRequirements() const {
diff --git a/contrib/libs/apache/orc/c++/src/Exceptions.cc b/contrib/libs/apache/orc/c++/src/Exceptions.cc
index 23703ff324..2ba1ab404c 100644
--- a/contrib/libs/apache/orc/c++/src/Exceptions.cc
+++ b/contrib/libs/apache/orc/c++/src/Exceptions.cc
@@ -20,11 +20,11 @@
namespace orc {
- NotImplementedYet::NotImplementedYet(const std::string& what_arg) : logic_error(what_arg) {
+ NotImplementedYet::NotImplementedYet(const std::string& whatArg) : logic_error(whatArg) {
// PASS
}
- NotImplementedYet::NotImplementedYet(const char* what_arg) : logic_error(what_arg) {
+ NotImplementedYet::NotImplementedYet(const char* whatArg) : logic_error(whatArg) {
// PASS
}
@@ -36,11 +36,11 @@ namespace orc {
// PASS
}
- ParseError::ParseError(const std::string& what_arg) : runtime_error(what_arg) {
+ ParseError::ParseError(const std::string& whatArg) : runtime_error(whatArg) {
// PASS
}
- ParseError::ParseError(const char* what_arg) : runtime_error(what_arg) {
+ ParseError::ParseError(const char* whatArg) : runtime_error(whatArg) {
// PASS
}
@@ -52,11 +52,11 @@ namespace orc {
// PASS
}
- InvalidArgument::InvalidArgument(const std::string& what_arg) : runtime_error(what_arg) {
+ InvalidArgument::InvalidArgument(const std::string& whatArg) : runtime_error(whatArg) {
// PASS
}
- InvalidArgument::InvalidArgument(const char* what_arg) : runtime_error(what_arg) {
+ InvalidArgument::InvalidArgument(const char* whatArg) : runtime_error(whatArg) {
// PASS
}
@@ -68,11 +68,11 @@ namespace orc {
// PASS
}
- SchemaEvolutionError::SchemaEvolutionError(const std::string& what_arg) : logic_error(what_arg) {
+ SchemaEvolutionError::SchemaEvolutionError(const std::string& whatArg) : logic_error(whatArg) {
// PASS
}
- SchemaEvolutionError::SchemaEvolutionError(const char* what_arg) : logic_error(what_arg) {
+ SchemaEvolutionError::SchemaEvolutionError(const char* whatArg) : logic_error(whatArg) {
// PASS
}
@@ -84,4 +84,20 @@ namespace orc {
SchemaEvolutionError::~SchemaEvolutionError() noexcept {
// PASS
}
+
+ CompressionError::CompressionError(const std::string& whatArg) : runtime_error(whatArg) {
+ // PASS
+ }
+
+ CompressionError::CompressionError(const char* whatArg) : runtime_error(whatArg) {
+ // PASS
+ }
+
+ CompressionError::CompressionError(const CompressionError& error) : runtime_error(error) {
+ // PASS
+ }
+
+ CompressionError::~CompressionError() noexcept {
+ // PASS
+ }
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/Int128.cc b/contrib/libs/apache/orc/c++/src/Int128.cc
index 434a8dda80..1e059fd4e2 100644
--- a/contrib/libs/apache/orc/c++/src/Int128.cc
+++ b/contrib/libs/apache/orc/c++/src/Int128.cc
@@ -35,8 +35,8 @@ namespace orc {
}
Int128::Int128(const std::string& str) {
- lowbits = 0;
- highbits = 0;
+ lowbits_ = 0;
+ highbits_ = 0;
size_t length = str.length();
if (length > 0) {
bool isNegative = str[0] == '-';
@@ -64,30 +64,30 @@ namespace orc {
// Break the left and right numbers into 32 bit chunks
// so that we can multiply them without overflow.
- uint64_t L0 = static_cast<uint64_t>(highbits) >> 32;
- uint64_t L1 = static_cast<uint64_t>(highbits) & INT_MASK;
- uint64_t L2 = lowbits >> 32;
- uint64_t L3 = lowbits & INT_MASK;
- uint64_t R0 = static_cast<uint64_t>(right.highbits) >> 32;
- uint64_t R1 = static_cast<uint64_t>(right.highbits) & INT_MASK;
- uint64_t R2 = right.lowbits >> 32;
- uint64_t R3 = right.lowbits & INT_MASK;
+ uint64_t L0 = static_cast<uint64_t>(highbits_) >> 32;
+ uint64_t L1 = static_cast<uint64_t>(highbits_) & INT_MASK;
+ uint64_t L2 = lowbits_ >> 32;
+ uint64_t L3 = lowbits_ & INT_MASK;
+ uint64_t R0 = static_cast<uint64_t>(right.highbits_) >> 32;
+ uint64_t R1 = static_cast<uint64_t>(right.highbits_) & INT_MASK;
+ uint64_t R2 = right.lowbits_ >> 32;
+ uint64_t R3 = right.lowbits_ & INT_MASK;
uint64_t product = L3 * R3;
- lowbits = product & INT_MASK;
+ lowbits_ = product & INT_MASK;
uint64_t sum = product >> 32;
product = L2 * R3;
sum += product;
- highbits = sum < product ? CARRY_BIT : 0;
+ highbits_ = sum < product ? CARRY_BIT : 0;
product = L3 * R2;
sum += product;
if (sum < product) {
- highbits += CARRY_BIT;
+ highbits_ += CARRY_BIT;
}
- lowbits += sum << 32;
- highbits += static_cast<int64_t>(sum >> 32);
- highbits += L1 * R3 + L2 * R2 + L3 * R1;
- highbits += (L0 * R3 + L1 * R2 + L2 * R1 + L3 * R0) << 32;
+ lowbits_ += sum << 32;
+ highbits_ += static_cast<int64_t>(sum >> 32);
+ highbits_ += L1 * R3 + L2 * R2 + L3 * R1;
+ highbits_ += (L0 * R3 + L1 * R2 + L2 * R1 + L3 * R0) << 32;
return *this;
}
@@ -103,16 +103,16 @@ namespace orc {
int64_t Int128::fillInArray(uint32_t* array, bool& wasNegative) const {
uint64_t high;
uint64_t low;
- if (highbits < 0) {
- low = ~lowbits + 1;
- high = static_cast<uint64_t>(~highbits);
+ if (highbits_ < 0) {
+ low = ~lowbits_ + 1;
+ high = static_cast<uint64_t>(~highbits_);
if (low == 0) {
high += 1;
}
wasNegative = true;
} else {
- low = lowbits;
- high = static_cast<uint64_t>(highbits);
+ low = lowbits_;
+ high = static_cast<uint64_t>(highbits_);
wasNegative = false;
}
if (high != 0) {
@@ -430,8 +430,8 @@ namespace orc {
std::string Int128::toHexString() const {
std::stringstream buf;
- buf << std::hex << "0x" << std::setw(16) << std::setfill('0') << highbits << std::setw(16)
- << std::setfill('0') << lowbits;
+ buf << std::hex << "0x" << std::setw(16) << std::setfill('0') << highbits_ << std::setw(16)
+ << std::setfill('0') << lowbits_;
return buf.str();
}
@@ -439,7 +439,7 @@ namespace orc {
if (fitsInLong()) {
return static_cast<double>(toLong());
}
- return static_cast<double>(lowbits) + std::ldexp(static_cast<double>(highbits), 64);
+ return static_cast<double>(lowbits_) + std::ldexp(static_cast<double>(highbits_), 64);
}
const static int32_t MAX_PRECISION_64 = 18;
diff --git a/contrib/libs/apache/orc/c++/src/MemoryPool.cc b/contrib/libs/apache/orc/c++/src/MemoryPool.cc
index 8c8837aa64..ed7fee7373 100644
--- a/contrib/libs/apache/orc/c++/src/MemoryPool.cc
+++ b/contrib/libs/apache/orc/c++/src/MemoryPool.cc
@@ -53,72 +53,72 @@ namespace orc {
template <class T>
DataBuffer<T>::DataBuffer(MemoryPool& pool, uint64_t newSize)
- : memoryPool(pool), buf(nullptr), currentSize(0), currentCapacity(0) {
+ : memoryPool_(pool), buf_(nullptr), currentSize_(0), currentCapacity_(0) {
reserve(newSize);
- currentSize = newSize;
+ currentSize_ = newSize;
}
template <class T>
DataBuffer<T>::DataBuffer(DataBuffer<T>&& buffer) noexcept
- : memoryPool(buffer.memoryPool),
- buf(buffer.buf),
- currentSize(buffer.currentSize),
- currentCapacity(buffer.currentCapacity) {
- buffer.buf = nullptr;
- buffer.currentSize = 0;
- buffer.currentCapacity = 0;
+ : memoryPool_(buffer.memoryPool_),
+ buf_(buffer.buf_),
+ currentSize_(buffer.currentSize_),
+ currentCapacity_(buffer.currentCapacity_) {
+ buffer.buf_ = nullptr;
+ buffer.currentSize_ = 0;
+ buffer.currentCapacity_ = 0;
}
template <class T>
DataBuffer<T>::~DataBuffer() {
- for (uint64_t i = currentSize; i > 0; --i) {
- (buf + i - 1)->~T();
+ for (uint64_t i = currentSize_; i > 0; --i) {
+ (buf_ + i - 1)->~T();
}
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
+ if (buf_) {
+ memoryPool_.free(reinterpret_cast<char*>(buf_));
}
}
template <class T>
void DataBuffer<T>::resize(uint64_t newSize) {
reserve(newSize);
- if (currentSize > newSize) {
- for (uint64_t i = currentSize; i > newSize; --i) {
- (buf + i - 1)->~T();
+ if (currentSize_ > newSize) {
+ for (uint64_t i = currentSize_; i > newSize; --i) {
+ (buf_ + i - 1)->~T();
}
- } else if (newSize > currentSize) {
- for (uint64_t i = currentSize; i < newSize; ++i) {
- new (buf + i) T();
+ } else if (newSize > currentSize_) {
+ for (uint64_t i = currentSize_; i < newSize; ++i) {
+ new (buf_ + i) T();
}
}
- currentSize = newSize;
+ currentSize_ = newSize;
}
template <class T>
void DataBuffer<T>::reserve(uint64_t newCapacity) {
- if (newCapacity > currentCapacity || !buf) {
- if (buf) {
- T* buf_old = buf;
- buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity));
- memcpy(buf, buf_old, sizeof(T) * currentSize);
- memoryPool.free(reinterpret_cast<char*>(buf_old));
+ if (newCapacity > currentCapacity_ || !buf_) {
+ if (buf_) {
+ T* buf_old = buf_;
+ buf_ = reinterpret_cast<T*>(memoryPool_.malloc(sizeof(T) * newCapacity));
+ memcpy(buf_, buf_old, sizeof(T) * currentSize_);
+ memoryPool_.free(reinterpret_cast<char*>(buf_old));
} else {
- buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity));
+ buf_ = reinterpret_cast<T*>(memoryPool_.malloc(sizeof(T) * newCapacity));
}
- currentCapacity = newCapacity;
+ currentCapacity_ = newCapacity;
}
}
template <class T>
void DataBuffer<T>::zeroOut() {
- memset(buf, 0, sizeof(T) * currentCapacity);
+ memset(buf_, 0, sizeof(T) * currentCapacity_);
}
// Specializations for Int128
template <>
void DataBuffer<Int128>::zeroOut() {
- for (uint64_t i = 0; i < currentCapacity; ++i) {
- new (buf + i) Int128();
+ for (uint64_t i = 0; i < currentCapacity_; ++i) {
+ new (buf_ + i) Int128();
}
}
@@ -126,180 +126,180 @@ namespace orc {
template <>
DataBuffer<char>::~DataBuffer() {
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
+ if (buf_) {
+ memoryPool_.free(reinterpret_cast<char*>(buf_));
}
}
template <>
void DataBuffer<char>::resize(uint64_t newSize) {
reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, newSize - currentSize);
+ if (newSize > currentSize_) {
+ memset(buf_ + currentSize_, 0, newSize - currentSize_);
}
- currentSize = newSize;
+ currentSize_ = newSize;
}
// Specializations for char*
template <>
DataBuffer<char*>::~DataBuffer() {
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
+ if (buf_) {
+ memoryPool_.free(reinterpret_cast<char*>(buf_));
}
}
template <>
void DataBuffer<char*>::resize(uint64_t newSize) {
reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(char*));
+ if (newSize > currentSize_) {
+ memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(char*));
}
- currentSize = newSize;
+ currentSize_ = newSize;
}
// Specializations for double
template <>
DataBuffer<double>::~DataBuffer() {
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
+ if (buf_) {
+ memoryPool_.free(reinterpret_cast<char*>(buf_));
}
}
template <>
void DataBuffer<double>::resize(uint64_t newSize) {
reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(double));
+ if (newSize > currentSize_) {
+ memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(double));
}
- currentSize = newSize;
+ currentSize_ = newSize;
}
// Specializations for float
template <>
DataBuffer<float>::~DataBuffer() {
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
+ if (buf_) {
+ memoryPool_.free(reinterpret_cast<char*>(buf_));
}
}
template <>
void DataBuffer<float>::resize(uint64_t newSize) {
reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(float));
+ if (newSize > currentSize_) {
+ memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(float));
}
- currentSize = newSize;
+ currentSize_ = newSize;
}
// Specializations for int64_t
template <>
DataBuffer<int64_t>::~DataBuffer() {
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
+ if (buf_) {
+ memoryPool_.free(reinterpret_cast<char*>(buf_));
}
}
template <>
void DataBuffer<int64_t>::resize(uint64_t newSize) {
reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int64_t));
+ if (newSize > currentSize_) {
+ memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(int64_t));
}
- currentSize = newSize;
+ currentSize_ = newSize;
}
// Specializations for int32_t
template <>
DataBuffer<int32_t>::~DataBuffer() {
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
+ if (buf_) {
+ memoryPool_.free(reinterpret_cast<char*>(buf_));
}
}
template <>
void DataBuffer<int32_t>::resize(uint64_t newSize) {
reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int32_t));
+ if (newSize > currentSize_) {
+ memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(int32_t));
}
- currentSize = newSize;
+ currentSize_ = newSize;
}
// Specializations for int16_t
template <>
DataBuffer<int16_t>::~DataBuffer() {
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
+ if (buf_) {
+ memoryPool_.free(reinterpret_cast<char*>(buf_));
}
}
template <>
void DataBuffer<int16_t>::resize(uint64_t newSize) {
reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int16_t));
+ if (newSize > currentSize_) {
+ memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(int16_t));
}
- currentSize = newSize;
+ currentSize_ = newSize;
}
// Specializations for int8_t
template <>
DataBuffer<int8_t>::~DataBuffer() {
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
+ if (buf_) {
+ memoryPool_.free(reinterpret_cast<char*>(buf_));
}
}
template <>
void DataBuffer<int8_t>::resize(uint64_t newSize) {
reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int8_t));
+ if (newSize > currentSize_) {
+ memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(int8_t));
}
- currentSize = newSize;
+ currentSize_ = newSize;
}
// Specializations for uint64_t
template <>
DataBuffer<uint64_t>::~DataBuffer() {
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
+ if (buf_) {
+ memoryPool_.free(reinterpret_cast<char*>(buf_));
}
}
template <>
void DataBuffer<uint64_t>::resize(uint64_t newSize) {
reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(uint64_t));
+ if (newSize > currentSize_) {
+ memset(buf_ + currentSize_, 0, (newSize - currentSize_) * sizeof(uint64_t));
}
- currentSize = newSize;
+ currentSize_ = newSize;
}
// Specializations for unsigned char
template <>
DataBuffer<unsigned char>::~DataBuffer() {
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
+ if (buf_) {
+ memoryPool_.free(reinterpret_cast<char*>(buf_));
}
}
template <>
void DataBuffer<unsigned char>::resize(uint64_t newSize) {
reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, newSize - currentSize);
+ if (newSize > currentSize_) {
+ memset(buf_ + currentSize_, 0, newSize - currentSize_);
}
- currentSize = newSize;
+ currentSize_ = newSize;
}
#ifdef __clang__
diff --git a/contrib/libs/apache/orc/c++/src/Options.hh b/contrib/libs/apache/orc/c++/src/Options.hh
index 51cd8efd64..0a4bd56d8f 100644
--- a/contrib/libs/apache/orc/c++/src/Options.hh
+++ b/contrib/libs/apache/orc/c++/src/Options.hh
@@ -23,6 +23,8 @@
#include "orc/OrcFile.hh"
#include "orc/Reader.hh"
+#include "io/Cache.hh"
+
#include <limits>
namespace orc {
@@ -43,6 +45,7 @@ namespace orc {
MemoryPool* memoryPool;
std::string serializedTail;
ReaderMetrics* metrics;
+ CacheOptions cacheOptions;
ReaderOptionsPrivate() {
tailLocation = std::numeric_limits<uint64_t>::max();
@@ -52,23 +55,23 @@ namespace orc {
}
};
- ReaderOptions::ReaderOptions() : privateBits(std::make_unique<ReaderOptionsPrivate>()) {
+ ReaderOptions::ReaderOptions() : privateBits_(std::make_unique<ReaderOptionsPrivate>()) {
// PASS
}
ReaderOptions::ReaderOptions(const ReaderOptions& rhs)
- : privateBits(std::make_unique<ReaderOptionsPrivate>(*(rhs.privateBits.get()))) {
+ : privateBits_(std::make_unique<ReaderOptionsPrivate>(*(rhs.privateBits_.get()))) {
// PASS
}
ReaderOptions::ReaderOptions(ReaderOptions& rhs) {
// swap privateBits with rhs
- privateBits.swap(rhs.privateBits);
+ privateBits_.swap(rhs.privateBits_);
}
ReaderOptions& ReaderOptions::operator=(const ReaderOptions& rhs) {
if (this != &rhs) {
- privateBits.reset(new ReaderOptionsPrivate(*(rhs.privateBits.get())));
+ privateBits_.reset(new ReaderOptionsPrivate(*(rhs.privateBits_.get())));
}
return *this;
}
@@ -78,48 +81,57 @@ namespace orc {
}
ReaderOptions& ReaderOptions::setMemoryPool(MemoryPool& pool) {
- privateBits->memoryPool = &pool;
+ privateBits_->memoryPool = &pool;
return *this;
}
MemoryPool* ReaderOptions::getMemoryPool() const {
- return privateBits->memoryPool;
+ return privateBits_->memoryPool;
}
ReaderOptions& ReaderOptions::setReaderMetrics(ReaderMetrics* metrics) {
- privateBits->metrics = metrics;
+ privateBits_->metrics = metrics;
return *this;
}
ReaderMetrics* ReaderOptions::getReaderMetrics() const {
- return privateBits->metrics;
+ return privateBits_->metrics;
}
ReaderOptions& ReaderOptions::setTailLocation(uint64_t offset) {
- privateBits->tailLocation = offset;
+ privateBits_->tailLocation = offset;
return *this;
}
uint64_t ReaderOptions::getTailLocation() const {
- return privateBits->tailLocation;
+ return privateBits_->tailLocation;
}
ReaderOptions& ReaderOptions::setSerializedFileTail(const std::string& value) {
- privateBits->serializedTail = value;
+ privateBits_->serializedTail = value;
return *this;
}
std::string ReaderOptions::getSerializedFileTail() const {
- return privateBits->serializedTail;
+ return privateBits_->serializedTail;
}
ReaderOptions& ReaderOptions::setErrorStream(std::ostream& stream) {
- privateBits->errorStream = &stream;
+ privateBits_->errorStream = &stream;
return *this;
}
std::ostream* ReaderOptions::getErrorStream() const {
- return privateBits->errorStream;
+ return privateBits_->errorStream;
+ }
+
+ ReaderOptions& ReaderOptions::setCacheOptions(const CacheOptions& cacheOptions) {
+ privateBits_->cacheOptions = cacheOptions;
+ return *this;
+ }
+
+ const CacheOptions& ReaderOptions::getCacheOptions() const {
+ return privateBits_->cacheOptions;
}
/**
@@ -155,23 +167,23 @@ namespace orc {
}
};
- RowReaderOptions::RowReaderOptions() : privateBits(std::make_unique<RowReaderOptionsPrivate>()) {
+ RowReaderOptions::RowReaderOptions() : privateBits_(std::make_unique<RowReaderOptionsPrivate>()) {
// PASS
}
RowReaderOptions::RowReaderOptions(const RowReaderOptions& rhs)
- : privateBits(std::make_unique<RowReaderOptionsPrivate>(*(rhs.privateBits.get()))) {
+ : privateBits_(std::make_unique<RowReaderOptionsPrivate>(*(rhs.privateBits_.get()))) {
// PASS
}
RowReaderOptions::RowReaderOptions(RowReaderOptions& rhs) {
// swap privateBits with rhs
- privateBits.swap(rhs.privateBits);
+ privateBits_.swap(rhs.privateBits_);
}
RowReaderOptions& RowReaderOptions::operator=(const RowReaderOptions& rhs) {
if (this != &rhs) {
- privateBits.reset(new RowReaderOptionsPrivate(*(rhs.privateBits.get())));
+ privateBits_.reset(new RowReaderOptionsPrivate(*(rhs.privateBits_.get())));
}
return *this;
}
@@ -181,150 +193,150 @@ namespace orc {
}
RowReaderOptions& RowReaderOptions::include(const std::list<uint64_t>& include) {
- privateBits->selection = ColumnSelection_FIELD_IDS;
- privateBits->includedColumnIndexes.assign(include.begin(), include.end());
- privateBits->includedColumnNames.clear();
- privateBits->idReadIntentMap.clear();
+ privateBits_->selection = ColumnSelection_FIELD_IDS;
+ privateBits_->includedColumnIndexes.assign(include.begin(), include.end());
+ privateBits_->includedColumnNames.clear();
+ privateBits_->idReadIntentMap.clear();
return *this;
}
RowReaderOptions& RowReaderOptions::include(const std::list<std::string>& include) {
- privateBits->selection = ColumnSelection_NAMES;
- privateBits->includedColumnNames.assign(include.begin(), include.end());
- privateBits->includedColumnIndexes.clear();
- privateBits->idReadIntentMap.clear();
+ privateBits_->selection = ColumnSelection_NAMES;
+ privateBits_->includedColumnNames.assign(include.begin(), include.end());
+ privateBits_->includedColumnIndexes.clear();
+ privateBits_->idReadIntentMap.clear();
return *this;
}
RowReaderOptions& RowReaderOptions::includeTypes(const std::list<uint64_t>& types) {
- privateBits->selection = ColumnSelection_TYPE_IDS;
- privateBits->includedColumnIndexes.assign(types.begin(), types.end());
- privateBits->includedColumnNames.clear();
- privateBits->idReadIntentMap.clear();
+ privateBits_->selection = ColumnSelection_TYPE_IDS;
+ privateBits_->includedColumnIndexes.assign(types.begin(), types.end());
+ privateBits_->includedColumnNames.clear();
+ privateBits_->idReadIntentMap.clear();
return *this;
}
RowReaderOptions& RowReaderOptions::includeTypesWithIntents(
const IdReadIntentMap& idReadIntentMap) {
- privateBits->selection = ColumnSelection_TYPE_IDS;
- privateBits->includedColumnIndexes.clear();
- privateBits->idReadIntentMap.clear();
+ privateBits_->selection = ColumnSelection_TYPE_IDS;
+ privateBits_->includedColumnIndexes.clear();
+ privateBits_->idReadIntentMap.clear();
for (const auto& typeIntentPair : idReadIntentMap) {
- privateBits->idReadIntentMap[typeIntentPair.first] = typeIntentPair.second;
- privateBits->includedColumnIndexes.push_back(typeIntentPair.first);
+ privateBits_->idReadIntentMap[typeIntentPair.first] = typeIntentPair.second;
+ privateBits_->includedColumnIndexes.push_back(typeIntentPair.first);
}
- privateBits->includedColumnNames.clear();
+ privateBits_->includedColumnNames.clear();
return *this;
}
RowReaderOptions& RowReaderOptions::range(uint64_t offset, uint64_t length) {
- privateBits->dataStart = offset;
- privateBits->dataLength = length;
+ privateBits_->dataStart = offset;
+ privateBits_->dataLength = length;
return *this;
}
bool RowReaderOptions::getIndexesSet() const {
- return privateBits->selection == ColumnSelection_FIELD_IDS;
+ return privateBits_->selection == ColumnSelection_FIELD_IDS;
}
bool RowReaderOptions::getTypeIdsSet() const {
- return privateBits->selection == ColumnSelection_TYPE_IDS;
+ return privateBits_->selection == ColumnSelection_TYPE_IDS;
}
const std::list<uint64_t>& RowReaderOptions::getInclude() const {
- return privateBits->includedColumnIndexes;
+ return privateBits_->includedColumnIndexes;
}
bool RowReaderOptions::getNamesSet() const {
- return privateBits->selection == ColumnSelection_NAMES;
+ return privateBits_->selection == ColumnSelection_NAMES;
}
const std::list<std::string>& RowReaderOptions::getIncludeNames() const {
- return privateBits->includedColumnNames;
+ return privateBits_->includedColumnNames;
}
uint64_t RowReaderOptions::getOffset() const {
- return privateBits->dataStart;
+ return privateBits_->dataStart;
}
uint64_t RowReaderOptions::getLength() const {
- return privateBits->dataLength;
+ return privateBits_->dataLength;
}
RowReaderOptions& RowReaderOptions::throwOnHive11DecimalOverflow(bool shouldThrow) {
- privateBits->throwOnHive11DecimalOverflow = shouldThrow;
+ privateBits_->throwOnHive11DecimalOverflow = shouldThrow;
return *this;
}
bool RowReaderOptions::getThrowOnHive11DecimalOverflow() const {
- return privateBits->throwOnHive11DecimalOverflow;
+ return privateBits_->throwOnHive11DecimalOverflow;
}
RowReaderOptions& RowReaderOptions::throwOnSchemaEvolutionOverflow(bool shouldThrow) {
- privateBits->throwOnSchemaEvolutionOverflow = shouldThrow;
+ privateBits_->throwOnSchemaEvolutionOverflow = shouldThrow;
return *this;
}
bool RowReaderOptions::getThrowOnSchemaEvolutionOverflow() const {
- return privateBits->throwOnSchemaEvolutionOverflow;
+ return privateBits_->throwOnSchemaEvolutionOverflow;
}
RowReaderOptions& RowReaderOptions::forcedScaleOnHive11Decimal(int32_t forcedScale) {
- privateBits->forcedScaleOnHive11Decimal = forcedScale;
+ privateBits_->forcedScaleOnHive11Decimal = forcedScale;
return *this;
}
int32_t RowReaderOptions::getForcedScaleOnHive11Decimal() const {
- return privateBits->forcedScaleOnHive11Decimal;
+ return privateBits_->forcedScaleOnHive11Decimal;
}
bool RowReaderOptions::getEnableLazyDecoding() const {
- return privateBits->enableLazyDecoding;
+ return privateBits_->enableLazyDecoding;
}
RowReaderOptions& RowReaderOptions::setEnableLazyDecoding(bool enable) {
- privateBits->enableLazyDecoding = enable;
+ privateBits_->enableLazyDecoding = enable;
return *this;
}
RowReaderOptions& RowReaderOptions::searchArgument(std::unique_ptr<SearchArgument> sargs) {
- privateBits->sargs = std::move(sargs);
+ privateBits_->sargs = std::move(sargs);
return *this;
}
std::shared_ptr<SearchArgument> RowReaderOptions::getSearchArgument() const {
- return privateBits->sargs;
+ return privateBits_->sargs;
}
RowReaderOptions& RowReaderOptions::setTimezoneName(const std::string& zoneName) {
- privateBits->readerTimezone = zoneName;
+ privateBits_->readerTimezone = zoneName;
return *this;
}
const std::string& RowReaderOptions::getTimezoneName() const {
- return privateBits->readerTimezone;
+ return privateBits_->readerTimezone;
}
const RowReaderOptions::IdReadIntentMap RowReaderOptions::getIdReadIntentMap() const {
- return privateBits->idReadIntentMap;
+ return privateBits_->idReadIntentMap;
}
RowReaderOptions& RowReaderOptions::setUseTightNumericVector(bool useTightNumericVector) {
- privateBits->useTightNumericVector = useTightNumericVector;
+ privateBits_->useTightNumericVector = useTightNumericVector;
return *this;
}
bool RowReaderOptions::getUseTightNumericVector() const {
- return privateBits->useTightNumericVector;
+ return privateBits_->useTightNumericVector;
}
RowReaderOptions& RowReaderOptions::setReadType(std::shared_ptr<Type> type) {
- privateBits->readType = std::move(type);
+ privateBits_->readType = std::move(type);
return *this;
}
std::shared_ptr<Type>& RowReaderOptions::getReadType() const {
- return privateBits->readType;
+ return privateBits_->readType;
}
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/OrcFile.cc b/contrib/libs/apache/orc/c++/src/OrcFile.cc
index d4b6a86e2f..be86724329 100644
--- a/contrib/libs/apache/orc/c++/src/OrcFile.cc
+++ b/contrib/libs/apache/orc/c++/src/OrcFile.cc
@@ -49,29 +49,29 @@ namespace orc {
class FileInputStream : public InputStream {
private:
- std::string filename;
- int file;
- uint64_t totalLength;
- ReaderMetrics* metrics;
+ std::string filename_;
+ int file_;
+ uint64_t totalLength_;
+ ReaderMetrics* metrics_;
public:
- FileInputStream(std::string _filename, ReaderMetrics* _metrics)
- : filename(_filename), metrics(_metrics) {
- file = open(filename.c_str(), O_BINARY | O_RDONLY);
- if (file == -1) {
- throw ParseError("Can't open " + filename);
+ FileInputStream(std::string filename, ReaderMetrics* metrics)
+ : filename_(filename), metrics_(metrics) {
+ file_ = open(filename_.c_str(), O_BINARY | O_RDONLY);
+ if (file_ == -1) {
+ throw ParseError("Can't open " + filename_);
}
struct stat fileStat;
- if (fstat(file, &fileStat) == -1) {
- throw ParseError("Can't stat " + filename);
+ if (fstat(file_, &fileStat) == -1) {
+ throw ParseError("Can't stat " + filename_);
}
- totalLength = static_cast<uint64_t>(fileStat.st_size);
+ totalLength_ = static_cast<uint64_t>(fileStat.st_size);
}
~FileInputStream() override;
uint64_t getLength() const override {
- return totalLength;
+ return totalLength_;
}
uint64_t getNaturalReadSize() const override {
@@ -79,27 +79,27 @@ namespace orc {
}
void read(void* buf, uint64_t length, uint64_t offset) override {
- SCOPED_STOPWATCH(metrics, IOBlockingLatencyUs, IOCount);
+ SCOPED_STOPWATCH(metrics_, IOBlockingLatencyUs, IOCount);
if (!buf) {
throw ParseError("Buffer is null");
}
- ssize_t bytesRead = pread(file, buf, length, static_cast<off_t>(offset));
+ ssize_t bytesRead = pread(file_, buf, length, static_cast<off_t>(offset));
if (bytesRead == -1) {
- throw ParseError("Bad read of " + filename);
+ throw ParseError("Bad read of " + filename_);
}
if (static_cast<uint64_t>(bytesRead) != length) {
- throw ParseError("Short read of " + filename);
+ throw ParseError("Short read of " + filename_);
}
}
const std::string& getName() const override {
- return filename;
+ return filename_;
}
};
FileInputStream::~FileInputStream() {
- close(file);
+ close(file_);
}
std::unique_ptr<InputStream> readFile(const std::string& path, ReaderMetrics* metrics) {
@@ -126,26 +126,26 @@ namespace orc {
class FileOutputStream : public OutputStream {
private:
- std::string filename;
- int file;
- uint64_t bytesWritten;
- bool closed;
+ std::string filename_;
+ int file_;
+ uint64_t bytesWritten_;
+ bool closed_;
public:
- FileOutputStream(std::string _filename) {
- bytesWritten = 0;
- filename = _filename;
- closed = false;
- file = open(filename.c_str(), O_BINARY | O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR);
- if (file == -1) {
- throw ParseError("Can't open " + filename);
+ FileOutputStream(std::string filename) {
+ bytesWritten_ = 0;
+ filename_ = filename;
+ closed_ = false;
+ file_ = open(filename_.c_str(), O_BINARY | O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR);
+ if (file_ == -1) {
+ throw ParseError("Can't open " + filename_);
}
}
~FileOutputStream() override;
uint64_t getLength() const override {
- return bytesWritten;
+ return bytesWritten_;
}
uint64_t getNaturalWriteSize() const override {
@@ -153,41 +153,41 @@ namespace orc {
}
void write(const void* buf, size_t length) override {
- if (closed) {
+ if (closed_) {
throw std::logic_error("Cannot write to closed stream.");
}
- ssize_t bytesWrite = ::write(file, buf, length);
+ ssize_t bytesWrite = ::write(file_, buf, length);
if (bytesWrite == -1) {
- throw ParseError("Bad write of " + filename);
+ throw ParseError("Bad write of " + filename_);
}
if (static_cast<uint64_t>(bytesWrite) != length) {
- throw ParseError("Short write of " + filename);
+ throw ParseError("Short write of " + filename_);
}
- bytesWritten += static_cast<uint64_t>(bytesWrite);
+ bytesWritten_ += static_cast<uint64_t>(bytesWrite);
}
const std::string& getName() const override {
- return filename;
+ return filename_;
}
void close() override {
- if (!closed) {
- ::close(file);
- closed = true;
+ if (!closed_) {
+ ::close(file_);
+ closed_ = true;
}
}
void flush() override {
- if (!closed) {
- ::fsync(file);
+ if (!closed_) {
+ ::fsync(file_);
}
}
};
FileOutputStream::~FileOutputStream() {
- if (!closed) {
- ::close(file);
- closed = true;
+ if (!closed_) {
+ ::close(file_);
+ closed_ = true;
}
}
diff --git a/contrib/libs/apache/orc/c++/src/RLE.cc b/contrib/libs/apache/orc/c++/src/RLE.cc
index 89aca6a10e..cb831c80f7 100644
--- a/contrib/libs/apache/orc/c++/src/RLE.cc
+++ b/contrib/libs/apache/orc/c++/src/RLE.cc
@@ -108,15 +108,23 @@ namespace orc {
void RleEncoder::recordPosition(PositionRecorder* recorder) const {
uint64_t flushedSize = outputStream->getSize();
- uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition);
+ uint64_t unusedBufferSize = static_cast<uint64_t>(bufferLength - bufferPosition);
if (outputStream->isCompressed()) {
recorder->add(flushedSize);
- recorder->add(unflushedSize);
+ // There are multiple blocks in the input buffer, but bufferPosition only records the
+ // effective length of the last block. We need rawInputBufferSize to record the total length
+ // of all variable blocks.
+ recorder->add(outputStream->getRawInputBufferSize() - unusedBufferSize);
} else {
- flushedSize -= static_cast<uint64_t>(bufferLength);
- recorder->add(flushedSize + unflushedSize);
+ recorder->add(flushedSize - unusedBufferSize);
}
recorder->add(static_cast<uint64_t>(numLiterals));
}
+ void RleEncoder::finishEncode() {
+ outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition));
+ outputStream->finishStream();
+ bufferLength = bufferPosition = 0;
+ }
+
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/RLE.hh b/contrib/libs/apache/orc/c++/src/RLE.hh
index 51f9b6f58a..e46504e885 100644
--- a/contrib/libs/apache/orc/c++/src/RLE.hh
+++ b/contrib/libs/apache/orc/c++/src/RLE.hh
@@ -84,6 +84,13 @@ namespace orc {
virtual void write(int64_t val) = 0;
+ /**
+ * Finalize the encoding process. This function should be called after all data required for
+ * encoding has been added. It ensures that any remaining data is processed and the final state
+ * of the encoder is set.
+ */
+ virtual void finishEncode();
+
protected:
std::unique_ptr<BufferedOutputStream> outputStream;
size_t bufferPosition;
@@ -105,7 +112,7 @@ namespace orc {
// must be non-inline!
virtual ~RleDecoder();
- RleDecoder(ReaderMetrics* _metrics) : metrics(_metrics) {
+ RleDecoder(ReaderMetrics* metrics) : metrics(metrics) {
// pass
}
diff --git a/contrib/libs/apache/orc/c++/src/RLEv1.cc b/contrib/libs/apache/orc/c++/src/RLEv1.cc
index b221e8b8aa..72c555e610 100644
--- a/contrib/libs/apache/orc/c++/src/RLEv1.cc
+++ b/contrib/libs/apache/orc/c++/src/RLEv1.cc
@@ -38,9 +38,9 @@ namespace orc {
RleEncoderV1::RleEncoderV1(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned)
: RleEncoder(std::move(outStream), hasSigned) {
literals = new int64_t[MAX_LITERAL_SIZE];
- delta = 0;
- repeat = false;
- tailRunLength = 0;
+ delta_ = 0;
+ repeat_ = false;
+ tailRunLength_ = 0;
}
RleEncoderV1::~RleEncoderV1() {
@@ -49,9 +49,9 @@ namespace orc {
void RleEncoderV1::writeValues() {
if (numLiterals != 0) {
- if (repeat) {
+ if (repeat_) {
writeByte(static_cast<char>(static_cast<uint64_t>(numLiterals) - MINIMUM_REPEAT));
- writeByte(static_cast<char>(delta));
+ writeByte(static_cast<char>(delta_));
if (isSigned) {
writeVslong(literals[0]);
} else {
@@ -67,26 +67,24 @@ namespace orc {
}
}
}
- repeat = false;
+ repeat_ = false;
numLiterals = 0;
- tailRunLength = 0;
+ tailRunLength_ = 0;
}
}
uint64_t RleEncoderV1::flush() {
- writeValues();
- outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition));
+ finishEncode();
uint64_t dataSize = outputStream->flush();
- bufferLength = bufferPosition = 0;
return dataSize;
}
void RleEncoderV1::write(int64_t value) {
if (numLiterals == 0) {
literals[numLiterals++] = value;
- tailRunLength = 1;
- } else if (repeat) {
- if (value == literals[0] + delta * static_cast<int64_t>(numLiterals)) {
+ tailRunLength_ = 1;
+ } else if (repeat_) {
+ if (value == literals[0] + delta_ * static_cast<int64_t>(numLiterals)) {
numLiterals += 1;
if (numLiterals == MAXIMUM_REPEAT) {
writeValues();
@@ -94,36 +92,36 @@ namespace orc {
} else {
writeValues();
literals[numLiterals++] = value;
- tailRunLength = 1;
+ tailRunLength_ = 1;
}
} else {
- if (tailRunLength == 1) {
- delta = value - literals[numLiterals - 1];
- if (delta < MIN_DELTA || delta > MAX_DELTA) {
- tailRunLength = 1;
+ if (tailRunLength_ == 1) {
+ delta_ = value - literals[numLiterals - 1];
+ if (delta_ < MIN_DELTA || delta_ > MAX_DELTA) {
+ tailRunLength_ = 1;
} else {
- tailRunLength = 2;
+ tailRunLength_ = 2;
}
- } else if (value == literals[numLiterals - 1] + delta) {
- tailRunLength += 1;
+ } else if (value == literals[numLiterals - 1] + delta_) {
+ tailRunLength_ += 1;
} else {
- delta = value - literals[numLiterals - 1];
- if (delta < MIN_DELTA || delta > MAX_DELTA) {
- tailRunLength = 1;
+ delta_ = value - literals[numLiterals - 1];
+ if (delta_ < MIN_DELTA || delta_ > MAX_DELTA) {
+ tailRunLength_ = 1;
} else {
- tailRunLength = 2;
+ tailRunLength_ = 2;
}
}
- if (tailRunLength == MINIMUM_REPEAT) {
+ if (tailRunLength_ == MINIMUM_REPEAT) {
if (numLiterals + 1 == MINIMUM_REPEAT) {
- repeat = true;
+ repeat_ = true;
numLiterals += 1;
} else {
numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1);
int64_t base = literals[numLiterals];
writeValues();
literals[0] = base;
- repeat = true;
+ repeat_ = true;
numLiterals = MINIMUM_REPEAT;
}
} else {
@@ -135,18 +133,23 @@ namespace orc {
}
}
+ void RleEncoderV1::finishEncode() {
+ writeValues();
+ RleEncoder::finishEncode();
+ }
+
signed char RleDecoderV1::readByte() {
SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs);
- if (bufferStart == bufferEnd) {
+ if (bufferStart_ == bufferEnd_) {
int bufferLength;
const void* bufferPointer;
- if (!inputStream->Next(&bufferPointer, &bufferLength)) {
+ if (!inputStream_->Next(&bufferPointer, &bufferLength)) {
throw ParseError("bad read in readByte");
}
- bufferStart = static_cast<const char*>(bufferPointer);
- bufferEnd = bufferStart + bufferLength;
+ bufferStart_ = static_cast<const char*>(bufferPointer);
+ bufferEnd_ = bufferStart_ + bufferLength;
}
- return static_cast<signed char>(*(bufferStart++));
+ return static_cast<signed char>(*(bufferStart_++));
}
uint64_t RleDecoderV1::readLong() {
@@ -177,34 +180,34 @@ namespace orc {
void RleDecoderV1::readHeader() {
signed char ch = readByte();
if (ch < 0) {
- remainingValues = static_cast<uint64_t>(-ch);
- repeating = false;
+ remainingValues_ = static_cast<uint64_t>(-ch);
+ repeating_ = false;
} else {
- remainingValues = static_cast<uint64_t>(ch) + MINIMUM_REPEAT;
- repeating = true;
- delta = readByte();
- value = isSigned ? unZigZag(readLong()) : static_cast<int64_t>(readLong());
+ remainingValues_ = static_cast<uint64_t>(ch) + MINIMUM_REPEAT;
+ repeating_ = true;
+ delta_ = readByte();
+ value_ = isSigned_ ? unZigZag(readLong()) : static_cast<int64_t>(readLong());
}
}
void RleDecoderV1::reset() {
- remainingValues = 0;
- value = 0;
- bufferStart = nullptr;
- bufferEnd = nullptr;
- delta = 0;
- repeating = false;
+ remainingValues_ = 0;
+ value_ = 0;
+ bufferStart_ = nullptr;
+ bufferEnd_ = nullptr;
+ delta_ = 0;
+ repeating_ = false;
}
RleDecoderV1::RleDecoderV1(std::unique_ptr<SeekableInputStream> input, bool hasSigned,
- ReaderMetrics* _metrics)
- : RleDecoder(_metrics), inputStream(std::move(input)), isSigned(hasSigned) {
+ ReaderMetrics* metrics)
+ : RleDecoder(metrics), inputStream_(std::move(input)), isSigned_(hasSigned) {
reset();
}
void RleDecoderV1::seek(PositionProvider& location) {
// move the input stream
- inputStream->seek(location);
+ inputStream_->seek(location);
// reset the decoder status and lazily call readHeader()
reset();
// skip ahead the given number of records
@@ -213,14 +216,14 @@ namespace orc {
void RleDecoderV1::skip(uint64_t numValues) {
while (numValues > 0) {
- if (remainingValues == 0) {
+ if (remainingValues_ == 0) {
readHeader();
}
- uint64_t count = std::min(numValues, remainingValues);
- remainingValues -= count;
+ uint64_t count = std::min(numValues, remainingValues_);
+ remainingValues_ -= count;
numValues -= count;
- if (repeating) {
- value += delta * static_cast<int64_t>(count);
+ if (repeating_) {
+ value_ += delta_ * static_cast<int64_t>(count);
} else {
skipLongs(count);
}
@@ -240,38 +243,38 @@ namespace orc {
}
while (position < numValues) {
// If we are out of values, read more.
- if (remainingValues == 0) {
+ if (remainingValues_ == 0) {
readHeader();
}
// How many do we read out of this block?
- uint64_t count = std::min(numValues - position, remainingValues);
+ uint64_t count = std::min(numValues - position, remainingValues_);
uint64_t consumed = 0;
- if (repeating) {
+ if (repeating_) {
if (notNull) {
for (uint64_t i = 0; i < count; ++i) {
if (notNull[position + i]) {
- data[position + i] = static_cast<T>(value + static_cast<int64_t>(consumed) * delta);
+ data[position + i] = static_cast<T>(value_ + static_cast<int64_t>(consumed) * delta_);
consumed += 1;
}
}
} else {
for (uint64_t i = 0; i < count; ++i) {
- data[position + i] = static_cast<T>(value + static_cast<int64_t>(i) * delta);
+ data[position + i] = static_cast<T>(value_ + static_cast<int64_t>(i) * delta_);
}
consumed = count;
}
- value += static_cast<int64_t>(consumed) * delta;
+ value_ += static_cast<int64_t>(consumed) * delta_;
} else {
if (notNull) {
for (uint64_t i = 0; i < count; ++i) {
if (notNull[position + i]) {
data[position + i] =
- isSigned ? static_cast<T>(unZigZag(readLong())) : static_cast<T>(readLong());
+ isSigned_ ? static_cast<T>(unZigZag(readLong())) : static_cast<T>(readLong());
++consumed;
}
}
} else {
- if (isSigned) {
+ if (isSigned_) {
for (uint64_t i = 0; i < count; ++i) {
data[position + i] = static_cast<T>(unZigZag(readLong()));
}
@@ -283,7 +286,7 @@ namespace orc {
consumed = count;
}
}
- remainingValues -= consumed;
+ remainingValues_ -= consumed;
position += count;
// skipNulls()
diff --git a/contrib/libs/apache/orc/c++/src/RLEv1.hh b/contrib/libs/apache/orc/c++/src/RLEv1.hh
index fbe6b0f9c6..024b1e5e97 100644
--- a/contrib/libs/apache/orc/c++/src/RLEv1.hh
+++ b/contrib/libs/apache/orc/c++/src/RLEv1.hh
@@ -38,10 +38,12 @@ namespace orc {
void write(int64_t val) override;
+ void finishEncode() override;
+
private:
- int64_t delta;
- bool repeat;
- uint64_t tailRunLength;
+ int64_t delta_;
+ bool repeat_;
+ uint64_t tailRunLength_;
void writeValues();
};
@@ -83,14 +85,14 @@ namespace orc {
inline void reset();
- const std::unique_ptr<SeekableInputStream> inputStream;
- const bool isSigned;
- uint64_t remainingValues;
- int64_t value;
- const char* bufferStart;
- const char* bufferEnd;
- int64_t delta;
- bool repeating;
+ const std::unique_ptr<SeekableInputStream> inputStream_;
+ const bool isSigned_;
+ uint64_t remainingValues_;
+ int64_t value_;
+ const char* bufferStart_;
+ const char* bufferEnd_;
+ int64_t delta_;
+ bool repeating_;
};
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/RLEv2.hh b/contrib/libs/apache/orc/c++/src/RLEv2.hh
index 1cee59d0a6..8ceb7f125b 100644
--- a/contrib/libs/apache/orc/c++/src/RLEv2.hh
+++ b/contrib/libs/apache/orc/c++/src/RLEv2.hh
@@ -96,10 +96,10 @@ namespace orc {
~RleEncoderV2() override {
delete[] literals;
- delete[] gapVsPatchList;
- delete[] zigzagLiterals;
- delete[] baseRedLiterals;
- delete[] adjDeltas;
+ delete[] gapVsPatchList_;
+ delete[] zigzagLiterals_;
+ delete[] baseRedLiterals_;
+ delete[] adjDeltas_;
}
/**
* Flushing underlying BufferedOutputStream
@@ -108,19 +108,21 @@ namespace orc {
void write(int64_t val) override;
+ void finishEncode() override;
+
private:
- const bool alignedBitPacking;
- uint32_t fixedRunLength;
- uint32_t variableRunLength;
- int64_t prevDelta;
- int32_t histgram[HIST_LEN];
+ const bool alignedBitPacking_;
+ uint32_t fixedRunLength_;
+ uint32_t variableRunLength_;
+ int64_t prevDelta_;
+ int32_t histgram_[HIST_LEN];
// The four list below should actually belong to EncodingOption since it only holds temporal
// values in write(int64_t val), it is move here for performance consideration.
- int64_t* gapVsPatchList;
- int64_t* zigzagLiterals;
- int64_t* baseRedLiterals;
- int64_t* adjDeltas;
+ int64_t* gapVsPatchList_;
+ int64_t* zigzagLiterals_;
+ int64_t* baseRedLiterals_;
+ int64_t* adjDeltas_;
uint32_t getOpCode(EncodingType encoding);
int64_t* prepareForDirectOrPatchedBase(EncodingOption& option);
@@ -169,39 +171,39 @@ namespace orc {
unsigned char readByte();
void setBufStart(const char* start) {
- bufferStart = const_cast<char*>(start);
+ bufferStart_ = const_cast<char*>(start);
}
char* getBufStart() {
- return bufferStart;
+ return bufferStart_;
}
void setBufEnd(const char* end) {
- bufferEnd = const_cast<char*>(end);
+ bufferEnd_ = const_cast<char*>(end);
}
char* getBufEnd() {
- return bufferEnd;
+ return bufferEnd_;
}
uint64_t bufLength() {
- return bufferEnd - bufferStart;
+ return bufferEnd_ - bufferStart_;
}
void setBitsLeft(const uint32_t bits) {
- bitsLeft = bits;
+ bitsLeft_ = bits;
}
void setCurByte(const uint32_t byte) {
- curByte = byte;
+ curByte_ = byte;
}
uint32_t getBitsLeft() {
- return bitsLeft;
+ return bitsLeft_;
}
uint32_t getCurByte() {
- return curByte;
+ return curByte_;
}
/**
@@ -225,8 +227,8 @@ namespace orc {
int64_t* resPatch, uint64_t* patchIdx);
void resetReadLongs() {
- bitsLeft = 0;
- curByte = 0;
+ bitsLeft_ = 0;
+ curByte_ = 0;
}
void resetRun() {
@@ -249,17 +251,17 @@ namespace orc {
template <typename T>
uint64_t copyDataFromBuffer(T* data, uint64_t offset, uint64_t numValues, const char* notNull);
- const std::unique_ptr<SeekableInputStream> inputStream;
- const bool isSigned;
- unsigned char firstByte;
- char* bufferStart;
- char* bufferEnd;
- uint64_t runLength; // Length of the current run
- uint64_t runRead; // Number of returned values of the current run
- uint32_t bitsLeft; // Used by readLongs when bitSize < 8
- uint32_t curByte; // Used by anything that uses readLongs
- DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE
- DataBuffer<int64_t> literals; // Values of the current run
+ const std::unique_ptr<SeekableInputStream> inputStream_;
+ const bool isSigned_;
+ unsigned char firstByte_;
+ char* bufferStart_;
+ char* bufferEnd_;
+ uint64_t runLength_; // Length of the current run
+ uint64_t runRead_; // Number of returned values of the current run
+ uint32_t bitsLeft_; // Used by readLongs when bitSize < 8
+ uint32_t curByte_; // Used by anything that uses readLongs
+ DataBuffer<int64_t> unpackedPatch_; // Used by PATCHED_BASE
+ DataBuffer<int64_t> literals_; // Values of the current run
};
inline void RleDecoderV2::resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupByteLen) {
@@ -268,20 +270,20 @@ namespace orc {
const void* bufferPointer = nullptr;
if (backupByteLen != 0) {
- inputStream->BackUp(backupByteLen);
+ inputStream_->BackUp(backupByteLen);
}
if (len >= remainingLen && resetBuf) {
- if (!inputStream->Next(&bufferPointer, &bufferLength)) {
+ if (!inputStream_->Next(&bufferPointer, &bufferLength)) {
throw ParseError("bad read in RleDecoderV2::resetBufferStart");
}
}
if (bufferPointer == nullptr) {
- bufferStart += len;
+ bufferStart_ += len;
} else {
- bufferStart = const_cast<char*>(static_cast<const char*>(bufferPointer));
- bufferEnd = bufferStart + bufferLength;
+ bufferStart_ = const_cast<char*>(static_cast<const char*>(bufferPointer));
+ bufferEnd_ = bufferStart_ + bufferLength;
}
}
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/Reader.cc b/contrib/libs/apache/orc/c++/src/Reader.cc
index 82e77e4705..f47c40ebbe 100644
--- a/contrib/libs/apache/orc/c++/src/Reader.cc
+++ b/contrib/libs/apache/orc/c++/src/Reader.cc
@@ -73,11 +73,11 @@ namespace orc {
}
std::string ColumnSelector::toDotColumnPath() {
- if (columns.empty()) {
+ if (columns_.empty()) {
return std::string();
}
std::ostringstream columnStream;
- std::copy(columns.begin(), columns.end(),
+ std::copy(columns_.begin(), columns_.end(),
std::ostream_iterator<std::string>(columnStream, "."));
std::string columnPath = columnStream.str();
return columnPath.substr(0, columnPath.length() - 1);
@@ -150,15 +150,15 @@ namespace orc {
*/
void ColumnSelector::buildTypeNameIdMap(const Type* type) {
// map<type_id, Type*>
- idTypeMap[type->getColumnId()] = type;
+ idTypeMap_[type->getColumnId()] = type;
if (STRUCT == type->getKind()) {
for (size_t i = 0; i < type->getSubtypeCount(); ++i) {
const std::string& fieldName = type->getFieldName(i);
- columns.push_back(fieldName);
- nameIdMap[toDotColumnPath()] = type->getSubtype(i)->getColumnId();
+ columns_.push_back(fieldName);
+ nameIdMap_[toDotColumnPath()] = type->getSubtype(i)->getColumnId();
buildTypeNameIdMap(type->getSubtype(i));
- columns.pop_back();
+ columns_.pop_back();
}
} else {
// other non-primitive type
@@ -170,13 +170,13 @@ namespace orc {
void ColumnSelector::updateSelected(std::vector<bool>& selectedColumns,
const RowReaderOptions& options) {
- selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
- if (contents->schema->getKind() == STRUCT && options.getIndexesSet()) {
+ selectedColumns.assign(static_cast<size_t>(contents_->footer->types_size()), false);
+ if (contents_->schema->getKind() == STRUCT && options.getIndexesSet()) {
for (std::list<uint64_t>::const_iterator field = options.getInclude().begin();
field != options.getInclude().end(); ++field) {
updateSelectedByFieldId(selectedColumns, *field);
}
- } else if (contents->schema->getKind() == STRUCT && options.getNamesSet()) {
+ } else if (contents_->schema->getKind() == STRUCT && options.getNamesSet()) {
for (std::list<std::string>::const_iterator field = options.getIncludeNames().begin();
field != options.getIncludeNames().end(); ++field) {
updateSelectedByName(selectedColumns, *field);
@@ -191,18 +191,18 @@ namespace orc {
// default is to select all columns
std::fill(selectedColumns.begin(), selectedColumns.end(), true);
}
- selectParents(selectedColumns, *contents->schema.get());
+ selectParents(selectedColumns, *contents_->schema.get());
selectedColumns[0] = true; // column 0 is selected by default
}
void ColumnSelector::updateSelectedByFieldId(std::vector<bool>& selectedColumns,
uint64_t fieldId) {
- if (fieldId < contents->schema->getSubtypeCount()) {
- selectChildren(selectedColumns, *contents->schema->getSubtype(fieldId));
+ if (fieldId < contents_->schema->getSubtypeCount()) {
+ selectChildren(selectedColumns, *contents_->schema->getSubtype(fieldId));
} else {
std::stringstream buffer;
buffer << "Invalid column selected " << fieldId << " out of "
- << contents->schema->getSubtypeCount();
+ << contents_->schema->getSubtypeCount();
throw ParseError(buffer.str());
}
}
@@ -215,7 +215,7 @@ namespace orc {
std::vector<bool>& selectedColumns, uint64_t typeId,
const RowReaderOptions::IdReadIntentMap& idReadIntentMap) {
if (typeId < selectedColumns.size()) {
- const Type& type = *idTypeMap[typeId];
+ const Type& type = *idTypeMap_[typeId];
selectChildren(selectedColumns, type, idReadIntentMap);
} else {
std::stringstream buffer;
@@ -226,14 +226,14 @@ namespace orc {
void ColumnSelector::updateSelectedByName(std::vector<bool>& selectedColumns,
const std::string& fieldName) {
- std::map<std::string, uint64_t>::const_iterator ite = nameIdMap.find(fieldName);
- if (ite != nameIdMap.end()) {
+ std::map<std::string, uint64_t>::const_iterator ite = nameIdMap_.find(fieldName);
+ if (ite != nameIdMap_.end()) {
updateSelectedByTypeId(selectedColumns, ite->second);
} else {
bool first = true;
std::ostringstream ss;
ss << "Invalid column selected " << fieldName << ". Valid names are ";
- for (auto it = nameIdMap.begin(); it != nameIdMap.end(); ++it) {
+ for (auto it = nameIdMap_.begin(); it != nameIdMap_.end(); ++it) {
if (!first) ss << ", ";
ss << it->first;
first = false;
@@ -242,89 +242,88 @@ namespace orc {
}
}
- ColumnSelector::ColumnSelector(const FileContents* _contents) : contents(_contents) {
- buildTypeNameIdMap(contents->schema.get());
+ ColumnSelector::ColumnSelector(const FileContents* contents) : contents_(contents) {
+ buildTypeNameIdMap(contents_->schema.get());
}
- RowReaderImpl::RowReaderImpl(std::shared_ptr<FileContents> _contents,
- const RowReaderOptions& opts)
- : localTimezone(getLocalTimezone()),
- contents(_contents),
- throwOnHive11DecimalOverflow(opts.getThrowOnHive11DecimalOverflow()),
- forcedScaleOnHive11Decimal(opts.getForcedScaleOnHive11Decimal()),
- footer(contents->footer.get()),
- firstRowOfStripe(*contents->pool, 0),
- enableEncodedBlock(opts.getEnableLazyDecoding()),
- readerTimezone(getTimezoneByName(opts.getTimezoneName())),
- schemaEvolution(opts.getReadType(), contents->schema.get()) {
+ RowReaderImpl::RowReaderImpl(std::shared_ptr<FileContents> contents, const RowReaderOptions& opts)
+ : localTimezone_(getLocalTimezone()),
+ contents_(contents),
+ throwOnHive11DecimalOverflow_(opts.getThrowOnHive11DecimalOverflow()),
+ forcedScaleOnHive11Decimal_(opts.getForcedScaleOnHive11Decimal()),
+ footer_(contents_->footer.get()),
+ firstRowOfStripe_(*contents_->pool, 0),
+ enableEncodedBlock_(opts.getEnableLazyDecoding()),
+ readerTimezone_(getTimezoneByName(opts.getTimezoneName())),
+ schemaEvolution_(opts.getReadType(), contents_->schema.get()) {
uint64_t numberOfStripes;
- numberOfStripes = static_cast<uint64_t>(footer->stripes_size());
- currentStripe = numberOfStripes;
- lastStripe = 0;
- currentRowInStripe = 0;
- rowsInCurrentStripe = 0;
- numRowGroupsInStripeRange = 0;
- useTightNumericVector = opts.getUseTightNumericVector();
- throwOnSchemaEvolutionOverflow = opts.getThrowOnSchemaEvolutionOverflow();
+ numberOfStripes = static_cast<uint64_t>(footer_->stripes_size());
+ currentStripe_ = numberOfStripes;
+ lastStripe_ = 0;
+ currentRowInStripe_ = 0;
+ rowsInCurrentStripe_ = 0;
+ numRowGroupsInStripeRange_ = 0;
+ useTightNumericVector_ = opts.getUseTightNumericVector();
+ throwOnSchemaEvolutionOverflow_ = opts.getThrowOnSchemaEvolutionOverflow();
uint64_t rowTotal = 0;
- firstRowOfStripe.resize(numberOfStripes);
+ firstRowOfStripe_.resize(numberOfStripes);
for (size_t i = 0; i < numberOfStripes; ++i) {
- firstRowOfStripe[i] = rowTotal;
- proto::StripeInformation stripeInfo = footer->stripes(static_cast<int>(i));
+ firstRowOfStripe_[i] = rowTotal;
+ proto::StripeInformation stripeInfo = footer_->stripes(static_cast<int>(i));
rowTotal += stripeInfo.number_of_rows();
bool isStripeInRange = stripeInfo.offset() >= opts.getOffset() &&
stripeInfo.offset() < opts.getOffset() + opts.getLength();
if (isStripeInRange) {
- if (i < currentStripe) {
- currentStripe = i;
+ if (i < currentStripe_) {
+ currentStripe_ = i;
}
- if (i >= lastStripe) {
- lastStripe = i + 1;
+ if (i >= lastStripe_) {
+ lastStripe_ = i + 1;
}
- if (footer->row_index_stride() > 0) {
- numRowGroupsInStripeRange +=
- (stripeInfo.number_of_rows() + footer->row_index_stride() - 1) /
- footer->row_index_stride();
+ if (footer_->row_index_stride() > 0) {
+ numRowGroupsInStripeRange_ +=
+ (stripeInfo.number_of_rows() + footer_->row_index_stride() - 1) /
+ footer_->row_index_stride();
}
}
}
- firstStripe = currentStripe;
- processingStripe = lastStripe;
+ firstStripe_ = currentStripe_;
+ processingStripe_ = lastStripe_;
- if (currentStripe == 0) {
- previousRow = (std::numeric_limits<uint64_t>::max)();
- } else if (currentStripe == numberOfStripes) {
- previousRow = footer->number_of_rows();
+ if (currentStripe_ == 0) {
+ previousRow_ = (std::numeric_limits<uint64_t>::max)();
+ } else if (currentStripe_ == numberOfStripes) {
+ previousRow_ = footer_->number_of_rows();
} else {
- previousRow = firstRowOfStripe[firstStripe] - 1;
+ previousRow_ = firstRowOfStripe_[firstStripe_] - 1;
}
- ColumnSelector column_selector(contents.get());
- column_selector.updateSelected(selectedColumns, opts);
+ ColumnSelector column_selector(contents_.get());
+ column_selector.updateSelected(selectedColumns_, opts);
// prepare SargsApplier if SearchArgument is available
- if (opts.getSearchArgument() && footer->row_index_stride() > 0) {
- sargs = opts.getSearchArgument();
- sargsApplier.reset(
- new SargsApplier(*contents->schema, sargs.get(), footer->row_index_stride(),
- getWriterVersionImpl(_contents.get()), contents->readerMetrics));
+ if (opts.getSearchArgument() && footer_->row_index_stride() > 0) {
+ sargs_ = opts.getSearchArgument();
+ sargsApplier_.reset(
+ new SargsApplier(*contents_->schema, sargs_.get(), footer_->row_index_stride(),
+ getWriterVersionImpl(contents.get()), contents_->readerMetrics));
}
- skipBloomFilters = hasBadBloomFilters();
+ skipBloomFilters_ = hasBadBloomFilters();
}
// Check if the file has inconsistent bloom filters.
bool RowReaderImpl::hasBadBloomFilters() {
// Only C++ writer in old releases could have bad bloom filters.
- if (footer->writer() != ORC_CPP_WRITER) return false;
+ if (footer_->writer() != ORC_CPP_WRITER) return false;
// 'softwareVersion' is added in 1.5.13, 1.6.11, and 1.7.0.
// 1.6.x releases before 1.6.11 won't have it. On the other side, the C++ writer
// supports writing bloom filters since 1.6.0. So files written by the C++ writer
// and with 'softwareVersion' unset would have bad bloom filters.
- if (!footer->has_software_version()) return true;
+ if (!footer_->has_software_version()) return true;
- const std::string& fullVersion = footer->software_version();
+ const std::string& fullVersion = footer_->software_version();
std::string version;
// Deal with snapshot versions, e.g. 1.6.12-SNAPSHOT.
if (fullVersion.find('-') != std::string::npos) {
@@ -341,31 +340,31 @@ namespace orc {
}
CompressionKind RowReaderImpl::getCompression() const {
- return contents->compression;
+ return contents_->compression;
}
uint64_t RowReaderImpl::getCompressionSize() const {
- return contents->blockSize;
+ return contents_->blockSize;
}
const std::vector<bool> RowReaderImpl::getSelectedColumns() const {
- return selectedColumns;
+ return selectedColumns_;
}
const Type& RowReaderImpl::getSelectedType() const {
- if (selectedSchema.get() == nullptr) {
- selectedSchema = buildSelectedType(contents->schema.get(), selectedColumns);
+ if (selectedSchema_.get() == nullptr) {
+ selectedSchema_ = buildSelectedType(contents_->schema.get(), selectedColumns_);
}
- return *(selectedSchema.get());
+ return *(selectedSchema_.get());
}
uint64_t RowReaderImpl::getRowNumber() const {
- return previousRow;
+ return previousRow_;
}
void RowReaderImpl::seekToRow(uint64_t rowNumber) {
// Empty file
- if (lastStripe == 0) {
+ if (lastStripe_ == 0) {
return;
}
@@ -375,53 +374,53 @@ namespace orc {
// Implement this by setting previousRow to the number of rows in the file.
// seeking past lastStripe
- uint64_t num_stripes = static_cast<uint64_t>(footer->stripes_size());
- if ((lastStripe == num_stripes && rowNumber >= footer->number_of_rows()) ||
- (lastStripe < num_stripes && rowNumber >= firstRowOfStripe[lastStripe])) {
- currentStripe = num_stripes;
- previousRow = footer->number_of_rows();
+ uint64_t num_stripes = static_cast<uint64_t>(footer_->stripes_size());
+ if ((lastStripe_ == num_stripes && rowNumber >= footer_->number_of_rows()) ||
+ (lastStripe_ < num_stripes && rowNumber >= firstRowOfStripe_[lastStripe_])) {
+ currentStripe_ = num_stripes;
+ previousRow_ = footer_->number_of_rows();
return;
}
uint64_t seekToStripe = 0;
- while (seekToStripe + 1 < lastStripe && firstRowOfStripe[seekToStripe + 1] <= rowNumber) {
+ while (seekToStripe + 1 < lastStripe_ && firstRowOfStripe_[seekToStripe + 1] <= rowNumber) {
seekToStripe++;
}
// seeking before the first stripe
- if (seekToStripe < firstStripe) {
- currentStripe = num_stripes;
- previousRow = footer->number_of_rows();
+ if (seekToStripe < firstStripe_) {
+ currentStripe_ = num_stripes;
+ previousRow_ = footer_->number_of_rows();
return;
}
- previousRow = rowNumber;
- auto rowIndexStride = footer->row_index_stride();
- if (!isCurrentStripeInited() || currentStripe != seekToStripe || rowIndexStride == 0 ||
- currentStripeInfo.index_length() == 0) {
+ previousRow_ = rowNumber;
+ auto rowIndexStride = footer_->row_index_stride();
+ if (!isCurrentStripeInited() || currentStripe_ != seekToStripe || rowIndexStride == 0 ||
+ currentStripeInfo_.index_length() == 0) {
// current stripe is not initialized or
// target stripe is not current stripe or
// current stripe doesn't have row indexes
- currentStripe = seekToStripe;
- currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe];
+ currentStripe_ = seekToStripe;
+ currentRowInStripe_ = rowNumber - firstRowOfStripe_[currentStripe_];
startNextStripe();
- if (currentStripe >= lastStripe) {
+ if (currentStripe_ >= lastStripe_) {
return;
}
} else {
- currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe];
- if (sargsApplier) {
+ currentRowInStripe_ = rowNumber - firstRowOfStripe_[currentStripe_];
+ if (sargsApplier_) {
// advance to selected row group if predicate pushdown is enabled
- currentRowInStripe =
- advanceToNextRowGroup(currentRowInStripe, rowsInCurrentStripe,
- footer->row_index_stride(), sargsApplier->getNextSkippedRows());
+ currentRowInStripe_ =
+ advanceToNextRowGroup(currentRowInStripe_, rowsInCurrentStripe_,
+ footer_->row_index_stride(), sargsApplier_->getNextSkippedRows());
}
}
- uint64_t rowsToSkip = currentRowInStripe;
+ uint64_t rowsToSkip = currentRowInStripe_;
// seek to the target row group if row indexes exists
- if (rowIndexStride > 0 && currentStripeInfo.index_length() > 0) {
- if (rowIndexes.empty()) {
+ if (rowIndexStride > 0 && currentStripeInfo_.index_length() > 0) {
+ if (rowIndexes_.empty()) {
loadStripeIndex();
}
// TODO(ORC-1175): process the failures of loadStripeIndex() call
@@ -432,36 +431,36 @@ namespace orc {
// 'reader' is reset in startNextStripe(). It could be nullptr if 'rowsToSkip' is 0,
// e.g. when startNextStripe() skips all remaining rows of the file.
if (rowsToSkip > 0) {
- reader->skip(rowsToSkip);
+ reader_->skip(rowsToSkip);
}
}
void RowReaderImpl::loadStripeIndex() {
// reset all previous row indexes
- rowIndexes.clear();
- bloomFilterIndex.clear();
+ rowIndexes_.clear();
+ bloomFilterIndex_.clear();
// obtain row indexes for selected columns
- uint64_t offset = currentStripeInfo.offset();
- for (int i = 0; i < currentStripeFooter.streams_size(); ++i) {
- const proto::Stream& pbStream = currentStripeFooter.streams(i);
+ uint64_t offset = currentStripeInfo_.offset();
+ for (int i = 0; i < currentStripeFooter_.streams_size(); ++i) {
+ const proto::Stream& pbStream = currentStripeFooter_.streams(i);
uint64_t colId = pbStream.column();
- if (selectedColumns[colId] && pbStream.has_kind() &&
+ if (selectedColumns_[colId] && pbStream.has_kind() &&
(pbStream.kind() == proto::Stream_Kind_ROW_INDEX ||
pbStream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8)) {
std::unique_ptr<SeekableInputStream> inStream = createDecompressor(
getCompression(),
std::unique_ptr<SeekableInputStream>(new SeekableFileInputStream(
- contents->stream.get(), offset, pbStream.length(), *contents->pool)),
- getCompressionSize(), *contents->pool, contents->readerMetrics);
+ contents_->stream.get(), offset, pbStream.length(), *contents_->pool)),
+ getCompressionSize(), *contents_->pool, contents_->readerMetrics);
if (pbStream.kind() == proto::Stream_Kind_ROW_INDEX) {
proto::RowIndex rowIndex;
if (!rowIndex.ParseFromZeroCopyStream(inStream.get())) {
throw ParseError("Failed to parse the row index");
}
- rowIndexes[colId] = rowIndex;
- } else if (!skipBloomFilters) { // Stream_Kind_BLOOM_FILTER_UTF8
+ rowIndexes_[colId] = rowIndex;
+ } else if (!skipBloomFilters_) { // Stream_Kind_BLOOM_FILTER_UTF8
proto::BloomFilterIndex pbBFIndex;
if (!pbBFIndex.ParseFromZeroCopyStream(inStream.get())) {
throw ParseError("Failed to parse bloom filter index");
@@ -469,11 +468,11 @@ namespace orc {
BloomFilterIndex bfIndex;
for (int j = 0; j < pbBFIndex.bloom_filter_size(); j++) {
bfIndex.entries.push_back(BloomFilterUTF8Utils::deserialize(
- pbStream.kind(), currentStripeFooter.columns(static_cast<int>(pbStream.column())),
+ pbStream.kind(), currentStripeFooter_.columns(static_cast<int>(pbStream.column())),
pbBFIndex.bloom_filter(j)));
}
// add bloom filters to result for one column
- bloomFilterIndex[pbStream.column()] = bfIndex;
+ bloomFilterIndex_[pbStream.column()] = bfIndex;
}
}
offset += pbStream.length();
@@ -486,7 +485,7 @@ namespace orc {
// store position providers for selected colimns
std::unordered_map<uint64_t, PositionProvider> positionProviders;
- for (auto rowIndex = rowIndexes.cbegin(); rowIndex != rowIndexes.cend(); ++rowIndex) {
+ for (auto rowIndex = rowIndexes_.cbegin(); rowIndex != rowIndexes_.cend(); ++rowIndex) {
uint64_t colId = rowIndex->first;
const proto::RowIndexEntry& entry =
rowIndex->second.entry(static_cast<int32_t>(rowGroupEntryId));
@@ -500,23 +499,23 @@ namespace orc {
positionProviders.insert(std::make_pair(colId, PositionProvider(position)));
}
- reader->seekToRowGroup(positionProviders);
+ reader_->seekToRowGroup(positionProviders);
}
const FileContents& RowReaderImpl::getFileContents() const {
- return *contents;
+ return *contents_;
}
bool RowReaderImpl::getThrowOnHive11DecimalOverflow() const {
- return throwOnHive11DecimalOverflow;
+ return throwOnHive11DecimalOverflow_;
}
bool RowReaderImpl::getIsDecimalAsLong() const {
- return contents->isDecimalAsLong;
+ return contents_->isDecimalAsLong;
}
int32_t RowReaderImpl::getForcedScaleOnHive11Decimal() const {
- return forcedScaleOnHive11Decimal;
+ return forcedScaleOnHive11Decimal_;
}
proto::StripeFooter getStripeFooter(const proto::StripeInformation& info,
@@ -542,29 +541,29 @@ namespace orc {
return result;
}
- ReaderImpl::ReaderImpl(std::shared_ptr<FileContents> _contents, const ReaderOptions& opts,
- uint64_t _fileLength, uint64_t _postscriptLength)
- : contents(std::move(_contents)),
- options(opts),
- fileLength(_fileLength),
- postscriptLength(_postscriptLength),
- footer(contents->footer.get()) {
- isMetadataLoaded = false;
+ ReaderImpl::ReaderImpl(std::shared_ptr<FileContents> contents, const ReaderOptions& opts,
+ uint64_t fileLength, uint64_t postscriptLength)
+ : contents_(std::move(contents)),
+ options_(opts),
+ fileLength_(fileLength),
+ postscriptLength_(postscriptLength),
+ footer_(contents_->footer.get()) {
+ isMetadataLoaded_ = false;
checkOrcVersion();
- numberOfStripes = static_cast<uint64_t>(footer->stripes_size());
- contents->schema = convertType(footer->types(0), *footer);
- contents->blockSize = getCompressionBlockSize(*contents->postscript);
- contents->compression = convertCompressionKind(*contents->postscript);
+ numberOfStripes_ = static_cast<uint64_t>(footer_->stripes_size());
+ contents_->schema = convertType(footer_->types(0), *footer_);
+ contents_->blockSize = getCompressionBlockSize(*contents_->postscript);
+ contents_->compression = convertCompressionKind(*contents_->postscript);
}
std::string ReaderImpl::getSerializedFileTail() const {
proto::FileTail tail;
proto::PostScript* mutable_ps = tail.mutable_postscript();
- mutable_ps->CopyFrom(*contents->postscript);
+ mutable_ps->CopyFrom(*contents_->postscript);
proto::Footer* mutableFooter = tail.mutable_footer();
- mutableFooter->CopyFrom(*footer);
- tail.set_file_length(fileLength);
- tail.set_postscript_length(postscriptLength);
+ mutableFooter->CopyFrom(*footer_);
+ tail.set_file_length(fileLength_);
+ tail.set_postscript_length(postscriptLength_);
TProtobufString result;
if (!tail.SerializeToString(&result)) {
throw ParseError("Failed to serialize file tail");
@@ -573,56 +572,56 @@ namespace orc {
}
const ReaderOptions& ReaderImpl::getReaderOptions() const {
- return options;
+ return options_;
}
CompressionKind ReaderImpl::getCompression() const {
- return contents->compression;
+ return contents_->compression;
}
uint64_t ReaderImpl::getCompressionSize() const {
- return contents->blockSize;
+ return contents_->blockSize;
}
uint64_t ReaderImpl::getNumberOfStripes() const {
- return numberOfStripes;
+ return numberOfStripes_;
}
uint64_t ReaderImpl::getNumberOfStripeStatistics() const {
- if (!isMetadataLoaded) {
+ if (!isMetadataLoaded_) {
readMetadata();
}
- return contents->metadata == nullptr
+ return contents_->metadata == nullptr
? 0
- : static_cast<uint64_t>(contents->metadata->stripe_stats_size());
+ : static_cast<uint64_t>(contents_->metadata->stripe_stats_size());
}
std::unique_ptr<StripeInformation> ReaderImpl::getStripe(uint64_t stripeIndex) const {
if (stripeIndex > getNumberOfStripes()) {
throw std::logic_error("stripe index out of range");
}
- proto::StripeInformation stripeInfo = footer->stripes(static_cast<int>(stripeIndex));
+ proto::StripeInformation stripeInfo = footer_->stripes(static_cast<int>(stripeIndex));
return std::unique_ptr<StripeInformation>(new StripeInformationImpl(
stripeInfo.offset(), stripeInfo.index_length(), stripeInfo.data_length(),
- stripeInfo.footer_length(), stripeInfo.number_of_rows(), contents->stream.get(),
- *contents->pool, contents->compression, contents->blockSize, contents->readerMetrics));
+ stripeInfo.footer_length(), stripeInfo.number_of_rows(), contents_->stream.get(),
+ *contents_->pool, contents_->compression, contents_->blockSize, contents_->readerMetrics));
}
FileVersion ReaderImpl::getFormatVersion() const {
- if (contents->postscript->version_size() != 2) {
+ if (contents_->postscript->version_size() != 2) {
return FileVersion::v_0_11();
}
- return {contents->postscript->version(0), contents->postscript->version(1)};
+ return {contents_->postscript->version(0), contents_->postscript->version(1)};
}
uint64_t ReaderImpl::getNumberOfRows() const {
- return footer->number_of_rows();
+ return footer_->number_of_rows();
}
WriterId ReaderImpl::getWriterId() const {
- if (footer->has_writer()) {
- uint32_t id = footer->writer();
+ if (footer_->has_writer()) {
+ uint32_t id = footer_->writer();
if (id > WriterId::CUDF_WRITER) {
return WriterId::UNKNOWN_WRITER;
} else {
@@ -633,8 +632,8 @@ namespace orc {
}
uint32_t ReaderImpl::getWriterIdValue() const {
- if (footer->has_writer()) {
- return footer->writer();
+ if (footer_->has_writer()) {
+ return footer_->writer();
} else {
return WriterId::ORC_JAVA_WRITER;
}
@@ -643,56 +642,56 @@ namespace orc {
std::string ReaderImpl::getSoftwareVersion() const {
std::ostringstream buffer;
buffer << writerIdToString(getWriterIdValue());
- if (footer->has_software_version()) {
- buffer << " " << footer->software_version();
+ if (footer_->has_software_version()) {
+ buffer << " " << footer_->software_version();
}
return buffer.str();
}
WriterVersion ReaderImpl::getWriterVersion() const {
- return getWriterVersionImpl(contents.get());
+ return getWriterVersionImpl(contents_.get());
}
uint64_t ReaderImpl::getContentLength() const {
- return footer->content_length();
+ return footer_->content_length();
}
uint64_t ReaderImpl::getStripeStatisticsLength() const {
- return contents->postscript->metadata_length();
+ return contents_->postscript->metadata_length();
}
uint64_t ReaderImpl::getFileFooterLength() const {
- return contents->postscript->footer_length();
+ return contents_->postscript->footer_length();
}
uint64_t ReaderImpl::getFilePostscriptLength() const {
- return postscriptLength;
+ return postscriptLength_;
}
uint64_t ReaderImpl::getFileLength() const {
- return fileLength;
+ return fileLength_;
}
uint64_t ReaderImpl::getRowIndexStride() const {
- return footer->row_index_stride();
+ return footer_->row_index_stride();
}
const std::string& ReaderImpl::getStreamName() const {
- return contents->stream->getName();
+ return contents_->stream->getName();
}
std::list<std::string> ReaderImpl::getMetadataKeys() const {
std::list<std::string> result;
- for (int i = 0; i < footer->metadata_size(); ++i) {
- result.push_back(footer->metadata(i).name());
+ for (int i = 0; i < footer_->metadata_size(); ++i) {
+ result.push_back(footer_->metadata(i).name());
}
return result;
}
std::string ReaderImpl::getMetadataValue(const std::string& key) const {
- for (int i = 0; i < footer->metadata_size(); ++i) {
- if (footer->metadata(i).name() == key) {
- return footer->metadata(i).value();
+ for (int i = 0; i < footer_->metadata_size(); ++i) {
+ if (footer_->metadata(i).name() == key) {
+ return footer_->metadata(i).value();
}
}
throw std::range_error("key not found");
@@ -719,10 +718,10 @@ namespace orc {
throw ParseError(msg.str());
}
std::unique_ptr<SeekableInputStream> pbStream =
- createDecompressor(contents->compression,
+ createDecompressor(contents_->compression,
std::unique_ptr<SeekableInputStream>(new SeekableFileInputStream(
- contents->stream.get(), offset, length, *contents->pool)),
- contents->blockSize, *(contents->pool), contents->readerMetrics);
+ contents_->stream.get(), offset, length, *contents_->pool)),
+ contents_->blockSize, *(contents_->pool), contents_->readerMetrics);
proto::RowIndex rowIndex;
if (!rowIndex.ParseFromZeroCopyStream(pbStream.get())) {
@@ -740,8 +739,8 @@ namespace orc {
}
bool ReaderImpl::hasMetadataValue(const std::string& key) const {
- for (int i = 0; i < footer->metadata_size(); ++i) {
- if (footer->metadata(i).name() == key) {
+ for (int i = 0; i < footer_->metadata_size(); ++i) {
+ if (footer_->metadata(i).name() == key) {
return true;
}
}
@@ -749,22 +748,22 @@ namespace orc {
}
const Type& ReaderImpl::getType() const {
- return *(contents->schema.get());
+ return *(contents_->schema.get());
}
std::unique_ptr<StripeStatistics> ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const {
- if (!isMetadataLoaded) {
+ if (!isMetadataLoaded_) {
readMetadata();
}
- if (contents->metadata == nullptr) {
+ if (contents_->metadata == nullptr) {
throw std::logic_error("No stripe statistics in file");
}
size_t num_cols = static_cast<size_t>(
- contents->metadata->stripe_stats(static_cast<int>(stripeIndex)).col_stats_size());
+ contents_->metadata->stripe_stats(static_cast<int>(stripeIndex)).col_stats_size());
std::vector<std::vector<proto::ColumnStatistics>> indexStats(num_cols);
- proto::StripeInformation currentStripeInfo = footer->stripes(static_cast<int>(stripeIndex));
- proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get());
+ proto::StripeInformation currentStripeInfo = footer_->stripes(static_cast<int>(stripeIndex));
+ proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents_.get());
getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats);
@@ -773,47 +772,47 @@ namespace orc {
: getLocalTimezone();
StatContext statContext(hasCorrectStatistics(), &writerTZ);
return std::make_unique<StripeStatisticsImpl>(
- contents->metadata->stripe_stats(static_cast<int>(stripeIndex)), indexStats, statContext);
+ contents_->metadata->stripe_stats(static_cast<int>(stripeIndex)), indexStats, statContext);
}
std::unique_ptr<Statistics> ReaderImpl::getStatistics() const {
StatContext statContext(hasCorrectStatistics());
- return std::make_unique<StatisticsImpl>(*footer, statContext);
+ return std::make_unique<StatisticsImpl>(*footer_, statContext);
}
std::unique_ptr<ColumnStatistics> ReaderImpl::getColumnStatistics(uint32_t index) const {
- if (index >= static_cast<uint64_t>(footer->statistics_size())) {
+ if (index >= static_cast<uint64_t>(footer_->statistics_size())) {
throw std::logic_error("column index out of range");
}
- proto::ColumnStatistics col = footer->statistics(static_cast<int32_t>(index));
+ proto::ColumnStatistics col = footer_->statistics(static_cast<int32_t>(index));
StatContext statContext(hasCorrectStatistics());
return std::unique_ptr<ColumnStatistics>(convertColumnStatistics(col, statContext));
}
void ReaderImpl::readMetadata() const {
- uint64_t metadataSize = contents->postscript->metadata_length();
- uint64_t footerLength = contents->postscript->footer_length();
- if (fileLength < metadataSize + footerLength + postscriptLength + 1) {
+ uint64_t metadataSize = contents_->postscript->metadata_length();
+ uint64_t footerLength = contents_->postscript->footer_length();
+ if (fileLength_ < metadataSize + footerLength + postscriptLength_ + 1) {
std::stringstream msg;
- msg << "Invalid Metadata length: fileLength=" << fileLength
+ msg << "Invalid Metadata length: fileLength=" << fileLength_
<< ", metadataLength=" << metadataSize << ", footerLength=" << footerLength
- << ", postscriptLength=" << postscriptLength;
+ << ", postscriptLength=" << postscriptLength_;
throw ParseError(msg.str());
}
- uint64_t metadataStart = fileLength - metadataSize - footerLength - postscriptLength - 1;
+ uint64_t metadataStart = fileLength_ - metadataSize - footerLength - postscriptLength_ - 1;
if (metadataSize != 0) {
std::unique_ptr<SeekableInputStream> pbStream = createDecompressor(
- contents->compression,
- std::make_unique<SeekableFileInputStream>(contents->stream.get(), metadataStart,
- metadataSize, *contents->pool),
- contents->blockSize, *contents->pool, contents->readerMetrics);
- contents->metadata.reset(new proto::Metadata());
- if (!contents->metadata->ParseFromZeroCopyStream(pbStream.get())) {
+ contents_->compression,
+ std::make_unique<SeekableFileInputStream>(contents_->stream.get(), metadataStart,
+ metadataSize, *contents_->pool),
+ contents_->blockSize, *contents_->pool, contents_->readerMetrics);
+ contents_->metadata.reset(new proto::Metadata());
+ if (!contents_->metadata->ParseFromZeroCopyStream(pbStream.get())) {
throw ParseError("Failed to parse the metadata");
}
}
- isMetadataLoaded = true;
+ isMetadataLoaded_ = true;
}
bool ReaderImpl::hasCorrectStatistics() const {
@@ -823,9 +822,9 @@ namespace orc {
void ReaderImpl::checkOrcVersion() {
FileVersion version = getFormatVersion();
if (version != FileVersion(0, 11) && version != FileVersion(0, 12)) {
- *(options.getErrorStream()) << "Warning: ORC file " << contents->stream->getName()
- << " was written in an unknown format version "
- << version.toString() << "\n";
+ *(options_.getErrorStream())
+ << "Warning: ORC file " << contents_->stream->getName()
+ << " was written in an unknown format version " << version.toString() << "\n";
}
}
@@ -835,11 +834,11 @@ namespace orc {
}
std::unique_ptr<RowReader> ReaderImpl::createRowReader(const RowReaderOptions& opts) const {
- if (opts.getSearchArgument() && !isMetadataLoaded) {
+ if (opts.getSearchArgument() && !isMetadataLoaded_) {
// load stripe statistics for PPD
readMetadata();
}
- return std::make_unique<RowReaderImpl>(contents, opts);
+ return std::make_unique<RowReaderImpl>(contents_, opts);
}
uint64_t maxStreamsForType(const proto::Type& type) {
@@ -874,15 +873,15 @@ namespace orc {
uint64_t ReaderImpl::getMemoryUse(int stripeIx) {
std::vector<bool> selectedColumns;
- selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), true);
+ selectedColumns.assign(static_cast<size_t>(contents_->footer->types_size()), true);
return getMemoryUse(stripeIx, selectedColumns);
}
uint64_t ReaderImpl::getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx) {
std::vector<bool> selectedColumns;
- selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
- ColumnSelector column_selector(contents.get());
- if (contents->schema->getKind() == STRUCT && include.begin() != include.end()) {
+ selectedColumns.assign(static_cast<size_t>(contents_->footer->types_size()), false);
+ ColumnSelector column_selector(contents_.get());
+ if (contents_->schema->getKind() == STRUCT && include.begin() != include.end()) {
for (std::list<uint64_t>::const_iterator field = include.begin(); field != include.end();
++field) {
column_selector.updateSelectedByFieldId(selectedColumns, *field);
@@ -891,16 +890,16 @@ namespace orc {
// default is to select all columns
std::fill(selectedColumns.begin(), selectedColumns.end(), true);
}
- column_selector.selectParents(selectedColumns, *contents->schema.get());
+ column_selector.selectParents(selectedColumns, *contents_->schema.get());
selectedColumns[0] = true; // column 0 is selected by default
return getMemoryUse(stripeIx, selectedColumns);
}
uint64_t ReaderImpl::getMemoryUseByName(const std::list<std::string>& names, int stripeIx) {
std::vector<bool> selectedColumns;
- selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
- ColumnSelector column_selector(contents.get());
- if (contents->schema->getKind() == STRUCT && names.begin() != names.end()) {
+ selectedColumns.assign(static_cast<size_t>(contents_->footer->types_size()), false);
+ ColumnSelector column_selector(contents_.get());
+ if (contents_->schema->getKind() == STRUCT && names.begin() != names.end()) {
for (std::list<std::string>::const_iterator field = names.begin(); field != names.end();
++field) {
column_selector.updateSelectedByName(selectedColumns, *field);
@@ -909,15 +908,15 @@ namespace orc {
// default is to select all columns
std::fill(selectedColumns.begin(), selectedColumns.end(), true);
}
- column_selector.selectParents(selectedColumns, *contents->schema.get());
+ column_selector.selectParents(selectedColumns, *contents_->schema.get());
selectedColumns[0] = true; // column 0 is selected by default
return getMemoryUse(stripeIx, selectedColumns);
}
uint64_t ReaderImpl::getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx) {
std::vector<bool> selectedColumns;
- selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
- ColumnSelector column_selector(contents.get());
+ selectedColumns.assign(static_cast<size_t>(contents_->footer->types_size()), false);
+ ColumnSelector column_selector(contents_.get());
if (include.begin() != include.end()) {
for (std::list<uint64_t>::const_iterator field = include.begin(); field != include.end();
++field) {
@@ -927,7 +926,7 @@ namespace orc {
// default is to select all columns
std::fill(selectedColumns.begin(), selectedColumns.end(), true);
}
- column_selector.selectParents(selectedColumns, *contents->schema.get());
+ column_selector.selectParents(selectedColumns, *contents_->schema.get());
selectedColumns[0] = true; // column 0 is selected by default
return getMemoryUse(stripeIx, selectedColumns);
}
@@ -935,14 +934,14 @@ namespace orc {
uint64_t ReaderImpl::getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns) {
uint64_t maxDataLength = 0;
- if (stripeIx >= 0 && stripeIx < footer->stripes_size()) {
- uint64_t stripe = footer->stripes(stripeIx).data_length();
+ if (stripeIx >= 0 && stripeIx < footer_->stripes_size()) {
+ uint64_t stripe = footer_->stripes(stripeIx).data_length();
if (maxDataLength < stripe) {
maxDataLength = stripe;
}
} else {
- for (int i = 0; i < footer->stripes_size(); i++) {
- uint64_t stripe = footer->stripes(i).data_length();
+ for (int i = 0; i < footer_->stripes_size(); i++) {
+ uint64_t stripe = footer_->stripes(i).data_length();
if (maxDataLength < stripe) {
maxDataLength = stripe;
}
@@ -951,9 +950,9 @@ namespace orc {
bool hasStringColumn = false;
uint64_t nSelectedStreams = 0;
- for (int i = 0; !hasStringColumn && i < footer->types_size(); i++) {
+ for (int i = 0; !hasStringColumn && i < footer_->types_size(); i++) {
if (selectedColumns[static_cast<size_t>(i)]) {
- const proto::Type& type = footer->types(i);
+ const proto::Type& type = footer_->types(i);
nSelectedStreams += maxStreamsForType(type);
switch (static_cast<int64_t>(type.kind())) {
case proto::Type_Kind_CHAR:
@@ -979,29 +978,29 @@ namespace orc {
uint64_t memory = hasStringColumn
? 2 * maxDataLength
: std::min(uint64_t(maxDataLength),
- nSelectedStreams * contents->stream->getNaturalReadSize());
+ nSelectedStreams * contents_->stream->getNaturalReadSize());
// Do we need even more memory to read the footer or the metadata?
- if (memory < contents->postscript->footer_length() + DIRECTORY_SIZE_GUESS) {
- memory = contents->postscript->footer_length() + DIRECTORY_SIZE_GUESS;
+ if (memory < contents_->postscript->footer_length() + DIRECTORY_SIZE_GUESS) {
+ memory = contents_->postscript->footer_length() + DIRECTORY_SIZE_GUESS;
}
- if (memory < contents->postscript->metadata_length()) {
- memory = contents->postscript->metadata_length();
+ if (memory < contents_->postscript->metadata_length()) {
+ memory = contents_->postscript->metadata_length();
}
// Account for firstRowOfStripe.
- memory += static_cast<uint64_t>(footer->stripes_size()) * sizeof(uint64_t);
+ memory += static_cast<uint64_t>(footer_->stripes_size()) * sizeof(uint64_t);
// Decompressors need buffers for each stream
uint64_t decompressorMemory = 0;
- if (contents->compression != CompressionKind_NONE) {
- for (int i = 0; i < footer->types_size(); i++) {
+ if (contents_->compression != CompressionKind_NONE) {
+ for (int i = 0; i < footer_->types_size(); i++) {
if (selectedColumns[static_cast<size_t>(i)]) {
- const proto::Type& type = footer->types(i);
- decompressorMemory += maxStreamsForType(type) * contents->blockSize;
+ const proto::Type& type = footer_->types(i);
+ decompressorMemory += maxStreamsForType(type) * contents_->blockSize;
}
}
- if (contents->compression == CompressionKind_SNAPPY) {
+ if (contents_->compression == CompressionKind_SNAPPY) {
decompressorMemory *= 2; // Snappy decompressor uses a second buffer
}
}
@@ -1011,101 +1010,104 @@ namespace orc {
// Update fields to indicate we've reached the end of file
void RowReaderImpl::markEndOfFile() {
- currentStripe = lastStripe;
- currentRowInStripe = 0;
- rowsInCurrentStripe = 0;
- if (lastStripe == 0) {
+ currentStripe_ = lastStripe_;
+ currentRowInStripe_ = 0;
+ rowsInCurrentStripe_ = 0;
+ if (lastStripe_ == 0) {
// Empty file
- previousRow = 0;
+ previousRow_ = 0;
} else {
- previousRow = firstRowOfStripe[lastStripe - 1] +
- footer->stripes(static_cast<int>(lastStripe - 1)).number_of_rows();
+ previousRow_ = firstRowOfStripe_[lastStripe_ - 1] +
+ footer_->stripes(static_cast<int>(lastStripe_ - 1)).number_of_rows();
}
}
void RowReaderImpl::startNextStripe() {
- reader.reset(); // ColumnReaders use lots of memory; free old memory first
- rowIndexes.clear();
- bloomFilterIndex.clear();
+ reader_.reset(); // ColumnReaders use lots of memory; free old memory first
+ rowIndexes_.clear();
+ bloomFilterIndex_.clear();
// evaluate file statistics if it exists
- if (sargsApplier && !sargsApplier->evaluateFileStatistics(*footer, numRowGroupsInStripeRange)) {
+ if (sargsApplier_ &&
+ !sargsApplier_->evaluateFileStatistics(*footer_, numRowGroupsInStripeRange_)) {
// skip the entire file
markEndOfFile();
return;
}
do {
- currentStripeInfo = footer->stripes(static_cast<int>(currentStripe));
- uint64_t fileLength = contents->stream->getLength();
- if (currentStripeInfo.offset() + currentStripeInfo.index_length() +
- currentStripeInfo.data_length() + currentStripeInfo.footer_length() >=
+ currentStripeInfo_ = footer_->stripes(static_cast<int>(currentStripe_));
+ uint64_t fileLength = contents_->stream->getLength();
+ if (currentStripeInfo_.offset() + currentStripeInfo_.index_length() +
+ currentStripeInfo_.data_length() + currentStripeInfo_.footer_length() >=
fileLength) {
std::stringstream msg;
- msg << "Malformed StripeInformation at stripe index " << currentStripe
+ msg << "Malformed StripeInformation at stripe index " << currentStripe_
<< ": fileLength=" << fileLength
- << ", StripeInfo=(offset=" << currentStripeInfo.offset()
- << ", indexLength=" << currentStripeInfo.index_length()
- << ", dataLength=" << currentStripeInfo.data_length()
- << ", footerLength=" << currentStripeInfo.footer_length() << ")";
+ << ", StripeInfo=(offset=" << currentStripeInfo_.offset()
+ << ", indexLength=" << currentStripeInfo_.index_length()
+ << ", dataLength=" << currentStripeInfo_.data_length()
+ << ", footerLength=" << currentStripeInfo_.footer_length() << ")";
throw ParseError(msg.str());
}
- currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get());
- rowsInCurrentStripe = currentStripeInfo.number_of_rows();
- processingStripe = currentStripe;
-
- if (sargsApplier) {
- bool isStripeNeeded = true;
- if (contents->metadata) {
- const auto& currentStripeStats =
- contents->metadata->stripe_stats(static_cast<int>(currentStripe));
- // skip this stripe after stats fail to satisfy sargs
- uint64_t stripeRowGroupCount =
- (rowsInCurrentStripe + footer->row_index_stride() - 1) / footer->row_index_stride();
- isStripeNeeded =
- sargsApplier->evaluateStripeStatistics(currentStripeStats, stripeRowGroupCount);
- }
+ rowsInCurrentStripe_ = currentStripeInfo_.number_of_rows();
+ processingStripe_ = currentStripe_;
+
+ bool isStripeNeeded = true;
+ // If PPD is enabled and stripe stats exist, evaluate them first
+ if (sargsApplier_ && contents_->metadata) {
+ const auto& currentStripeStats =
+ contents_->metadata->stripe_stats(static_cast<int>(currentStripe_));
+ // skip this stripe after stats fail to satisfy sargs
+ uint64_t stripeRowGroupCount =
+ (rowsInCurrentStripe_ + footer_->row_index_stride() - 1) / footer_->row_index_stride();
+ isStripeNeeded =
+ sargsApplier_->evaluateStripeStatistics(currentStripeStats, stripeRowGroupCount);
+ }
- if (isStripeNeeded) {
+ if (isStripeNeeded) {
+ currentStripeFooter_ = getStripeFooter(currentStripeInfo_, *contents_.get());
+ if (sargsApplier_) {
// read row group statistics and bloom filters of current stripe
loadStripeIndex();
// select row groups to read in the current stripe
- sargsApplier->pickRowGroups(rowsInCurrentStripe, rowIndexes, bloomFilterIndex);
- if (sargsApplier->hasSelectedFrom(currentRowInStripe)) {
+ sargsApplier_->pickRowGroups(rowsInCurrentStripe_, rowIndexes_, bloomFilterIndex_);
+ if (sargsApplier_->hasSelectedFrom(currentRowInStripe_)) {
// current stripe has at least one row group matching the predicate
break;
}
isStripeNeeded = false;
}
- if (!isStripeNeeded) {
- // advance to next stripe when current stripe has no matching rows
- currentStripe += 1;
- currentRowInStripe = 0;
- }
}
- } while (sargsApplier && currentStripe < lastStripe);
- if (currentStripe < lastStripe) {
+ if (!isStripeNeeded) {
+ // advance to next stripe when current stripe has no matching rows
+ currentStripe_ += 1;
+ currentRowInStripe_ = 0;
+ }
+ } while (sargsApplier_ && currentStripe_ < lastStripe_);
+
+ if (currentStripe_ < lastStripe_) {
// get writer timezone info from stripe footer to help understand timestamp values.
const Timezone& writerTimezone =
- currentStripeFooter.has_writer_timezone()
- ? getTimezoneByName(currentStripeFooter.writer_timezone())
- : localTimezone;
- StripeStreamsImpl stripeStreams(*this, currentStripe, currentStripeInfo, currentStripeFooter,
- currentStripeInfo.offset(), *contents->stream, writerTimezone,
- readerTimezone);
- reader = buildReader(*contents->schema, stripeStreams, useTightNumericVector,
- throwOnSchemaEvolutionOverflow, /*convertToReadType=*/true);
-
- if (sargsApplier) {
+ currentStripeFooter_.has_writer_timezone()
+ ? getTimezoneByName(currentStripeFooter_.writer_timezone())
+ : localTimezone_;
+ StripeStreamsImpl stripeStreams(*this, currentStripe_, currentStripeInfo_,
+ currentStripeFooter_, currentStripeInfo_.offset(),
+ *contents_->stream, writerTimezone, readerTimezone_);
+ reader_ = buildReader(*contents_->schema, stripeStreams, useTightNumericVector_,
+ throwOnSchemaEvolutionOverflow_, /*convertToReadType=*/true);
+
+ if (sargsApplier_) {
// move to the 1st selected row group when PPD is enabled.
- currentRowInStripe =
- advanceToNextRowGroup(currentRowInStripe, rowsInCurrentStripe,
- footer->row_index_stride(), sargsApplier->getNextSkippedRows());
- previousRow = firstRowOfStripe[currentStripe] + currentRowInStripe - 1;
- if (currentRowInStripe > 0) {
- seekToRowGroup(static_cast<uint32_t>(currentRowInStripe / footer->row_index_stride()));
+ currentRowInStripe_ =
+ advanceToNextRowGroup(currentRowInStripe_, rowsInCurrentStripe_,
+ footer_->row_index_stride(), sargsApplier_->getNextSkippedRows());
+ previousRow_ = firstRowOfStripe_[currentStripe_] + currentRowInStripe_ - 1;
+ if (currentRowInStripe_ > 0) {
+ seekToRowGroup(static_cast<uint32_t>(currentRowInStripe_ / footer_->row_index_stride()));
}
}
} else {
@@ -1115,52 +1117,53 @@ namespace orc {
}
bool RowReaderImpl::next(ColumnVectorBatch& data) {
- SCOPED_STOPWATCH(contents->readerMetrics, ReaderInclusiveLatencyUs, ReaderCall);
- if (currentStripe >= lastStripe) {
+ SCOPED_STOPWATCH(contents_->readerMetrics, ReaderInclusiveLatencyUs, ReaderCall);
+ if (currentStripe_ >= lastStripe_) {
data.numElements = 0;
markEndOfFile();
return false;
}
- if (currentRowInStripe == 0) {
+ if (currentRowInStripe_ == 0) {
startNextStripe();
}
uint64_t rowsToRead =
- std::min(static_cast<uint64_t>(data.capacity), rowsInCurrentStripe - currentRowInStripe);
- if (sargsApplier && rowsToRead > 0) {
- rowsToRead = computeBatchSize(rowsToRead, currentRowInStripe, rowsInCurrentStripe,
- footer->row_index_stride(), sargsApplier->getNextSkippedRows());
+ std::min(static_cast<uint64_t>(data.capacity), rowsInCurrentStripe_ - currentRowInStripe_);
+ if (sargsApplier_ && rowsToRead > 0) {
+ rowsToRead =
+ computeBatchSize(rowsToRead, currentRowInStripe_, rowsInCurrentStripe_,
+ footer_->row_index_stride(), sargsApplier_->getNextSkippedRows());
}
data.numElements = rowsToRead;
if (rowsToRead == 0) {
markEndOfFile();
return false;
}
- if (enableEncodedBlock) {
- reader->nextEncoded(data, rowsToRead, nullptr);
+ if (enableEncodedBlock_) {
+ reader_->nextEncoded(data, rowsToRead, nullptr);
} else {
- reader->next(data, rowsToRead, nullptr);
+ reader_->next(data, rowsToRead, nullptr);
}
// update row number
- previousRow = firstRowOfStripe[currentStripe] + currentRowInStripe;
- currentRowInStripe += rowsToRead;
+ previousRow_ = firstRowOfStripe_[currentStripe_] + currentRowInStripe_;
+ currentRowInStripe_ += rowsToRead;
// check if we need to advance to next selected row group
- if (sargsApplier) {
+ if (sargsApplier_) {
uint64_t nextRowToRead =
- advanceToNextRowGroup(currentRowInStripe, rowsInCurrentStripe, footer->row_index_stride(),
- sargsApplier->getNextSkippedRows());
- if (currentRowInStripe != nextRowToRead) {
+ advanceToNextRowGroup(currentRowInStripe_, rowsInCurrentStripe_,
+ footer_->row_index_stride(), sargsApplier_->getNextSkippedRows());
+ if (currentRowInStripe_ != nextRowToRead) {
// it is guaranteed to be at start of a row group
- currentRowInStripe = nextRowToRead;
- if (currentRowInStripe < rowsInCurrentStripe) {
- seekToRowGroup(static_cast<uint32_t>(currentRowInStripe / footer->row_index_stride()));
+ currentRowInStripe_ = nextRowToRead;
+ if (currentRowInStripe_ < rowsInCurrentStripe_) {
+ seekToRowGroup(static_cast<uint32_t>(currentRowInStripe_ / footer_->row_index_stride()));
}
}
}
- if (currentRowInStripe >= rowsInCurrentStripe) {
- currentStripe += 1;
- currentRowInStripe = 0;
+ if (currentRowInStripe_ >= rowsInCurrentStripe_) {
+ currentStripe_ += 1;
+ currentRowInStripe_ = 0;
}
return rowsToRead != 0;
}
@@ -1219,9 +1222,9 @@ namespace orc {
std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch(uint64_t capacity) const {
// If the read type is specified, then check that the selected schema matches the read type
// on the first call to createRowBatch.
- if (schemaEvolution.getReadType() && selectedSchema.get() == nullptr) {
+ if (schemaEvolution_.getReadType() && selectedSchema_.get() == nullptr) {
auto fileSchema = &getSelectedType();
- auto readType = schemaEvolution.getReadType();
+ auto readType = schemaEvolution_.getReadType();
std::set<uint64_t> readColumns, fileColumns;
getColumnIds(readType, readColumns);
getColumnIds(fileSchema, fileColumns);
@@ -1233,9 +1236,9 @@ namespace orc {
}
}
const Type& readType =
- schemaEvolution.getReadType() ? *schemaEvolution.getReadType() : getSelectedType();
- return readType.createRowBatch(capacity, *contents->pool, enableEncodedBlock,
- useTightNumericVector);
+ schemaEvolution_.getReadType() ? *schemaEvolution_.getReadType() : getSelectedType();
+ return readType.createRowBatch(capacity, *contents_->pool, enableEncodedBlock_,
+ useTightNumericVector_);
}
void ensureOrcFooter(InputStream* stream, DataBuffer<char>* buffer, uint64_t postscriptLength) {
@@ -1423,17 +1426,10 @@ namespace orc {
uint32_t stripeIndex, const std::set<uint32_t>& included) const {
std::map<uint32_t, BloomFilterIndex> ret;
- // find stripe info
- if (stripeIndex >= static_cast<uint32_t>(footer->stripes_size())) {
- throw std::logic_error("Illegal stripe index: " +
- to_string(static_cast<int64_t>(stripeIndex)));
- }
- const proto::StripeInformation currentStripeInfo =
- footer->stripes(static_cast<int>(stripeIndex));
- const proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents);
+ uint64_t offset;
+ auto currentStripeFooter = loadCurrentStripeFooter(stripeIndex, offset);
// iterate stripe footer to get stream of bloom_filter
- uint64_t offset = static_cast<uint64_t>(currentStripeInfo.offset());
for (int i = 0; i < currentStripeFooter.streams_size(); i++) {
const proto::Stream& stream = currentStripeFooter.streams(i);
uint32_t column = static_cast<uint32_t>(stream.column());
@@ -1443,10 +1439,10 @@ namespace orc {
if (stream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8 &&
(included.empty() || included.find(column) != included.end())) {
std::unique_ptr<SeekableInputStream> pbStream =
- createDecompressor(contents->compression,
+ createDecompressor(contents_->compression,
std::make_unique<SeekableFileInputStream>(
- contents->stream.get(), offset, length, *contents->pool),
- contents->blockSize, *(contents->pool), contents->readerMetrics);
+ contents_->stream.get(), offset, length, *contents_->pool),
+ contents_->blockSize, *(contents_->pool), contents_->readerMetrics);
proto::BloomFilterIndex pbBFIndex;
if (!pbBFIndex.ParseFromZeroCopyStream(pbStream.get())) {
@@ -1471,6 +1467,150 @@ namespace orc {
return ret;
}
+ proto::StripeFooter ReaderImpl::loadCurrentStripeFooter(uint32_t stripeIndex,
+ uint64_t& offset) const {
+ // find stripe info
+ if (stripeIndex >= static_cast<uint32_t>(footer_->stripes_size())) {
+ throw std::logic_error("Illegal stripe index: " +
+ to_string(static_cast<int64_t>(stripeIndex)));
+ }
+ const proto::StripeInformation currentStripeInfo =
+ footer_->stripes(static_cast<int>(stripeIndex));
+ offset = static_cast<uint64_t>(currentStripeInfo.offset());
+ return getStripeFooter(currentStripeInfo, *contents_);
+ }
+
+ std::map<uint32_t, RowGroupIndex> ReaderImpl::getRowGroupIndex(
+ uint32_t stripeIndex, const std::set<uint32_t>& included) const {
+ std::map<uint32_t, RowGroupIndex> ret;
+ uint64_t offset;
+ auto currentStripeFooter = loadCurrentStripeFooter(stripeIndex, offset);
+
+ // iterate stripe footer to get stream of row_index
+ for (int i = 0; i < currentStripeFooter.streams_size(); i++) {
+ const proto::Stream& stream = currentStripeFooter.streams(i);
+ uint32_t column = static_cast<uint32_t>(stream.column());
+ uint64_t length = static_cast<uint64_t>(stream.length());
+ RowGroupIndex& rowGroupIndex = ret[column];
+
+ if (stream.kind() == proto::Stream_Kind_ROW_INDEX &&
+ (included.empty() || included.find(column) != included.end())) {
+ std::unique_ptr<SeekableInputStream> pbStream =
+ createDecompressor(contents_->compression,
+ std::make_unique<SeekableFileInputStream>(
+ contents_->stream.get(), offset, length, *contents_->pool),
+ contents_->blockSize, *(contents_->pool), contents_->readerMetrics);
+
+ proto::RowIndex pbRowIndex;
+ if (!pbRowIndex.ParseFromZeroCopyStream(pbStream.get())) {
+ std::stringstream errMsgBuffer;
+ errMsgBuffer << "Failed to parse RowIndex at column " << column << " in stripe "
+ << stripeIndex;
+ throw ParseError(errMsgBuffer.str());
+ }
+
+ // add rowGroupIndex to result for one column
+ for (auto& rowIndexEntry : pbRowIndex.entry()) {
+ std::vector<uint64_t> posVector;
+ for (auto& position : rowIndexEntry.positions()) {
+ posVector.push_back(position);
+ }
+ rowGroupIndex.positions.push_back(posVector);
+ }
+ }
+ offset += length;
+ }
+ return ret;
+ }
+
+ void ReaderImpl::releaseBuffer(uint64_t boundary) {
+ std::lock_guard<std::mutex> lock(contents_->readCacheMutex);
+
+ if (contents_->readCache) {
+ contents_->readCache->evictEntriesBefore(boundary);
+ }
+ }
+
+ void ReaderImpl::preBuffer(const std::vector<uint32_t>& stripes,
+ const std::list<uint64_t>& includeTypes) {
+ std::vector<uint32_t> newStripes;
+ for (auto stripe : stripes) {
+ if (stripe < static_cast<uint32_t>(footer_->stripes_size())) newStripes.push_back(stripe);
+ }
+
+ std::list<uint64_t> newIncludeTypes;
+ for (auto type : includeTypes) {
+ if (type < static_cast<uint64_t>(footer_->types_size())) newIncludeTypes.push_back(type);
+ }
+
+ if (newStripes.empty() || newIncludeTypes.empty()) {
+ return;
+ }
+
+ orc::RowReaderOptions rowReaderOptions;
+ rowReaderOptions.includeTypes(newIncludeTypes);
+ ColumnSelector columnSelector(contents_.get());
+ std::vector<bool> selectedColumns;
+ columnSelector.updateSelected(selectedColumns, rowReaderOptions);
+
+ std::vector<ReadRange> ranges;
+ ranges.reserve(newIncludeTypes.size());
+ for (auto stripe : newStripes) {
+ // get stripe information
+ const auto& stripeInfo = footer_->stripes(stripe);
+ uint64_t stripeFooterStart =
+ stripeInfo.offset() + stripeInfo.index_length() + stripeInfo.data_length();
+ uint64_t stripeFooterLength = stripeInfo.footer_length();
+
+ // get stripe footer
+ std::unique_ptr<SeekableInputStream> pbStream = createDecompressor(
+ contents_->compression,
+ std::make_unique<SeekableFileInputStream>(contents_->stream.get(), stripeFooterStart,
+ stripeFooterLength, *contents_->pool),
+ contents_->blockSize, *contents_->pool, contents_->readerMetrics);
+ proto::StripeFooter stripeFooter;
+ if (!stripeFooter.ParseFromZeroCopyStream(pbStream.get())) {
+ throw ParseError(std::string("bad StripeFooter from ") + pbStream->getName());
+ }
+
+ // traverse all streams in stripe footer, choose selected streams to prebuffer
+ uint64_t offset = stripeInfo.offset();
+ for (int i = 0; i < stripeFooter.streams_size(); i++) {
+ const proto::Stream& stream = stripeFooter.streams(i);
+ if (offset + stream.length() > stripeFooterStart) {
+ std::stringstream msg;
+ msg << "Malformed stream meta at stream index " << i << " in stripe " << stripe
+ << ": streamOffset=" << offset << ", streamLength=" << stream.length()
+ << ", stripeOffset=" << stripeInfo.offset()
+ << ", stripeIndexLength=" << stripeInfo.index_length()
+ << ", stripeDataLength=" << stripeInfo.data_length();
+ throw ParseError(msg.str());
+ }
+
+ if (stream.has_kind() && selectedColumns[stream.column()]) {
+ const auto& kind = stream.kind();
+ if (kind == proto::Stream_Kind_DATA || kind == proto::Stream_Kind_DICTIONARY_DATA ||
+ kind == proto::Stream_Kind_PRESENT || kind == proto::Stream_Kind_LENGTH ||
+ kind == proto::Stream_Kind_SECONDARY) {
+ ranges.emplace_back(offset, stream.length());
+ }
+ }
+
+ offset += stream.length();
+ }
+
+ {
+ std::lock_guard<std::mutex> lock(contents_->readCacheMutex);
+
+ if (!contents_->readCache) {
+ contents_->readCache = std::make_shared<ReadRangeCache>(
+ getStream(), options_.getCacheOptions(), contents_->pool, contents_->readerMetrics);
+ }
+ contents_->readCache->cache(std::move(ranges));
+ }
+ }
+ }
+
RowReader::~RowReader() {
// PASS
}
diff --git a/contrib/libs/apache/orc/c++/src/Reader.hh b/contrib/libs/apache/orc/c++/src/Reader.hh
index a1367e4bd3..39ca739675 100644
--- a/contrib/libs/apache/orc/c++/src/Reader.hh
+++ b/contrib/libs/apache/orc/c++/src/Reader.hh
@@ -26,6 +26,8 @@
#include "ColumnReader.hh"
#include "RLE.hh"
+#include "io/Cache.hh"
+
#include "SchemaEvolution.hh"
#include "TypeImpl.hh"
#include "sargs/SargsApplier.hh"
@@ -39,17 +41,17 @@ namespace orc {
*/
class WriterVersionImpl {
private:
- WriterVersion version;
+ WriterVersion version_;
public:
// Known Versions with issues resolved
// The static method below is to fix global constructors Clang warning
static const WriterVersionImpl& VERSION_HIVE_8732();
- WriterVersionImpl(WriterVersion ver) : version(ver) {}
+ WriterVersionImpl(WriterVersion ver) : version_(ver) {}
bool compareGT(const WriterVersion other) const {
- return version > other;
+ return version_ > other;
}
};
@@ -70,6 +72,11 @@ namespace orc {
bool isDecimalAsLong;
std::unique_ptr<proto::Metadata> metadata;
ReaderMetrics* readerMetrics;
+
+ // mutex to protect readCache from concurrent access
+ std::mutex readCacheMutex;
+ // cached io ranges. only valid when preBuffer is invoked.
+ std::shared_ptr<ReadRangeCache> readCache;
};
proto::StripeFooter getStripeFooter(const proto::StripeInformation& info,
@@ -80,10 +87,10 @@ namespace orc {
class ColumnSelector {
private:
- std::map<std::string, uint64_t> nameIdMap;
- std::map<uint64_t, const Type*> idTypeMap;
- const FileContents* contents;
- std::vector<std::string> columns;
+ std::map<std::string, uint64_t> nameIdMap_;
+ std::map<uint64_t, const Type*> idTypeMap_;
+ const FileContents* contents_;
+ std::vector<std::string> columns_;
// build map from type name and id, id to Type
void buildTypeNameIdMap(const Type* type);
@@ -127,54 +134,54 @@ namespace orc {
class RowReaderImpl : public RowReader {
private:
- const Timezone& localTimezone;
+ const Timezone& localTimezone_;
// contents
- std::shared_ptr<FileContents> contents;
- const bool throwOnHive11DecimalOverflow;
- const int32_t forcedScaleOnHive11Decimal;
+ std::shared_ptr<FileContents> contents_;
+ const bool throwOnHive11DecimalOverflow_;
+ const int32_t forcedScaleOnHive11Decimal_;
// inputs
- std::vector<bool> selectedColumns;
+ std::vector<bool> selectedColumns_;
// footer
- proto::Footer* footer;
- DataBuffer<uint64_t> firstRowOfStripe;
- mutable std::unique_ptr<Type> selectedSchema;
- bool skipBloomFilters;
+ proto::Footer* footer_;
+ DataBuffer<uint64_t> firstRowOfStripe_;
+ mutable std::unique_ptr<Type> selectedSchema_;
+ bool skipBloomFilters_;
// reading state
- uint64_t previousRow;
- uint64_t firstStripe;
- uint64_t currentStripe;
- uint64_t lastStripe; // the stripe AFTER the last one
- uint64_t processingStripe;
- uint64_t currentRowInStripe;
- uint64_t rowsInCurrentStripe;
+ uint64_t previousRow_;
+ uint64_t firstStripe_;
+ uint64_t currentStripe_;
+ uint64_t lastStripe_; // the stripe AFTER the last one
+ uint64_t processingStripe_;
+ uint64_t currentRowInStripe_;
+ uint64_t rowsInCurrentStripe_;
// number of row groups between first stripe and last stripe
- uint64_t numRowGroupsInStripeRange;
- proto::StripeInformation currentStripeInfo;
- proto::StripeFooter currentStripeFooter;
- std::unique_ptr<ColumnReader> reader;
-
- bool enableEncodedBlock;
- bool useTightNumericVector;
- bool throwOnSchemaEvolutionOverflow;
+ uint64_t numRowGroupsInStripeRange_;
+ proto::StripeInformation currentStripeInfo_;
+ proto::StripeFooter currentStripeFooter_;
+ std::unique_ptr<ColumnReader> reader_;
+
+ bool enableEncodedBlock_;
+ bool useTightNumericVector_;
+ bool throwOnSchemaEvolutionOverflow_;
// internal methods
void startNextStripe();
inline void markEndOfFile();
// row index of current stripe with column id as the key
- std::unordered_map<uint64_t, proto::RowIndex> rowIndexes;
- std::map<uint32_t, BloomFilterIndex> bloomFilterIndex;
- std::shared_ptr<SearchArgument> sargs;
- std::unique_ptr<SargsApplier> sargsApplier;
+ std::unordered_map<uint64_t, proto::RowIndex> rowIndexes_;
+ std::map<uint32_t, BloomFilterIndex> bloomFilterIndex_;
+ std::shared_ptr<SearchArgument> sargs_;
+ std::unique_ptr<SargsApplier> sargsApplier_;
// desired timezone to return data of timestamp types.
- const Timezone& readerTimezone;
+ const Timezone& readerTimezone_;
// match read and file types
- SchemaEvolution schemaEvolution;
+ SchemaEvolution schemaEvolution_;
// load stripe index if not done so
void loadStripeIndex();
@@ -196,7 +203,7 @@ namespace orc {
// whether the current stripe is initialized
inline bool isCurrentStripeInited() const {
- return currentStripe == processingStripe;
+ return currentStripe_ == processingStripe_;
}
/**
@@ -243,35 +250,40 @@ namespace orc {
int32_t getForcedScaleOnHive11Decimal() const;
const SchemaEvolution* getSchemaEvolution() const {
- return &schemaEvolution;
+ return &schemaEvolution_;
+ }
+
+ std::shared_ptr<ReadRangeCache> getReadCache() const {
+ return contents_->readCache;
}
};
class ReaderImpl : public Reader {
private:
// FileContents
- std::shared_ptr<FileContents> contents;
+ std::shared_ptr<FileContents> contents_;
// inputs
- const ReaderOptions options;
- const uint64_t fileLength;
- const uint64_t postscriptLength;
+ const ReaderOptions options_;
+ const uint64_t fileLength_;
+ const uint64_t postscriptLength_;
// footer
- proto::Footer* footer;
- uint64_t numberOfStripes;
+ proto::Footer* footer_;
+ uint64_t numberOfStripes_;
+
uint64_t getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns);
// internal methods
void readMetadata() const;
void checkOrcVersion();
- void getRowIndexStatistics(
- const proto::StripeInformation& stripeInfo, uint64_t stripeIndex,
- const proto::StripeFooter& currentStripeFooter,
- std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const;
+ void getRowIndexStatistics(const proto::StripeInformation& stripeInfo, uint64_t stripeIndex,
+ const proto::StripeFooter& currentStripeFooter,
+ std::vector<std::vector<proto::ColumnStatistics>>* indexStats) const;
+ proto::StripeFooter loadCurrentStripeFooter(uint32_t stripeIndex, uint64_t& offset) const;
// metadata
- mutable bool isMetadataLoaded;
+ mutable bool isMetadataLoaded_;
public:
/**
@@ -341,27 +353,27 @@ namespace orc {
bool hasCorrectStatistics() const override;
const ReaderMetrics* getReaderMetrics() const override {
- return contents->readerMetrics;
+ return contents_->readerMetrics;
}
const proto::PostScript* getPostscript() const {
- return contents->postscript.get();
+ return contents_->postscript.get();
}
uint64_t getBlockSize() const {
- return contents->blockSize;
+ return contents_->blockSize;
}
const proto::Footer* getFooter() const {
- return contents->footer.get();
+ return contents_->footer.get();
}
const Type* getSchema() const {
- return contents->schema.get();
+ return contents_->schema.get();
}
InputStream* getStream() const {
- return contents->stream.get();
+ return contents_->stream.get();
}
uint64_t getMemoryUse(int stripeIx = -1) override;
@@ -374,6 +386,13 @@ namespace orc {
std::map<uint32_t, BloomFilterIndex> getBloomFilters(
uint32_t stripeIndex, const std::set<uint32_t>& included) const override;
+
+ void preBuffer(const std::vector<uint32_t>& stripes,
+ const std::list<uint64_t>& includeTypes) override;
+ void releaseBuffer(uint64_t boundary) override;
+
+ std::map<uint32_t, RowGroupIndex> getRowGroupIndex(
+ uint32_t stripeIndex, const std::set<uint32_t>& included) const override;
};
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc b/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc
index ae05a70a36..95eec22ca7 100644
--- a/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc
+++ b/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc
@@ -31,17 +31,17 @@ namespace orc {
unsigned char RleDecoderV2::readByte() {
SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs);
- if (bufferStart == bufferEnd) {
+ if (bufferStart_ == bufferEnd_) {
int bufferLength;
const void* bufferPointer;
- if (!inputStream->Next(&bufferPointer, &bufferLength)) {
+ if (!inputStream_->Next(&bufferPointer, &bufferLength)) {
throw ParseError("bad read in RleDecoderV2::readByte");
}
- bufferStart = const_cast<char*>(static_cast<const char*>(bufferPointer));
- bufferEnd = bufferStart + bufferLength;
+ bufferStart_ = const_cast<char*>(static_cast<const char*>(bufferPointer));
+ bufferEnd_ = bufferStart_ + bufferLength;
}
- unsigned char result = static_cast<unsigned char>(*bufferStart++);
+ unsigned char result = static_cast<unsigned char>(*bufferStart_++);
return result;
}
@@ -89,29 +89,29 @@ namespace orc {
return dispatch.func(this, data, offset, len, fbs);
}
- RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input, bool _isSigned,
- MemoryPool& pool, ReaderMetrics* _metrics)
- : RleDecoder(_metrics),
- inputStream(std::move(input)),
- isSigned(_isSigned),
- firstByte(0),
- bufferStart(nullptr),
- bufferEnd(bufferStart),
- runLength(0),
- runRead(0),
- bitsLeft(0),
- curByte(0),
- unpackedPatch(pool, 0),
- literals(pool, MAX_LITERAL_SIZE) {
+ RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input, bool isSigned,
+ MemoryPool& pool, ReaderMetrics* metrics)
+ : RleDecoder(metrics),
+ inputStream_(std::move(input)),
+ isSigned_(isSigned),
+ firstByte_(0),
+ bufferStart_(nullptr),
+ bufferEnd_(bufferStart_),
+ runLength_(0),
+ runRead_(0),
+ bitsLeft_(0),
+ curByte_(0),
+ unpackedPatch_(pool, 0),
+ literals_(pool, MAX_LITERAL_SIZE) {
// PASS
}
void RleDecoderV2::seek(PositionProvider& location) {
// move the input stream
- inputStream->seek(location);
+ inputStream_->seek(location);
// clear state
- bufferEnd = bufferStart = nullptr;
- runRead = runLength = 0;
+ bufferEnd_ = bufferStart_ = nullptr;
+ runRead_ = runLength_ = 0;
// skip ahead the given number of records
skip(location.next());
}
@@ -142,14 +142,14 @@ namespace orc {
}
}
- if (runRead == runLength) {
+ if (runRead_ == runLength_) {
resetRun();
- firstByte = readByte();
+ firstByte_ = readByte();
}
uint64_t offset = nRead, length = numValues - nRead;
- EncodingType enc = static_cast<EncodingType>((firstByte >> 6) & 0x03);
+ EncodingType enc = static_cast<EncodingType>((firstByte_ >> 6) & 0x03);
switch (static_cast<int64_t>(enc)) {
case SHORT_REPEAT:
nRead += nextShortRepeats(data, offset, length, notNull);
@@ -184,37 +184,37 @@ namespace orc {
template <typename T>
uint64_t RleDecoderV2::nextShortRepeats(T* const data, uint64_t offset, uint64_t numValues,
const char* const notNull) {
- if (runRead == runLength) {
+ if (runRead_ == runLength_) {
// extract the number of fixed bytes
- uint64_t byteSize = (firstByte >> 3) & 0x07;
+ uint64_t byteSize = (firstByte_ >> 3) & 0x07;
byteSize += 1;
- runLength = firstByte & 0x07;
+ runLength_ = firstByte_ & 0x07;
// run lengths values are stored only after MIN_REPEAT value is met
- runLength += MIN_REPEAT;
- runRead = 0;
+ runLength_ += MIN_REPEAT;
+ runRead_ = 0;
// read the repeated value which is store using fixed bytes
- literals[0] = readLongBE(byteSize);
+ literals_[0] = readLongBE(byteSize);
- if (isSigned) {
- literals[0] = unZigZag(static_cast<uint64_t>(literals[0]));
+ if (isSigned_) {
+ literals_[0] = unZigZag(static_cast<uint64_t>(literals_[0]));
}
}
- uint64_t nRead = std::min(runLength - runRead, numValues);
+ uint64_t nRead = std::min(runLength_ - runRead_, numValues);
if (notNull) {
for (uint64_t pos = offset; pos < offset + nRead; ++pos) {
if (notNull[pos]) {
- data[pos] = static_cast<T>(literals[0]);
- ++runRead;
+ data[pos] = static_cast<T>(literals_[0]);
+ ++runRead_;
}
}
} else {
for (uint64_t pos = offset; pos < offset + nRead; ++pos) {
- data[pos] = static_cast<T>(literals[0]);
- ++runRead;
+ data[pos] = static_cast<T>(literals_[0]);
+ ++runRead_;
}
}
@@ -224,22 +224,22 @@ namespace orc {
template <typename T>
uint64_t RleDecoderV2::nextDirect(T* const data, uint64_t offset, uint64_t numValues,
const char* const notNull) {
- if (runRead == runLength) {
+ if (runRead_ == runLength_) {
// extract the number of fixed bits
- unsigned char fbo = (firstByte >> 1) & 0x1f;
+ unsigned char fbo = (firstByte_ >> 1) & 0x1f;
uint32_t bitSize = decodeBitWidth(fbo);
// extract the run length
- runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
- runLength |= readByte();
+ runLength_ = static_cast<uint64_t>(firstByte_ & 0x01) << 8;
+ runLength_ |= readByte();
// runs are one off
- runLength += 1;
- runRead = 0;
+ runLength_ += 1;
+ runRead_ = 0;
- readLongs(literals.data(), 0, runLength, bitSize);
- if (isSigned) {
- for (uint64_t i = 0; i < runLength; ++i) {
- literals[i] = unZigZag(static_cast<uint64_t>(literals[i]));
+ readLongs(literals_.data(), 0, runLength_, bitSize);
+ if (isSigned_) {
+ for (uint64_t i = 0; i < runLength_; ++i) {
+ literals_[i] = unZigZag(static_cast<uint64_t>(literals_[i]));
}
}
}
@@ -250,8 +250,8 @@ namespace orc {
void RleDecoderV2::adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, int64_t* resGap,
int64_t* resPatch, uint64_t* patchIdx) {
uint64_t idx = *patchIdx;
- uint64_t gap = static_cast<uint64_t>(unpackedPatch[idx]) >> patchBitSize;
- int64_t patch = unpackedPatch[idx] & patchMask;
+ uint64_t gap = static_cast<uint64_t>(unpackedPatch_[idx]) >> patchBitSize;
+ int64_t patch = unpackedPatch_[idx] & patchMask;
int64_t actualGap = 0;
// special case: gap is >255 then patch value will be 0.
@@ -259,8 +259,8 @@ namespace orc {
while (gap == 255 && patch == 0) {
actualGap += 255;
++idx;
- gap = static_cast<uint64_t>(unpackedPatch[idx]) >> patchBitSize;
- patch = unpackedPatch[idx] & patchMask;
+ gap = static_cast<uint64_t>(unpackedPatch_[idx]) >> patchBitSize;
+ patch = unpackedPatch_[idx] & patchMask;
}
// add the left over gap
actualGap += gap;
@@ -273,17 +273,17 @@ namespace orc {
template <typename T>
uint64_t RleDecoderV2::nextPatched(T* const data, uint64_t offset, uint64_t numValues,
const char* const notNull) {
- if (runRead == runLength) {
+ if (runRead_ == runLength_) {
// extract the number of fixed bits
- unsigned char fbo = (firstByte >> 1) & 0x1f;
+ unsigned char fbo = (firstByte_ >> 1) & 0x1f;
uint32_t bitSize = decodeBitWidth(fbo);
// extract the run length
- runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
- runLength |= readByte();
+ runLength_ = static_cast<uint64_t>(firstByte_ & 0x01) << 8;
+ runLength_ |= readByte();
// runs are one off
- runLength += 1;
- runRead = 0;
+ runLength_ += 1;
+ runRead_ = 0;
// extract the number of bytes occupied by base
uint64_t thirdByte = readByte();
@@ -316,12 +316,12 @@ namespace orc {
base = -base;
}
- readLongs(literals.data(), 0, runLength, bitSize);
+ readLongs(literals_.data(), 0, runLength_, bitSize);
// any remaining bits are thrown out
resetReadLongs();
// TODO: something more efficient than resize
- unpackedPatch.resize(pl);
+ unpackedPatch_.resize(pl);
// TODO: Skip corrupt?
// if ((patchBitSize + pgw) > 64 && !skipCorrupt) {
if ((patchBitSize + pgw) > 64) {
@@ -330,7 +330,7 @@ namespace orc {
"(patchBitSize + pgw > 64)!");
}
uint32_t cfb = getClosestFixedBits(patchBitSize + pgw);
- readLongs(unpackedPatch.data(), 0, pl, cfb);
+ readLongs(unpackedPatch_.data(), 0, pl, cfb);
// any remaining bits are thrown out
resetReadLongs();
@@ -342,21 +342,21 @@ namespace orc {
uint64_t patchIdx = 0;
adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx);
- for (uint64_t i = 0; i < runLength; ++i) {
+ for (uint64_t i = 0; i < runLength_; ++i) {
if (static_cast<int64_t>(i) != gap) {
// no patching required. add base to unpacked value to get final value
- literals[i] += base;
+ literals_[i] += base;
} else {
// extract the patch value
- int64_t patchedVal = literals[i] | (patch << bitSize);
+ int64_t patchedVal = literals_[i] | (patch << bitSize);
// add base to patched value
- literals[i] = base + patchedVal;
+ literals_[i] = base + patchedVal;
// increment the patch to point to next entry in patch list
++patchIdx;
- if (patchIdx < unpackedPatch.size()) {
+ if (patchIdx < unpackedPatch_.size()) {
adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx);
// next gap is relative to the current gap
@@ -372,9 +372,9 @@ namespace orc {
template <typename T>
uint64_t RleDecoderV2::nextDelta(T* const data, uint64_t offset, uint64_t numValues,
const char* const notNull) {
- if (runRead == runLength) {
+ if (runRead_ == runLength_) {
// extract the number of fixed bits
- unsigned char fbo = (firstByte >> 1) & 0x1f;
+ unsigned char fbo = (firstByte_ >> 1) & 0x1f;
uint32_t bitSize;
if (fbo != 0) {
bitSize = decodeBitWidth(fbo);
@@ -383,20 +383,20 @@ namespace orc {
}
// extract the run length
- runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
- runLength |= readByte();
- ++runLength; // account for first value
- runRead = 0;
+ runLength_ = static_cast<uint64_t>(firstByte_ & 0x01) << 8;
+ runLength_ |= readByte();
+ ++runLength_; // account for first value
+ runRead_ = 0;
int64_t prevValue;
// read the first value stored as vint
- if (isSigned) {
+ if (isSigned_) {
prevValue = readVslong();
} else {
prevValue = static_cast<int64_t>(readVulong());
}
- literals[0] = prevValue;
+ literals_[0] = prevValue;
// read the fixed delta value stored as vint (deltas can be negative even
// if all number are positive)
@@ -404,28 +404,28 @@ namespace orc {
if (bitSize == 0) {
// add fixed deltas to adjacent values
- for (uint64_t i = 1; i < runLength; ++i) {
- literals[i] = literals[i - 1] + deltaBase;
+ for (uint64_t i = 1; i < runLength_; ++i) {
+ literals_[i] = literals_[i - 1] + deltaBase;
}
} else {
- prevValue = literals[1] = prevValue + deltaBase;
- if (runLength < 2) {
+ prevValue = literals_[1] = prevValue + deltaBase;
+ if (runLength_ < 2) {
std::stringstream ss;
- ss << "Illegal run length for delta encoding: " << runLength;
+ ss << "Illegal run length for delta encoding: " << runLength_;
throw ParseError(ss.str());
}
// write the unpacked values, add it to previous value and store final
// value to result buffer. if the delta base value is negative then it
// is a decreasing sequence else an increasing sequence.
// read deltas using the literals buffer.
- readLongs(literals.data(), 2, runLength - 2, bitSize);
+ readLongs(literals_.data(), 2, runLength_ - 2, bitSize);
if (deltaBase < 0) {
- for (uint64_t i = 2; i < runLength; ++i) {
- prevValue = literals[i] = prevValue - literals[i];
+ for (uint64_t i = 2; i < runLength_; ++i) {
+ prevValue = literals_[i] = prevValue - literals_[i];
}
} else {
- for (uint64_t i = 2; i < runLength; ++i) {
- prevValue = literals[i] = prevValue + literals[i];
+ for (uint64_t i = 2; i < runLength_; ++i) {
+ prevValue = literals_[i] = prevValue + literals_[i];
}
}
}
@@ -437,16 +437,16 @@ namespace orc {
template <typename T>
uint64_t RleDecoderV2::copyDataFromBuffer(T* data, uint64_t offset, uint64_t numValues,
const char* notNull) {
- uint64_t nRead = std::min(runLength - runRead, numValues);
+ uint64_t nRead = std::min(runLength_ - runRead_, numValues);
if (notNull) {
for (uint64_t i = offset; i < (offset + nRead); ++i) {
if (notNull[i]) {
- data[i] = static_cast<T>(literals[runRead++]);
+ data[i] = static_cast<T>(literals_[runRead_++]);
}
}
} else {
for (uint64_t i = offset; i < (offset + nRead); ++i) {
- data[i] = static_cast<T>(literals[runRead++]);
+ data[i] = static_cast<T>(literals_[runRead_++]);
}
}
return nRead;
diff --git a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc
index a75aeac2eb..1cda9ee91e 100644
--- a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc
+++ b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc
@@ -41,11 +41,11 @@ namespace orc {
if (!reuseHist) {
// histogram that store the encoded bit requirement for each values.
// maximum number of bits that can encoded is 32 (refer FixedBitSizes)
- memset(histgram, 0, FixedBitSizes::SIZE * sizeof(int32_t));
+ memset(histgram_, 0, FixedBitSizes::SIZE * sizeof(int32_t));
// compute the histogram
for (size_t i = offset; i < (offset + length); i++) {
uint32_t idx = encodeBitWidth(findClosestNumBits(data[i]));
- histgram[idx] += 1;
+ histgram_[idx] += 1;
}
}
@@ -53,7 +53,7 @@ namespace orc {
// return the bits required by pth percentile length
for (int32_t i = HIST_LEN - 1; i >= 0; i--) {
- perLen -= histgram[i];
+ perLen -= histgram_[i];
if (perLen < 0) {
return decodeBitWidth(static_cast<uint32_t>(i));
}
@@ -64,13 +64,13 @@ namespace orc {
RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned,
bool alignBitPacking)
: RleEncoder(std::move(outStream), hasSigned),
- alignedBitPacking(alignBitPacking),
- prevDelta(0) {
+ alignedBitPacking_(alignBitPacking),
+ prevDelta_(0) {
literals = new int64_t[MAX_LITERAL_SIZE];
- gapVsPatchList = new int64_t[MAX_LITERAL_SIZE];
- zigzagLiterals = hasSigned ? new int64_t[MAX_LITERAL_SIZE] : nullptr;
- baseRedLiterals = new int64_t[MAX_LITERAL_SIZE];
- adjDeltas = new int64_t[MAX_LITERAL_SIZE];
+ gapVsPatchList_ = new int64_t[MAX_LITERAL_SIZE];
+ zigzagLiterals_ = hasSigned ? new int64_t[MAX_LITERAL_SIZE] : nullptr;
+ baseRedLiterals_ = new int64_t[MAX_LITERAL_SIZE];
+ adjDeltas_ = new int64_t[MAX_LITERAL_SIZE];
}
void RleEncoderV2::write(int64_t val) {
@@ -80,39 +80,39 @@ namespace orc {
}
if (numLiterals == 1) {
- prevDelta = val - literals[0];
+ prevDelta_ = val - literals[0];
literals[numLiterals++] = val;
if (val == literals[0]) {
- fixedRunLength = 2;
- variableRunLength = 0;
+ fixedRunLength_ = 2;
+ variableRunLength_ = 0;
} else {
- fixedRunLength = 0;
- variableRunLength = 2;
+ fixedRunLength_ = 0;
+ variableRunLength_ = 2;
}
return;
}
int64_t currentDelta = val - literals[numLiterals - 1];
EncodingOption option = {};
- if (prevDelta == 0 && currentDelta == 0) {
+ if (prevDelta_ == 0 && currentDelta == 0) {
// case 1: fixed delta run
literals[numLiterals++] = val;
- if (variableRunLength > 0) {
+ if (variableRunLength_ > 0) {
// if variable run is non-zero then we are seeing repeating
// values at the end of variable run in which case fixed Run
// length is 2
- fixedRunLength = 2;
+ fixedRunLength_ = 2;
}
- fixedRunLength++;
+ fixedRunLength_++;
// if fixed run met the minimum condition and if variable
// run is non-zero then flush the variable run and shift the
// tail fixed runs to start of the buffer
- if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
+ if (fixedRunLength_ >= MIN_REPEAT && variableRunLength_ > 0) {
numLiterals -= MIN_REPEAT;
- variableRunLength -= (MIN_REPEAT - 1);
+ variableRunLength_ -= (MIN_REPEAT - 1);
determineEncoding(option);
writeValues(option);
@@ -124,7 +124,7 @@ namespace orc {
numLiterals = MIN_REPEAT;
}
- if (fixedRunLength == MAX_LITERAL_SIZE) {
+ if (fixedRunLength_ == MAX_LITERAL_SIZE) {
option.encoding = DELTA;
option.isFixedDelta = true;
writeValues(option);
@@ -137,8 +137,8 @@ namespace orc {
// if fixed run length is non-zero and if it satisfies the
// short repeat conditions then write the values as short repeats
// else use delta encoding
- if (fixedRunLength >= MIN_REPEAT) {
- if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
+ if (fixedRunLength_ >= MIN_REPEAT) {
+ if (fixedRunLength_ <= MAX_SHORT_REPEAT_LENGTH) {
option.encoding = SHORT_REPEAT;
} else {
option.encoding = DELTA;
@@ -149,20 +149,20 @@ namespace orc {
// if fixed run length is <MIN_REPEAT and current value is
// different from previous then treat it as variable run
- if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) {
- variableRunLength = fixedRunLength;
- fixedRunLength = 0;
+ if (fixedRunLength_ > 0 && fixedRunLength_ < MIN_REPEAT && val != literals[numLiterals - 1]) {
+ variableRunLength_ = fixedRunLength_;
+ fixedRunLength_ = 0;
}
// after writing values re-initialize the variables
if (numLiterals == 0) {
initializeLiterals(val);
} else {
- prevDelta = val - literals[numLiterals - 1];
+ prevDelta_ = val - literals[numLiterals - 1];
literals[numLiterals++] = val;
- variableRunLength++;
+ variableRunLength_++;
- if (variableRunLength == MAX_LITERAL_SIZE) {
+ if (variableRunLength_ == MAX_LITERAL_SIZE) {
determineEncoding(option);
writeValues(option);
}
@@ -172,7 +172,7 @@ namespace orc {
void RleEncoderV2::computeZigZagLiterals(EncodingOption& option) {
assert(isSigned);
for (size_t i = 0; i < numLiterals; i++) {
- zigzagLiterals[option.zigzagLiteralsCount++] = zigZag(literals[i]);
+ zigzagLiterals_[option.zigzagLiteralsCount++] = zigZag(literals[i]);
}
}
@@ -207,7 +207,7 @@ namespace orc {
for (size_t i = 0; i < numLiterals; i++) {
// if value is above mask then create the patch and record the gap
- if (baseRedLiterals[i] > mask) {
+ if (baseRedLiterals_[i] > mask) {
size_t gap = i - prev;
if (gap > maxGap) {
maxGap = gap;
@@ -219,12 +219,12 @@ namespace orc {
gapIdx++;
// extract the most significant bits that are over mask bits
- int64_t patch = baseRedLiterals[i] >> option.brBits95p;
+ int64_t patch = baseRedLiterals_[i] >> option.brBits95p;
patchList.push_back(patch);
patchIdx++;
// strip off the MSB to enable safe bit packing
- baseRedLiterals[i] &= mask;
+ baseRedLiterals_[i] &= mask;
}
}
@@ -268,13 +268,13 @@ namespace orc {
int64_t g = gapList[gapIdx++];
int64_t p = patchList[patchIdx++];
while (g > 255) {
- gapVsPatchList[option.gapVsPatchListCount++] = (255L << option.patchWidth);
+ gapVsPatchList_[option.gapVsPatchListCount++] = (255L << option.patchWidth);
i++;
g -= 255;
}
// store patch value in LSBs and gap in MSBs
- gapVsPatchList[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p);
+ gapVsPatchList_[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p);
}
}
@@ -287,7 +287,7 @@ namespace orc {
if (isSigned) {
computeZigZagLiterals(option);
}
- int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
+ int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals_ : literals;
option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
return currentZigzagLiterals;
}
@@ -318,7 +318,7 @@ namespace orc {
int64_t initialDelta = literals[1] - literals[0];
int64_t currDelta = 0;
int64_t deltaMax = 0;
- adjDeltas[option.adjDeltasCount++] = initialDelta;
+ adjDeltas_[option.adjDeltasCount++] = initialDelta;
for (size_t i = 1; i < numLiterals; i++) {
const int64_t l1 = literals[i];
@@ -332,8 +332,8 @@ namespace orc {
option.isFixedDelta &= (currDelta == initialDelta);
if (i > 1) {
- adjDeltas[option.adjDeltasCount++] = std::abs(currDelta);
- deltaMax = std::max(deltaMax, adjDeltas[i - 1]);
+ adjDeltas_[option.adjDeltasCount++] = std::abs(currDelta);
+ deltaMax = std::max(deltaMax, adjDeltas_[i - 1]);
}
}
@@ -407,15 +407,15 @@ namespace orc {
// patching is done only on base reduced values.
// remove base from literals
for (size_t i = 0; i < numLiterals; i++) {
- baseRedLiterals[option.baseRedLiteralsCount++] = (literals[i] - option.min);
+ baseRedLiterals_[option.baseRedLiteralsCount++] = (literals[i] - option.min);
}
// 95th percentile width is used to determine max allowed value
// after which patching will be done
- option.brBits95p = percentileBits(baseRedLiterals, 0, numLiterals, 0.95);
+ option.brBits95p = percentileBits(baseRedLiterals_, 0, numLiterals, 0.95);
// 100th percentile is used to compute the max patch width
- option.brBits100p = percentileBits(baseRedLiterals, 0, numLiterals, 1.0, true);
+ option.brBits100p = percentileBits(baseRedLiterals_, 0, numLiterals, 1.0, true);
// after base reducing the values, if the difference in bits between
// 95th percentile and 100th percentile value is zero then there
@@ -440,31 +440,8 @@ namespace orc {
}
uint64_t RleEncoderV2::flush() {
- if (numLiterals != 0) {
- EncodingOption option = {};
- if (variableRunLength != 0) {
- determineEncoding(option);
- writeValues(option);
- } else if (fixedRunLength != 0) {
- if (fixedRunLength < MIN_REPEAT) {
- variableRunLength = fixedRunLength;
- fixedRunLength = 0;
- determineEncoding(option);
- writeValues(option);
- } else if (fixedRunLength >= MIN_REPEAT && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
- option.encoding = SHORT_REPEAT;
- writeValues(option);
- } else {
- option.encoding = DELTA;
- option.isFixedDelta = true;
- writeValues(option);
- }
- }
- }
-
- outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition));
+ finishEncode();
uint64_t dataSize = outputStream->flush();
- bufferLength = bufferPosition = 0;
return dataSize;
}
@@ -488,7 +465,7 @@ namespace orc {
}
numLiterals = 0;
- prevDelta = 0;
+ prevDelta_ = 0;
}
}
@@ -506,8 +483,8 @@ namespace orc {
uint32_t header = getOpCode(SHORT_REPEAT);
- fixedRunLength -= MIN_REPEAT;
- header |= fixedRunLength;
+ fixedRunLength_ -= MIN_REPEAT;
+ header |= fixedRunLength_;
header |= ((numBytesRepeatVal - 1) << 3);
writeByte(static_cast<char>(header));
@@ -517,40 +494,40 @@ namespace orc {
writeByte(static_cast<char>(b));
}
- fixedRunLength = 0;
+ fixedRunLength_ = 0;
}
void RleEncoderV2::writeDirectValues(EncodingOption& option) {
// write the number of fixed bits required in next 5 bits
uint32_t fb = option.zzBits100p;
- if (alignedBitPacking) {
+ if (alignedBitPacking_) {
fb = getClosestAlignedFixedBits(fb);
}
const uint32_t efb = encodeBitWidth(fb) << 1;
// adjust variable run length
- variableRunLength -= 1;
+ variableRunLength_ -= 1;
// extract the 9th bit of run length
- const uint32_t tailBits = (variableRunLength & 0x100) >> 8;
+ const uint32_t tailBits = (variableRunLength_ & 0x100) >> 8;
// create first byte of the header
const char headerFirstByte = static_cast<char>(getOpCode(DIRECT) | efb | tailBits);
// second byte of the header stores the remaining 8 bits of runlength
- const char headerSecondByte = static_cast<char>(variableRunLength & 0xff);
+ const char headerSecondByte = static_cast<char>(variableRunLength_ & 0xff);
// write header
writeByte(headerFirstByte);
writeByte(headerSecondByte);
// bit packing the zigzag encoded literals
- int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
+ int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals_ : literals;
writeInts(currentZigzagLiterals, 0, numLiterals, fb);
// reset run length
- variableRunLength = 0;
+ variableRunLength_ = 0;
}
void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) {
@@ -565,16 +542,16 @@ namespace orc {
const uint32_t efb = encodeBitWidth(option.brBits95p) << 1;
// adjust variable run length, they are one off
- variableRunLength -= 1;
+ variableRunLength_ -= 1;
// extract the 9th bit of run length
- const uint32_t tailBits = (variableRunLength & 0x100) >> 8;
+ const uint32_t tailBits = (variableRunLength_ & 0x100) >> 8;
// create first byte of the header
const char headerFirstByte = static_cast<char>(getOpCode(PATCHED_BASE) | efb | tailBits);
// second byte of the header stores the remaining 8 bits of runlength
- const char headerSecondByte = static_cast<char>(variableRunLength & 0xff);
+ const char headerSecondByte = static_cast<char>(variableRunLength_ & 0xff);
// if the min value is negative toggle the sign
const bool isNegative = (option.min < 0);
@@ -618,15 +595,15 @@ namespace orc {
// base reduced literals are bit packed
uint32_t closestFixedBits = getClosestFixedBits(option.brBits95p);
- writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits);
+ writeInts(baseRedLiterals_, 0, numLiterals, closestFixedBits);
// write patch list
closestFixedBits = getClosestFixedBits(option.patchGapWidth + option.patchWidth);
- writeInts(gapVsPatchList, 0, option.patchLength, closestFixedBits);
+ writeInts(gapVsPatchList_, 0, option.patchLength, closestFixedBits);
// reset run length
- variableRunLength = 0;
+ variableRunLength_ = 0;
}
void RleEncoderV2::writeDeltaValues(EncodingOption& option) {
@@ -634,7 +611,7 @@ namespace orc {
uint32_t fb = option.bitsDeltaMax;
uint32_t efb = 0;
- if (alignedBitPacking) {
+ if (alignedBitPacking_) {
fb = getClosestAlignedFixedBits(fb);
}
@@ -642,14 +619,14 @@ namespace orc {
// if fixed run length is greater than threshold then it will be fixed
// delta sequence with delta value 0 else fixed delta sequence with
// non-zero delta value
- if (fixedRunLength > MIN_REPEAT) {
+ if (fixedRunLength_ > MIN_REPEAT) {
// ex. sequence: 2 2 2 2 2 2 2 2
- len = fixedRunLength - 1;
- fixedRunLength = 0;
+ len = fixedRunLength_ - 1;
+ fixedRunLength_ = 0;
} else {
// ex. sequence: 4 6 8 10 12 14 16
- len = variableRunLength - 1;
- variableRunLength = 0;
+ len = variableRunLength_ - 1;
+ variableRunLength_ = 0;
}
} else {
// fixed width 0 is used for long repeating values.
@@ -658,8 +635,8 @@ namespace orc {
fb = 2;
}
efb = encodeBitWidth(fb) << 1;
- len = variableRunLength - 1;
- variableRunLength = 0;
+ len = variableRunLength_ - 1;
+ variableRunLength_ = 0;
}
// extract the 9th bit of run length
@@ -687,13 +664,13 @@ namespace orc {
writeVslong(option.fixedDelta);
} else {
// store the first value as delta value using zigzag encoding
- writeVslong(adjDeltas[0]);
+ writeVslong(adjDeltas_[0]);
// adjacent delta values are bit packed. The length of adjDeltas array is
// always one less than the number of literals (delta difference for n
// elements is n-1). We have already written one element, write the
// remaining numLiterals - 2 elements here
- writeInts(adjDeltas, 1, numLiterals - 2, fb);
+ writeInts(adjDeltas_, 1, numLiterals - 2, fb);
}
}
@@ -776,7 +753,33 @@ namespace orc {
void RleEncoderV2::initializeLiterals(int64_t val) {
literals[numLiterals++] = val;
- fixedRunLength = 1;
- variableRunLength = 1;
+ fixedRunLength_ = 1;
+ variableRunLength_ = 1;
+ }
+
+ void RleEncoderV2::finishEncode() {
+ if (numLiterals != 0) {
+ EncodingOption option = {};
+ if (variableRunLength_ != 0) {
+ determineEncoding(option);
+ writeValues(option);
+ } else if (fixedRunLength_ != 0) {
+ if (fixedRunLength_ < MIN_REPEAT) {
+ variableRunLength_ = fixedRunLength_;
+ fixedRunLength_ = 0;
+ determineEncoding(option);
+ writeValues(option);
+ } else if (fixedRunLength_ >= MIN_REPEAT && fixedRunLength_ <= MAX_SHORT_REPEAT_LENGTH) {
+ option.encoding = SHORT_REPEAT;
+ writeValues(option);
+ } else {
+ option.encoding = DELTA;
+ option.isFixedDelta = true;
+ writeValues(option);
+ }
+ }
+ }
+
+ RleEncoder::finishEncode();
}
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc b/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc
index b8c4fd4048..7cf3b5c512 100644
--- a/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc
+++ b/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc
@@ -21,20 +21,20 @@
namespace orc {
- SchemaEvolution::SchemaEvolution(const std::shared_ptr<Type>& _readType, const Type* fileType)
- : readType(_readType) {
- if (readType) {
- buildConversion(readType.get(), fileType);
+ SchemaEvolution::SchemaEvolution(const std::shared_ptr<Type>& readType, const Type* fileType)
+ : readType_(readType) {
+ if (readType_) {
+ buildConversion(readType_.get(), fileType);
} else {
for (uint64_t i = 0; i <= fileType->getMaximumColumnId(); ++i) {
- safePPDConversionMap.insert(i);
+ safePPDConversionMap_.insert(i);
}
}
}
const Type* SchemaEvolution::getReadType(const Type& fileType) const {
- auto ret = readTypeMap.find(fileType.getColumnId());
- return ret == readTypeMap.cend() ? &fileType : ret->second;
+ auto ret = readTypeMap_.find(fileType.getColumnId());
+ return ret == readTypeMap_.cend() ? &fileType : ret->second;
}
inline void invalidConversion(const Type* readType, const Type* fileType) {
@@ -80,7 +80,7 @@ namespace orc {
if (readType.getKind() == fileType.getKind()) {
ret.isValid = true;
if (fileType.getKind() == CHAR || fileType.getKind() == VARCHAR) {
- ret.isValid = readType.getMaximumLength() == fileType.getMaximumLength();
+ ret.needConvert = readType.getMaximumLength() != fileType.getMaximumLength();
} else if (fileType.getKind() == DECIMAL) {
ret.needConvert = readType.getPrecision() != fileType.getPrecision() ||
readType.getScale() != fileType.getScale();
@@ -99,12 +99,17 @@ namespace orc {
break;
}
case DECIMAL: {
- ret.isValid = ret.needConvert = isNumeric(readType);
+ ret.isValid = ret.needConvert =
+ isNumeric(readType) || isStringVariant(readType) || isTimestamp(readType);
break;
}
case STRING:
case CHAR:
- case VARCHAR:
+ case VARCHAR: {
+ ret.isValid = ret.needConvert = isStringVariant(readType) || isNumeric(readType) ||
+ isTimestamp(readType) || isDecimal(readType);
+ break;
+ }
case TIMESTAMP:
case TIMESTAMP_INSTANT:
case DATE:
@@ -126,22 +131,22 @@ namespace orc {
return ret;
}
- void SchemaEvolution::buildConversion(const Type* _readType, const Type* fileType) {
+ void SchemaEvolution::buildConversion(const Type* readType, const Type* fileType) {
if (fileType == nullptr) {
- throw SchemaEvolutionError("File does not have " + _readType->toString());
+ throw SchemaEvolutionError("File does not have " + readType->toString());
}
- auto [valid, convert] = checkConversion(*_readType, *fileType);
+ auto [valid, convert] = checkConversion(*readType, *fileType);
if (!valid) {
- invalidConversion(_readType, fileType);
+ invalidConversion(readType, fileType);
}
- readTypeMap.emplace(_readType->getColumnId(), convert ? _readType : fileType);
+ readTypeMap_.emplace(readType->getColumnId(), convert ? readType : fileType);
// check whether PPD conversion is safe
- buildSafePPDConversionMap(_readType, fileType);
+ buildSafePPDConversionMap(readType, fileType);
- for (uint64_t i = 0; i < _readType->getSubtypeCount(); ++i) {
- auto subType = _readType->getSubtype(i);
+ for (uint64_t i = 0; i < readType->getSubtypeCount(); ++i) {
+ auto subType = readType->getSubtype(i);
if (subType) {
// null subType means that this is a sub column of map/list type
// and it does not exist in the file. simply skip it.
@@ -164,20 +169,20 @@ namespace orc {
return kind != STRUCT && kind != MAP && kind != LIST && kind != UNION;
}
- void SchemaEvolution::buildSafePPDConversionMap(const Type* _readType, const Type* fileType) {
- if (_readType == nullptr || !isPrimitive(_readType) || fileType == nullptr ||
+ void SchemaEvolution::buildSafePPDConversionMap(const Type* readType, const Type* fileType) {
+ if (readType == nullptr || !isPrimitive(readType) || fileType == nullptr ||
!isPrimitive(fileType)) {
return;
}
bool isSafe = false;
- if (_readType == fileType) {
+ if (readType == fileType) {
// short cut for same type
isSafe = true;
- } else if (_readType->getKind() == DECIMAL && fileType->getKind() == DECIMAL) {
+ } else if (readType->getKind() == DECIMAL && fileType->getKind() == DECIMAL) {
// for decimals alone do equality check to not mess up with precision change
- if (fileType->getPrecision() == readType->getPrecision() &&
- fileType->getScale() == readType->getScale()) {
+ if (fileType->getPrecision() == readType_->getPrecision() &&
+ fileType->getScale() == readType_->getScale()) {
isSafe = true;
}
} else {
@@ -195,32 +200,32 @@ namespace orc {
// as ORC stores char with padded spaces in its internal index.
switch (fileType->getKind()) {
case BYTE: {
- if (readType->getKind() == SHORT || readType->getKind() == INT ||
- readType->getKind() == LONG) {
+ if (readType_->getKind() == SHORT || readType_->getKind() == INT ||
+ readType_->getKind() == LONG) {
isSafe = true;
}
break;
}
case SHORT: {
- if (readType->getKind() == INT || readType->getKind() == LONG) {
+ if (readType_->getKind() == INT || readType_->getKind() == LONG) {
isSafe = true;
}
break;
}
case INT: {
- if (readType->getKind() == LONG) {
+ if (readType_->getKind() == LONG) {
isSafe = true;
}
break;
}
case STRING: {
- if (readType->getKind() == VARCHAR) {
+ if (readType_->getKind() == VARCHAR) {
isSafe = true;
}
break;
}
case VARCHAR: {
- if (readType->getKind() == STRING) {
+ if (readType_->getKind() == STRING) {
isSafe = true;
}
break;
@@ -244,12 +249,12 @@ namespace orc {
}
if (isSafe) {
- safePPDConversionMap.insert(fileType->getColumnId());
+ safePPDConversionMap_.insert(fileType->getColumnId());
}
}
bool SchemaEvolution::isSafePPDConversion(uint64_t columnId) const {
- return safePPDConversionMap.find(columnId) != safePPDConversionMap.cend();
+ return safePPDConversionMap_.find(columnId) != safePPDConversionMap_.cend();
}
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/SchemaEvolution.hh b/contrib/libs/apache/orc/c++/src/SchemaEvolution.hh
index ef9020eba4..c3deff7236 100644
--- a/contrib/libs/apache/orc/c++/src/SchemaEvolution.hh
+++ b/contrib/libs/apache/orc/c++/src/SchemaEvolution.hh
@@ -46,7 +46,7 @@ namespace orc {
// return selected read type
const Type* getReadType() const {
- return readType.get();
+ return readType_.get();
}
private:
@@ -54,9 +54,9 @@ namespace orc {
void buildSafePPDConversionMap(const Type* readType, const Type* fileType);
private:
- const std::shared_ptr<Type> readType;
- std::unordered_map<uint64_t, const Type*> readTypeMap;
- std::unordered_set<uint64_t> safePPDConversionMap;
+ const std::shared_ptr<Type> readType_;
+ std::unordered_map<uint64_t, const Type*> readTypeMap_;
+ std::unordered_set<uint64_t> safePPDConversionMap_;
};
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/Statistics.cc b/contrib/libs/apache/orc/c++/src/Statistics.cc
index 8ed29d0e7c..76fd736b27 100644
--- a/contrib/libs/apache/orc/c++/src/Statistics.cc
+++ b/contrib/libs/apache/orc/c++/src/Statistics.cc
@@ -52,18 +52,18 @@ namespace orc {
StatisticsImpl::StatisticsImpl(const proto::StripeStatistics& stripeStats,
const StatContext& statContext) {
for (int i = 0; i < stripeStats.col_stats_size(); i++) {
- colStats.push_back(convertColumnStatistics(stripeStats.col_stats(i), statContext));
+ colStats_.push_back(convertColumnStatistics(stripeStats.col_stats(i), statContext));
}
}
StatisticsImpl::StatisticsImpl(const proto::Footer& footer, const StatContext& statContext) {
for (int i = 0; i < footer.statistics_size(); i++) {
- colStats.push_back(convertColumnStatistics(footer.statistics(i), statContext));
+ colStats_.push_back(convertColumnStatistics(footer.statistics(i), statContext));
}
}
StatisticsImpl::~StatisticsImpl() {
- for (std::vector<ColumnStatistics*>::iterator ptr = colStats.begin(); ptr != colStats.end();
+ for (std::vector<ColumnStatistics*>::iterator ptr = colStats_.begin(); ptr != colStats_.end();
++ptr) {
delete *ptr;
}
@@ -85,11 +85,11 @@ namespace orc {
const proto::StripeStatistics& stripeStats,
std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
const StatContext& statContext) {
- columnStats = std::make_unique<StatisticsImpl>(stripeStats, statContext);
- rowIndexStats.resize(indexStats.size());
- for (size_t i = 0; i < rowIndexStats.size(); i++) {
+ columnStats_ = std::make_unique<StatisticsImpl>(stripeStats, statContext);
+ rowIndexStats_.resize(indexStats.size());
+ for (size_t i = 0; i < rowIndexStats_.size(); i++) {
for (size_t j = 0; j < indexStats[i].size(); j++) {
- rowIndexStats[i].push_back(std::shared_ptr<const ColumnStatistics>(
+ rowIndexStats_[i].push_back(std::shared_ptr<const ColumnStatistics>(
convertColumnStatistics(indexStats[i][j], statContext)));
}
}
@@ -180,205 +180,205 @@ namespace orc {
}
ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& pb) {
- _stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ stats_.setNumberOfValues(pb.number_of_values());
+ stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
}
BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
- _stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ stats_.setNumberOfValues(pb.number_of_values());
+ stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (pb.has_binary_statistics() && statContext.correctStats) {
- _stats.setHasTotalLength(pb.binary_statistics().has_sum());
- _stats.setTotalLength(static_cast<uint64_t>(pb.binary_statistics().sum()));
+ stats_.setHasTotalLength(pb.binary_statistics().has_sum());
+ stats_.setTotalLength(static_cast<uint64_t>(pb.binary_statistics().sum()));
}
}
BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
- _stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ stats_.setNumberOfValues(pb.number_of_values());
+ stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (pb.has_bucket_statistics() && statContext.correctStats) {
- _hasCount = true;
- _trueCount = pb.bucket_statistics().count(0);
+ hasCount_ = true;
+ trueCount_ = pb.bucket_statistics().count(0);
} else {
- _hasCount = false;
- _trueCount = 0;
+ hasCount_ = false;
+ trueCount_ = 0;
}
}
DateColumnStatisticsImpl::DateColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
- _stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ stats_.setNumberOfValues(pb.number_of_values());
+ stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_date_statistics() || !statContext.correctStats) {
// hasMinimum_ is false by default;
// hasMaximum_ is false by default;
- _stats.setMinimum(0);
- _stats.setMaximum(0);
+ stats_.setMinimum(0);
+ stats_.setMaximum(0);
} else {
- _stats.setHasMinimum(pb.date_statistics().has_minimum());
- _stats.setHasMaximum(pb.date_statistics().has_maximum());
- _stats.setMinimum(pb.date_statistics().minimum());
- _stats.setMaximum(pb.date_statistics().maximum());
+ stats_.setHasMinimum(pb.date_statistics().has_minimum());
+ stats_.setHasMaximum(pb.date_statistics().has_maximum());
+ stats_.setMinimum(pb.date_statistics().minimum());
+ stats_.setMaximum(pb.date_statistics().maximum());
}
}
DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
- _stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ stats_.setNumberOfValues(pb.number_of_values());
+ stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (pb.has_decimal_statistics() && statContext.correctStats) {
const proto::DecimalStatistics& stats = pb.decimal_statistics();
- _stats.setHasMinimum(stats.has_minimum());
- _stats.setHasMaximum(stats.has_maximum());
- _stats.setHasSum(stats.has_sum());
+ stats_.setHasMinimum(stats.has_minimum());
+ stats_.setHasMaximum(stats.has_maximum());
+ stats_.setHasSum(stats.has_sum());
- _stats.setMinimum(Decimal(stats.minimum()));
- _stats.setMaximum(Decimal(stats.maximum()));
- _stats.setSum(Decimal(stats.sum()));
+ stats_.setMinimum(Decimal(stats.minimum()));
+ stats_.setMaximum(Decimal(stats.maximum()));
+ stats_.setSum(Decimal(stats.sum()));
}
}
DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl(const proto::ColumnStatistics& pb) {
- _stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ stats_.setNumberOfValues(pb.number_of_values());
+ stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_double_statistics()) {
- _stats.setMinimum(0);
- _stats.setMaximum(0);
- _stats.setSum(0);
+ stats_.setMinimum(0);
+ stats_.setMaximum(0);
+ stats_.setSum(0);
} else {
const proto::DoubleStatistics& stats = pb.double_statistics();
- _stats.setHasMinimum(stats.has_minimum());
- _stats.setHasMaximum(stats.has_maximum());
- _stats.setHasSum(stats.has_sum());
+ stats_.setHasMinimum(stats.has_minimum());
+ stats_.setHasMaximum(stats.has_maximum());
+ stats_.setHasSum(stats.has_sum());
- _stats.setMinimum(stats.minimum());
- _stats.setMaximum(stats.maximum());
- _stats.setSum(stats.sum());
+ stats_.setMinimum(stats.minimum());
+ stats_.setMaximum(stats.maximum());
+ stats_.setSum(stats.sum());
}
}
IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl(const proto::ColumnStatistics& pb) {
- _stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ stats_.setNumberOfValues(pb.number_of_values());
+ stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_int_statistics()) {
- _stats.setMinimum(0);
- _stats.setMaximum(0);
- _stats.setSum(0);
+ stats_.setMinimum(0);
+ stats_.setMaximum(0);
+ stats_.setSum(0);
} else {
const proto::IntegerStatistics& stats = pb.int_statistics();
- _stats.setHasMinimum(stats.has_minimum());
- _stats.setHasMaximum(stats.has_maximum());
- _stats.setHasSum(stats.has_sum());
+ stats_.setHasMinimum(stats.has_minimum());
+ stats_.setHasMaximum(stats.has_maximum());
+ stats_.setHasSum(stats.has_sum());
- _stats.setMinimum(stats.minimum());
- _stats.setMaximum(stats.maximum());
- _stats.setSum(stats.sum());
+ stats_.setMinimum(stats.minimum());
+ stats_.setMaximum(stats.maximum());
+ stats_.setSum(stats.sum());
}
}
StringColumnStatisticsImpl::StringColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
- _stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ stats_.setNumberOfValues(pb.number_of_values());
+ stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_string_statistics() || !statContext.correctStats) {
- _stats.setTotalLength(0);
+ stats_.setTotalLength(0);
} else {
const proto::StringStatistics& stats = pb.string_statistics();
- _stats.setHasMinimum(stats.has_minimum());
- _stats.setHasMaximum(stats.has_maximum());
- _stats.setHasTotalLength(stats.has_sum());
+ stats_.setHasMinimum(stats.has_minimum());
+ stats_.setHasMaximum(stats.has_maximum());
+ stats_.setHasTotalLength(stats.has_sum());
- _stats.setMinimum(stats.minimum());
- _stats.setMaximum(stats.maximum());
- _stats.setTotalLength(static_cast<uint64_t>(stats.sum()));
+ stats_.setMinimum(stats.minimum());
+ stats_.setMaximum(stats.maximum());
+ stats_.setTotalLength(static_cast<uint64_t>(stats.sum()));
}
}
TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
- _stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ stats_.setNumberOfValues(pb.number_of_values());
+ stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_timestamp_statistics() || !statContext.correctStats) {
- _stats.setMinimum(0);
- _stats.setMaximum(0);
- _lowerBound = 0;
- _upperBound = 0;
- _minimumNanos = DEFAULT_MIN_NANOS;
- _maximumNanos = DEFAULT_MAX_NANOS;
+ stats_.setMinimum(0);
+ stats_.setMaximum(0);
+ lowerBound_ = 0;
+ upperBound_ = 0;
+ minimumNanos_ = DEFAULT_MIN_NANOS;
+ maximumNanos_ = DEFAULT_MAX_NANOS;
} else {
const proto::TimestampStatistics& stats = pb.timestamp_statistics();
- _stats.setHasMinimum(stats.has_minimum_utc() ||
+ stats_.setHasMinimum(stats.has_minimum_utc() ||
(stats.has_minimum() && (statContext.writerTimezone != nullptr)));
- _stats.setHasMaximum(stats.has_maximum_utc() ||
+ stats_.setHasMaximum(stats.has_maximum_utc() ||
(stats.has_maximum() && (statContext.writerTimezone != nullptr)));
- _hasLowerBound = stats.has_minimum_utc() || stats.has_minimum();
- _hasUpperBound = stats.has_maximum_utc() || stats.has_maximum();
+ hasLowerBound_ = stats.has_minimum_utc() || stats.has_minimum();
+ hasUpperBound_ = stats.has_maximum_utc() || stats.has_maximum();
// to be consistent with java side, non-default minimum_nanos and maximum_nanos
// are added by one in their serialized form.
- _minimumNanos = stats.has_minimum_nanos() ? stats.minimum_nanos() - 1 : DEFAULT_MIN_NANOS;
- _maximumNanos = stats.has_maximum_nanos() ? stats.maximum_nanos() - 1 : DEFAULT_MAX_NANOS;
+ minimumNanos_ = stats.has_minimum_nanos() ? stats.minimum_nanos() - 1 : DEFAULT_MIN_NANOS;
+ maximumNanos_ = stats.has_maximum_nanos() ? stats.maximum_nanos() - 1 : DEFAULT_MAX_NANOS;
// Timestamp stats are stored in milliseconds
if (stats.has_minimum_utc()) {
int64_t minimum = stats.minimum_utc();
- _stats.setMinimum(minimum);
- _lowerBound = minimum;
+ stats_.setMinimum(minimum);
+ lowerBound_ = minimum;
} else if (statContext.writerTimezone) {
int64_t writerTimeSec = stats.minimum() / 1000;
// multiply the offset by 1000 to convert to millisecond
int64_t minimum = stats.minimum() +
(statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000;
- _stats.setMinimum(minimum);
- _lowerBound = minimum;
+ stats_.setMinimum(minimum);
+ lowerBound_ = minimum;
} else {
- _stats.setMinimum(0);
+ stats_.setMinimum(0);
// subtract 1 day 1 hour (25 hours) in milliseconds to handle unknown
// TZ and daylight savings
- _lowerBound = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000);
+ lowerBound_ = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000);
}
// Timestamp stats are stored in milliseconds
if (stats.has_maximum_utc()) {
int64_t maximum = stats.maximum_utc();
- _stats.setMaximum(maximum);
- _upperBound = maximum;
+ stats_.setMaximum(maximum);
+ upperBound_ = maximum;
} else if (statContext.writerTimezone) {
int64_t writerTimeSec = stats.maximum() / 1000;
// multiply the offset by 1000 to convert to millisecond
int64_t maximum = stats.maximum() +
(statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000;
- _stats.setMaximum(maximum);
- _upperBound = maximum;
+ stats_.setMaximum(maximum);
+ upperBound_ = maximum;
} else {
- _stats.setMaximum(0);
+ stats_.setMaximum(0);
// add 1 day 1 hour (25 hours) in milliseconds to handle unknown
// TZ and daylight savings
- _upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000);
+ upperBound_ = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000);
}
// Add 1 millisecond to account for microsecond precision of values
- _upperBound += 1;
+ upperBound_ += 1;
}
}
CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl(
const proto::ColumnStatistics& pb) {
- _stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ stats_.setNumberOfValues(pb.number_of_values());
+ stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_collection_statistics()) {
- _stats.setMinimum(0);
- _stats.setMaximum(0);
- _stats.setSum(0);
+ stats_.setMinimum(0);
+ stats_.setMaximum(0);
+ stats_.setSum(0);
} else {
const proto::CollectionStatistics& stats = pb.collection_statistics();
- _stats.setHasMinimum(stats.has_min_children());
- _stats.setHasMaximum(stats.has_max_children());
- _stats.setHasSum(stats.has_total_children());
+ stats_.setHasMinimum(stats.has_min_children());
+ stats_.setHasMaximum(stats.has_max_children());
+ stats_.setHasSum(stats.has_total_children());
- _stats.setMinimum(stats.min_children());
- _stats.setMaximum(stats.max_children());
- _stats.setSum(stats.total_children());
+ stats_.setMinimum(stats.min_children());
+ stats_.setMaximum(stats.max_children());
+ stats_.setSum(stats.total_children());
}
}
diff --git a/contrib/libs/apache/orc/c++/src/Statistics.hh b/contrib/libs/apache/orc/c++/src/Statistics.hh
index e585bf971c..6f212c15cc 100644
--- a/contrib/libs/apache/orc/c++/src/Statistics.hh
+++ b/contrib/libs/apache/orc/c++/src/Statistics.hh
@@ -48,160 +48,160 @@ namespace orc {
template <typename T>
class InternalStatisticsImpl {
private:
- bool _hasNull;
- bool _hasMinimum;
- bool _hasMaximum;
- bool _hasSum;
- bool _hasTotalLength;
- uint64_t _totalLength;
- uint64_t _valueCount;
- T _minimum;
- T _maximum;
- T _sum;
+ bool hasNull_;
+ bool hasMinimum_;
+ bool hasMaximum_;
+ bool hasSum_;
+ bool hasTotalLength_;
+ uint64_t totalLength_;
+ uint64_t valueCount_;
+ T minimum_;
+ T maximum_;
+ T sum_;
public:
InternalStatisticsImpl() {
- _hasNull = false;
- _hasMinimum = false;
- _hasMaximum = false;
- _hasSum = false;
- _hasTotalLength = false;
- _totalLength = 0;
- _valueCount = 0;
+ hasNull_ = false;
+ hasMinimum_ = false;
+ hasMaximum_ = false;
+ hasSum_ = false;
+ hasTotalLength_ = false;
+ totalLength_ = 0;
+ valueCount_ = 0;
}
~InternalStatisticsImpl() {}
- // GET / SET _totalLength
+ // GET / SET totalLength_
bool hasTotalLength() const {
- return _hasTotalLength;
+ return hasTotalLength_;
}
void setHasTotalLength(bool hasTotalLength) {
- _hasTotalLength = hasTotalLength;
+ hasTotalLength_ = hasTotalLength;
}
uint64_t getTotalLength() const {
- return _totalLength;
+ return totalLength_;
}
void setTotalLength(uint64_t totalLength) {
- _totalLength = totalLength;
+ totalLength_ = totalLength;
}
- // GET / SET _sum
+ // GET / SET sum_
bool hasSum() const {
- return _hasSum;
+ return hasSum_;
}
void setHasSum(bool hasSum) {
- _hasSum = hasSum;
+ hasSum_ = hasSum;
}
T getSum() const {
- return _sum;
+ return sum_;
}
void setSum(T sum) {
- _sum = sum;
+ sum_ = sum;
}
- // GET / SET _maximum
+ // GET / SET maximum_
bool hasMaximum() const {
- return _hasMaximum;
+ return hasMaximum_;
}
const T& getMaximum() const {
- return _maximum;
+ return maximum_;
}
void setHasMaximum(bool hasMax) {
- _hasMaximum = hasMax;
+ hasMaximum_ = hasMax;
}
void setMaximum(T max) {
- _maximum = max;
+ maximum_ = max;
}
- // GET / SET _minimum
+ // GET / SET minimum_
bool hasMinimum() const {
- return _hasMinimum;
+ return hasMinimum_;
}
void setHasMinimum(bool hasMin) {
- _hasMinimum = hasMin;
+ hasMinimum_ = hasMin;
}
const T& getMinimum() const {
- return _minimum;
+ return minimum_;
}
void setMinimum(T min) {
- _minimum = min;
+ minimum_ = min;
}
- // GET / SET _valueCount
+ // GET / SET valueCount_
uint64_t getNumberOfValues() const {
- return _valueCount;
+ return valueCount_;
}
void setNumberOfValues(uint64_t numValues) {
- _valueCount = numValues;
+ valueCount_ = numValues;
}
- // GET / SET _hasNullValue
+ // GET / SET hasNull_
bool hasNull() const {
- return _hasNull;
+ return hasNull_;
}
void setHasNull(bool hasNull) {
- _hasNull = hasNull;
+ hasNull_ = hasNull;
}
void reset() {
- _hasNull = false;
- _hasMinimum = false;
- _hasMaximum = false;
- _hasSum = false;
- _hasTotalLength = false;
- _totalLength = 0;
- _valueCount = 0;
+ hasNull_ = false;
+ hasMinimum_ = false;
+ hasMaximum_ = false;
+ hasSum_ = false;
+ hasTotalLength_ = false;
+ totalLength_ = 0;
+ valueCount_ = 0;
}
void updateMinMax(T value) {
- if (!_hasMinimum) {
- _hasMinimum = _hasMaximum = true;
- _minimum = _maximum = value;
- } else if (compare(value, _minimum)) {
- _minimum = value;
- } else if (compare(_maximum, value)) {
- _maximum = value;
+ if (!hasMinimum_) {
+ hasMinimum_ = hasMaximum_ = true;
+ minimum_ = maximum_ = value;
+ } else if (compare(value, minimum_)) {
+ minimum_ = value;
+ } else if (compare(maximum_, value)) {
+ maximum_ = value;
}
}
// sum is not merged here as we need to check overflow
void merge(const InternalStatisticsImpl& other) {
- _hasNull = _hasNull || other._hasNull;
- _valueCount += other._valueCount;
-
- if (other._hasMinimum) {
- if (!_hasMinimum) {
- _hasMinimum = _hasMaximum = true;
- _minimum = other._minimum;
- _maximum = other._maximum;
+ hasNull_ = hasNull_ || other.hasNull_;
+ valueCount_ += other.valueCount_;
+
+ if (other.hasMinimum_) {
+ if (!hasMinimum_) {
+ hasMinimum_ = hasMaximum_ = true;
+ minimum_ = other.minimum_;
+ maximum_ = other.maximum_;
} else {
// all template types should support operator<
- if (compare(_maximum, other._maximum)) {
- _maximum = other._maximum;
+ if (compare(maximum_, other.maximum_)) {
+ maximum_ = other.maximum_;
}
- if (compare(other._minimum, _minimum)) {
- _minimum = other._minimum;
+ if (compare(other.minimum_, minimum_)) {
+ minimum_ = other.minimum_;
}
}
}
- _hasTotalLength = _hasTotalLength && other._hasTotalLength;
- _totalLength += other._totalLength;
+ hasTotalLength_ = hasTotalLength_ && other.hasTotalLength_;
+ totalLength_ += other.totalLength_;
}
};
@@ -240,7 +240,7 @@ namespace orc {
class ColumnStatisticsImpl : public ColumnStatistics, public MutableColumnStatistics {
private:
- InternalCharStatistics _stats;
+ InternalCharStatistics stats_;
public:
ColumnStatisticsImpl() {
@@ -250,36 +250,36 @@ namespace orc {
virtual ~ColumnStatisticsImpl() override;
uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
+ return stats_.getNumberOfValues();
}
void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
+ stats_.setNumberOfValues(value);
}
void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ stats_.setNumberOfValues(stats_.getNumberOfValues() + count);
}
bool hasNull() const override {
- return _stats.hasNull();
+ return stats_.hasNull();
}
void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
+ stats_.setHasNull(hasNull);
}
void merge(const MutableColumnStatistics& other) override {
- _stats.merge(dynamic_cast<const ColumnStatisticsImpl&>(other)._stats);
+ stats_.merge(dynamic_cast<const ColumnStatisticsImpl&>(other).stats_);
}
void reset() override {
- _stats.reset();
+ stats_.reset();
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_has_null(_stats.hasNull());
- pbStats.set_number_of_values(_stats.getNumberOfValues());
+ pbStats.set_has_null(stats_.hasNull());
+ pbStats.set_number_of_values(stats_.getNumberOfValues());
}
std::string toString() const override {
@@ -292,7 +292,7 @@ namespace orc {
class BinaryColumnStatisticsImpl : public BinaryColumnStatistics, public MutableColumnStatistics {
private:
- InternalCharStatistics _stats;
+ InternalCharStatistics stats_;
public:
BinaryColumnStatisticsImpl() {
@@ -303,63 +303,63 @@ namespace orc {
virtual ~BinaryColumnStatisticsImpl() override;
uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
+ return stats_.getNumberOfValues();
}
void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
+ stats_.setNumberOfValues(value);
}
void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ stats_.setNumberOfValues(stats_.getNumberOfValues() + count);
}
bool hasNull() const override {
- return _stats.hasNull();
+ return stats_.hasNull();
}
void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
+ stats_.setHasNull(hasNull);
}
bool hasTotalLength() const override {
- return _stats.hasTotalLength();
+ return stats_.hasTotalLength();
}
uint64_t getTotalLength() const override {
if (hasTotalLength()) {
- return _stats.getTotalLength();
+ return stats_.getTotalLength();
} else {
throw ParseError("Total length is not defined.");
}
}
void setTotalLength(uint64_t length) {
- _stats.setHasTotalLength(true);
- _stats.setTotalLength(length);
+ stats_.setHasTotalLength(true);
+ stats_.setTotalLength(length);
}
void update(size_t length) {
- _stats.setTotalLength(_stats.getTotalLength() + length);
+ stats_.setTotalLength(stats_.getTotalLength() + length);
}
void merge(const MutableColumnStatistics& other) override {
const BinaryColumnStatisticsImpl& binStats =
dynamic_cast<const BinaryColumnStatisticsImpl&>(other);
- _stats.merge(binStats._stats);
+ stats_.merge(binStats.stats_);
}
void reset() override {
- _stats.reset();
+ stats_.reset();
setTotalLength(0);
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_has_null(_stats.hasNull());
- pbStats.set_number_of_values(_stats.getNumberOfValues());
+ pbStats.set_has_null(stats_.hasNull());
+ pbStats.set_number_of_values(stats_.getNumberOfValues());
proto::BinaryStatistics* binStats = pbStats.mutable_binary_statistics();
- binStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
+ binStats->set_sum(static_cast<int64_t>(stats_.getTotalLength()));
}
std::string toString() const override {
@@ -379,9 +379,9 @@ namespace orc {
class BooleanColumnStatisticsImpl : public BooleanColumnStatistics,
public MutableColumnStatistics {
private:
- InternalBooleanStatistics _stats;
- bool _hasCount;
- uint64_t _trueCount;
+ InternalBooleanStatistics stats_;
+ bool hasCount_;
+ uint64_t trueCount_;
public:
BooleanColumnStatisticsImpl() {
@@ -392,33 +392,33 @@ namespace orc {
virtual ~BooleanColumnStatisticsImpl() override;
bool hasCount() const override {
- return _hasCount;
+ return hasCount_;
}
void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
- _hasCount = true;
+ stats_.setNumberOfValues(stats_.getNumberOfValues() + count);
+ hasCount_ = true;
}
uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
+ return stats_.getNumberOfValues();
}
void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
+ stats_.setNumberOfValues(value);
}
bool hasNull() const override {
- return _stats.hasNull();
+ return stats_.hasNull();
}
void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
+ stats_.setHasNull(hasNull);
}
uint64_t getFalseCount() const override {
if (hasCount()) {
- return getNumberOfValues() - _trueCount;
+ return getNumberOfValues() - trueCount_;
} else {
throw ParseError("False count is not defined.");
}
@@ -426,43 +426,43 @@ namespace orc {
uint64_t getTrueCount() const override {
if (hasCount()) {
- return _trueCount;
+ return trueCount_;
} else {
throw ParseError("True count is not defined.");
}
}
void setTrueCount(uint64_t trueCount) {
- _hasCount = true;
- _trueCount = trueCount;
+ hasCount_ = true;
+ trueCount_ = trueCount;
}
void update(bool value, size_t repetitions) {
if (value) {
- _trueCount += repetitions;
+ trueCount_ += repetitions;
}
}
void merge(const MutableColumnStatistics& other) override {
const BooleanColumnStatisticsImpl& boolStats =
dynamic_cast<const BooleanColumnStatisticsImpl&>(other);
- _stats.merge(boolStats._stats);
- _hasCount = _hasCount && boolStats._hasCount;
- _trueCount += boolStats._trueCount;
+ stats_.merge(boolStats.stats_);
+ hasCount_ = hasCount_ && boolStats.hasCount_;
+ trueCount_ += boolStats.trueCount_;
}
void reset() override {
- _stats.reset();
+ stats_.reset();
setTrueCount(0);
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_has_null(_stats.hasNull());
- pbStats.set_number_of_values(_stats.getNumberOfValues());
+ pbStats.set_has_null(stats_.hasNull());
+ pbStats.set_number_of_values(stats_.getNumberOfValues());
proto::BucketStatistics* bucketStats = pbStats.mutable_bucket_statistics();
- if (_hasCount) {
- bucketStats->add_count(_trueCount);
+ if (hasCount_) {
+ bucketStats->add_count(trueCount_);
} else {
bucketStats->clear_count();
}
@@ -485,7 +485,7 @@ namespace orc {
class DateColumnStatisticsImpl : public DateColumnStatistics, public MutableColumnStatistics {
private:
- InternalDateStatistics _stats;
+ InternalDateStatistics stats_;
public:
DateColumnStatisticsImpl() {
@@ -495,36 +495,36 @@ namespace orc {
virtual ~DateColumnStatisticsImpl() override;
bool hasMinimum() const override {
- return _stats.hasMinimum();
+ return stats_.hasMinimum();
}
bool hasMaximum() const override {
- return _stats.hasMaximum();
+ return stats_.hasMaximum();
}
void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ stats_.setNumberOfValues(stats_.getNumberOfValues() + count);
}
uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
+ return stats_.getNumberOfValues();
}
void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
+ stats_.setNumberOfValues(value);
}
bool hasNull() const override {
- return _stats.hasNull();
+ return stats_.hasNull();
}
void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
+ stats_.setHasNull(hasNull);
}
int32_t getMinimum() const override {
if (hasMinimum()) {
- return _stats.getMinimum();
+ return stats_.getMinimum();
} else {
throw ParseError("Minimum is not defined.");
}
@@ -532,44 +532,44 @@ namespace orc {
int32_t getMaximum() const override {
if (hasMaximum()) {
- return _stats.getMaximum();
+ return stats_.getMaximum();
} else {
throw ParseError("Maximum is not defined.");
}
}
void setMinimum(int32_t minimum) {
- _stats.setHasMinimum(true);
- _stats.setMinimum(minimum);
+ stats_.setHasMinimum(true);
+ stats_.setMinimum(minimum);
}
void setMaximum(int32_t maximum) {
- _stats.setHasMaximum(true);
- _stats.setMaximum(maximum);
+ stats_.setHasMaximum(true);
+ stats_.setMaximum(maximum);
}
void update(int32_t value) {
- _stats.updateMinMax(value);
+ stats_.updateMinMax(value);
}
void merge(const MutableColumnStatistics& other) override {
const DateColumnStatisticsImpl& dateStats =
dynamic_cast<const DateColumnStatisticsImpl&>(other);
- _stats.merge(dateStats._stats);
+ stats_.merge(dateStats.stats_);
}
void reset() override {
- _stats.reset();
+ stats_.reset();
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_has_null(_stats.hasNull());
- pbStats.set_number_of_values(_stats.getNumberOfValues());
+ pbStats.set_has_null(stats_.hasNull());
+ pbStats.set_number_of_values(stats_.getNumberOfValues());
proto::DateStatistics* dateStatistics = pbStats.mutable_date_statistics();
- if (_stats.hasMinimum()) {
- dateStatistics->set_maximum(_stats.getMaximum());
- dateStatistics->set_minimum(_stats.getMinimum());
+ if (stats_.hasMinimum()) {
+ dateStatistics->set_maximum(stats_.getMaximum());
+ dateStatistics->set_minimum(stats_.getMinimum());
} else {
dateStatistics->clear_minimum();
dateStatistics->clear_maximum();
@@ -599,7 +599,7 @@ namespace orc {
class DecimalColumnStatisticsImpl : public DecimalColumnStatistics,
public MutableColumnStatistics {
private:
- InternalDecimalStatistics _stats;
+ InternalDecimalStatistics stats_;
public:
DecimalColumnStatisticsImpl() {
@@ -610,40 +610,40 @@ namespace orc {
virtual ~DecimalColumnStatisticsImpl() override;
bool hasMinimum() const override {
- return _stats.hasMinimum();
+ return stats_.hasMinimum();
}
bool hasMaximum() const override {
- return _stats.hasMaximum();
+ return stats_.hasMaximum();
}
bool hasSum() const override {
- return _stats.hasSum();
+ return stats_.hasSum();
}
void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ stats_.setNumberOfValues(stats_.getNumberOfValues() + count);
}
uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
+ return stats_.getNumberOfValues();
}
void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
+ stats_.setNumberOfValues(value);
}
bool hasNull() const override {
- return _stats.hasNull();
+ return stats_.hasNull();
}
void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
+ stats_.setHasNull(hasNull);
}
Decimal getMinimum() const override {
if (hasMinimum()) {
- return _stats.getMinimum();
+ return stats_.getMinimum();
} else {
throw ParseError("Minimum is not defined.");
}
@@ -651,39 +651,39 @@ namespace orc {
Decimal getMaximum() const override {
if (hasMaximum()) {
- return _stats.getMaximum();
+ return stats_.getMaximum();
} else {
throw ParseError("Maximum is not defined.");
}
}
void setMinimum(Decimal minimum) {
- _stats.setHasMinimum(true);
- _stats.setMinimum(minimum);
+ stats_.setHasMinimum(true);
+ stats_.setMinimum(minimum);
}
void setMaximum(Decimal maximum) {
- _stats.setHasMaximum(true);
- _stats.setMaximum(maximum);
+ stats_.setHasMaximum(true);
+ stats_.setMaximum(maximum);
}
Decimal getSum() const override {
if (hasSum()) {
- return _stats.getSum();
+ return stats_.getSum();
} else {
throw ParseError("Sum is not defined.");
}
}
void setSum(Decimal sum) {
- _stats.setHasSum(true);
- _stats.setSum(sum);
+ stats_.setHasSum(true);
+ stats_.setSum(sum);
}
void update(const Decimal& value) {
- _stats.updateMinMax(value);
+ stats_.updateMinMax(value);
- if (_stats.hasSum()) {
+ if (stats_.hasSum()) {
updateSum(value);
}
}
@@ -692,33 +692,33 @@ namespace orc {
const DecimalColumnStatisticsImpl& decStats =
dynamic_cast<const DecimalColumnStatisticsImpl&>(other);
- _stats.merge(decStats._stats);
+ stats_.merge(decStats.stats_);
- _stats.setHasSum(_stats.hasSum() && decStats.hasSum());
- if (_stats.hasSum()) {
+ stats_.setHasSum(stats_.hasSum() && decStats.hasSum());
+ if (stats_.hasSum()) {
updateSum(decStats.getSum());
}
}
void reset() override {
- _stats.reset();
+ stats_.reset();
setSum(Decimal());
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_has_null(_stats.hasNull());
- pbStats.set_number_of_values(_stats.getNumberOfValues());
+ pbStats.set_has_null(stats_.hasNull());
+ pbStats.set_number_of_values(stats_.getNumberOfValues());
proto::DecimalStatistics* decStats = pbStats.mutable_decimal_statistics();
- if (_stats.hasMinimum()) {
- decStats->set_minimum(_stats.getMinimum().toString(true));
- decStats->set_maximum(_stats.getMaximum().toString(true));
+ if (stats_.hasMinimum()) {
+ decStats->set_minimum(stats_.getMinimum().toString(true));
+ decStats->set_maximum(stats_.getMaximum().toString(true));
} else {
decStats->clear_minimum();
decStats->clear_maximum();
}
- if (_stats.hasSum()) {
- decStats->set_sum(_stats.getSum().toString(true));
+ if (stats_.hasSum()) {
+ decStats->set_sum(stats_.getSum().toString(true));
} else {
decStats->clear_sum();
}
@@ -752,9 +752,9 @@ namespace orc {
private:
void updateSum(Decimal value) {
- if (_stats.hasSum()) {
+ if (stats_.hasSum()) {
bool overflow = false;
- Decimal sum = _stats.getSum();
+ Decimal sum = stats_.getSum();
if (sum.scale > value.scale) {
value.value = scaleUpInt128ByPowerOfTen(value.value, sum.scale - value.scale, overflow);
} else if (sum.scale < value.scale) {
@@ -766,14 +766,14 @@ namespace orc {
bool wasPositive = sum.value >= 0;
sum.value += value.value;
if ((value.value >= 0) == wasPositive) {
- _stats.setHasSum((sum.value >= 0) == wasPositive);
+ stats_.setHasSum((sum.value >= 0) == wasPositive);
}
} else {
- _stats.setHasSum(false);
+ stats_.setHasSum(false);
}
- if (_stats.hasSum()) {
- _stats.setSum(sum);
+ if (stats_.hasSum()) {
+ stats_.setSum(sum);
}
}
}
@@ -781,7 +781,7 @@ namespace orc {
class DoubleColumnStatisticsImpl : public DoubleColumnStatistics, public MutableColumnStatistics {
private:
- InternalDoubleStatistics _stats;
+ InternalDoubleStatistics stats_;
public:
DoubleColumnStatisticsImpl() {
@@ -791,40 +791,40 @@ namespace orc {
virtual ~DoubleColumnStatisticsImpl() override;
bool hasMinimum() const override {
- return _stats.hasMinimum();
+ return stats_.hasMinimum();
}
bool hasMaximum() const override {
- return _stats.hasMaximum();
+ return stats_.hasMaximum();
}
bool hasSum() const override {
- return _stats.hasSum();
+ return stats_.hasSum();
}
void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ stats_.setNumberOfValues(stats_.getNumberOfValues() + count);
}
uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
+ return stats_.getNumberOfValues();
}
void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
+ stats_.setNumberOfValues(value);
}
bool hasNull() const override {
- return _stats.hasNull();
+ return stats_.hasNull();
}
void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
+ stats_.setHasNull(hasNull);
}
double getMinimum() const override {
if (hasMinimum()) {
- return _stats.getMinimum();
+ return stats_.getMinimum();
} else {
throw ParseError("Minimum is not defined.");
}
@@ -832,70 +832,70 @@ namespace orc {
double getMaximum() const override {
if (hasMaximum()) {
- return _stats.getMaximum();
+ return stats_.getMaximum();
} else {
throw ParseError("Maximum is not defined.");
}
}
void setMinimum(double minimum) {
- _stats.setHasMinimum(true);
- _stats.setMinimum(minimum);
+ stats_.setHasMinimum(true);
+ stats_.setMinimum(minimum);
}
void setMaximum(double maximum) {
- _stats.setHasMaximum(true);
- _stats.setMaximum(maximum);
+ stats_.setHasMaximum(true);
+ stats_.setMaximum(maximum);
}
double getSum() const override {
if (hasSum()) {
- return _stats.getSum();
+ return stats_.getSum();
} else {
throw ParseError("Sum is not defined.");
}
}
void setSum(double sum) {
- _stats.setHasSum(true);
- _stats.setSum(sum);
+ stats_.setHasSum(true);
+ stats_.setSum(sum);
}
void update(double value) {
- _stats.updateMinMax(value);
- _stats.setSum(_stats.getSum() + value);
+ stats_.updateMinMax(value);
+ stats_.setSum(stats_.getSum() + value);
}
void merge(const MutableColumnStatistics& other) override {
const DoubleColumnStatisticsImpl& doubleStats =
dynamic_cast<const DoubleColumnStatisticsImpl&>(other);
- _stats.merge(doubleStats._stats);
+ stats_.merge(doubleStats.stats_);
- _stats.setHasSum(_stats.hasSum() && doubleStats.hasSum());
- if (_stats.hasSum()) {
- _stats.setSum(_stats.getSum() + doubleStats.getSum());
+ stats_.setHasSum(stats_.hasSum() && doubleStats.hasSum());
+ if (stats_.hasSum()) {
+ stats_.setSum(stats_.getSum() + doubleStats.getSum());
}
}
void reset() override {
- _stats.reset();
+ stats_.reset();
setSum(0.0);
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_has_null(_stats.hasNull());
- pbStats.set_number_of_values(_stats.getNumberOfValues());
+ pbStats.set_has_null(stats_.hasNull());
+ pbStats.set_number_of_values(stats_.getNumberOfValues());
proto::DoubleStatistics* doubleStats = pbStats.mutable_double_statistics();
- if (_stats.hasMinimum()) {
- doubleStats->set_minimum(_stats.getMinimum());
- doubleStats->set_maximum(_stats.getMaximum());
+ if (stats_.hasMinimum()) {
+ doubleStats->set_minimum(stats_.getMinimum());
+ doubleStats->set_maximum(stats_.getMaximum());
} else {
doubleStats->clear_minimum();
doubleStats->clear_maximum();
}
- if (_stats.hasSum()) {
- doubleStats->set_sum(_stats.getSum());
+ if (stats_.hasSum()) {
+ doubleStats->set_sum(stats_.getSum());
} else {
doubleStats->clear_sum();
}
@@ -930,7 +930,7 @@ namespace orc {
class IntegerColumnStatisticsImpl : public IntegerColumnStatistics,
public MutableColumnStatistics {
private:
- InternalIntegerStatistics _stats;
+ InternalIntegerStatistics stats_;
public:
IntegerColumnStatisticsImpl() {
@@ -940,40 +940,40 @@ namespace orc {
virtual ~IntegerColumnStatisticsImpl() override;
bool hasMinimum() const override {
- return _stats.hasMinimum();
+ return stats_.hasMinimum();
}
bool hasMaximum() const override {
- return _stats.hasMaximum();
+ return stats_.hasMaximum();
}
bool hasSum() const override {
- return _stats.hasSum();
+ return stats_.hasSum();
}
void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ stats_.setNumberOfValues(stats_.getNumberOfValues() + count);
}
uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
+ return stats_.getNumberOfValues();
}
void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
+ stats_.setNumberOfValues(value);
}
bool hasNull() const override {
- return _stats.hasNull();
+ return stats_.hasNull();
}
void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
+ stats_.setHasNull(hasNull);
}
int64_t getMinimum() const override {
if (hasMinimum()) {
- return _stats.getMinimum();
+ return stats_.getMinimum();
} else {
throw ParseError("Minimum is not defined.");
}
@@ -981,48 +981,48 @@ namespace orc {
int64_t getMaximum() const override {
if (hasMaximum()) {
- return _stats.getMaximum();
+ return stats_.getMaximum();
} else {
throw ParseError("Maximum is not defined.");
}
}
void setMinimum(int64_t minimum) {
- _stats.setHasMinimum(true);
- _stats.setMinimum(minimum);
+ stats_.setHasMinimum(true);
+ stats_.setMinimum(minimum);
}
void setMaximum(int64_t maximum) {
- _stats.setHasMaximum(true);
- _stats.setMaximum(maximum);
+ stats_.setHasMaximum(true);
+ stats_.setMaximum(maximum);
}
int64_t getSum() const override {
if (hasSum()) {
- return _stats.getSum();
+ return stats_.getSum();
} else {
throw ParseError("Sum is not defined.");
}
}
void setSum(int64_t sum) {
- _stats.setHasSum(true);
- _stats.setSum(sum);
+ stats_.setHasSum(true);
+ stats_.setSum(sum);
}
void update(int64_t value, int repetitions) {
- _stats.updateMinMax(value);
+ stats_.updateMinMax(value);
- if (_stats.hasSum()) {
+ if (stats_.hasSum()) {
if (repetitions > 1) {
- _stats.setHasSum(multiplyExact(value, repetitions, &value));
+ stats_.setHasSum(multiplyExact(value, repetitions, &value));
}
- if (_stats.hasSum()) {
- _stats.setHasSum(addExact(_stats.getSum(), value, &value));
+ if (stats_.hasSum()) {
+ stats_.setHasSum(addExact(stats_.getSum(), value, &value));
- if (_stats.hasSum()) {
- _stats.setSum(value);
+ if (stats_.hasSum()) {
+ stats_.setSum(value);
}
}
}
@@ -1032,38 +1032,38 @@ namespace orc {
const IntegerColumnStatisticsImpl& intStats =
dynamic_cast<const IntegerColumnStatisticsImpl&>(other);
- _stats.merge(intStats._stats);
+ stats_.merge(intStats.stats_);
// update sum and check overflow
- _stats.setHasSum(_stats.hasSum() && intStats.hasSum());
- if (_stats.hasSum()) {
+ stats_.setHasSum(stats_.hasSum() && intStats.hasSum());
+ if (stats_.hasSum()) {
int64_t value;
- _stats.setHasSum(addExact(_stats.getSum(), intStats.getSum(), &value));
- if (_stats.hasSum()) {
- _stats.setSum(value);
+ stats_.setHasSum(addExact(stats_.getSum(), intStats.getSum(), &value));
+ if (stats_.hasSum()) {
+ stats_.setSum(value);
}
}
}
void reset() override {
- _stats.reset();
+ stats_.reset();
setSum(0);
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_has_null(_stats.hasNull());
- pbStats.set_number_of_values(_stats.getNumberOfValues());
+ pbStats.set_has_null(stats_.hasNull());
+ pbStats.set_number_of_values(stats_.getNumberOfValues());
proto::IntegerStatistics* intStats = pbStats.mutable_int_statistics();
- if (_stats.hasMinimum()) {
- intStats->set_minimum(_stats.getMinimum());
- intStats->set_maximum(_stats.getMaximum());
+ if (stats_.hasMinimum()) {
+ intStats->set_minimum(stats_.getMinimum());
+ intStats->set_maximum(stats_.getMaximum());
} else {
intStats->clear_minimum();
intStats->clear_maximum();
}
- if (_stats.hasSum()) {
- intStats->set_sum(_stats.getSum());
+ if (stats_.hasSum()) {
+ intStats->set_sum(stats_.getSum());
} else {
intStats->clear_sum();
}
@@ -1097,7 +1097,7 @@ namespace orc {
class StringColumnStatisticsImpl : public StringColumnStatistics, public MutableColumnStatistics {
private:
- InternalStringStatistics _stats;
+ InternalStringStatistics stats_;
public:
StringColumnStatisticsImpl() {
@@ -1108,40 +1108,40 @@ namespace orc {
virtual ~StringColumnStatisticsImpl() override;
bool hasMinimum() const override {
- return _stats.hasMinimum();
+ return stats_.hasMinimum();
}
bool hasMaximum() const override {
- return _stats.hasMaximum();
+ return stats_.hasMaximum();
}
bool hasTotalLength() const override {
- return _stats.hasTotalLength();
+ return stats_.hasTotalLength();
}
void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ stats_.setNumberOfValues(stats_.getNumberOfValues() + count);
}
uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
+ return stats_.getNumberOfValues();
}
void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
+ stats_.setNumberOfValues(value);
}
bool hasNull() const override {
- return _stats.hasNull();
+ return stats_.hasNull();
}
void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
+ stats_.setHasNull(hasNull);
}
const std::string& getMinimum() const override {
if (hasMinimum()) {
- return _stats.getMinimum();
+ return stats_.getMinimum();
} else {
throw ParseError("Minimum is not defined.");
}
@@ -1149,59 +1149,59 @@ namespace orc {
const std::string& getMaximum() const override {
if (hasMaximum()) {
- return _stats.getMaximum();
+ return stats_.getMaximum();
} else {
throw ParseError("Maximum is not defined.");
}
}
void setMinimum(std::string minimum) {
- _stats.setHasMinimum(true);
- _stats.setMinimum(minimum);
+ stats_.setHasMinimum(true);
+ stats_.setMinimum(minimum);
}
void setMaximum(std::string maximum) {
- _stats.setHasMaximum(true);
- _stats.setMaximum(maximum);
+ stats_.setHasMaximum(true);
+ stats_.setMaximum(maximum);
}
uint64_t getTotalLength() const override {
if (hasTotalLength()) {
- return _stats.getTotalLength();
+ return stats_.getTotalLength();
} else {
throw ParseError("Total length is not defined.");
}
}
void setTotalLength(uint64_t length) {
- _stats.setHasTotalLength(true);
- _stats.setTotalLength(length);
+ stats_.setHasTotalLength(true);
+ stats_.setTotalLength(length);
}
void update(const char* value, size_t length) {
if (value != nullptr) {
- if (!_stats.hasMinimum()) {
+ if (!stats_.hasMinimum()) {
std::string tempStr(value, value + length);
setMinimum(tempStr);
setMaximum(tempStr);
} else {
// update min
- int minCmp = strncmp(_stats.getMinimum().c_str(), value,
- std::min(_stats.getMinimum().length(), length));
- if (minCmp > 0 || (minCmp == 0 && length < _stats.getMinimum().length())) {
+ int minCmp = strncmp(stats_.getMinimum().c_str(), value,
+ std::min(stats_.getMinimum().length(), length));
+ if (minCmp > 0 || (minCmp == 0 && length < stats_.getMinimum().length())) {
setMinimum(std::string(value, value + length));
}
// update max
- int maxCmp = strncmp(_stats.getMaximum().c_str(), value,
- std::min(_stats.getMaximum().length(), length));
- if (maxCmp < 0 || (maxCmp == 0 && length > _stats.getMaximum().length())) {
+ int maxCmp = strncmp(stats_.getMaximum().c_str(), value,
+ std::min(stats_.getMaximum().length(), length));
+ if (maxCmp < 0 || (maxCmp == 0 && length > stats_.getMaximum().length())) {
setMaximum(std::string(value, value + length));
}
}
}
- _stats.setTotalLength(_stats.getTotalLength() + length);
+ stats_.setTotalLength(stats_.getTotalLength() + length);
}
void update(std::string value) {
@@ -1211,28 +1211,28 @@ namespace orc {
void merge(const MutableColumnStatistics& other) override {
const StringColumnStatisticsImpl& strStats =
dynamic_cast<const StringColumnStatisticsImpl&>(other);
- _stats.merge(strStats._stats);
+ stats_.merge(strStats.stats_);
}
void reset() override {
- _stats.reset();
+ stats_.reset();
setTotalLength(0);
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_has_null(_stats.hasNull());
- pbStats.set_number_of_values(_stats.getNumberOfValues());
+ pbStats.set_has_null(stats_.hasNull());
+ pbStats.set_number_of_values(stats_.getNumberOfValues());
proto::StringStatistics* strStats = pbStats.mutable_string_statistics();
- if (_stats.hasMinimum()) {
- strStats->set_minimum(_stats.getMinimum());
- strStats->set_maximum(_stats.getMaximum());
+ if (stats_.hasMinimum()) {
+ strStats->set_minimum(stats_.getMinimum());
+ strStats->set_maximum(stats_.getMaximum());
} else {
strStats->clear_minimum();
strStats->clear_maximum();
}
- if (_stats.hasTotalLength()) {
- strStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
+ if (stats_.hasTotalLength()) {
+ strStats->set_sum(static_cast<int64_t>(stats_.getTotalLength()));
} else {
strStats->clear_sum();
}
@@ -1267,13 +1267,13 @@ namespace orc {
class TimestampColumnStatisticsImpl : public TimestampColumnStatistics,
public MutableColumnStatistics {
private:
- InternalIntegerStatistics _stats;
- bool _hasLowerBound;
- bool _hasUpperBound;
- int64_t _lowerBound;
- int64_t _upperBound;
- int32_t _minimumNanos; // last 6 digits of nanosecond of minimum timestamp
- int32_t _maximumNanos; // last 6 digits of nanosecond of maximum timestamp
+ InternalIntegerStatistics stats_;
+ bool hasLowerBound_;
+ bool hasUpperBound_;
+ int64_t lowerBound_;
+ int64_t upperBound_;
+ int32_t minimumNanos_; // last 6 digits of nanosecond of minimum timestamp
+ int32_t maximumNanos_; // last 6 digits of nanosecond of maximum timestamp
static constexpr int32_t DEFAULT_MIN_NANOS = 0;
static constexpr int32_t DEFAULT_MAX_NANOS = 999999;
@@ -1286,36 +1286,36 @@ namespace orc {
virtual ~TimestampColumnStatisticsImpl() override;
bool hasMinimum() const override {
- return _stats.hasMinimum();
+ return stats_.hasMinimum();
}
bool hasMaximum() const override {
- return _stats.hasMaximum();
+ return stats_.hasMaximum();
}
uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
+ return stats_.getNumberOfValues();
}
void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
+ stats_.setNumberOfValues(value);
}
void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ stats_.setNumberOfValues(stats_.getNumberOfValues() + count);
}
bool hasNull() const override {
- return _stats.hasNull();
+ return stats_.hasNull();
}
void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
+ stats_.setHasNull(hasNull);
}
int64_t getMinimum() const override {
if (hasMinimum()) {
- return _stats.getMinimum();
+ return stats_.getMinimum();
} else {
throw ParseError("Minimum is not defined.");
}
@@ -1323,46 +1323,46 @@ namespace orc {
int64_t getMaximum() const override {
if (hasMaximum()) {
- return _stats.getMaximum();
+ return stats_.getMaximum();
} else {
throw ParseError("Maximum is not defined.");
}
}
void setMinimum(int64_t minimum) {
- _stats.setHasMinimum(true);
- _stats.setMinimum(minimum);
+ stats_.setHasMinimum(true);
+ stats_.setMinimum(minimum);
}
void setMaximum(int64_t maximum) {
- _stats.setHasMaximum(true);
- _stats.setMaximum(maximum);
+ stats_.setHasMaximum(true);
+ stats_.setMaximum(maximum);
}
void update(int64_t value) {
- _stats.updateMinMax(value);
+ stats_.updateMinMax(value);
}
void update(int64_t milli, int32_t nano) {
- if (!_stats.hasMinimum()) {
- _stats.setHasMinimum(true);
- _stats.setHasMaximum(true);
- _stats.setMinimum(milli);
- _stats.setMaximum(milli);
- _maximumNanos = _minimumNanos = nano;
+ if (!stats_.hasMinimum()) {
+ stats_.setHasMinimum(true);
+ stats_.setHasMaximum(true);
+ stats_.setMinimum(milli);
+ stats_.setMaximum(milli);
+ maximumNanos_ = minimumNanos_ = nano;
} else {
- if (milli <= _stats.getMinimum()) {
- if (milli < _stats.getMinimum() || nano < _minimumNanos) {
- _minimumNanos = nano;
+ if (milli <= stats_.getMinimum()) {
+ if (milli < stats_.getMinimum() || nano < minimumNanos_) {
+ minimumNanos_ = nano;
}
- _stats.setMinimum(milli);
+ stats_.setMinimum(milli);
}
- if (milli >= _stats.getMaximum()) {
- if (milli > _stats.getMaximum() || nano > _maximumNanos) {
- _maximumNanos = nano;
+ if (milli >= stats_.getMaximum()) {
+ if (milli > stats_.getMaximum() || nano > maximumNanos_) {
+ maximumNanos_ = nano;
}
- _stats.setMaximum(milli);
+ stats_.setMaximum(milli);
}
}
}
@@ -1371,55 +1371,55 @@ namespace orc {
const TimestampColumnStatisticsImpl& tsStats =
dynamic_cast<const TimestampColumnStatisticsImpl&>(other);
- _stats.setHasNull(_stats.hasNull() || tsStats.hasNull());
- _stats.setNumberOfValues(_stats.getNumberOfValues() + tsStats.getNumberOfValues());
+ stats_.setHasNull(stats_.hasNull() || tsStats.hasNull());
+ stats_.setNumberOfValues(stats_.getNumberOfValues() + tsStats.getNumberOfValues());
if (tsStats.hasMinimum()) {
- if (!_stats.hasMinimum()) {
- _stats.setHasMinimum(true);
- _stats.setHasMaximum(true);
- _stats.setMinimum(tsStats.getMinimum());
- _stats.setMaximum(tsStats.getMaximum());
- _minimumNanos = tsStats.getMinimumNanos();
- _maximumNanos = tsStats.getMaximumNanos();
+ if (!stats_.hasMinimum()) {
+ stats_.setHasMinimum(true);
+ stats_.setHasMaximum(true);
+ stats_.setMinimum(tsStats.getMinimum());
+ stats_.setMaximum(tsStats.getMaximum());
+ minimumNanos_ = tsStats.getMinimumNanos();
+ maximumNanos_ = tsStats.getMaximumNanos();
} else {
- if (tsStats.getMaximum() >= _stats.getMaximum()) {
- if (tsStats.getMaximum() > _stats.getMaximum() ||
- tsStats.getMaximumNanos() > _maximumNanos) {
- _maximumNanos = tsStats.getMaximumNanos();
+ if (tsStats.getMaximum() >= stats_.getMaximum()) {
+ if (tsStats.getMaximum() > stats_.getMaximum() ||
+ tsStats.getMaximumNanos() > maximumNanos_) {
+ maximumNanos_ = tsStats.getMaximumNanos();
}
- _stats.setMaximum(tsStats.getMaximum());
+ stats_.setMaximum(tsStats.getMaximum());
}
- if (tsStats.getMinimum() <= _stats.getMinimum()) {
- if (tsStats.getMinimum() < _stats.getMinimum() ||
- tsStats.getMinimumNanos() < _minimumNanos) {
- _minimumNanos = tsStats.getMinimumNanos();
+ if (tsStats.getMinimum() <= stats_.getMinimum()) {
+ if (tsStats.getMinimum() < stats_.getMinimum() ||
+ tsStats.getMinimumNanos() < minimumNanos_) {
+ minimumNanos_ = tsStats.getMinimumNanos();
}
- _stats.setMinimum(tsStats.getMinimum());
+ stats_.setMinimum(tsStats.getMinimum());
}
}
}
}
void reset() override {
- _stats.reset();
- _minimumNanos = DEFAULT_MIN_NANOS;
- _maximumNanos = DEFAULT_MAX_NANOS;
+ stats_.reset();
+ minimumNanos_ = DEFAULT_MIN_NANOS;
+ maximumNanos_ = DEFAULT_MAX_NANOS;
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_has_null(_stats.hasNull());
- pbStats.set_number_of_values(_stats.getNumberOfValues());
+ pbStats.set_has_null(stats_.hasNull());
+ pbStats.set_number_of_values(stats_.getNumberOfValues());
proto::TimestampStatistics* tsStats = pbStats.mutable_timestamp_statistics();
- if (_stats.hasMinimum()) {
- tsStats->set_minimum_utc(_stats.getMinimum());
- tsStats->set_maximum_utc(_stats.getMaximum());
- if (_minimumNanos != DEFAULT_MIN_NANOS) {
- tsStats->set_minimum_nanos(_minimumNanos + 1);
+ if (stats_.hasMinimum()) {
+ tsStats->set_minimum_utc(stats_.getMinimum());
+ tsStats->set_maximum_utc(stats_.getMaximum());
+ if (minimumNanos_ != DEFAULT_MIN_NANOS) {
+ tsStats->set_minimum_nanos(minimumNanos_ + 1);
}
- if (_maximumNanos != DEFAULT_MAX_NANOS) {
- tsStats->set_maximum_nanos(_maximumNanos + 1);
+ if (maximumNanos_ != DEFAULT_MAX_NANOS) {
+ tsStats->set_maximum_nanos(maximumNanos_ + 1);
}
} else {
tsStats->clear_minimum_utc();
@@ -1478,16 +1478,16 @@ namespace orc {
}
bool hasLowerBound() const override {
- return _hasLowerBound;
+ return hasLowerBound_;
}
bool hasUpperBound() const override {
- return _hasUpperBound;
+ return hasUpperBound_;
}
int64_t getLowerBound() const override {
if (hasLowerBound()) {
- return _lowerBound;
+ return lowerBound_;
} else {
throw ParseError("LowerBound is not defined.");
}
@@ -1495,7 +1495,7 @@ namespace orc {
int64_t getUpperBound() const override {
if (hasUpperBound()) {
- return _upperBound;
+ return upperBound_;
} else {
throw ParseError("UpperBound is not defined.");
}
@@ -1503,7 +1503,7 @@ namespace orc {
int32_t getMinimumNanos() const override {
if (hasMinimum()) {
- return _minimumNanos;
+ return minimumNanos_;
} else {
throw ParseError("Minimum is not defined.");
}
@@ -1511,7 +1511,7 @@ namespace orc {
int32_t getMaximumNanos() const override {
if (hasMaximum()) {
- return _maximumNanos;
+ return maximumNanos_;
} else {
throw ParseError("Maximum is not defined.");
}
@@ -1521,7 +1521,7 @@ namespace orc {
class CollectionColumnStatisticsImpl : public CollectionColumnStatistics,
public MutableColumnStatistics {
private:
- InternalCollectionStatistics _stats;
+ InternalCollectionStatistics stats_;
public:
CollectionColumnStatisticsImpl() {
@@ -1531,40 +1531,40 @@ namespace orc {
virtual ~CollectionColumnStatisticsImpl() override;
bool hasMinimumChildren() const override {
- return _stats.hasMinimum();
+ return stats_.hasMinimum();
}
bool hasMaximumChildren() const override {
- return _stats.hasMaximum();
+ return stats_.hasMaximum();
}
bool hasTotalChildren() const override {
- return _stats.hasSum();
+ return stats_.hasSum();
}
void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ stats_.setNumberOfValues(stats_.getNumberOfValues() + count);
}
uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
+ return stats_.getNumberOfValues();
}
void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
+ stats_.setNumberOfValues(value);
}
bool hasNull() const override {
- return _stats.hasNull();
+ return stats_.hasNull();
}
void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
+ stats_.setHasNull(hasNull);
}
uint64_t getMinimumChildren() const override {
if (hasMinimumChildren()) {
- return _stats.getMinimum();
+ return stats_.getMinimum();
} else {
throw ParseError("MinimumChildren is not defined.");
}
@@ -1572,7 +1572,7 @@ namespace orc {
uint64_t getMaximumChildren() const override {
if (hasMaximumChildren()) {
- return _stats.getMaximum();
+ return stats_.getMaximum();
} else {
throw ParseError("MaximumChildren is not defined.");
}
@@ -1580,78 +1580,78 @@ namespace orc {
uint64_t getTotalChildren() const override {
if (hasTotalChildren()) {
- return _stats.getSum();
+ return stats_.getSum();
} else {
throw ParseError("TotalChildren is not defined.");
}
}
void setMinimumChildren(uint64_t minimum) override {
- _stats.setHasMinimum(true);
- _stats.setMinimum(minimum);
+ stats_.setHasMinimum(true);
+ stats_.setMinimum(minimum);
}
void setMaximumChildren(uint64_t maximum) override {
- _stats.setHasMaximum(true);
- _stats.setMaximum(maximum);
+ stats_.setHasMaximum(true);
+ stats_.setMaximum(maximum);
}
void setTotalChildren(uint64_t sum) override {
- _stats.setHasSum(true);
- _stats.setSum(sum);
+ stats_.setHasSum(true);
+ stats_.setSum(sum);
}
void setHasTotalChildren(bool hasSum) override {
- _stats.setHasSum(hasSum);
+ stats_.setHasSum(hasSum);
}
void merge(const MutableColumnStatistics& other) override {
const CollectionColumnStatisticsImpl& collectionStats =
dynamic_cast<const CollectionColumnStatisticsImpl&>(other);
- _stats.merge(collectionStats._stats);
+ stats_.merge(collectionStats.stats_);
// hasSumValue here means no overflow
- _stats.setHasSum(_stats.hasSum() && collectionStats.hasTotalChildren());
- if (_stats.hasSum()) {
- uint64_t oldSum = _stats.getSum();
- _stats.setSum(_stats.getSum() + collectionStats.getTotalChildren());
- if (oldSum > _stats.getSum()) {
- _stats.setHasSum(false);
+ stats_.setHasSum(stats_.hasSum() && collectionStats.hasTotalChildren());
+ if (stats_.hasSum()) {
+ uint64_t oldSum = stats_.getSum();
+ stats_.setSum(stats_.getSum() + collectionStats.getTotalChildren());
+ if (oldSum > stats_.getSum()) {
+ stats_.setHasSum(false);
}
}
}
void reset() override {
- _stats.reset();
+ stats_.reset();
setTotalChildren(0);
}
void update(uint64_t value) {
- _stats.updateMinMax(value);
- if (_stats.hasSum()) {
- uint64_t oldSum = _stats.getSum();
- _stats.setSum(_stats.getSum() + value);
- if (oldSum > _stats.getSum()) {
- _stats.setHasSum(false);
+ stats_.updateMinMax(value);
+ if (stats_.hasSum()) {
+ uint64_t oldSum = stats_.getSum();
+ stats_.setSum(stats_.getSum() + value);
+ if (oldSum > stats_.getSum()) {
+ stats_.setHasSum(false);
}
}
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_has_null(_stats.hasNull());
- pbStats.set_number_of_values(_stats.getNumberOfValues());
+ pbStats.set_has_null(stats_.hasNull());
+ pbStats.set_number_of_values(stats_.getNumberOfValues());
proto::CollectionStatistics* collectionStats = pbStats.mutable_collection_statistics();
- if (_stats.hasMinimum()) {
- collectionStats->set_min_children(_stats.getMinimum());
- collectionStats->set_max_children(_stats.getMaximum());
+ if (stats_.hasMinimum()) {
+ collectionStats->set_min_children(stats_.getMinimum());
+ collectionStats->set_max_children(stats_.getMaximum());
} else {
collectionStats->clear_min_children();
collectionStats->clear_max_children();
}
- if (_stats.hasSum()) {
- collectionStats->set_total_children(_stats.getSum());
+ if (stats_.hasSum()) {
+ collectionStats->set_total_children(stats_.getSum());
} else {
collectionStats->clear_total_children();
}
@@ -1688,7 +1688,7 @@ namespace orc {
class StatisticsImpl : public Statistics {
private:
- std::vector<ColumnStatistics*> colStats;
+ std::vector<ColumnStatistics*> colStats_;
// DELIBERATELY NOT IMPLEMENTED
StatisticsImpl(const StatisticsImpl&);
@@ -1700,20 +1700,20 @@ namespace orc {
StatisticsImpl(const proto::Footer& footer, const StatContext& statContext);
virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override {
- return colStats[columnId];
+ return colStats_[columnId];
}
virtual ~StatisticsImpl() override;
uint32_t getNumberOfColumns() const override {
- return static_cast<uint32_t>(colStats.size());
+ return static_cast<uint32_t>(colStats_.size());
}
};
class StripeStatisticsImpl : public StripeStatistics {
private:
- std::unique_ptr<StatisticsImpl> columnStats;
- std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > rowIndexStats;
+ std::unique_ptr<StatisticsImpl> columnStats_;
+ std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > rowIndexStats_;
// DELIBERATELY NOT IMPLEMENTED
StripeStatisticsImpl(const StripeStatisticsImpl&);
@@ -1725,23 +1725,23 @@ namespace orc {
const StatContext& statContext);
virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override {
- return columnStats->getColumnStatistics(columnId);
+ return columnStats_->getColumnStatistics(columnId);
}
uint32_t getNumberOfColumns() const override {
- return columnStats->getNumberOfColumns();
+ return columnStats_->getNumberOfColumns();
}
virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId,
uint32_t rowIndex) const override {
// check id indices are valid
- return rowIndexStats[columnId][rowIndex].get();
+ return rowIndexStats_[columnId][rowIndex].get();
}
virtual ~StripeStatisticsImpl() override;
uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override {
- return static_cast<uint32_t>(rowIndexStats[columnId].size());
+ return static_cast<uint32_t>(rowIndexStats_[columnId].size());
}
};
diff --git a/contrib/libs/apache/orc/c++/src/StripeStream.cc b/contrib/libs/apache/orc/c++/src/StripeStream.cc
index 8507e95767..a5609f7629 100644
--- a/contrib/libs/apache/orc/c++/src/StripeStream.cc
+++ b/contrib/libs/apache/orc/c++/src/StripeStream.cc
@@ -19,25 +19,27 @@
#include "StripeStream.hh"
#include "RLE.hh"
#include "Reader.hh"
+#include "io/Cache.hh"
#include "orc/Exceptions.hh"
#include "wrap/coded-stream-wrapper.h"
namespace orc {
- StripeStreamsImpl::StripeStreamsImpl(const RowReaderImpl& _reader, uint64_t _index,
- const proto::StripeInformation& _stripeInfo,
- const proto::StripeFooter& _footer, uint64_t _stripeStart,
- InputStream& _input, const Timezone& _writerTimezone,
- const Timezone& _readerTimezone)
- : reader(_reader),
- stripeInfo(_stripeInfo),
- footer(_footer),
- stripeIndex(_index),
- stripeStart(_stripeStart),
- input(_input),
- writerTimezone(_writerTimezone),
- readerTimezone(_readerTimezone) {
+ StripeStreamsImpl::StripeStreamsImpl(const RowReaderImpl& reader, uint64_t index,
+ const proto::StripeInformation& stripeInfo,
+ const proto::StripeFooter& footer, uint64_t stripeStart,
+ InputStream& input, const Timezone& writerTimezone,
+ const Timezone& readerTimezone)
+ : reader_(reader),
+ stripeInfo_(stripeInfo),
+ footer_(footer),
+ stripeIndex_(index),
+ stripeStart_(stripeStart),
+ input_(input),
+ writerTimezone_(writerTimezone),
+ readerTimezone_(readerTimezone),
+ readCache_(reader.getReadCache()) {
// PASS
}
@@ -58,51 +60,65 @@ namespace orc {
}
const std::vector<bool> StripeStreamsImpl::getSelectedColumns() const {
- return reader.getSelectedColumns();
+ return reader_.getSelectedColumns();
}
proto::ColumnEncoding StripeStreamsImpl::getEncoding(uint64_t columnId) const {
- return footer.columns(static_cast<int>(columnId));
+ return footer_.columns(static_cast<int>(columnId));
}
const Timezone& StripeStreamsImpl::getWriterTimezone() const {
- return writerTimezone;
+ return writerTimezone_;
}
const Timezone& StripeStreamsImpl::getReaderTimezone() const {
- return readerTimezone;
+ return readerTimezone_;
}
std::ostream* StripeStreamsImpl::getErrorStream() const {
- return reader.getFileContents().errorStream;
+ return reader_.getFileContents().errorStream;
}
std::unique_ptr<SeekableInputStream> StripeStreamsImpl::getStream(uint64_t columnId,
proto::Stream_Kind kind,
bool shouldStream) const {
- uint64_t offset = stripeStart;
- uint64_t dataEnd = stripeInfo.offset() + stripeInfo.index_length() + stripeInfo.data_length();
- MemoryPool* pool = reader.getFileContents().pool;
- for (int i = 0; i < footer.streams_size(); ++i) {
- const proto::Stream& stream = footer.streams(i);
+ uint64_t offset = stripeStart_;
+ uint64_t dataEnd =
+ stripeInfo_.offset() + stripeInfo_.index_length() + stripeInfo_.data_length();
+ MemoryPool* pool = reader_.getFileContents().pool;
+ for (int i = 0; i < footer_.streams_size(); ++i) {
+ const proto::Stream& stream = footer_.streams(i);
if (stream.has_kind() && stream.kind() == kind &&
stream.column() == static_cast<uint64_t>(columnId)) {
uint64_t streamLength = stream.length();
- uint64_t myBlock = shouldStream ? input.getNaturalReadSize() : streamLength;
if (offset + streamLength > dataEnd) {
std::stringstream msg;
- msg << "Malformed stream meta at stream index " << i << " in stripe " << stripeIndex
+ msg << "Malformed stream meta at stream index " << i << " in stripe " << stripeIndex_
<< ": streamOffset=" << offset << ", streamLength=" << streamLength
- << ", stripeOffset=" << stripeInfo.offset()
- << ", stripeIndexLength=" << stripeInfo.index_length()
- << ", stripeDataLength=" << stripeInfo.data_length();
+ << ", stripeOffset=" << stripeInfo_.offset()
+ << ", stripeIndexLength=" << stripeInfo_.index_length()
+ << ", stripeDataLength=" << stripeInfo_.data_length();
throw ParseError(msg.str());
}
- return createDecompressor(reader.getCompression(),
- std::make_unique<SeekableFileInputStream>(
- &input, offset, stream.length(), *pool, myBlock),
- reader.getCompressionSize(), *pool,
- reader.getFileContents().readerMetrics);
+
+ BufferSlice slice;
+ if (readCache_) {
+ ReadRange range{offset, streamLength};
+ slice = readCache_->read(range);
+ }
+
+ uint64_t myBlock = shouldStream ? input_.getNaturalReadSize() : streamLength;
+ std::unique_ptr<SeekableInputStream> seekableInput;
+ if (slice.buffer) {
+ seekableInput = std::make_unique<SeekableArrayInputStream>(
+ slice.buffer->data() + slice.offset, slice.length);
+ } else {
+ seekableInput = std::make_unique<SeekableFileInputStream>(&input_, offset, streamLength,
+ *pool, myBlock);
+ }
+ return createDecompressor(reader_.getCompression(), std::move(seekableInput),
+ reader_.getCompressionSize(), *pool,
+ reader_.getFileContents().readerMetrics);
}
offset += stream.length();
}
@@ -110,38 +126,38 @@ namespace orc {
}
MemoryPool& StripeStreamsImpl::getMemoryPool() const {
- return *reader.getFileContents().pool;
+ return *reader_.getFileContents().pool;
}
ReaderMetrics* StripeStreamsImpl::getReaderMetrics() const {
- return reader.getFileContents().readerMetrics;
+ return reader_.getFileContents().readerMetrics;
}
bool StripeStreamsImpl::getThrowOnHive11DecimalOverflow() const {
- return reader.getThrowOnHive11DecimalOverflow();
+ return reader_.getThrowOnHive11DecimalOverflow();
}
bool StripeStreamsImpl::isDecimalAsLong() const {
- return reader.getIsDecimalAsLong();
+ return reader_.getIsDecimalAsLong();
}
int32_t StripeStreamsImpl::getForcedScaleOnHive11Decimal() const {
- return reader.getForcedScaleOnHive11Decimal();
+ return reader_.getForcedScaleOnHive11Decimal();
}
const SchemaEvolution* StripeStreamsImpl::getSchemaEvolution() const {
- return reader.getSchemaEvolution();
+ return reader_.getSchemaEvolution();
}
void StripeInformationImpl::ensureStripeFooterLoaded() const {
- if (stripeFooter.get() == nullptr) {
- std::unique_ptr<SeekableInputStream> pbStream =
- createDecompressor(compression,
- std::make_unique<SeekableFileInputStream>(
- stream, offset + indexLength + dataLength, footerLength, memory),
- blockSize, memory, metrics);
- stripeFooter = std::make_unique<proto::StripeFooter>();
- if (!stripeFooter->ParseFromZeroCopyStream(pbStream.get())) {
+ if (stripeFooter_.get() == nullptr) {
+ std::unique_ptr<SeekableInputStream> pbStream = createDecompressor(
+ compression_,
+ std::make_unique<SeekableFileInputStream>(stream_, offset_ + indexLength_ + dataLength_,
+ footerLength_, memory_),
+ blockSize_, memory_, metrics_);
+ stripeFooter_ = std::make_unique<proto::StripeFooter>();
+ if (!stripeFooter_->ParseFromZeroCopyStream(pbStream.get())) {
throw ParseError("Failed to parse the stripe footer");
}
}
@@ -150,12 +166,12 @@ namespace orc {
std::unique_ptr<StreamInformation> StripeInformationImpl::getStreamInformation(
uint64_t streamId) const {
ensureStripeFooterLoaded();
- uint64_t streamOffset = offset;
+ uint64_t streamOffset = offset_;
for (uint64_t s = 0; s < streamId; ++s) {
- streamOffset += stripeFooter->streams(static_cast<int>(s)).length();
+ streamOffset += stripeFooter_->streams(static_cast<int>(s)).length();
}
return std::make_unique<StreamInformationImpl>(
- streamOffset, stripeFooter->streams(static_cast<int>(streamId)));
+ streamOffset, stripeFooter_->streams(static_cast<int>(streamId)));
}
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/StripeStream.hh b/contrib/libs/apache/orc/c++/src/StripeStream.hh
index eae6ce0c31..2d26f8575e 100644
--- a/contrib/libs/apache/orc/c++/src/StripeStream.hh
+++ b/contrib/libs/apache/orc/c++/src/StripeStream.hh
@@ -30,6 +30,7 @@
namespace orc {
class RowReaderImpl;
+ class ReadRangeCache;
/**
* StripeStream Implementation
@@ -37,14 +38,15 @@ namespace orc {
class StripeStreamsImpl : public StripeStreams {
private:
- const RowReaderImpl& reader;
- const proto::StripeInformation& stripeInfo;
- const proto::StripeFooter& footer;
- const uint64_t stripeIndex;
- const uint64_t stripeStart;
- InputStream& input;
- const Timezone& writerTimezone;
- const Timezone& readerTimezone;
+ const RowReaderImpl& reader_;
+ const proto::StripeInformation& stripeInfo_;
+ const proto::StripeFooter& footer_;
+ const uint64_t stripeIndex_;
+ const uint64_t stripeStart_;
+ InputStream& input_;
+ const Timezone& writerTimezone_;
+ const Timezone& readerTimezone_;
+ std::shared_ptr<ReadRangeCache> readCache_;
public:
StripeStreamsImpl(const RowReaderImpl& reader, uint64_t index,
@@ -87,36 +89,36 @@ namespace orc {
class StreamInformationImpl : public StreamInformation {
private:
- StreamKind kind;
- uint64_t column;
- uint64_t offset;
- uint64_t length;
+ StreamKind kind_;
+ uint64_t column_;
+ uint64_t offset_;
+ uint64_t length_;
public:
- StreamInformationImpl(uint64_t _offset, const proto::Stream& stream)
- : kind(static_cast<StreamKind>(stream.kind())),
- column(stream.column()),
- offset(_offset),
- length(stream.length()) {
+ StreamInformationImpl(uint64_t offset, const proto::Stream& stream)
+ : kind_(static_cast<StreamKind>(stream.kind())),
+ column_(stream.column()),
+ offset_(offset),
+ length_(stream.length()) {
// PASS
}
~StreamInformationImpl() override;
StreamKind getKind() const override {
- return kind;
+ return kind_;
}
uint64_t getColumnId() const override {
- return column;
+ return column_;
}
uint64_t getOffset() const override {
- return offset;
+ return offset_;
}
uint64_t getLength() const override {
- return length;
+ return length_;
}
};
@@ -125,34 +127,34 @@ namespace orc {
*/
class StripeInformationImpl : public StripeInformation {
- uint64_t offset;
- uint64_t indexLength;
- uint64_t dataLength;
- uint64_t footerLength;
- uint64_t numRows;
- InputStream* stream;
- MemoryPool& memory;
- CompressionKind compression;
- uint64_t blockSize;
- mutable std::unique_ptr<proto::StripeFooter> stripeFooter;
- ReaderMetrics* metrics;
+ uint64_t offset_;
+ uint64_t indexLength_;
+ uint64_t dataLength_;
+ uint64_t footerLength_;
+ uint64_t numRows_;
+ InputStream* stream_;
+ MemoryPool& memory_;
+ CompressionKind compression_;
+ uint64_t blockSize_;
+ mutable std::unique_ptr<proto::StripeFooter> stripeFooter_;
+ ReaderMetrics* metrics_;
void ensureStripeFooterLoaded() const;
public:
- StripeInformationImpl(uint64_t _offset, uint64_t _indexLength, uint64_t _dataLength,
- uint64_t _footerLength, uint64_t _numRows, InputStream* _stream,
- MemoryPool& _memory, CompressionKind _compression, uint64_t _blockSize,
- ReaderMetrics* _metrics)
- : offset(_offset),
- indexLength(_indexLength),
- dataLength(_dataLength),
- footerLength(_footerLength),
- numRows(_numRows),
- stream(_stream),
- memory(_memory),
- compression(_compression),
- blockSize(_blockSize),
- metrics(_metrics) {
+ StripeInformationImpl(uint64_t offset, uint64_t indexLength, uint64_t dataLength,
+ uint64_t footerLength, uint64_t numRows, InputStream* stream,
+ MemoryPool& memory, CompressionKind compression, uint64_t blockSize,
+ ReaderMetrics* metrics)
+ : offset_(offset),
+ indexLength_(indexLength),
+ dataLength_(dataLength),
+ footerLength_(footerLength),
+ numRows_(numRows),
+ stream_(stream),
+ memory_(memory),
+ compression_(compression),
+ blockSize_(blockSize),
+ metrics_(metrics) {
// PASS
}
@@ -161,49 +163,50 @@ namespace orc {
}
uint64_t getOffset() const override {
- return offset;
+ return offset_;
}
uint64_t getLength() const override {
- return indexLength + dataLength + footerLength;
+ return indexLength_ + dataLength_ + footerLength_;
}
uint64_t getIndexLength() const override {
- return indexLength;
+ return indexLength_;
}
uint64_t getDataLength() const override {
- return dataLength;
+ return dataLength_;
}
uint64_t getFooterLength() const override {
- return footerLength;
+ return footerLength_;
}
uint64_t getNumberOfRows() const override {
- return numRows;
+ return numRows_;
}
uint64_t getNumberOfStreams() const override {
ensureStripeFooterLoaded();
- return static_cast<uint64_t>(stripeFooter->streams_size());
+ return static_cast<uint64_t>(stripeFooter_->streams_size());
}
std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId) const override;
ColumnEncodingKind getColumnEncoding(uint64_t colId) const override {
ensureStripeFooterLoaded();
- return static_cast<ColumnEncodingKind>(stripeFooter->columns(static_cast<int>(colId)).kind());
+ return static_cast<ColumnEncodingKind>(
+ stripeFooter_->columns(static_cast<int>(colId)).kind());
}
uint64_t getDictionarySize(uint64_t colId) const override {
ensureStripeFooterLoaded();
return static_cast<ColumnEncodingKind>(
- stripeFooter->columns(static_cast<int>(colId)).dictionary_size());
+ stripeFooter_->columns(static_cast<int>(colId)).dictionary_size());
}
const std::string& getWriterTimezone() const override {
ensureStripeFooterLoaded();
- return stripeFooter->writer_timezone();
+ return stripeFooter_->writer_timezone();
}
};
diff --git a/contrib/libs/apache/orc/c++/src/Timezone.cc b/contrib/libs/apache/orc/c++/src/Timezone.cc
index 4c78a53a29..384f8ea99f 100644
--- a/contrib/libs/apache/orc/c++/src/Timezone.cc
+++ b/contrib/libs/apache/orc/c++/src/Timezone.cc
@@ -184,49 +184,49 @@ namespace orc {
* day = J<day without 2/29>|<day with 2/29>|M<month>.<week>.<day of week>
*/
class FutureRuleImpl : public FutureRule {
- std::string ruleString;
- TimezoneVariant standard;
- bool hasDst;
- TimezoneVariant dst;
- Transition start;
- Transition end;
+ std::string ruleString_;
+ TimezoneVariant standard_;
+ bool hasDst_;
+ TimezoneVariant dst_;
+ Transition start_;
+ Transition end_;
// expanded time_t offsets of transitions
- std::vector<int64_t> offsets;
+ std::vector<int64_t> offsets_;
// Is the epoch (1 Jan 1970 00:00) in standard time?
// This code assumes that the transition dates fall in the same order
// each year. Hopefully no timezone regions decide to move across the
// equator, which is about what it would take.
- bool startInStd;
+ bool startInStd_;
void computeOffsets() {
- if (!hasDst) {
- startInStd = true;
- offsets.resize(1);
+ if (!hasDst_) {
+ startInStd_ = true;
+ offsets_.resize(1);
} else {
// Insert a transition for the epoch and two per a year for the next
// 400 years. We assume that the all even positions are in standard
// time if and only if startInStd and the odd ones are the reverse.
- offsets.resize(400 * 2 + 1);
- startInStd = start.getTime(1970) < end.getTime(1970);
+ offsets_.resize(400 * 2 + 1);
+ startInStd_ = start_.getTime(1970) < end_.getTime(1970);
int64_t base = 0;
for (int64_t year = 1970; year < 1970 + 400; ++year) {
- if (startInStd) {
- offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] =
- base + start.getTime(year) - standard.gmtOffset;
- offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] =
- base + end.getTime(year) - dst.gmtOffset;
+ if (startInStd_) {
+ offsets_[static_cast<uint64_t>(year - 1970) * 2 + 1] =
+ base + start_.getTime(year) - standard_.gmtOffset;
+ offsets_[static_cast<uint64_t>(year - 1970) * 2 + 2] =
+ base + end_.getTime(year) - dst_.gmtOffset;
} else {
- offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] =
- base + end.getTime(year) - dst.gmtOffset;
- offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] =
- base + start.getTime(year) - standard.gmtOffset;
+ offsets_[static_cast<uint64_t>(year - 1970) * 2 + 1] =
+ base + end_.getTime(year) - dst_.gmtOffset;
+ offsets_[static_cast<uint64_t>(year - 1970) * 2 + 2] =
+ base + start_.getTime(year) - standard_.gmtOffset;
}
base += (isLeap(year) ? 366 : 365) * SECONDS_PER_DAY;
}
}
- offsets[0] = 0;
+ offsets_[0] = 0;
}
public:
@@ -247,34 +247,34 @@ namespace orc {
}
bool FutureRuleImpl::isDefined() const {
- return ruleString.size() > 0;
+ return ruleString_.size() > 0;
}
const TimezoneVariant& FutureRuleImpl::getVariant(int64_t clk) const {
- if (!hasDst) {
- return standard;
+ if (!hasDst_) {
+ return standard_;
} else {
int64_t adjusted = clk % SECONDS_PER_400_YEARS;
if (adjusted < 0) {
adjusted += SECONDS_PER_400_YEARS;
}
- int64_t idx = binarySearch(offsets, adjusted);
- if (startInStd == (idx % 2 == 0)) {
- return standard;
+ int64_t idx = binarySearch(offsets_, adjusted);
+ if (startInStd_ == (idx % 2 == 0)) {
+ return standard_;
} else {
- return dst;
+ return dst_;
}
}
}
void FutureRuleImpl::print(std::ostream& out) const {
if (isDefined()) {
- out << " Future rule: " << ruleString << "\n";
- out << " standard " << standard.toString() << "\n";
- if (hasDst) {
- out << " dst " << dst.toString() << "\n";
- out << " start " << start.toString() << "\n";
- out << " end " << end.toString() << "\n";
+ out << " Future rule: " << ruleString_ << "\n";
+ out << " standard " << standard_.toString() << "\n";
+ if (hasDst_) {
+ out << " dst " << dst_.toString() << "\n";
+ out << " start " << start_.toString() << "\n";
+ out << " end " << end_.toString() << "\n";
}
}
}
@@ -285,40 +285,40 @@ namespace orc {
class FutureRuleParser {
public:
FutureRuleParser(const std::string& str, FutureRuleImpl* rule)
- : ruleString(str), length(str.size()), position(0), output(*rule) {
- output.ruleString = str;
- if (position != length) {
- parseName(output.standard.name);
- output.standard.gmtOffset = -parseOffset();
- output.standard.isDst = false;
- output.hasDst = position < length;
- if (output.hasDst) {
- parseName(output.dst.name);
- output.dst.isDst = true;
- if (ruleString[position] != ',') {
- output.dst.gmtOffset = -parseOffset();
+ : ruleString_(str), length_(str.size()), position_(0), output_(*rule) {
+ output_.ruleString_ = str;
+ if (position_ != length_) {
+ parseName(output_.standard_.name);
+ output_.standard_.gmtOffset = -parseOffset();
+ output_.standard_.isDst = false;
+ output_.hasDst_ = position_ < length_;
+ if (output_.hasDst_) {
+ parseName(output_.dst_.name);
+ output_.dst_.isDst = true;
+ if (ruleString_[position_] != ',') {
+ output_.dst_.gmtOffset = -parseOffset();
} else {
- output.dst.gmtOffset = output.standard.gmtOffset + 60 * 60;
+ output_.dst_.gmtOffset = output_.standard_.gmtOffset + 60 * 60;
}
- parseTransition(output.start);
- parseTransition(output.end);
+ parseTransition(output_.start_);
+ parseTransition(output_.end_);
}
- if (position != length) {
+ if (position_ != length_) {
throwError("Extra text");
}
- output.computeOffsets();
+ output_.computeOffsets();
}
}
private:
- const std::string& ruleString;
- size_t length;
- size_t position;
- FutureRuleImpl& output;
+ const std::string& ruleString_;
+ size_t length_;
+ size_t position_;
+ FutureRuleImpl& output_;
void throwError(const char* msg) {
std::stringstream buffer;
- buffer << msg << " at " << position << " in '" << ruleString << "'";
+ buffer << msg << " at " << position_ << " in '" << ruleString_ << "'";
throw TimezoneError(buffer.str());
}
@@ -328,46 +328,46 @@ namespace orc {
* and set the output string.
*/
void parseName(std::string& result) {
- if (position == length) {
+ if (position_ == length_) {
throwError("name required");
}
- size_t start = position;
- if (ruleString[position] == '<') {
- while (position < length && ruleString[position] != '>') {
- position += 1;
+ size_t start = position_;
+ if (ruleString_[position_] == '<') {
+ while (position_ < length_ && ruleString_[position_] != '>') {
+ position_ += 1;
}
- if (position == length) {
+ if (position_ == length_) {
throwError("missing close '>'");
}
- position += 1;
+ position_ += 1;
} else {
- while (position < length) {
- char ch = ruleString[position];
+ while (position_ < length_) {
+ char ch = ruleString_[position_];
if (isdigit(ch) || ch == '-' || ch == '+' || ch == ',') {
break;
}
- position += 1;
+ position_ += 1;
}
}
- if (position == start) {
+ if (position_ == start) {
throwError("empty string not allowed");
}
- result = ruleString.substr(start, position - start);
+ result = ruleString_.substr(start, position_ - start);
}
/**
* Parse an integer of the form [0-9]+ and return it.
*/
int64_t parseNumber() {
- if (position >= length) {
+ if (position_ >= length_) {
throwError("missing number");
}
int64_t result = 0;
- while (position < length) {
- char ch = ruleString[position];
+ while (position_ < length_) {
+ char ch = ruleString_[position_];
if (isdigit(ch)) {
result = result * 10 + (ch - '0');
- position += 1;
+ position_ += 1;
} else {
break;
}
@@ -383,17 +383,17 @@ namespace orc {
int64_t parseOffset() {
int64_t scale = 3600;
bool isNegative = false;
- if (position < length) {
- char ch = ruleString[position];
+ if (position_ < length_) {
+ char ch = ruleString_[position_];
isNegative = ch == '-';
if (ch == '-' || ch == '+') {
- position += 1;
+ position_ += 1;
}
}
int64_t result = parseNumber() * scale;
- while (position < length && scale > 1 && ruleString[position] == ':') {
+ while (position_ < length_ && scale > 1 && ruleString_[position_] == ':') {
scale /= 60;
- position += 1;
+ position_ += 1;
result += parseNumber() * scale;
}
if (isNegative) {
@@ -407,35 +407,35 @@ namespace orc {
* ,(J<number>|<number>|M<number>.<number>.<number>)(/<offset>)?
*/
void parseTransition(Transition& transition) {
- if (length - position < 2 || ruleString[position] != ',') {
+ if (length_ - position_ < 2 || ruleString_[position_] != ',') {
throwError("missing transition");
}
- position += 1;
- char ch = ruleString[position];
+ position_ += 1;
+ char ch = ruleString_[position_];
if (ch == 'J') {
transition.kind = TRANSITION_JULIAN;
- position += 1;
+ position_ += 1;
transition.day = parseNumber();
} else if (ch == 'M') {
transition.kind = TRANSITION_MONTH;
- position += 1;
+ position_ += 1;
transition.month = parseNumber();
- if (position == length || ruleString[position] != '.') {
+ if (position_ == length_ || ruleString_[position_] != '.') {
throwError("missing first .");
}
- position += 1;
+ position_ += 1;
transition.week = parseNumber();
- if (position == length || ruleString[position] != '.') {
+ if (position_ == length_ || ruleString_[position_] != '.') {
throwError("missing second .");
}
- position += 1;
+ position_ += 1;
transition.day = parseNumber();
} else {
transition.kind = TRANSITION_DAY;
transition.day = parseNumber();
}
- if (position < length && ruleString[position] == '/') {
- position += 1;
+ if (position_ < length_ && ruleString_[position_] == '/') {
+ position_ += 1;
transition.time = parseOffset();
} else {
transition.time = 2 * 60 * 60;
@@ -565,7 +565,7 @@ namespace orc {
class TimezoneImpl : public Timezone {
public:
- TimezoneImpl(const std::string& _filename, const std::vector<unsigned char>& buffer);
+ TimezoneImpl(const std::string& filename, const std::vector<unsigned char>& buffer);
virtual ~TimezoneImpl() override;
/**
@@ -576,11 +576,11 @@ namespace orc {
void print(std::ostream&) const override;
uint64_t getVersion() const override {
- return version;
+ return version_;
}
int64_t getEpoch() const override {
- return epoch;
+ return epoch_;
}
int64_t convertToUTC(int64_t clk) const override {
@@ -599,31 +599,31 @@ namespace orc {
void parseZoneFile(const unsigned char* ptr, uint64_t sectionOffset, uint64_t fileLength,
const VersionParser& version);
// filename
- std::string filename;
+ std::string filename_;
// the version of the file
- uint64_t version;
+ uint64_t version_;
// the list of variants for this timezone
- std::vector<TimezoneVariant> variants;
+ std::vector<TimezoneVariant> variants_;
// the list of the times where the local rules change
- std::vector<int64_t> transitions;
+ std::vector<int64_t> transitions_;
// the variant that starts at this transition.
- std::vector<uint64_t> currentVariant;
+ std::vector<uint64_t> currentVariant_;
// the variant before the first transition
- uint64_t ancientVariant;
+ uint64_t ancientVariant_;
// the rule for future times
- std::shared_ptr<FutureRule> futureRule;
+ std::shared_ptr<FutureRule> futureRule_;
// the last explicit transition after which we use the future rule
- int64_t lastTransition;
+ int64_t lastTransition_;
// The ORC epoch time in this timezone.
- int64_t epoch;
+ int64_t epoch_;
};
DIAGNOSTIC_PUSH
@@ -639,8 +639,8 @@ namespace orc {
// PASS
}
- TimezoneImpl::TimezoneImpl(const std::string& _filename, const std::vector<unsigned char>& buffer)
- : filename(_filename) {
+ TimezoneImpl::TimezoneImpl(const std::string& filename, const std::vector<unsigned char>& buffer)
+ : filename_(filename) {
parseZoneFile(&buffer[0], 0, buffer.size(), Version1Parser());
// Build the literal for the ORC epoch
// 2015 Jan 1 00:00:00
@@ -653,7 +653,7 @@ namespace orc {
epochStruct.tm_year = 2015 - 1900;
epochStruct.tm_isdst = 0;
time_t utcEpoch = timegm(&epochStruct);
- epoch = utcEpoch - getVariant(utcEpoch).gmtOffset;
+ epoch_ = utcEpoch - getVariant(utcEpoch).gmtOffset;
}
std::string getTimezoneDirectory() {
@@ -783,9 +783,9 @@ namespace orc {
uint64_t variantCount, uint64_t nameOffset,
uint64_t nameCount) {
for (uint64_t variant = 0; variant < variantCount; ++variant) {
- variants[variant].gmtOffset =
+ variants_[variant].gmtOffset =
static_cast<int32_t>(decode32(ptr + variantOffset + 6 * variant));
- variants[variant].isDst = ptr[variantOffset + 6 * variant + 4] != 0;
+ variants_[variant].isDst = ptr[variantOffset + 6 * variant + 4] != 0;
uint64_t nameStart = ptr[variantOffset + 6 * variant + 5];
if (nameStart >= nameCount) {
std::stringstream buffer;
@@ -793,7 +793,7 @@ namespace orc {
<< " >= " << nameCount;
throw TimezoneError(buffer.str());
}
- variants[variant].name =
+ variants_[variant].name =
std::string(reinterpret_cast<const char*>(ptr) + nameOffset + nameStart);
}
}
@@ -833,7 +833,7 @@ namespace orc {
if (fileLength < headerOffset + 6 * 4 ||
strncmp(reinterpret_cast<const char*>(ptr) + magicOffset, "TZif", 4) != 0) {
std::stringstream buffer;
- buffer << "non-tzfile " << filename;
+ buffer << "non-tzfile " << filename_;
throw TimezoneError(buffer.str());
}
@@ -854,7 +854,7 @@ namespace orc {
if (sectionLength > fileLength) {
std::stringstream buffer;
- buffer << "tzfile too short " << filename << " needs " << sectionLength << " and has "
+ buffer << "tzfile too short " << filename_ << " needs " << sectionLength << " and has "
<< fileLength;
throw TimezoneError(buffer.str());
}
@@ -864,82 +864,82 @@ namespace orc {
parseZoneFile(ptr, sectionLength, fileLength, Version2Parser());
return;
}
- version = versionParser.getVersion();
- variants.resize(variantCount);
- transitions.resize(timeCount);
- currentVariant.resize(timeCount);
+ version_ = versionParser.getVersion();
+ variants_.resize(variantCount);
+ transitions_.resize(timeCount);
+ currentVariant_.resize(timeCount);
parseTimeVariants(ptr, variantOffset, variantCount, nameOffset, nameCount);
bool foundAncient = false;
for (uint64_t t = 0; t < timeCount; ++t) {
- transitions[t] = versionParser.parseTime(ptr + timeOffset + t * versionParser.getTimeSize());
- currentVariant[t] = ptr[timeVariantOffset + t];
- if (currentVariant[t] >= variantCount) {
+ transitions_[t] = versionParser.parseTime(ptr + timeOffset + t * versionParser.getTimeSize());
+ currentVariant_[t] = ptr[timeVariantOffset + t];
+ if (currentVariant_[t] >= variantCount) {
std::stringstream buffer;
- buffer << "tzfile rule out of range " << filename << " references rule "
- << currentVariant[t] << " of " << variantCount;
+ buffer << "tzfile rule out of range " << filename_ << " references rule "
+ << currentVariant_[t] << " of " << variantCount;
throw TimezoneError(buffer.str());
}
// find the oldest standard time and use that as the ancient value
- if (!foundAncient && !variants[currentVariant[t]].isDst) {
+ if (!foundAncient && !variants_[currentVariant_[t]].isDst) {
foundAncient = true;
- ancientVariant = currentVariant[t];
+ ancientVariant_ = currentVariant_[t];
}
}
if (!foundAncient) {
- ancientVariant = 0;
+ ancientVariant_ = 0;
}
- futureRule = parseFutureRule(
+ futureRule_ = parseFutureRule(
versionParser.parseFutureString(ptr, sectionLength, fileLength - sectionLength));
// find the lower bound for applying the future rule
- if (futureRule->isDefined()) {
+ if (futureRule_->isDefined()) {
if (timeCount > 0) {
- lastTransition = transitions[timeCount - 1];
+ lastTransition_ = transitions_[timeCount - 1];
} else {
- lastTransition = INT64_MIN;
+ lastTransition_ = INT64_MIN;
}
} else {
- lastTransition = INT64_MAX;
+ lastTransition_ = INT64_MAX;
}
}
const TimezoneVariant& TimezoneImpl::getVariant(int64_t clk) const {
// if it is after the last explicit entry in the table,
// use the future rule to get an answer
- if (clk > lastTransition) {
- return futureRule->getVariant(clk);
+ if (clk > lastTransition_) {
+ return futureRule_->getVariant(clk);
} else {
- int64_t transition = binarySearch(transitions, clk);
+ int64_t transition = binarySearch(transitions_, clk);
uint64_t idx;
if (transition < 0) {
- idx = ancientVariant;
+ idx = ancientVariant_;
} else {
- idx = currentVariant[static_cast<size_t>(transition)];
+ idx = currentVariant_[static_cast<size_t>(transition)];
}
- return variants[idx];
+ return variants_[idx];
}
}
void TimezoneImpl::print(std::ostream& out) const {
- out << "Timezone file: " << filename << "\n";
- out << " Version: " << version << "\n";
- futureRule->print(out);
- for (uint64_t r = 0; r < variants.size(); ++r) {
- out << " Variant " << r << ": " << variants[r].toString() << "\n";
+ out << "Timezone file: " << filename_ << "\n";
+ out << " Version: " << version_ << "\n";
+ futureRule_->print(out);
+ for (uint64_t r = 0; r < variants_.size(); ++r) {
+ out << " Variant " << r << ": " << variants_[r].toString() << "\n";
}
- for (uint64_t t = 0; t < transitions.size(); ++t) {
+ for (uint64_t t = 0; t < transitions_.size(); ++t) {
tm timeStruct;
tm* result = nullptr;
char buffer[25];
if (sizeof(time_t) >= 8) {
- time_t val = transitions[t];
+ time_t val = transitions_[t];
result = gmtime_r(&val, &timeStruct);
if (result) {
strftime(buffer, sizeof(buffer), "%F %H:%M:%S", &timeStruct);
}
}
- out << " Transition: " << (result == nullptr ? "null" : buffer) << " (" << transitions[t]
- << ") -> " << variants[currentVariant[t]].name << "\n";
+ out << " Transition: " << (result == nullptr ? "null" : buffer) << " (" << transitions_[t]
+ << ") -> " << variants_[currentVariant_[t]].name << "\n";
}
}
diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.cc b/contrib/libs/apache/orc/c++/src/TypeImpl.cc
index c427a962b5..cbc7b82796 100644
--- a/contrib/libs/apache/orc/c++/src/TypeImpl.cc
+++ b/contrib/libs/apache/orc/c++/src/TypeImpl.cc
@@ -29,54 +29,54 @@ namespace orc {
// PASS
}
- TypeImpl::TypeImpl(TypeKind _kind) {
- parent = nullptr;
- columnId = -1;
- maximumColumnId = -1;
- kind = _kind;
- maxLength = 0;
- precision = 0;
- scale = 0;
- subtypeCount = 0;
- }
-
- TypeImpl::TypeImpl(TypeKind _kind, uint64_t _maxLength) {
- parent = nullptr;
- columnId = -1;
- maximumColumnId = -1;
- kind = _kind;
- maxLength = _maxLength;
- precision = 0;
- scale = 0;
- subtypeCount = 0;
- }
-
- TypeImpl::TypeImpl(TypeKind _kind, uint64_t _precision, uint64_t _scale) {
- parent = nullptr;
- columnId = -1;
- maximumColumnId = -1;
- kind = _kind;
- maxLength = 0;
- precision = _precision;
- scale = _scale;
- subtypeCount = 0;
+ TypeImpl::TypeImpl(TypeKind kind) {
+ parent_ = nullptr;
+ columnId_ = -1;
+ maximumColumnId_ = -1;
+ kind_ = kind;
+ maxLength_ = 0;
+ precision_ = 0;
+ scale_ = 0;
+ subtypeCount_ = 0;
+ }
+
+ TypeImpl::TypeImpl(TypeKind kind, uint64_t maxLength) {
+ parent_ = nullptr;
+ columnId_ = -1;
+ maximumColumnId_ = -1;
+ kind_ = kind;
+ maxLength_ = maxLength;
+ precision_ = 0;
+ scale_ = 0;
+ subtypeCount_ = 0;
+ }
+
+ TypeImpl::TypeImpl(TypeKind kind, uint64_t precision, uint64_t scale) {
+ parent_ = nullptr;
+ columnId_ = -1;
+ maximumColumnId_ = -1;
+ kind_ = kind;
+ maxLength_ = 0;
+ precision_ = precision;
+ scale_ = scale;
+ subtypeCount_ = 0;
}
uint64_t TypeImpl::assignIds(uint64_t root) const {
- columnId = static_cast<int64_t>(root);
+ columnId_ = static_cast<int64_t>(root);
uint64_t current = root + 1;
- for (uint64_t i = 0; i < subtypeCount; ++i) {
- current = dynamic_cast<TypeImpl*>(subTypes[i].get())->assignIds(current);
+ for (uint64_t i = 0; i < subtypeCount_; ++i) {
+ current = dynamic_cast<TypeImpl*>(subTypes_[i].get())->assignIds(current);
}
- maximumColumnId = static_cast<int64_t>(current) - 1;
+ maximumColumnId_ = static_cast<int64_t>(current) - 1;
return current;
}
void TypeImpl::ensureIdAssigned() const {
- if (columnId == -1) {
+ if (columnId_ == -1) {
const TypeImpl* root = this;
- while (root->parent != nullptr) {
- root = root->parent;
+ while (root->parent_ != nullptr) {
+ root = root->parent_;
}
root->assignIds(0);
}
@@ -84,94 +84,94 @@ namespace orc {
uint64_t TypeImpl::getColumnId() const {
ensureIdAssigned();
- return static_cast<uint64_t>(columnId);
+ return static_cast<uint64_t>(columnId_);
}
uint64_t TypeImpl::getMaximumColumnId() const {
ensureIdAssigned();
- return static_cast<uint64_t>(maximumColumnId);
+ return static_cast<uint64_t>(maximumColumnId_);
}
TypeKind TypeImpl::getKind() const {
- return kind;
+ return kind_;
}
uint64_t TypeImpl::getSubtypeCount() const {
- return subtypeCount;
+ return subtypeCount_;
}
const Type* TypeImpl::getSubtype(uint64_t i) const {
- return subTypes[i].get();
+ return subTypes_[i].get();
}
const std::string& TypeImpl::getFieldName(uint64_t i) const {
- return fieldNames[i];
+ return fieldNames_[i];
}
uint64_t TypeImpl::getMaximumLength() const {
- return maxLength;
+ return maxLength_;
}
uint64_t TypeImpl::getPrecision() const {
- return precision;
+ return precision_;
}
uint64_t TypeImpl::getScale() const {
- return scale;
+ return scale_;
}
Type& TypeImpl::setAttribute(const std::string& key, const std::string& value) {
- attributes[key] = value;
+ attributes_[key] = value;
return *this;
}
bool TypeImpl::hasAttributeKey(const std::string& key) const {
- return attributes.find(key) != attributes.end();
+ return attributes_.find(key) != attributes_.end();
}
Type& TypeImpl::removeAttribute(const std::string& key) {
- auto it = attributes.find(key);
- if (it == attributes.end()) {
+ auto it = attributes_.find(key);
+ if (it == attributes_.end()) {
throw std::range_error("Key not found: " + key);
}
- attributes.erase(it);
+ attributes_.erase(it);
return *this;
}
std::vector<std::string> TypeImpl::getAttributeKeys() const {
std::vector<std::string> ret;
- ret.reserve(attributes.size());
- for (auto& attribute : attributes) {
+ ret.reserve(attributes_.size());
+ for (auto& attribute : attributes_) {
ret.push_back(attribute.first);
}
return ret;
}
std::string TypeImpl::getAttributeValue(const std::string& key) const {
- auto it = attributes.find(key);
- if (it == attributes.end()) {
+ auto it = attributes_.find(key);
+ if (it == attributes_.end()) {
throw std::range_error("Key not found: " + key);
}
return it->second;
}
- void TypeImpl::setIds(uint64_t _columnId, uint64_t _maxColumnId) {
- columnId = static_cast<int64_t>(_columnId);
- maximumColumnId = static_cast<int64_t>(_maxColumnId);
+ void TypeImpl::setIds(uint64_t columnId, uint64_t maxColumnId) {
+ columnId_ = static_cast<int64_t>(columnId);
+ maximumColumnId_ = static_cast<int64_t>(maxColumnId);
}
void TypeImpl::addChildType(std::unique_ptr<Type> childType) {
TypeImpl* child = dynamic_cast<TypeImpl*>(childType.get());
- subTypes.push_back(std::move(childType));
+ subTypes_.push_back(std::move(childType));
if (child != nullptr) {
- child->parent = this;
+ child->parent_ = this;
}
- subtypeCount += 1;
+ subtypeCount_ += 1;
}
Type* TypeImpl::addStructField(const std::string& fieldName, std::unique_ptr<Type> fieldType) {
addChildType(std::move(fieldType));
- fieldNames.push_back(fieldName);
+ fieldNames_.push_back(fieldName);
return this;
}
@@ -190,7 +190,7 @@ namespace orc {
}
std::string TypeImpl::toString() const {
- switch (static_cast<int64_t>(kind)) {
+ switch (static_cast<int64_t>(kind_)) {
case BOOLEAN:
return "boolean";
case BYTE:
@@ -214,20 +214,20 @@ namespace orc {
case TIMESTAMP_INSTANT:
return "timestamp with local time zone";
case LIST:
- return "array<" + (subTypes[0] ? subTypes[0]->toString() : "void") + ">";
+ return "array<" + (subTypes_[0] ? subTypes_[0]->toString() : "void") + ">";
case MAP:
- return "map<" + (subTypes[0] ? subTypes[0]->toString() : "void") + "," +
- (subTypes[1] ? subTypes[1]->toString() : "void") + ">";
+ return "map<" + (subTypes_[0] ? subTypes_[0]->toString() : "void") + "," +
+ (subTypes_[1] ? subTypes_[1]->toString() : "void") + ">";
case STRUCT: {
std::string result = "struct<";
- for (size_t i = 0; i < subTypes.size(); ++i) {
+ for (size_t i = 0; i < subTypes_.size(); ++i) {
if (i != 0) {
result += ",";
}
- if (isUnquotedFieldName(fieldNames[i])) {
- result += fieldNames[i];
+ if (isUnquotedFieldName(fieldNames_[i])) {
+ result += fieldNames_[i];
} else {
- std::string name(fieldNames[i]);
+ std::string name(fieldNames_[i]);
size_t pos = 0;
while ((pos = name.find("`", pos)) != std::string::npos) {
name.replace(pos, 1, "``");
@@ -238,37 +238,37 @@ namespace orc {
result += "`";
}
result += ":";
- result += subTypes[i]->toString();
+ result += subTypes_[i]->toString();
}
result += ">";
return result;
}
case UNION: {
std::string result = "uniontype<";
- for (size_t i = 0; i < subTypes.size(); ++i) {
+ for (size_t i = 0; i < subTypes_.size(); ++i) {
if (i != 0) {
result += ",";
}
- result += subTypes[i]->toString();
+ result += subTypes_[i]->toString();
}
result += ">";
return result;
}
case DECIMAL: {
std::stringstream result;
- result << "decimal(" << precision << "," << scale << ")";
+ result << "decimal(" << precision_ << "," << scale_ << ")";
return result.str();
}
case DATE:
return "date";
case VARCHAR: {
std::stringstream result;
- result << "varchar(" << maxLength << ")";
+ result << "varchar(" << maxLength_ << ")";
return result.str();
}
case CHAR: {
std::stringstream result;
- result << "char(" << maxLength << ")";
+ result << "char(" << maxLength_ << ")";
return result.str();
}
default:
@@ -285,7 +285,7 @@ namespace orc {
std::unique_ptr<ColumnVectorBatch> TypeImpl::createRowBatch(uint64_t capacity,
MemoryPool& memoryPool, bool encoded,
bool useTightNumericVector) const {
- switch (static_cast<int64_t>(kind)) {
+ switch (static_cast<int64_t>(kind_)) {
case BOOLEAN:
if (useTightNumericVector) {
return std::make_unique<ByteVectorBatch>(capacity, memoryPool);
@@ -660,7 +660,8 @@ namespace orc {
std::pair<std::string, size_t> nameRes = parseName(input, pos, end);
pos = nameRes.second;
if (input[pos] != ':') {
- throw std::logic_error("Invalid struct type. No field name set.");
+ throw std::logic_error("Invalid struct type. Field name can not contain '" +
+ std::string(1, input[pos]) + "'.");
}
std::pair<std::unique_ptr<Type>, size_t> typeRes = TypeImpl::parseType(input, ++pos, end);
result->addStructField(nameRes.first, std::move(typeRes.first));
diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.hh b/contrib/libs/apache/orc/c++/src/TypeImpl.hh
index 6d0743793a..647d5a5d2c 100644
--- a/contrib/libs/apache/orc/c++/src/TypeImpl.hh
+++ b/contrib/libs/apache/orc/c++/src/TypeImpl.hh
@@ -30,17 +30,17 @@ namespace orc {
class TypeImpl : public Type {
private:
- TypeImpl* parent;
- mutable int64_t columnId;
- mutable int64_t maximumColumnId;
- TypeKind kind;
- std::vector<std::unique_ptr<Type>> subTypes;
- std::vector<std::string> fieldNames;
- uint64_t subtypeCount;
- uint64_t maxLength;
- uint64_t precision;
- uint64_t scale;
- std::map<std::string, std::string> attributes;
+ TypeImpl* parent_;
+ mutable int64_t columnId_;
+ mutable int64_t maximumColumnId_;
+ TypeKind kind_;
+ std::vector<std::unique_ptr<Type>> subTypes_;
+ std::vector<std::string> fieldNames_;
+ uint64_t subtypeCount_;
+ uint64_t maxLength_;
+ uint64_t precision_;
+ uint64_t scale_;
+ std::map<std::string, std::string> attributes_;
public:
/**
diff --git a/contrib/libs/apache/orc/c++/src/Utils.hh b/contrib/libs/apache/orc/c++/src/Utils.hh
index 751c09b205..851d0af15c 100644
--- a/contrib/libs/apache/orc/c++/src/Utils.hh
+++ b/contrib/libs/apache/orc/c++/src/Utils.hh
@@ -21,38 +21,39 @@
#include <atomic>
#include <chrono>
+#include <stdexcept>
namespace orc {
class AutoStopwatch {
- std::chrono::high_resolution_clock::time_point start;
- std::atomic<uint64_t>* latencyUs;
- std::atomic<uint64_t>* count;
- bool minus;
+ std::chrono::high_resolution_clock::time_point start_;
+ std::atomic<uint64_t>* latencyUs_;
+ std::atomic<uint64_t>* count_;
+ bool minus_;
public:
- AutoStopwatch(std::atomic<uint64_t>* _latencyUs, std::atomic<uint64_t>* _count,
- bool _minus = false)
- : latencyUs(_latencyUs), count(_count), minus(_minus) {
- if (latencyUs) {
- start = std::chrono::high_resolution_clock::now();
+ AutoStopwatch(std::atomic<uint64_t>* latencyUs, std::atomic<uint64_t>* count,
+ bool minus = false)
+ : latencyUs_(latencyUs), count_(count), minus_(minus) {
+ if (latencyUs_) {
+ start_ = std::chrono::high_resolution_clock::now();
}
}
~AutoStopwatch() {
- if (latencyUs) {
+ if (latencyUs_) {
std::chrono::microseconds elapsedTime =
std::chrono::duration_cast<std::chrono::microseconds>(
- std::chrono::high_resolution_clock::now() - start);
- if (!minus) {
- latencyUs->fetch_add(static_cast<uint64_t>(elapsedTime.count()));
+ std::chrono::high_resolution_clock::now() - start_);
+ if (!minus_) {
+ latencyUs_->fetch_add(static_cast<uint64_t>(elapsedTime.count()));
} else {
- latencyUs->fetch_sub(static_cast<uint64_t>(elapsedTime.count()));
+ latencyUs_->fetch_sub(static_cast<uint64_t>(elapsedTime.count()));
}
}
- if (count) {
- count->fetch_add(1);
+ if (count_) {
+ count_->fetch_add(1);
}
}
};
@@ -70,6 +71,75 @@ namespace orc {
#define SCOPED_MINUS_STOPWATCH(METRICS_PTR, LATENCY_VAR)
#endif
+ /**
+  * Byte-level helpers for UTF-8 encoded text. Nothing here validates the
+  * encoding: a byte is treated as the start of a character whenever it is
+  * not a continuation byte (see isUtfStartByte).
+  */
+ struct Utf8Utils {
+   /**
+    * Checks if b is the first byte of a UTF-8 character, i.e. its two
+    * most significant bits are anything other than binary 10.
+    */
+   inline static bool isUtfStartByte(char b) {
+     return (b & 0xC0) != 0x80;
+   }
+
+   /**
+    * Counts the UTF-8 characters in the input data by counting start bytes.
+    *
+    * @param data the bytes of UTF-8
+    * @param length the number of bytes in data
+    * @return the number of start bytes found in the first length bytes
+    */
+   static uint64_t charLength(const char* data, uint64_t length) {
+     uint64_t chars = 0;
+     for (uint64_t i = 0; i < length; i++) {
+       if (isUtfStartByte(data[i])) {
+         chars++;
+       }
+     }
+     return chars;
+   }
+
+   /**
+    * Return the number of bytes required to read at most maxCharLength
+    * characters in full from a utf-8 encoded byte array provided
+    * by data. This does not validate utf-8 data, but
+    * operates correctly on already valid utf-8 data.
+    *
+    * @param maxCharLength number of characters required
+    * @param data the bytes of UTF-8
+    * @param length the length of data to truncate
+    * @return the byte length of the longest prefix holding at most
+    *         maxCharLength characters
+    */
+   static uint64_t truncateBytesTo(uint64_t maxCharLength, const char* data, uint64_t length) {
+     // A character occupies at least one byte, so this many bytes can never
+     // hold more than maxCharLength characters.
+     if (length <= maxCharLength) {
+       return length;
+     }
+     uint64_t chars = 0;
+     for (uint64_t i = 0; i < length; i++) {
+       if (isUtfStartByte(data[i])) {
+         chars++;
+       }
+       // i is the start byte of the first character that no longer fits.
+       if (chars > maxCharLength) {
+         return i;
+       }
+     }
+     // everything fits
+     return length;
+   }
+
+   /**
+    * Find the start of the last character that begins in text[from, until].
+    * The text is scanned backwards from until down to from (both inclusive).
+    *
+    * @param text the bytes of the utf-8
+    * @param from the first byte location
+    * @param until the last byte location
+    * @return the byte index of the start byte closest to until
+    * @throws std::logic_error if no start byte exists in the range
+    *         (including the case until < from)
+    */
+   static uint64_t findLastCharacter(const char* text, uint64_t from, uint64_t until) {
+     if (until >= from) {
+       uint64_t posn = until;
+       while (true) {
+         if (isUtfStartByte(text[posn])) {
+           return posn;
+         }
+         // Stop before decrementing past from: posn is unsigned, so with
+         // from == 0 a "posn >= from" loop condition would wrap around and
+         // keep reading far outside the buffer instead of terminating.
+         if (posn == from) {
+           break;
+         }
+         posn -= 1;
+       }
+     }
+     /* beginning of a valid char not found */
+     throw std::logic_error("Could not truncate string, beginning of a valid char not found");
+   }
+ };
+
} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/src/Vector.cc b/contrib/libs/apache/orc/c++/src/Vector.cc
index b9e2854586..49f47aeb03 100644
--- a/contrib/libs/apache/orc/c++/src/Vector.cc
+++ b/contrib/libs/apache/orc/c++/src/Vector.cc
@@ -34,6 +34,7 @@ namespace orc {
notNull(pool, cap),
hasNulls(false),
isEncoded(false),
+ dictionaryDecoded(false),
memoryPool(pool) {
std::memset(notNull.data(), 1, capacity);
}
@@ -61,13 +62,20 @@ namespace orc {
return false;
}
+ void ColumnVectorBatch::decodeDictionary() {
+   // Nothing to do when this batch is already flagged as decoded; otherwise
+   // run the type-specific decoding and only then set the flag.
+   if (!dictionaryDecoded) {
+     decodeDictionaryImpl();
+     dictionaryDecoded = true;
+   }
+ }
+
StringDictionary::StringDictionary(MemoryPool& pool)
: dictionaryBlob(pool), dictionaryOffset(pool) {
// PASS
}
- EncodedStringVectorBatch::EncodedStringVectorBatch(uint64_t _capacity, MemoryPool& pool)
- : StringVectorBatch(_capacity, pool), dictionary(), index(pool, _capacity) {
+ EncodedStringVectorBatch::EncodedStringVectorBatch(uint64_t capacity, MemoryPool& pool)
+ : StringVectorBatch(capacity, pool), dictionary(), index(pool, capacity) {
// PASS
}
@@ -88,10 +96,21 @@ namespace orc {
}
}
- StringVectorBatch::StringVectorBatch(uint64_t _capacity, MemoryPool& pool)
- : ColumnVectorBatch(_capacity, pool),
- data(pool, _capacity),
- length(pool, _capacity),
+ void EncodedStringVectorBatch::decodeDictionaryImpl() {
+   // Resolve every dictionary index into the data/length entry of its row.
+   const size_t rowCount = index.size();
+   resize(rowCount);
+
+   for (size_t row = 0; row != rowCount; ++row) {
+     // Rows flagged as null are left untouched.
+     if (hasNulls && !notNull[row]) {
+       continue;
+     }
+     dictionary->getValueByIndex(index[row], data[row], length[row]);
+   }
+ }
+
+ StringVectorBatch::StringVectorBatch(uint64_t capacity, MemoryPool& pool)
+ : ColumnVectorBatch(capacity, pool),
+ data(pool, capacity),
+ length(pool, capacity),
blob(pool) {
// PASS
}
@@ -174,6 +193,12 @@ namespace orc {
return false;
}
+ void StructVectorBatch::decodeDictionaryImpl() {
+   // A struct holds no dictionary itself: forward the request to each field.
+   for (auto it = fields.begin(); it != fields.end(); ++it) {
+     (*it)->decodeDictionary();
+   }
+ }
+
ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool)
: ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) {
offsets.zeroOut();
@@ -211,6 +236,10 @@ namespace orc {
return true;
}
+ void ListVectorBatch::decodeDictionaryImpl() {
+   // A list holds no dictionary itself: forward the request to the element
+   // batch. Skip it when no element batch is attached, mirroring the null
+   // checks in MapVectorBatch::decodeDictionaryImpl.
+   if (elements) {
+     elements->decodeDictionary();
+   }
+ }
+
MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool)
: ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) {
offsets.zeroOut();
@@ -251,6 +280,16 @@ namespace orc {
return true;
}
+ void MapVectorBatch::decodeDictionaryImpl() {
+   // A map holds no dictionary itself: forward to whichever of the key and
+   // value batches is present, keys first.
+   if (keys != nullptr) keys->decodeDictionary();
+   if (elements != nullptr) elements->decodeDictionary();
+ }
+
UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool)
: ColumnVectorBatch(cap, pool), tags(pool, cap), offsets(pool, cap) {
tags.zeroOut();
@@ -310,6 +349,12 @@ namespace orc {
return false;
}
+ void UnionVectorBatch::decodeDictionaryImpl() {
+   // A union holds no dictionary itself: forward the request to each child.
+   for (auto it = children.begin(); it != children.end(); ++it) {
+     (*it)->decodeDictionary();
+   }
+ }
+
Decimal64VectorBatch::Decimal64VectorBatch(uint64_t cap, MemoryPool& pool)
: ColumnVectorBatch(cap, pool),
precision(0),
@@ -383,7 +428,7 @@ namespace orc {
readScales.capacity() * sizeof(int64_t));
}
- Decimal::Decimal(const Int128& _value, int32_t _scale) : value(_value), scale(_scale) {
+ Decimal::Decimal(const Int128& value, int32_t scale) : value(value), scale(scale) {
// PASS
}
@@ -408,8 +453,8 @@ namespace orc {
return value.toDecimalString(scale, trimTrailingZeros);
}
- TimestampVectorBatch::TimestampVectorBatch(uint64_t _capacity, MemoryPool& pool)
- : ColumnVectorBatch(_capacity, pool), data(pool, _capacity), nanoseconds(pool, _capacity) {
+ TimestampVectorBatch::TimestampVectorBatch(uint64_t capacity, MemoryPool& pool)
+ : ColumnVectorBatch(capacity, pool), data(pool, capacity), nanoseconds(pool, capacity) {
// PASS
}
diff --git a/contrib/libs/apache/orc/c++/src/Writer.cc b/contrib/libs/apache/orc/c++/src/Writer.cc
index 89eb3781cf..775e6d2452 100644
--- a/contrib/libs/apache/orc/c++/src/Writer.cc
+++ b/contrib/libs/apache/orc/c++/src/Writer.cc
@@ -46,6 +46,8 @@ namespace orc {
WriterMetrics* metrics;
bool useTightNumericVector;
uint64_t outputBufferCapacity;
+ uint64_t memoryBlockSize;
+ bool alignBlockBoundToRowGroup;
WriterOptionsPrivate() : fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12
stripeSize = 64 * 1024 * 1024; // 64M
@@ -67,28 +69,30 @@ namespace orc {
metrics = nullptr;
useTightNumericVector = false;
outputBufferCapacity = 1024 * 1024;
+ memoryBlockSize = 64 * 1024; // 64K
+ alignBlockBoundToRowGroup = false;
}
};
WriterOptions::WriterOptions()
- : privateBits(std::unique_ptr<WriterOptionsPrivate>(new WriterOptionsPrivate())) {
+ : privateBits_(std::unique_ptr<WriterOptionsPrivate>(new WriterOptionsPrivate())) {
// PASS
}
WriterOptions::WriterOptions(const WriterOptions& rhs)
- : privateBits(std::unique_ptr<WriterOptionsPrivate>(
- new WriterOptionsPrivate(*(rhs.privateBits.get())))) {
+ : privateBits_(std::unique_ptr<WriterOptionsPrivate>(
+ new WriterOptionsPrivate(*(rhs.privateBits_.get())))) {
// PASS
}
WriterOptions::WriterOptions(WriterOptions& rhs) {
// swap privateBits with rhs
- privateBits.swap(rhs.privateBits);
+ privateBits_.swap(rhs.privateBits_);
}
WriterOptions& WriterOptions::operator=(const WriterOptions& rhs) {
if (this != &rhs) {
- privateBits.reset(new WriterOptionsPrivate(*(rhs.privateBits.get())));
+ privateBits_.reset(new WriterOptionsPrivate(*(rhs.privateBits_.get())));
}
return *this;
}
@@ -97,7 +101,7 @@ namespace orc {
// PASS
}
RleVersion WriterOptions::getRleVersion() const {
- if (privateBits->fileVersion == FileVersion::v_0_11()) {
+ if (privateBits_->fileVersion == FileVersion::v_0_11()) {
return RleVersion_1;
}
@@ -105,186 +109,204 @@ namespace orc {
}
WriterOptions& WriterOptions::setStripeSize(uint64_t size) {
- privateBits->stripeSize = size;
+ privateBits_->stripeSize = size;
return *this;
}
uint64_t WriterOptions::getStripeSize() const {
- return privateBits->stripeSize;
+ return privateBits_->stripeSize;
}
WriterOptions& WriterOptions::setCompressionBlockSize(uint64_t size) {
if (size >= (1 << 23)) {
throw std::invalid_argument("Compression block size cannot be greater or equal than 8M");
}
- privateBits->compressionBlockSize = size;
+ privateBits_->compressionBlockSize = size;
return *this;
}
uint64_t WriterOptions::getCompressionBlockSize() const {
- return privateBits->compressionBlockSize;
+ return privateBits_->compressionBlockSize;
}
WriterOptions& WriterOptions::setRowIndexStride(uint64_t stride) {
- privateBits->rowIndexStride = stride;
- privateBits->enableIndex = (stride != 0);
+ privateBits_->rowIndexStride = stride;
+ privateBits_->enableIndex = (stride != 0);
return *this;
}
uint64_t WriterOptions::getRowIndexStride() const {
- return privateBits->rowIndexStride;
+ return privateBits_->rowIndexStride;
}
WriterOptions& WriterOptions::setDictionaryKeySizeThreshold(double val) {
- privateBits->dictionaryKeySizeThreshold = val;
+ privateBits_->dictionaryKeySizeThreshold = val;
return *this;
}
double WriterOptions::getDictionaryKeySizeThreshold() const {
- return privateBits->dictionaryKeySizeThreshold;
+ return privateBits_->dictionaryKeySizeThreshold;
}
WriterOptions& WriterOptions::setFileVersion(const FileVersion& version) {
// Only Hive_0_11 and Hive_0_12 version are supported currently
if (version.getMajor() == 0 && (version.getMinor() == 11 || version.getMinor() == 12)) {
- privateBits->fileVersion = version;
+ privateBits_->fileVersion = version;
return *this;
}
if (version == FileVersion::UNSTABLE_PRE_2_0()) {
- *privateBits->errorStream << "Warning: ORC files written in "
- << FileVersion::UNSTABLE_PRE_2_0().toString()
- << " will not be readable by other versions of the software."
- << " It is only for developer testing.\n";
- privateBits->fileVersion = version;
+ *privateBits_->errorStream << "Warning: ORC files written in "
+ << FileVersion::UNSTABLE_PRE_2_0().toString()
+ << " will not be readable by other versions of the software."
+ << " It is only for developer testing.\n";
+ privateBits_->fileVersion = version;
return *this;
}
throw std::logic_error("Unsupported file version specified.");
}
FileVersion WriterOptions::getFileVersion() const {
- return privateBits->fileVersion;
+ return privateBits_->fileVersion;
}
WriterOptions& WriterOptions::setCompression(CompressionKind comp) {
- privateBits->compression = comp;
+ privateBits_->compression = comp;
return *this;
}
CompressionKind WriterOptions::getCompression() const {
- return privateBits->compression;
+ return privateBits_->compression;
}
WriterOptions& WriterOptions::setCompressionStrategy(CompressionStrategy strategy) {
- privateBits->compressionStrategy = strategy;
+ privateBits_->compressionStrategy = strategy;
return *this;
}
CompressionStrategy WriterOptions::getCompressionStrategy() const {
- return privateBits->compressionStrategy;
+ return privateBits_->compressionStrategy;
}
bool WriterOptions::getAlignedBitpacking() const {
- return privateBits->compressionStrategy == CompressionStrategy ::CompressionStrategy_SPEED;
+ return privateBits_->compressionStrategy == CompressionStrategy ::CompressionStrategy_SPEED;
}
WriterOptions& WriterOptions::setPaddingTolerance(double tolerance) {
- privateBits->paddingTolerance = tolerance;
+ privateBits_->paddingTolerance = tolerance;
return *this;
}
double WriterOptions::getPaddingTolerance() const {
- return privateBits->paddingTolerance;
+ return privateBits_->paddingTolerance;
}
WriterOptions& WriterOptions::setMemoryPool(MemoryPool* memoryPool) {
- privateBits->memoryPool = memoryPool;
+ privateBits_->memoryPool = memoryPool;
return *this;
}
MemoryPool* WriterOptions::getMemoryPool() const {
- return privateBits->memoryPool;
+ return privateBits_->memoryPool;
}
WriterOptions& WriterOptions::setErrorStream(std::ostream& errStream) {
- privateBits->errorStream = &errStream;
+ privateBits_->errorStream = &errStream;
return *this;
}
std::ostream* WriterOptions::getErrorStream() const {
- return privateBits->errorStream;
+ return privateBits_->errorStream;
}
bool WriterOptions::getEnableIndex() const {
- return privateBits->enableIndex;
+ return privateBits_->enableIndex;
}
bool WriterOptions::getEnableDictionary() const {
- return privateBits->dictionaryKeySizeThreshold > 0.0;
+ return privateBits_->dictionaryKeySizeThreshold > 0.0;
}
WriterOptions& WriterOptions::setColumnsUseBloomFilter(const std::set<uint64_t>& columns) {
- privateBits->columnsUseBloomFilter = columns;
+ privateBits_->columnsUseBloomFilter = columns;
return *this;
}
bool WriterOptions::isColumnUseBloomFilter(uint64_t column) const {
- return privateBits->columnsUseBloomFilter.find(column) !=
- privateBits->columnsUseBloomFilter.end();
+ return privateBits_->columnsUseBloomFilter.find(column) !=
+ privateBits_->columnsUseBloomFilter.end();
}
WriterOptions& WriterOptions::setBloomFilterFPP(double fpp) {
- privateBits->bloomFilterFalsePositiveProb = fpp;
+ privateBits_->bloomFilterFalsePositiveProb = fpp;
return *this;
}
double WriterOptions::getBloomFilterFPP() const {
- return privateBits->bloomFilterFalsePositiveProb;
+ return privateBits_->bloomFilterFalsePositiveProb;
}
// delibrately not provide setter to write bloom filter version because
// we only support UTF8 for now.
BloomFilterVersion WriterOptions::getBloomFilterVersion() const {
- return privateBits->bloomFilterVersion;
+ return privateBits_->bloomFilterVersion;
}
const Timezone& WriterOptions::getTimezone() const {
- return getTimezoneByName(privateBits->timezone);
+ return getTimezoneByName(privateBits_->timezone);
}
const std::string& WriterOptions::getTimezoneName() const {
- return privateBits->timezone;
+ return privateBits_->timezone;
}
WriterOptions& WriterOptions::setTimezoneName(const std::string& zone) {
- privateBits->timezone = zone;
+ privateBits_->timezone = zone;
return *this;
}
WriterMetrics* WriterOptions::getWriterMetrics() const {
- return privateBits->metrics;
+ return privateBits_->metrics;
}
WriterOptions& WriterOptions::setWriterMetrics(WriterMetrics* metrics) {
- privateBits->metrics = metrics;
+ privateBits_->metrics = metrics;
return *this;
}
WriterOptions& WriterOptions::setUseTightNumericVector(bool useTightNumericVector) {
- privateBits->useTightNumericVector = useTightNumericVector;
+ privateBits_->useTightNumericVector = useTightNumericVector;
return *this;
}
bool WriterOptions::getUseTightNumericVector() const {
- return privateBits->useTightNumericVector;
+ return privateBits_->useTightNumericVector;
}
WriterOptions& WriterOptions::setOutputBufferCapacity(uint64_t capacity) {
- privateBits->outputBufferCapacity = capacity;
+ privateBits_->outputBufferCapacity = capacity;
return *this;
}
uint64_t WriterOptions::getOutputBufferCapacity() const {
- return privateBits->outputBufferCapacity;
+ return privateBits_->outputBufferCapacity;
+ }
+
+ // Sets the memory block size and returns *this so calls can be chained.
+ // Throws std::invalid_argument for 0: WriterImpl's constructor computes
+ // "compression block size % memory block size", so a zero value would
+ // end up as a modulo by zero there.
+ WriterOptions& WriterOptions::setMemoryBlockSize(uint64_t capacity) {
+   if (capacity == 0) {
+     throw std::invalid_argument("Memory block size must be greater than 0");
+   }
+   privateBits_->memoryBlockSize = capacity;
+   return *this;
+ }
+
+ // Returns the memory block size stored in these options; the
+ // WriterOptionsPrivate constructor initializes it to 64 * 1024.
+ uint64_t WriterOptions::getMemoryBlockSize() const {
+ return privateBits_->memoryBlockSize;
+ }
+
+ // Stores the alignBlockBoundToRowGroup flag and returns *this so calls can
+ // be chained. When the flag is set, WriterImpl::add calls finishStreams() on
+ // the column writer before creating each row index entry.
+ WriterOptions& WriterOptions::setAlignBlockBoundToRowGroup(bool alignBlockBoundToRowGroup) {
+ privateBits_->alignBlockBoundToRowGroup = alignBlockBoundToRowGroup;
+ return *this;
+ }
+
+ // Returns the alignBlockBoundToRowGroup flag; the WriterOptionsPrivate
+ // constructor initializes it to false.
+ bool WriterOptions::getAlignBlockBoundToRowGroup() const {
+ return privateBits_->alignBlockBoundToRowGroup;
+ }
Writer::~Writer() {
@@ -293,25 +315,25 @@ namespace orc {
class WriterImpl : public Writer {
private:
- std::unique_ptr<ColumnWriter> columnWriter;
- std::unique_ptr<BufferedOutputStream> compressionStream;
- std::unique_ptr<BufferedOutputStream> bufferedStream;
- std::unique_ptr<StreamsFactory> streamsFactory;
- OutputStream* outStream;
- WriterOptions options;
- const Type& type;
- uint64_t stripeRows, totalRows, indexRows;
- uint64_t currentOffset;
- proto::Footer fileFooter;
- proto::PostScript postScript;
- proto::StripeInformation stripeInfo;
- proto::Metadata metadata;
+ std::unique_ptr<ColumnWriter> columnWriter_;
+ std::unique_ptr<BufferedOutputStream> compressionStream_;
+ std::unique_ptr<BufferedOutputStream> bufferedStream_;
+ std::unique_ptr<StreamsFactory> streamsFactory_;
+ OutputStream* outStream_;
+ WriterOptions options_;
+ const Type& type_;
+ uint64_t stripeRows_, totalRows_, indexRows_;
+ uint64_t currentOffset_;
+ proto::Footer fileFooter_;
+ proto::PostScript postScript_;
+ proto::StripeInformation stripeInfo_;
+ proto::Metadata metadata_;
static const char* magicId;
static const WriterId writerId;
- bool useTightNumericVector;
- int32_t stripesAtLastFlush;
- uint64_t lastFlushOffset;
+ bool useTightNumericVector_;
+ int32_t stripesAtLastFlush_;
+ uint64_t lastFlushOffset_;
public:
WriterImpl(const Type& type, OutputStream* stream, const WriterOptions& options);
@@ -342,93 +364,101 @@ namespace orc {
const WriterId WriterImpl::writerId = WriterId::ORC_CPP_WRITER;
WriterImpl::WriterImpl(const Type& t, OutputStream* stream, const WriterOptions& opts)
- : outStream(stream), options(opts), type(t) {
- streamsFactory = createStreamsFactory(options, outStream);
- columnWriter = buildWriter(type, *streamsFactory, options);
- stripeRows = totalRows = indexRows = 0;
- currentOffset = 0;
- stripesAtLastFlush = 0;
- lastFlushOffset = 0;
-
- useTightNumericVector = opts.getUseTightNumericVector();
+ : outStream_(stream), options_(opts), type_(t) {
+ streamsFactory_ = createStreamsFactory(options_, outStream_);
+ columnWriter_ = buildWriter(type_, *streamsFactory_, options_);
+ stripeRows_ = totalRows_ = indexRows_ = 0;
+ currentOffset_ = 0;
+ stripesAtLastFlush_ = 0;
+ lastFlushOffset_ = 0;
+
+ useTightNumericVector_ = opts.getUseTightNumericVector();
+
+ if (options_.getCompressionBlockSize() % options_.getMemoryBlockSize() != 0) {
+ throw std::invalid_argument(
+ "Compression block size must be a multiple of memory block size.");
+ }
// compression stream for stripe footer, file footer and metadata
- compressionStream =
- createCompressor(options.getCompression(), outStream, options.getCompressionStrategy(),
- options.getOutputBufferCapacity(), options.getCompressionBlockSize(),
- *options.getMemoryPool(), options.getWriterMetrics());
+ compressionStream_ = createCompressor(
+ options_.getCompression(), outStream_, options_.getCompressionStrategy(),
+ options_.getOutputBufferCapacity(), options_.getCompressionBlockSize(),
+ options_.getMemoryBlockSize(), *options_.getMemoryPool(), options_.getWriterMetrics());
// uncompressed stream for post script
- bufferedStream.reset(new BufferedOutputStream(*options.getMemoryPool(), outStream,
- 1024, // buffer capacity: 1024 bytes
- options.getCompressionBlockSize(),
- options.getWriterMetrics()));
+ bufferedStream_.reset(new BufferedOutputStream(*options_.getMemoryPool(), outStream_,
+ 1024, // buffer capacity: 1024 bytes
+ options_.getCompressionBlockSize(),
+ options_.getWriterMetrics()));
init();
}
std::unique_ptr<ColumnVectorBatch> WriterImpl::createRowBatch(uint64_t size) const {
- return type.createRowBatch(size, *options.getMemoryPool(), false, useTightNumericVector);
+ return type_.createRowBatch(size, *options_.getMemoryPool(), false, useTightNumericVector_);
}
void WriterImpl::add(ColumnVectorBatch& rowsToAdd) {
- if (options.getEnableIndex()) {
+ if (options_.getEnableIndex()) {
uint64_t pos = 0;
uint64_t chunkSize = 0;
- uint64_t rowIndexStride = options.getRowIndexStride();
+ uint64_t rowIndexStride = options_.getRowIndexStride();
while (pos < rowsToAdd.numElements) {
- chunkSize = std::min(rowsToAdd.numElements - pos, rowIndexStride - indexRows);
- columnWriter->add(rowsToAdd, pos, chunkSize, nullptr);
+ chunkSize = std::min(rowsToAdd.numElements - pos, rowIndexStride - indexRows_);
+ columnWriter_->add(rowsToAdd, pos, chunkSize, nullptr);
pos += chunkSize;
- indexRows += chunkSize;
- stripeRows += chunkSize;
-
- if (indexRows >= rowIndexStride) {
- columnWriter->createRowIndexEntry();
- indexRows = 0;
+ indexRows_ += chunkSize;
+ stripeRows_ += chunkSize;
+
+ if (indexRows_ >= rowIndexStride) {
+ if (options_.getAlignBlockBoundToRowGroup()) {
+ columnWriter_->finishStreams();
+ }
+ columnWriter_->createRowIndexEntry();
+ indexRows_ = 0;
}
}
} else {
- stripeRows += rowsToAdd.numElements;
- columnWriter->add(rowsToAdd, 0, rowsToAdd.numElements, nullptr);
+ stripeRows_ += rowsToAdd.numElements;
+ columnWriter_->add(rowsToAdd, 0, rowsToAdd.numElements, nullptr);
}
- if (columnWriter->getEstimatedSize() >= options.getStripeSize()) {
+ if (columnWriter_->getEstimatedSize() >= options_.getStripeSize()) {
writeStripe();
}
}
void WriterImpl::close() {
- if (stripeRows > 0) {
+ if (stripeRows_ > 0) {
writeStripe();
}
writeMetadata();
writeFileFooter();
writePostscript();
- outStream->close();
+ outStream_->close();
}
uint64_t WriterImpl::writeIntermediateFooter() {
- if (stripeRows > 0) {
+ if (stripeRows_ > 0) {
writeStripe();
}
- if (stripesAtLastFlush != fileFooter.stripes_size()) {
+ if (stripesAtLastFlush_ != fileFooter_.stripes_size()) {
writeMetadata();
writeFileFooter();
writePostscript();
- stripesAtLastFlush = fileFooter.stripes_size();
- outStream->flush();
- lastFlushOffset = outStream->getLength();
- currentOffset = lastFlushOffset;
+ stripesAtLastFlush_ = fileFooter_.stripes_size();
+ outStream_->flush();
+ lastFlushOffset_ = outStream_->getLength();
+ currentOffset_ = lastFlushOffset_;
// init stripe now that we adjusted the currentOffset
initStripe();
}
- return lastFlushOffset;
+ return lastFlushOffset_;
}
void WriterImpl::addUserMetadata(const std::string& name, const std::string& value) {
- proto::UserMetadataItem* userMetadataItem = fileFooter.add_metadata();
+ proto::UserMetadataItem* userMetadataItem = fileFooter_.add_metadata();
userMetadataItem->set_name(name);
userMetadataItem->set_value(value);
}
@@ -437,65 +467,65 @@ namespace orc {
// Write file header
const static size_t magicIdLength = strlen(WriterImpl::magicId);
{
- SCOPED_STOPWATCH(options.getWriterMetrics(), IOBlockingLatencyUs, IOCount);
- outStream->write(WriterImpl::magicId, magicIdLength);
+ SCOPED_STOPWATCH(options_.getWriterMetrics(), IOBlockingLatencyUs, IOCount);
+ outStream_->write(WriterImpl::magicId, magicIdLength);
}
- currentOffset += magicIdLength;
+ currentOffset_ += magicIdLength;
// Initialize file footer
- fileFooter.set_header_length(currentOffset);
- fileFooter.set_content_length(0);
- fileFooter.set_number_of_rows(0);
- fileFooter.set_row_index_stride(static_cast<uint32_t>(options.getRowIndexStride()));
- fileFooter.set_writer(writerId);
- fileFooter.set_software_version(ORC_VERSION);
+ fileFooter_.set_header_length(currentOffset_);
+ fileFooter_.set_content_length(0);
+ fileFooter_.set_number_of_rows(0);
+ fileFooter_.set_row_index_stride(static_cast<uint32_t>(options_.getRowIndexStride()));
+ fileFooter_.set_writer(writerId);
+ fileFooter_.set_software_version(ORC_VERSION);
uint32_t index = 0;
- buildFooterType(type, fileFooter, index);
+ buildFooterType(type_, fileFooter_, index);
// Initialize post script
- postScript.set_footer_length(0);
- postScript.set_compression(WriterImpl::convertCompressionKind(options.getCompression()));
- postScript.set_compression_block_size(options.getCompressionBlockSize());
+ postScript_.set_footer_length(0);
+ postScript_.set_compression(WriterImpl::convertCompressionKind(options_.getCompression()));
+ postScript_.set_compression_block_size(options_.getCompressionBlockSize());
- postScript.add_version(options.getFileVersion().getMajor());
- postScript.add_version(options.getFileVersion().getMinor());
+ postScript_.add_version(options_.getFileVersion().getMajor());
+ postScript_.add_version(options_.getFileVersion().getMinor());
- postScript.set_writer_version(WriterVersion_ORC_135);
- postScript.set_magic("ORC");
+ postScript_.set_writer_version(WriterVersion_ORC_135);
+ postScript_.set_magic("ORC");
// Initialize first stripe
initStripe();
}
void WriterImpl::initStripe() {
- stripeInfo.set_offset(currentOffset);
- stripeInfo.set_index_length(0);
- stripeInfo.set_data_length(0);
- stripeInfo.set_footer_length(0);
- stripeInfo.set_number_of_rows(0);
+ stripeInfo_.set_offset(currentOffset_);
+ stripeInfo_.set_index_length(0);
+ stripeInfo_.set_data_length(0);
+ stripeInfo_.set_footer_length(0);
+ stripeInfo_.set_number_of_rows(0);
- stripeRows = indexRows = 0;
+ stripeRows_ = indexRows_ = 0;
}
void WriterImpl::writeStripe() {
- if (options.getEnableIndex() && indexRows != 0) {
- columnWriter->createRowIndexEntry();
- indexRows = 0;
+ if (options_.getEnableIndex() && indexRows_ != 0) {
+ columnWriter_->createRowIndexEntry();
+ indexRows_ = 0;
} else {
- columnWriter->mergeRowGroupStatsIntoStripeStats();
+ columnWriter_->mergeRowGroupStatsIntoStripeStats();
}
// dictionary should be written before any stream is flushed
- columnWriter->writeDictionary();
+ columnWriter_->writeDictionary();
std::vector<proto::Stream> streams;
// write ROW_INDEX streams
- if (options.getEnableIndex()) {
- columnWriter->writeIndex(streams);
+ if (options_.getEnableIndex()) {
+ columnWriter_->writeIndex(streams);
}
// write streams like PRESENT, DATA, etc.
- columnWriter->flush(streams);
+ columnWriter_->flush(streams);
// generate and write stripe footer
proto::StripeFooter stripeFooter;
@@ -504,28 +534,28 @@ namespace orc {
}
std::vector<proto::ColumnEncoding> encodings;
- columnWriter->getColumnEncoding(encodings);
+ columnWriter_->getColumnEncoding(encodings);
for (uint32_t i = 0; i < encodings.size(); ++i) {
*stripeFooter.add_columns() = encodings[i];
}
- stripeFooter.set_writer_timezone(options.getTimezoneName());
+ stripeFooter.set_writer_timezone(options_.getTimezoneName());
// add stripe statistics to metadata
- proto::StripeStatistics* stripeStats = metadata.add_stripe_stats();
+ proto::StripeStatistics* stripeStats = metadata_.add_stripe_stats();
std::vector<proto::ColumnStatistics> colStats;
- columnWriter->getStripeStatistics(colStats);
+ columnWriter_->getStripeStatistics(colStats);
for (uint32_t i = 0; i != colStats.size(); ++i) {
*stripeStats->add_col_stats() = colStats[i];
}
// merge stripe stats into file stats and clear stripe stats
- columnWriter->mergeStripeStatsIntoFileStats();
+ columnWriter_->mergeStripeStatsIntoFileStats();
- if (!stripeFooter.SerializeToZeroCopyStream(compressionStream.get())) {
+ if (!stripeFooter.SerializeToZeroCopyStream(compressionStream_.get())) {
throw std::logic_error("Failed to write stripe footer.");
}
- uint64_t footerLength = compressionStream->flush();
+ uint64_t footerLength = compressionStream_->flush();
// calculate data length and index length
uint64_t dataLength = 0;
@@ -540,53 +570,53 @@ namespace orc {
}
// update stripe info
- stripeInfo.set_index_length(indexLength);
- stripeInfo.set_data_length(dataLength);
- stripeInfo.set_footer_length(footerLength);
- stripeInfo.set_number_of_rows(stripeRows);
+ stripeInfo_.set_index_length(indexLength);
+ stripeInfo_.set_data_length(dataLength);
+ stripeInfo_.set_footer_length(footerLength);
+ stripeInfo_.set_number_of_rows(stripeRows_);
- *fileFooter.add_stripes() = stripeInfo;
+ *fileFooter_.add_stripes() = stripeInfo_;
- currentOffset = currentOffset + indexLength + dataLength + footerLength;
- totalRows += stripeRows;
+ currentOffset_ = currentOffset_ + indexLength + dataLength + footerLength;
+ totalRows_ += stripeRows_;
- columnWriter->reset();
+ columnWriter_->reset();
initStripe();
}
void WriterImpl::writeMetadata() {
- if (!metadata.SerializeToZeroCopyStream(compressionStream.get())) {
+ if (!metadata_.SerializeToZeroCopyStream(compressionStream_.get())) {
throw std::logic_error("Failed to write metadata.");
}
- postScript.set_metadata_length(compressionStream.get()->flush());
+ postScript_.set_metadata_length(compressionStream_.get()->flush());
}
void WriterImpl::writeFileFooter() {
- fileFooter.set_content_length(currentOffset - fileFooter.header_length());
- fileFooter.set_number_of_rows(totalRows);
+ fileFooter_.set_content_length(currentOffset_ - fileFooter_.header_length());
+ fileFooter_.set_number_of_rows(totalRows_);
// update file statistics
std::vector<proto::ColumnStatistics> colStats;
- columnWriter->getFileStatistics(colStats);
- fileFooter.clear_statistics();
+ columnWriter_->getFileStatistics(colStats);
+ fileFooter_.clear_statistics();
for (uint32_t i = 0; i != colStats.size(); ++i) {
- *fileFooter.add_statistics() = colStats[i];
+ *fileFooter_.add_statistics() = colStats[i];
}
- if (!fileFooter.SerializeToZeroCopyStream(compressionStream.get())) {
+ if (!fileFooter_.SerializeToZeroCopyStream(compressionStream_.get())) {
throw std::logic_error("Failed to write file footer.");
}
- postScript.set_footer_length(compressionStream->flush());
+ postScript_.set_footer_length(compressionStream_->flush());
}
void WriterImpl::writePostscript() {
- if (!postScript.SerializeToZeroCopyStream(bufferedStream.get())) {
+ if (!postScript_.SerializeToZeroCopyStream(bufferedStream_.get())) {
throw std::logic_error("Failed to write post script.");
}
- unsigned char psLength = static_cast<unsigned char>(bufferedStream->flush());
- SCOPED_STOPWATCH(options.getWriterMetrics(), IOBlockingLatencyUs, IOCount);
- outStream->write(&psLength, sizeof(unsigned char));
+ unsigned char psLength = static_cast<unsigned char>(bufferedStream_->flush());
+ SCOPED_STOPWATCH(options_.getWriterMetrics(), IOBlockingLatencyUs, IOCount);
+ outStream_->write(&psLength, sizeof(unsigned char));
}
void WriterImpl::buildFooterType(const Type& t, proto::Footer& footer, uint32_t& index) {
diff --git a/contrib/libs/apache/orc/c++/src/io/Cache.cc b/contrib/libs/apache/orc/c++/src/io/Cache.cc
new file mode 100644
index 0000000000..39f63fdd2b
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/io/Cache.cc
@@ -0,0 +1,171 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cassert>
+
+#include "Cache.hh"
+
+namespace orc {
+
+ // Merge the requested ranges into fewer, larger reads.
+ //
+ // Zero-sized ranges and ranges fully covered by another range are dropped. The
+ // remaining ranges are walked in offset order and joined with their predecessor
+ // unless the joined read would exceed rangeSizeLimit or the hole between the two
+ // ranges exceeds holeSizeLimit. The returned ranges are ordered by offset.
+ std::vector<ReadRange> ReadRangeCombiner::coalesce(std::vector<ReadRange> ranges) const {
+   if (ranges.empty()) {
+     return ranges;
+   }
+
+   // Remove zero-sized ranges; [ranges.begin(), liveEnd) is what remains.
+   const auto liveEnd = std::remove_if(ranges.begin(), ranges.end(),
+                                       [](const ReadRange& range) { return range.length == 0; });
+   // Sort in position order. For equal offsets the longest range goes first so
+   // that it absorbs the shorter ones in the containment pass below.
+   std::sort(ranges.begin(), liveEnd, [](const ReadRange& a, const ReadRange& b) {
+     return a.offset != b.offset ? a.offset < b.offset : a.length > b.length;
+   });
+
+   // Remove ranges that are fully covered by the previously kept range.
+   std::vector<ReadRange> uniqueRanges;
+   uniqueRanges.reserve(ranges.size());
+   for (auto it = ranges.begin(); it != liveEnd; ++it) {
+     if (uniqueRanges.empty() || !uniqueRanges.back().contains(*it)) {
+       uniqueRanges.push_back(*it);
+     }
+   }
+   ranges = std::move(uniqueRanges);
+
+   // Every input range may have been zero-sized.
+   if (ranges.empty()) {
+     return ranges;
+   }
+
+ #ifndef NDEBUG
+   // After the two passes above offsets are strictly increasing and no range
+   // contains its successor.
+   for (size_t i = 0; i + 1 < ranges.size(); ++i) {
+     const auto& left = ranges[i];
+     const auto& right = ranges[i + 1];
+     assert(left.offset < right.offset);
+     assert(!left.contains(right));
+   }
+ #endif
+
+   std::vector<ReadRange> coalesced;
+   coalesced.reserve(ranges.size());
+   auto itr = ranges.begin();
+
+   // [coalescedStart, coalescedEnd) is the read currently being built. It is
+   // seeded with the first range.
+   uint64_t coalescedStart = itr->offset;
+   uint64_t coalescedEnd = coalescedStart + itr->length;
+
+   for (++itr; itr != ranges.end(); ++itr) {
+     const uint64_t currentRangeStart = itr->offset;
+     const uint64_t currentRangeEnd = currentRangeStart + itr->length;
+
+     assert(coalescedStart < coalescedEnd);
+     assert(currentRangeStart < currentRangeEnd);
+
+     // Stop extending the current read if:
+     //   - the joined read would be too large, or
+     //   - the hole between the previous range and this one is too large.
+     const bool tooLarge = currentRangeEnd - coalescedStart > rangeSizeLimit;
+     // The hole is measured by subtraction: `coalescedEnd + holeSizeLimit` would
+     // wrap around for a holeSizeLimit close to UINT64_MAX and split every range.
+     const bool holeTooLarge =
+         currentRangeStart > coalescedEnd && currentRangeStart - coalescedEnd > holeSizeLimit;
+     if (tooLarge || holeTooLarge) {
+       coalesced.push_back({coalescedStart, coalescedEnd - coalescedStart});
+       coalescedStart = currentRangeStart;
+     }
+
+     // The read being built now ends where the current range ends.
+     coalescedEnd = currentRangeEnd;
+   }
+   coalesced.push_back({coalescedStart, coalescedEnd - coalescedStart});
+
+   assert(coalesced.front().offset == ranges.front().offset);
+   assert(coalesced.back().offset + coalesced.back().length ==
+          ranges.back().offset + ranges.back().length);
+   return coalesced;
+ }
+
+ // Convenience wrapper: coalesce `ranges` with the given hole and range limits
+ // without the caller having to build a ReadRangeCombiner first.
+ std::vector<ReadRange> ReadRangeCombiner::coalesceReadRanges(std::vector<ReadRange> ranges,
+                                                              uint64_t holeSizeLimit,
+                                                              uint64_t rangeSizeLimit) {
+   // Debug-only sanity check: the hole limit must be below the range limit.
+   assert(holeSizeLimit < rangeSizeLimit);
+
+   return ReadRangeCombiner{holeSizeLimit, rangeSizeLimit}.coalesce(std::move(ranges));
+ }
+
+ // Coalesce `ranges` with the limits from options_, create a cache entry for each
+ // resulting range (makeCacheEntries issues the reads) and merge the new entries
+ // into entries_, keeping entries_ ordered by offset.
+ void ReadRangeCache::cache(std::vector<ReadRange> ranges) {
+   ranges = ReadRangeCombiner::coalesceReadRanges(std::move(ranges), options_.holeSizeLimit,
+                                                  options_.rangeSizeLimit);
+   std::vector<RangeCacheEntry> fresh = makeCacheEntries(ranges);
+
+   // Nothing cached yet: the new entries become the cache as they are.
+   if (entries_.empty()) {
+     entries_ = std::move(fresh);
+     return;
+   }
+
+   // Both sequences are ordered by offset, so one merge pass keeps the order.
+   std::vector<RangeCacheEntry> combined(entries_.size() + fresh.size());
+   std::merge(entries_.begin(), entries_.end(), fresh.begin(), fresh.end(), combined.begin());
+   entries_ = std::move(combined);
+ }
+
+ // Look up `range` in the cache.
+ //
+ // A zero-length request yields a slice over a fresh empty buffer. Otherwise the
+ // first entry whose end is not before the end of `range` is examined; if it
+ // covers `range`, the call waits on the entry's future and returns a slice of
+ // the entry's buffer. In every other case a default BufferSlice (null buffer) is
+ // returned. Hits and misses are counted when metrics_ is set.
+ BufferSlice ReadRangeCache::read(const ReadRange& range) {
+   if (range.length == 0) {
+     return {std::make_shared<Buffer>(*memoryPool_, 0), 0, 0};
+   }
+
+   const uint64_t wantedEnd = range.offset + range.length;
+   const auto entry = std::lower_bound(entries_.begin(), entries_.end(), wantedEnd,
+                                       [](const RangeCacheEntry& candidate, uint64_t end) {
+                                         return candidate.range.offset + candidate.range.length <
+                                                end;
+                                       });
+
+   BufferSlice slice{};
+   bool cacheHit = false;
+   if (entry != entries_.end() && entry->range.contains(range)) {
+     cacheHit = entry->future.valid();
+     // Wait for the read issued by makeCacheEntries before handing out the data.
+     entry->future.get();
+     slice = BufferSlice{entry->buffer, range.offset - entry->range.offset, range.length};
+   }
+
+   if (metrics_) {
+     if (cacheHit) {
+       metrics_->ReadRangeCacheHits.fetch_add(1);
+     } else {
+       metrics_->ReadRangeCacheMisses.fetch_add(1);
+     }
+   }
+   return slice;
+ }
+
+ // Drop the leading cache entries whose exclusive end offset is at or before
+ // `boundary`; the first entry reaching past `boundary` and everything after it
+ // are kept.
+ void ReadRangeCache::evictEntriesBefore(uint64_t boundary) {
+   const auto endsAtOrBefore = [](const RangeCacheEntry& entry, uint64_t offset) {
+     return entry.range.offset + entry.range.length <= offset;
+   };
+   const auto firstKept =
+       std::lower_bound(entries_.begin(), entries_.end(), boundary, endsAtOrBefore);
+   entries_.erase(entries_.begin(), firstKept);
+ }
+
+ // Create one cache entry per range: allocate a buffer of the range's length from
+ // memoryPool_, start an asynchronous read of the range into it via
+ // stream_->readAsync, and store range, buffer and future together.
+ std::vector<RangeCacheEntry> ReadRangeCache::makeCacheEntries(
+     const std::vector<ReadRange>& ranges) const {
+   std::vector<RangeCacheEntry> created;
+   created.reserve(ranges.size());
+   for (auto it = ranges.begin(); it != ranges.end(); ++it) {
+     BufferPtr target = std::make_shared<Buffer>(*memoryPool_, it->length);
+     std::future<void> pending = stream_->readAsync(target->data(), target->size(), it->offset);
+     created.emplace_back(*it, std::move(target), std::move(pending));
+   }
+   return created;
+ }
+
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/io/Cache.hh b/contrib/libs/apache/orc/c++/src/io/Cache.hh
new file mode 100644
index 0000000000..7fc79718aa
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/io/Cache.hh
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "orc/MemoryPool.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <future>
+#include <utility>
+#include <vector>
+
+namespace orc {
+
+ /// A byte range [offset, offset + length).
+ struct ReadRange {
+   // Default member initializers: a default-constructed ReadRange is the empty
+   // range at offset 0 instead of holding indeterminate values.
+   uint64_t offset = 0;
+   uint64_t length = 0;
+
+   ReadRange() = default;
+   ReadRange(uint64_t offset, uint64_t length) : offset(offset), length(length) {}
+
+   /// Two ranges are equal when both offset and length match.
+   friend bool operator==(const ReadRange& left, const ReadRange& right) {
+     return (left.offset == right.offset && left.length == right.length);
+   }
+   friend bool operator!=(const ReadRange& left, const ReadRange& right) {
+     return !(left == right);
+   }
+
+   /// True when `other` lies completely inside this range (equal ranges contain
+   /// each other).
+   bool contains(const ReadRange& other) const {
+     return (offset <= other.offset && offset + length >= other.offset + other.length);
+   }
+ };
+
+ /// Joins read ranges into fewer, larger reads.
+ ///
+ /// The two limits are plain aggregate members; their order matters for brace
+ /// initialization (`ReadRangeCombiner{holeSizeLimit, rangeSizeLimit}`).
+ struct ReadRangeCombiner {
+   /// Largest hole between two consecutive ranges that is still read through.
+   const uint64_t holeSizeLimit;
+   /// Size above which a read being built is no longer extended.
+   const uint64_t rangeSizeLimit;
+
+   /// One-shot helper: build a combiner from the two limits and coalesce `ranges`.
+   static std::vector<ReadRange> coalesceReadRanges(std::vector<ReadRange> ranges,
+                                                    uint64_t holeSizeLimit,
+                                                    uint64_t rangeSizeLimit);
+
+   /// Coalesce `ranges` using this combiner's limits.
+   std::vector<ReadRange> coalesce(std::vector<ReadRange> ranges) const;
+ };
+
+ /// Byte buffer used for cached data, and the shared handle through which cache
+ /// entries and the slices handed out by the cache refer to it.
+ using Buffer = DataBuffer<char>;
+ using BufferPtr = std::shared_ptr<Buffer>;
+
+ /// One cached range: the range itself, the buffer receiving its bytes and the
+ /// future of the read that fills the buffer.
+ struct RangeCacheEntry {
+   ReadRange range;
+   BufferPtr buffer;
+   // A shared_future, so that get() may be called more than once.
+   std::shared_future<void> future;
+
+   RangeCacheEntry() = default;
+   RangeCacheEntry(const ReadRange& range, BufferPtr buffer, std::future<void> future)
+       : range(range), buffer(std::move(buffer)), future(future.share()) {}
+
+   /// Entries are ordered by the start offset of their range only.
+   friend bool operator<(const RangeCacheEntry& left, const RangeCacheEntry& right) {
+     return left.range.offset < right.range.offset;
+   }
+ };
+
+ /// A view of `length` bytes starting `offset` bytes into `buffer`. The default
+ /// value has a null buffer.
+ struct BufferSlice {
+   BufferPtr buffer{nullptr};
+   uint64_t offset{0};
+   uint64_t length{0};
+ };
+
+ /// A read cache designed to hide IO latencies when reading.
+ ///
+ /// Ranges registered through cache() are read asynchronously from the input
+ /// stream; read() later serves requests that fall inside a cached range.
+ class ReadRangeCache {
+  public:
+   /// Construct a read cache.
+   ///
+   /// \param stream      stream the cached ranges are read from
+   /// \param options     hole and range size limits used when coalescing
+   /// \param memoryPool  pool the cache buffers are allocated from
+   /// \param metrics     optional; when non-null, cache hits and misses are counted
+   ///
+   /// The three pointers are stored as given.
+   explicit ReadRangeCache(InputStream* stream, CacheOptions options, MemoryPool* memoryPool,
+                           ReaderMetrics* metrics = nullptr)
+       : stream_(stream),
+         options_(std::move(options)),
+         memoryPool_(memoryPool),
+         metrics_(metrics) {}
+
+   ~ReadRangeCache() = default;
+
+   /// Start caching the given ranges in the background.
+   ///
+   /// The caller must ensure that the ranges do not overlap with each other,
+   /// nor with previously cached ranges. Otherwise, behaviour will be undefined.
+   void cache(std::vector<ReadRange> ranges);
+
+   /// Read a range previously given to cache().
+   BufferSlice read(const ReadRange& range);
+
+   /// Evict the cache entries whose range ends at or before the given boundary.
+   void evictEntriesBefore(uint64_t boundary);
+
+  private:
+   /// Create one entry (buffer plus pending read) per range.
+   std::vector<RangeCacheEntry> makeCacheEntries(const std::vector<ReadRange>& ranges) const;
+
+   InputStream* stream_;
+   CacheOptions options_;
+   // Kept ordered by offset, so that a matching entry can be found by binary search.
+   std::vector<RangeCacheEntry> entries_;
+   MemoryPool* memoryPool_;
+   ReaderMetrics* metrics_;
+ };
+
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.cc b/contrib/libs/apache/orc/c++/src/io/InputStream.cc
index 3bf1781747..727d7b3278 100644
--- a/contrib/libs/apache/orc/c++/src/io/InputStream.cc
+++ b/contrib/libs/apache/orc/c++/src/io/InputStream.cc
@@ -39,17 +39,17 @@ namespace orc {
}
PositionProvider::PositionProvider(const std::list<uint64_t>& posns) {
- position = posns.begin();
+ position_ = posns.begin();
}
uint64_t PositionProvider::next() {
- uint64_t result = *position;
- ++position;
+ uint64_t result = *position_;
+ ++position_;
return result;
}
uint64_t PositionProvider::current() {
- return *position;
+ return *position_;
}
SeekableInputStream::~SeekableInputStream() {
@@ -62,26 +62,26 @@ namespace orc {
SeekableArrayInputStream::SeekableArrayInputStream(const unsigned char* values, uint64_t size,
uint64_t blkSize)
- : data(reinterpret_cast<const char*>(values)) {
- length = size;
- position = 0;
- blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize);
+ : data_(reinterpret_cast<const char*>(values)) {
+ length_ = size;
+ position_ = 0;
+ blockSize_ = blkSize == 0 ? length_ : static_cast<uint64_t>(blkSize);
}
SeekableArrayInputStream::SeekableArrayInputStream(const char* values, uint64_t size,
uint64_t blkSize)
- : data(values) {
- length = size;
- position = 0;
- blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize);
+ : data_(values) {
+ length_ = size;
+ position_ = 0;
+ blockSize_ = blkSize == 0 ? length_ : static_cast<uint64_t>(blkSize);
}
bool SeekableArrayInputStream::Next(const void** buffer, int* size) {
- uint64_t currentSize = std::min(length - position, blockSize);
+ uint64_t currentSize = std::min(length_ - position_, blockSize_);
if (currentSize > 0) {
- *buffer = data + position;
+ *buffer = data_ + position_;
*size = static_cast<int>(currentSize);
- position += currentSize;
+ position_ += currentSize;
return true;
}
*size = 0;
@@ -91,8 +91,8 @@ namespace orc {
void SeekableArrayInputStream::BackUp(int count) {
if (count >= 0) {
uint64_t unsignedCount = static_cast<uint64_t>(count);
- if (unsignedCount <= blockSize && unsignedCount <= position) {
- position -= unsignedCount;
+ if (unsignedCount <= blockSize_ && unsignedCount <= position_) {
+ position_ -= unsignedCount;
} else {
throw std::logic_error("Can't backup that much!");
}
@@ -102,27 +102,27 @@ namespace orc {
bool SeekableArrayInputStream::Skip(int count) {
if (count >= 0) {
uint64_t unsignedCount = static_cast<uint64_t>(count);
- if (unsignedCount + position <= length) {
- position += unsignedCount;
+ if (unsignedCount + position_ <= length_) {
+ position_ += unsignedCount;
return true;
} else {
- position = length;
+ position_ = length_;
}
}
return false;
}
int64_t SeekableArrayInputStream::ByteCount() const {
- return static_cast<google::protobuf::int64>(position);
+ return static_cast<google::protobuf::int64>(position_);
}
void SeekableArrayInputStream::seek(PositionProvider& seekPosition) {
- position = seekPosition.next();
+ position_ = seekPosition.next();
}
std::string SeekableArrayInputStream::getName() const {
std::ostringstream result;
- result << "SeekableArrayInputStream " << position << " of " << length;
+ result << "SeekableArrayInputStream " << position_ << " of " << length_;
return result.str();
}
@@ -131,16 +131,16 @@ namespace orc {
}
SeekableFileInputStream::SeekableFileInputStream(InputStream* stream, uint64_t offset,
- uint64_t byteCount, MemoryPool& _pool,
- uint64_t _blockSize)
- : pool(_pool),
- input(stream),
- start(offset),
- length(byteCount),
- blockSize(computeBlock(_blockSize, length)) {
- position = 0;
- buffer.reset(new DataBuffer<char>(pool));
- pushBack = 0;
+ uint64_t byteCount, MemoryPool& pool,
+ uint64_t blockSize)
+ : pool_(pool),
+ input_(stream),
+ start_(offset),
+ length_(byteCount),
+ blockSize_(computeBlock(blockSize, length_)) {
+ position_ = 0;
+ buffer_.reset(new DataBuffer<char>(pool_));
+ pushBack_ = 0;
}
SeekableFileInputStream::~SeekableFileInputStream() {
@@ -149,19 +149,19 @@ namespace orc {
bool SeekableFileInputStream::Next(const void** data, int* size) {
uint64_t bytesRead;
- if (pushBack != 0) {
- *data = buffer->data() + (buffer->size() - pushBack);
- bytesRead = pushBack;
+ if (pushBack_ != 0) {
+ *data = buffer_->data() + (buffer_->size() - pushBack_);
+ bytesRead = pushBack_;
} else {
- bytesRead = std::min(length - position, blockSize);
- buffer->resize(bytesRead);
+ bytesRead = std::min(length_ - position_, blockSize_);
+ buffer_->resize(bytesRead);
if (bytesRead > 0) {
- input->read(buffer->data(), bytesRead, start + position);
- *data = static_cast<void*>(buffer->data());
+ input_->read(buffer_->data(), bytesRead, start_ + position_);
+ *data = static_cast<void*>(buffer_->data());
}
}
- position += bytesRead;
- pushBack = 0;
+ position_ += bytesRead;
+ pushBack_ = 0;
*size = static_cast<int>(bytesRead);
return bytesRead != 0;
}
@@ -171,14 +171,14 @@ namespace orc {
throw std::logic_error("can't backup negative distances");
}
uint64_t count = static_cast<uint64_t>(signedCount);
- if (pushBack > 0) {
+ if (pushBack_ > 0) {
throw std::logic_error("can't backup unless we just called Next");
}
- if (count > blockSize || count > position) {
+ if (count > blockSize_ || count > position_) {
throw std::logic_error("can't backup that far");
}
- pushBack = static_cast<uint64_t>(count);
- position -= pushBack;
+ pushBack_ = static_cast<uint64_t>(count);
+ position_ -= pushBack_;
}
bool SeekableFileInputStream::Skip(int signedCount) {
@@ -186,27 +186,27 @@ namespace orc {
return false;
}
uint64_t count = static_cast<uint64_t>(signedCount);
- position = std::min(position + count, length);
- pushBack = 0;
- return position < length;
+ position_ = std::min(position_ + count, length_);
+ pushBack_ = 0;
+ return position_ < length_;
}
int64_t SeekableFileInputStream::ByteCount() const {
- return static_cast<int64_t>(position);
+ return static_cast<int64_t>(position_);
}
void SeekableFileInputStream::seek(PositionProvider& location) {
- position = location.next();
- if (position > length) {
- position = length;
+ position_ = location.next();
+ if (position_ > length_) {
+ position_ = length_;
throw std::logic_error("seek too far");
}
- pushBack = 0;
+ pushBack_ = 0;
}
std::string SeekableFileInputStream::getName() const {
std::ostringstream result;
- result << input->getName() << " from " << start << " for " << length;
+ result << input_->getName() << " from " << start_ << " for " << length_;
return result.str();
}
diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.hh b/contrib/libs/apache/orc/c++/src/io/InputStream.hh
index 33c64f8809..8b251c9301 100644
--- a/contrib/libs/apache/orc/c++/src/io/InputStream.hh
+++ b/contrib/libs/apache/orc/c++/src/io/InputStream.hh
@@ -35,7 +35,7 @@ namespace orc {
class PositionProvider {
private:
- std::list<uint64_t>::const_iterator position;
+ std::list<uint64_t>::const_iterator position_;
public:
PositionProvider(const std::list<uint64_t>& positions);
@@ -60,14 +60,14 @@ namespace orc {
*/
class SeekableArrayInputStream : public SeekableInputStream {
private:
- const char* data;
- uint64_t length;
- uint64_t position;
- uint64_t blockSize;
+ const char* data_;
+ uint64_t length_;
+ uint64_t position_;
+ uint64_t blockSize_;
public:
- SeekableArrayInputStream(const unsigned char* list, uint64_t length, uint64_t block_size = 0);
- SeekableArrayInputStream(const char* list, uint64_t length, uint64_t block_size = 0);
+ SeekableArrayInputStream(const unsigned char* list, uint64_t length, uint64_t blockSize = 0);
+ SeekableArrayInputStream(const char* list, uint64_t length, uint64_t blockSize = 0);
virtual ~SeekableArrayInputStream() override;
virtual bool Next(const void** data, int* size) override;
virtual void BackUp(int count) override;
@@ -82,14 +82,14 @@ namespace orc {
*/
class SeekableFileInputStream : public SeekableInputStream {
private:
- MemoryPool& pool;
- InputStream* const input;
- const uint64_t start;
- const uint64_t length;
- const uint64_t blockSize;
- std::unique_ptr<DataBuffer<char> > buffer;
- uint64_t position;
- uint64_t pushBack;
+ MemoryPool& pool_;
+ InputStream* const input_;
+ const uint64_t start_;
+ const uint64_t length_;
+ const uint64_t blockSize_;
+ std::unique_ptr<DataBuffer<char> > buffer_;
+ uint64_t position_;
+ uint64_t pushBack_;
public:
SeekableFileInputStream(InputStream* input, uint64_t offset, uint64_t byteCount,
diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc
index 7d9fb92206..4ca59dbe95 100644
--- a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc
+++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc
@@ -29,11 +29,11 @@ namespace orc {
}
BufferedOutputStream::BufferedOutputStream(MemoryPool& pool, OutputStream* outStream,
- uint64_t capacity_, uint64_t blockSize_,
- WriterMetrics* metrics_)
- : outputStream(outStream), blockSize(blockSize_), metrics(metrics_) {
- dataBuffer.reset(new BlockBuffer(pool, blockSize));
- dataBuffer->reserve(capacity_);
+ uint64_t capacity, uint64_t blockSize,
+ WriterMetrics* metrics)
+ : outputStream_(outStream), blockSize_(blockSize), metrics_(metrics) {
+ dataBuffer_.reset(new BlockBuffer(pool, blockSize_));
+ dataBuffer_->reserve(capacity);
}
BufferedOutputStream::~BufferedOutputStream() {
@@ -41,7 +41,7 @@ namespace orc {
}
bool BufferedOutputStream::Next(void** buffer, int* size) {
- auto block = dataBuffer->getNextBlock();
+ auto block = dataBuffer_->getNextBlock();
if (block.data == nullptr) {
throw std::logic_error("Failed to get next buffer from block buffer.");
}
@@ -53,16 +53,20 @@ namespace orc {
void BufferedOutputStream::BackUp(int count) {
if (count >= 0) {
uint64_t unsignedCount = static_cast<uint64_t>(count);
- if (unsignedCount <= dataBuffer->size()) {
- dataBuffer->resize(dataBuffer->size() - unsignedCount);
+ if (unsignedCount <= dataBuffer_->size()) {
+ dataBuffer_->resize(dataBuffer_->size() - unsignedCount);
} else {
throw std::logic_error("Can't backup that much!");
}
}
}
+ // The plain buffered stream has nothing to finalize; this is intentionally a
+ // no-op (the method is declared virtual in OutputStream.hh).
+ void BufferedOutputStream::finishStream() {}
+
int64_t BufferedOutputStream::ByteCount() const {
- return static_cast<google::protobuf::int64>(dataBuffer->size());
+ return static_cast<google::protobuf::int64>(dataBuffer_->size());
}
bool BufferedOutputStream::WriteAliasedRaw(const void*, int) {
@@ -75,70 +79,80 @@ namespace orc {
std::string BufferedOutputStream::getName() const {
std::ostringstream result;
- result << "BufferedOutputStream " << dataBuffer->size() << " of " << dataBuffer->capacity();
+ result << "BufferedOutputStream " << dataBuffer_->size() << " of " << dataBuffer_->capacity();
return result.str();
}
uint64_t BufferedOutputStream::getSize() const {
- return dataBuffer->size();
+ return dataBuffer_->size();
}
uint64_t BufferedOutputStream::flush() {
- uint64_t dataSize = dataBuffer->size();
+ uint64_t dataSize = dataBuffer_->size();
// flush data buffer into outputStream
if (dataSize > 0) {
- SCOPED_STOPWATCH(metrics, IOBlockingLatencyUs, IOCount);
- dataBuffer->writeTo(outputStream, metrics);
+ SCOPED_STOPWATCH(metrics_, IOBlockingLatencyUs, IOCount);
+ dataBuffer_->writeTo(outputStream_, metrics_);
}
- dataBuffer->resize(0);
+ dataBuffer_->resize(0);
return dataSize;
}
void BufferedOutputStream::suppress() {
- dataBuffer->resize(0);
+ dataBuffer_->resize(0);
+ }
+
+ uint64_t BufferedOutputStream::getRawInputBufferSize() const {
+ throw std::logic_error("getRawInputBufferSize is not supported.");
}
void AppendOnlyBufferedStream::write(const char* data, size_t size) {
size_t dataOffset = 0;
while (size > 0) {
- if (bufferOffset == bufferLength) {
- if (!outStream->Next(reinterpret_cast<void**>(&buffer), &bufferLength)) {
+ if (bufferOffset_ == bufferLength_) {
+ if (!outStream_->Next(reinterpret_cast<void**>(&buffer_), &bufferLength_)) {
throw std::logic_error("Failed to allocate buffer.");
}
- bufferOffset = 0;
+ bufferOffset_ = 0;
}
- size_t len = std::min(static_cast<size_t>(bufferLength - bufferOffset), size);
- memcpy(buffer + bufferOffset, data + dataOffset, len);
- bufferOffset += static_cast<int>(len);
+ size_t len = std::min(static_cast<size_t>(bufferLength_ - bufferOffset_), size);
+ memcpy(buffer_ + bufferOffset_, data + dataOffset, len);
+ bufferOffset_ += static_cast<int>(len);
dataOffset += len;
size -= len;
}
}
uint64_t AppendOnlyBufferedStream::getSize() const {
- return outStream->getSize();
+ return outStream_->getSize();
}
uint64_t AppendOnlyBufferedStream::flush() {
- outStream->BackUp(bufferLength - bufferOffset);
- bufferOffset = bufferLength = 0;
- buffer = nullptr;
- return outStream->flush();
+ finishStream();
+ return outStream_->flush();
}
void AppendOnlyBufferedStream::recordPosition(PositionRecorder* recorder) const {
- uint64_t flushedSize = outStream->getSize();
- uint64_t unflushedSize = static_cast<uint64_t>(bufferOffset);
- if (outStream->isCompressed()) {
+ uint64_t flushedSize = outStream_->getSize();
+ uint64_t unusedBufferSize = static_cast<uint64_t>(bufferLength_ - bufferOffset_);
+ if (outStream_->isCompressed()) {
// start of the compression chunk in the stream
recorder->add(flushedSize);
- // number of decompressed bytes that need to be consumed
- recorder->add(unflushedSize);
+ // There are multiple blocks in the input buffer, but bufferPosition only records the
+ // effective length of the last block. We need rawInputBufferSize to record the total length
+ // of all variable blocks.
+ recorder->add(outStream_->getRawInputBufferSize() - unusedBufferSize);
} else {
- flushedSize -= static_cast<uint64_t>(bufferLength);
// byte offset of the start location
- recorder->add(flushedSize + unflushedSize);
+ recorder->add(flushedSize - unusedBufferSize);
}
}
+ // Return the unwritten tail of the current block to the underlying stream, let
+ // that stream finish, and forget the block so that the next write() fetches a
+ // new one.
+ void AppendOnlyBufferedStream::finishStream() {
+   const int unusedBytes = bufferLength_ - bufferOffset_;
+   outStream_->BackUp(unusedBytes);
+   outStream_->finishStream();
+
+   buffer_ = nullptr;
+   bufferLength_ = 0;
+   bufferOffset_ = 0;
+ }
+
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh
index d8bc21ce6d..b029818125 100644
--- a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh
+++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh
@@ -49,14 +49,14 @@ namespace orc {
*/
class BufferedOutputStream : public google::protobuf::io::ZeroCopyOutputStream {
private:
- OutputStream* outputStream;
- std::unique_ptr<BlockBuffer> dataBuffer;
- uint64_t blockSize;
- WriterMetrics* metrics;
+ OutputStream* outputStream_;
+ std::unique_ptr<BlockBuffer> dataBuffer_;
+ uint64_t blockSize_;
+ WriterMetrics* metrics_;
public:
BufferedOutputStream(MemoryPool& pool, OutputStream* outStream, uint64_t capacity,
- uint64_t block_size, WriterMetrics* metrics);
+ uint64_t blockSize, WriterMetrics* metrics);
virtual ~BufferedOutputStream() override;
virtual bool Next(void** data, int* size) override;
@@ -69,10 +69,12 @@ namespace orc {
virtual uint64_t getSize() const;
virtual uint64_t flush();
virtual void suppress();
+ virtual uint64_t getRawInputBufferSize() const;
virtual bool isCompressed() const {
return false;
}
+ virtual void finishStream();
};
DIAGNOSTIC_POP
@@ -84,20 +86,21 @@ namespace orc {
*/
class AppendOnlyBufferedStream {
private:
- std::unique_ptr<BufferedOutputStream> outStream;
- char* buffer;
- int bufferOffset, bufferLength;
+ std::unique_ptr<BufferedOutputStream> outStream_;
+ char* buffer_;
+ int bufferOffset_, bufferLength_;
public:
- AppendOnlyBufferedStream(std::unique_ptr<BufferedOutputStream> _outStream)
- : outStream(std::move(_outStream)) {
- buffer = nullptr;
- bufferOffset = bufferLength = 0;
+ AppendOnlyBufferedStream(std::unique_ptr<BufferedOutputStream> outStream)
+ : outStream_(std::move(outStream)) {
+ buffer_ = nullptr;
+ bufferOffset_ = bufferLength_ = 0;
}
void write(const char* data, size_t size);
uint64_t getSize() const;
uint64_t flush();
+ void finishStream();
void recordPosition(PositionRecorder* recorder) const;
};
diff --git a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc
index 9176c1f6c3..e49bca4b77 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc
+++ b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc
@@ -24,39 +24,39 @@
namespace orc {
ExpressionTree::ExpressionTree(Operator op)
- : mOperator(op), mLeaf(UNUSED_LEAF), mConstant(TruthValue::YES_NO_NULL) {}
+ : operator_(op), leaf_(UNUSED_LEAF), constant_(TruthValue::YES_NO_NULL) {}
ExpressionTree::ExpressionTree(Operator op, std::initializer_list<TreeNode> children)
- : mOperator(op),
- mChildren(children.begin(), children.end()),
- mLeaf(UNUSED_LEAF),
- mConstant(TruthValue::YES_NO_NULL) {
+ : operator_(op),
+ children_(children.begin(), children.end()),
+ leaf_(UNUSED_LEAF),
+ constant_(TruthValue::YES_NO_NULL) {
// PASS
}
ExpressionTree::ExpressionTree(size_t leaf)
- : mOperator(Operator::LEAF), mChildren(), mLeaf(leaf), mConstant(TruthValue::YES_NO_NULL) {
+ : operator_(Operator::LEAF), children_(), leaf_(leaf), constant_(TruthValue::YES_NO_NULL) {
// PASS
}
ExpressionTree::ExpressionTree(TruthValue constant)
- : mOperator(Operator::CONSTANT), mChildren(), mLeaf(UNUSED_LEAF), mConstant(constant) {
+ : operator_(Operator::CONSTANT), children_(), leaf_(UNUSED_LEAF), constant_(constant) {
// PASS
}
ExpressionTree::ExpressionTree(const ExpressionTree& other)
- : mOperator(other.mOperator), mLeaf(other.mLeaf), mConstant(other.mConstant) {
- for (TreeNode child : other.mChildren) {
- mChildren.emplace_back(std::make_shared<ExpressionTree>(*child));
+ : operator_(other.operator_), leaf_(other.leaf_), constant_(other.constant_) {
+ for (TreeNode child : other.children_) {
+ children_.emplace_back(std::make_shared<ExpressionTree>(*child));
}
}
ExpressionTree::Operator ExpressionTree::getOperator() const {
- return mOperator;
+ return operator_;
}
const std::vector<TreeNode>& ExpressionTree::getChildren() const {
- return mChildren;
+ return children_;
}
std::vector<TreeNode>& ExpressionTree::getChildren() {
@@ -65,7 +65,7 @@ namespace orc {
}
const TreeNode ExpressionTree::getChild(size_t i) const {
- return mChildren.at(i);
+ return children_.at(i);
}
TreeNode ExpressionTree::getChild(size_t i) {
@@ -74,47 +74,47 @@ namespace orc {
}
TruthValue ExpressionTree::getConstant() const {
- assert(mOperator == Operator::CONSTANT);
- return mConstant;
+ assert(operator_ == Operator::CONSTANT);
+ return constant_;
}
size_t ExpressionTree::getLeaf() const {
- assert(mOperator == Operator::LEAF);
- return mLeaf;
+ assert(operator_ == Operator::LEAF);
+ return leaf_;
}
void ExpressionTree::setLeaf(size_t leaf) {
- assert(mOperator == Operator::LEAF);
- mLeaf = leaf;
+ assert(operator_ == Operator::LEAF);
+ leaf_ = leaf;
}
void ExpressionTree::addChild(TreeNode child) {
- mChildren.push_back(child);
+ children_.push_back(child);
}
TruthValue ExpressionTree::evaluate(const std::vector<TruthValue>& leaves) const {
TruthValue result;
- switch (mOperator) {
+ switch (operator_) {
case Operator::OR: {
- result = mChildren.at(0)->evaluate(leaves);
- for (size_t i = 1; i < mChildren.size() && !isNeeded(result); ++i) {
- result = mChildren.at(i)->evaluate(leaves) || result;
+ result = children_.at(0)->evaluate(leaves);
+ for (size_t i = 1; i < children_.size() && !isNeeded(result); ++i) {
+ result = children_.at(i)->evaluate(leaves) || result;
}
return result;
}
case Operator::AND: {
- result = mChildren.at(0)->evaluate(leaves);
- for (size_t i = 1; i < mChildren.size() && isNeeded(result); ++i) {
- result = mChildren.at(i)->evaluate(leaves) && result;
+ result = children_.at(0)->evaluate(leaves);
+ for (size_t i = 1; i < children_.size() && isNeeded(result); ++i) {
+ result = children_.at(i)->evaluate(leaves) && result;
}
return result;
}
case Operator::NOT:
- return !mChildren.at(0)->evaluate(leaves);
+ return !children_.at(0)->evaluate(leaves);
case Operator::LEAF:
- return leaves[mLeaf];
+ return leaves[leaf_];
case Operator::CONSTANT:
- return mConstant;
+ return constant_;
default:
throw std::invalid_argument("Unknown operator!");
}
@@ -143,29 +143,29 @@ namespace orc {
std::string ExpressionTree::toString() const {
std::ostringstream sstream;
- switch (mOperator) {
+ switch (operator_) {
case Operator::OR:
sstream << "(or";
- for (const auto& child : mChildren) {
+ for (const auto& child : children_) {
sstream << ' ' << child->toString();
}
sstream << ')';
break;
case Operator::AND:
sstream << "(and";
- for (const auto& child : mChildren) {
+ for (const auto& child : children_) {
sstream << ' ' << child->toString();
}
sstream << ')';
break;
case Operator::NOT:
- sstream << "(not " << mChildren.at(0)->toString() << ')';
+ sstream << "(not " << children_.at(0)->toString() << ')';
break;
case Operator::LEAF:
- sstream << "leaf-" << mLeaf;
+ sstream << "leaf-" << leaf_;
break;
case Operator::CONSTANT:
- sstream << to_string(mConstant);
+ sstream << to_string(constant_);
break;
default:
throw std::invalid_argument("unknown operator!");
diff --git a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh
index 3e0b331a2d..0f801852f8 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh
+++ b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh
@@ -74,10 +74,10 @@ namespace orc {
TruthValue evaluate(const std::vector<TruthValue>& leaves) const;
private:
- Operator mOperator;
- std::vector<TreeNode> mChildren;
- size_t mLeaf;
- TruthValue mConstant;
+ Operator operator_;
+ std::vector<TreeNode> children_;
+ size_t leaf_;
+ TruthValue constant_;
};
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/sargs/Literal.cc b/contrib/libs/apache/orc/c++/src/sargs/Literal.cc
index c0cdd62201..f36db79437 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/Literal.cc
+++ b/contrib/libs/apache/orc/c++/src/sargs/Literal.cc
@@ -26,196 +26,196 @@
namespace orc {
Literal::Literal(PredicateDataType type) {
- mType = type;
- mValue.DecimalVal = 0;
- mSize = 0;
- mIsNull = true;
- mPrecision = 0;
- mScale = 0;
- mHashCode = 0;
+ type_ = type;
+ value_.DecimalVal = 0;
+ size_ = 0;
+ isNull_ = true;
+ precision_ = 0;
+ scale_ = 0;
+ hashCode_ = 0;
}
Literal::Literal(int64_t val) {
- mType = PredicateDataType::LONG;
- mValue.IntVal = val;
- mSize = sizeof(val);
- mIsNull = false;
- mPrecision = 0;
- mScale = 0;
- mHashCode = hashCode();
+ type_ = PredicateDataType::LONG;
+ value_.IntVal = val;
+ size_ = sizeof(val);
+ isNull_ = false;
+ precision_ = 0;
+ scale_ = 0;
+ hashCode_ = hashCode();
}
Literal::Literal(double val) {
- mType = PredicateDataType::FLOAT;
- mValue.DoubleVal = val;
- mSize = sizeof(val);
- mIsNull = false;
- mPrecision = 0;
- mScale = 0;
- mHashCode = hashCode();
+ type_ = PredicateDataType::FLOAT;
+ value_.DoubleVal = val;
+ size_ = sizeof(val);
+ isNull_ = false;
+ precision_ = 0;
+ scale_ = 0;
+ hashCode_ = hashCode();
}
Literal::Literal(bool val) {
- mType = PredicateDataType::BOOLEAN;
- mValue.BooleanVal = val;
- mSize = sizeof(val);
- mIsNull = false;
- mPrecision = 0;
- mScale = 0;
- mHashCode = hashCode();
+ type_ = PredicateDataType::BOOLEAN;
+ value_.BooleanVal = val;
+ size_ = sizeof(val);
+ isNull_ = false;
+ precision_ = 0;
+ scale_ = 0;
+ hashCode_ = hashCode();
}
Literal::Literal(PredicateDataType type, int64_t val) {
if (type != PredicateDataType::DATE) {
throw std::invalid_argument("only DATE is supported here!");
}
- mType = type;
- mValue.IntVal = val;
- mSize = sizeof(val);
- mIsNull = false;
- mPrecision = 0;
- mScale = 0;
- mHashCode = hashCode();
+ type_ = type;
+ value_.IntVal = val;
+ size_ = sizeof(val);
+ isNull_ = false;
+ precision_ = 0;
+ scale_ = 0;
+ hashCode_ = hashCode();
}
Literal::Literal(const char* str, size_t size) {
- mType = PredicateDataType::STRING;
- mValue.Buffer = new char[size];
- memcpy(mValue.Buffer, str, size);
- mSize = size;
- mIsNull = false;
- mPrecision = 0;
- mScale = 0;
- mHashCode = hashCode();
+ type_ = PredicateDataType::STRING;
+ value_.Buffer = new char[size];
+ memcpy(value_.Buffer, str, size);
+ size_ = size;
+ isNull_ = false;
+ precision_ = 0;
+ scale_ = 0;
+ hashCode_ = hashCode();
}
Literal::Literal(Int128 val, int32_t precision, int32_t scale) {
- mType = PredicateDataType::DECIMAL;
- mValue.DecimalVal = val;
- mPrecision = precision;
- mScale = scale;
- mSize = sizeof(Int128);
- mIsNull = false;
- mHashCode = hashCode();
+ type_ = PredicateDataType::DECIMAL;
+ value_.DecimalVal = val;
+ precision_ = precision;
+ scale_ = scale;
+ size_ = sizeof(Int128);
+ isNull_ = false;
+ hashCode_ = hashCode();
}
Literal::Literal(int64_t second, int32_t nanos) {
- mType = PredicateDataType::TIMESTAMP;
- mValue.TimeStampVal.second = second;
- mValue.TimeStampVal.nanos = nanos;
- mPrecision = 0;
- mScale = 0;
- mSize = sizeof(Timestamp);
- mIsNull = false;
- mHashCode = hashCode();
+ type_ = PredicateDataType::TIMESTAMP;
+ value_.TimeStampVal.second = second;
+ value_.TimeStampVal.nanos = nanos;
+ precision_ = 0;
+ scale_ = 0;
+ size_ = sizeof(Timestamp);
+ isNull_ = false;
+ hashCode_ = hashCode();
}
Literal::Literal(const Literal& r)
- : mType(r.mType), mSize(r.mSize), mIsNull(r.mIsNull), mHashCode(r.mHashCode) {
- if (mType == PredicateDataType::STRING) {
- mValue.Buffer = new char[r.mSize];
- memcpy(mValue.Buffer, r.mValue.Buffer, r.mSize);
- mPrecision = 0;
- mScale = 0;
- } else if (mType == PredicateDataType::DECIMAL) {
- mPrecision = r.mPrecision;
- mScale = r.mScale;
- mValue = r.mValue;
- } else if (mType == PredicateDataType::TIMESTAMP) {
- mValue.TimeStampVal = r.mValue.TimeStampVal;
+ : type_(r.type_), size_(r.size_), isNull_(r.isNull_), hashCode_(r.hashCode_) {
+ if (type_ == PredicateDataType::STRING) {
+ value_.Buffer = new char[r.size_];
+ memcpy(value_.Buffer, r.value_.Buffer, r.size_);
+ precision_ = 0;
+ scale_ = 0;
+ } else if (type_ == PredicateDataType::DECIMAL) {
+ precision_ = r.precision_;
+ scale_ = r.scale_;
+ value_ = r.value_;
+ } else if (type_ == PredicateDataType::TIMESTAMP) {
+ value_.TimeStampVal = r.value_.TimeStampVal;
} else {
- mValue = r.mValue;
- mPrecision = 0;
- mScale = 0;
+ value_ = r.value_;
+ precision_ = 0;
+ scale_ = 0;
}
}
Literal::~Literal() {
- if (mType == PredicateDataType::STRING && mValue.Buffer) {
- delete[] mValue.Buffer;
- mValue.Buffer = nullptr;
+ if (type_ == PredicateDataType::STRING && value_.Buffer) {
+ delete[] value_.Buffer;
+ value_.Buffer = nullptr;
}
}
Literal& Literal::operator=(const Literal& r) {
if (this != &r) {
- if (mType == PredicateDataType::STRING && mValue.Buffer) {
- delete[] mValue.Buffer;
- mValue.Buffer = nullptr;
+ if (type_ == PredicateDataType::STRING && value_.Buffer) {
+ delete[] value_.Buffer;
+ value_.Buffer = nullptr;
}
- mType = r.mType;
- mSize = r.mSize;
- mIsNull = r.mIsNull;
- mPrecision = r.mPrecision;
- mScale = r.mScale;
- if (mType == PredicateDataType::STRING) {
- mValue.Buffer = new char[r.mSize];
- memcpy(mValue.Buffer, r.mValue.Buffer, r.mSize);
- } else if (mType == PredicateDataType::TIMESTAMP) {
- mValue.TimeStampVal = r.mValue.TimeStampVal;
+ type_ = r.type_;
+ size_ = r.size_;
+ isNull_ = r.isNull_;
+ precision_ = r.precision_;
+ scale_ = r.scale_;
+ if (type_ == PredicateDataType::STRING) {
+ value_.Buffer = new char[r.size_];
+ memcpy(value_.Buffer, r.value_.Buffer, r.size_);
+ } else if (type_ == PredicateDataType::TIMESTAMP) {
+ value_.TimeStampVal = r.value_.TimeStampVal;
} else {
- mValue = r.mValue;
+ value_ = r.value_;
}
- mHashCode = r.mHashCode;
+ hashCode_ = r.hashCode_;
}
return *this;
}
std::string Literal::toString() const {
- if (mIsNull) {
+ if (isNull_) {
return "null";
}
std::ostringstream sstream;
- switch (mType) {
+ switch (type_) {
case PredicateDataType::LONG:
- sstream << mValue.IntVal;
+ sstream << value_.IntVal;
break;
case PredicateDataType::DATE:
- sstream << mValue.DateVal;
+ sstream << value_.DateVal;
break;
case PredicateDataType::TIMESTAMP:
- sstream << mValue.TimeStampVal.second << "." << mValue.TimeStampVal.nanos;
+ sstream << value_.TimeStampVal.second << "." << value_.TimeStampVal.nanos;
break;
case PredicateDataType::FLOAT:
- sstream << mValue.DoubleVal;
+ sstream << value_.DoubleVal;
break;
case PredicateDataType::BOOLEAN:
- sstream << (mValue.BooleanVal ? "true" : "false");
+ sstream << (value_.BooleanVal ? "true" : "false");
break;
case PredicateDataType::STRING:
- sstream << std::string(mValue.Buffer, mSize);
+ sstream << std::string(value_.Buffer, size_);
break;
case PredicateDataType::DECIMAL:
- sstream << mValue.DecimalVal.toDecimalString(mScale);
+ sstream << value_.DecimalVal.toDecimalString(scale_);
break;
}
return sstream.str();
}
size_t Literal::hashCode() const {
- if (mIsNull) {
+ if (isNull_) {
return 0;
}
- switch (mType) {
+ switch (type_) {
case PredicateDataType::LONG:
- return std::hash<int64_t>{}(mValue.IntVal);
+ return std::hash<int64_t>{}(value_.IntVal);
case PredicateDataType::DATE:
- return std::hash<int64_t>{}(mValue.DateVal);
+ return std::hash<int64_t>{}(value_.DateVal);
case PredicateDataType::TIMESTAMP:
- return std::hash<int64_t>{}(mValue.TimeStampVal.second) * 17 +
- std::hash<int32_t>{}(mValue.TimeStampVal.nanos);
+ return std::hash<int64_t>{}(value_.TimeStampVal.second) * 17 +
+ std::hash<int32_t>{}(value_.TimeStampVal.nanos);
case PredicateDataType::FLOAT:
- return std::hash<double>{}(mValue.DoubleVal);
+ return std::hash<double>{}(value_.DoubleVal);
case PredicateDataType::BOOLEAN:
- return std::hash<bool>{}(mValue.BooleanVal);
+ return std::hash<bool>{}(value_.BooleanVal);
case PredicateDataType::STRING:
- return std::hash<std::string>{}(std::string(mValue.Buffer, mSize));
+ return std::hash<std::string>{}(std::string(value_.Buffer, size_));
case PredicateDataType::DECIMAL:
// current glibc does not support hash<int128_t>
- return std::hash<int64_t>{}(mValue.IntVal);
+ return std::hash<int64_t>{}(value_.IntVal);
default:
return 0;
}
@@ -225,30 +225,30 @@ namespace orc {
if (this == &r) {
return true;
}
- if (mHashCode != r.mHashCode || mType != r.mType || mIsNull != r.mIsNull) {
+ if (hashCode_ != r.hashCode_ || type_ != r.type_ || isNull_ != r.isNull_) {
return false;
}
- if (mIsNull) {
+ if (isNull_) {
return true;
}
- switch (mType) {
+ switch (type_) {
case PredicateDataType::LONG:
- return mValue.IntVal == r.mValue.IntVal;
+ return value_.IntVal == r.value_.IntVal;
case PredicateDataType::DATE:
- return mValue.DateVal == r.mValue.DateVal;
+ return value_.DateVal == r.value_.DateVal;
case PredicateDataType::TIMESTAMP:
- return mValue.TimeStampVal == r.mValue.TimeStampVal;
+ return value_.TimeStampVal == r.value_.TimeStampVal;
case PredicateDataType::FLOAT:
- return std::fabs(mValue.DoubleVal - r.mValue.DoubleVal) <
+ return std::fabs(value_.DoubleVal - r.value_.DoubleVal) <
std::numeric_limits<double>::epsilon();
case PredicateDataType::BOOLEAN:
- return mValue.BooleanVal == r.mValue.BooleanVal;
+ return value_.BooleanVal == r.value_.BooleanVal;
case PredicateDataType::STRING:
- return mSize == r.mSize && memcmp(mValue.Buffer, r.mValue.Buffer, mSize) == 0;
+ return size_ == r.size_ && memcmp(value_.Buffer, r.value_.Buffer, size_) == 0;
case PredicateDataType::DECIMAL:
- return mValue.DecimalVal == r.mValue.DecimalVal;
+ return value_.DecimalVal == r.value_.DecimalVal;
default:
return true;
}
@@ -269,38 +269,38 @@ namespace orc {
}
int64_t Literal::getLong() const {
- validate(mIsNull, mType, PredicateDataType::LONG);
- return mValue.IntVal;
+ validate(isNull_, type_, PredicateDataType::LONG);
+ return value_.IntVal;
}
int64_t Literal::getDate() const {
- validate(mIsNull, mType, PredicateDataType::DATE);
- return mValue.DateVal;
+ validate(isNull_, type_, PredicateDataType::DATE);
+ return value_.DateVal;
}
Literal::Timestamp Literal::getTimestamp() const {
- validate(mIsNull, mType, PredicateDataType::TIMESTAMP);
- return mValue.TimeStampVal;
+ validate(isNull_, type_, PredicateDataType::TIMESTAMP);
+ return value_.TimeStampVal;
}
double Literal::getFloat() const {
- validate(mIsNull, mType, PredicateDataType::FLOAT);
- return mValue.DoubleVal;
+ validate(isNull_, type_, PredicateDataType::FLOAT);
+ return value_.DoubleVal;
}
std::string Literal::getString() const {
- validate(mIsNull, mType, PredicateDataType::STRING);
- return std::string(mValue.Buffer, mSize);
+ validate(isNull_, type_, PredicateDataType::STRING);
+ return std::string(value_.Buffer, size_);
}
bool Literal::getBool() const {
- validate(mIsNull, mType, PredicateDataType::BOOLEAN);
- return mValue.BooleanVal;
+ validate(isNull_, type_, PredicateDataType::BOOLEAN);
+ return value_.BooleanVal;
}
Decimal Literal::getDecimal() const {
- validate(mIsNull, mType, PredicateDataType::DECIMAL);
- return Decimal(mValue.DecimalVal, mScale);
+ validate(isNull_, type_, PredicateDataType::DECIMAL);
+ return Decimal(value_.DecimalVal, scale_);
}
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc
index 3c23e28beb..3ee58bfef5 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc
+++ b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc
@@ -30,77 +30,77 @@ namespace orc {
PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName,
Literal literal)
- : mOperator(op), mType(type), mColumnName(colName), mHasColumnName(true), mColumnId(0) {
- mLiterals.emplace_back(literal);
- mHashCode = hashCode();
+ : operator_(op), type_(type), columnName_(colName), hasColumnName_(true), columnId_(0) {
+ literals_.emplace_back(literal);
+ hashCode_ = hashCode();
validate();
}
PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId,
Literal literal)
- : mOperator(op), mType(type), mHasColumnName(false), mColumnId(columnId) {
- mLiterals.emplace_back(literal);
- mHashCode = hashCode();
+ : operator_(op), type_(type), hasColumnName_(false), columnId_(columnId) {
+ literals_.emplace_back(literal);
+ hashCode_ = hashCode();
validate();
}
PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName,
const std::initializer_list<Literal>& literals)
- : mOperator(op),
- mType(type),
- mColumnName(colName),
- mHasColumnName(true),
- mLiterals(literals.begin(), literals.end()) {
- mHashCode = hashCode();
+ : operator_(op),
+ type_(type),
+ columnName_(colName),
+ hasColumnName_(true),
+ literals_(literals.begin(), literals.end()) {
+ hashCode_ = hashCode();
validate();
}
PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId,
const std::initializer_list<Literal>& literals)
- : mOperator(op),
- mType(type),
- mHasColumnName(false),
- mColumnId(columnId),
- mLiterals(literals.begin(), literals.end()) {
- mHashCode = hashCode();
+ : operator_(op),
+ type_(type),
+ hasColumnName_(false),
+ columnId_(columnId),
+ literals_(literals.begin(), literals.end()) {
+ hashCode_ = hashCode();
validate();
}
PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName,
const std::vector<Literal>& literals)
- : mOperator(op),
- mType(type),
- mColumnName(colName),
- mHasColumnName(true),
- mLiterals(literals.begin(), literals.end()) {
- mHashCode = hashCode();
+ : operator_(op),
+ type_(type),
+ columnName_(colName),
+ hasColumnName_(true),
+ literals_(literals.begin(), literals.end()) {
+ hashCode_ = hashCode();
validate();
}
PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId,
const std::vector<Literal>& literals)
- : mOperator(op),
- mType(type),
- mHasColumnName(false),
- mColumnId(columnId),
- mLiterals(literals.begin(), literals.end()) {
- mHashCode = hashCode();
+ : operator_(op),
+ type_(type),
+ hasColumnName_(false),
+ columnId_(columnId),
+ literals_(literals.begin(), literals.end()) {
+ hashCode_ = hashCode();
validate();
}
void PredicateLeaf::validateColumn() const {
- if (mHasColumnName && mColumnName.empty()) {
+ if (hasColumnName_ && columnName_.empty()) {
throw std::invalid_argument("column name should not be empty");
- } else if (!mHasColumnName && mColumnId == INVALID_COLUMN_ID) {
+ } else if (!hasColumnName_ && columnId_ == INVALID_COLUMN_ID) {
throw std::invalid_argument("invalid column id");
}
}
void PredicateLeaf::validate() const {
- switch (mOperator) {
+ switch (operator_) {
case Operator::IS_NULL:
validateColumn();
- if (!mLiterals.empty()) {
+ if (!literals_.empty()) {
throw std::invalid_argument("No literal is required!");
}
break;
@@ -109,28 +109,28 @@ namespace orc {
case Operator::LESS_THAN:
case Operator::LESS_THAN_EQUALS:
validateColumn();
- if (mLiterals.size() != 1) {
+ if (literals_.size() != 1) {
throw std::invalid_argument("One literal is required!");
}
- if (static_cast<int>(mLiterals.at(0).getType()) != static_cast<int>(mType)) {
+ if (static_cast<int>(literals_.at(0).getType()) != static_cast<int>(type_)) {
throw std::invalid_argument("leaf and literal types do not match!");
}
break;
case Operator::IN:
validateColumn();
- if (mLiterals.size() < 2) {
+ if (literals_.size() < 2) {
throw std::invalid_argument("At least two literals are required!");
}
- for (auto literal : mLiterals) {
- if (static_cast<int>(literal.getType()) != static_cast<int>(mType)) {
+ for (auto literal : literals_) {
+ if (static_cast<int>(literal.getType()) != static_cast<int>(type_)) {
throw std::invalid_argument("leaf and literal types do not match!");
}
}
break;
case Operator::BETWEEN:
validateColumn();
- for (auto literal : mLiterals) {
- if (static_cast<int>(literal.getType()) != static_cast<int>(mType)) {
+ for (auto literal : literals_) {
+ if (static_cast<int>(literal.getType()) != static_cast<int>(type_)) {
throw std::invalid_argument("leaf and literal types do not match!");
}
}
@@ -141,40 +141,40 @@ namespace orc {
}
PredicateLeaf::Operator PredicateLeaf::getOperator() const {
- return mOperator;
+ return operator_;
}
PredicateDataType PredicateLeaf::getType() const {
- return mType;
+ return type_;
}
bool PredicateLeaf::hasColumnName() const {
- return mHasColumnName;
+ return hasColumnName_;
}
/**
* Get the simple column name.
*/
const std::string& PredicateLeaf::getColumnName() const {
- return mColumnName;
+ return columnName_;
}
uint64_t PredicateLeaf::getColumnId() const {
- return mColumnId;
+ return columnId_;
}
/**
* Get the literal half of the predicate leaf.
*/
Literal PredicateLeaf::getLiteral() const {
- return mLiterals.at(0);
+ return literals_.at(0);
}
/**
* For operators with multiple literals (IN and BETWEEN), get the literals.
*/
const std::vector<Literal>& PredicateLeaf::getLiteralList() const {
- return mLiterals;
+ return literals_;
}
static std::string getLiteralString(const std::vector<Literal>& literals) {
@@ -195,40 +195,40 @@ namespace orc {
}
std::string PredicateLeaf::columnDebugString() const {
- if (mHasColumnName) return mColumnName;
+ if (hasColumnName_) return columnName_;
std::ostringstream sstream;
- sstream << "column(id=" << mColumnId << ')';
+ sstream << "column(id=" << columnId_ << ')';
return sstream.str();
}
std::string PredicateLeaf::toString() const {
std::ostringstream sstream;
sstream << '(';
- switch (mOperator) {
+ switch (operator_) {
case Operator::IS_NULL:
sstream << columnDebugString() << " is null";
break;
case Operator::EQUALS:
- sstream << columnDebugString() << " = " << getLiteralString(mLiterals);
+ sstream << columnDebugString() << " = " << getLiteralString(literals_);
break;
case Operator::NULL_SAFE_EQUALS:
- sstream << columnDebugString() << " null_safe_= " << getLiteralString(mLiterals);
+ sstream << columnDebugString() << " null_safe_= " << getLiteralString(literals_);
break;
case Operator::LESS_THAN:
- sstream << columnDebugString() << " < " << getLiteralString(mLiterals);
+ sstream << columnDebugString() << " < " << getLiteralString(literals_);
break;
case Operator::LESS_THAN_EQUALS:
- sstream << columnDebugString() << " <= " << getLiteralString(mLiterals);
+ sstream << columnDebugString() << " <= " << getLiteralString(literals_);
break;
case Operator::IN:
- sstream << columnDebugString() << " in " << getLiteralsString(mLiterals);
+ sstream << columnDebugString() << " in " << getLiteralsString(literals_);
break;
case Operator::BETWEEN:
- sstream << columnDebugString() << " between " << getLiteralsString(mLiterals);
+ sstream << columnDebugString() << " between " << getLiteralsString(literals_);
break;
default:
sstream << "unknown operator, column: " << columnDebugString()
- << ", literals: " << getLiteralsString(mLiterals);
+ << ", literals: " << getLiteralsString(literals_);
}
sstream << ')';
return sstream.str();
@@ -236,25 +236,25 @@ namespace orc {
size_t PredicateLeaf::hashCode() const {
size_t value = 0;
- std::for_each(mLiterals.cbegin(), mLiterals.cend(),
+ std::for_each(literals_.cbegin(), literals_.cend(),
[&](const Literal& literal) { value = value * 17 + literal.getHashCode(); });
auto colHash =
- mHasColumnName ? std::hash<std::string>{}(mColumnName) : std::hash<uint64_t>{}(mColumnId);
- return value * 103 * 101 * 3 * 17 + std::hash<int>{}(static_cast<int>(mOperator)) +
- std::hash<int>{}(static_cast<int>(mType)) * 17 + colHash * 3 * 17;
+ hasColumnName_ ? std::hash<std::string>{}(columnName_) : std::hash<uint64_t>{}(columnId_);
+ return value * 103 * 101 * 3 * 17 + std::hash<int>{}(static_cast<int>(operator_)) +
+ std::hash<int>{}(static_cast<int>(type_)) * 17 + colHash * 3 * 17;
}
bool PredicateLeaf::operator==(const PredicateLeaf& r) const {
if (this == &r) {
return true;
}
- if (mHashCode != r.mHashCode || mType != r.mType || mOperator != r.mOperator ||
- mHasColumnName != r.mHasColumnName || mColumnName != r.mColumnName ||
- mColumnId != r.mColumnId || mLiterals.size() != r.mLiterals.size()) {
+ if (hashCode_ != r.hashCode_ || type_ != r.type_ || operator_ != r.operator_ ||
+ hasColumnName_ != r.hasColumnName_ || columnName_ != r.columnName_ ||
+ columnId_ != r.columnId_ || literals_.size() != r.literals_.size()) {
return false;
}
- for (size_t i = 0; i != mLiterals.size(); ++i) {
- if (mLiterals[i] != r.mLiterals[i]) {
+ for (size_t i = 0; i != literals_.size(); ++i) {
+ if (literals_[i] != r.literals_[i]) {
return false;
}
}
@@ -507,12 +507,12 @@ namespace orc {
TruthValue PredicateLeaf::evaluatePredicateMinMax(const proto::ColumnStatistics& colStats) const {
TruthValue result = TruthValue::YES_NO_NULL;
- switch (mType) {
+ switch (type_) {
case PredicateDataType::LONG: {
if (colStats.has_int_statistics() && colStats.int_statistics().has_minimum() &&
colStats.int_statistics().has_maximum()) {
const auto& stats = colStats.int_statistics();
- result = evaluatePredicateRange(mOperator, literal2Long(mLiterals), stats.minimum(),
+ result = evaluatePredicateRange(operator_, literal2Long(literals_), stats.minimum(),
stats.maximum(), colStats.has_null());
}
break;
@@ -524,7 +524,7 @@ namespace orc {
if (!std::isfinite(stats.sum())) {
result = colStats.has_null() ? TruthValue::YES_NO_NULL : TruthValue::YES_NO;
} else {
- result = evaluatePredicateRange(mOperator, literal2Double(mLiterals), stats.minimum(),
+ result = evaluatePredicateRange(operator_, literal2Double(literals_), stats.minimum(),
stats.maximum(), colStats.has_null());
}
}
@@ -535,7 +535,7 @@ namespace orc {
if (colStats.has_string_statistics() && colStats.string_statistics().has_minimum() &&
colStats.string_statistics().has_maximum()) {
const auto& stats = colStats.string_statistics();
- result = evaluatePredicateRange(mOperator, literal2String(mLiterals), stats.minimum(),
+ result = evaluatePredicateRange(operator_, literal2String(literals_), stats.minimum(),
stats.maximum(), colStats.has_null());
}
break;
@@ -544,7 +544,7 @@ namespace orc {
if (colStats.has_date_statistics() && colStats.date_statistics().has_minimum() &&
colStats.date_statistics().has_maximum()) {
const auto& stats = colStats.date_statistics();
- result = evaluatePredicateRange(mOperator, literal2Date(mLiterals), stats.minimum(),
+ result = evaluatePredicateRange(operator_, literal2Date(literals_), stats.minimum(),
stats.maximum(), colStats.has_null());
}
break;
@@ -566,7 +566,7 @@ namespace orc {
Literal::Timestamp maxTimestamp(
stats.maximum_utc() / 1000,
static_cast<int32_t>((stats.maximum_utc() % 1000) * 1000000) + maxNano);
- result = evaluatePredicateRange(mOperator, literal2Timestamp(mLiterals), minTimestamp,
+ result = evaluatePredicateRange(operator_, literal2Timestamp(literals_), minTimestamp,
maxTimestamp, colStats.has_null());
}
break;
@@ -575,7 +575,7 @@ namespace orc {
if (colStats.has_decimal_statistics() && colStats.decimal_statistics().has_minimum() &&
colStats.decimal_statistics().has_maximum()) {
const auto& stats = colStats.decimal_statistics();
- result = evaluatePredicateRange(mOperator, literal2Decimal(mLiterals),
+ result = evaluatePredicateRange(operator_, literal2Decimal(literals_),
Decimal(stats.minimum()), Decimal(stats.maximum()),
colStats.has_null());
}
@@ -583,7 +583,7 @@ namespace orc {
}
case PredicateDataType::BOOLEAN: {
if (colStats.has_bucket_statistics()) {
- result = evaluateBoolPredicate(mOperator, mLiterals, colStats);
+ result = evaluateBoolPredicate(operator_, literals_, colStats);
}
break;
}
@@ -592,8 +592,8 @@ namespace orc {
}
// make sure null literal is respected for IN operator
- if (mOperator == Operator::IN && colStats.has_null()) {
- for (const auto& literal : mLiterals) {
+ if (operator_ == Operator::IN && colStats.has_null()) {
+ for (const auto& literal : literals_) {
if (literal.isNull()) {
result = TruthValue::YES_NO_NULL;
break;
@@ -664,18 +664,18 @@ namespace orc {
}
TruthValue PredicateLeaf::evaluatePredicateBloomFiter(const BloomFilter* bf, bool hasNull) const {
- switch (mOperator) {
+ switch (operator_) {
case Operator::NULL_SAFE_EQUALS:
// null safe equals does not return *_NULL variant.
// So set hasNull to false
- return checkInBloomFilter(mOperator, mType, mLiterals.front(), bf, false);
+ return checkInBloomFilter(operator_, type_, literals_.front(), bf, false);
case Operator::EQUALS:
- return checkInBloomFilter(mOperator, mType, mLiterals.front(), bf, hasNull);
+ return checkInBloomFilter(operator_, type_, literals_.front(), bf, hasNull);
case Operator::IN:
- for (const auto& literal : mLiterals) {
+ for (const auto& literal : literals_) {
// if at least one value in IN list exist in bloom filter,
// qualify the row group/stripe
- TruthValue result = checkInBloomFilter(mOperator, mType, literal, bf, hasNull);
+ TruthValue result = checkInBloomFilter(operator_, type_, literal, bf, hasNull);
if (result == TruthValue::YES_NO_NULL || result == TruthValue::YES_NO) {
return result;
}
@@ -695,7 +695,7 @@ namespace orc {
const BloomFilter* bloomFilter) const {
// files written before ORC-135 stores timestamp wrt to local timezone
// causing issues with PPD. disable PPD for timestamp for all old files
- if (mType == PredicateDataType::TIMESTAMP) {
+ if (type_ == PredicateDataType::TIMESTAMP) {
if (writerVersion < WriterVersion::WriterVersion_ORC_135) {
return TruthValue::YES_NO_NULL;
}
@@ -705,9 +705,9 @@ namespace orc {
if (!colStats.has_has_null()) return TruthValue::YES_NO_NULL;
bool allNull = colStats.has_null() && colStats.number_of_values() == 0;
- if (mOperator == Operator::IS_NULL ||
- ((mOperator == Operator::EQUALS || mOperator == Operator::NULL_SAFE_EQUALS) &&
- mLiterals.at(0).isNull())) {
+ if (operator_ == Operator::IS_NULL ||
+ ((operator_ == Operator::EQUALS || operator_ == Operator::NULL_SAFE_EQUALS) &&
+ literals_.at(0).isNull())) {
// IS_NULL operator does not need to check min/max stats and bloom filter
return allNull ? TruthValue::YES
: (colStats.has_null() ? TruthValue::YES_NO : TruthValue::NO);
@@ -717,7 +717,7 @@ namespace orc {
}
TruthValue result = evaluatePredicateMinMax(colStats);
- if (shouldEvaluateBloomFilter(mOperator, result, bloomFilter)) {
+ if (shouldEvaluateBloomFilter(operator_, result, bloomFilter)) {
return evaluatePredicateBloomFiter(bloomFilter, colStats.has_null());
} else {
return result;
diff --git a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh
index 21ed456155..81fd6d98b7 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh
+++ b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh
@@ -127,7 +127,7 @@ namespace orc {
bool operator==(const PredicateLeaf& r) const;
size_t getHashCode() const {
- return mHashCode;
+ return hashCode_;
}
private:
@@ -143,13 +143,13 @@ namespace orc {
TruthValue evaluatePredicateBloomFiter(const BloomFilter* bloomFilter, bool hasNull) const;
private:
- Operator mOperator;
- PredicateDataType mType;
- std::string mColumnName;
- bool mHasColumnName;
- uint64_t mColumnId;
- std::vector<Literal> mLiterals;
- size_t mHashCode;
+ Operator operator_;
+ PredicateDataType type_;
+ std::string columnName_;
+ bool hasColumnName_;
+ uint64_t columnId_;
+ std::vector<Literal> literals_;
+ size_t hashCode_;
};
struct PredicateLeafHash {
diff --git a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc
index 0e369bf453..b3085964d4 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc
+++ b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc
@@ -40,24 +40,24 @@ namespace orc {
SargsApplier::SargsApplier(const Type& type, const SearchArgument* searchArgument,
uint64_t rowIndexStride, WriterVersion writerVersion,
ReaderMetrics* metrics, const SchemaEvolution* schemaEvolution)
- : mType(type),
- mSearchArgument(searchArgument),
- mSchemaEvolution(schemaEvolution),
- mRowIndexStride(rowIndexStride),
- mWriterVersion(writerVersion),
- mHasEvaluatedFileStats(false),
- mFileStatsEvalResult(true),
- mMetrics(metrics) {
- const SearchArgumentImpl* sargs = dynamic_cast<const SearchArgumentImpl*>(mSearchArgument);
+ : type_(type),
+ searchArgument_(searchArgument),
+ schemaEvolution_(schemaEvolution),
+ rowIndexStride_(rowIndexStride),
+ writerVersion_(writerVersion),
+ hasEvaluatedFileStats_(false),
+ fileStatsEvalResult_(true),
+ metrics_(metrics) {
+ const SearchArgumentImpl* sargs = dynamic_cast<const SearchArgumentImpl*>(searchArgument_);
// find the mapping from predicate leaves to columns
const std::vector<PredicateLeaf>& leaves = sargs->getLeaves();
- mFilterColumns.resize(leaves.size(), INVALID_COLUMN_ID);
- for (size_t i = 0; i != mFilterColumns.size(); ++i) {
+ filterColumns_.resize(leaves.size(), INVALID_COLUMN_ID);
+ for (size_t i = 0; i != filterColumns_.size(); ++i) {
if (leaves[i].hasColumnName()) {
- mFilterColumns[i] = findColumn(type, leaves[i].getColumnName());
+ filterColumns_[i] = findColumn(type, leaves[i].getColumnName());
} else {
- mFilterColumns[i] = leaves[i].getColumnId();
+ filterColumns_[i] = leaves[i].getColumnId();
}
}
}
@@ -66,30 +66,30 @@ namespace orc {
const std::unordered_map<uint64_t, proto::RowIndex>& rowIndexes,
const std::map<uint32_t, BloomFilterIndex>& bloomFilters) {
// init state of each row group
- uint64_t groupsInStripe = (rowsInStripe + mRowIndexStride - 1) / mRowIndexStride;
- mNextSkippedRows.resize(groupsInStripe);
- mTotalRowsInStripe = rowsInStripe;
+ uint64_t groupsInStripe = (rowsInStripe + rowIndexStride_ - 1) / rowIndexStride_;
+ nextSkippedRows_.resize(groupsInStripe);
+ totalRowsInStripe_ = rowsInStripe;
// row indexes do not exist, simply read all rows
if (rowIndexes.empty()) {
return true;
}
- const auto& leaves = dynamic_cast<const SearchArgumentImpl*>(mSearchArgument)->getLeaves();
+ const auto& leaves = dynamic_cast<const SearchArgumentImpl*>(searchArgument_)->getLeaves();
std::vector<TruthValue> leafValues(leaves.size(), TruthValue::YES_NO_NULL);
- mHasSelected = false;
- mHasSkipped = false;
+ hasSelected_ = false;
+ hasSkipped_ = false;
uint64_t nextSkippedRowGroup = groupsInStripe;
size_t rowGroup = groupsInStripe;
do {
--rowGroup;
for (size_t pred = 0; pred != leaves.size(); ++pred) {
- uint64_t columnIdx = mFilterColumns[pred];
+ uint64_t columnIdx = filterColumns_[pred];
auto rowIndexIter = rowIndexes.find(columnIdx);
if (columnIdx == INVALID_COLUMN_ID || rowIndexIter == rowIndexes.cend()) {
// this column does not exist in current file
leafValues[pred] = TruthValue::YES_NO_NULL;
- } else if (mSchemaEvolution && !mSchemaEvolution->isSafePPDConversion(columnIdx)) {
+ } else if (schemaEvolution_ && !schemaEvolution_->isSafePPDConversion(columnIdx)) {
// cannot evaluate predicate when ppd is not safe
leafValues[pred] = TruthValue::YES_NO_NULL;
} else {
@@ -104,37 +104,37 @@ namespace orc {
bloomFilter = iter->second.entries.at(rowGroup);
}
- leafValues[pred] = leaves[pred].evaluate(mWriterVersion, statistics, bloomFilter.get());
+ leafValues[pred] = leaves[pred].evaluate(writerVersion_, statistics, bloomFilter.get());
}
}
- bool needed = isNeeded(mSearchArgument->evaluate(leafValues));
+ bool needed = isNeeded(searchArgument_->evaluate(leafValues));
if (!needed) {
- mNextSkippedRows[rowGroup] = 0;
+ nextSkippedRows_[rowGroup] = 0;
nextSkippedRowGroup = rowGroup;
} else {
- mNextSkippedRows[rowGroup] = (nextSkippedRowGroup == groupsInStripe)
+ nextSkippedRows_[rowGroup] = (nextSkippedRowGroup == groupsInStripe)
? rowsInStripe
- : (nextSkippedRowGroup * mRowIndexStride);
+ : (nextSkippedRowGroup * rowIndexStride_);
}
- mHasSelected |= needed;
- mHasSkipped |= !needed;
+ hasSelected_ |= needed;
+ hasSkipped_ |= !needed;
} while (rowGroup != 0);
// update stats
uint64_t selectedRGs = std::accumulate(
- mNextSkippedRows.cbegin(), mNextSkippedRows.cend(), 0UL,
+ nextSkippedRows_.cbegin(), nextSkippedRows_.cend(), 0UL,
[](uint64_t initVal, uint64_t rg) { return rg > 0 ? initVal + 1 : initVal; });
- if (mMetrics != nullptr) {
- mMetrics->SelectedRowGroupCount.fetch_add(selectedRGs);
- mMetrics->EvaluatedRowGroupCount.fetch_add(groupsInStripe);
+ if (metrics_ != nullptr) {
+ metrics_->SelectedRowGroupCount.fetch_add(selectedRGs);
+ metrics_->EvaluatedRowGroupCount.fetch_add(groupsInStripe);
}
- return mHasSelected;
+ return hasSelected_;
}
bool SargsApplier::evaluateColumnStatistics(const PbColumnStatistics& colStats) const {
- const SearchArgumentImpl* sargs = dynamic_cast<const SearchArgumentImpl*>(mSearchArgument);
+ const SearchArgumentImpl* sargs = dynamic_cast<const SearchArgumentImpl*>(searchArgument_);
if (sargs == nullptr) {
throw InvalidArgument("Failed to cast to SearchArgumentImpl");
}
@@ -143,14 +143,14 @@ namespace orc {
std::vector<TruthValue> leafValues(leaves.size(), TruthValue::YES_NO_NULL);
for (size_t pred = 0; pred != leaves.size(); ++pred) {
- uint64_t columnId = mFilterColumns[pred];
+ uint64_t columnId = filterColumns_[pred];
if (columnId != INVALID_COLUMN_ID && colStats.size() > static_cast<int>(columnId)) {
- leafValues[pred] = leaves[pred].evaluate(mWriterVersion,
+ leafValues[pred] = leaves[pred].evaluate(writerVersion_,
colStats.Get(static_cast<int>(columnId)), nullptr);
}
}
- return isNeeded(mSearchArgument->evaluate(leafValues));
+ return isNeeded(searchArgument_->evaluate(leafValues));
}
bool SargsApplier::evaluateStripeStatistics(const proto::StripeStatistics& stripeStats,
@@ -160,29 +160,29 @@ namespace orc {
}
bool ret = evaluateColumnStatistics(stripeStats.col_stats());
- if (mMetrics != nullptr) {
- mMetrics->EvaluatedRowGroupCount.fetch_add(stripeRowGroupCount);
+ if (metrics_ != nullptr) {
+ metrics_->EvaluatedRowGroupCount.fetch_add(stripeRowGroupCount);
}
if (!ret) {
// reset mNextSkippedRows when the current stripe does not satisfy the PPD
- mNextSkippedRows.clear();
+ nextSkippedRows_.clear();
}
return ret;
}
bool SargsApplier::evaluateFileStatistics(const proto::Footer& footer,
uint64_t numRowGroupsInStripeRange) {
- if (!mHasEvaluatedFileStats) {
+ if (!hasEvaluatedFileStats_) {
if (footer.statistics_size() == 0) {
- mFileStatsEvalResult = true;
+ fileStatsEvalResult_ = true;
} else {
- mFileStatsEvalResult = evaluateColumnStatistics(footer.statistics());
- if (mMetrics != nullptr) {
- mMetrics->EvaluatedRowGroupCount.fetch_add(numRowGroupsInStripeRange);
+ fileStatsEvalResult_ = evaluateColumnStatistics(footer.statistics());
+ if (metrics_ != nullptr) {
+ metrics_->EvaluatedRowGroupCount.fetch_add(numRowGroupsInStripeRange);
}
}
- mHasEvaluatedFileStats = true;
+ hasEvaluatedFileStats_ = true;
}
- return mFileStatsEvalResult;
+ return fileStatsEvalResult_;
}
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh
index 73703dcf6b..65c8dec83b 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh
+++ b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh
@@ -75,30 +75,30 @@ namespace orc {
* Only valid after invoking pickRowGroups().
*/
const std::vector<uint64_t>& getNextSkippedRows() const {
- return mNextSkippedRows;
+ return nextSkippedRows_;
}
/**
* Indicate whether any row group is selected in the last evaluation
*/
bool hasSelected() const {
- return mHasSelected;
+ return hasSelected_;
}
/**
* Indicate whether any row group is skipped in the last evaluation
*/
bool hasSkipped() const {
- return mHasSkipped;
+ return hasSkipped_;
}
/**
* Whether any row group from current row in the stripe matches PPD.
*/
bool hasSelectedFrom(uint64_t currentRowInStripe) const {
- uint64_t rg = currentRowInStripe / mRowIndexStride;
- for (; rg < mNextSkippedRows.size(); ++rg) {
- if (mNextSkippedRows[rg]) {
+ uint64_t rg = currentRowInStripe / rowIndexStride_;
+ for (; rg < nextSkippedRows_.size(); ++rg) {
+ if (nextSkippedRows_[rg]) {
return true;
}
}
@@ -106,9 +106,9 @@ namespace orc {
}
std::pair<uint64_t, uint64_t> getStats() const {
- if (mMetrics != nullptr) {
- return std::make_pair(mMetrics->SelectedRowGroupCount.load(),
- mMetrics->EvaluatedRowGroupCount.load());
+ if (metrics_ != nullptr) {
+ return std::make_pair(metrics_->SelectedRowGroupCount.load(),
+ metrics_->EvaluatedRowGroupCount.load());
} else {
return {0, 0};
}
@@ -125,27 +125,27 @@ namespace orc {
static uint64_t findColumn(const Type& type, const std::string& colName);
private:
- const Type& mType;
- const SearchArgument* mSearchArgument;
- const SchemaEvolution* mSchemaEvolution;
- uint64_t mRowIndexStride;
- WriterVersion mWriterVersion;
+ const Type& type_;
+ const SearchArgument* searchArgument_;
+ const SchemaEvolution* schemaEvolution_;
+ uint64_t rowIndexStride_;
+ WriterVersion writerVersion_;
// column ids for each predicate leaf in the search argument
- std::vector<uint64_t> mFilterColumns;
+ std::vector<uint64_t> filterColumns_;
// Map from RowGroup index to the next skipped row of the selected range it
// locates. If the RowGroup is not selected, set the value to 0.
// Calculated in pickRowGroups().
- std::vector<uint64_t> mNextSkippedRows;
- uint64_t mTotalRowsInStripe;
- bool mHasSelected;
- bool mHasSkipped;
+ std::vector<uint64_t> nextSkippedRows_;
+ uint64_t totalRowsInStripe_;
+ bool hasSelected_;
+ bool hasSkipped_;
// store result of file stats evaluation
- bool mHasEvaluatedFileStats;
- bool mFileStatsEvalResult;
+ bool hasEvaluatedFileStats_;
+ bool fileStatsEvalResult_;
// use the SelectedRowGroupCount and EvaluatedRowGroupCount to
// keep stats of selected RGs and evaluated RGs
- ReaderMetrics* mMetrics;
+ ReaderMetrics* metrics_;
};
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc
index 806727f0a0..83d4af2435 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc
+++ b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc
@@ -30,23 +30,23 @@ namespace orc {
}
const std::vector<PredicateLeaf>& SearchArgumentImpl::getLeaves() const {
- return mLeaves;
+ return leaves_;
}
const ExpressionTree* SearchArgumentImpl::getExpression() const {
- return mExpressionTree.get();
+ return expressionTree_.get();
}
TruthValue SearchArgumentImpl::evaluate(const std::vector<TruthValue>& leaves) const {
- return mExpressionTree == nullptr ? TruthValue::YES : mExpressionTree->evaluate(leaves);
+ return expressionTree_ == nullptr ? TruthValue::YES : expressionTree_->evaluate(leaves);
}
std::string SearchArgumentImpl::toString() const {
std::ostringstream sstream;
- for (size_t i = 0; i != mLeaves.size(); ++i) {
- sstream << "leaf-" << i << " = " << mLeaves.at(i).toString() << ", ";
+ for (size_t i = 0; i != leaves_.size(); ++i) {
+ sstream << "leaf-" << i << " = " << leaves_.at(i).toString() << ", ";
}
- sstream << "expr = " << mExpressionTree->toString();
+ sstream << "expr = " << expressionTree_->toString();
return sstream.str();
}
@@ -55,14 +55,14 @@ namespace orc {
}
SearchArgumentBuilderImpl::SearchArgumentBuilderImpl() {
- mRoot.reset(new ExpressionTree(ExpressionTree::Operator::AND));
- mCurrTree.push_back(mRoot);
+ root_.reset(new ExpressionTree(ExpressionTree::Operator::AND));
+ currTree_.push_back(root_);
}
SearchArgumentBuilder& SearchArgumentBuilderImpl::start(ExpressionTree::Operator op) {
TreeNode node = std::make_shared<ExpressionTree>(op);
- mCurrTree.front()->addChild(node);
- mCurrTree.push_front(node);
+ currTree_.front()->addChild(node);
+ currTree_.push_front(node);
return *this;
}
@@ -79,9 +79,9 @@ namespace orc {
}
SearchArgumentBuilder& SearchArgumentBuilderImpl::end() {
- TreeNode& current = mCurrTree.front();
+ TreeNode& current = currTree_.front();
if (current->getChildren().empty()) {
- throw std::invalid_argument("Cannot create expression " + mRoot->toString() +
+ throw std::invalid_argument("Cannot create expression " + root_->toString() +
" with no children.");
}
if (current->getOperator() == ExpressionTree::Operator::NOT &&
@@ -89,13 +89,13 @@ namespace orc {
throw std::invalid_argument("Can't create NOT expression " + current->toString() +
" with more than 1 child.");
}
- mCurrTree.pop_front();
+ currTree_.pop_front();
return *this;
}
size_t SearchArgumentBuilderImpl::addLeaf(PredicateLeaf leaf) {
- size_t id = mLeaves.size();
- const auto& result = mLeaves.insert(std::make_pair(leaf, id));
+ size_t id = leaves_.size();
+ const auto& result = leaves_.insert(std::make_pair(leaf, id));
return result.first->second;
}
@@ -112,7 +112,7 @@ namespace orc {
T column,
PredicateDataType type,
Literal literal) {
- TreeNode parent = mCurrTree.front();
+ TreeNode parent = currTree_.front();
if (isInvalidColumn(column)) {
parent->addChild(std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL));
} else {
@@ -181,7 +181,7 @@ namespace orc {
template <typename T, typename CONTAINER>
SearchArgumentBuilder& SearchArgumentBuilderImpl::addChildForIn(T column, PredicateDataType type,
const CONTAINER& literals) {
- TreeNode& parent = mCurrTree.front();
+ TreeNode& parent = currTree_.front();
if (isInvalidColumn(column)) {
parent->addChild(std::make_shared<ExpressionTree>((TruthValue::YES_NO_NULL)));
} else {
@@ -219,7 +219,7 @@ namespace orc {
template <typename T>
SearchArgumentBuilder& SearchArgumentBuilderImpl::addChildForIsNull(T column,
PredicateDataType type) {
- TreeNode& parent = mCurrTree.front();
+ TreeNode& parent = currTree_.front();
if (isInvalidColumn(column)) {
parent->addChild(std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL));
} else {
@@ -244,7 +244,7 @@ namespace orc {
PredicateDataType type,
Literal lower,
Literal upper) {
- TreeNode& parent = mCurrTree.front();
+ TreeNode& parent = currTree_.front();
if (isInvalidColumn(column)) {
parent->addChild(std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL));
} else {
@@ -267,7 +267,7 @@ namespace orc {
}
SearchArgumentBuilder& SearchArgumentBuilderImpl::literal(TruthValue truth) {
- TreeNode& parent = mCurrTree.front();
+ TreeNode& parent = currTree_.front();
parent->addChild(std::make_shared<ExpressionTree>(truth));
return *this;
}
@@ -555,34 +555,34 @@ namespace orc {
}
SearchArgumentImpl::SearchArgumentImpl(TreeNode root, const std::vector<PredicateLeaf>& leaves)
- : mExpressionTree(root), mLeaves(leaves) {
+ : expressionTree_(root), leaves_(leaves) {
// PASS
}
std::unique_ptr<SearchArgument> SearchArgumentBuilderImpl::build() {
- if (mCurrTree.size() != 1) {
- throw std::invalid_argument("Failed to end " + std::to_string(mCurrTree.size()) +
+ if (currTree_.size() != 1) {
+ throw std::invalid_argument("Failed to end " + std::to_string(currTree_.size()) +
" operations.");
}
- mRoot = pushDownNot(mRoot);
- mRoot = foldMaybe(mRoot);
- mRoot = flatten(mRoot);
- mRoot = convertToCNF(mRoot);
- mRoot = flatten(mRoot);
- std::vector<size_t> leafReorder(mLeaves.size(), UNUSED_LEAF);
- size_t newLeafCount = compactLeaves(mRoot, 0, leafReorder.data());
- mRoot = rewriteLeaves(mRoot, leafReorder.data());
+ root_ = pushDownNot(root_);
+ root_ = foldMaybe(root_);
+ root_ = flatten(root_);
+ root_ = convertToCNF(root_);
+ root_ = flatten(root_);
+ std::vector<size_t> leafReorder(leaves_.size(), UNUSED_LEAF);
+ size_t newLeafCount = compactLeaves(root_, 0, leafReorder.data());
+ root_ = rewriteLeaves(root_, leafReorder.data());
std::vector<PredicateLeaf> leafList(newLeafCount, PredicateLeaf());
// build the new list
- for (auto& leaf : mLeaves) {
+ for (auto& leaf : leaves_) {
size_t newLoc = leafReorder[leaf.second];
if (newLoc != UNUSED_LEAF) {
leafList[newLoc] = leaf.first;
}
}
- return std::make_unique<SearchArgumentImpl>(mRoot, leafList);
+ return std::make_unique<SearchArgumentImpl>(root_, leafList);
}
std::unique_ptr<SearchArgumentBuilder> SearchArgumentFactory::newBuilder() {
diff --git a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh
index 4b74b28743..1963c993d6 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh
+++ b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh
@@ -66,8 +66,8 @@ namespace orc {
std::string toString() const override;
private:
- std::shared_ptr<ExpressionTree> mExpressionTree;
- std::vector<PredicateLeaf> mLeaves;
+ std::shared_ptr<ExpressionTree> expressionTree_;
+ std::vector<PredicateLeaf> leaves_;
};
/**
@@ -304,9 +304,9 @@ namespace orc {
static TreeNode convertToCNF(TreeNode root);
private:
- std::deque<TreeNode> mCurrTree;
- std::unordered_map<PredicateLeaf, size_t, PredicateLeafHash, PredicateLeafComparator> mLeaves;
- std::shared_ptr<ExpressionTree> mRoot;
+ std::deque<TreeNode> currTree_;
+ std::unordered_map<PredicateLeaf, size_t, PredicateLeafHash, PredicateLeafComparator> leaves_;
+ std::shared_ptr<ExpressionTree> root_;
};
} // namespace orc
diff --git a/contrib/libs/apache/orc/ya.make b/contrib/libs/apache/orc/ya.make
index 12617d59ab..b757fec915 100644
--- a/contrib/libs/apache/orc/ya.make
+++ b/contrib/libs/apache/orc/ya.make
@@ -6,9 +6,9 @@ LICENSE(Apache-2.0)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-VERSION(2.0.3)
+VERSION(2.1.0)
-ORIGINAL_SOURCE(https://github.com/apache/orc/archive/rel/release-2.0.3.tar.gz)
+ORIGINAL_SOURCE(https://github.com/apache/orc/archive/rel/release-2.1.0.tar.gz)
PEERDIR(
contrib/libs/apache/orc-format
@@ -65,6 +65,7 @@ SRCS(
c++/src/TypeImpl.cc
c++/src/Vector.cc
c++/src/Writer.cc
+ c++/src/io/Cache.cc
c++/src/io/InputStream.cc
c++/src/io/OutputStream.cc
c++/src/sargs/ExpressionTree.cc