aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/apache/orc
diff options
context:
space:
mode:
authorthegeorg <thegeorg@yandex-team.com>2024-03-17 04:47:32 +0300
committerthegeorg <thegeorg@yandex-team.com>2024-03-17 04:57:12 +0300
commit0816a937aebb4bb8ff5d68730c625cb1c99c9b4b (patch)
tree45dd2b2d18017590838384a1a7687279ac280444 /contrib/libs/apache/orc
parent6d5eb3aff8e43031b7dcb8be42d649799cd8a6c3 (diff)
downloadydb-0816a937aebb4bb8ff5d68730c625cb1c99c9b4b.tar.gz
Update contrib/libs/apache/orc to 2.0.0
28031d32eb02ad8a790abc416b7db3264738c474
Diffstat (limited to 'contrib/libs/apache/orc')
-rw-r--r--contrib/libs/apache/orc/README.md30
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh8
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh16
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Common.hh58
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Exceptions.hh38
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Int128.hh100
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh81
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/OrcFile.hh42
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Reader.hh135
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Statistics.hh78
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Type.hh54
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Vector.hh261
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Writer.hh90
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/orc-config.hh68
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh66
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/sargs/SearchArgument.hh62
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/sargs/TruthValue.hh18
-rw-r--r--contrib/libs/apache/orc/c++/src/Adaptor-linux.hh69
-rw-r--r--contrib/libs/apache/orc/c++/src/Adaptor.cc56
-rw-r--r--contrib/libs/apache/orc/c++/src/BlockBuffer.cc131
-rw-r--r--contrib/libs/apache/orc/c++/src/BlockBuffer.hh124
-rw-r--r--contrib/libs/apache/orc/c++/src/BloomFilter.cc85
-rw-r--r--contrib/libs/apache/orc/c++/src/BloomFilter.hh37
-rw-r--r--contrib/libs/apache/orc/c++/src/Bpacking.hh34
-rw-r--r--contrib/libs/apache/orc/c++/src/BpackingDefault.cc368
-rw-r--r--contrib/libs/apache/orc/c++/src/BpackingDefault.hh59
-rw-r--r--contrib/libs/apache/orc/c++/src/ByteRLE.cc208
-rw-r--r--contrib/libs/apache/orc/c++/src/ByteRLE.hh27
-rw-r--r--contrib/libs/apache/orc/c++/src/ColumnPrinter.cc477
-rw-r--r--contrib/libs/apache/orc/c++/src/ColumnReader.cc1323
-rw-r--r--contrib/libs/apache/orc/c++/src/ColumnReader.hh47
-rw-r--r--contrib/libs/apache/orc/c++/src/ColumnWriter.cc1330
-rw-r--r--contrib/libs/apache/orc/c++/src/ColumnWriter.hh66
-rw-r--r--contrib/libs/apache/orc/c++/src/Common.cc18
-rw-r--r--contrib/libs/apache/orc/c++/src/Compression.cc949
-rw-r--r--contrib/libs/apache/orc/c++/src/Compression.hh24
-rw-r--r--contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc1001
-rw-r--r--contrib/libs/apache/orc/c++/src/ConvertColumnReader.hh53
-rw-r--r--contrib/libs/apache/orc/c++/src/CpuInfoUtil.cc589
-rw-r--r--contrib/libs/apache/orc/c++/src/CpuInfoUtil.hh113
-rw-r--r--contrib/libs/apache/orc/c++/src/Dispatch.hh110
-rw-r--r--contrib/libs/apache/orc/c++/src/Exceptions.cc51
-rw-r--r--contrib/libs/apache/orc/c++/src/Int128.cc283
-rw-r--r--contrib/libs/apache/orc/c++/src/LzoDecompressor.cc89
-rw-r--r--contrib/libs/apache/orc/c++/src/LzoDecompressor.hh8
-rw-r--r--contrib/libs/apache/orc/c++/src/MemoryPool.cc155
-rw-r--r--contrib/libs/apache/orc/c++/src/Murmur3.cc18
-rw-r--r--contrib/libs/apache/orc/c++/src/Murmur3.hh10
-rw-r--r--contrib/libs/apache/orc/c++/src/Options.hh92
-rw-r--r--contrib/libs/apache/orc/c++/src/OrcFile.cc63
-rw-r--r--contrib/libs/apache/orc/c++/src/RLE.cc99
-rw-r--r--contrib/libs/apache/orc/c++/src/RLE.hh66
-rw-r--r--contrib/libs/apache/orc/c++/src/RLEV2Util.cc91
-rw-r--r--contrib/libs/apache/orc/c++/src/RLEV2Util.hh38
-rw-r--r--contrib/libs/apache/orc/c++/src/RLEv1.cc440
-rw-r--r--contrib/libs/apache/orc/c++/src/RLEv1.hh84
-rw-r--r--contrib/libs/apache/orc/c++/src/RLEv2.hh385
-rw-r--r--contrib/libs/apache/orc/c++/src/Reader.cc743
-rw-r--r--contrib/libs/apache/orc/c++/src/Reader.hh141
-rw-r--r--contrib/libs/apache/orc/c++/src/RleDecoderV2.cc1020
-rw-r--r--contrib/libs/apache/orc/c++/src/RleEncoderV2.cc794
-rw-r--r--contrib/libs/apache/orc/c++/src/SchemaEvolution.cc255
-rw-r--r--contrib/libs/apache/orc/c++/src/SchemaEvolution.hh64
-rw-r--r--contrib/libs/apache/orc/c++/src/Statistics.cc280
-rw-r--r--contrib/libs/apache/orc/c++/src/Statistics.hh592
-rw-r--r--contrib/libs/apache/orc/c++/src/StripeStream.cc108
-rw-r--r--contrib/libs/apache/orc/c++/src/StripeStream.hh117
-rw-r--r--contrib/libs/apache/orc/c++/src/Timezone.cc340
-rw-r--r--contrib/libs/apache/orc/c++/src/Timezone.hh25
-rw-r--r--contrib/libs/apache/orc/c++/src/TypeImpl.cc771
-rw-r--r--contrib/libs/apache/orc/c++/src/TypeImpl.hh69
-rw-r--r--contrib/libs/apache/orc/c++/src/Utils.hh75
-rw-r--r--contrib/libs/apache/orc/c++/src/Vector.cc251
-rw-r--r--contrib/libs/apache/orc/c++/src/Writer.cc402
-rw-r--r--contrib/libs/apache/orc/c++/src/io/InputStream.cc61
-rw-r--r--contrib/libs/apache/orc/c++/src/io/InputStream.hh52
-rw-r--r--contrib/libs/apache/orc/c++/src/io/OutputStream.cc53
-rw-r--r--contrib/libs/apache/orc/c++/src/io/OutputStream.hh53
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc46
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh8
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/Literal.cc30
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc324
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh62
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc104
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh72
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc173
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh99
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/TruthValue.cc2
-rw-r--r--contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h29
-rw-r--r--contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh49
-rw-r--r--contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h25
-rw-r--r--contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h31
-rw-r--r--contrib/libs/apache/orc/proto/orc_proto.proto451
-rw-r--r--contrib/libs/apache/orc/ya.make16
94 files changed, 10066 insertions, 8196 deletions
diff --git a/contrib/libs/apache/orc/README.md b/contrib/libs/apache/orc/README.md
index a7d959247e..60b0da5fcb 100644
--- a/contrib/libs/apache/orc/README.md
+++ b/contrib/libs/apache/orc/README.md
@@ -18,9 +18,9 @@ lists, maps, and unions.
This project includes both a Java library and a C++ library for reading and writing the _Optimized Row Columnar_ (ORC) file format. The C++ and Java libraries are completely independent of each other and will each read all versions of ORC files.
Releases:
-* Latest: <a href="http://orc.apache.org/releases">Apache ORC releases</a>
-* Maven Central: <a href="http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.orc%22">![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.apache.orc/orc/badge.svg)</a>
-* Downloads: <a href="http://orc.apache.org/downloads">Apache ORC downloads</a>
+* Latest: <a href="https://orc.apache.org/releases">Apache ORC releases</a>
+* Maven Central: <a href="https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.orc%22">![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.apache.orc/orc/badge.svg)</a>
+* Downloads: <a href="https://orc.apache.org/downloads">Apache ORC downloads</a>
* Release tags: <a href="https://github.com/apache/orc/releases">Apache ORC release tags</a>
* Plan: <a href="https://github.com/apache/orc/milestones">Apache ORC future release plan</a>
@@ -28,7 +28,7 @@ The current build status:
* Main branch <a href="https://github.com/apache/orc/actions/workflows/build_and_test.yml?query=branch%3Amain">
![main build status](https://github.com/apache/orc/actions/workflows/build_and_test.yml/badge.svg?branch=main)</a>
-Bug tracking: <a href="http://orc.apache.org/bugs">Apache Jira</a>
+Bug tracking: <a href="https://orc.apache.org/bugs">Apache Jira</a>
The subdirectories are:
@@ -37,15 +37,14 @@ The subdirectories are:
* docker - docker scripts to build and test on various linuxes
* examples - various ORC example files that are used to test compatibility
* java - the java reader and writer
-* proto - the protocol buffer definition for the ORC metadata
* site - the website and documentation
* tools - the c++ tools for reading and inspecting ORC files
### Building
-* Install java 1.8 or higher
-* Install maven 3.8.6 or higher
-* Install cmake
+* Install java 17 or higher
+* Install maven 3.9.6 or higher
+* Install cmake 3.12 or higher
To build a release version with debug information:
```shell
@@ -93,3 +92,18 @@ To build only the C++ library:
% make test-out
```
+
+To build the C++ library with AVX512 enabled:
+```shell
+export ORC_USER_SIMD_LEVEL=AVX512
+% mkdir build
+% cd build
+% cmake .. -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON
+% make package
+% make test-out
+```
+The CMake option BUILD_ENABLE_AVX512 can be set to "ON" or "OFF" (the default) at compile time. It determines whether the AVX512 SIMD level is compiled into the binaries.
+
+The environment variable ORC_USER_SIMD_LEVEL can be set to "AVX512" or "NONE" (the default) at run time. It selects the SIMD level used to dispatch code that can apply SIMD optimization.
+
+Note that if ORC_USER_SIMD_LEVEL is set to "NONE" at run time, AVX512 will not take effect at run time even if BUILD_ENABLE_AVX512 is set to "ON" at compile time.
diff --git a/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh b/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh
index 91277392c7..d08f6deac7 100644
--- a/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh
@@ -27,11 +27,11 @@
namespace orc {
class BloomFilter {
- public:
+ public:
virtual ~BloomFilter();
// test if the element exists in BloomFilter
- virtual bool testBytes(const char * data, int64_t length) const = 0;
+ virtual bool testBytes(const char* data, int64_t length) const = 0;
virtual bool testLong(int64_t data) const = 0;
virtual bool testDouble(double data) const = 0;
};
@@ -40,6 +40,6 @@ namespace orc {
std::vector<std::shared_ptr<BloomFilter>> entries;
};
-}
+} // namespace orc
-#endif //ORC_BLOOMFILTER_HH
+#endif // ORC_BLOOMFILTER_HH
diff --git a/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh b/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh
index aa19214738..328c0e84b6 100644
--- a/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh
@@ -19,12 +19,11 @@
#ifndef ORC_COLUMN_PRINTER_HH
#define ORC_COLUMN_PRINTER_HH
-#include "orc/orc-config.hh"
#include "orc/OrcFile.hh"
#include "orc/Vector.hh"
+#include "orc/orc-config.hh"
#include <stdio.h>
-#include <string>
#include <memory>
#include <string>
#include <vector>
@@ -32,12 +31,12 @@
namespace orc {
class ColumnPrinter {
- protected:
- std::string &buffer;
- bool hasNulls ;
+ protected:
+ std::string& buffer;
+ bool hasNulls;
const char* notNull;
- public:
+ public:
ColumnPrinter(std::string&);
virtual ~ColumnPrinter();
virtual void printRow(uint64_t rowId) = 0;
@@ -45,7 +44,6 @@ namespace orc {
virtual void reset(const ColumnVectorBatch& batch);
};
- ORC_UNIQUE_PTR<ColumnPrinter> createColumnPrinter(std::string&,
- const Type* type);
-}
+ std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string&, const Type* type);
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/include/orc/Common.hh b/contrib/libs/apache/orc/c++/include/orc/Common.hh
index e51e37e710..9da67a3f19 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Common.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Common.hh
@@ -19,47 +19,45 @@
#ifndef ORC_COMMON_HH
#define ORC_COMMON_HH
-#include "orc/Vector.hh"
-#include "orc/Type.hh"
#include "orc/Exceptions.hh"
+#include "orc/Type.hh"
+#include "orc/Vector.hh"
#include <string>
namespace orc {
class FileVersion {
- private:
+ private:
uint32_t majorVersion;
uint32_t minorVersion;
- public:
+
+ public:
static const FileVersion& v_0_11();
static const FileVersion& v_0_12();
static const FileVersion& UNSTABLE_PRE_2_0();
- FileVersion(uint32_t major, uint32_t minor) :
- majorVersion(major), minorVersion(minor) {
- }
+ FileVersion(uint32_t major, uint32_t minor) : majorVersion(major), minorVersion(minor) {}
/**
* Get major version
*/
uint32_t getMajor() const {
- return this->majorVersion;
+ return this->majorVersion;
}
/**
* Get minor version
*/
uint32_t getMinor() const {
- return this->minorVersion;
+ return this->minorVersion;
}
- bool operator == (const FileVersion & right) const {
- return this->majorVersion == right.getMajor() &&
- this->minorVersion == right.getMinor();
+ bool operator==(const FileVersion& right) const {
+ return this->majorVersion == right.getMajor() && this->minorVersion == right.getMinor();
}
- bool operator != (const FileVersion & right) const {
+ bool operator!=(const FileVersion& right) const {
return !(*this == right);
}
@@ -72,6 +70,7 @@ namespace orc {
PRESTO_WRITER = 2,
SCRITCHLEY_GO = 3,
TRINO_WRITER = 4,
+ CUDF_WRITER = 5,
UNKNOWN_WRITER = INT32_MAX
};
@@ -140,7 +139,7 @@ namespace orc {
std::string streamKindToString(StreamKind kind);
class StreamInformation {
- public:
+ public:
virtual ~StreamInformation();
virtual StreamKind getKind() const = 0;
@@ -159,7 +158,7 @@ namespace orc {
std::string columnEncodingKindToString(ColumnEncodingKind kind);
class StripeInformation {
- public:
+ public:
virtual ~StripeInformation();
/**
@@ -184,7 +183,7 @@ namespace orc {
* Get the length of the stripe's data.
* @return the number of bytes in the stripe
*/
- virtual uint64_t getDataLength()const = 0;
+ virtual uint64_t getDataLength() const = 0;
/**
* Get the length of the stripe's tail section, which contains its index.
@@ -206,8 +205,7 @@ namespace orc {
/**
* Get the StreamInformation for the given stream.
*/
- virtual ORC_UNIQUE_PTR<StreamInformation>
- getStreamInformation(uint64_t streamId) const = 0;
+ virtual std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId) const = 0;
/**
* Get the column encoding for the given column.
@@ -238,10 +236,8 @@ namespace orc {
template <>
inline bool compare(Decimal val1, Decimal val2) {
// compare integral parts
- Int128 integral1 = scaleDownInt128ByPowerOfTen(val1.value,
- val1.scale);
- Int128 integral2 = scaleDownInt128ByPowerOfTen(val2.value,
- val2.scale);
+ Int128 integral1 = scaleDownInt128ByPowerOfTen(val1.value, val1.scale);
+ Int128 integral2 = scaleDownInt128ByPowerOfTen(val2.value, val2.scale);
if (integral1 < integral2) {
return true;
@@ -253,25 +249,17 @@ namespace orc {
// unnecessary to check overflow here because the scaled number will not
// exceed original ones
bool overflow = false, positive = val1.value >= 0;
- val1.value -= scaleUpInt128ByPowerOfTen(integral1,
- val1.scale,
- overflow);
- val2.value -= scaleUpInt128ByPowerOfTen(integral2,
- val2.scale,
- overflow);
+ val1.value -= scaleUpInt128ByPowerOfTen(integral1, val1.scale, overflow);
+ val2.value -= scaleUpInt128ByPowerOfTen(integral2, val2.scale, overflow);
int32_t diff = val1.scale - val2.scale;
if (diff > 0) {
- val2.value = scaleUpInt128ByPowerOfTen(val2.value,
- diff,
- overflow);
+ val2.value = scaleUpInt128ByPowerOfTen(val2.value, diff, overflow);
if (overflow) {
return positive ? true : false;
}
} else {
- val1.value = scaleUpInt128ByPowerOfTen(val1.value,
- -diff,
- overflow);
+ val1.value = scaleUpInt128ByPowerOfTen(val1.value, -diff, overflow);
if (overflow) {
return positive ? false : true;
}
@@ -317,6 +305,6 @@ namespace orc {
return !(lhs != rhs);
}
-}
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh b/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh
index 9765d4fd6b..0536dbd164 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh
@@ -26,35 +26,47 @@
namespace orc {
- class NotImplementedYet: public std::logic_error {
- public:
+ class NotImplementedYet : public std::logic_error {
+ public:
explicit NotImplementedYet(const std::string& what_arg);
explicit NotImplementedYet(const char* what_arg);
- virtual ~NotImplementedYet() ORC_NOEXCEPT;
+ ~NotImplementedYet() noexcept override;
NotImplementedYet(const NotImplementedYet&);
- private:
+
+ private:
NotImplementedYet& operator=(const NotImplementedYet&);
};
- class ParseError: public std::runtime_error {
- public:
+ class ParseError : public std::runtime_error {
+ public:
explicit ParseError(const std::string& what_arg);
explicit ParseError(const char* what_arg);
- virtual ~ParseError() ORC_NOEXCEPT;
+ ~ParseError() noexcept override;
ParseError(const ParseError&);
- private:
+
+ private:
ParseError& operator=(const ParseError&);
};
- class InvalidArgument: public std::runtime_error {
- public:
+ class InvalidArgument : public std::runtime_error {
+ public:
explicit InvalidArgument(const std::string& what_arg);
explicit InvalidArgument(const char* what_arg);
- virtual ~InvalidArgument() ORC_NOEXCEPT;
+ ~InvalidArgument() noexcept override;
InvalidArgument(const InvalidArgument&);
- private:
+
+ private:
InvalidArgument& operator=(const InvalidArgument&);
};
-}
+
+ class SchemaEvolutionError : public std::logic_error {
+ public:
+ explicit SchemaEvolutionError(const std::string& what_arg);
+ explicit SchemaEvolutionError(const char* what_arg);
+ virtual ~SchemaEvolutionError() noexcept override;
+ SchemaEvolutionError(const SchemaEvolutionError&);
+ SchemaEvolutionError& operator=(const SchemaEvolutionError&) = delete;
+ };
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/include/orc/Int128.hh b/contrib/libs/apache/orc/c++/include/orc/Int128.hh
index 1f68b2b119..bcb4a58e22 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Int128.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Int128.hh
@@ -35,7 +35,7 @@ namespace orc {
*
*/
class Int128 {
- public:
+ public:
Int128() {
highbits = 0;
lowbits = 0;
@@ -110,7 +110,7 @@ namespace orc {
* @param right the number to add
* @return *this
*/
- Int128& operator+=(const Int128 &right) {
+ Int128& operator+=(const Int128& right) {
uint64_t sum = lowbits + right.lowbits;
highbits += right.highbits;
if (sum < lowbits) {
@@ -125,7 +125,7 @@ namespace orc {
* @param right the number to subtract
* @return *this
*/
- Int128& operator-=(const Int128 &right) {
+ Int128& operator-=(const Int128& right) {
uint64_t diff = lowbits - right.lowbits;
highbits -= right.highbits;
if (diff > lowbits) {
@@ -140,7 +140,7 @@ namespace orc {
* @param right the number to multiply by
* @return *this
*/
- Int128& operator*=(const Int128 &right);
+ Int128& operator*=(const Int128& right);
/**
* Divide this number by right and return the result. This operation is
@@ -154,14 +154,14 @@ namespace orc {
* @param right the number to divide by
* @param remainder the remainder after the division
*/
- Int128 divide(const Int128 &right, Int128& remainder) const;
+ Int128 divide(const Int128& right, Int128& remainder) const;
/**
* Logical or between two Int128.
* @param right the number to or in
* @return *this
*/
- Int128& operator|=(const Int128 &right) {
+ Int128& operator|=(const Int128& right) {
lowbits |= right.lowbits;
highbits |= right.highbits;
return *this;
@@ -172,7 +172,7 @@ namespace orc {
* @param right the number to and in
* @return *this
*/
- Int128& operator&=(const Int128 &right) {
+ Int128& operator&=(const Int128& right) {
lowbits &= right.lowbits;
highbits &= right.highbits;
return *this;
@@ -183,7 +183,7 @@ namespace orc {
* @param right the number to and in
* @return logical and result
*/
- Int128 operator&(const Int128 &right) {
+ Int128 operator&(const Int128& right) {
Int128 value = *this;
value &= right;
return value;
@@ -219,8 +219,7 @@ namespace orc {
if (bits < 64) {
lowbits >>= bits;
lowbits |= static_cast<uint64_t>(highbits << (64 - bits));
- highbits = static_cast<int64_t>
- (static_cast<uint64_t>(highbits) >> bits);
+ highbits = static_cast<int64_t>(static_cast<uint64_t>(highbits) >> bits);
} else if (bits < 128) {
lowbits = static_cast<uint64_t>(highbits >> (bits - 64));
highbits = highbits >= 0 ? 0 : -1l;
@@ -240,7 +239,7 @@ namespace orc {
return highbits != right.highbits || lowbits != right.lowbits;
}
- bool operator<(const Int128 &right) const {
+ bool operator<(const Int128& right) const {
if (highbits == right.highbits) {
return lowbits < right.lowbits;
} else {
@@ -248,7 +247,7 @@ namespace orc {
}
}
- bool operator<=(const Int128 &right) const {
+ bool operator<=(const Int128& right) const {
if (highbits == right.highbits) {
return lowbits <= right.lowbits;
} else {
@@ -256,7 +255,7 @@ namespace orc {
}
}
- bool operator>(const Int128 &right) const {
+ bool operator>(const Int128& right) const {
if (highbits == right.highbits) {
return lowbits > right.lowbits;
} else {
@@ -264,7 +263,7 @@ namespace orc {
}
}
- bool operator>=(const Int128 &right) const {
+ bool operator>=(const Int128& right) const {
if (highbits == right.highbits) {
return lowbits >= right.lowbits;
} else {
@@ -273,10 +272,8 @@ namespace orc {
}
uint32_t hash() const {
- return static_cast<uint32_t>(highbits >> 32) ^
- static_cast<uint32_t>(highbits) ^
- static_cast<uint32_t>(lowbits >> 32) ^
- static_cast<uint32_t>(lowbits);
+ return static_cast<uint32_t>(highbits >> 32) ^ static_cast<uint32_t>(highbits) ^
+ static_cast<uint32_t>(lowbits >> 32) ^ static_cast<uint32_t>(lowbits);
}
/**
@@ -284,17 +281,17 @@ namespace orc {
*/
bool fitsInLong() const {
switch (highbits) {
- case 0:
- return 0 == (lowbits & LONG_SIGN_BIT);
- case -1:
- return 0 != (lowbits & LONG_SIGN_BIT);
- default:
- return false;
+ case 0:
+ return 0 == (lowbits & LONG_SIGN_BIT);
+ case -1:
+ return 0 != (lowbits & LONG_SIGN_BIT);
+ default:
+ return false;
}
}
/**
- * Convert the value to a long and
+ * Convert the value to a long and throw std::range_error on overflow.
*/
int64_t toLong() const {
if (fitsInLong()) {
@@ -304,6 +301,11 @@ namespace orc {
}
/**
+ * Convert the value to a double; the return value may not be precise.
+ */
+ double toDouble() const;
+
+ /**
* Return the base 10 string representation of the integer.
*/
std::string toString() const;
@@ -316,8 +318,7 @@ namespace orc {
* @param trimTrailingZeros whether or not to trim trailing zeros
* @return converted string representation
*/
- std::string toDecimalString(int32_t scale = 0,
- bool trimTrailingZeros = false) const;
+ std::string toDecimalString(int32_t scale = 0, bool trimTrailingZeros = false) const;
/**
* Return the base 16 string representation of the two's complement with
@@ -329,14 +330,14 @@ namespace orc {
/**
* Get the high bits of the twos complement representation of the number.
*/
- int64_t getHighBits() {
+ int64_t getHighBits() const {
return highbits;
}
/**
* Get the low bits of the twos complement representation of the number.
*/
- uint64_t getLowBits() {
+ uint64_t getLowBits() const {
return lowbits;
}
@@ -347,15 +348,14 @@ namespace orc {
* @param wasNegative set to true if the original number was negative
* @return the number of elements that were set in the array (1 to 4)
*/
- int64_t fillInArray(uint32_t* array, bool &wasNegative) const;
+ int64_t fillInArray(uint32_t* array, bool& wasNegative) const;
- private:
+ private:
static const uint64_t LONG_SIGN_BIT = 0x8000000000000000u;
int64_t highbits;
uint64_t lowbits;
};
-
/**
* Scales up an Int128 value
* @param value the Int128 value to scale
@@ -363,9 +363,7 @@ namespace orc {
* @param overflow returns whether the result overflows or not
* @return the scaled value
*/
- Int128 scaleUpInt128ByPowerOfTen(Int128 value,
- int32_t power,
- bool &overflow);
+ Int128 scaleUpInt128ByPowerOfTen(Int128 value, int32_t power, bool& overflow);
/**
* Scales down an Int128 value
* @param value the Int128 value to scale
@@ -373,5 +371,35 @@ namespace orc {
* @return the scaled value
*/
Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power);
-}
+
+ /**
+ * Converts decimal value to different precision/scale
+ * @param value the Int128 value to convert
+ * @param fromScale the scale of the value
+ * @param toPrecision the precision to convert to
+ * @param toScale the scale to convert to
+ * @param round whether to round the value or truncate
+ * @return whether the conversion overflows, and the converted value if it does not overflow
+ */
+ std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale, int32_t toPrecision,
+ int32_t toScale, bool round = true);
+
+ /**
+ * Converts a float value to decimal
+ * @param value the float value to convert
+ * @param precision the precision of the decimal
+ * @param scale the scale of the decimal
+ * @return whether the conversion overflows, and the converted value if it does not overflow
+ */
+ template <typename T>
+ std::enable_if_t<std::is_floating_point_v<T>, std::pair<bool, Int128>> convertDecimal(
+ T value, int32_t precision, int32_t scale);
+
+ extern template std::pair<bool, Int128> convertDecimal<float>(float value, int32_t precision,
+ int32_t scale);
+
+ extern template std::pair<bool, Int128> convertDecimal<double>(double value, int32_t precision,
+ int32_t scale);
+
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh b/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh
index 71d76c438a..6d999d3aa8 100644
--- a/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh
@@ -19,15 +19,13 @@
#ifndef MEMORYPOOL_HH_
#define MEMORYPOOL_HH_
-#include "orc/orc-config.hh"
-#include "orc/Int128.hh"
-
#include <memory>
-
+#include "orc/Int128.hh"
+#include "orc/orc-config.hh"
namespace orc {
class MemoryPool {
- public:
+ public:
virtual ~MemoryPool();
virtual char* malloc(uint64_t size) = 0;
@@ -37,7 +35,7 @@ namespace orc {
template <class T>
class DataBuffer {
- private:
+ private:
MemoryPool& memoryPool;
T* buf;
// current size
@@ -49,10 +47,10 @@ namespace orc {
DataBuffer(DataBuffer& buffer);
DataBuffer& operator=(DataBuffer& buffer);
- public:
+ public:
DataBuffer(MemoryPool& pool, uint64_t _size = 0);
- DataBuffer(DataBuffer<T>&& buffer) ORC_NOEXCEPT;
+ DataBuffer(DataBuffer<T>&& buffer) noexcept;
virtual ~DataBuffer();
@@ -64,20 +62,25 @@ namespace orc {
return buf;
}
- uint64_t size() {
+ uint64_t size() const {
return currentSize;
}
- uint64_t capacity() {
+ uint64_t capacity() const {
return currentCapacity;
}
+ const T& operator[](uint64_t i) const {
+ return buf[i];
+ }
+
T& operator[](uint64_t i) {
return buf[i];
}
void reserve(uint64_t _size);
void resize(uint64_t _size);
+ void zeroOut();
};
// Specializations for char
@@ -104,6 +107,14 @@ namespace orc {
template <>
void DataBuffer<double>::resize(uint64_t newSize);
+ // Specializations for float
+
+ template <>
+ DataBuffer<float>::~DataBuffer();
+
+ template <>
+ void DataBuffer<float>::resize(uint64_t newSize);
+
// Specializations for int64_t
template <>
@@ -112,6 +123,30 @@ namespace orc {
template <>
void DataBuffer<int64_t>::resize(uint64_t newSize);
+ // Specializations for int32_t
+
+ template <>
+ DataBuffer<int32_t>::~DataBuffer();
+
+ template <>
+ void DataBuffer<int32_t>::resize(uint64_t newSize);
+
+ // Specializations for int16_t
+
+ template <>
+ DataBuffer<int16_t>::~DataBuffer();
+
+ template <>
+ void DataBuffer<int16_t>::resize(uint64_t newSize);
+
+ // Specializations for int8_t
+
+ template <>
+ DataBuffer<int8_t>::~DataBuffer();
+
+ template <>
+ void DataBuffer<int8_t>::resize(uint64_t newSize);
+
// Specializations for uint64_t
template <>
@@ -128,23 +163,31 @@ namespace orc {
template <>
void DataBuffer<unsigned char>::resize(uint64_t newSize);
- #ifdef __clang__
- #pragma clang diagnostic push
- #pragma clang diagnostic ignored "-Wweak-template-vtables"
- #endif
+ // Specializations for Int128
+
+ template <>
+ void DataBuffer<Int128>::zeroOut();
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wweak-template-vtables"
+#endif
extern template class DataBuffer<char>;
extern template class DataBuffer<char*>;
extern template class DataBuffer<double>;
+ extern template class DataBuffer<float>;
extern template class DataBuffer<Int128>;
extern template class DataBuffer<int64_t>;
+ extern template class DataBuffer<int32_t>;
+ extern template class DataBuffer<int16_t>;
+ extern template class DataBuffer<int8_t>;
extern template class DataBuffer<uint64_t>;
extern template class DataBuffer<unsigned char>;
- #ifdef __clang__
- #pragma clang diagnostic pop
- #endif
-} // namespace orc
-
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+} // namespace orc
#endif /* MEMORYPOOL_HH_ */
diff --git a/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh b/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh
index c64853168a..6e4a07bf7c 100644
--- a/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh
@@ -21,9 +21,9 @@
#include <string>
-#include "orc/orc-config.hh"
#include "orc/Reader.hh"
#include "orc/Writer.hh"
+#include "orc/orc-config.hh"
/** /file orc/OrcFile.hh
@brief The top level interface to ORC.
@@ -35,7 +35,7 @@ namespace orc {
* An abstract interface for providing ORC readers a stream of bytes.
*/
class InputStream {
- public:
+ public:
virtual ~InputStream();
/**
@@ -56,9 +56,7 @@ namespace orc {
* @param length the number of bytes to read.
* @param offset the position in the stream to read from.
*/
- virtual void read(void* buf,
- uint64_t length,
- uint64_t offset) = 0;
+ virtual void read(void* buf, uint64_t length, uint64_t offset) = 0;
/**
* Get the name of the stream for error messages.
@@ -70,7 +68,7 @@ namespace orc {
* An abstract interface for providing ORC writer a stream of bytes.
*/
class OutputStream {
- public:
+ public:
virtual ~OutputStream();
/**
@@ -100,38 +98,50 @@ namespace orc {
* Close the stream and flush any pending data to the disk.
*/
virtual void close() = 0;
+
+ /**
+ * Flush any pending data to the disk.
+ */
+ virtual void flush() {
+ throw NotImplementedYet("Not supported");
+ }
};
/**
* Create a stream to a local file or HDFS file if path begins with "hdfs://"
* @param path the name of the file in the local file system or HDFS
+ * @param metrics the metrics of the reader
*/
- ORC_UNIQUE_PTR<InputStream> readFile(const std::string& path);
+ std::unique_ptr<InputStream> readFile(const std::string& path, ReaderMetrics* metrics = nullptr);
/**
* Create a stream to a local file.
* @param path the name of the file in the local file system
+ * @param metrics the metrics of the reader
*/
- ORC_UNIQUE_PTR<InputStream> readLocalFile(const std::string& path);
+ std::unique_ptr<InputStream> readLocalFile(const std::string& path,
+ ReaderMetrics* metrics = nullptr);
/**
* Create a stream to an HDFS file.
* @param path the uri of the file in HDFS
+ * @param metrics the metrics of the reader
*/
- ORC_UNIQUE_PTR<InputStream> readHdfsFile(const std::string& path);
+ std::unique_ptr<InputStream> readHdfsFile(const std::string& path,
+ ReaderMetrics* metrics = nullptr);
/**
* Create a reader to read the ORC file.
* @param stream the stream to read
* @param options the options for reading the file
*/
- ORC_UNIQUE_PTR<Reader> createReader(ORC_UNIQUE_PTR<InputStream> stream,
- const ReaderOptions& options);
+ std::unique_ptr<Reader> createReader(std::unique_ptr<InputStream> stream,
+ const ReaderOptions& options);
/**
* Create a stream to write to a local file.
* @param path the name of the file in the local file system
*/
- ORC_UNIQUE_PTR<OutputStream> writeLocalFile(const std::string& path);
+ std::unique_ptr<OutputStream> writeLocalFile(const std::string& path);
/**
* Create a writer to write the ORC file.
@@ -139,10 +149,8 @@ namespace orc {
* @param stream the stream to write to
* @param options the options for writing the file
*/
- ORC_UNIQUE_PTR<Writer> createWriter(
- const Type& type,
- OutputStream* stream,
- const WriterOptions& options);
-}
+ std::unique_ptr<Writer> createWriter(const Type& type, OutputStream* stream,
+ const WriterOptions& options);
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/include/orc/Reader.hh b/contrib/libs/apache/orc/c++/include/orc/Reader.hh
index ddc8b55055..b631c2c6ea 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Reader.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Reader.hh
@@ -21,12 +21,13 @@
#include "orc/BloomFilter.hh"
#include "orc/Common.hh"
-#include "orc/orc-config.hh"
#include "orc/Statistics.hh"
-#include "orc/sargs/SearchArgument.hh"
#include "orc/Type.hh"
#include "orc/Vector.hh"
+#include "orc/orc-config.hh"
+#include "orc/sargs/SearchArgument.hh"
+#include <atomic>
#include <map>
#include <memory>
#include <set>
@@ -40,13 +41,35 @@ namespace orc {
struct RowReaderOptionsPrivate;
/**
+ * Expose the reader metrics including the latency and
+ * number of calls of the decompression/decoding/IO modules.
+ */
+ struct ReaderMetrics {
+ std::atomic<uint64_t> ReaderCall{0};
+ // ReaderInclusiveLatencyUs contains the latency of
+ // the decompression/decoding/IO modules.
+ std::atomic<uint64_t> ReaderInclusiveLatencyUs{0};
+ std::atomic<uint64_t> DecompressionCall{0};
+ std::atomic<uint64_t> DecompressionLatencyUs{0};
+ std::atomic<uint64_t> DecodingCall{0};
+ std::atomic<uint64_t> DecodingLatencyUs{0};
+ std::atomic<uint64_t> ByteDecodingCall{0};
+ std::atomic<uint64_t> ByteDecodingLatencyUs{0};
+ std::atomic<uint64_t> IOCount{0};
+ std::atomic<uint64_t> IOBlockingLatencyUs{0};
+ std::atomic<uint64_t> SelectedRowGroupCount{0};
+ std::atomic<uint64_t> EvaluatedRowGroupCount{0};
+ };
+ ReaderMetrics* getDefaultReaderMetrics();
+
+ /**
* Options for creating a Reader.
*/
class ReaderOptions {
- private:
- ORC_UNIQUE_PTR<ReaderOptionsPrivate> privateBits;
+ private:
+ std::unique_ptr<ReaderOptionsPrivate> privateBits;
- public:
+ public:
ReaderOptions();
ReaderOptions(const ReaderOptions&);
ReaderOptions(ReaderOptions&);
@@ -77,6 +100,14 @@ namespace orc {
ReaderOptions& setMemoryPool(MemoryPool& pool);
/**
+ * Set the reader metrics.
+ *
+ * Defaults to nullptr.
+ * When set to nullptr, the reader metrics will be disabled.
+ */
+ ReaderOptions& setReaderMetrics(ReaderMetrics* metrics);
+
+ /**
* Set the location of the tail as defined by the logical length of the
* file.
*/
@@ -102,16 +133,21 @@ namespace orc {
* Get the memory allocator.
*/
MemoryPool* getMemoryPool() const;
+
+ /**
+ * Get the reader metrics.
+ */
+ ReaderMetrics* getReaderMetrics() const;
};
/**
* Options for creating a RowReader.
*/
class RowReaderOptions {
- private:
- ORC_UNIQUE_PTR<RowReaderOptionsPrivate> privateBits;
+ private:
+ std::unique_ptr<RowReaderOptionsPrivate> privateBits;
- public:
+ public:
RowReaderOptions();
RowReaderOptions(const RowReaderOptions&);
RowReaderOptions(RowReaderOptions&);
@@ -164,8 +200,7 @@ namespace orc {
* @param idReadIntentMap a map of IdReadIntentMap.
* @return this
*/
- RowReaderOptions&
- includeTypesWithIntents(const IdReadIntentMap& idReadIntentMap);
+ RowReaderOptions& includeTypesWithIntents(const IdReadIntentMap& idReadIntentMap);
/**
* Set the section of the file to process.
@@ -289,8 +324,39 @@ namespace orc {
* Get the IdReadIntentMap map that was supplied by client.
*/
const IdReadIntentMap getIdReadIntentMap() const;
- };
+ /**
+ * Set whether to use fixed width numeric vectorBatch or not, such as int32_t / int16_t / int8_t /
+ * float vectorBatch.
+ */
+ RowReaderOptions& setUseTightNumericVector(bool useTightNumericVector);
+
+ /**
+ * Get whether or not to use fixed width numeric columnVectorBatch.
+ * @return if not set, the default is false
+ */
+ bool getUseTightNumericVector() const;
+
+ /**
+ * Set read type for schema evolution
+ */
+ RowReaderOptions& setReadType(std::shared_ptr<Type> type);
+
+ /**
+ * Get read type for schema evolution
+ */
+ std::shared_ptr<Type>& getReadType() const;
+
+ /**
+ * Set whether reader throws or returns null when value overflows for schema evolution.
+ */
+ RowReaderOptions& throwOnSchemaEvolutionOverflow(bool shouldThrow);
+
+ /**
+ * Whether reader throws or returns null when value overflows for schema evolution.
+ */
+ bool getThrowOnSchemaEvolutionOverflow() const;
+ };
class RowReader;
@@ -299,7 +365,7 @@ namespace orc {
* This is an abstract class that will be subclassed as necessary.
*/
class Reader {
- public:
+ public:
virtual ~Reader();
/**
@@ -389,8 +455,7 @@ namespace orc {
* @param stripeIndex the index of the stripe (0 to N-1) to get information about
* @return the information about that stripe
*/
- virtual ORC_UNIQUE_PTR<StripeInformation>
- getStripe(uint64_t stripeIndex) const = 0;
+ virtual std::unique_ptr<StripeInformation> getStripe(uint64_t stripeIndex) const = 0;
/**
* Get the number of stripe statistics in the file.
@@ -403,8 +468,7 @@ namespace orc {
* @param stripeIndex the index of the stripe (0 to N-1) to get statistics about
* @return the statistics about that stripe
*/
- virtual ORC_UNIQUE_PTR<StripeStatistics>
- getStripeStatistics(uint64_t stripeIndex) const = 0;
+ virtual std::unique_ptr<StripeStatistics> getStripeStatistics(uint64_t stripeIndex) const = 0;
/**
* Get the length of the data stripes in the file.
@@ -440,15 +504,14 @@ namespace orc {
* Get the statistics about the columns in the file.
* @return the information about the column
*/
- virtual ORC_UNIQUE_PTR<Statistics> getStatistics() const = 0;
+ virtual std::unique_ptr<Statistics> getStatistics() const = 0;
/**
* Get the statistics about a single column in the file.
* @param columnId id of the column
* @return the information about the column
*/
- virtual ORC_UNIQUE_PTR<ColumnStatistics>
- getColumnStatistics(uint32_t columnId) const = 0;
+ virtual std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId) const = 0;
/**
* Check if the file has correct column statistics.
@@ -456,6 +519,12 @@ namespace orc {
virtual bool hasCorrectStatistics() const = 0;
/**
+ * Get metrics of the reader
+ * @return the accumulated reader metrics to current state.
+ */
+ virtual const ReaderMetrics* getReaderMetrics() const = 0;
+
+ /**
* Get the serialized file tail.
* Useful if another reader of the same file wants to avoid re-reading
* the file tail. See ReaderOptions.setSerializedFileTail().
@@ -474,14 +543,14 @@ namespace orc {
* Create a RowReader based on this reader with the default options.
* @return a RowReader to read the rows
*/
- virtual ORC_UNIQUE_PTR<RowReader> createRowReader() const = 0;
+ virtual std::unique_ptr<RowReader> createRowReader() const = 0;
/**
* Create a RowReader based on this reader.
* @param options RowReader Options
* @return a RowReader to read the rows
*/
- virtual ORC_UNIQUE_PTR<RowReader> createRowReader(const RowReaderOptions& options) const = 0;
+ virtual std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options) const = 0;
/**
* Get the name of the input stream.
@@ -493,13 +562,13 @@ namespace orc {
* based on the information in the file footer.
* The bound is less tight if only few columns are read or compression is
* used.
- */
+ */
/**
* @param stripeIx index of the stripe to be read (if not specified,
* all stripes are considered).
* @return upper bound on memory use by all columns
*/
- virtual uint64_t getMemoryUse(int stripeIx=-1) = 0;
+ virtual uint64_t getMemoryUse(int stripeIx = -1) = 0;
/**
* @param include Column Field Ids
@@ -507,7 +576,8 @@ namespace orc {
* all stripes are considered).
* @return upper bound on memory use by selected columns
*/
- virtual uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) = 0;
+ virtual uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include,
+ int stripeIx = -1) = 0;
/**
* @param names Column Names
@@ -515,7 +585,7 @@ namespace orc {
* all stripes are considered).
* @return upper bound on memory use by selected columns
*/
- virtual uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) = 0;
+ virtual uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx = -1) = 0;
/**
* @param include Column Type Ids
@@ -523,7 +593,8 @@ namespace orc {
* all stripes are considered).
* @return upper bound on memory use by selected columns
*/
- virtual uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) = 0;
+ virtual uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include,
+ int stripeIx = -1) = 0;
/**
* Get BloomFilters of all selected columns in the specified stripe
@@ -532,8 +603,8 @@ namespace orc {
* all columns that have bloom filters are considered).
* @return map of bloom filters with the key standing for the index of column.
*/
- virtual std::map<uint32_t, BloomFilterIndex>
- getBloomFilters(uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0;
+ virtual std::map<uint32_t, BloomFilterIndex> getBloomFilters(
+ uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0;
};
/**
@@ -541,7 +612,7 @@ namespace orc {
* This is an abstract class that will be subclassed as necessary.
*/
class RowReader {
- public:
+ public:
virtual ~RowReader();
/**
* Get the selected type of the rows in the file. The file's row type
@@ -563,8 +634,7 @@ namespace orc {
* @param size the number of rows to read
* @return a new ColumnVectorBatch to read into
*/
- virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size
- ) const = 0;
+ virtual std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size) const = 0;
/**
* Read the next row batch from the current position.
@@ -587,8 +657,7 @@ namespace orc {
* @param rowNumber the next row the reader should return
*/
virtual void seekToRow(uint64_t rowNumber) = 0;
-
};
-}
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/include/orc/Statistics.hh b/contrib/libs/apache/orc/c++/include/orc/Statistics.hh
index 4d7caeab3d..4ba8c35f7d 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Statistics.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Statistics.hh
@@ -19,9 +19,11 @@
#ifndef ORC_STATISTICS_HH
#define ORC_STATISTICS_HH
-#include "orc/orc-config.hh"
#include "orc/Type.hh"
#include "orc/Vector.hh"
+#include "orc/orc-config.hh"
+
+#include <sstream>
namespace orc {
@@ -29,7 +31,7 @@ namespace orc {
* Statistics that are available for all types of columns.
*/
class ColumnStatistics {
- public:
+ public:
virtual ~ColumnStatistics();
/**
@@ -54,9 +56,9 @@ namespace orc {
/**
* Statistics for binary columns.
*/
- class BinaryColumnStatistics: public ColumnStatistics {
- public:
- virtual ~BinaryColumnStatistics();
+ class BinaryColumnStatistics : public ColumnStatistics {
+ public:
+ ~BinaryColumnStatistics() override;
/**
* Check whether column has total length.
@@ -70,9 +72,9 @@ namespace orc {
/**
* Statistics for boolean columns.
*/
- class BooleanColumnStatistics: public ColumnStatistics {
- public:
- virtual ~BooleanColumnStatistics();
+ class BooleanColumnStatistics : public ColumnStatistics {
+ public:
+ ~BooleanColumnStatistics() override;
/**
* Check whether column has true/false count.
@@ -87,9 +89,9 @@ namespace orc {
/**
* Statistics for date columns.
*/
- class DateColumnStatistics: public ColumnStatistics {
- public:
- virtual ~DateColumnStatistics();
+ class DateColumnStatistics : public ColumnStatistics {
+ public:
+ ~DateColumnStatistics() override;
/**
* Check whether column has minimum.
@@ -119,9 +121,9 @@ namespace orc {
/**
* Statistics for decimal columns.
*/
- class DecimalColumnStatistics: public ColumnStatistics {
- public:
- virtual ~DecimalColumnStatistics();
+ class DecimalColumnStatistics : public ColumnStatistics {
+ public:
+ ~DecimalColumnStatistics() override;
/**
* Check whether column has minimum.
@@ -163,9 +165,9 @@ namespace orc {
/**
* Statistics for float and double columns.
*/
- class DoubleColumnStatistics: public ColumnStatistics {
- public:
- virtual ~DoubleColumnStatistics();
+ class DoubleColumnStatistics : public ColumnStatistics {
+ public:
+ ~DoubleColumnStatistics() override;
/**
* Check whether column has minimum.
@@ -210,9 +212,9 @@ namespace orc {
* Statistics for all of the integer columns, such as byte, short, int, and
* long.
*/
- class IntegerColumnStatistics: public ColumnStatistics {
- public:
- virtual ~IntegerColumnStatistics();
+ class IntegerColumnStatistics : public ColumnStatistics {
+ public:
+ ~IntegerColumnStatistics() override;
/**
* Check whether column has minimum.
@@ -256,9 +258,9 @@ namespace orc {
/**
* Statistics for string columns.
*/
- class StringColumnStatistics: public ColumnStatistics {
- public:
- virtual ~StringColumnStatistics();
+ class StringColumnStatistics : public ColumnStatistics {
+ public:
+ ~StringColumnStatistics() override;
/**
* Check whether column has minimum.
@@ -282,13 +284,13 @@ namespace orc {
* Get the minimum value for the column.
* @return minimum value
*/
- virtual const std::string & getMinimum() const = 0;
+ virtual const std::string& getMinimum() const = 0;
/**
* Get the maximum value for the column.
* @return maximum value
*/
- virtual const std::string & getMaximum() const = 0;
+ virtual const std::string& getMaximum() const = 0;
/**
* Get the total length of all values.
@@ -300,9 +302,9 @@ namespace orc {
/**
* Statistics for timestamp columns.
*/
- class TimestampColumnStatistics: public ColumnStatistics {
- public:
- virtual ~TimestampColumnStatistics();
+ class TimestampColumnStatistics : public ColumnStatistics {
+ public:
+ ~TimestampColumnStatistics() override;
/**
* Check whether minimum timestamp exists.
@@ -366,7 +368,7 @@ namespace orc {
};
class Statistics {
- public:
+ public:
virtual ~Statistics();
/**
@@ -374,8 +376,7 @@ namespace orc {
* @param colId id of the column
* @return one column's statistics
*/
- virtual const ColumnStatistics* getColumnStatistics(uint32_t colId
- ) const = 0;
+ virtual const ColumnStatistics* getColumnStatistics(uint32_t colId) const = 0;
/**
* Get the number of columns.
@@ -388,8 +389,8 @@ namespace orc {
* Statistics for all of collections such as Map and List.
*/
class CollectionColumnStatistics : public ColumnStatistics {
- public:
- virtual ~CollectionColumnStatistics();
+ public:
+ ~CollectionColumnStatistics() override;
/**
* check whether column has minimum number of children
@@ -453,8 +454,8 @@ namespace orc {
};
class StripeStatistics : public Statistics {
- public:
- virtual ~StripeStatistics();
+ public:
+ ~StripeStatistics() override;
/**
* Get the statistics of a given RowIndex entry in a given column.
@@ -462,9 +463,8 @@ namespace orc {
* @param rowIndexId RowIndex entry id
* @return statistics of the given RowIndex entry
*/
- virtual const ColumnStatistics*
- getRowIndexStatistics(
- uint32_t columnId, uint32_t rowIndexId) const = 0;
+ virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId,
+ uint32_t rowIndexId) const = 0;
/**
* Get the number of RowIndex statistics in a given column.
@@ -473,6 +473,6 @@ namespace orc {
*/
virtual uint32_t getNumberOfRowIndexStats(uint32_t columnId) const = 0;
};
-}
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/include/orc/Type.hh b/contrib/libs/apache/orc/c++/include/orc/Type.hh
index a7df8307e6..82e0e3cc86 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Type.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Type.hh
@@ -19,9 +19,9 @@
#ifndef ORC_TYPE_HH
#define ORC_TYPE_HH
-#include "orc/orc-config.hh"
-#include "orc/Vector.hh"
#include "MemoryPool.hh"
+#include "orc/Vector.hh"
+#include "orc/orc-config.hh"
namespace orc {
@@ -48,7 +48,7 @@ namespace orc {
};
class Type {
- public:
+ public:
virtual ~Type();
virtual uint64_t getColumnId() const = 0;
virtual uint64_t getMaximumColumnId() const = 0;
@@ -59,21 +59,28 @@ namespace orc {
virtual uint64_t getMaximumLength() const = 0;
virtual uint64_t getPrecision() const = 0;
virtual uint64_t getScale() const = 0;
- virtual Type& setAttribute(const std::string& key,
- const std::string& value) = 0;
+ virtual Type& setAttribute(const std::string& key, const std::string& value) = 0;
virtual bool hasAttributeKey(const std::string& key) const = 0;
virtual Type& removeAttribute(const std::string& key) = 0;
virtual std::vector<std::string> getAttributeKeys() const = 0;
virtual std::string getAttributeValue(const std::string& key) const = 0;
virtual std::string toString() const = 0;
+ /**
+ * Get the Type with the given column ID
+ * @param colId the column ID
+ * @return the type corresponding to the column Id, nullptr if not exists
+ */
+ virtual const Type* getTypeByColumnId(uint64_t colId) const = 0;
/**
* Create a row batch for this type.
*/
- virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size,
- MemoryPool& pool,
- bool encoded = false
- ) const = 0;
+ virtual std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size, MemoryPool& pool,
+ bool encoded = false) const = 0;
+
+ virtual std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size, MemoryPool& pool,
+ bool encoded,
+ bool useTightNumericVector) const = 0;
/**
* Add a new field to a struct type.
@@ -81,38 +88,33 @@ namespace orc {
* @param fieldType the type of the new field
* @return a reference to the struct type
*/
- virtual Type* addStructField(const std::string& fieldName,
- ORC_UNIQUE_PTR<Type> fieldType) = 0;
+ virtual Type* addStructField(const std::string& fieldName, std::unique_ptr<Type> fieldType) = 0;
/**
* Add a new child to a union type.
* @param fieldType the type of the new field
* @return a reference to the union type
*/
- virtual Type* addUnionChild(ORC_UNIQUE_PTR<Type> fieldType) = 0;
+ virtual Type* addUnionChild(std::unique_ptr<Type> fieldType) = 0;
/**
* Build a Type object from string text representation.
*/
- static ORC_UNIQUE_PTR<Type> buildTypeFromString(const std::string& input);
+ static std::unique_ptr<Type> buildTypeFromString(const std::string& input);
};
const int64_t DEFAULT_DECIMAL_SCALE = 18;
const int64_t DEFAULT_DECIMAL_PRECISION = 38;
- ORC_UNIQUE_PTR<Type> createPrimitiveType(TypeKind kind);
- ORC_UNIQUE_PTR<Type> createCharType(TypeKind kind,
- uint64_t maxLength);
- ORC_UNIQUE_PTR<Type>
- createDecimalType(uint64_t precision=
- DEFAULT_DECIMAL_PRECISION,
- uint64_t scale=DEFAULT_DECIMAL_SCALE);
+ std::unique_ptr<Type> createPrimitiveType(TypeKind kind);
+ std::unique_ptr<Type> createCharType(TypeKind kind, uint64_t maxLength);
+ std::unique_ptr<Type> createDecimalType(uint64_t precision = DEFAULT_DECIMAL_PRECISION,
+ uint64_t scale = DEFAULT_DECIMAL_SCALE);
- ORC_UNIQUE_PTR<Type> createStructType();
- ORC_UNIQUE_PTR<Type> createListType(ORC_UNIQUE_PTR<Type> elements);
- ORC_UNIQUE_PTR<Type> createMapType(ORC_UNIQUE_PTR<Type> key,
- ORC_UNIQUE_PTR<Type> value);
- ORC_UNIQUE_PTR<Type> createUnionType();
+ std::unique_ptr<Type> createStructType();
+ std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements);
+ std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key, std::unique_ptr<Type> value);
+ std::unique_ptr<Type> createUnionType();
-}
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/include/orc/Vector.hh b/contrib/libs/apache/orc/c++/include/orc/Vector.hh
index 752e1af78a..0dfe926965 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Vector.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Vector.hh
@@ -19,17 +19,17 @@
#ifndef ORC_VECTOR_HH
#define ORC_VECTOR_HH
-#include "orc/orc-config.hh"
-#include "MemoryPool.hh"
#include "Int128.hh"
+#include "MemoryPool.hh"
+#include "orc/orc-config.hh"
+#include <cstdlib>
+#include <cstring>
#include <list>
#include <memory>
-#include <cstring>
-#include <vector>
+#include <sstream>
#include <stdexcept>
-#include <cstdlib>
-#include <iostream>
+#include <vector>
namespace orc {
@@ -37,6 +37,11 @@ namespace orc {
* The base class for each of the column vectors. This class handles
* the generic attributes such as number of elements, capacity, and
* notNull vector.
+ * Note: If hasNull is false, the values in the notNull buffer are not required.
+ * On the writer side, it does not read values from notNull buffer so users are
+ * not expected to write notNull buffer if hasNull is false. On the reader side,
+ * it does not set notNull buffer if hasNull is false, meaning that it is undefined
+ * behavior to consume values from notNull buffer in this case by downstream users.
*/
struct ColumnVectorBatch {
ColumnVectorBatch(uint64_t capacity, MemoryPool& pool);
@@ -83,40 +88,128 @@ namespace orc {
*/
virtual bool hasVariableLength();
- private:
+ private:
ColumnVectorBatch(const ColumnVectorBatch&);
ColumnVectorBatch& operator=(const ColumnVectorBatch&);
};
- struct LongVectorBatch: public ColumnVectorBatch {
- LongVectorBatch(uint64_t capacity, MemoryPool& pool);
- virtual ~LongVectorBatch();
+ template <typename ValueType>
+ struct IntegerVectorBatch : public ColumnVectorBatch {
+ IntegerVectorBatch(uint64_t cap, MemoryPool& pool)
+ : ColumnVectorBatch(cap, pool), data(pool, cap) {
+ // PASS
+ }
+
+ ~IntegerVectorBatch() override = default;
- DataBuffer<int64_t> data;
- std::string toString() const;
- void resize(uint64_t capacity);
- void clear();
- uint64_t getMemoryUsage();
+ inline std::string toString() const override;
+
+ void resize(uint64_t cap) override {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ data.resize(cap);
+ }
+ }
+
+ void clear() override {
+ numElements = 0;
+ }
+
+ uint64_t getMemoryUsage() override {
+ return ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>(data.capacity() * sizeof(ValueType));
+ }
+
+ DataBuffer<ValueType> data;
};
- struct DoubleVectorBatch: public ColumnVectorBatch {
- DoubleVectorBatch(uint64_t capacity, MemoryPool& pool);
- virtual ~DoubleVectorBatch();
- std::string toString() const;
- void resize(uint64_t capacity);
- void clear();
- uint64_t getMemoryUsage();
+ using LongVectorBatch = IntegerVectorBatch<int64_t>;
+ using IntVectorBatch = IntegerVectorBatch<int32_t>;
+ using ShortVectorBatch = IntegerVectorBatch<int16_t>;
+ using ByteVectorBatch = IntegerVectorBatch<int8_t>;
+
+ template <>
+ inline std::string LongVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Long vector <" << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ template <>
+ inline std::string IntVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Int vector <" << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ template <>
+ inline std::string ShortVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Short vector <" << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ template <>
+ inline std::string ByteVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Byte vector <" << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ template <typename FloatType>
+ struct FloatingVectorBatch : public ColumnVectorBatch {
+ FloatingVectorBatch(uint64_t cap, MemoryPool& pool)
+ : ColumnVectorBatch(cap, pool), data(pool, cap) {
+ // PASS
+ }
- DataBuffer<double> data;
+ ~FloatingVectorBatch() override = default;
+
+ inline std::string toString() const override;
+
+ void resize(uint64_t cap) override {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ data.resize(cap);
+ }
+ }
+
+ void clear() override {
+ numElements = 0;
+ }
+
+ uint64_t getMemoryUsage() override {
+ return ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>(data.capacity() * sizeof(FloatType));
+ }
+
+ DataBuffer<FloatType> data;
};
- struct StringVectorBatch: public ColumnVectorBatch {
+ using DoubleVectorBatch = FloatingVectorBatch<double>;
+ using FloatVectorBatch = FloatingVectorBatch<float>;
+
+ template <>
+ inline std::string DoubleVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Double vector <" << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ template <>
+ inline std::string FloatVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Float vector <" << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ struct StringVectorBatch : public ColumnVectorBatch {
StringVectorBatch(uint64_t capacity, MemoryPool& pool);
- virtual ~StringVectorBatch();
- std::string toString() const;
- void resize(uint64_t capacity);
- void clear();
- uint64_t getMemoryUsage();
+ ~StringVectorBatch() override;
+ std::string toString() const override;
+ void resize(uint64_t capacity) override;
+ void clear() override;
+ uint64_t getMemoryUsage() override;
// pointers to the start of each string
DataBuffer<char*> data;
@@ -152,35 +245,35 @@ namespace orc {
*/
struct EncodedStringVectorBatch : public StringVectorBatch {
EncodedStringVectorBatch(uint64_t capacity, MemoryPool& pool);
- virtual ~EncodedStringVectorBatch();
- std::string toString() const;
- void resize(uint64_t capacity);
+ ~EncodedStringVectorBatch() override;
+ std::string toString() const override;
+ void resize(uint64_t capacity) override;
std::shared_ptr<StringDictionary> dictionary;
// index for dictionary entry
DataBuffer<int64_t> index;
};
- struct StructVectorBatch: public ColumnVectorBatch {
+ struct StructVectorBatch : public ColumnVectorBatch {
StructVectorBatch(uint64_t capacity, MemoryPool& pool);
- virtual ~StructVectorBatch();
- std::string toString() const;
- void resize(uint64_t capacity);
- void clear();
- uint64_t getMemoryUsage();
- bool hasVariableLength();
+ ~StructVectorBatch() override;
+ std::string toString() const override;
+ void resize(uint64_t capacity) override;
+ void clear() override;
+ uint64_t getMemoryUsage() override;
+ bool hasVariableLength() override;
std::vector<ColumnVectorBatch*> fields;
};
- struct ListVectorBatch: public ColumnVectorBatch {
+ struct ListVectorBatch : public ColumnVectorBatch {
ListVectorBatch(uint64_t capacity, MemoryPool& pool);
- virtual ~ListVectorBatch();
- std::string toString() const;
- void resize(uint64_t capacity);
- void clear();
- uint64_t getMemoryUsage();
- bool hasVariableLength();
+ ~ListVectorBatch() override;
+ std::string toString() const override;
+ void resize(uint64_t capacity) override;
+ void clear() override;
+ uint64_t getMemoryUsage() override;
+ bool hasVariableLength() override;
/**
* The offset of the first element of each list.
@@ -189,17 +282,17 @@ namespace orc {
DataBuffer<int64_t> offsets;
// the concatenated elements
- ORC_UNIQUE_PTR<ColumnVectorBatch> elements;
+ std::unique_ptr<ColumnVectorBatch> elements;
};
- struct MapVectorBatch: public ColumnVectorBatch {
+ struct MapVectorBatch : public ColumnVectorBatch {
MapVectorBatch(uint64_t capacity, MemoryPool& pool);
- virtual ~MapVectorBatch();
- std::string toString() const;
- void resize(uint64_t capacity);
- void clear();
- uint64_t getMemoryUsage();
- bool hasVariableLength();
+ ~MapVectorBatch() override;
+ std::string toString() const override;
+ void resize(uint64_t capacity) override;
+ void clear() override;
+ uint64_t getMemoryUsage() override;
+ bool hasVariableLength() override;
/**
* The offset of the first element of each map.
@@ -208,19 +301,19 @@ namespace orc {
DataBuffer<int64_t> offsets;
// the concatenated keys
- ORC_UNIQUE_PTR<ColumnVectorBatch> keys;
+ std::unique_ptr<ColumnVectorBatch> keys;
// the concatenated elements
- ORC_UNIQUE_PTR<ColumnVectorBatch> elements;
+ std::unique_ptr<ColumnVectorBatch> elements;
};
- struct UnionVectorBatch: public ColumnVectorBatch {
+ struct UnionVectorBatch : public ColumnVectorBatch {
UnionVectorBatch(uint64_t capacity, MemoryPool& pool);
- virtual ~UnionVectorBatch();
- std::string toString() const;
- void resize(uint64_t capacity);
- void clear();
- uint64_t getMemoryUsage();
- bool hasVariableLength();
+ ~UnionVectorBatch() override;
+ std::string toString() const override;
+ void resize(uint64_t capacity) override;
+ void clear() override;
+ uint64_t getMemoryUsage() override;
+ bool hasVariableLength() override;
/**
* For each value, which element of children has the value.
@@ -246,13 +339,13 @@ namespace orc {
int32_t scale;
};
- struct Decimal64VectorBatch: public ColumnVectorBatch {
+ struct Decimal64VectorBatch : public ColumnVectorBatch {
Decimal64VectorBatch(uint64_t capacity, MemoryPool& pool);
- virtual ~Decimal64VectorBatch();
- std::string toString() const;
- void resize(uint64_t capacity);
- void clear();
- uint64_t getMemoryUsage();
+ ~Decimal64VectorBatch() override;
+ std::string toString() const override;
+ void resize(uint64_t capacity) override;
+ void clear() override;
+ uint64_t getMemoryUsage() override;
// total number of digits
int32_t precision;
@@ -262,7 +355,7 @@ namespace orc {
// the numeric values
DataBuffer<int64_t> values;
- protected:
+ protected:
/**
* Contains the scales that were read from the file. Should NOT be
* used.
@@ -272,13 +365,13 @@ namespace orc {
friend class Decimal64ColumnWriter;
};
- struct Decimal128VectorBatch: public ColumnVectorBatch {
+ struct Decimal128VectorBatch : public ColumnVectorBatch {
Decimal128VectorBatch(uint64_t capacity, MemoryPool& pool);
- virtual ~Decimal128VectorBatch();
- std::string toString() const;
- void resize(uint64_t capacity);
- void clear();
- uint64_t getMemoryUsage();
+ ~Decimal128VectorBatch() override;
+ std::string toString() const override;
+ void resize(uint64_t capacity) override;
+ void clear() override;
+ uint64_t getMemoryUsage() override;
// total number of digits
int32_t precision;
@@ -288,7 +381,7 @@ namespace orc {
// the numeric values
DataBuffer<Int128> values;
- protected:
+ protected:
/**
* Contains the scales that were read from the file. Should NOT be
* used.
@@ -304,13 +397,13 @@ namespace orc {
* The timestamps are stored split into the time_t value (seconds since
* 1 Jan 1970 00:00:00) and the nanoseconds within the time_t value.
*/
- struct TimestampVectorBatch: public ColumnVectorBatch {
+ struct TimestampVectorBatch : public ColumnVectorBatch {
TimestampVectorBatch(uint64_t capacity, MemoryPool& pool);
- virtual ~TimestampVectorBatch();
- std::string toString() const;
- void resize(uint64_t capacity);
- void clear();
- uint64_t getMemoryUsage();
+ ~TimestampVectorBatch() override;
+ std::string toString() const override;
+ void resize(uint64_t capacity) override;
+ void clear() override;
+ uint64_t getMemoryUsage() override;
// the number of seconds past 1 Jan 1970 00:00 UTC (aka time_t)
// Note that we always assume data is in GMT timezone; therefore it is
@@ -322,6 +415,6 @@ namespace orc {
DataBuffer<int64_t> nanoseconds;
};
-}
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/include/orc/Writer.hh b/contrib/libs/apache/orc/c++/include/orc/Writer.hh
index 78b0b97d25..047ee9ffc5 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Writer.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Writer.hh
@@ -20,10 +20,11 @@
#define ORC_WRITER_HH
#include "orc/Common.hh"
-#include "orc/orc-config.hh"
#include "orc/Type.hh"
#include "orc/Vector.hh"
+#include "orc/orc-config.hh"
+#include <atomic>
#include <memory>
#include <set>
#include <string>
@@ -34,26 +35,29 @@ namespace orc {
// classes that hold data members so we can maintain binary compatibility
struct WriterOptionsPrivate;
- enum CompressionStrategy {
- CompressionStrategy_SPEED = 0,
- CompressionStrategy_COMPRESSION
- };
+ enum CompressionStrategy { CompressionStrategy_SPEED = 0, CompressionStrategy_COMPRESSION };
- enum RleVersion {
- RleVersion_1 = 0,
- RleVersion_2 = 1
- };
+ enum RleVersion { RleVersion_1 = 0, RleVersion_2 = 1 };
class Timezone;
/**
+ * Expose the IO metrics for write operation.
+ */
+ struct WriterMetrics {
+ // Record the number of IO requests written to the output file
+ std::atomic<uint64_t> IOCount{0};
+ // Record the latency of IO blocking
+ std::atomic<uint64_t> IOBlockingLatencyUs{0};
+ };
+ /**
* Options for creating a Writer.
*/
class WriterOptions {
- private:
- ORC_UNIQUE_PTR<WriterOptionsPrivate> privateBits;
+ private:
+ std::unique_ptr<WriterOptionsPrivate> privateBits;
- public:
+ public:
WriterOptions();
WriterOptions(const WriterOptions&);
WriterOptions(WriterOptions&);
@@ -73,6 +77,8 @@ namespace orc {
/**
* Set the data compression block size.
+ * Should be less than 1 << 23 bytes (8M), which is limited by the
+ * 3 bytes size of compression block header (1 bit for isOriginal and 23 bits for length)
*/
WriterOptions& setCompressionBlockSize(uint64_t size);
@@ -83,7 +89,8 @@ namespace orc {
uint64_t getCompressionBlockSize() const;
/**
- * Set row index stride (the number of rows per an entry in the row index). Use value 0 to disable row index.
+ * Set row index stride (the number of rows per an entry in the row index). Use value 0 to
+ * disable row index.
*/
WriterOptions& setRowIndexStride(uint64_t stride);
@@ -157,13 +164,13 @@ namespace orc {
/**
* Set the memory pool.
*/
- WriterOptions& setMemoryPool(MemoryPool * memoryPool);
+ WriterOptions& setMemoryPool(MemoryPool* memoryPool);
/**
* Get the memory pool.
* @return if not set, return default memory pool.
*/
- MemoryPool * getMemoryPool() const;
+ MemoryPool* getMemoryPool() const;
/**
* Set the error stream.
@@ -174,7 +181,7 @@ namespace orc {
* Get the error stream.
* @return if not set, return std::err.
*/
- std::ostream * getErrorStream() const;
+ std::ostream* getErrorStream() const;
/**
* Get the RLE version.
@@ -235,10 +242,45 @@ namespace orc {
* @param zone writer timezone name
*/
WriterOptions& setTimezoneName(const std::string& zone);
+
+ /**
+ * Set the writer metrics.
+ */
+ WriterOptions& setWriterMetrics(WriterMetrics* metrics);
+
+ /**
+ * Get the writer metrics.
+ * @return if not set, return nullptr.
+ */
+ WriterMetrics* getWriterMetrics() const;
+
+ /**
+ * Set whether or not to use tight numeric vectorBatch.
+ */
+ WriterOptions& setUseTightNumericVector(bool useTightNumericVector);
+
+ /**
+ * Get whether or not to use dedicated columnVectorBatch
+ * @return if not set, the default is false
+ */
+ bool getUseTightNumericVector() const;
+
+ /**
+ * Set the initial capacity of output buffer in the class BufferedOutputStream.
+ * Each column contains one or more BufferedOutputStream depending on its type,
+ * and these buffers will automatically expand when more memory is required.
+ */
+ WriterOptions& setOutputBufferCapacity(uint64_t capacity);
+
+ /**
+ * Get the initial capacity of output buffer in the class BufferedOutputStream.
+ * @return if not set, return default value which is 1 MB.
+ */
+ uint64_t getOutputBufferCapacity() const;
};
class Writer {
- public:
+ public:
virtual ~Writer();
/**
@@ -246,8 +288,7 @@ namespace orc {
* @param size the number of rows to write.
* @return a new ColumnVectorBatch to write into.
*/
- virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size
- ) const = 0;
+ virtual std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size) const = 0;
/**
* Add a row batch into current writer.
@@ -263,8 +304,15 @@ namespace orc {
/**
* Add user metadata to the writer.
*/
- virtual void addUserMetadata(const std::string name, const std::string value) = 0;
+ virtual void addUserMetadata(const std::string& name, const std::string& value) = 0;
+
+ /**
+ * Write an intermediate footer on the file such that if the file is
+ * truncated to the returned offset, it would be a valid ORC file.
+ * @return the offset that would be a valid end location for an ORC file
+ */
+ virtual uint64_t writeIntermediateFooter() = 0;
};
-}
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/include/orc/orc-config.hh b/contrib/libs/apache/orc/c++/include/orc/orc-config.hh
index b8fb9fbd4e..ab1e16fa15 100644
--- a/contrib/libs/apache/orc/c++/include/orc/orc-config.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/orc-config.hh
@@ -1,7 +1,11 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
@@ -15,14 +19,9 @@
#ifndef ORC_CONFIG_HH
#define ORC_CONFIG_HH
-#define ORC_VERSION "1.8.0"
+#define ORC_VERSION "2.0.0"
#define ORC_CXX_HAS_CSTDINT
-#define ORC_CXX_HAS_INITIALIZER_LIST
-#define ORC_CXX_HAS_NOEXCEPT
-#define ORC_CXX_HAS_NULLPTR
-#define ORC_CXX_HAS_OVERRIDE
-#define ORC_CXX_HAS_UNIQUE_PTR
#ifdef ORC_CXX_HAS_CSTDINT
#include <cstdint>
@@ -30,49 +29,10 @@
#include <stdint.h>
#endif
-#ifdef ORC_CXX_HAS_NOEXCEPT
- #define ORC_NOEXCEPT noexcept
-#else
- #define ORC_NOEXCEPT throw ()
-#endif
-
-#ifdef ORC_CXX_HAS_NULLPTR
- #define ORC_NULLPTR nullptr
-#else
- namespace orc {
- class nullptr_t {
- public:
- template<class T>
- operator T*() const {
- return 0;
- }
-
- template<class C, class T>
- operator T C::*() const {
- return 0;
- }
- private:
- void operator&() const; // whose address can't be taken
- };
- const nullptr_t nullptr = {};
- }
- #define ORC_NULLPTR orc::nullptr
-#endif
-
-#ifdef ORC_CXX_HAS_OVERRIDE
- #define ORC_OVERRIDE override
-#else
- #define ORC_OVERRIDE
-#endif
-
-#ifdef ORC_CXX_HAS_UNIQUE_PTR
- #define ORC_UNIQUE_PTR std::unique_ptr
-#else
- #define ORC_UNIQUE_PTR std::auto_ptr
- namespace std {
- template<typename T>
- inline T move(T& x) { return x; }
- }
-#endif
+// The following macros should be kept for backward compatibility.
+#define ORC_NOEXCEPT noexcept
+#define ORC_NULLPTR nullptr
+#define ORC_OVERRIDE override
+#define ORC_UNIQUE_PTR std::unique_ptr
#endif
diff --git a/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh b/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh
index 36c9b37e3f..9ce958302d 100644
--- a/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh
@@ -27,21 +27,19 @@ namespace orc {
/**
* Possible data types for predicates
*/
- enum class PredicateDataType {
- LONG = 0, FLOAT, STRING, DATE, DECIMAL, TIMESTAMP, BOOLEAN
- };
+ enum class PredicateDataType { LONG = 0, FLOAT, STRING, DATE, DECIMAL, TIMESTAMP, BOOLEAN };
/**
* Represents a literal value in a predicate
*/
class Literal {
- public:
+ public:
struct Timestamp {
Timestamp() = default;
Timestamp(const Timestamp&) = default;
Timestamp(Timestamp&&) = default;
~Timestamp() = default;
- Timestamp(int64_t second_, int32_t nanos_): second(second_), nanos(nanos_) {
+ Timestamp(int64_t second_, int32_t nanos_) : second(second_), nanos(nanos_) {
// PASS
}
Timestamp& operator=(const Timestamp&) = default;
@@ -55,15 +53,23 @@ namespace orc {
bool operator<=(const Timestamp& r) const {
return second < r.second || (second == r.second && nanos <= r.nanos);
}
- bool operator!=(const Timestamp& r) const { return !(*this == r); }
- bool operator>(const Timestamp& r) const { return r < *this; }
- bool operator>=(const Timestamp& r) const { return r <= *this; }
- int64_t getMillis() const { return second * 1000 + nanos / 1000000; }
+ bool operator!=(const Timestamp& r) const {
+ return !(*this == r);
+ }
+ bool operator>(const Timestamp& r) const {
+ return r < *this;
+ }
+ bool operator>=(const Timestamp& r) const {
+ return r <= *this;
+ }
+ int64_t getMillis() const {
+ return second * 1000 + nanos / 1000000;
+ }
int64_t second;
int32_t nanos;
};
- Literal(const Literal &r);
+ Literal(const Literal& r);
~Literal();
Literal& operator=(const Literal& r);
bool operator==(const Literal& r) const;
@@ -102,7 +108,7 @@ namespace orc {
/**
* Create a literal of STRING type
*/
- Literal(const char * str, size_t size);
+ Literal(const char* str, size_t size);
/**
* Create a literal of DECIMAL type
@@ -123,38 +129,44 @@ namespace orc {
/**
* Check if a literal is null
*/
- bool isNull() const { return mIsNull; }
+ bool isNull() const {
+ return mIsNull;
+ }
- PredicateDataType getType() const { return mType; }
+ PredicateDataType getType() const {
+ return mType;
+ }
std::string toString() const;
- size_t getHashCode() const { return mHashCode; }
+ size_t getHashCode() const {
+ return mHashCode;
+ }
- private:
+ private:
size_t hashCode() const;
union LiteralVal {
int64_t IntVal;
double DoubleVal;
int64_t DateVal;
- char * Buffer;
+ char* Buffer;
Timestamp TimeStampVal;
Int128 DecimalVal;
bool BooleanVal;
// explicitly define default constructor
- LiteralVal(): DecimalVal(0) {}
+ LiteralVal() : DecimalVal(0) {}
};
- private:
- LiteralVal mValue; // data value for this literal if not null
- PredicateDataType mType; // data type of the literal
- size_t mSize; // size of mValue if it is Buffer
- int32_t mPrecision; // precision of decimal type
- int32_t mScale; // scale of decimal type
- bool mIsNull; // whether this literal is null
- size_t mHashCode; // precomputed hash code for the literal
+ private:
+ LiteralVal mValue; // data value for this literal if not null
+ PredicateDataType mType; // data type of the literal
+ size_t mSize; // size of mValue if it is Buffer
+ int32_t mPrecision; // precision of decimal type
+ int32_t mScale; // scale of decimal type
+ bool mIsNull; // whether this literal is null
+ size_t mHashCode; // precomputed hash code for the literal
};
-} // namespace orc
+} // namespace orc
-#endif //ORC_LITERAL_HH
+#endif // ORC_LITERAL_HH
diff --git a/contrib/libs/apache/orc/c++/include/orc/sargs/SearchArgument.hh b/contrib/libs/apache/orc/c++/include/orc/sargs/SearchArgument.hh
index 44fde8f5e9..6493840a92 100644
--- a/contrib/libs/apache/orc/c++/include/orc/sargs/SearchArgument.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/sargs/SearchArgument.hh
@@ -34,7 +34,7 @@ namespace orc {
* (<a href="http://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF</a>).
*/
class SearchArgument {
- public:
+ public:
virtual ~SearchArgument();
/**
@@ -52,7 +52,7 @@ namespace orc {
* must call startOr, startAnd, or startNot before adding any leaves.
*/
class SearchArgumentBuilder {
- public:
+ public:
virtual ~SearchArgumentBuilder();
/**
@@ -87,8 +87,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& lessThan(const std::string& column,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& lessThan(const std::string& column, PredicateDataType type,
Literal literal) = 0;
/**
@@ -98,8 +97,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& lessThan(uint64_t columnId,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& lessThan(uint64_t columnId, PredicateDataType type,
Literal literal) = 0;
/**
@@ -109,8 +107,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& lessThanEquals(const std::string& column,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& lessThanEquals(const std::string& column, PredicateDataType type,
Literal literal) = 0;
/**
@@ -120,8 +117,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& lessThanEquals(uint64_t columnId,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& lessThanEquals(uint64_t columnId, PredicateDataType type,
Literal literal) = 0;
/**
@@ -131,8 +127,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& equals(const std::string& column,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& equals(const std::string& column, PredicateDataType type,
Literal literal) = 0;
/**
@@ -142,8 +137,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& equals(uint64_t columnId,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& equals(uint64_t columnId, PredicateDataType type,
Literal literal) = 0;
/**
@@ -153,8 +147,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& nullSafeEquals(const std::string& column,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& nullSafeEquals(const std::string& column, PredicateDataType type,
Literal literal) = 0;
/**
@@ -164,8 +157,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& nullSafeEquals(uint64_t columnId,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& nullSafeEquals(uint64_t columnId, PredicateDataType type,
Literal literal) = 0;
/**
@@ -175,8 +167,7 @@ namespace orc {
* @param literals the literals
* @return this
*/
- virtual SearchArgumentBuilder& in(const std::string& column,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& in(const std::string& column, PredicateDataType type,
const std::initializer_list<Literal>& literals) = 0;
/**
@@ -186,8 +177,7 @@ namespace orc {
* @param literals the literals
* @return this
*/
- virtual SearchArgumentBuilder& in(uint64_t columnId,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& in(uint64_t columnId, PredicateDataType type,
const std::initializer_list<Literal>& literals) = 0;
/**
@@ -197,8 +187,7 @@ namespace orc {
* @param literals the literals
* @return this
*/
- virtual SearchArgumentBuilder& in(const std::string& column,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& in(const std::string& column, PredicateDataType type,
const std::vector<Literal>& literals) = 0;
/**
@@ -208,8 +197,7 @@ namespace orc {
* @param literals the literals
* @return this
*/
- virtual SearchArgumentBuilder& in(uint64_t columnId,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& in(uint64_t columnId, PredicateDataType type,
const std::vector<Literal>& literals) = 0;
/**
@@ -218,8 +206,7 @@ namespace orc {
* @param type the type of the expression
* @return this
*/
- virtual SearchArgumentBuilder& isNull(const std::string& column,
- PredicateDataType type) = 0;
+ virtual SearchArgumentBuilder& isNull(const std::string& column, PredicateDataType type) = 0;
/**
* Add an is null leaf to the current item on the stack.
@@ -227,8 +214,7 @@ namespace orc {
* @param type the type of the expression
* @return this
*/
- virtual SearchArgumentBuilder& isNull(uint64_t columnId,
- PredicateDataType type) = 0;
+ virtual SearchArgumentBuilder& isNull(uint64_t columnId, PredicateDataType type) = 0;
/**
* Add a between leaf to the current item on the stack.
@@ -238,10 +224,8 @@ namespace orc {
* @param upper the literal
* @return this
*/
- virtual SearchArgumentBuilder& between(const std::string& column,
- PredicateDataType type,
- Literal lower,
- Literal upper) = 0;
+ virtual SearchArgumentBuilder& between(const std::string& column, PredicateDataType type,
+ Literal lower, Literal upper) = 0;
/**
* Add a between leaf to the current item on the stack.
@@ -251,9 +235,7 @@ namespace orc {
* @param upper the literal
* @return this
*/
- virtual SearchArgumentBuilder& between(uint64_t columnId,
- PredicateDataType type,
- Literal lower,
+ virtual SearchArgumentBuilder& between(uint64_t columnId, PredicateDataType type, Literal lower,
Literal upper) = 0;
/**
@@ -275,10 +257,10 @@ namespace orc {
* Factory to create SearchArgumentBuilder which builds SearchArgument
*/
class SearchArgumentFactory {
- public:
+ public:
static std::unique_ptr<SearchArgumentBuilder> newBuilder();
};
-} // namespace orc
+} // namespace orc
-#endif //ORC_SEARCHARGUMENT_HH
+#endif // ORC_SEARCHARGUMENT_HH
diff --git a/contrib/libs/apache/orc/c++/include/orc/sargs/TruthValue.hh b/contrib/libs/apache/orc/c++/include/orc/sargs/TruthValue.hh
index b3ea6b76ce..fa3dce06f8 100644
--- a/contrib/libs/apache/orc/c++/include/orc/sargs/TruthValue.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/sargs/TruthValue.hh
@@ -25,13 +25,13 @@ namespace orc {
* The potential result sets of logical operations.
*/
enum class TruthValue {
- YES, // all rows satisfy the predicate
- NO, // all rows dissatisfy the predicate
- IS_NULL, // all rows are null value
- YES_NULL, // null values exist, not-null rows satisfy the predicate
- NO_NULL, // null values exist, not-null rows dissatisfy the predicate
- YES_NO, // some rows satisfy the predicate and the others not
- YES_NO_NULL // null values exist, some rows satisfy predicate and some not
+ YES, // all rows satisfy the predicate
+ NO, // all rows dissatisfy the predicate
+ IS_NULL, // all rows are null value
+ YES_NULL, // null values exist, not-null rows satisfy the predicate
+ NO_NULL, // null values exist, not-null rows dissatisfy the predicate
+ YES_NO, // some rows satisfy the predicate and the others not
+ YES_NO_NULL // null values exist, some rows satisfy predicate and some not
};
// Compute logical or between the two values.
@@ -46,6 +46,6 @@ namespace orc {
// Do we need to read the data based on the TruthValue?
bool isNeeded(TruthValue val);
-} // namespace orc
+} // namespace orc
-#endif //ORC_TRUTHVALUE_HH
+#endif // ORC_TRUTHVALUE_HH
diff --git a/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh b/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh
index 625c1befb2..b11cdf74cd 100644
--- a/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh
+++ b/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh
@@ -19,20 +19,15 @@
#ifndef ADAPTER_HH
#define ADAPTER_HH
-/* #undef INT64_IS_LL */
-#define HAS_CONSTEXPR
#define HAS_PREAD
#define HAS_STRPTIME
-#define HAS_STOLL
#define HAS_DIAGNOSTIC_PUSH
#define HAS_DOUBLE_TO_STRING
#define HAS_INT64_TO_STRING
#define HAS_PRE_1970
#define HAS_POST_2038
#define HAS_STD_ISNAN
-#define HAS_STD_MUTEX
#define HAS_BUILTIN_OVERFLOW_CHECK
-/* #undef NEEDS_REDUNDANT_MOVE */
/* #undef NEEDS_Z_PREFIX */
#include "orc/orc-config.hh"
@@ -46,13 +41,6 @@ typedef SSIZE_T ssize_t;
#define asctime_r(tm, buf) (asctime_s(buf, 26, tm) ? NULL : buf)
#endif
-#ifndef HAS_STOLL
- // A poor man's stoll that converts str to a long long int base 10
- namespace std {
- int64_t stoll(std::string str);
- }
-#endif
-
#ifndef HAS_STRPTIME
char* strptime(const char* buf, const char* format, struct tm* tm);
#endif
@@ -61,20 +49,6 @@ typedef SSIZE_T ssize_t;
ssize_t pread(int fd, void* buf, size_t count, off_t offset);
#endif
-#ifdef INT64_IS_LL
- #define INT64_FORMAT_STRING "ll"
-#else
- #define INT64_FORMAT_STRING "l"
-#endif
-
-#ifndef ORC_CXX_HAS_NOEXCEPT
- #define noexcept ORC_NOEXCEPT
-#endif
-
-#ifndef ORC_CXX_HAS_OVERRIDE
- #define override ORC_OVERRIDE
-#endif
-
#ifdef HAS_DIAGNOSTIC_PUSH
#ifdef __clang__
#define DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
@@ -105,10 +79,6 @@ typedef SSIZE_T ssize_t;
#define DIAGNOSTIC_IGNORE(XXX)
#endif
-#ifndef ORC_CXX_HAS_UNIQUE_PTR
- #define unique_ptr auto_ptr
-#endif
-
#ifndef UINT32_MAX
#define UINT32_MAX 0xffffffff
#endif
@@ -123,12 +93,6 @@ typedef SSIZE_T ssize_t;
#define GTEST_LANG_CXX11 0
-#ifdef NEEDS_REDUNDANT_MOVE
- #define REDUNDANT_MOVE(XXX) std::move(XXX)
-#else
- #define REDUNDANT_MOVE(XXX) XXX
-#endif
-
#ifndef HAS_STD_ISNAN
#include <math.h>
#define std::isnan(XXX) isnan(XXX)
@@ -136,34 +100,7 @@ typedef SSIZE_T ssize_t;
#include <cmath>
#endif
-#ifndef HAS_STD_MUTEX
- #include <pthread.h>
- namespace orc {
- /**
- * Lock guard for pthread_mutex_t object using RAII
- * The Lock is automatically release when exiting current scope.
- */
- class LockORC {
- public:
- explicit LockORC(pthread_mutex_t& mutex) : mutex_ref_(mutex) {
- pthread_mutex_lock(&mutex_ref_);
- }
- ~LockORC() { pthread_mutex_unlock(&mutex_ref_); }
- private:
- // no default constructor
- LockORC();
- // prohibit copying
- LockORC(const LockORC&);
- LockORC& operator=(const LockORC&);
-
- pthread_mutex_t& mutex_ref_;
- };
- }
- #define std::mutex pthread_mutex_t
- #define std::lock_guard<std::mutex> LockORC
-#else
- #include <mutex>
-#endif
+#include <mutex>
#ifdef NEEDS_Z_PREFIX
#define Z_PREFIX 1
@@ -208,8 +145,4 @@ namespace orc {
}
#endif
-#ifndef HAS_CONSTEXPR
-#define constexpr const
-#endif
-
#endif /* ADAPTER_HH */
diff --git a/contrib/libs/apache/orc/c++/src/Adaptor.cc b/contrib/libs/apache/orc/c++/src/Adaptor.cc
index bf3a3e181b..d9390131b6 100644
--- a/contrib/libs/apache/orc/c++/src/Adaptor.cc
+++ b/contrib/libs/apache/orc/c++/src/Adaptor.cc
@@ -1,36 +1,24 @@
/**
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
#include "Adaptor.hh"
-#include <sstream>
#include <iomanip>
-
-#ifndef HAS_STOLL
-namespace std {
- int64_t std::stoll(std::string str) {
- int64_t val = 0;
- stringstream ss;
- ss << str;
- ss >> val;
- return val;
- }
-}
-#endif
+#include <sstream>
#ifndef HAS_STRPTIME
char* strptime(const char* s, const char* f, struct tm* tm) {
@@ -43,7 +31,7 @@ char* strptime(const char* s, const char* f, struct tm* tm) {
#endif
#ifndef HAS_PREAD
- #ifdef _WIN32
+#ifdef _WIN32
#include <Windows.h>
#include <io.h>
ssize_t pread(int fd, void* buf, size_t size, off_t offset) {
@@ -60,9 +48,9 @@ ssize_t pread(int fd, void* buf, size_t size, off_t offset) {
}
return static_cast<ssize_t>(rt);
}
- #else
- #error("pread() undefined: unknown environment")
- #endif
+#else
+#error("pread() undefined: unknown environment")
+#endif
#endif
namespace orc {
@@ -85,4 +73,4 @@ namespace orc {
return std::to_string(static_cast<long long int>(val));
}
#endif
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/BlockBuffer.cc b/contrib/libs/apache/orc/c++/src/BlockBuffer.cc
new file mode 100644
index 0000000000..1f7843fad7
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/BlockBuffer.cc
@@ -0,0 +1,131 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BlockBuffer.hh"
+#include "orc/OrcFile.hh"
+#include "orc/Writer.hh"
+
+#include <algorithm>
+
+namespace orc {
+
+ BlockBuffer::BlockBuffer(MemoryPool& pool, uint64_t _blockSize)
+ : memoryPool(pool), currentSize(0), currentCapacity(0), blockSize(_blockSize) {
+ if (blockSize == 0) {
+ throw std::logic_error("Block size cannot be zero");
+ }
+ reserve(blockSize);
+ }
+
+ BlockBuffer::~BlockBuffer() {
+ for (size_t i = 0; i < blocks.size(); ++i) {
+ memoryPool.free(blocks[i]);
+ }
+ blocks.clear();
+ currentSize = currentCapacity = 0;
+ }
+
+ BlockBuffer::Block BlockBuffer::getBlock(uint64_t blockIndex) const {
+ if (blockIndex >= getBlockNumber()) {
+ throw std::out_of_range("Block index out of range");
+ }
+ return Block(blocks[blockIndex], std::min(currentSize - blockIndex * blockSize, blockSize));
+ }
+
+ BlockBuffer::Block BlockBuffer::getNextBlock() {
+ if (currentSize < currentCapacity) {
+ Block emptyBlock(blocks[currentSize / blockSize] + currentSize % blockSize,
+ blockSize - currentSize % blockSize);
+ currentSize = (currentSize / blockSize + 1) * blockSize;
+ return emptyBlock;
+ } else {
+ resize(currentSize + blockSize);
+ return Block(blocks.back(), blockSize);
+ }
+ }
+
+ void BlockBuffer::resize(uint64_t size) {
+ reserve(size);
+ if (currentCapacity >= size) {
+ currentSize = size;
+ } else {
+ throw std::logic_error("Block buffer resize error");
+ }
+ }
+
+ void BlockBuffer::reserve(uint64_t newCapacity) {
+ while (currentCapacity < newCapacity) {
+ char* newBlockPtr = memoryPool.malloc(blockSize);
+ if (newBlockPtr != nullptr) {
+ blocks.push_back(newBlockPtr);
+ currentCapacity += blockSize;
+ } else {
+ break;
+ }
+ }
+ }
+
+ void BlockBuffer::writeTo(OutputStream* output, WriterMetrics* metrics) {
+ if (currentSize == 0) {
+ return;
+ }
+ static uint64_t MAX_CHUNK_SIZE = 1024 * 1024 * 1024;
+ uint64_t chunkSize = std::min(output->getNaturalWriteSize(), MAX_CHUNK_SIZE);
+ if (chunkSize == 0) {
+ throw std::logic_error("Natural write size cannot be zero");
+ }
+ uint64_t ioCount = 0;
+ uint64_t blockNumber = getBlockNumber();
+ // if only one block exists, currentSize is equal to the first block's size
+ if (blockNumber == 1 && currentSize <= chunkSize) {
+ Block block = getBlock(0);
+ output->write(block.data, block.size);
+ ++ioCount;
+ } else {
+ char* chunk = memoryPool.malloc(chunkSize);
+ uint64_t chunkOffset = 0;
+ for (uint64_t i = 0; i < blockNumber; ++i) {
+ Block block = getBlock(i);
+ uint64_t blockOffset = 0;
+ while (blockOffset < block.size) {
+ // copy current block into chunk
+ uint64_t copySize = std::min(chunkSize - chunkOffset, block.size - blockOffset);
+ memcpy(chunk + chunkOffset, block.data + blockOffset, copySize);
+ chunkOffset += copySize;
+ blockOffset += copySize;
+
+ // chunk is full
+ if (chunkOffset >= chunkSize) {
+ output->write(chunk, chunkSize);
+ chunkOffset = 0;
+ ++ioCount;
+ }
+ }
+ }
+ if (chunkOffset != 0) {
+ output->write(chunk, chunkOffset);
+ ++ioCount;
+ }
+ memoryPool.free(chunk);
+ }
+
+ if (metrics != nullptr) {
+ metrics->IOCount.fetch_add(ioCount);
+ }
+ }
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/BlockBuffer.hh b/contrib/libs/apache/orc/c++/src/BlockBuffer.hh
new file mode 100644
index 0000000000..0f5f78e3fe
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/BlockBuffer.hh
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_BLOCK_BUFFER_HH
+#define ORC_BLOCK_BUFFER_HH
+
+#include "orc/MemoryPool.hh"
+
+#include <vector>
+
+namespace orc {
+
+ class OutputStream;
+ struct WriterMetrics;
+ /**
+ * BlockBuffer implements a memory allocation policy based on
+ * equal-length blocks. BlockBuffer will reserve multiple blocks
+ * for allocation.
+ */
+ class BlockBuffer {
+ private:
+ MemoryPool& memoryPool;
+ // current buffer size
+ uint64_t currentSize;
+ // maximal capacity (actual allocated memory)
+ uint64_t currentCapacity;
+ // unit for buffer expansion
+ const uint64_t blockSize;
+ // pointers to the start of each block
+ std::vector<char*> blocks;
+
+ // non-copyable and non-movable
+ BlockBuffer(BlockBuffer& buffer) = delete;
+ BlockBuffer& operator=(BlockBuffer& buffer) = delete;
+ BlockBuffer(BlockBuffer&& buffer) = delete;
+ BlockBuffer& operator=(BlockBuffer&& buffer) = delete;
+
+ public:
+ BlockBuffer(MemoryPool& pool, uint64_t blockSize);
+
+ ~BlockBuffer();
+
+ /**
+ * Block points to a section of memory allocated by BlockBuffer,
+ * containing the corresponding physical memory address and available size.
+ */
+ struct Block {
+ // the start of block
+ char* data;
+ // number of bytes available at data
+ uint64_t size;
+
+ Block() : data(nullptr), size(0) {}
+ Block(char* _data, uint64_t _size) : data(_data), size(_size) {}
+ Block(const Block& block) = default;
+ ~Block() = default;
+ };
+
+ /**
+ * Get the allocated block object.
+ * The last allocated block size may be less than blockSize,
+ * and the rest of the blocks are all of size blockSize.
+ * @param blockIndex the index of blocks
+ * @return the allocated block object
+ */
+ Block getBlock(uint64_t blockIndex) const;
+
+ /**
+ * Get an empty block or allocate a new block to write.
+ * If the last allocated block size is less than blockSize,
+ * the size of empty block is equal to blockSize minus the size of
+ * the last allocated block size. Otherwise, the size of
+ * the empty block is equal to blockSize.
+ * @return an empty block object
+ */
+ Block getNextBlock();
+
+ /**
+ * Get the number of blocks that are fully or partially occupied
+ */
+ uint64_t getBlockNumber() const {
+ return (currentSize + blockSize - 1) / blockSize;
+ }
+
+ uint64_t size() const {
+ return currentSize;
+ }
+
+ uint64_t capacity() const {
+ return currentCapacity;
+ }
+
+ void resize(uint64_t size);
+ /**
+ * Requests the BlockBuffer to contain at least newCapacity bytes.
+ * Reallocation happens if there is need of more space.
+ * @param newCapacity new capacity of BlockBuffer
+ */
+ void reserve(uint64_t newCapacity);
+ /**
+ * Write the BlockBuffer content into OutputStream
+ * @param output the output stream to write to
+ * @param metrics the metrics of the writer
+ */
+ void writeTo(OutputStream* output, WriterMetrics* metrics);
+ };
+} // namespace orc
+
+#endif
diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.cc b/contrib/libs/apache/orc/c++/src/BloomFilter.cc
index 8a1f1880e7..882c6f4252 100644
--- a/contrib/libs/apache/orc/c++/src/BloomFilter.cc
+++ b/contrib/libs/apache/orc/c++/src/BloomFilter.cc
@@ -22,11 +22,14 @@
namespace orc {
constexpr uint64_t BITS_OF_LONG = 64;
- constexpr uint8_t SHIFT_6_BITS = 6;
- constexpr uint8_t SHIFT_3_BITS = 3;
+ constexpr uint8_t SHIFT_6_BITS = 6;
+ constexpr uint8_t SHIFT_3_BITS = 3;
static bool isLittleEndian() {
- static union { uint32_t i; char c[4]; } num = { 0x01020304 };
+ static union {
+ uint32_t i;
+ char c[4];
+ } num = {0x01020304};
return num.c[0] == 4;
}
@@ -34,11 +37,10 @@ namespace orc {
* Implementation of BitSet
*/
BitSet::BitSet(uint64_t numBits) {
- mData.resize(static_cast<size_t>(ceil(
- static_cast<double>(numBits) / BITS_OF_LONG)), 0);
+ mData.resize(static_cast<size_t>(ceil(static_cast<double>(numBits) / BITS_OF_LONG)), 0);
}
- BitSet::BitSet(const uint64_t * bits, uint64_t numBits) {
+ BitSet::BitSet(const uint64_t* bits, uint64_t numBits) {
// caller should make sure numBits is multiple of 64
mData.resize(numBits >> SHIFT_6_BITS, 0);
memcpy(mData.data(), bits, numBits >> SHIFT_3_BITS);
@@ -59,8 +61,8 @@ namespace orc {
void BitSet::merge(const BitSet& other) {
if (mData.size() != other.mData.size()) {
std::stringstream ss;
- ss << "BitSet must be of equal length ("
- << mData.size() << " != " << other.mData.size() << ")";
+ ss << "BitSet must be of equal length (" << mData.size() << " != " << other.mData.size()
+ << ")";
throw std::logic_error(ss.str());
}
@@ -73,7 +75,7 @@ namespace orc {
memset(mData.data(), 0, sizeof(uint64_t) * mData.size());
}
- const uint64_t * BitSet::getData() const {
+ const uint64_t* BitSet::getData() const {
return mData.data();
}
@@ -92,8 +94,8 @@ namespace orc {
int32_t optimalNumOfHashFunctions(uint64_t expectedEntries, uint64_t numBits) {
double n = static_cast<double>(expectedEntries);
- return std::max<int32_t>(1, static_cast<int32_t>(
- std::round(static_cast<double>(numBits) / n * std::log(2.0))));
+ return std::max<int32_t>(
+ 1, static_cast<int32_t>(std::round(static_cast<double>(numBits) / n * std::log(2.0))));
}
int32_t optimalNumOfBits(uint64_t expectedEntries, double fpp) {
@@ -108,23 +110,20 @@ namespace orc {
// probability'
// Lets split up 64-bit hashcode into two 32-bit hash codes and employ
// the technique mentioned in the above paper
- inline uint64_t getBytesHash(const char * data, int64_t length) {
+ inline uint64_t getBytesHash(const char* data, int64_t length) {
if (data == nullptr) {
return Murmur3::NULL_HASHCODE;
}
- return Murmur3::hash64(reinterpret_cast<const uint8_t *>(data),
- static_cast<uint32_t>(length));
+ return Murmur3::hash64(reinterpret_cast<const uint8_t*>(data), static_cast<uint32_t>(length));
}
/**
* Implementation of BloomFilter
*/
BloomFilterImpl::BloomFilterImpl(uint64_t expectedEntries, double fpp) {
- checkArgument(expectedEntries > 0,
- "expectedEntries should be > 0");
- checkArgument(fpp > 0.0 && fpp < 1.0,
- "False positive probability should be > 0.0 & < 1.0");
+ checkArgument(expectedEntries > 0, "expectedEntries should be > 0");
+ checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0");
uint64_t nb = static_cast<uint64_t>(optimalNumOfBits(expectedEntries, fpp));
// make 'mNumBits' multiple of 64
@@ -133,7 +132,7 @@ namespace orc {
mBitSet.reset(new BitSet(mNumBits));
}
- void BloomFilterImpl::addBytes(const char * data, int64_t length) {
+ void BloomFilterImpl::addBytes(const char* data, int64_t length) {
uint64_t hash64 = getBytesHash(data, length);
addHash(static_cast<int64_t>(hash64));
}
@@ -142,7 +141,7 @@ namespace orc {
addHash(getLongHash(data));
}
- bool BloomFilterImpl::testBytes(const char * data, int64_t length) const {
+ bool BloomFilterImpl::testBytes(const char* data, int64_t length) const {
uint64_t hash64 = getBytesHash(data, length);
return testHash(static_cast<int64_t>(hash64));
}
@@ -176,13 +175,13 @@ namespace orc {
// caller should make sure input proto::BloomFilter is valid since
// no check will be performed in the following constructor
BloomFilterImpl::BloomFilterImpl(const proto::BloomFilter& bloomFilter) {
- mNumHashFunctions = static_cast<int32_t>(bloomFilter.numhashfunctions());
+ mNumHashFunctions = static_cast<int32_t>(bloomFilter.num_hash_functions());
const std::string& bitsetStr = bloomFilter.utf8bitset();
mNumBits = bitsetStr.size() << SHIFT_3_BITS;
checkArgument(mNumBits % BITS_OF_LONG == 0, "numBits should be multiple of 64!");
- const uint64_t * bitset = reinterpret_cast<const uint64_t *>(bitsetStr.data());
+ const uint64_t* bitset = reinterpret_cast<const uint64_t*>(bitsetStr.data());
if (isLittleEndian()) {
mBitSet.reset(new BitSet(bitset, mNumBits));
} else {
@@ -204,7 +203,7 @@ namespace orc {
addLong(reinterpret_cast<int64_t&>(data));
}
- bool BloomFilterImpl::testDouble(double data) const{
+ bool BloomFilterImpl::testDouble(double data) const {
return testLong(reinterpret_cast<int64_t&>(data));
}
@@ -227,7 +226,7 @@ namespace orc {
}
}
- bool BloomFilterImpl::testHash(int64_t hash64) const{
+ bool BloomFilterImpl::testHash(int64_t hash64) const {
int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
// In Java codes, we use "hash64 >>> 32" which is an unsigned shift op.
// So we cast hash64 to uint64_t here for an unsigned right shift.
@@ -251,10 +250,8 @@ namespace orc {
if (mNumBits != other.mNumBits || mNumHashFunctions != other.mNumHashFunctions) {
std::stringstream ss;
ss << "BloomFilters are not compatible for merging: "
- << "this: numBits:" << mNumBits
- << ",numHashFunctions:" << mNumHashFunctions
- << ", that: numBits:" << other.mNumBits
- << ",numHashFunctions:" << other.mNumHashFunctions;
+ << "this: numBits:" << mNumBits << ",numHashFunctions:" << mNumHashFunctions
+ << ", that: numBits:" << other.mNumBits << ",numHashFunctions:" << other.mNumHashFunctions;
throw std::logic_error(ss.str());
}
@@ -266,17 +263,17 @@ namespace orc {
}
void BloomFilterImpl::serialize(proto::BloomFilter& bloomFilter) const {
- bloomFilter.set_numhashfunctions(static_cast<uint32_t>(mNumHashFunctions));
+ bloomFilter.set_num_hash_functions(static_cast<uint32_t>(mNumHashFunctions));
// According to ORC standard, the encoding is a sequence of bytes with
// a little endian encoding in the utf8bitset field.
if (isLittleEndian()) {
// bytes are already organized in little endian; thus no conversion needed
- const char * bitset = reinterpret_cast<const char *>(mBitSet->getData());
+ const char* bitset = reinterpret_cast<const char*>(mBitSet->getData());
bloomFilter.set_utf8bitset(bitset, sizeInBytes());
} else {
std::vector<uint64_t> bitset(sizeInBytes() / sizeof(uint64_t), 0);
- const uint64_t * longs = mBitSet->getData();
+ const uint64_t* longs = mBitSet->getData();
for (size_t i = 0; i != bitset.size(); ++i) {
uint64_t& dst = bitset[i];
const uint64_t src = longs[i];
@@ -290,8 +287,7 @@ namespace orc {
}
bool BloomFilterImpl::operator==(const BloomFilterImpl& other) const {
- return mNumBits == other.mNumBits &&
- mNumHashFunctions == other.mNumHashFunctions &&
+ return mNumBits == other.mNumBits && mNumHashFunctions == other.mNumHashFunctions &&
*mBitSet == *other.mBitSet;
}
@@ -300,29 +296,24 @@ namespace orc {
}
std::unique_ptr<BloomFilter> BloomFilterUTF8Utils::deserialize(
- const proto::Stream_Kind& streamKind,
- const proto::ColumnEncoding& encoding,
- const proto::BloomFilter& bloomFilter) {
-
- std::unique_ptr<BloomFilter> ret(nullptr);
-
+ const proto::Stream_Kind& streamKind, const proto::ColumnEncoding& encoding,
+ const proto::BloomFilter& bloomFilter) {
// only BLOOM_FILTER_UTF8 is supported
if (streamKind != proto::Stream_Kind_BLOOM_FILTER_UTF8) {
- return ret;
+ return nullptr;
}
// make sure we don't use unknown encodings or original timestamp encodings
- if (!encoding.has_bloomencoding() || encoding.bloomencoding() != 1) {
- return ret;
+ if (!encoding.has_bloom_encoding() || encoding.bloom_encoding() != 1) {
+ return nullptr;
}
// make sure all required fields exist
- if (!bloomFilter.has_numhashfunctions() || !bloomFilter.has_utf8bitset()) {
- return ret;
+ if (!bloomFilter.has_num_hash_functions() || !bloomFilter.has_utf8bitset()) {
+ return nullptr;
}
- ret.reset(new BloomFilterImpl(bloomFilter));
- return ret;
+ return std::make_unique<BloomFilterImpl>(bloomFilter);
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.hh b/contrib/libs/apache/orc/c++/src/BloomFilter.hh
index cf18a46fd9..d72961a83c 100644
--- a/contrib/libs/apache/orc/c++/src/BloomFilter.hh
+++ b/contrib/libs/apache/orc/c++/src/BloomFilter.hh
@@ -33,7 +33,7 @@ namespace orc {
* for index bounds nor expand the bit set size if the specified index is greater than the size.
*/
class BitSet {
- public:
+ public:
/**
* Creates an empty BitSet
*
@@ -47,7 +47,7 @@ namespace orc {
* @param bits - serialized uint64_t buffer of bitset
* @param numBits - number of bits used
*/
- BitSet(const uint64_t * bits, uint64_t numBits);
+ BitSet(const uint64_t* bits, uint64_t numBits);
/**
* Sets the bit at specified index.
@@ -82,14 +82,14 @@ namespace orc {
/**
* Gets underlying raw data
*/
- const uint64_t * getData() const;
+ const uint64_t* getData() const;
/**
* Compares two BitSets
*/
bool operator==(const BitSet& other) const;
- private:
+ private:
std::vector<uint64_t> mData;
};
@@ -120,14 +120,14 @@ namespace orc {
* BloomFilterUtf8, which always uses UTF8 for the encoding.
*/
class BloomFilterImpl : public BloomFilter {
- public:
+ public:
/**
* Creates an empty BloomFilter
*
* @param expectedEntries - number of entries it will hold
* @param fpp - false positive probability
*/
- BloomFilterImpl(uint64_t expectedEntries, double fpp=DEFAULT_FPP);
+ BloomFilterImpl(uint64_t expectedEntries, double fpp = DEFAULT_FPP);
/**
* Creates a BloomFilter by deserializing the proto-buf version
@@ -139,14 +139,14 @@ namespace orc {
/**
* Adds a new element to the BloomFilter
*/
- void addBytes(const char * data, int64_t length);
+ void addBytes(const char* data, int64_t length);
void addLong(int64_t data);
void addDouble(double data);
/**
* Test if the element exists in BloomFilter
*/
- bool testBytes(const char * data, int64_t length) const override;
+ bool testBytes(const char* data, int64_t length) const override;
bool testLong(int64_t data) const override;
bool testDouble(double data) const override;
@@ -160,7 +160,7 @@ namespace orc {
bool operator==(const BloomFilterImpl& other) const;
- private:
+ private:
friend struct BloomFilterUTF8Utils;
friend class TestBloomFilter_testBloomFilterBasicOperations_Test;
@@ -172,7 +172,7 @@ namespace orc {
void serialize(proto::BloomFilter& bloomFilter) const;
- private:
+ private:
static constexpr double DEFAULT_FPP = 0.05;
uint64_t mNumBits;
int32_t mNumHashFunctions;
@@ -186,25 +186,24 @@ namespace orc {
}
// deserialize BloomFilter from protobuf
- static std::unique_ptr<BloomFilter>
- deserialize(const proto::Stream_Kind& streamKind,
- const proto::ColumnEncoding& columnEncoding,
- const proto::BloomFilter& bloomFilter);
+ static std::unique_ptr<BloomFilter> deserialize(const proto::Stream_Kind& streamKind,
+ const proto::ColumnEncoding& columnEncoding,
+ const proto::BloomFilter& bloomFilter);
};
// Thomas Wang's integer hash function
// http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
// Put this in header file so tests can use it as well.
inline int64_t getLongHash(int64_t key) {
- key = (~key) + (key << 21); // key = (key << 21) - key - 1;
+ key = (~key) + (key << 21); // key = (key << 21) - key - 1;
key = key ^ (key >> 24);
- key = (key + (key << 3)) + (key << 8); // key * 265
+ key = (key + (key << 3)) + (key << 8); // key * 265
key = key ^ (key >> 14);
- key = (key + (key << 2)) + (key << 4); // key * 21
+ key = (key + (key << 2)) + (key << 4); // key * 21
key = key ^ (key >> 28);
key = key + (key << 31);
return key;
}
-}
+} // namespace orc
-#endif //ORC_BLOOMFILTER_IMPL_HH
+#endif // ORC_BLOOMFILTER_IMPL_HH
diff --git a/contrib/libs/apache/orc/c++/src/Bpacking.hh b/contrib/libs/apache/orc/c++/src/Bpacking.hh
new file mode 100644
index 0000000000..f55e986d8d
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/Bpacking.hh
@@ -0,0 +1,34 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_BPACKING_HH
+#define ORC_BPACKING_HH
+
+#include <cstdint>
+
+namespace orc {
+ class RleDecoderV2;
+
+ /**
+  * Declares the static bit-unpacking entry point that works on an RleDecoderV2.
+  * Only the declaration lives in this header; no definition is visible here.
+  */
+ class BitUnpack {
+ public:
+ // Unpack `len` values from `decoder` into `data`, starting at index `offset`.
+ // NOTE(review): `fbs` looks like the per-value bit width -- confirm against the implementation.
+ static void readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len,
+ uint64_t fbs);
+ };
+} // namespace orc
+
+#endif
diff --git a/contrib/libs/apache/orc/c++/src/BpackingDefault.cc b/contrib/libs/apache/orc/c++/src/BpackingDefault.cc
new file mode 100644
index 0000000000..5a80bc6fb1
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/BpackingDefault.cc
@@ -0,0 +1,368 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BpackingDefault.hh"
+#include "RLEv2.hh"
+#include "Utils.hh"
+
+namespace orc {
+
+ // Keep a pointer to the decoder whose buffer and bit state the unpack routines use.
+ UnpackDefault::UnpackDefault(RleDecoderV2* dec) : decoder{dec} {}
+
+ // Nothing to release: the decoder pointer is never deleted by this class.
+ UnpackDefault::~UnpackDefault() = default;
+
+ /**
+  * Unpack `len` 4-bit values into data[offset .. offset + len).
+  * Every source byte carries two values: the high nibble first, then the low nibble.
+  */
+ void UnpackDefault::unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len) {
+   const uint64_t end = offset + len;
+   uint64_t pos = offset;
+   while (pos < end) {
+     // Drain the nibbles still pending in the decoder's current byte
+     // (bitsLeft can only be 0, 4, or 8 here).
+     while (decoder->getBitsLeft() > 0 && pos < end) {
+       const auto remaining = decoder->getBitsLeft() - 4;
+       decoder->setBitsLeft(remaining);
+       data[pos++] = (decoder->getCurByte() >> remaining) & 15;
+     }
+     if (pos == end) {
+       return;
+     }
+
+     // Bulk path: one buffered byte yields two output values.
+     const uint64_t pairs =
+         std::min((end - pos) / 2, static_cast<uint64_t>(decoder->bufLength()));
+     // Walk a local pointer and publish it to the decoder once, after the loop.
+     auto* src = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+     for (const auto* stop = src + pairs; src != stop; ++src) {
+       const uint32_t packed = *src;
+       data[pos++] = (packed >> 4) & 15;
+       data[pos++] = packed & 15;
+     }
+     decoder->setBufStart(reinterpret_cast<char*>(src));
+     if (pos == end) {
+       return;
+     }
+
+     // Fetch the next byte through readByte() and mark all 8 of its bits as pending.
+     decoder->setCurByte(decoder->readByte());
+     decoder->setBitsLeft(8);
+   }
+ }
+
+ /**
+  * Unpack `len` 8-bit values into data[offset .. offset + len).
+  * Each source byte becomes one output value.
+  */
+ void UnpackDefault::unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len) {
+   const uint64_t end = offset + len;
+   uint64_t curIdx = offset;
+   while (curIdx < end) {
+     // Fast path: copy as many bytes as the current buffer holds, capped at what is still needed.
+     int64_t bufferNum = decoder->bufLength();
+     bufferNum = std::min(bufferNum, static_cast<int64_t>(end - curIdx));
+     // Work on a local pointer and publish it to the decoder once, after the loop.
+     auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+     // The counter is 64-bit to match bufferNum; an `int` counter would overflow
+     // if the bound ever exceeded INT_MAX.
+     for (int64_t i = 0; i < bufferNum; ++i) {
+       data[curIdx++] = *buffer++;
+     }
+     decoder->setBufStart(reinterpret_cast<char*>(buffer));
+     if (curIdx == end) {
+       return;
+     }
+
+     // Buffer exhausted: take the next byte through readByte().
+     data[curIdx++] = decoder->readByte();
+   }
+ }
+
+ /**
+  * Unpack `len` 16-bit values into data[offset .. offset + len).
+  * Each value is assembled from two consecutive bytes, most significant byte first.
+  */
+ void UnpackDefault::unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len) {
+   const uint64_t end = offset + len;
+   uint64_t curIdx = offset;
+   while (curIdx < end) {
+     // Fast path: consume as many whole 2-byte values as the current buffer holds.
+     int64_t bufferNum = decoder->bufLength() / 2;
+     bufferNum = std::min(bufferNum, static_cast<int64_t>(end - curIdx));
+     uint16_t b0, b1;
+     // Work on a local pointer and publish it to the decoder once, after the loop.
+     auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+     // The counter is 64-bit to match bufferNum; an `int` counter would overflow
+     // if the bound ever exceeded INT_MAX.
+     for (int64_t i = 0; i < bufferNum; ++i) {
+       b0 = static_cast<uint16_t>(buffer[0]);
+       b1 = static_cast<uint16_t>(buffer[1]);
+       buffer += 2;
+       data[curIdx++] = (b0 << 8) | b1;
+     }
+     decoder->setBufStart(reinterpret_cast<char*>(buffer));
+     if (curIdx == end) {
+       return;
+     }
+
+     // Fewer than 2 bytes are buffered: assemble one value byte by byte via readByte().
+     b0 = decoder->readByte();
+     b1 = decoder->readByte();
+     data[curIdx++] = (b0 << 8) | b1;
+   }
+ }
+
+ /**
+  * Unpack `len` 24-bit values into data[offset .. offset + len).
+  * Each value is assembled from three consecutive bytes, most significant byte first.
+  */
+ void UnpackDefault::unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len) {
+   const uint64_t end = offset + len;
+   uint64_t curIdx = offset;
+   while (curIdx < end) {
+     // Fast path: consume as many whole 3-byte values as the current buffer holds.
+     int64_t bufferNum = decoder->bufLength() / 3;
+     bufferNum = std::min(bufferNum, static_cast<int64_t>(end - curIdx));
+     uint32_t b0, b1, b2;
+     // Work on a local pointer and publish it to the decoder once, after the loop.
+     auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+     // The counter is 64-bit to match bufferNum; an `int` counter would overflow
+     // if the bound ever exceeded INT_MAX.
+     for (int64_t i = 0; i < bufferNum; ++i) {
+       b0 = static_cast<uint32_t>(buffer[0]);
+       b1 = static_cast<uint32_t>(buffer[1]);
+       b2 = static_cast<uint32_t>(buffer[2]);
+       buffer += 3;
+       data[curIdx++] = static_cast<int64_t>((b0 << 16) | (b1 << 8) | b2);
+     }
+     decoder->setBufStart(reinterpret_cast<char*>(buffer));
+     if (curIdx == end) {
+       return;
+     }
+
+     // Fewer than 3 bytes are buffered: assemble one value byte by byte via readByte().
+     b0 = decoder->readByte();
+     b1 = decoder->readByte();
+     b2 = decoder->readByte();
+     data[curIdx++] = static_cast<int64_t>((b0 << 16) | (b1 << 8) | b2);
+   }
+ }
+
+ /**
+  * Unpack `len` 32-bit values into data[offset .. offset + len).
+  * Each value is assembled from four consecutive bytes, most significant byte first.
+  */
+ void UnpackDefault::unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len) {
+   const uint64_t end = offset + len;
+   uint64_t curIdx = offset;
+   while (curIdx < end) {
+     // Fast path: consume as many whole 4-byte values as the current buffer holds.
+     int64_t bufferNum = decoder->bufLength() / 4;
+     bufferNum = std::min(bufferNum, static_cast<int64_t>(end - curIdx));
+     uint32_t b0, b1, b2, b3;
+     // Work on a local pointer and publish it to the decoder once, after the loop.
+     auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+     // The counter is 64-bit to match bufferNum; an `int` counter would overflow
+     // if the bound ever exceeded INT_MAX.
+     for (int64_t i = 0; i < bufferNum; ++i) {
+       b0 = static_cast<uint32_t>(buffer[0]);
+       b1 = static_cast<uint32_t>(buffer[1]);
+       b2 = static_cast<uint32_t>(buffer[2]);
+       b3 = static_cast<uint32_t>(buffer[3]);
+       buffer += 4;
+       data[curIdx++] = static_cast<int64_t>((b0 << 24) | (b1 << 16) | (b2 << 8) | b3);
+     }
+     decoder->setBufStart(reinterpret_cast<char*>(buffer));
+     if (curIdx == end) {
+       return;
+     }
+
+     // Fewer than 4 bytes are buffered: assemble one value byte by byte via readByte().
+     b0 = decoder->readByte();
+     b1 = decoder->readByte();
+     b2 = decoder->readByte();
+     b3 = decoder->readByte();
+     data[curIdx++] = static_cast<int64_t>((b0 << 24) | (b1 << 16) | (b2 << 8) | b3);
+   }
+ }
+
+ /**
+  * Unpack `len` 40-bit values into data[offset .. offset + len).
+  * Each value is assembled from five consecutive bytes, most significant byte first.
+  */
+ void UnpackDefault::unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len) {
+   const uint64_t end = offset + len;
+   uint64_t curIdx = offset;
+   while (curIdx < end) {
+     // Fast path: consume as many whole 5-byte values as the current buffer holds.
+     int64_t bufferNum = decoder->bufLength() / 5;
+     bufferNum = std::min(bufferNum, static_cast<int64_t>(end - curIdx));
+     // 64-bit temporaries so that shifts of 32 bits are well defined.
+     uint64_t b0, b1, b2, b3, b4;
+     // Work on a local pointer and publish it to the decoder once, after the loop.
+     auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+     // The counter is 64-bit to match bufferNum; an `int` counter would overflow
+     // if the bound ever exceeded INT_MAX.
+     for (int64_t i = 0; i < bufferNum; ++i) {
+       b0 = buffer[0];
+       b1 = buffer[1];
+       b2 = buffer[2];
+       b3 = buffer[3];
+       b4 = buffer[4];
+       buffer += 5;
+       data[curIdx++] =
+           static_cast<int64_t>((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
+     }
+     decoder->setBufStart(reinterpret_cast<char*>(buffer));
+     if (curIdx == end) {
+       return;
+     }
+
+     // Fewer than 5 bytes are buffered: assemble one value byte by byte via readByte().
+     b0 = decoder->readByte();
+     b1 = decoder->readByte();
+     b2 = decoder->readByte();
+     b3 = decoder->readByte();
+     b4 = decoder->readByte();
+     data[curIdx++] = static_cast<int64_t>((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
+   }
+ }
+
+ /**
+  * Unpack `len` 48-bit values into data[offset .. offset + len).
+  * Each value is assembled from six consecutive bytes, most significant byte first.
+  */
+ void UnpackDefault::unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len) {
+   const uint64_t end = offset + len;
+   uint64_t curIdx = offset;
+   while (curIdx < end) {
+     // Fast path: consume as many whole 6-byte values as the current buffer holds.
+     int64_t bufferNum = decoder->bufLength() / 6;
+     bufferNum = std::min(bufferNum, static_cast<int64_t>(end - curIdx));
+     // 64-bit temporaries so that shifts of 32 bits and more are well defined.
+     uint64_t b0, b1, b2, b3, b4, b5;
+     // Work on a local pointer and publish it to the decoder once, after the loop.
+     auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+     // The counter is 64-bit to match bufferNum; an `int` counter would overflow
+     // if the bound ever exceeded INT_MAX.
+     for (int64_t i = 0; i < bufferNum; ++i) {
+       b0 = buffer[0];
+       b1 = buffer[1];
+       b2 = buffer[2];
+       b3 = buffer[3];
+       b4 = buffer[4];
+       b5 = buffer[5];
+       buffer += 6;
+       data[curIdx++] = static_cast<int64_t>((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) |
+                                             (b4 << 8) | b5);
+     }
+     decoder->setBufStart(reinterpret_cast<char*>(buffer));
+     if (curIdx == end) {
+       return;
+     }
+
+     // Fewer than 6 bytes are buffered: assemble one value byte by byte via readByte().
+     b0 = decoder->readByte();
+     b1 = decoder->readByte();
+     b2 = decoder->readByte();
+     b3 = decoder->readByte();
+     b4 = decoder->readByte();
+     b5 = decoder->readByte();
+     data[curIdx++] =
+         static_cast<int64_t>((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5);
+   }
+ }
+
+ /**
+  * Unpack `len` 56-bit values into data[offset .. offset + len).
+  * Each value is assembled from seven consecutive bytes, most significant byte first.
+  */
+ void UnpackDefault::unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len) {
+   const uint64_t end = offset + len;
+   uint64_t curIdx = offset;
+   while (curIdx < end) {
+     // Fast path: consume as many whole 7-byte values as the current buffer holds.
+     int64_t bufferNum = decoder->bufLength() / 7;
+     bufferNum = std::min(bufferNum, static_cast<int64_t>(end - curIdx));
+     // 64-bit temporaries so that shifts of 32 bits and more are well defined.
+     uint64_t b0, b1, b2, b3, b4, b5, b6;
+     // Work on a local pointer and publish it to the decoder once, after the loop.
+     auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+     // The counter is 64-bit to match bufferNum; an `int` counter would overflow
+     // if the bound ever exceeded INT_MAX.
+     for (int64_t i = 0; i < bufferNum; ++i) {
+       b0 = buffer[0];
+       b1 = buffer[1];
+       b2 = buffer[2];
+       b3 = buffer[3];
+       b4 = buffer[4];
+       b5 = buffer[5];
+       b6 = buffer[6];
+       buffer += 7;
+       data[curIdx++] = static_cast<int64_t>((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) |
+                                             (b4 << 16) | (b5 << 8) | b6);
+     }
+     decoder->setBufStart(reinterpret_cast<char*>(buffer));
+     if (curIdx == end) {
+       return;
+     }
+
+     // Fewer than 7 bytes are buffered: assemble one value byte by byte via readByte().
+     b0 = decoder->readByte();
+     b1 = decoder->readByte();
+     b2 = decoder->readByte();
+     b3 = decoder->readByte();
+     b4 = decoder->readByte();
+     b5 = decoder->readByte();
+     b6 = decoder->readByte();
+     data[curIdx++] = static_cast<int64_t>((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) |
+                                           (b4 << 16) | (b5 << 8) | b6);
+   }
+ }
+
+ /**
+  * Unpack `len` 64-bit values into data[offset .. offset + len).
+  * Each value is assembled from eight consecutive bytes, most significant byte first.
+  */
+ void UnpackDefault::unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len) {
+   const uint64_t end = offset + len;
+   uint64_t curIdx = offset;
+   while (curIdx < end) {
+     // Fast path: consume as many whole 8-byte values as the current buffer holds.
+     int64_t bufferNum = decoder->bufLength() / 8;
+     bufferNum = std::min(bufferNum, static_cast<int64_t>(end - curIdx));
+     // 64-bit temporaries so that shifts of 32 bits and more are well defined.
+     uint64_t b0, b1, b2, b3, b4, b5, b6, b7;
+     // Work on a local pointer and publish it to the decoder once, after the loop.
+     auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart());
+     // The counter is 64-bit to match bufferNum; an `int` counter would overflow
+     // if the bound ever exceeded INT_MAX.
+     for (int64_t i = 0; i < bufferNum; ++i) {
+       b0 = buffer[0];
+       b1 = buffer[1];
+       b2 = buffer[2];
+       b3 = buffer[3];
+       b4 = buffer[4];
+       b5 = buffer[5];
+       b6 = buffer[6];
+       b7 = buffer[7];
+       buffer += 8;
+       data[curIdx++] = static_cast<int64_t>((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) |
+                                             (b4 << 24) | (b5 << 16) | (b6 << 8) | b7);
+     }
+     decoder->setBufStart(reinterpret_cast<char*>(buffer));
+     if (curIdx == end) {
+       return;
+     }
+
+     // Fewer than 8 bytes are buffered: assemble one value byte by byte via readByte().
+     b0 = decoder->readByte();
+     b1 = decoder->readByte();
+     b2 = decoder->readByte();
+     b3 = decoder->readByte();
+     b4 = decoder->readByte();
+     b5 = decoder->readByte();
+     b6 = decoder->readByte();
+     b7 = decoder->readByte();
+     data[curIdx++] = static_cast<int64_t>((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) |
+                                           (b4 << 24) | (b5 << 16) | (b6 << 8) | b7);
+   }
+ }
+
+ /**
+  * Generic bit-by-bit unpacker: reads `len` values of `fbs` bits each into
+  * data[offset .. offset + len), carrying the decoder's current byte and its
+  * count of unread bits from one value to the next.
+  */
+ void UnpackDefault::plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) {
+   const uint64_t end = offset + len;
+   for (uint64_t idx = offset; idx < end; ++idx) {
+     uint64_t value = 0;
+     uint64_t needed = fbs;
+
+     // While the value needs more bits than the current byte still offers,
+     // take everything that is left and move on to the next byte.
+     while (needed > decoder->getBitsLeft()) {
+       const auto avail = decoder->getBitsLeft();
+       value <<= avail;
+       value |= decoder->getCurByte() & ((1 << avail) - 1);
+       needed -= avail;
+       decoder->setCurByte(decoder->readByte());
+       decoder->setBitsLeft(8);
+     }
+
+     // Take the remaining bits from the top of what is left in the current byte.
+     if (needed > 0) {
+       value <<= needed;
+       const auto keep = decoder->getBitsLeft() - static_cast<uint32_t>(needed);
+       decoder->setBitsLeft(keep);
+       value |= (decoder->getCurByte() >> keep) & ((1 << needed) - 1);
+     }
+     data[idx] = static_cast<int64_t>(value);
+   }
+ }
+
+ /**
+  * Unpack `len` values of `fbs` bits each from `decoder` into
+  * data[offset .. offset + len), picking the specialised routine for the
+  * bit width when one exists.
+  */
+ void BitUnpackDefault::readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset,
+                                  uint64_t len, uint64_t fbs) {
+   UnpackDefault unpacker(decoder);
+   switch (fbs) {
+     case 4:
+       return unpacker.unrolledUnpack4(data, offset, len);
+     case 8:
+       return unpacker.unrolledUnpack8(data, offset, len);
+     case 16:
+       return unpacker.unrolledUnpack16(data, offset, len);
+     case 24:
+       return unpacker.unrolledUnpack24(data, offset, len);
+     case 32:
+       return unpacker.unrolledUnpack32(data, offset, len);
+     case 40:
+       return unpacker.unrolledUnpack40(data, offset, len);
+     case 48:
+       return unpacker.unrolledUnpack48(data, offset, len);
+     case 56:
+       return unpacker.unrolledUnpack56(data, offset, len);
+     case 64:
+       return unpacker.unrolledUnpack64(data, offset, len);
+     default:
+       // Any other width (deprecated bit sizes) goes through the generic bit-by-bit reader.
+       return unpacker.plainUnpackLongs(data, offset, len, fbs);
+   }
+ }
+
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/BpackingDefault.hh b/contrib/libs/apache/orc/c++/src/BpackingDefault.hh
new file mode 100644
index 0000000000..0a58234495
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/BpackingDefault.hh
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_BPACKINGDEFAULT_HH
+#define ORC_BPACKINGDEFAULT_HH
+
+#include <cstdint>
+#include <cstdlib>
+
+#include "Bpacking.hh"
+
+namespace orc {
+ class RleDecoderV2;
+
+ /**
+  * Default bit-unpacking routines that read packed values through an
+  * RleDecoderV2 and store them into an int64_t array.
+  *
+  * The decoder pointer is only stored; this class never deletes it.
+  */
+ class UnpackDefault {
+  public:
+   // `explicit` stops a bare RleDecoderV2* from converting silently into an UnpackDefault.
+   explicit UnpackDefault(RleDecoderV2* dec);
+   ~UnpackDefault();
+
+   // Specialised readers, one per supported bit width. Each writes `len`
+   // values into data[offset .. offset + len).
+   void unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len);
+   void unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len);
+   void unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len);
+   void unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len);
+   void unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len);
+   void unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len);
+   void unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len);
+   void unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len);
+   void unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len);
+
+   // Generic reader for any width: `fbs` is the number of bits per value.
+   void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs);
+
+  private:
+   RleDecoderV2* decoder;  // not owned
+ };
+
+ /**
+  * Derives from BitUnpack and declares a static readLongs with the same
+  * parameter list as the one in the base class.
+  */
+ class BitUnpackDefault : public BitUnpack {
+ public:
+ // Unpack `len` values from `decoder` into `data`, starting at index `offset`.
+ // NOTE(review): `fbs` looks like the per-value bit width -- confirm against the implementation.
+ static void readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len,
+ uint64_t fbs);
+ };
+
+} // namespace orc
+
+#endif
diff --git a/contrib/libs/apache/orc/c++/src/ByteRLE.cc b/contrib/libs/apache/orc/c++/src/ByteRLE.cc
index 1c4a645167..b81d282e35 100644
--- a/contrib/libs/apache/orc/c++/src/ByteRLE.cc
+++ b/contrib/libs/apache/orc/c++/src/ByteRLE.cc
@@ -16,12 +16,13 @@
* limitations under the License.
*/
+#include <string.h>
#include <algorithm>
#include <iostream>
-#include <string.h>
#include <utility>
#include "ByteRLE.hh"
+#include "Utils.hh"
#include "orc/Exceptions.hh"
namespace orc {
@@ -35,7 +36,7 @@ namespace orc {
}
class ByteRleEncoderImpl : public ByteRleEncoder {
- public:
+ public:
ByteRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output);
virtual ~ByteRleEncoderImpl() override;
@@ -46,8 +47,7 @@ namespace orc {
* @param notNull If the pointer is null, all values are read. If the
* pointer is not null, positions that are false are skipped.
*/
- virtual void add(const char* data, uint64_t numValues,
- const char* notNull) override;
+ virtual void add(const char* data, uint64_t numValues, const char* notNull) override;
/**
* Get size of buffer used so far.
@@ -68,7 +68,7 @@ namespace orc {
*/
void reset();
- protected:
+ protected:
std::unique_ptr<BufferedOutputStream> outputStream;
char* literals;
int numLiterals;
@@ -83,22 +83,21 @@ namespace orc {
void write(char c);
};
- ByteRleEncoderImpl::ByteRleEncoderImpl(
- std::unique_ptr<BufferedOutputStream> output)
- : outputStream(std::move(output)) {
+ ByteRleEncoderImpl::ByteRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output)
+ : outputStream(std::move(output)) {
literals = new char[MAX_LITERAL_SIZE];
reset();
}
ByteRleEncoderImpl::~ByteRleEncoderImpl() {
// PASS
- delete [] literals;
+ delete[] literals;
}
void ByteRleEncoderImpl::writeByte(char c) {
if (bufferPosition == bufferLength) {
int addedSize = 0;
- if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) {
+ if (!outputStream->Next(reinterpret_cast<void**>(&buffer), &addedSize)) {
throw std::bad_alloc();
}
bufferPosition = 0;
@@ -107,10 +106,7 @@ namespace orc {
buffer[bufferPosition++] = c;
}
- void ByteRleEncoderImpl::add(
- const char* data,
- uint64_t numValues,
- const char* notNull) {
+ void ByteRleEncoderImpl::add(const char* data, uint64_t numValues, const char* notNull) {
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
write(data[i]);
@@ -121,8 +117,7 @@ namespace orc {
void ByteRleEncoderImpl::writeValues() {
if (numLiterals != 0) {
if (repeat) {
- writeByte(
- static_cast<char>(numLiterals - static_cast<int>(MINIMUM_REPEAT)));
+ writeByte(static_cast<char>(numLiterals - static_cast<int>(MINIMUM_REPEAT)));
writeByte(literals[0]);
} else {
writeByte(static_cast<char>(-numLiterals));
@@ -189,7 +184,7 @@ namespace orc {
return outputStream->getSize();
}
- void ByteRleEncoderImpl::recordPosition(PositionRecorder *recorder) const {
+ void ByteRleEncoderImpl::recordPosition(PositionRecorder* recorder) const {
uint64_t flushedSize = outputStream->getSize();
uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition);
if (outputStream->isCompressed()) {
@@ -220,14 +215,13 @@ namespace orc {
reset();
}
- std::unique_ptr<ByteRleEncoder> createByteRleEncoder
- (std::unique_ptr<BufferedOutputStream> output) {
- return std::unique_ptr<ByteRleEncoder>(new ByteRleEncoderImpl
- (std::move(output)));
+ std::unique_ptr<ByteRleEncoder> createByteRleEncoder(
+ std::unique_ptr<BufferedOutputStream> output) {
+ return std::make_unique<ByteRleEncoderImpl>(std::move(output));
}
class BooleanRleEncoderImpl : public ByteRleEncoderImpl {
- public:
+ public:
BooleanRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output);
virtual ~BooleanRleEncoderImpl() override;
@@ -238,8 +232,7 @@ namespace orc {
* @param notNull If the pointer is null, all values are read. If the
* pointer is not null, positions that are false are skipped.
*/
- virtual void add(const char* data, uint64_t numValues,
- const char* notNull) override;
+ virtual void add(const char* data, uint64_t numValues, const char* notNull) override;
/**
* Flushing underlying BufferedOutputStream
@@ -248,15 +241,15 @@ namespace orc {
virtual void recordPosition(PositionRecorder* recorder) const override;
- private:
+ virtual void suppress() override;
+
+ private:
int bitsRemained;
char current;
-
};
- BooleanRleEncoderImpl::BooleanRleEncoderImpl(
- std::unique_ptr<BufferedOutputStream> output)
- : ByteRleEncoderImpl(std::move(output)) {
+ BooleanRleEncoderImpl::BooleanRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output)
+ : ByteRleEncoderImpl(std::move(output)) {
bitsRemained = 8;
current = static_cast<char>(0);
}
@@ -265,10 +258,7 @@ namespace orc {
// PASS
}
- void BooleanRleEncoderImpl::add(
- const char* data,
- uint64_t numValues,
- const char* notNull) {
+ void BooleanRleEncoderImpl::add(const char* data, uint64_t numValues, const char* notNull) {
for (uint64_t i = 0; i < numValues; ++i) {
if (bitsRemained == 0) {
write(current);
@@ -277,8 +267,7 @@ namespace orc {
}
if (!notNull || notNull[i]) {
if (!data || data[i]) {
- current =
- static_cast<char>(current | (0x80 >> (8 - bitsRemained)));
+ current = static_cast<char>(current | (0x80 >> (8 - bitsRemained)));
}
--bitsRemained;
}
@@ -304,43 +293,49 @@ namespace orc {
recorder->add(static_cast<uint64_t>(8 - bitsRemained));
}
- std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder
- (std::unique_ptr<BufferedOutputStream> output) {
- BooleanRleEncoderImpl* encoder =
- new BooleanRleEncoderImpl(std::move(output)) ;
- return std::unique_ptr<ByteRleEncoder>(
- reinterpret_cast<ByteRleEncoder*>(encoder));
+ void BooleanRleEncoderImpl::suppress() {
+ ByteRleEncoderImpl::suppress();
+ bitsRemained = 8;
+ current = static_cast<char>(0);
+ }
+
+ std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder(
+ std::unique_ptr<BufferedOutputStream> output) {
+ BooleanRleEncoderImpl* encoder = new BooleanRleEncoderImpl(std::move(output));
+ return std::unique_ptr<ByteRleEncoder>(reinterpret_cast<ByteRleEncoder*>(encoder));
}
ByteRleDecoder::~ByteRleDecoder() {
// PASS
}
- class ByteRleDecoderImpl: public ByteRleDecoder {
- public:
- ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input);
+ class ByteRleDecoderImpl : public ByteRleDecoder {
+ public:
+ ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input, ReaderMetrics* metrics);
- virtual ~ByteRleDecoderImpl();
+ ~ByteRleDecoderImpl() override;
/**
* Seek to a particular spot.
*/
- virtual void seek(PositionProvider&);
+ virtual void seek(PositionProvider&) override;
/**
* Seek over a given number of values.
*/
- virtual void skip(uint64_t numValues);
+ virtual void skip(uint64_t numValues) override;
/**
* Read a number of values into the batch.
*/
- virtual void next(char* data, uint64_t numValues, char* notNull);
+ virtual void next(char* data, uint64_t numValues, char* notNull) override;
- protected:
+ protected:
+ void nextInternal(char* data, uint64_t numValues, char* notNull);
inline void nextBuffer();
inline signed char readByte();
inline void readHeader();
+ inline void reset();
std::unique_ptr<SeekableInputStream> inputStream;
size_t remainingValues;
@@ -348,9 +343,11 @@ namespace orc {
const char* bufferStart;
const char* bufferEnd;
bool repeating;
+ ReaderMetrics* metrics;
};
void ByteRleDecoderImpl::nextBuffer() {
+ SCOPED_MINUS_STOPWATCH(metrics, ByteDecodingLatencyUs);
int bufferLength;
const void* bufferPointer;
bool result = inputStream->Next(&bufferPointer, &bufferLength);
@@ -365,7 +362,7 @@ namespace orc {
if (bufferStart == bufferEnd) {
nextBuffer();
}
- return *(bufferStart++);
+ return static_cast<signed char>(*(bufferStart++));
}
void ByteRleDecoderImpl::readHeader() {
@@ -376,13 +373,11 @@ namespace orc {
} else {
remainingValues = static_cast<size_t>(ch) + MINIMUM_REPEAT;
repeating = true;
- value = readByte();
+ value = static_cast<char>(readByte());
}
}
- ByteRleDecoderImpl::ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream>
- input) {
- inputStream = std::move(input);
+ void ByteRleDecoderImpl::reset() {
repeating = false;
remainingValues = 0;
value = 0;
@@ -390,6 +385,13 @@ namespace orc {
bufferEnd = nullptr;
}
+ ByteRleDecoderImpl::ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input,
+ ReaderMetrics* _metrics)
+ : metrics(_metrics) {
+ inputStream = std::move(input);
+ reset();
+ }
+
ByteRleDecoderImpl::~ByteRleDecoderImpl() {
// PASS
}
@@ -397,15 +399,14 @@ namespace orc {
void ByteRleDecoderImpl::seek(PositionProvider& location) {
// move the input stream
inputStream->seek(location);
- // force a re-read from the stream
- bufferEnd = bufferStart;
- // read a new header
- readHeader();
+ // reset the decoder status and lazily call readHeader()
+ reset();
// skip ahead the given number of records
ByteRleDecoderImpl::skip(location.next());
}
void ByteRleDecoderImpl::skip(uint64_t numValues) {
+ SCOPED_STOPWATCH(metrics, ByteDecodingLatencyUs, ByteDecodingCall);
while (numValues > 0) {
if (remainingValues == 0) {
readHeader();
@@ -422,8 +423,7 @@ namespace orc {
nextBuffer();
}
size_t skipSize = std::min(static_cast<size_t>(consumedBytes),
- static_cast<size_t>(bufferEnd -
- bufferStart));
+ static_cast<size_t>(bufferEnd - bufferStart));
bufferStart += skipSize;
consumedBytes -= skipSize;
}
@@ -431,8 +431,12 @@ namespace orc {
}
}
- void ByteRleDecoderImpl::next(char* data, uint64_t numValues,
- char* notNull) {
+ void ByteRleDecoderImpl::next(char* data, uint64_t numValues, char* notNull) {
+ SCOPED_STOPWATCH(metrics, ByteDecodingLatencyUs, ByteDecodingCall);
+ nextInternal(data, numValues, notNull);
+ }
+
+ void ByteRleDecoderImpl::nextInternal(char* data, uint64_t numValues, char* notNull) {
uint64_t position = 0;
// skip over null values
while (notNull && position < numValues && !notNull[position]) {
@@ -444,12 +448,11 @@ namespace orc {
readHeader();
}
// how many do we read out of this block?
- size_t count = std::min(static_cast<size_t>(numValues - position),
- remainingValues);
+ size_t count = std::min(static_cast<size_t>(numValues - position), remainingValues);
uint64_t consumed = 0;
if (repeating) {
if (notNull) {
- for(uint64_t i=0; i < count; ++i) {
+ for (uint64_t i = 0; i < count; ++i) {
if (notNull[position + i]) {
data[position + i] = value;
consumed += 1;
@@ -461,9 +464,9 @@ namespace orc {
}
} else {
if (notNull) {
- for(uint64_t i=0; i < count; ++i) {
+ for (uint64_t i = 0; i < count; ++i) {
if (notNull[position + i]) {
- data[position + i] = readByte();
+ data[position + i] = static_cast<char>(readByte());
consumed += 1;
}
}
@@ -473,9 +476,8 @@ namespace orc {
if (bufferStart == bufferEnd) {
nextBuffer();
}
- uint64_t copyBytes =
- std::min(static_cast<uint64_t>(count - i),
- static_cast<uint64_t>(bufferEnd - bufferStart));
+ uint64_t copyBytes = std::min(static_cast<uint64_t>(count - i),
+ static_cast<uint64_t>(bufferEnd - bufferStart));
memcpy(data + position + i, bufferStart, copyBytes);
bufferStart += copyBytes;
i += copyBytes;
@@ -492,41 +494,40 @@ namespace orc {
}
}
- std::unique_ptr<ByteRleDecoder> createByteRleDecoder
- (std::unique_ptr<SeekableInputStream> input) {
- return std::unique_ptr<ByteRleDecoder>(new ByteRleDecoderImpl
- (std::move(input)));
+ std::unique_ptr<ByteRleDecoder> createByteRleDecoder(std::unique_ptr<SeekableInputStream> input,
+ ReaderMetrics* metrics) {
+ return std::make_unique<ByteRleDecoderImpl>(std::move(input), metrics);
}
- class BooleanRleDecoderImpl: public ByteRleDecoderImpl {
- public:
- BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input);
+ class BooleanRleDecoderImpl : public ByteRleDecoderImpl {
+ public:
+ BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input, ReaderMetrics* metrics);
- virtual ~BooleanRleDecoderImpl();
+ ~BooleanRleDecoderImpl() override;
/**
* Seek to a particular spot.
*/
- virtual void seek(PositionProvider&);
+ virtual void seek(PositionProvider&) override;
/**
* Seek over a given number of values.
*/
- virtual void skip(uint64_t numValues);
+ virtual void skip(uint64_t numValues) override;
/**
* Read a number of values into the batch.
*/
- virtual void next(char* data, uint64_t numValues, char* notNull);
+ virtual void next(char* data, uint64_t numValues, char* notNull) override;
- protected:
+ protected:
size_t remainingBits;
char lastByte;
};
- BooleanRleDecoderImpl::BooleanRleDecoderImpl
- (std::unique_ptr<SeekableInputStream> input
- ): ByteRleDecoderImpl(std::move(input)) {
+ BooleanRleDecoderImpl::BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input,
+ ReaderMetrics* _metrics)
+ : ByteRleDecoderImpl(std::move(input), _metrics) {
remainingBits = 0;
lastByte = 0;
}
@@ -564,35 +565,33 @@ namespace orc {
}
}
- void BooleanRleDecoderImpl::next(char* data, uint64_t numValues,
- char* notNull) {
+ void BooleanRleDecoderImpl::next(char* data, uint64_t numValues, char* notNull) {
+ SCOPED_STOPWATCH(metrics, ByteDecodingLatencyUs, ByteDecodingCall);
// next spot to fill in
uint64_t position = 0;
// use up any remaining bits
if (notNull) {
- while(remainingBits > 0 && position < numValues) {
+ while (remainingBits > 0 && position < numValues) {
if (notNull[position]) {
remainingBits -= 1;
- data[position] = (static_cast<unsigned char>(lastByte) >>
- remainingBits) & 0x1;
+ data[position] = (static_cast<unsigned char>(lastByte) >> remainingBits) & 0x1;
} else {
data[position] = 0;
}
position += 1;
}
} else {
- while(remainingBits > 0 && position < numValues) {
+ while (remainingBits > 0 && position < numValues) {
remainingBits -= 1;
- data[position++] = (static_cast<unsigned char>(lastByte) >>
- remainingBits) & 0x1;
+ data[position++] = (static_cast<unsigned char>(lastByte) >> remainingBits) & 0x1;
}
}
// count the number of nonNulls remaining
uint64_t nonNulls = numValues - position;
if (notNull) {
- for(uint64_t i=position; i < numValues; ++i) {
+ for (uint64_t i = position; i < numValues; ++i) {
if (!notNull[i]) {
nonNulls -= 1;
}
@@ -607,14 +606,14 @@ namespace orc {
} else if (position < numValues) {
// read the new bytes into the array
uint64_t bytesRead = (nonNulls + 7) / 8;
- ByteRleDecoderImpl::next(data + position, bytesRead, nullptr);
+ ByteRleDecoderImpl::nextInternal(data + position, bytesRead, nullptr);
lastByte = data[position + bytesRead - 1];
remainingBits = bytesRead * 8 - nonNulls;
// expand the array backwards so that we don't clobber the data
uint64_t bitsLeft = bytesRead * 8 - remainingBits;
if (notNull) {
- for(int64_t i=static_cast<int64_t>(numValues) - 1;
- i >= static_cast<int64_t>(position); --i) {
+ for (int64_t i = static_cast<int64_t>(numValues) - 1; i >= static_cast<int64_t>(position);
+ --i) {
if (notNull[i]) {
uint64_t shiftPosn = (-bitsLeft) % 8;
data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1;
@@ -624,8 +623,8 @@ namespace orc {
}
}
} else {
- for(int64_t i=static_cast<int64_t>(numValues) - 1;
- i >= static_cast<int64_t>(position); --i, --bitsLeft) {
+ for (int64_t i = static_cast<int64_t>(numValues) - 1; i >= static_cast<int64_t>(position);
+ --i, --bitsLeft) {
uint64_t shiftPosn = (-bitsLeft) % 8;
data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1;
}
@@ -633,11 +632,8 @@ namespace orc {
}
}
- std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder
- (std::unique_ptr<SeekableInputStream> input) {
- BooleanRleDecoderImpl* decoder =
- new BooleanRleDecoderImpl(std::move(input));
- return std::unique_ptr<ByteRleDecoder>(
- reinterpret_cast<ByteRleDecoder*>(decoder));
+ std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder(
+ std::unique_ptr<SeekableInputStream> input, ReaderMetrics* metrics) {
+ return std::make_unique<BooleanRleDecoderImpl>(std::move(input), metrics);
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/ByteRLE.hh b/contrib/libs/apache/orc/c++/src/ByteRLE.hh
index 2f6e2eb4df..bd19f52ecc 100644
--- a/contrib/libs/apache/orc/c++/src/ByteRLE.hh
+++ b/contrib/libs/apache/orc/c++/src/ByteRLE.hh
@@ -27,7 +27,7 @@
namespace orc {
class ByteRleEncoder {
- public:
+ public:
virtual ~ByteRleEncoder();
/**
@@ -37,8 +37,7 @@ namespace orc {
* @param notNull If the pointer is null, all values are read. If the
* pointer is not null, positions that are false are skipped.
*/
- virtual void add(const char* data, uint64_t numValues,
- const char* notNull) = 0;
+ virtual void add(const char* data, uint64_t numValues, const char* notNull) = 0;
/**
* Get size of buffer used so far.
@@ -63,7 +62,7 @@ namespace orc {
};
class ByteRleDecoder {
- public:
+ public:
virtual ~ByteRleDecoder();
/**
@@ -90,22 +89,23 @@ namespace orc {
* Create a byte RLE encoder.
* @param output the output stream to write to
*/
- std::unique_ptr<ByteRleEncoder> createByteRleEncoder
- (std::unique_ptr<BufferedOutputStream> output);
+ std::unique_ptr<ByteRleEncoder> createByteRleEncoder(
+ std::unique_ptr<BufferedOutputStream> output);
/**
* Create a boolean RLE encoder.
* @param output the output stream to write to
*/
- std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder
- (std::unique_ptr<BufferedOutputStream> output);
+ std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder(
+ std::unique_ptr<BufferedOutputStream> output);
/**
* Create a byte RLE decoder.
* @param input the input stream to read from
+ * @param metrics the metrics of the decoder
*/
- std::unique_ptr<ByteRleDecoder> createByteRleDecoder
- (std::unique_ptr<SeekableInputStream> input);
+ std::unique_ptr<ByteRleDecoder> createByteRleDecoder(std::unique_ptr<SeekableInputStream> input,
+ ReaderMetrics* metrics);
/**
* Create a boolean RLE decoder.
@@ -114,9 +114,10 @@ namespace orc {
* if the value is masked by notNull. This is required for the notNull stream
* processing to properly apply multiple masks from nested types.
* @param input the input stream to read from
+ * @param metrics the metrics of the decoder
*/
- std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder
- (std::unique_ptr<SeekableInputStream> input);
-}
+ std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder(
+ std::unique_ptr<SeekableInputStream> input, ReaderMetrics* metrics);
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc
index ab6b690c57..5297f80371 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc
+++ b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc
@@ -21,167 +21,174 @@
#include "Adaptor.hh"
+#include <time.h>
#include <limits>
#include <sstream>
#include <stdexcept>
-#include <time.h>
#include <typeinfo>
#ifdef __clang__
- #pragma clang diagnostic ignored "-Wformat-security"
+#pragma clang diagnostic ignored "-Wformat-security"
#endif
namespace orc {
- class VoidColumnPrinter: public ColumnPrinter {
- public:
+ class VoidColumnPrinter : public ColumnPrinter {
+ public:
VoidColumnPrinter(std::string&);
~VoidColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class BooleanColumnPrinter: public ColumnPrinter {
- private:
+ class BooleanColumnPrinter : public ColumnPrinter {
+ private:
const int64_t* data;
- public:
+
+ public:
BooleanColumnPrinter(std::string&);
~BooleanColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class LongColumnPrinter: public ColumnPrinter {
- private:
+ class LongColumnPrinter : public ColumnPrinter {
+ private:
const int64_t* data;
- public:
+
+ public:
LongColumnPrinter(std::string&);
~LongColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class DoubleColumnPrinter: public ColumnPrinter {
- private:
+ class DoubleColumnPrinter : public ColumnPrinter {
+ private:
const double* data;
const bool isFloat;
- public:
+ public:
DoubleColumnPrinter(std::string&, const Type& type);
virtual ~DoubleColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class TimestampColumnPrinter: public ColumnPrinter {
- private:
+ class TimestampColumnPrinter : public ColumnPrinter {
+ private:
const int64_t* seconds;
const int64_t* nanoseconds;
- public:
+ public:
TimestampColumnPrinter(std::string&);
~TimestampColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class DateColumnPrinter: public ColumnPrinter {
- private:
+ class DateColumnPrinter : public ColumnPrinter {
+ private:
const int64_t* data;
- public:
+ public:
DateColumnPrinter(std::string&);
~DateColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class Decimal64ColumnPrinter: public ColumnPrinter {
- private:
+ class Decimal64ColumnPrinter : public ColumnPrinter {
+ private:
const int64_t* data;
int32_t scale;
- public:
+
+ public:
Decimal64ColumnPrinter(std::string&);
~Decimal64ColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class Decimal128ColumnPrinter: public ColumnPrinter {
- private:
+ class Decimal128ColumnPrinter : public ColumnPrinter {
+ private:
const Int128* data;
int32_t scale;
- public:
+
+ public:
Decimal128ColumnPrinter(std::string&);
~Decimal128ColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class StringColumnPrinter: public ColumnPrinter {
- private:
- const char* const * start;
+ class StringColumnPrinter : public ColumnPrinter {
+ private:
+ const char* const* start;
const int64_t* length;
- public:
+
+ public:
StringColumnPrinter(std::string&);
virtual ~StringColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class BinaryColumnPrinter: public ColumnPrinter {
- private:
- const char* const * start;
+ class BinaryColumnPrinter : public ColumnPrinter {
+ private:
+ const char* const* start;
const int64_t* length;
- public:
+
+ public:
BinaryColumnPrinter(std::string&);
virtual ~BinaryColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class ListColumnPrinter: public ColumnPrinter {
- private:
+ class ListColumnPrinter : public ColumnPrinter {
+ private:
const int64_t* offsets;
std::unique_ptr<ColumnPrinter> elementPrinter;
- public:
+ public:
ListColumnPrinter(std::string&, const Type& type);
virtual ~ListColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class MapColumnPrinter: public ColumnPrinter {
- private:
+ class MapColumnPrinter : public ColumnPrinter {
+ private:
const int64_t* offsets;
std::unique_ptr<ColumnPrinter> keyPrinter;
std::unique_ptr<ColumnPrinter> elementPrinter;
- public:
+ public:
MapColumnPrinter(std::string&, const Type& type);
virtual ~MapColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class UnionColumnPrinter: public ColumnPrinter {
- private:
- const unsigned char *tags;
+ class UnionColumnPrinter : public ColumnPrinter {
+ private:
+ const unsigned char* tags;
const uint64_t* offsets;
std::vector<std::unique_ptr<ColumnPrinter>> fieldPrinter;
- public:
+ public:
UnionColumnPrinter(std::string&, const Type& type);
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class StructColumnPrinter: public ColumnPrinter {
- private:
+ class StructColumnPrinter : public ColumnPrinter {
+ private:
std::vector<std::unique_ptr<ColumnPrinter>> fieldPrinter;
std::vector<std::string> fieldNames;
- public:
+
+ public:
StructColumnPrinter(std::string&, const Type& type);
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
@@ -191,13 +198,12 @@ namespace orc {
file += ch;
}
- void writeString(std::string& file, const char *ptr) {
+ void writeString(std::string& file, const char* ptr) {
size_t len = strlen(ptr);
file.append(ptr, len);
}
- ColumnPrinter::ColumnPrinter(std::string& _buffer
- ): buffer(_buffer) {
+ ColumnPrinter::ColumnPrinter(std::string& _buffer) : buffer(_buffer) {
notNull = nullptr;
hasNulls = false;
}
@@ -211,89 +217,87 @@ namespace orc {
if (hasNulls) {
notNull = batch.notNull.data();
} else {
- notNull = nullptr ;
+ notNull = nullptr;
}
}
- std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer,
- const Type* type) {
- ColumnPrinter *result = nullptr;
+ std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer, const Type* type) {
+ std::unique_ptr<ColumnPrinter> result;
if (type == nullptr) {
- result = new VoidColumnPrinter(buffer);
+ result = std::make_unique<VoidColumnPrinter>(buffer);
} else {
- switch(static_cast<int64_t>(type->getKind())) {
- case BOOLEAN:
- result = new BooleanColumnPrinter(buffer);
- break;
-
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- result = new LongColumnPrinter(buffer);
- break;
-
- case FLOAT:
- case DOUBLE:
- result = new DoubleColumnPrinter(buffer, *type);
- break;
-
- case STRING:
- case VARCHAR :
- case CHAR:
- result = new StringColumnPrinter(buffer);
- break;
-
- case BINARY:
- result = new BinaryColumnPrinter(buffer);
- break;
-
- case TIMESTAMP:
- case TIMESTAMP_INSTANT:
- result = new TimestampColumnPrinter(buffer);
- break;
-
- case LIST:
- result = new ListColumnPrinter(buffer, *type);
- break;
-
- case MAP:
- result = new MapColumnPrinter(buffer, *type);
- break;
-
- case STRUCT:
- result = new StructColumnPrinter(buffer, *type);
- break;
-
- case DECIMAL:
- if (type->getPrecision() == 0 || type->getPrecision() > 18) {
- result = new Decimal128ColumnPrinter(buffer);
- } else {
- result = new Decimal64ColumnPrinter(buffer);
- }
- break;
+ switch (static_cast<int64_t>(type->getKind())) {
+ case BOOLEAN:
+ result = std::make_unique<BooleanColumnPrinter>(buffer);
+ break;
+
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ result = std::make_unique<LongColumnPrinter>(buffer);
+ break;
+
+ case FLOAT:
+ case DOUBLE:
+ result = std::make_unique<DoubleColumnPrinter>(buffer, *type);
+ break;
- case DATE:
- result = new DateColumnPrinter(buffer);
- break;
+ case STRING:
+ case VARCHAR:
+ case CHAR:
+ result = std::make_unique<StringColumnPrinter>(buffer);
+ break;
- case UNION:
- result = new UnionColumnPrinter(buffer, *type);
- break;
+ case BINARY:
+ result = std::make_unique<BinaryColumnPrinter>(buffer);
+ break;
+
+ case TIMESTAMP:
+ case TIMESTAMP_INSTANT:
+ result = std::make_unique<TimestampColumnPrinter>(buffer);
+ break;
+
+ case LIST:
+ result = std::make_unique<ListColumnPrinter>(buffer, *type);
+ break;
+
+ case MAP:
+ result = std::make_unique<MapColumnPrinter>(buffer, *type);
+ break;
+
+ case STRUCT:
+ result = std::make_unique<StructColumnPrinter>(buffer, *type);
+ break;
+
+ case DECIMAL:
+ if (type->getPrecision() == 0 || type->getPrecision() > 18) {
+ result = std::make_unique<Decimal128ColumnPrinter>(buffer);
+ } else {
+ result = std::make_unique<Decimal64ColumnPrinter>(buffer);
+ }
+ break;
+
+ case DATE:
+ result = std::make_unique<DateColumnPrinter>(buffer);
+ break;
+
+ case UNION:
+ result = std::make_unique<UnionColumnPrinter>(buffer, *type);
+ break;
- default:
- throw std::logic_error("unknown batch type");
+ default:
+ throw std::logic_error("unknown batch type");
}
}
- return std::unique_ptr<ColumnPrinter>(result);
+ return result;
}
- VoidColumnPrinter::VoidColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer) {
+ VoidColumnPrinter::VoidColumnPrinter(std::string& _buffer) : ColumnPrinter(_buffer) {
// PASS
}
- void VoidColumnPrinter::reset(const ColumnVectorBatch&) {
+ void VoidColumnPrinter::reset(const ColumnVectorBatch&) {
// PASS
}
@@ -301,13 +305,12 @@ namespace orc {
writeString(buffer, "null");
}
- LongColumnPrinter::LongColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- data(nullptr) {
+ LongColumnPrinter::LongColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), data(nullptr) {
// PASS
}
- void LongColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ void LongColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
data = dynamic_cast<const LongVectorBatch&>(batch).data.data();
}
@@ -316,22 +319,17 @@ namespace orc {
if (hasNulls && !notNull[rowId]) {
writeString(buffer, "null");
} else {
- char numBuffer[64];
- snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d",
- static_cast<int64_t >(data[rowId]));
- writeString(buffer, numBuffer);
+ const auto numBuffer = std::to_string(static_cast<int64_t>(data[rowId]));
+ writeString(buffer, numBuffer.c_str());
}
}
- DoubleColumnPrinter::DoubleColumnPrinter(std::string& _buffer,
- const Type& type
- ): ColumnPrinter(_buffer),
- data(nullptr),
- isFloat(type.getKind() == FLOAT){
+ DoubleColumnPrinter::DoubleColumnPrinter(std::string& _buffer, const Type& type)
+ : ColumnPrinter(_buffer), data(nullptr), isFloat(type.getKind() == FLOAT) {
// PASS
}
- void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
data = dynamic_cast<const DoubleVectorBatch&>(batch).data.data();
}
@@ -341,20 +339,17 @@ namespace orc {
writeString(buffer, "null");
} else {
char numBuffer[64];
- snprintf(numBuffer, sizeof(numBuffer), isFloat ? "%.7g" : "%.14g",
- data[rowId]);
+ snprintf(numBuffer, sizeof(numBuffer), isFloat ? "%.7g" : "%.14g", data[rowId]);
writeString(buffer, numBuffer);
}
}
- Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- data(nullptr),
- scale(0) {
+ Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), data(nullptr), scale(0) {
// PASS
}
- void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
data = dynamic_cast<const Decimal64VectorBatch&>(batch).values.data();
scale = dynamic_cast<const Decimal64VectorBatch&>(batch).scale;
@@ -376,13 +371,12 @@ namespace orc {
int32_t len = static_cast<int32_t>(str.length());
if (len > scale) {
return sign + str.substr(0, static_cast<size_t>(len - scale)) + "." +
- str.substr(static_cast<size_t>(len - scale),
- static_cast<size_t>(scale));
+ str.substr(static_cast<size_t>(len - scale), static_cast<size_t>(scale));
} else if (len == scale) {
return sign + "0." + str;
} else {
std::string result = sign + "0.";
- for(int32_t i=0; i < scale - len; ++i) {
+ for (int32_t i = 0; i < scale - len; ++i) {
result += "0";
}
return result + str;
@@ -397,31 +391,27 @@ namespace orc {
}
}
- Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- data(nullptr),
- scale(0) {
- // PASS
- }
-
- void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) {
- ColumnPrinter::reset(batch);
- data = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data();
- scale = dynamic_cast<const Decimal128VectorBatch&>(batch).scale;
- }
-
- void Decimal128ColumnPrinter::printRow(uint64_t rowId) {
- if (hasNulls && !notNull[rowId]) {
- writeString(buffer, "null");
- } else {
- writeString(buffer, data[rowId].toDecimalString(scale).c_str());
- }
- }
-
- StringColumnPrinter::StringColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- start(nullptr),
- length(nullptr) {
+ Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), data(nullptr), scale(0) {
+ // PASS
+ }
+
+ void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);
+ data = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data();
+ scale = dynamic_cast<const Decimal128VectorBatch&>(batch).scale;
+ }
+
+ void Decimal128ColumnPrinter::printRow(uint64_t rowId) {
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {
+ writeString(buffer, data[rowId].toDecimalString(scale).c_str());
+ }
+ }
+
+ StringColumnPrinter::StringColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), start(nullptr), length(nullptr) {
// PASS
}
@@ -436,51 +426,48 @@ namespace orc {
writeString(buffer, "null");
} else {
writeChar(buffer, '"');
- for(int64_t i=0; i < length[rowId]; ++i) {
+ for (int64_t i = 0; i < length[rowId]; ++i) {
char ch = static_cast<char>(start[rowId][i]);
switch (ch) {
- case '\\':
- writeString(buffer, "\\\\");
- break;
- case '\b':
- writeString(buffer, "\\b");
- break;
- case '\f':
- writeString(buffer, "\\f");
- break;
- case '\n':
- writeString(buffer, "\\n");
- break;
- case '\r':
- writeString(buffer, "\\r");
- break;
- case '\t':
- writeString(buffer, "\\t");
- break;
- case '"':
- writeString(buffer, "\\\"");
- break;
- default:
- writeChar(buffer, ch);
- break;
+ case '\\':
+ writeString(buffer, "\\\\");
+ break;
+ case '\b':
+ writeString(buffer, "\\b");
+ break;
+ case '\f':
+ writeString(buffer, "\\f");
+ break;
+ case '\n':
+ writeString(buffer, "\\n");
+ break;
+ case '\r':
+ writeString(buffer, "\\r");
+ break;
+ case '\t':
+ writeString(buffer, "\\t");
+ break;
+ case '"':
+ writeString(buffer, "\\\"");
+ break;
+ default:
+ writeChar(buffer, ch);
+ break;
}
}
writeChar(buffer, '"');
}
}
- ListColumnPrinter::ListColumnPrinter(std::string& _buffer,
- const Type& type
- ): ColumnPrinter(_buffer),
- offsets(nullptr) {
+ ListColumnPrinter::ListColumnPrinter(std::string& _buffer, const Type& type)
+ : ColumnPrinter(_buffer), offsets(nullptr) {
elementPrinter = createColumnPrinter(buffer, type.getSubtype(0));
}
- void ListColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ void ListColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
offsets = dynamic_cast<const ListVectorBatch&>(batch).offsets.data();
- elementPrinter->reset(*dynamic_cast<const ListVectorBatch&>(batch).
- elements);
+ elementPrinter->reset(*dynamic_cast<const ListVectorBatch&>(batch).elements);
}
void ListColumnPrinter::printRow(uint64_t rowId) {
@@ -488,7 +475,7 @@ namespace orc {
writeString(buffer, "null");
} else {
writeChar(buffer, '[');
- for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) {
+ for (int64_t i = offsets[rowId]; i < offsets[rowId + 1]; ++i) {
if (i != offsets[rowId]) {
writeString(buffer, ", ");
}
@@ -498,15 +485,13 @@ namespace orc {
}
}
- MapColumnPrinter::MapColumnPrinter(std::string& _buffer,
- const Type& type
- ): ColumnPrinter(_buffer),
- offsets(nullptr) {
+ MapColumnPrinter::MapColumnPrinter(std::string& _buffer, const Type& type)
+ : ColumnPrinter(_buffer), offsets(nullptr) {
keyPrinter = createColumnPrinter(buffer, type.getSubtype(0));
elementPrinter = createColumnPrinter(buffer, type.getSubtype(1));
}
- void MapColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ void MapColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
const MapVectorBatch& myBatch = dynamic_cast<const MapVectorBatch&>(batch);
offsets = myBatch.offsets.data();
@@ -519,7 +504,7 @@ namespace orc {
writeString(buffer, "null");
} else {
writeChar(buffer, '[');
- for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) {
+ for (int64_t i = offsets[rowId]; i < offsets[rowId + 1]; ++i) {
if (i != offsets[rowId]) {
writeString(buffer, ", ");
}
@@ -533,23 +518,19 @@ namespace orc {
}
}
- UnionColumnPrinter::UnionColumnPrinter(std::string& _buffer,
- const Type& type
- ): ColumnPrinter(_buffer),
- tags(nullptr),
- offsets(nullptr) {
- for(unsigned int i=0; i < type.getSubtypeCount(); ++i) {
+ UnionColumnPrinter::UnionColumnPrinter(std::string& _buffer, const Type& type)
+ : ColumnPrinter(_buffer), tags(nullptr), offsets(nullptr) {
+ for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i)));
}
}
void UnionColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
- const UnionVectorBatch& unionBatch =
- dynamic_cast<const UnionVectorBatch&>(batch);
+ const UnionVectorBatch& unionBatch = dynamic_cast<const UnionVectorBatch&>(batch);
tags = unionBatch.tags.data();
offsets = unionBatch.offsets.data();
- for(size_t i=0; i < fieldPrinter.size(); ++i) {
+ for (size_t i = 0; i < fieldPrinter.size(); ++i) {
fieldPrinter[i]->reset(*(unionBatch.children[i]));
}
}
@@ -559,20 +540,17 @@ namespace orc {
writeString(buffer, "null");
} else {
writeString(buffer, "{\"tag\": ");
- char numBuffer[64];
- snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d",
- static_cast<int64_t>(tags[rowId]));
- writeString(buffer, numBuffer);
+ const auto numBuffer = std::to_string(static_cast<int64_t>(tags[rowId]));
+ writeString(buffer, numBuffer.c_str());
writeString(buffer, ", \"value\": ");
fieldPrinter[tags[rowId]]->printRow(offsets[rowId]);
writeChar(buffer, '}');
}
}
- StructColumnPrinter::StructColumnPrinter(std::string& _buffer,
- const Type& type
- ): ColumnPrinter(_buffer) {
- for(unsigned int i=0; i < type.getSubtypeCount(); ++i) {
+ StructColumnPrinter::StructColumnPrinter(std::string& _buffer, const Type& type)
+ : ColumnPrinter(_buffer) {
+ for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
fieldNames.push_back(type.getFieldName(i));
fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i)));
}
@@ -580,9 +558,8 @@ namespace orc {
void StructColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
- const StructVectorBatch& structBatch =
- dynamic_cast<const StructVectorBatch&>(batch);
- for(size_t i=0; i < fieldPrinter.size(); ++i) {
+ const StructVectorBatch& structBatch = dynamic_cast<const StructVectorBatch&>(batch);
+ for (size_t i = 0; i < fieldPrinter.size(); ++i) {
fieldPrinter[i]->reset(*(structBatch.fields[i]));
}
}
@@ -592,7 +569,7 @@ namespace orc {
writeString(buffer, "null");
} else {
writeChar(buffer, '{');
- for(unsigned int i=0; i < fieldPrinter.size(); ++i) {
+ for (unsigned int i = 0; i < fieldPrinter.size(); ++i) {
if (i != 0) {
writeString(buffer, ", ");
}
@@ -605,9 +582,8 @@ namespace orc {
}
}
- DateColumnPrinter::DateColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- data(nullptr) {
+ DateColumnPrinter::DateColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), data(nullptr) {
// PASS
}
@@ -631,9 +607,8 @@ namespace orc {
data = dynamic_cast<const LongVectorBatch&>(batch).data.data();
}
- BooleanColumnPrinter::BooleanColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- data(nullptr) {
+ BooleanColumnPrinter::BooleanColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), data(nullptr) {
// PASS
}
@@ -650,10 +625,8 @@ namespace orc {
data = dynamic_cast<const LongVectorBatch&>(batch).data.data();
}
- BinaryColumnPrinter::BinaryColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- start(nullptr),
- length(nullptr) {
+ BinaryColumnPrinter::BinaryColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), start(nullptr), length(nullptr) {
// PASS
}
@@ -662,14 +635,12 @@ namespace orc {
writeString(buffer, "null");
} else {
writeChar(buffer, '[');
- for(int64_t i=0; i < length[rowId]; ++i) {
+ for (int64_t i = 0; i < length[rowId]; ++i) {
if (i != 0) {
writeString(buffer, ", ");
}
- char numBuffer[64];
- snprintf(numBuffer, sizeof(numBuffer), "%d",
- (static_cast<const int>(start[rowId][i]) & 0xff));
- writeString(buffer, numBuffer);
+ const auto numBuffer = std::to_string(static_cast<int>(start[rowId][i]) & 0xff);
+ writeString(buffer, numBuffer.c_str());
}
writeChar(buffer, ']');
}
@@ -681,10 +652,8 @@ namespace orc {
length = dynamic_cast<const StringVectorBatch&>(batch).length.data();
}
- TimestampColumnPrinter::TimestampColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- seconds(nullptr),
- nanoseconds(nullptr) {
+ TimestampColumnPrinter::TimestampColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), seconds(nullptr), nanoseconds(nullptr) {
// PASS
}
@@ -712,20 +681,20 @@ namespace orc {
zeroDigits += 1;
}
}
- char numBuffer[64];
- snprintf(numBuffer, sizeof(numBuffer),
- "%0*" INT64_FORMAT_STRING "d\"",
- static_cast<int>(NANO_DIGITS - zeroDigits),
- static_cast<int64_t >(nanos));
- writeString(buffer, numBuffer);
+ const auto numBuffer = std::to_string(static_cast<int64_t>(nanos));
+ const int64_t padDigits = NANO_DIGITS - zeroDigits - static_cast<int64_t>(numBuffer.size());
+ for (int i = 0; i < padDigits; ++i) {
+ writeChar(buffer, '0');
+ }
+ writeString(buffer, numBuffer.c_str());
+ writeChar(buffer, '"');
}
}
void TimestampColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
- const TimestampVectorBatch& ts =
- dynamic_cast<const TimestampVectorBatch&>(batch);
+ const TimestampVectorBatch& ts = dynamic_cast<const TimestampVectorBatch&>(batch);
seconds = ts.data.data();
nanoseconds = ts.nanoseconds.data();
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/ColumnReader.cc b/contrib/libs/apache/orc/c++/src/ColumnReader.cc
index 873b54c618..a6bbdabedc 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnReader.cc
+++ b/contrib/libs/apache/orc/c++/src/ColumnReader.cc
@@ -21,8 +21,10 @@
#include "Adaptor.hh"
#include "ByteRLE.hh"
#include "ColumnReader.hh"
-#include "orc/Exceptions.hh"
+#include "ConvertColumnReader.hh"
#include "RLE.hh"
+#include "SchemaEvolution.hh"
+#include "orc/Exceptions.hh"
#include <math.h>
#include <iostream>
@@ -35,25 +37,25 @@ namespace orc {
inline RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) {
switch (static_cast<int64_t>(kind)) {
- case proto::ColumnEncoding_Kind_DIRECT:
- case proto::ColumnEncoding_Kind_DICTIONARY:
- return RleVersion_1;
- case proto::ColumnEncoding_Kind_DIRECT_V2:
- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
- return RleVersion_2;
- default:
- throw ParseError("Unknown encoding in convertRleVersion");
+ case proto::ColumnEncoding_Kind_DIRECT:
+ case proto::ColumnEncoding_Kind_DICTIONARY:
+ return RleVersion_1;
+ case proto::ColumnEncoding_Kind_DIRECT_V2:
+ case proto::ColumnEncoding_Kind_DICTIONARY_V2:
+ return RleVersion_2;
+ default:
+ throw ParseError("Unknown encoding in convertRleVersion");
}
}
- ColumnReader::ColumnReader(const Type& type,
- StripeStreams& stripe
- ): columnId(type.getColumnId()),
- memoryPool(stripe.getMemoryPool()) {
+ ColumnReader::ColumnReader(const Type& type, StripeStreams& stripe)
+ : columnId(type.getColumnId()),
+ memoryPool(stripe.getMemoryPool()),
+ metrics(stripe.getReaderMetrics()) {
std::unique_ptr<SeekableInputStream> stream =
- stripe.getStream(columnId, proto::Stream_Kind_PRESENT, true);
+ stripe.getStream(columnId, proto::Stream_Kind_PRESENT, true);
if (stream.get()) {
- notNullDecoder = createBooleanRleDecoder(std::move(stream));
+ notNullDecoder = createBooleanRleDecoder(std::move(stream), metrics);
}
}
@@ -67,17 +69,14 @@ namespace orc {
// page through the values that we want to skip
// and count how many are non-null
const size_t MAX_BUFFER_SIZE = 32768;
- size_t bufferSize = std::min(MAX_BUFFER_SIZE,
- static_cast<size_t>(numValues));
+ size_t bufferSize = std::min(MAX_BUFFER_SIZE, static_cast<size_t>(numValues));
char buffer[MAX_BUFFER_SIZE];
uint64_t remaining = numValues;
while (remaining > 0) {
- uint64_t chunkSize =
- std::min(remaining,
- static_cast<uint64_t>(bufferSize));
+ uint64_t chunkSize = std::min(remaining, static_cast<uint64_t>(bufferSize));
decoder->next(buffer, chunkSize, nullptr);
remaining -= chunkSize;
- for(uint64_t i=0; i < chunkSize; ++i) {
+ for (uint64_t i = 0; i < chunkSize; ++i) {
if (!buffer[i]) {
numValues -= 1;
}
@@ -87,9 +86,7 @@ namespace orc {
return numValues;
}
- void ColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* incomingMask) {
+ void ColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* incomingMask) {
if (numValues > rowBatch.capacity) {
rowBatch.resize(numValues);
}
@@ -99,7 +96,7 @@ namespace orc {
char* notNullArray = rowBatch.notNull.data();
decoder->next(notNullArray, numValues, incomingMask);
// check to see if there are nulls in this batch
- for(uint64_t i=0; i < numValues; ++i) {
+ for (uint64_t i = 0; i < numValues; ++i) {
if (!notNullArray[i]) {
rowBatch.hasNulls = true;
return;
@@ -114,240 +111,195 @@ namespace orc {
rowBatch.hasNulls = false;
}
- void ColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ void ColumnReader::seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) {
if (notNullDecoder.get()) {
notNullDecoder->seek(positions.at(columnId));
}
}
/**
- * Expand an array of bytes in place to the corresponding array of longs.
+ * Expand an array of bytes in place to the corresponding array of integers.
* Has to work backwards so that the data isn't clobbered during the
* expansion.
* @param buffer the array of chars and array of longs that need to be
* expanded
* @param numValues the number of bytes to convert to longs
*/
- void expandBytesToLongs(int64_t* buffer, uint64_t numValues) {
- for(size_t i=numValues - 1; i < numValues; --i) {
- buffer[i] = reinterpret_cast<char *>(buffer)[i];
+ template <typename T>
+ void expandBytesToIntegers(T* buffer, uint64_t numValues) {
+ if (sizeof(T) == sizeof(int8_t)) {
+ return;
+ }
+ for (uint64_t i = 0UL; i < numValues; ++i) {
+ buffer[numValues - 1 - i] = reinterpret_cast<int8_t*>(buffer)[numValues - 1 - i];
}
}
- class BooleanColumnReader: public ColumnReader {
- private:
+ template <typename BatchType>
+ class BooleanColumnReader : public ColumnReader {
+ private:
std::unique_ptr<orc::ByteRleDecoder> rle;
- public:
+ public:
BooleanColumnReader(const Type& type, StripeStreams& stipe);
~BooleanColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
};
- BooleanColumnReader::BooleanColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe){
+ template <typename BatchType>
+ BooleanColumnReader<BatchType>::BooleanColumnReader(const Type& type, StripeStreams& stripe)
+ : ColumnReader(type, stripe) {
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (stream == nullptr)
- throw ParseError("DATA stream not found in Boolean column");
- rle = createBooleanRleDecoder(std::move(stream));
+ if (stream == nullptr) throw ParseError("DATA stream not found in Boolean column");
+ rle = createBooleanRleDecoder(std::move(stream), metrics);
}
- BooleanColumnReader::~BooleanColumnReader() {
+ template <typename BatchType>
+ BooleanColumnReader<BatchType>::~BooleanColumnReader() {
// PASS
}
- uint64_t BooleanColumnReader::skip(uint64_t numValues) {
+ template <typename BatchType>
+ uint64_t BooleanColumnReader<BatchType>::skip(uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
rle->skip(numValues);
return numValues;
}
- void BooleanColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ template <typename BatchType>
+ void BooleanColumnReader<BatchType>::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
- // Since the byte rle places the output in a char* instead of long*,
- // we cheat here and use the long* and then expand it in a second pass.
- int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data();
- rle->next(reinterpret_cast<char*>(ptr),
- numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
- expandBytesToLongs(ptr, numValues);
+ // Since the byte rle places its output in a char* while BatchType here may be
+ // LongVectorBatch with long*, we cheat in that case: decode into the long*
+ // buffer and then expand it in a second pass.
+ auto* ptr = dynamic_cast<BatchType&>(rowBatch).data.data();
+ rle->next(reinterpret_cast<char*>(ptr), numValues,
+ rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
+ expandBytesToIntegers(ptr, numValues);
}
- void BooleanColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ template <typename BatchType>
+ void BooleanColumnReader<BatchType>::seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
rle->seek(positions.at(columnId));
}
- class ByteColumnReader: public ColumnReader {
- private:
+ template <typename BatchType>
+ class ByteColumnReader : public ColumnReader {
+ private:
std::unique_ptr<orc::ByteRleDecoder> rle;
- public:
- ByteColumnReader(const Type& type, StripeStreams& stipe);
- ~ByteColumnReader() override;
-
- uint64_t skip(uint64_t numValues) override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
-
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
- };
-
- ByteColumnReader::ByteColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe){
- std::unique_ptr<SeekableInputStream> stream =
- stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (stream == nullptr)
- throw ParseError("DATA stream not found in Byte column");
- rle = createByteRleDecoder(std::move(stream));
- }
+ public:
+ ByteColumnReader(const Type& type, StripeStreams& stripe) : ColumnReader(type, stripe) {
+ std::unique_ptr<SeekableInputStream> stream =
+ stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+ if (stream == nullptr) throw ParseError("DATA stream not found in Byte column");
+ rle = createByteRleDecoder(std::move(stream), metrics);
+ }
- ByteColumnReader::~ByteColumnReader() {
- // PASS
- }
+ ~ByteColumnReader() override = default;
- uint64_t ByteColumnReader::skip(uint64_t numValues) {
- numValues = ColumnReader::skip(numValues);
- rle->skip(numValues);
- return numValues;
- }
+ uint64_t skip(uint64_t numValues) override {
+ numValues = ColumnReader::skip(numValues);
+ rle->skip(numValues);
+ return numValues;
+ }
- void ByteColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- // Since the byte rle places the output in a char* instead of long*,
- // we cheat here and use the long* and then expand it in a second pass.
- int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data();
- rle->next(reinterpret_cast<char*>(ptr),
- numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
- expandBytesToLongs(ptr, numValues);
- }
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ // Since the byte rle places the output in a char* instead of long*,
+ // we cheat here and use the long* and then expand it in a second pass.
+ auto* ptr = dynamic_cast<BatchType&>(rowBatch).data.data();
+ rle->next(reinterpret_cast<char*>(ptr), numValues,
+ rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
+ expandBytesToIntegers(ptr, numValues);
+ }
- void ByteColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
- ColumnReader::seekToRowGroup(positions);
- rle->seek(positions.at(columnId));
- }
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override {
+ ColumnReader::seekToRowGroup(positions);
+ rle->seek(positions.at(columnId));
+ }
+ };
- class IntegerColumnReader: public ColumnReader {
- protected:
+ template <typename BatchType>
+ class IntegerColumnReader : public ColumnReader {
+ protected:
std::unique_ptr<orc::RleDecoder> rle;
- public:
- IntegerColumnReader(const Type& type, StripeStreams& stripe);
- ~IntegerColumnReader() override;
-
- uint64_t skip(uint64_t numValues) override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
-
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
- };
-
- IntegerColumnReader::IntegerColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
- RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
- std::unique_ptr<SeekableInputStream> stream =
- stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (stream == nullptr)
- throw ParseError("DATA stream not found in Integer column");
- rle = createRleDecoder(std::move(stream), true, vers, memoryPool);
- }
+ public:
+ IntegerColumnReader(const Type& type, StripeStreams& stripe) : ColumnReader(type, stripe) {
+ RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
+ std::unique_ptr<SeekableInputStream> stream =
+ stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+ if (stream == nullptr) throw ParseError("DATA stream not found in Integer column");
+ rle = createRleDecoder(std::move(stream), true, vers, memoryPool, metrics);
+ }
- IntegerColumnReader::~IntegerColumnReader() {
- // PASS
- }
+ ~IntegerColumnReader() override {
+ // PASS
+ }
- uint64_t IntegerColumnReader::skip(uint64_t numValues) {
- numValues = ColumnReader::skip(numValues);
- rle->skip(numValues);
- return numValues;
- }
+ uint64_t skip(uint64_t numValues) override {
+ numValues = ColumnReader::skip(numValues);
+ rle->skip(numValues);
+ return numValues;
+ }
- void IntegerColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- rle->next(dynamic_cast<LongVectorBatch&>(rowBatch).data.data(),
- numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
- }
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ rle->next(dynamic_cast<BatchType&>(rowBatch).data.data(), numValues,
+ rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
+ }
- void IntegerColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
- ColumnReader::seekToRowGroup(positions);
- rle->seek(positions.at(columnId));
- }
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override {
+ ColumnReader::seekToRowGroup(positions);
+ rle->seek(positions.at(columnId));
+ }
+ };
- class TimestampColumnReader: public ColumnReader {
- private:
+ class TimestampColumnReader : public ColumnReader {
+ private:
std::unique_ptr<orc::RleDecoder> secondsRle;
std::unique_ptr<orc::RleDecoder> nanoRle;
- const Timezone& writerTimezone;
- const Timezone& readerTimezone;
+ const Timezone* writerTimezone;
+ const Timezone* readerTimezone;
const int64_t epochOffset;
const bool sameTimezone;
- public:
- TimestampColumnReader(const Type& type,
- StripeStreams& stripe,
- bool isInstantType);
+ public:
+ TimestampColumnReader(const Type& type, StripeStreams& stripe, bool isInstantType);
~TimestampColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
};
-
- TimestampColumnReader::TimestampColumnReader(const Type& type,
- StripeStreams& stripe,
- bool isInstantType
- ): ColumnReader(type, stripe),
- writerTimezone(isInstantType ?
- getTimezoneByName("GMT") :
- stripe.getWriterTimezone()),
- readerTimezone(isInstantType ?
- getTimezoneByName("GMT") :
- stripe.getReaderTimezone()),
- epochOffset(writerTimezone.getEpoch()),
- sameTimezone(&writerTimezone == &readerTimezone){
+ TimestampColumnReader::TimestampColumnReader(const Type& type, StripeStreams& stripe,
+ bool isInstantType)
+ : ColumnReader(type, stripe),
+ writerTimezone(isInstantType ? &getTimezoneByName("GMT") : &stripe.getWriterTimezone()),
+ readerTimezone(isInstantType ? &getTimezoneByName("GMT") : &stripe.getReaderTimezone()),
+ epochOffset(writerTimezone->getEpoch()),
+ sameTimezone(writerTimezone == readerTimezone) {
RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (stream == nullptr)
- throw ParseError("DATA stream not found in Timestamp column");
- secondsRle = createRleDecoder(std::move(stream), true, vers, memoryPool);
+ if (stream == nullptr) throw ParseError("DATA stream not found in Timestamp column");
+ secondsRle = createRleDecoder(std::move(stream), true, vers, memoryPool, metrics);
stream = stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true);
- if (stream == nullptr)
- throw ParseError("SECONDARY stream not found in Timestamp column");
- nanoRle = createRleDecoder(std::move(stream), false, vers, memoryPool);
+ if (stream == nullptr) throw ParseError("SECONDARY stream not found in Timestamp column");
+ nanoRle = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics);
}
TimestampColumnReader::~TimestampColumnReader() {
@@ -361,25 +313,22 @@ namespace orc {
return numValues;
}
- void TimestampColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void TimestampColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- TimestampVectorBatch& timestampBatch =
- dynamic_cast<TimestampVectorBatch&>(rowBatch);
- int64_t *secsBuffer = timestampBatch.data.data();
+ TimestampVectorBatch& timestampBatch = dynamic_cast<TimestampVectorBatch&>(rowBatch);
+ int64_t* secsBuffer = timestampBatch.data.data();
secondsRle->next(secsBuffer, numValues, notNull);
- int64_t *nanoBuffer = timestampBatch.nanoseconds.data();
+ int64_t* nanoBuffer = timestampBatch.nanoseconds.data();
nanoRle->next(nanoBuffer, numValues, notNull);
// Construct the values
- for(uint64_t i=0; i < numValues; i++) {
+ for (uint64_t i = 0; i < numValues; i++) {
if (notNull == nullptr || notNull[i]) {
uint64_t zeros = nanoBuffer[i] & 0x7;
nanoBuffer[i] >>= 3;
if (zeros != 0) {
- for(uint64_t j = 0; j <= zeros; ++j) {
+ for (uint64_t j = 0; j <= zeros; ++j) {
nanoBuffer[i] *= 10;
}
}
@@ -387,13 +336,13 @@ namespace orc {
if (!sameTimezone) {
// adjust timestamp value to same wall clock time if writer and reader
// time zones have different rules, which is required for Apache Orc.
- const auto& wv = writerTimezone.getVariant(writerTime);
- const auto& rv = readerTimezone.getVariant(writerTime);
+ const auto& wv = writerTimezone->getVariant(writerTime);
+ const auto& rv = readerTimezone->getVariant(writerTime);
if (!wv.hasSameTzRule(rv)) {
// If the timezone adjustment moves the millis across a DST boundary,
// we need to reevaluate the offsets.
int64_t adjustedTime = writerTime + wv.gmtOffset - rv.gmtOffset;
- const auto& adjustedReader = readerTimezone.getVariant(adjustedTime);
+ const auto& adjustedReader = readerTimezone->getVariant(adjustedTime);
writerTime = writerTime + wv.gmtOffset - adjustedReader.gmtOffset;
}
}
@@ -406,38 +355,34 @@ namespace orc {
}
void TimestampColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
secondsRle->seek(positions.at(columnId));
nanoRle->seek(positions.at(columnId));
}
- template<TypeKind columnKind, bool isLittleEndian>
- class DoubleColumnReader: public ColumnReader {
- public:
+ template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType>
+ class DoubleColumnReader : public ColumnReader {
+ public:
DoubleColumnReader(const Type& type, StripeStreams& stripe);
~DoubleColumnReader() override {}
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
- private:
+ private:
std::unique_ptr<SeekableInputStream> inputStream;
const uint64_t bytesPerValue = (columnKind == FLOAT) ? 4 : 8;
- const char *bufferPointer;
- const char *bufferEnd;
+ const char* bufferPointer;
+ const char* bufferEnd;
unsigned char readByte() {
if (bufferPointer == bufferEnd) {
int length;
- if (!inputStream->Next
- (reinterpret_cast<const void**>(&bufferPointer), &length)) {
+ if (!inputStream->Next(reinterpret_cast<const void**>(&bufferPointer), &length)) {
throw ParseError("bad read in DoubleColumnReader::next()");
}
bufferEnd = bufferPointer + length;
@@ -445,7 +390,8 @@ namespace orc {
return static_cast<unsigned char>(*(bufferPointer++));
}
- double readDouble() {
+ template <typename FloatType>
+ FloatType readDouble() {
int64_t bits = 0;
if (bufferEnd - bufferPointer >= 8) {
if (isLittleEndian) {
@@ -466,11 +412,12 @@ namespace orc {
bits |= static_cast<int64_t>(readByte()) << (i * 8);
}
}
- double *result = reinterpret_cast<double*>(&bits);
+ FloatType* result = reinterpret_cast<FloatType*>(&bits);
return *result;
}
- double readFloat() {
+ template <typename FloatType>
+ FloatType readFloat() {
int32_t bits = 0;
if (bufferEnd - bufferPointer >= 4) {
if (isLittleEndian) {
@@ -487,33 +434,32 @@ namespace orc {
bits |= readByte() << (i * 8);
}
}
- float *result = reinterpret_cast<float*>(&bits);
- return static_cast<double>(*result);
+ float* result = reinterpret_cast<float*>(&bits);
+ if (!result) {
+ std::cerr << "read float empty." << std::endl;
+ }
+ return static_cast<FloatType>(*result);
}
};
- template<TypeKind columnKind, bool isLittleEndian>
- DoubleColumnReader<columnKind, isLittleEndian>::DoubleColumnReader(
- const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe),
- bufferPointer(nullptr),
- bufferEnd(nullptr) {
+ template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType>
+ DoubleColumnReader<columnKind, isLittleEndian, ValueType, BatchType>::DoubleColumnReader(
+ const Type& type, StripeStreams& stripe)
+ : ColumnReader(type, stripe), bufferPointer(nullptr), bufferEnd(nullptr) {
inputStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (inputStream == nullptr)
- throw ParseError("DATA stream not found in Double column");
+ if (inputStream == nullptr) throw ParseError("DATA stream not found in Double column");
}
- template<TypeKind columnKind, bool isLittleEndian>
- uint64_t DoubleColumnReader<columnKind, isLittleEndian>::skip(uint64_t numValues) {
+ template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType>
+ uint64_t DoubleColumnReader<columnKind, isLittleEndian, ValueType, BatchType>::skip(
+ uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
- if (static_cast<size_t>(bufferEnd - bufferPointer) >=
- bytesPerValue * numValues) {
+ if (static_cast<size_t>(bufferEnd - bufferPointer) >= bytesPerValue * numValues) {
bufferPointer += bytesPerValue * numValues;
} else {
- size_t sizeToSkip = bytesPerValue * numValues -
- static_cast<size_t>(bufferEnd - bufferPointer);
+ size_t sizeToSkip =
+ bytesPerValue * numValues - static_cast<size_t>(bufferEnd - bufferPointer);
const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max());
while (sizeToSkip != 0) {
size_t step = sizeToSkip > cap ? cap : sizeToSkip;
@@ -527,33 +473,32 @@ namespace orc {
return numValues;
}
- template<TypeKind columnKind, bool isLittleEndian>
- void DoubleColumnReader<columnKind, isLittleEndian>::next(
- ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType>
+ void DoubleColumnReader<columnKind, isLittleEndian, ValueType, BatchType>::next(
+ ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
// update the notNull from the parent class
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- double* outArray = dynamic_cast<DoubleVectorBatch&>(rowBatch).data.data();
+ ValueType* outArray =
+ reinterpret_cast<ValueType*>(dynamic_cast<BatchType&>(rowBatch).data.data());
- if (columnKind == FLOAT) {
+ if constexpr (columnKind == FLOAT) {
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
- outArray[i] = readFloat();
+ outArray[i] = readFloat<ValueType>();
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
- outArray[i] = readFloat();
+ for (size_t i = 0; i < numValues; ++i) {
+ outArray[i] = readFloat<ValueType>();
}
}
} else {
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
- outArray[i] = readDouble();
+ outArray[i] = readDouble<ValueType>();
}
}
} else {
@@ -561,25 +506,23 @@ namespace orc {
// Only viable when the machine is little-endian.
uint64_t bufferNum = 0;
if (isLittleEndian) {
- bufferNum = std::min(numValues,
- static_cast<size_t>(bufferEnd - bufferPointer) / bytesPerValue);
+ bufferNum =
+ std::min(numValues, static_cast<size_t>(bufferEnd - bufferPointer) / bytesPerValue);
uint64_t bufferBytes = bufferNum * bytesPerValue;
- if (bufferPointer && bufferBytes) {
+ if (bufferBytes > 0) {
memcpy(outArray, bufferPointer, bufferBytes);
bufferPointer += bufferBytes;
- } else {
- bufferNum = 0;
}
}
for (size_t i = bufferNum; i < numValues; ++i) {
- outArray[i] = readDouble();
+ outArray[i] = readDouble<ValueType>();
}
}
}
}
- template<TypeKind columnKind, bool isLittleEndian>
- void DoubleColumnReader<columnKind, isLittleEndian>::seekToRowGroup(
+ template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType>
+ void DoubleColumnReader<columnKind, isLittleEndian, ValueType, BatchType>::seekToRowGroup(
std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
inputStream->seek(positions.at(columnId));
@@ -604,54 +547,46 @@ namespace orc {
}
}
- class StringDictionaryColumnReader: public ColumnReader {
- private:
+ class StringDictionaryColumnReader : public ColumnReader {
+ private:
std::shared_ptr<StringDictionary> dictionary;
std::unique_ptr<RleDecoder> rle;
- public:
+ public:
StringDictionaryColumnReader(const Type& type, StripeStreams& stipe);
~StringDictionaryColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
+ void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
};
- StringDictionaryColumnReader::StringDictionaryColumnReader
- (const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe),
- dictionary(new StringDictionary(stripe.getMemoryPool())) {
- RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId)
- .kind());
- uint32_t dictSize = stripe.getEncoding(columnId).dictionarysize();
+ StringDictionaryColumnReader::StringDictionaryColumnReader(const Type& type,
+ StripeStreams& stripe)
+ : ColumnReader(type, stripe), dictionary(new StringDictionary(stripe.getMemoryPool())) {
+ RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId).kind());
+ uint32_t dictSize = stripe.getEncoding(columnId).dictionary_size();
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
if (stream == nullptr) {
throw ParseError("DATA stream not found in StringDictionaryColumn");
}
- rle = createRleDecoder(std::move(stream), false, rleVersion, memoryPool);
+ rle = createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics);
stream = stripe.getStream(columnId, proto::Stream_Kind_LENGTH, false);
if (dictSize > 0 && stream == nullptr) {
throw ParseError("LENGTH stream not found in StringDictionaryColumn");
}
std::unique_ptr<RleDecoder> lengthDecoder =
- createRleDecoder(std::move(stream), false, rleVersion, memoryPool);
+ createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics);
dictionary->dictionaryOffset.resize(dictSize + 1);
int64_t* lengthArray = dictionary->dictionaryOffset.data();
lengthDecoder->next(lengthArray + 1, dictSize, nullptr);
lengthArray[0] = 0;
- for(uint32_t i = 1; i < dictSize + 1; ++i) {
+ for (uint32_t i = 1; i < dictSize + 1; ++i) {
if (lengthArray[i] < 0) {
throw ParseError("Negative dictionary entry length");
}
@@ -660,10 +595,9 @@ namespace orc {
int64_t blobSize = lengthArray[dictSize];
dictionary->dictionaryBlob.resize(static_cast<uint64_t>(blobSize));
std::unique_ptr<SeekableInputStream> blobStream =
- stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false);
+ stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false);
if (blobSize > 0 && blobStream == nullptr) {
- throw ParseError(
- "DICTIONARY_DATA stream not found in StringDictionaryColumn");
+ throw ParseError("DICTIONARY_DATA stream not found in StringDictionaryColumn");
}
readFully(dictionary->dictionaryBlob.data(), blobSize, blobStream.get());
}
@@ -678,47 +612,43 @@ namespace orc {
return numValues;
}
- void StringDictionaryColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void StringDictionaryColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
// update the notNull from the parent class
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch);
- char *blob = dictionary->dictionaryBlob.data();
- int64_t *dictionaryOffsets = dictionary->dictionaryOffset.data();
- char **outputStarts = byteBatch.data.data();
- int64_t *outputLengths = byteBatch.length.data();
+ char* blob = dictionary->dictionaryBlob.data();
+ int64_t* dictionaryOffsets = dictionary->dictionaryOffset.data();
+ char** outputStarts = byteBatch.data.data();
+ int64_t* outputLengths = byteBatch.length.data();
rle->next(outputLengths, numValues, notNull);
uint64_t dictionaryCount = dictionary->dictionaryOffset.size() - 1;
if (notNull) {
- for(uint64_t i=0; i < numValues; ++i) {
+ for (uint64_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
int64_t entry = outputLengths[i];
- if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount ) {
+ if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount) {
throw ParseError("Entry index out of range in StringDictionaryColumn");
}
outputStarts[i] = blob + dictionaryOffsets[entry];
- outputLengths[i] = dictionaryOffsets[entry+1] -
- dictionaryOffsets[entry];
+ outputLengths[i] = dictionaryOffsets[entry + 1] - dictionaryOffsets[entry];
}
}
} else {
- for(uint64_t i=0; i < numValues; ++i) {
+ for (uint64_t i = 0; i < numValues; ++i) {
int64_t entry = outputLengths[i];
if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount) {
throw ParseError("Entry index out of range in StringDictionaryColumn");
}
outputStarts[i] = blob + dictionaryOffsets[entry];
- outputLengths[i] = dictionaryOffsets[entry+1] -
- dictionaryOffsets[entry];
+ outputLengths[i] = dictionaryOffsets[entry + 1] - dictionaryOffsets[entry];
}
}
}
- void StringDictionaryColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) {
+ void StringDictionaryColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
rowBatch.isEncoded = true;
@@ -731,17 +661,16 @@ namespace orc {
}
void StringDictionaryColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
rle->seek(positions.at(columnId));
}
-
- class StringDirectColumnReader: public ColumnReader {
- private:
+ class StringDirectColumnReader : public ColumnReader {
+ private:
std::unique_ptr<RleDecoder> lengthRle;
std::unique_ptr<SeekableInputStream> blobStream;
- const char *lastBuffer;
+ const char* lastBuffer;
size_t lastBufferLength;
/**
@@ -751,38 +680,28 @@ namespace orc {
* @param numValues the lengths of the arrays
* @return the total number of bytes for the non-null values
*/
- size_t computeSize(const int64_t *lengths, const char *notNull,
- uint64_t numValues);
+ size_t computeSize(const int64_t* lengths, const char* notNull, uint64_t numValues);
- public:
+ public:
StringDirectColumnReader(const Type& type, StripeStreams& stipe);
~StringDirectColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
};
- StringDirectColumnReader::StringDirectColumnReader
- (const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
- RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId)
- .kind());
+ StringDirectColumnReader::StringDirectColumnReader(const Type& type, StripeStreams& stripe)
+ : ColumnReader(type, stripe) {
+ RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId).kind());
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
- if (stream == nullptr)
- throw ParseError("LENGTH stream not found in StringDirectColumn");
- lengthRle = createRleDecoder(
- std::move(stream), false, rleVersion, memoryPool);
+ if (stream == nullptr) throw ParseError("LENGTH stream not found in StringDirectColumn");
+ lengthRle = createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics);
blobStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (blobStream == nullptr)
- throw ParseError("DATA stream not found in StringDirectColumn");
+ if (blobStream == nullptr) throw ParseError("DATA stream not found in StringDirectColumn");
lastBuffer = nullptr;
lastBufferLength = 0;
}
@@ -799,8 +718,7 @@ namespace orc {
size_t totalBytes = 0;
// read the lengths, so we know haw many bytes to skip
while (done < numValues) {
- uint64_t step = std::min(BUFFER_SIZE,
- static_cast<size_t>(numValues - done));
+ uint64_t step = std::min(BUFFER_SIZE, static_cast<size_t>(numValues - done));
lengthRle->next(buffer, step, nullptr);
totalBytes += computeSize(buffer, nullptr, step);
done += step;
@@ -824,33 +742,31 @@ namespace orc {
return numValues;
}
- size_t StringDirectColumnReader::computeSize(const int64_t* lengths,
- const char* notNull,
+ size_t StringDirectColumnReader::computeSize(const int64_t* lengths, const char* notNull,
uint64_t numValues) {
size_t totalLength = 0;
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
totalLength += static_cast<size_t>(lengths[i]);
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
totalLength += static_cast<size_t>(lengths[i]);
}
}
return totalLength;
}
- void StringDirectColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void StringDirectColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
// update the notNull from the parent class
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch);
- char **startPtr = byteBatch.data.data();
- int64_t *lengthPtr = byteBatch.length.data();
+ char** startPtr = byteBatch.data.data();
+ int64_t* lengthPtr = byteBatch.length.data();
// read the length vector
lengthRle->next(lengthPtr, numValues, notNull);
@@ -862,7 +778,7 @@ namespace orc {
// to get the rest directly out of the stream's buffer.
size_t bytesBuffered = 0;
byteBatch.blob.resize(totalLength);
- char *ptr= byteBatch.blob.data();
+ char* ptr = byteBatch.blob.data();
while (bytesBuffered + lastBufferLength < totalLength) {
memcpy(ptr + bytesBuffered, lastBuffer, lastBufferLength);
bytesBuffered += lastBufferLength;
@@ -902,7 +818,7 @@ namespace orc {
}
void StringDirectColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
blobStream->seek(positions.at(columnId));
lengthRle->seek(positions.at(columnId));
@@ -911,145 +827,130 @@ namespace orc {
lastBufferLength = 0;
}
- class StructColumnReader: public ColumnReader {
- private:
+ class StructColumnReader : public ColumnReader {
+ private:
std::vector<std::unique_ptr<ColumnReader>> children;
- public:
- StructColumnReader(const Type& type, StripeStreams& stipe);
+ public:
+ StructColumnReader(const Type& type, StripeStreams& stripe, bool useTightNumericVector = false,
+ bool throwOnSchemaEvolutionOverflow = false);
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
- private:
- template<bool encoded>
- void nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull);
+ private:
+ template <bool encoded>
+ void nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull);
};
- StructColumnReader::StructColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
+ StructColumnReader::StructColumnReader(const Type& type, StripeStreams& stripe,
+ bool useTightNumericVector,
+ bool throwOnSchemaEvolutionOverflow)
+ : ColumnReader(type, stripe) {
// count the number of selected sub-columns
const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
switch (static_cast<int64_t>(stripe.getEncoding(columnId).kind())) {
- case proto::ColumnEncoding_Kind_DIRECT:
- for(unsigned int i=0; i < type.getSubtypeCount(); ++i) {
- const Type& child = *type.getSubtype(i);
- if (selectedColumns[static_cast<uint64_t>(child.getColumnId())]) {
- children.push_back(buildReader(child, stripe));
+ case proto::ColumnEncoding_Kind_DIRECT:
+ for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
+ const Type& child = *type.getSubtype(i);
+ if (selectedColumns[static_cast<uint64_t>(child.getColumnId())]) {
+ children.push_back(
+ buildReader(child, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow));
+ }
}
- }
- break;
- case proto::ColumnEncoding_Kind_DIRECT_V2:
- case proto::ColumnEncoding_Kind_DICTIONARY:
- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
- default:
- throw ParseError("Unknown encoding for StructColumnReader");
+ break;
+ case proto::ColumnEncoding_Kind_DIRECT_V2:
+ case proto::ColumnEncoding_Kind_DICTIONARY:
+ case proto::ColumnEncoding_Kind_DICTIONARY_V2:
+ default:
+ throw ParseError("Unknown encoding for StructColumnReader");
}
}
uint64_t StructColumnReader::skip(uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
- for(auto& ptr : children) {
+ for (auto& ptr : children) {
ptr->skip(numValues);
}
return numValues;
}
- void StructColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void StructColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
nextInternal<false>(rowBatch, numValues, notNull);
}
- void StructColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void StructColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
nextInternal<true>(rowBatch, numValues, notNull);
}
- template<bool encoded>
- void StructColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ template <bool encoded>
+ void StructColumnReader::nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
- uint64_t i=0;
- notNull = rowBatch.hasNulls? rowBatch.notNull.data() : nullptr;
- for(auto iter = children.begin(); iter != children.end(); ++iter, ++i) {
+ uint64_t i = 0;
+ notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
+ for (auto iter = children.begin(); iter != children.end(); ++iter, ++i) {
if (encoded) {
- (*iter)->nextEncoded(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]),
- numValues, notNull);
+ (*iter)->nextEncoded(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), numValues,
+ notNull);
} else {
- (*iter)->next(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]),
- numValues, notNull);
+ (*iter)->next(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), numValues, notNull);
}
}
}
void StructColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
- for(auto& ptr : children) {
+ for (auto& ptr : children) {
ptr->seekToRowGroup(positions);
}
}
- class ListColumnReader: public ColumnReader {
- private:
+ class ListColumnReader : public ColumnReader {
+ private:
std::unique_ptr<ColumnReader> child;
std::unique_ptr<RleDecoder> rle;
- public:
- ListColumnReader(const Type& type, StripeStreams& stipe);
+ public:
+ ListColumnReader(const Type& type, StripeStreams& stipe, bool useTightNumericVector = false,
+ bool throwOnSchemaEvolutionOverflow = false);
~ListColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
- private:
- template<bool encoded>
- void nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull);
+ private:
+ template <bool encoded>
+ void nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull);
};
- ListColumnReader::ListColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
+ ListColumnReader::ListColumnReader(const Type& type, StripeStreams& stripe,
+ bool useTightNumericVector,
+ bool throwOnSchemaEvolutionOverflow)
+ : ColumnReader(type, stripe) {
// count the number of selected sub-columns
const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
- if (stream == nullptr)
- throw ParseError("LENGTH stream not found in List column");
- rle = createRleDecoder(std::move(stream), false, vers, memoryPool);
+ if (stream == nullptr) throw ParseError("LENGTH stream not found in List column");
+ rle = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics);
const Type& childType = *type.getSubtype(0);
if (selectedColumns[static_cast<uint64_t>(childType.getColumnId())]) {
- child = buildReader(childType, stripe);
+ child = buildReader(childType, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow);
}
}
@@ -1059,7 +960,7 @@ namespace orc {
uint64_t ListColumnReader::skip(uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
- ColumnReader *childReader = child.get();
+ ColumnReader* childReader = child.get();
if (childReader) {
const uint64_t BUFFER_SIZE = 1024;
int64_t buffer[BUFFER_SIZE];
@@ -1068,7 +969,7 @@ namespace orc {
while (lengthsRead < numValues) {
uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
rle->next(buffer, chunk, nullptr);
- for(size_t i=0; i < chunk; ++i) {
+ for (size_t i = 0; i < chunk; ++i) {
childrenElements += static_cast<size_t>(buffer[i]);
}
lengthsRead += chunk;
@@ -1080,30 +981,26 @@ namespace orc {
return numValues;
}
- void ListColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void ListColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
nextInternal<false>(rowBatch, numValues, notNull);
}
- void ListColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void ListColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
nextInternal<true>(rowBatch, numValues, notNull);
}
- template<bool encoded>
- void ListColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ template <bool encoded>
+ void ListColumnReader::nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
- ListVectorBatch &listBatch = dynamic_cast<ListVectorBatch&>(rowBatch);
+ ListVectorBatch& listBatch = dynamic_cast<ListVectorBatch&>(rowBatch);
int64_t* offsets = listBatch.offsets.data();
notNull = listBatch.hasNulls ? listBatch.notNull.data() : nullptr;
rle->next(offsets, numValues, notNull);
uint64_t totalChildren = 0;
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
uint64_t tmp = static_cast<uint64_t>(offsets[i]);
offsets[i] = static_cast<int64_t>(totalChildren);
@@ -1113,14 +1010,14 @@ namespace orc {
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
uint64_t tmp = static_cast<uint64_t>(offsets[i]);
offsets[i] = static_cast<int64_t>(totalChildren);
totalChildren += tmp;
}
}
offsets[numValues] = static_cast<int64_t>(totalChildren);
- ColumnReader *childReader = child.get();
+ ColumnReader* childReader = child.get();
if (childReader) {
if (encoded) {
childReader->nextEncoded(*(listBatch.elements.get()), totalChildren, nullptr);
@@ -1130,8 +1027,7 @@ namespace orc {
}
}
- void ListColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ void ListColumnReader::seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
rle->seek(positions.at(columnId));
if (child.get()) {
@@ -1139,54 +1035,49 @@ namespace orc {
}
}
- class MapColumnReader: public ColumnReader {
- private:
+ class MapColumnReader : public ColumnReader {
+ private:
std::unique_ptr<ColumnReader> keyReader;
std::unique_ptr<ColumnReader> elementReader;
std::unique_ptr<RleDecoder> rle;
- public:
- MapColumnReader(const Type& type, StripeStreams& stipe);
+ public:
+ MapColumnReader(const Type& type, StripeStreams& stipe, bool useTightNumericVector = false,
+ bool throwOnSchemaEvolutionOverflow = false);
~MapColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
- private:
- template<bool encoded>
- void nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull);
+ private:
+ template <bool encoded>
+ void nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull);
};
- MapColumnReader::MapColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
+ MapColumnReader::MapColumnReader(const Type& type, StripeStreams& stripe,
+ bool useTightNumericVector, bool throwOnSchemaEvolutionOverflow)
+ : ColumnReader(type, stripe) {
// Determine if the key and/or value columns are selected
const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
- if (stream == nullptr)
- throw ParseError("LENGTH stream not found in Map column");
- rle = createRleDecoder(std::move(stream), false, vers, memoryPool);
+ if (stream == nullptr) throw ParseError("LENGTH stream not found in Map column");
+ rle = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics);
const Type& keyType = *type.getSubtype(0);
if (selectedColumns[static_cast<uint64_t>(keyType.getColumnId())]) {
- keyReader = buildReader(keyType, stripe);
+ keyReader =
+ buildReader(keyType, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow);
}
const Type& elementType = *type.getSubtype(1);
if (selectedColumns[static_cast<uint64_t>(elementType.getColumnId())]) {
- elementReader = buildReader(elementType, stripe);
+ elementReader =
+ buildReader(elementType, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow);
}
}
@@ -1196,8 +1087,8 @@ namespace orc {
uint64_t MapColumnReader::skip(uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
- ColumnReader *rawKeyReader = keyReader.get();
- ColumnReader *rawElementReader = elementReader.get();
+ ColumnReader* rawKeyReader = keyReader.get();
+ ColumnReader* rawElementReader = elementReader.get();
if (rawKeyReader || rawElementReader) {
const uint64_t BUFFER_SIZE = 1024;
int64_t buffer[BUFFER_SIZE];
@@ -1206,7 +1097,7 @@ namespace orc {
while (lengthsRead < numValues) {
uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
rle->next(buffer, chunk, nullptr);
- for(size_t i=0; i < chunk; ++i) {
+ for (size_t i = 0; i < chunk; ++i) {
childrenElements += static_cast<size_t>(buffer[i]);
}
lengthsRead += chunk;
@@ -1223,32 +1114,26 @@ namespace orc {
return numValues;
}
- void MapColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull)
- {
+ void MapColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
nextInternal<false>(rowBatch, numValues, notNull);
}
- void MapColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull)
- {
+ void MapColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
nextInternal<true>(rowBatch, numValues, notNull);
}
- template<bool encoded>
- void MapColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ template <bool encoded>
+ void MapColumnReader::nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
- MapVectorBatch &mapBatch = dynamic_cast<MapVectorBatch&>(rowBatch);
+ MapVectorBatch& mapBatch = dynamic_cast<MapVectorBatch&>(rowBatch);
int64_t* offsets = mapBatch.offsets.data();
notNull = mapBatch.hasNulls ? mapBatch.notNull.data() : nullptr;
rle->next(offsets, numValues, notNull);
uint64_t totalChildren = 0;
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
uint64_t tmp = static_cast<uint64_t>(offsets[i]);
offsets[i] = static_cast<int64_t>(totalChildren);
@@ -1258,14 +1143,14 @@ namespace orc {
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
uint64_t tmp = static_cast<uint64_t>(offsets[i]);
offsets[i] = static_cast<int64_t>(totalChildren);
totalChildren += tmp;
}
}
offsets[numValues] = static_cast<int64_t>(totalChildren);
- ColumnReader *rawKeyReader = keyReader.get();
+ ColumnReader* rawKeyReader = keyReader.get();
if (rawKeyReader) {
if (encoded) {
rawKeyReader->nextEncoded(*(mapBatch.keys.get()), totalChildren, nullptr);
@@ -1273,7 +1158,7 @@ namespace orc {
rawKeyReader->next(*(mapBatch.keys.get()), totalChildren, nullptr);
}
}
- ColumnReader *rawElementReader = elementReader.get();
+ ColumnReader* rawElementReader = elementReader.get();
if (rawElementReader) {
if (encoded) {
rawElementReader->nextEncoded(*(mapBatch.elements.get()), totalChildren, nullptr);
@@ -1283,8 +1168,7 @@ namespace orc {
}
}
- void MapColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ void MapColumnReader::seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
rle->seek(positions.at(columnId));
if (keyReader.get()) {
@@ -1295,54 +1179,49 @@ namespace orc {
}
}
- class UnionColumnReader: public ColumnReader {
- private:
+ class UnionColumnReader : public ColumnReader {
+ private:
std::unique_ptr<ByteRleDecoder> rle;
std::vector<std::unique_ptr<ColumnReader>> childrenReader;
std::vector<int64_t> childrenCounts;
uint64_t numChildren;
- public:
- UnionColumnReader(const Type& type, StripeStreams& stipe);
+ public:
+ UnionColumnReader(const Type& type, StripeStreams& stipe, bool useTightNumericVector = false,
+ bool throwOnSchemaEvolutionOverflow = false);
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
- private:
- template<bool encoded>
- void nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull);
+ private:
+ template <bool encoded>
+ void nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull);
};
- UnionColumnReader::UnionColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
+ UnionColumnReader::UnionColumnReader(const Type& type, StripeStreams& stripe,
+ bool useTightNumericVector,
+ bool throwOnSchemaEvolutionOverflow)
+ : ColumnReader(type, stripe) {
numChildren = type.getSubtypeCount();
childrenReader.resize(numChildren);
childrenCounts.resize(numChildren);
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (stream == nullptr)
- throw ParseError("LENGTH stream not found in Union column");
- rle = createByteRleDecoder(std::move(stream));
+ if (stream == nullptr) throw ParseError("LENGTH stream not found in Union column");
+ rle = createByteRleDecoder(std::move(stream), metrics);
// figure out which types are selected
const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
- for(unsigned int i=0; i < numChildren; ++i) {
- const Type &child = *type.getSubtype(i);
+ for (unsigned int i = 0; i < numChildren; ++i) {
+ const Type& child = *type.getSubtype(i);
if (selectedColumns[static_cast<size_t>(child.getColumnId())]) {
- childrenReader[i] = buildReader(child, stripe);
+ childrenReader[i] =
+ buildReader(child, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow);
}
}
}
@@ -1352,17 +1231,17 @@ namespace orc {
const uint64_t BUFFER_SIZE = 1024;
char buffer[BUFFER_SIZE];
uint64_t lengthsRead = 0;
- int64_t *counts = childrenCounts.data();
+ int64_t* counts = childrenCounts.data();
memset(counts, 0, sizeof(int64_t) * numChildren);
while (lengthsRead < numValues) {
uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
rle->next(buffer, chunk, nullptr);
- for(size_t i=0; i < chunk; ++i) {
+ for (size_t i = 0; i < chunk; ++i) {
counts[static_cast<size_t>(buffer[i])] += 1;
}
lengthsRead += chunk;
}
- for(size_t i=0; i < numChildren; ++i) {
+ for (size_t i = 0; i < numChildren; ++i) {
if (counts[i] != 0 && childrenReader[i] != nullptr) {
childrenReader[i]->skip(static_cast<uint64_t>(counts[i]));
}
@@ -1370,63 +1249,57 @@ namespace orc {
return numValues;
}
- void UnionColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void UnionColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
nextInternal<false>(rowBatch, numValues, notNull);
}
- void UnionColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void UnionColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
nextInternal<true>(rowBatch, numValues, notNull);
}
- template<bool encoded>
- void UnionColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ template <bool encoded>
+ void UnionColumnReader::nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
- UnionVectorBatch &unionBatch = dynamic_cast<UnionVectorBatch&>(rowBatch);
+ UnionVectorBatch& unionBatch = dynamic_cast<UnionVectorBatch&>(rowBatch);
uint64_t* offsets = unionBatch.offsets.data();
int64_t* counts = childrenCounts.data();
memset(counts, 0, sizeof(int64_t) * numChildren);
unsigned char* tags = unionBatch.tags.data();
notNull = unionBatch.hasNulls ? unionBatch.notNull.data() : nullptr;
- rle->next(reinterpret_cast<char *>(tags), numValues, notNull);
+ rle->next(reinterpret_cast<char*>(tags), numValues, notNull);
// set the offsets for each row
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
- offsets[i] =
- static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++);
+ offsets[i] = static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++);
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
- offsets[i] =
- static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++);
+ for (size_t i = 0; i < numValues; ++i) {
+ offsets[i] = static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++);
}
}
// read the right number of each child column
- for(size_t i=0; i < numChildren; ++i) {
+ for (size_t i = 0; i < numChildren; ++i) {
if (childrenReader[i] != nullptr) {
if (encoded) {
childrenReader[i]->nextEncoded(*(unionBatch.children[i]),
- static_cast<uint64_t>(counts[i]), nullptr);
+ static_cast<uint64_t>(counts[i]), nullptr);
} else {
- childrenReader[i]->next(*(unionBatch.children[i]),
- static_cast<uint64_t>(counts[i]), nullptr);
+ childrenReader[i]->next(*(unionBatch.children[i]), static_cast<uint64_t>(counts[i]),
+ nullptr);
}
}
}
}
void UnionColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
rle->seek(positions.at(columnId));
- for(size_t i = 0; i < numChildren; ++i) {
+ for (size_t i = 0; i < numChildren; ++i) {
if (childrenReader[i] != nullptr) {
childrenReader[i]->seekToRowGroup(positions);
}
@@ -1446,13 +1319,13 @@ namespace orc {
}
}
- class Decimal64ColumnReader: public ColumnReader {
- public:
+ class Decimal64ColumnReader : public ColumnReader {
+ public:
static const uint32_t MAX_PRECISION_64 = 18;
static const uint32_t MAX_PRECISION_128 = 38;
static const int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1];
- protected:
+ protected:
std::unique_ptr<SeekableInputStream> valueStream;
int32_t precision;
int32_t scale;
@@ -1467,9 +1340,8 @@ namespace orc {
void readBuffer() {
while (buffer == bufferEnd) {
int length;
- if (!valueStream->Next(reinterpret_cast<const void**>(&buffer),
- &length)) {
- throw ParseError("Read past end of stream in Decimal64ColumnReader "+
+ if (!valueStream->Next(reinterpret_cast<const void**>(&buffer), &length)) {
+ throw ParseError("Read past end of stream in Decimal64ColumnReader " +
valueStream->getName());
}
bufferEnd = buffer + length;
@@ -1489,69 +1361,61 @@ namespace orc {
}
}
value = unZigZag(static_cast<uint64_t>(value));
- if (scale > currentScale &&
- static_cast<uint64_t>(scale - currentScale) <= MAX_PRECISION_64) {
+ if (scale > currentScale && static_cast<uint64_t>(scale - currentScale) <= MAX_PRECISION_64) {
value *= POWERS_OF_TEN[scale - currentScale];
} else if (scale < currentScale &&
- static_cast<uint64_t>(currentScale - scale) <= MAX_PRECISION_64) {
+ static_cast<uint64_t>(currentScale - scale) <= MAX_PRECISION_64) {
value /= POWERS_OF_TEN[currentScale - scale];
} else if (scale != currentScale) {
throw ParseError("Decimal scale out of range");
}
}
- public:
+ public:
Decimal64ColumnReader(const Type& type, StripeStreams& stipe);
~Decimal64ColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
};
const uint32_t Decimal64ColumnReader::MAX_PRECISION_64;
const uint32_t Decimal64ColumnReader::MAX_PRECISION_128;
- const int64_t Decimal64ColumnReader::POWERS_OF_TEN[MAX_PRECISION_64 + 1]=
- {1,
- 10,
- 100,
- 1000,
- 10000,
- 100000,
- 1000000,
- 10000000,
- 100000000,
- 1000000000,
- 10000000000,
- 100000000000,
- 1000000000000,
- 10000000000000,
- 100000000000000,
- 1000000000000000,
- 10000000000000000,
- 100000000000000000,
- 1000000000000000000};
-
- Decimal64ColumnReader::Decimal64ColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
+ const int64_t Decimal64ColumnReader::POWERS_OF_TEN[MAX_PRECISION_64 + 1] = {1,
+ 10,
+ 100,
+ 1000,
+ 10000,
+ 100000,
+ 1000000,
+ 10000000,
+ 100000000,
+ 1000000000,
+ 10000000000,
+ 100000000000,
+ 1000000000000,
+ 10000000000000,
+ 100000000000000,
+ 1000000000000000,
+ 10000000000000000,
+ 100000000000000000,
+ 1000000000000000000};
+
+ Decimal64ColumnReader::Decimal64ColumnReader(const Type& type, StripeStreams& stripe)
+ : ColumnReader(type, stripe) {
scale = static_cast<int32_t>(type.getScale());
precision = static_cast<int32_t>(type.getPrecision());
valueStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (valueStream == nullptr)
- throw ParseError("DATA stream not found in Decimal64Column");
+ if (valueStream == nullptr) throw ParseError("DATA stream not found in Decimal64Column");
buffer = nullptr;
bufferEnd = nullptr;
RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true);
- if (stream == nullptr)
- throw ParseError("SECONDARY stream not found in Decimal64Column");
- scaleDecoder = createRleDecoder(std::move(stream), true, vers, memoryPool);
+ if (stream == nullptr) throw ParseError("SECONDARY stream not found in Decimal64Column");
+ scaleDecoder = createRleDecoder(std::move(stream), true, vers, memoryPool, metrics);
}
Decimal64ColumnReader::~Decimal64ColumnReader() {
@@ -1571,13 +1435,10 @@ namespace orc {
return numValues;
}
- void Decimal64ColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void Decimal64ColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- Decimal64VectorBatch &batch =
- dynamic_cast<Decimal64VectorBatch&>(rowBatch);
+ Decimal64VectorBatch& batch = dynamic_cast<Decimal64VectorBatch&>(rowBatch);
int64_t* values = batch.values.data();
// read the next group of scales
int64_t* scaleBuffer = batch.readScales.data();
@@ -1585,13 +1446,13 @@ namespace orc {
batch.precision = precision;
batch.scale = scale;
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
readInt64(values[i], static_cast<int32_t>(scaleBuffer[i]));
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
readInt64(values[i], static_cast<int32_t>(scaleBuffer[i]));
}
}
@@ -1599,28 +1460,25 @@ namespace orc {
void scaleInt128(Int128& value, uint32_t scale, uint32_t currentScale) {
if (scale > currentScale) {
- while(scale > currentScale) {
+ while (scale > currentScale) {
uint32_t scaleAdjust =
- std::min(Decimal64ColumnReader::MAX_PRECISION_64,
- scale - currentScale);
+ std::min(Decimal64ColumnReader::MAX_PRECISION_64, scale - currentScale);
value *= Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust];
currentScale += scaleAdjust;
}
} else if (scale < currentScale) {
Int128 remainder;
- while(currentScale > scale) {
+ while (currentScale > scale) {
uint32_t scaleAdjust =
- std::min(Decimal64ColumnReader::MAX_PRECISION_64,
- currentScale - scale);
- value = value.divide(Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust],
- remainder);
+ std::min(Decimal64ColumnReader::MAX_PRECISION_64, currentScale - scale);
+ value = value.divide(Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust], remainder);
currentScale -= scaleAdjust;
}
}
}
void Decimal64ColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
valueStream->seek(positions.at(columnId));
scaleDecoder->seek(positions.at(columnId));
@@ -1629,16 +1487,14 @@ namespace orc {
bufferEnd = nullptr;
}
- class Decimal128ColumnReader: public Decimal64ColumnReader {
- public:
+ class Decimal128ColumnReader : public Decimal64ColumnReader {
+ public:
Decimal128ColumnReader(const Type& type, StripeStreams& stipe);
~Decimal128ColumnReader() override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- private:
+ private:
void readInt128(Int128& value, int32_t currentScale) {
value = 0;
Int128 work;
@@ -1648,22 +1504,19 @@ namespace orc {
unsigned char ch = static_cast<unsigned char>(*(buffer++));
work = ch & 0x7f;
work <<= offset;
- value |= work;
+ value |= work;
offset += 7;
if (!(ch & 0x80)) {
break;
}
}
unZigZagInt128(value);
- scaleInt128(value, static_cast<uint32_t>(scale),
- static_cast<uint32_t>(currentScale));
+ scaleInt128(value, static_cast<uint32_t>(scale), static_cast<uint32_t>(currentScale));
}
};
- Decimal128ColumnReader::Decimal128ColumnReader
- (const Type& type,
- StripeStreams& stripe
- ): Decimal64ColumnReader(type, stripe) {
+ Decimal128ColumnReader::Decimal128ColumnReader(const Type& type, StripeStreams& stripe)
+ : Decimal64ColumnReader(type, stripe) {
// PASS
}
@@ -1671,13 +1524,11 @@ namespace orc {
// PASS
}
- void Decimal128ColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void Decimal128ColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- Decimal128VectorBatch &batch =
- dynamic_cast<Decimal128VectorBatch&>(rowBatch);
+ Decimal128VectorBatch& batch = dynamic_cast<Decimal128VectorBatch&>(rowBatch);
Int128* values = batch.values.data();
// read the next group of scales
int64_t* scaleBuffer = batch.readScales.data();
@@ -1685,38 +1536,35 @@ namespace orc {
batch.precision = precision;
batch.scale = scale;
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]));
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]));
}
}
}
- class Decimal64ColumnReaderV2: public ColumnReader {
- protected:
+ class Decimal64ColumnReaderV2 : public ColumnReader {
+ protected:
std::unique_ptr<RleDecoder> valueDecoder;
int32_t precision;
int32_t scale;
- public:
+ public:
Decimal64ColumnReaderV2(const Type& type, StripeStreams& stripe);
~Decimal64ColumnReaderV2() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
};
- Decimal64ColumnReaderV2::Decimal64ColumnReaderV2(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
+ Decimal64ColumnReaderV2::Decimal64ColumnReaderV2(const Type& type, StripeStreams& stripe)
+ : ColumnReader(type, stripe) {
scale = static_cast<int32_t>(type.getScale());
precision = static_cast<int32_t>(type.getPrecision());
std::unique_ptr<SeekableInputStream> stream =
@@ -1726,7 +1574,7 @@ namespace orc {
ss << "DATA stream not found in Decimal64V2 column. ColumnId=" << columnId;
throw ParseError(ss.str());
}
- valueDecoder = createRleDecoder(std::move(stream), true, RleVersion_2, memoryPool);
+ valueDecoder = createRleDecoder(std::move(stream), true, RleVersion_2, memoryPool, metrics);
}
Decimal64ColumnReaderV2::~Decimal64ColumnReaderV2() {
@@ -1739,20 +1587,18 @@ namespace orc {
return numValues;
}
- void Decimal64ColumnReaderV2::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void Decimal64ColumnReaderV2::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- Decimal64VectorBatch &batch =
- dynamic_cast<Decimal64VectorBatch&>(rowBatch);
+ Decimal64VectorBatch& batch = dynamic_cast<Decimal64VectorBatch&>(rowBatch);
valueDecoder->next(batch.values.data(), numValues, notNull);
batch.precision = precision;
batch.scale = scale;
}
- class DecimalHive11ColumnReader: public Decimal64ColumnReader {
- private:
+ class DecimalHive11ColumnReader : public Decimal64ColumnReader {
+ private:
bool throwOnOverflow;
std::ostream* errorStream;
@@ -1762,7 +1608,7 @@ namespace orc {
bool readInt128(Int128& value, int32_t currentScale) {
// -/+ 99999999999999999999999999999999999999
static const Int128 MIN_VALUE(-0x4b3b4ca85a86c47b, 0xf675ddc000000001);
- static const Int128 MAX_VALUE( 0x4b3b4ca85a86c47a, 0x098a223fffffffff);
+ static const Int128 MAX_VALUE(0x4b3b4ca85a86c47a, 0x098a223fffffffff);
value = 0;
Int128 work;
@@ -1778,7 +1624,7 @@ namespace orc {
result = false;
}
work <<= offset;
- value |= work;
+ value |= work;
offset += 7;
if (!(ch & 0x80)) {
break;
@@ -1789,24 +1635,19 @@ namespace orc {
return result;
}
unZigZagInt128(value);
- scaleInt128(value, static_cast<uint32_t>(scale),
- static_cast<uint32_t>(currentScale));
+ scaleInt128(value, static_cast<uint32_t>(scale), static_cast<uint32_t>(currentScale));
return value >= MIN_VALUE && value <= MAX_VALUE;
}
- public:
+ public:
DecimalHive11ColumnReader(const Type& type, StripeStreams& stipe);
~DecimalHive11ColumnReader() override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
};
- DecimalHive11ColumnReader::DecimalHive11ColumnReader
- (const Type& type,
- StripeStreams& stripe
- ): Decimal64ColumnReader(type, stripe) {
+ DecimalHive11ColumnReader::DecimalHive11ColumnReader(const Type& type, StripeStreams& stripe)
+ : Decimal64ColumnReader(type, stripe) {
scale = stripe.getForcedScaleOnHive11Decimal();
throwOnOverflow = stripe.getThrowOnHive11DecimalOverflow();
errorStream = stripe.getErrorStream();
@@ -1816,13 +1657,11 @@ namespace orc {
// PASS
}
- void DecimalHive11ColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void DecimalHive11ColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- Decimal128VectorBatch &batch =
- dynamic_cast<Decimal128VectorBatch&>(rowBatch);
+ Decimal128VectorBatch& batch = dynamic_cast<Decimal128VectorBatch&>(rowBatch);
Int128* values = batch.values.data();
// read the next group of scales
int64_t* scaleBuffer = batch.readScales.data();
@@ -1832,10 +1671,9 @@ namespace orc {
batch.precision = precision;
batch.scale = scale;
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
- if (!readInt128(values[i],
- static_cast<int32_t>(scaleBuffer[i]))) {
+ if (!readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]))) {
if (throwOnOverflow) {
throw ParseError("Hive 0.11 decimal was more than 38 digits.");
} else {
@@ -1848,9 +1686,8 @@ namespace orc {
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
- if (!readInt128(values[i],
- static_cast<int32_t>(scaleBuffer[i]))) {
+ for (size_t i = 0; i < numValues; ++i) {
+ if (!readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]))) {
if (throwOnOverflow) {
throw ParseError("Hive 0.11 decimal was more than 38 digits.");
} else {
@@ -1866,109 +1703,133 @@ namespace orc {
}
static bool isLittleEndian() {
- static union { uint32_t i; char c[4]; } num = { 0x01020304 };
+ static union {
+ uint32_t i;
+ char c[4];
+ } num = {0x01020304};
return num.c[0] == 4;
}
/**
* Create a reader for the given stripe.
*/
- std::unique_ptr<ColumnReader> buildReader(const Type& type,
- StripeStreams& stripe) {
- switch (static_cast<int64_t>(type.getKind())) {
- case DATE:
- case INT:
- case LONG:
- case SHORT:
- return std::unique_ptr<ColumnReader>(
- new IntegerColumnReader(type, stripe));
- case BINARY:
- case CHAR:
- case STRING:
- case VARCHAR:
- switch (static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())){
- case proto::ColumnEncoding_Kind_DICTIONARY:
- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
- return std::unique_ptr<ColumnReader>(
- new StringDictionaryColumnReader(type, stripe));
- case proto::ColumnEncoding_Kind_DIRECT:
- case proto::ColumnEncoding_Kind_DIRECT_V2:
- return std::unique_ptr<ColumnReader>(
- new StringDirectColumnReader(type, stripe));
- default:
- throw NotImplementedYet("buildReader unhandled string encoding");
- }
+ std::unique_ptr<ColumnReader> buildReader(const Type& type, StripeStreams& stripe,
+ bool useTightNumericVector,
+ bool throwOnSchemaEvolutionOverflow,
+ bool convertToReadType) {
+ if (convertToReadType && stripe.getSchemaEvolution() &&
+ stripe.getSchemaEvolution()->needConvert(type)) {
+ return buildConvertReader(type, stripe, useTightNumericVector,
+ throwOnSchemaEvolutionOverflow);
+ }
- case BOOLEAN:
- return std::unique_ptr<ColumnReader>(
- new BooleanColumnReader(type, stripe));
+ switch (static_cast<int64_t>(type.getKind())) {
+ case SHORT:
+ if (useTightNumericVector) {
+ return std::make_unique<IntegerColumnReader<ShortVectorBatch>>(type, stripe);
+ }
+ return std::make_unique<IntegerColumnReader<LongVectorBatch>>(type, stripe);
+ case INT:
+ if (useTightNumericVector) {
+ return std::make_unique<IntegerColumnReader<IntVectorBatch>>(type, stripe);
+ }
+ return std::make_unique<IntegerColumnReader<LongVectorBatch>>(type, stripe);
+ case LONG:
+ case DATE:
+ return std::make_unique<IntegerColumnReader<LongVectorBatch>>(type, stripe);
+ case BINARY:
+ case CHAR:
+ case STRING:
+ case VARCHAR:
+ switch (static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())) {
+ case proto::ColumnEncoding_Kind_DICTIONARY:
+ case proto::ColumnEncoding_Kind_DICTIONARY_V2:
+ return std::make_unique<StringDictionaryColumnReader>(type, stripe);
+ case proto::ColumnEncoding_Kind_DIRECT:
+ case proto::ColumnEncoding_Kind_DIRECT_V2:
+ return std::make_unique<StringDirectColumnReader>(type, stripe);
+ default:
+ throw NotImplementedYet("buildReader unhandled string encoding");
+ }
- case BYTE:
- return std::unique_ptr<ColumnReader>(
- new ByteColumnReader(type, stripe));
+ case BOOLEAN: {
+ if (useTightNumericVector) {
+ return std::make_unique<BooleanColumnReader<ByteVectorBatch>>(type, stripe);
+ } else {
+ return std::make_unique<BooleanColumnReader<LongVectorBatch>>(type, stripe);
+ }
+ }
- case LIST:
- return std::unique_ptr<ColumnReader>(
- new ListColumnReader(type, stripe));
+ case BYTE:
+ if (useTightNumericVector) {
+ return std::make_unique<ByteColumnReader<ByteVectorBatch>>(type, stripe);
+ }
+ return std::make_unique<ByteColumnReader<LongVectorBatch>>(type, stripe);
- case MAP:
- return std::unique_ptr<ColumnReader>(
- new MapColumnReader(type, stripe));
+ case LIST:
+ return std::make_unique<ListColumnReader>(type, stripe, useTightNumericVector,
+ throwOnSchemaEvolutionOverflow);
- case UNION:
- return std::unique_ptr<ColumnReader>(
- new UnionColumnReader(type, stripe));
+ case MAP:
+ return std::make_unique<MapColumnReader>(type, stripe, useTightNumericVector,
+ throwOnSchemaEvolutionOverflow);
- case STRUCT:
- return std::unique_ptr<ColumnReader>(
- new StructColumnReader(type, stripe));
+ case UNION:
+ return std::make_unique<UnionColumnReader>(type, stripe, useTightNumericVector,
+ throwOnSchemaEvolutionOverflow);
- case FLOAT:
- if (isLittleEndian()) {
- return std::unique_ptr<ColumnReader>(
- new DoubleColumnReader<FLOAT, true>(type, stripe));
- }
- return std::unique_ptr<ColumnReader>(
- new DoubleColumnReader<FLOAT, false>(type, stripe));
+ case STRUCT:
+ return std::make_unique<StructColumnReader>(type, stripe, useTightNumericVector,
+ throwOnSchemaEvolutionOverflow);
- case DOUBLE:
- if (isLittleEndian()) {
- return std::unique_ptr<ColumnReader>(
- new DoubleColumnReader<DOUBLE, true>(type, stripe));
- }
- return std::unique_ptr<ColumnReader>(
- new DoubleColumnReader<DOUBLE, false>(type, stripe));
-
- case TIMESTAMP:
- return std::unique_ptr<ColumnReader>
- (new TimestampColumnReader(type, stripe, false));
-
- case TIMESTAMP_INSTANT:
- return std::unique_ptr<ColumnReader>
- (new TimestampColumnReader(type, stripe, true));
-
- case DECIMAL:
- // is this a Hive 0.11 or 0.12 file?
- if (type.getPrecision() == 0) {
- return std::unique_ptr<ColumnReader>
- (new DecimalHive11ColumnReader(type, stripe));
+ case FLOAT: {
+ if (useTightNumericVector) {
+ if (isLittleEndian()) {
+ return std::make_unique<DoubleColumnReader<FLOAT, true, float, FloatVectorBatch>>(
+ type, stripe);
+ }
+ return std::make_unique<DoubleColumnReader<FLOAT, false, float, FloatVectorBatch>>(
+ type, stripe);
+ }
+ if (isLittleEndian()) {
+ return std::make_unique<DoubleColumnReader<FLOAT, true, double, DoubleVectorBatch>>(
+ type, stripe);
+ }
+ return std::make_unique<DoubleColumnReader<FLOAT, false, double, DoubleVectorBatch>>(
+ type, stripe);
}
- // can we represent the values using int64_t?
- if (type.getPrecision() <= Decimal64ColumnReader::MAX_PRECISION_64) {
- if (stripe.isDecimalAsLong()) {
- return std::unique_ptr<ColumnReader>
- (new Decimal64ColumnReaderV2(type, stripe));
+ case DOUBLE: {
+ if (isLittleEndian()) {
+ return std::make_unique<DoubleColumnReader<DOUBLE, true, double, DoubleVectorBatch>>(
+ type, stripe);
}
- return std::unique_ptr<ColumnReader>
- (new Decimal64ColumnReader(type, stripe));
+ return std::make_unique<DoubleColumnReader<DOUBLE, false, double, DoubleVectorBatch>>(
+ type, stripe);
}
- // otherwise we use the Int128 implementation
- return std::unique_ptr<ColumnReader>
- (new Decimal128ColumnReader(type, stripe));
+ case TIMESTAMP:
+ return std::make_unique<TimestampColumnReader>(type, stripe, false);
+
+ case TIMESTAMP_INSTANT:
+ return std::make_unique<TimestampColumnReader>(type, stripe, true);
- default:
- throw NotImplementedYet("buildReader unhandled type");
+ case DECIMAL:
+ // is this a Hive 0.11 or 0.12 file?
+ if (type.getPrecision() == 0) {
+ return std::make_unique<DecimalHive11ColumnReader>(type, stripe);
+ }
+ // can we represent the values using int64_t?
+ if (type.getPrecision() <= Decimal64ColumnReader::MAX_PRECISION_64) {
+ if (stripe.isDecimalAsLong()) {
+ return std::make_unique<Decimal64ColumnReaderV2>(type, stripe);
+ }
+ return std::make_unique<Decimal64ColumnReader>(type, stripe);
+ }
+ // otherwise we use the Int128 implementation
+ return std::make_unique<Decimal128ColumnReader>(type, stripe);
+
+ default:
+ throw NotImplementedYet("buildReader unhandled type");
}
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/ColumnReader.hh b/contrib/libs/apache/orc/c++/src/ColumnReader.hh
index 80b59de2c1..f0f3fe1b52 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnReader.hh
+++ b/contrib/libs/apache/orc/c++/src/ColumnReader.hh
@@ -30,8 +30,10 @@
namespace orc {
+ class SchemaEvolution;
+
class StripeStreams {
- public:
+ public:
virtual ~StripeStreams();
/**
@@ -53,10 +55,9 @@ namespace orc {
* @param shouldStream should the reading page the stream in
* @return the new stream
*/
- virtual std::unique_ptr<SeekableInputStream>
- getStream(uint64_t columnId,
- proto::Stream_Kind kind,
- bool shouldStream) const = 0;
+ virtual std::unique_ptr<SeekableInputStream> getStream(uint64_t columnId,
+ proto::Stream_Kind kind,
+ bool shouldStream) const = 0;
/**
* Get the memory pool for this reader.
@@ -64,6 +65,11 @@ namespace orc {
virtual MemoryPool& getMemoryPool() const = 0;
/**
+ * Get the reader metrics for this reader.
+ */
+ virtual ReaderMetrics* getReaderMetrics() const = 0;
+
+ /**
* Get the writer's timezone, so that we can convert their dates correctly.
*/
virtual const Timezone& getWriterTimezone() const = 0;
@@ -97,18 +103,24 @@ namespace orc {
* encoded in RLE.
*/
virtual bool isDecimalAsLong() const = 0;
+
+ /**
+ * @return get schema evolution utility object
+ */
+ virtual const SchemaEvolution* getSchemaEvolution() const = 0;
};
/**
* The interface for reading ORC data types.
*/
class ColumnReader {
- protected:
+ protected:
std::unique_ptr<ByteRleDecoder> notNullDecoder;
uint64_t columnId;
MemoryPool& memoryPool;
+ ReaderMetrics* metrics;
- public:
+ public:
ColumnReader(const Type& type, StripeStreams& stipe);
virtual ~ColumnReader();
@@ -128,9 +140,7 @@ namespace orc {
* a mask (with at least numValues bytes) for which values to
* set.
*/
- virtual void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull);
+ virtual void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull);
/**
* Read the next group of values without decoding
@@ -140,10 +150,7 @@ namespace orc {
* a mask (with at least numValues bytes) for which values to
* set.
*/
- virtual void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull)
- {
+ virtual void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
rowBatch.isEncoded = false;
next(rowBatch, numValues, notNull);
}
@@ -152,16 +159,16 @@ namespace orc {
* Seek to beginning of a row group in the current stripe
* @param positions a list of PositionProviders storing the positions
*/
- virtual void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions);
-
+ virtual void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions);
};
/**
* Create a reader for the given stripe.
*/
- std::unique_ptr<ColumnReader> buildReader(const Type& type,
- StripeStreams& stripe);
-}
+ std::unique_ptr<ColumnReader> buildReader(const Type& type, StripeStreams& stripe,
+ bool useTightNumericVector = false,
+ bool throwOnSchemaEvolutionOverflow = false,
+ bool convertToReadType = true);
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc
index 32b68af349..f24be1f0b2 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc
+++ b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc
@@ -27,55 +27,43 @@
namespace orc {
StreamsFactory::~StreamsFactory() {
- //PASS
+ // PASS
}
class StreamsFactoryImpl : public StreamsFactory {
- public:
- StreamsFactoryImpl(
- const WriterOptions& writerOptions,
- OutputStream* outputStream) :
- options(writerOptions),
- outStream(outputStream) {
- }
-
- virtual std::unique_ptr<BufferedOutputStream>
- createStream(proto::Stream_Kind kind) const override;
- private:
+ public:
+ StreamsFactoryImpl(const WriterOptions& writerOptions, OutputStream* outputStream)
+ : options(writerOptions), outStream(outputStream) {}
+
+ virtual std::unique_ptr<BufferedOutputStream> createStream(
+ proto::Stream_Kind kind) const override;
+
+ private:
const WriterOptions& options;
OutputStream* outStream;
};
- std::unique_ptr<BufferedOutputStream> StreamsFactoryImpl::createStream(
- proto::Stream_Kind) const {
+ std::unique_ptr<BufferedOutputStream> StreamsFactoryImpl::createStream(proto::Stream_Kind) const {
// In the future, we can decide compression strategy and modifier
// based on stream kind. But for now we just use the setting from
// WriterOption
- return createCompressor(
- options.getCompression(),
- outStream,
- options.getCompressionStrategy(),
+ return createCompressor(options.getCompression(), outStream, options.getCompressionStrategy(),
// BufferedOutputStream initial capacity
- 1 * 1024 * 1024,
- options.getCompressionBlockSize(),
- *options.getMemoryPool());
+ options.getOutputBufferCapacity(), options.getCompressionBlockSize(),
+ *options.getMemoryPool(), options.getWriterMetrics());
}
- std::unique_ptr<StreamsFactory> createStreamsFactory(
- const WriterOptions& options,
- OutputStream* outStream) {
- return std::unique_ptr<StreamsFactory>(
- new StreamsFactoryImpl(options, outStream));
+ std::unique_ptr<StreamsFactory> createStreamsFactory(const WriterOptions& options,
+ OutputStream* outStream) {
+ return std::make_unique<StreamsFactoryImpl>(options, outStream);
}
RowIndexPositionRecorder::~RowIndexPositionRecorder() {
// PASS
}
- proto::ColumnEncoding_Kind RleVersionMapper(RleVersion rleVersion)
- {
- switch (rleVersion)
- {
+ proto::ColumnEncoding_Kind RleVersionMapper(RleVersion rleVersion) {
+ switch (rleVersion) {
case RleVersion_1:
return proto::ColumnEncoding_Kind_DIRECT;
case RleVersion_2:
@@ -85,24 +73,21 @@ namespace orc {
}
}
- ColumnWriter::ColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- columnId(type.getColumnId()),
- colIndexStatistics(),
- colStripeStatistics(),
- colFileStatistics(),
- enableIndex(options.getEnableIndex()),
- rowIndex(),
- rowIndexEntry(),
- rowIndexPosition(),
- enableBloomFilter(false),
- memPool(*options.getMemoryPool()),
- indexStream(),
- bloomFilterStream(),
- hasNullValue(false) {
-
+ ColumnWriter::ColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : columnId(type.getColumnId()),
+ colIndexStatistics(),
+ colStripeStatistics(),
+ colFileStatistics(),
+ enableIndex(options.getEnableIndex()),
+ rowIndex(),
+ rowIndexEntry(),
+ rowIndexPosition(),
+ enableBloomFilter(false),
+ memPool(*options.getMemoryPool()),
+ indexStream(),
+ bloomFilterStream(),
+ hasNullValue(false) {
std::unique_ptr<BufferedOutputStream> presentStream =
factory.createStream(proto::Stream_Kind_PRESENT);
notNullEncoder = createBooleanRleEncoder(std::move(presentStream));
@@ -112,20 +97,17 @@ namespace orc {
colFileStatistics = createColumnStatistics(type);
if (enableIndex) {
- rowIndex = std::unique_ptr<proto::RowIndex>(new proto::RowIndex());
- rowIndexEntry =
- std::unique_ptr<proto::RowIndexEntry>(new proto::RowIndexEntry());
- rowIndexPosition = std::unique_ptr<RowIndexPositionRecorder>(
- new RowIndexPositionRecorder(*rowIndexEntry));
- indexStream =
- factory.createStream(proto::Stream_Kind_ROW_INDEX);
+ rowIndex = std::make_unique<proto::RowIndex>();
+ rowIndexEntry = std::make_unique<proto::RowIndexEntry>();
+ rowIndexPosition = std::make_unique<RowIndexPositionRecorder>(*rowIndexEntry);
+ indexStream = factory.createStream(proto::Stream_Kind_ROW_INDEX);
// BloomFilters for non-UTF8 strings and non-UTC timestamps are not supported
- if (options.isColumnUseBloomFilter(columnId)
- && options.getBloomFilterVersion() == BloomFilterVersion::UTF8) {
+ if (options.isColumnUseBloomFilter(columnId) &&
+ options.getBloomFilterVersion() == BloomFilterVersion::UTF8) {
enableBloomFilter = true;
- bloomFilter.reset(new BloomFilterImpl(
- options.getRowIndexStride(), options.getBloomFilterFPP()));
+ bloomFilter.reset(
+ new BloomFilterImpl(options.getRowIndexStride(), options.getBloomFilterFPP()));
bloomFilterIndex.reset(new proto::BloomFilterIndex());
bloomFilterStream = factory.createStream(proto::Stream_Kind_BLOOM_FILTER_UTF8);
}
@@ -136,9 +118,7 @@ namespace orc {
// PASS
}
- void ColumnWriter::add(ColumnVectorBatch& batch,
- uint64_t offset,
- uint64_t numValues,
+ void ColumnWriter::add(ColumnVectorBatch& batch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
const char* notNull = batch.notNull.data() + offset;
notNullEncoder->add(notNull, numValues, incomingMask);
@@ -167,8 +147,7 @@ namespace orc {
return notNullEncoder->getBufferSize();
}
- void ColumnWriter::getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void ColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const {
getProtoBufStatistics(stats, colStripeStatistics.get());
}
@@ -182,13 +161,12 @@ namespace orc {
colIndexStatistics->reset();
}
- void ColumnWriter::getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void ColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const {
getProtoBufStatistics(stats, colFileStatistics.get());
}
void ColumnWriter::createRowIndexEntry() {
- proto::ColumnStatistics *indexStats = rowIndexEntry->mutable_statistics();
+ proto::ColumnStatistics* indexStats = rowIndexEntry->mutable_statistics();
colIndexStatistics->toProtoBuf(*indexStats);
*rowIndex->add_entry() = *rowIndexEntry;
@@ -206,12 +184,12 @@ namespace orc {
void ColumnWriter::addBloomFilterEntry() {
if (enableBloomFilter) {
- BloomFilterUTF8Utils::serialize(*bloomFilter, *bloomFilterIndex->add_bloomfilter());
+ BloomFilterUTF8Utils::serialize(*bloomFilter, *bloomFilterIndex->add_bloom_filter());
bloomFilter->reset();
}
}
- void ColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const {
+ void ColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const {
if (!hasNullValue) {
// remove positions of present stream
int presentCount = indexStream->isCompressed() ? 4 : 3;
@@ -266,7 +244,7 @@ namespace orc {
if (enableBloomFilter) {
bloomFilter->reset();
- bloomFilterIndex->clear_bloomfilter();
+ bloomFilterIndex->clear_bloom_filter();
}
}
@@ -275,28 +253,21 @@ namespace orc {
}
class StructColumnWriter : public ColumnWriter {
- public:
- StructColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
+ public:
+ StructColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
- virtual void getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
- virtual void getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
virtual void mergeStripeStatsIntoFileStats() override;
@@ -304,23 +275,20 @@ namespace orc {
virtual void createRowIndexEntry() override;
- virtual void writeIndex(
- std::vector<proto::Stream> &streams) const override;
+ virtual void writeIndex(std::vector<proto::Stream>& streams) const override;
virtual void writeDictionary() override;
virtual void reset() override;
- private:
+ private:
std::vector<std::unique_ptr<ColumnWriter>> children;
};
- StructColumnWriter::StructColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options) {
- for(unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
+ StructColumnWriter::StructColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options) {
+ for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
const Type& child = *type.getSubtype(i);
children.push_back(buildWriter(child, factory, options));
}
@@ -330,20 +298,15 @@ namespace orc {
}
}
- void StructColumnWriter::add(
- ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void StructColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
- const StructVectorBatch* structBatch =
- dynamic_cast<const StructVectorBatch *>(&rowBatch);
+ const StructVectorBatch* structBatch = dynamic_cast<const StructVectorBatch*>(&rowBatch);
if (structBatch == nullptr) {
throw InvalidArgument("Failed to cast to StructVectorBatch");
}
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- const char* notNull = structBatch->hasNulls ?
- structBatch->notNull.data() + offset : nullptr;
+ const char* notNull = structBatch->hasNulls ? structBatch->notNull.data() + offset : nullptr;
for (uint32_t i = 0; i < children.size(); ++i) {
children[i]->add(*structBatch->fields[i], offset, numValues, notNull);
}
@@ -372,8 +335,7 @@ namespace orc {
}
}
- void StructColumnWriter::writeIndex(
- std::vector<proto::Stream> &streams) const {
+ void StructColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const {
ColumnWriter::writeIndex(streams);
for (uint32_t i = 0; i < children.size(); ++i) {
children[i]->writeIndex(streams);
@@ -388,19 +350,17 @@ namespace orc {
return size;
}
- void StructColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ void StructColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
- encoding.set_dictionarysize(0);
+ encoding.set_dictionary_size(0);
encodings.push_back(encoding);
for (uint32_t i = 0; i < children.size(); ++i) {
children[i]->getColumnEncoding(encodings);
}
}
- void StructColumnWriter::getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void StructColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getStripeStatistics(stats);
for (uint32_t i = 0; i < children.size(); ++i) {
@@ -416,8 +376,7 @@ namespace orc {
}
}
- void StructColumnWriter::getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void StructColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getFileStatistics(stats);
for (uint32_t i = 0; i < children.size(); ++i) {
@@ -425,7 +384,7 @@ namespace orc {
}
}
- void StructColumnWriter::mergeRowGroupStatsIntoStripeStats() {
+ void StructColumnWriter::mergeRowGroupStatsIntoStripeStats() {
ColumnWriter::mergeRowGroupStatsIntoStripeStats();
for (uint32_t i = 0; i < children.size(); ++i) {
@@ -455,47 +414,38 @@ namespace orc {
}
}
+ template <typename BatchType>
class IntegerColumnWriter : public ColumnWriter {
- public:
- IntegerColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
+ public:
+ IntegerColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
- protected:
+ protected:
std::unique_ptr<RleEncoder> rleEncoder;
- private:
+ private:
RleVersion rleVersion;
};
- IntegerColumnWriter::IntegerColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()) {
+ template <typename BatchType>
+ IntegerColumnWriter<BatchType>::IntegerColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options), rleVersion(options.getRleVersion()) {
std::unique_ptr<BufferedOutputStream> dataStream =
- factory.createStream(proto::Stream_Kind_DATA);
- rleEncoder = createRleEncoder(
- std::move(dataStream),
- true,
- rleVersion,
- memPool,
+ factory.createStream(proto::Stream_Kind_DATA);
+ rleEncoder = createRleEncoder(std::move(dataStream), true, rleVersion, memPool,
options.getAlignedBitpacking());
if (enableIndex) {
@@ -503,15 +453,12 @@ namespace orc {
}
}
- void IntegerColumnWriter::add(
- ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- const LongVectorBatch* longBatch =
- dynamic_cast<const LongVectorBatch*>(&rowBatch);
- if (longBatch == nullptr) {
- throw InvalidArgument("Failed to cast to LongVectorBatch");
+ template <typename BatchType>
+ void IntegerColumnWriter<BatchType>::add(ColumnVectorBatch& rowBatch, uint64_t offset,
+ uint64_t numValues, const char* incomingMask) {
+ const BatchType* intBatch = dynamic_cast<const BatchType*>(&rowBatch);
+ if (intBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to IntegerVectorBatch");
}
IntegerColumnStatisticsImpl* intStats =
dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get());
@@ -521,9 +468,8 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- const int64_t* data = longBatch->data.data() + offset;
- const char* notNull = longBatch->hasNulls ?
- longBatch->notNull.data() + offset : nullptr;
+ const auto* data = intBatch->data.data() + offset;
+ const char* notNull = intBatch->hasNulls ? intBatch->notNull.data() + offset : nullptr;
rleEncoder->add(data, numValues, notNull);
@@ -533,9 +479,9 @@ namespace orc {
if (notNull == nullptr || notNull[i]) {
++count;
if (enableBloomFilter) {
- bloomFilter->addLong(data[i]);
+ bloomFilter->addLong(static_cast<int64_t>(data[i]));
}
- intStats->update(data[i], 1);
+ intStats->update(static_cast<int64_t>(data[i]), 1);
}
}
intStats->increase(count);
@@ -544,7 +490,8 @@ namespace orc {
}
}
- void IntegerColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ template <typename BatchType>
+ void IntegerColumnWriter<BatchType>::flush(std::vector<proto::Stream>& streams) {
ColumnWriter::flush(streams);
proto::Stream stream;
@@ -554,59 +501,57 @@ namespace orc {
streams.push_back(stream);
}
- uint64_t IntegerColumnWriter::getEstimatedSize() const {
+ template <typename BatchType>
+ uint64_t IntegerColumnWriter<BatchType>::getEstimatedSize() const {
uint64_t size = ColumnWriter::getEstimatedSize();
size += rleEncoder->getBufferSize();
return size;
}
- void IntegerColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ template <typename BatchType>
+ void IntegerColumnWriter<BatchType>::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(RleVersionMapper(rleVersion));
- encoding.set_dictionarysize(0);
+ encoding.set_dictionary_size(0);
if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
}
encodings.push_back(encoding);
}
- void IntegerColumnWriter::recordPosition() const {
+ template <typename BatchType>
+ void IntegerColumnWriter<BatchType>::recordPosition() const {
ColumnWriter::recordPosition();
rleEncoder->recordPosition(rowIndexPosition.get());
}
+ template <typename BatchType>
class ByteColumnWriter : public ColumnWriter {
- public:
- ByteColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ public:
+ ByteColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
+
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
- private:
+ private:
std::unique_ptr<ByteRleEncoder> byteRleEncoder;
};
- ByteColumnWriter::ByteColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options) {
+ template <typename BatchType>
+ ByteColumnWriter<BatchType>::ByteColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options) {
std::unique_ptr<BufferedOutputStream> dataStream =
- factory.createStream(proto::Stream_Kind_DATA);
+ factory.createStream(proto::Stream_Kind_DATA);
byteRleEncoder = createByteRleEncoder(std::move(dataStream));
if (enableIndex) {
@@ -614,13 +559,12 @@ namespace orc {
}
}
- void ByteColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch);
+ template <typename BatchType>
+ void ByteColumnWriter<BatchType>::add(ColumnVectorBatch& rowBatch, uint64_t offset,
+ uint64_t numValues, const char* incomingMask) {
+ BatchType* byteBatch = dynamic_cast<BatchType*>(&rowBatch);
if (byteBatch == nullptr) {
- throw InvalidArgument("Failed to cast to LongVectorBatch");
+ throw InvalidArgument("Failed to cast to IntegerVectorBatch");
}
IntegerColumnStatisticsImpl* intStats =
dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get());
@@ -630,9 +574,8 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- int64_t* data = byteBatch->data.data() + offset;
- const char* notNull = byteBatch->hasNulls ?
- byteBatch->notNull.data() + offset : nullptr;
+ auto* data = byteBatch->data.data() + offset;
+ const char* notNull = byteBatch->hasNulls ? byteBatch->notNull.data() + offset : nullptr;
char* byteData = reinterpret_cast<char*>(data);
for (uint64_t i = 0; i < numValues; ++i) {
@@ -656,7 +599,8 @@ namespace orc {
}
}
- void ByteColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ template <typename BatchType>
+ void ByteColumnWriter<BatchType>::flush(std::vector<proto::Stream>& streams) {
ColumnWriter::flush(streams);
proto::Stream stream;
@@ -666,59 +610,59 @@ namespace orc {
streams.push_back(stream);
}
- uint64_t ByteColumnWriter::getEstimatedSize() const {
+ template <typename BatchType>
+ uint64_t ByteColumnWriter<BatchType>::getEstimatedSize() const {
uint64_t size = ColumnWriter::getEstimatedSize();
size += byteRleEncoder->getBufferSize();
return size;
}
- void ByteColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ template <typename BatchType>
+ void ByteColumnWriter<BatchType>::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
- encoding.set_dictionarysize(0);
+ encoding.set_dictionary_size(0);
if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
}
encodings.push_back(encoding);
}
- void ByteColumnWriter::recordPosition() const {
+ template <typename BatchType>
+ void ByteColumnWriter<BatchType>::recordPosition() const {
ColumnWriter::recordPosition();
byteRleEncoder->recordPosition(rowIndexPosition.get());
}
+ template <typename BatchType>
class BooleanColumnWriter : public ColumnWriter {
- public:
- BooleanColumnWriter(const Type& type,
- const StreamsFactory& factory,
+ public:
+ BooleanColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
- private:
+ private:
std::unique_ptr<ByteRleEncoder> rleEncoder;
};
- BooleanColumnWriter::BooleanColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options) {
+ template <typename BatchType>
+ BooleanColumnWriter<BatchType>::BooleanColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options) {
std::unique_ptr<BufferedOutputStream> dataStream =
- factory.createStream(proto::Stream_Kind_DATA);
+ factory.createStream(proto::Stream_Kind_DATA);
rleEncoder = createBooleanRleEncoder(std::move(dataStream));
if (enableIndex) {
@@ -726,13 +670,14 @@ namespace orc {
}
}
- void BooleanColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch);
+ template <typename BatchType>
+ void BooleanColumnWriter<BatchType>::add(ColumnVectorBatch& rowBatch, uint64_t offset,
+ uint64_t numValues, const char* incomingMask) {
+ BatchType* byteBatch = dynamic_cast<BatchType*>(&rowBatch);
if (byteBatch == nullptr) {
- throw InvalidArgument("Failed to cast to LongVectorBatch");
+ std::stringstream ss;
+ ss << "Failed to cast to " << typeid(BatchType).name();
+ throw InvalidArgument(ss.str());
}
BooleanColumnStatisticsImpl* boolStats =
dynamic_cast<BooleanColumnStatisticsImpl*>(colIndexStatistics.get());
@@ -742,9 +687,8 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- int64_t* data = byteBatch->data.data() + offset;
- const char* notNull = byteBatch->hasNulls ?
- byteBatch->notNull.data() + offset : nullptr;
+ auto* data = byteBatch->data.data() + offset;
+ const char* notNull = byteBatch->hasNulls ? byteBatch->notNull.data() + offset : nullptr;
char* byteData = reinterpret_cast<char*>(data);
for (uint64_t i = 0; i < numValues; ++i) {
@@ -768,7 +712,8 @@ namespace orc {
}
}
- void BooleanColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ template <typename BatchType>
+ void BooleanColumnWriter<BatchType>::flush(std::vector<proto::Stream>& streams) {
ColumnWriter::flush(streams);
proto::Stream stream;
@@ -778,65 +723,63 @@ namespace orc {
streams.push_back(stream);
}
- uint64_t BooleanColumnWriter::getEstimatedSize() const {
+ template <typename BatchType>
+ uint64_t BooleanColumnWriter<BatchType>::getEstimatedSize() const {
uint64_t size = ColumnWriter::getEstimatedSize();
size += rleEncoder->getBufferSize();
return size;
}
- void BooleanColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ template <typename BatchType>
+ void BooleanColumnWriter<BatchType>::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
- encoding.set_dictionarysize(0);
+ encoding.set_dictionary_size(0);
if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
}
encodings.push_back(encoding);
}
- void BooleanColumnWriter::recordPosition() const {
+ template <typename BatchType>
+ void BooleanColumnWriter<BatchType>::recordPosition() const {
ColumnWriter::recordPosition();
rleEncoder->recordPosition(rowIndexPosition.get());
}
- class DoubleColumnWriter : public ColumnWriter {
- public:
- DoubleColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options,
- bool isFloat);
+ template <typename ValueType, typename BatchType>
+ class FloatingColumnWriter : public ColumnWriter {
+ public:
+ FloatingColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options, bool isFloat);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
- private:
+ private:
bool isFloat;
std::unique_ptr<AppendOnlyBufferedStream> dataStream;
DataBuffer<char> buffer;
};
- DoubleColumnWriter::DoubleColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options,
- bool isFloatType) :
- ColumnWriter(type, factory, options),
- isFloat(isFloatType),
- buffer(*options.getMemoryPool()) {
- dataStream.reset(new AppendOnlyBufferedStream(
- factory.createStream(proto::Stream_Kind_DATA)));
+ template <typename ValueType, typename BatchType>
+ FloatingColumnWriter<ValueType, BatchType>::FloatingColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options,
+ bool isFloatType)
+ : ColumnWriter(type, factory, options),
+ isFloat(isFloatType),
+ buffer(*options.getMemoryPool()) {
+ dataStream.reset(new AppendOnlyBufferedStream(factory.createStream(proto::Stream_Kind_DATA)));
buffer.resize(isFloat ? 4 : 8);
if (enableIndex) {
@@ -854,26 +797,24 @@ namespace orc {
}
}
- void DoubleColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- const DoubleVectorBatch* dblBatch =
- dynamic_cast<const DoubleVectorBatch*>(&rowBatch);
+ template <typename ValueType, typename BatchType>
+ void FloatingColumnWriter<ValueType, BatchType>::add(ColumnVectorBatch& rowBatch, uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ const BatchType* dblBatch = dynamic_cast<const BatchType*>(&rowBatch);
if (dblBatch == nullptr) {
- throw InvalidArgument("Failed to cast to DoubleVectorBatch");
+ throw InvalidArgument("Failed to cast to FloatingVectorBatch");
}
DoubleColumnStatisticsImpl* doubleStats =
- dynamic_cast<DoubleColumnStatisticsImpl*>(colIndexStatistics.get());
+ dynamic_cast<DoubleColumnStatisticsImpl*>(colIndexStatistics.get());
if (doubleStats == nullptr) {
throw InvalidArgument("Failed to cast to DoubleColumnStatisticsImpl");
}
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- const double* doubleData = dblBatch->data.data() + offset;
- const char* notNull = dblBatch->hasNulls ?
- dblBatch->notNull.data() + offset : nullptr;
+ const ValueType* doubleData = dblBatch->data.data() + offset;
+ const char* notNull = dblBatch->hasNulls ? dblBatch->notNull.data() + offset : nullptr;
size_t bytes = isFloat ? 4 : 8;
char* data = buffer.data();
@@ -883,14 +824,14 @@ namespace orc {
if (isFloat) {
encodeFloatNum<float, int32_t>(static_cast<float>(doubleData[i]), data);
} else {
- encodeFloatNum<double, int64_t>(doubleData[i], data);
+ encodeFloatNum<double, int64_t>(static_cast<double>(doubleData[i]), data);
}
dataStream->write(data, bytes);
++count;
if (enableBloomFilter) {
- bloomFilter->addDouble(doubleData[i]);
+ bloomFilter->addDouble(static_cast<double>(doubleData[i]));
}
- doubleStats->update(doubleData[i]);
+ doubleStats->update(static_cast<double>(doubleData[i]));
}
}
doubleStats->increase(count);
@@ -899,7 +840,8 @@ namespace orc {
}
}
- void DoubleColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ template <typename ValueType, typename BatchType>
+ void FloatingColumnWriter<ValueType, BatchType>::flush(std::vector<proto::Stream>& streams) {
ColumnWriter::flush(streams);
proto::Stream stream;
@@ -909,24 +851,27 @@ namespace orc {
streams.push_back(stream);
}
- uint64_t DoubleColumnWriter::getEstimatedSize() const {
+ template <typename ValueType, typename BatchType>
+ uint64_t FloatingColumnWriter<ValueType, BatchType>::getEstimatedSize() const {
uint64_t size = ColumnWriter::getEstimatedSize();
size += dataStream->getSize();
return size;
}
- void DoubleColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ template <typename ValueType, typename BatchType>
+ void FloatingColumnWriter<ValueType, BatchType>::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
- encoding.set_dictionarysize(0);
+ encoding.set_dictionary_size(0);
if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
}
encodings.push_back(encoding);
}
- void DoubleColumnWriter::recordPosition() const {
+ template <typename ValueType, typename BatchType>
+ void FloatingColumnWriter<ValueType, BatchType>::recordPosition() const {
ColumnWriter::recordPosition();
dataStream->recordPosition(rowIndexPosition.get());
}
@@ -935,27 +880,26 @@ namespace orc {
* Implementation of increasing sorted string dictionary
*/
class SortedStringDictionary {
- public:
+ public:
struct DictEntry {
- DictEntry(const char * str, size_t len):data(str),length(len) {}
- const char * data;
+ DictEntry(const char* str, size_t len) : data(str), length(len) {}
+ const char* data;
size_t length;
};
- SortedStringDictionary():totalLength(0) {}
+ SortedStringDictionary() : totalLength(0) {}
// insert a new string into dictionary, return its insertion order
- size_t insert(const char * data, size_t len);
+ size_t insert(const char* data, size_t len);
// write dictionary data & length to output buffer
- void flush(AppendOnlyBufferedStream * dataStream,
- RleEncoder * lengthEncoder) const;
+ void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const;
// reorder input index buffer from insertion order to dictionary order
void reorder(std::vector<int64_t>& idxBuffer) const;
// get dict entries in insertion order
- void getEntriesInInsertionOrder(std::vector<const DictEntry *>&) const;
+ void getEntriesInInsertionOrder(std::vector<const DictEntry*>&) const;
// return count of entries
size_t size() const;
@@ -965,7 +909,7 @@ namespace orc {
void clear();
- private:
+ private:
struct LessThan {
bool operator()(const DictEntry& left, const DictEntry& right) const {
int ret = memcmp(left.data, right.data, std::min(left.length, right.length));
@@ -989,14 +933,14 @@ namespace orc {
};
// insert a new string into dictionary, return its insertion order
- size_t SortedStringDictionary::insert(const char * str, size_t len) {
+ size_t SortedStringDictionary::insert(const char* str, size_t len) {
auto ret = dict.insert({DictEntry(str, len), dict.size()});
if (ret.second) {
// make a copy to internal storage
data.push_back(std::vector<char>(len));
memcpy(data.back().data(), str, len);
// update dictionary entry to link pointer to internal storage
- DictEntry * entry = const_cast<DictEntry *>(&(ret.first->first));
+ DictEntry* entry = const_cast<DictEntry*>(&(ret.first->first));
entry->data = data.back().data();
totalLength += len;
}
@@ -1004,8 +948,8 @@ namespace orc {
}
// write dictionary data & length to output buffer
- void SortedStringDictionary::flush(AppendOnlyBufferedStream * dataStream,
- RleEncoder * lengthEncoder) const {
+ void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
+ RleEncoder* lengthEncoder) const {
for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
dataStream->write(it->first.data, it->first.length);
lengthEncoder->write(static_cast<int64_t>(it->first.length));
@@ -1032,14 +976,13 @@ namespace orc {
// do the transformation
for (size_t i = 0; i != idxBuffer.size(); ++i) {
- idxBuffer[i] = static_cast<int64_t>(
- mapping[static_cast<size_t>(idxBuffer[i])]);
+ idxBuffer[i] = static_cast<int64_t>(mapping[static_cast<size_t>(idxBuffer[i])]);
}
}
// get dict entries in insertion order
void SortedStringDictionary::getEntriesInInsertionOrder(
- std::vector<const DictEntry *>& entries) const {
+ std::vector<const DictEntry*>& entries) const {
entries.resize(dict.size());
for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
entries[it->second] = &(it->first);
@@ -1056,29 +999,25 @@ namespace orc {
return totalLength;
}
- void SortedStringDictionary::clear() {
+ void SortedStringDictionary::clear() {
totalLength = 0;
data.clear();
dict.clear();
}
class StringColumnWriter : public ColumnWriter {
- public:
- StringColumnWriter(const Type& type,
- const StreamsFactory& factory,
+ public:
+ StringColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
@@ -1088,7 +1027,7 @@ namespace orc {
virtual void reset() override;
- private:
+ private:
/**
* dictionary related functions
*/
@@ -1098,7 +1037,7 @@ namespace orc {
void deleteDictStreams();
void fallbackToDirectEncoding();
- protected:
+ protected:
RleVersion rleVersion;
bool useCompression;
const StreamsFactory& streamsFactory;
@@ -1128,18 +1067,16 @@ namespace orc {
mutable std::vector<size_t> startOfRowGroups;
};
- StringColumnWriter::StringColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()),
- useCompression(options.getCompression() != CompressionKind_NONE),
- streamsFactory(factory),
- alignedBitPacking(options.getAlignedBitpacking()),
- doneDictionaryCheck(false),
- useDictionary(options.getEnableDictionary()),
- dictSizeThreshold(options.getDictionaryKeySizeThreshold()){
+ StringColumnWriter::StringColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options),
+ rleVersion(options.getRleVersion()),
+ useCompression(options.getCompression() != CompressionKind_NONE),
+ streamsFactory(factory),
+ alignedBitPacking(options.getAlignedBitpacking()),
+ doneDictionaryCheck(false),
+ useDictionary(options.getEnableDictionary()),
+ dictSizeThreshold(options.getDictionaryKeySizeThreshold()) {
if (type.getKind() == TypeKind::BINARY) {
useDictionary = false;
doneDictionaryCheck = true;
@@ -1157,12 +1094,9 @@ namespace orc {
}
}
- void StringColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void StringColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
- const StringVectorBatch* stringBatch =
- dynamic_cast<const StringVectorBatch*>(&rowBatch);
+ const StringVectorBatch* stringBatch = dynamic_cast<const StringVectorBatch*>(&rowBatch);
if (stringBatch == nullptr) {
throw InvalidArgument("Failed to cast to StringVectorBatch");
}
@@ -1175,12 +1109,11 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- char *const * data = stringBatch->data.data() + offset;
+ char* const* data = stringBatch->data.data() + offset;
const int64_t* length = stringBatch->length.data() + offset;
- const char* notNull = stringBatch->hasNulls ?
- stringBatch->notNull.data() + offset : nullptr;
+ const char* notNull = stringBatch->hasNulls ? stringBatch->notNull.data() + offset : nullptr;
- if (!useDictionary){
+ if (!useDictionary) {
directLengthEncoder->add(length, numValues, notNull);
}
@@ -1259,21 +1192,18 @@ namespace orc {
return size;
}
- void StringColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ void StringColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
if (!useDictionary) {
- encoding.set_kind(rleVersion == RleVersion_1 ?
- proto::ColumnEncoding_Kind_DIRECT :
- proto::ColumnEncoding_Kind_DIRECT_V2);
+ encoding.set_kind(rleVersion == RleVersion_1 ? proto::ColumnEncoding_Kind_DIRECT
+ : proto::ColumnEncoding_Kind_DIRECT_V2);
} else {
- encoding.set_kind(rleVersion == RleVersion_1 ?
- proto::ColumnEncoding_Kind_DICTIONARY :
- proto::ColumnEncoding_Kind_DICTIONARY_V2);
+ encoding.set_kind(rleVersion == RleVersion_1 ? proto::ColumnEncoding_Kind_DICTIONARY
+ : proto::ColumnEncoding_Kind_DICTIONARY_V2);
}
- encoding.set_dictionarysize(static_cast<uint32_t>(dictionary.size()));
+ encoding.set_dictionary_size(static_cast<uint32_t>(dictionary.size()));
if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
}
encodings.push_back(encoding);
}
@@ -1292,8 +1222,9 @@ namespace orc {
bool StringColumnWriter::checkDictionaryKeyRatio() {
if (!doneDictionaryCheck) {
- useDictionary = dictionary.size() <= static_cast<size_t>(
- static_cast<double>(dictionary.idxInDictBuffer.size()) * dictSizeThreshold);
+ useDictionary = dictionary.size() <=
+ static_cast<size_t>(static_cast<double>(dictionary.idxInDictBuffer.size()) *
+ dictSizeThreshold);
doneDictionaryCheck = true;
}
@@ -1320,33 +1251,24 @@ namespace orc {
void StringColumnWriter::createDirectStreams() {
std::unique_ptr<BufferedOutputStream> directLengthStream =
- streamsFactory.createStream(proto::Stream_Kind_LENGTH);
- directLengthEncoder = createRleEncoder(std::move(directLengthStream),
- false,
- rleVersion,
- memPool,
- alignedBitPacking);
- directDataStream.reset(new AppendOnlyBufferedStream(
- streamsFactory.createStream(proto::Stream_Kind_DATA)));
+ streamsFactory.createStream(proto::Stream_Kind_LENGTH);
+ directLengthEncoder = createRleEncoder(std::move(directLengthStream), false, rleVersion,
+ memPool, alignedBitPacking);
+ directDataStream.reset(
+ new AppendOnlyBufferedStream(streamsFactory.createStream(proto::Stream_Kind_DATA)));
}
void StringColumnWriter::createDictStreams() {
std::unique_ptr<BufferedOutputStream> dictDataStream =
- streamsFactory.createStream(proto::Stream_Kind_DATA);
- dictDataEncoder = createRleEncoder(std::move(dictDataStream),
- false,
- rleVersion,
- memPool,
- alignedBitPacking);
+ streamsFactory.createStream(proto::Stream_Kind_DATA);
+ dictDataEncoder =
+ createRleEncoder(std::move(dictDataStream), false, rleVersion, memPool, alignedBitPacking);
std::unique_ptr<BufferedOutputStream> dictLengthStream =
- streamsFactory.createStream(proto::Stream_Kind_LENGTH);
- dictLengthEncoder = createRleEncoder(std::move(dictLengthStream),
- false,
- rleVersion,
- memPool,
+ streamsFactory.createStream(proto::Stream_Kind_LENGTH);
+ dictLengthEncoder = createRleEncoder(std::move(dictLengthStream), false, rleVersion, memPool,
alignedBitPacking);
dictStream.reset(new AppendOnlyBufferedStream(
- streamsFactory.createStream(proto::Stream_Kind_DICTIONARY_DATA)));
+ streamsFactory.createStream(proto::Stream_Kind_DICTIONARY_DATA)));
}
void StringColumnWriter::deleteDictStreams() {
@@ -1360,7 +1282,7 @@ namespace orc {
}
void StringColumnWriter::writeDictionary() {
- if (useDictionary && !doneDictionaryCheck) {
+ if (useDictionary && !doneDictionaryCheck) {
// when index is disabled, dictionary check happens while writing 1st stripe
if (!checkDictionaryKeyRatio()) {
fallbackToDirectEncoding();
@@ -1376,7 +1298,7 @@ namespace orc {
dictionary.reorder(dictionary.idxInDictBuffer);
// write data sequences
- int64_t * data = dictionary.idxInDictBuffer.data();
+ int64_t* data = dictionary.idxInDictBuffer.data();
if (enableIndex) {
size_t prevOffset = 0;
for (size_t i = 0; i < startOfRowGroups.size(); ++i) {
@@ -1386,9 +1308,9 @@ namespace orc {
// update index positions
int rowGroupId = static_cast<int>(i);
- proto::RowIndexEntry* indexEntry =
- (rowGroupId < rowIndex->entry_size()) ?
- rowIndex->mutable_entry(rowGroupId) : rowIndexEntry.get();
+ proto::RowIndexEntry* indexEntry = (rowGroupId < rowIndex->entry_size())
+ ? rowIndex->mutable_entry(rowGroupId)
+ : rowIndexEntry.get();
// add positions for direct streams
RowIndexPositionRecorder recorder(*indexEntry);
@@ -1397,8 +1319,7 @@ namespace orc {
prevOffset = offset;
}
- dictDataEncoder->add(data + prevOffset,
- dictionary.idxInDictBuffer.size() - prevOffset,
+ dictDataEncoder->add(data + prevOffset, dictionary.idxInDictBuffer.size() - prevOffset,
nullptr);
} else {
dictDataEncoder->add(data, dictionary.idxInDictBuffer.size(), nullptr);
@@ -1412,18 +1333,18 @@ namespace orc {
if (enableIndex) {
// fallback happens at the 1st row group;
// simply complete positions for direct streams
- proto::RowIndexEntry * indexEntry = rowIndexEntry.get();
+ proto::RowIndexEntry* indexEntry = rowIndexEntry.get();
RowIndexPositionRecorder recorder(*indexEntry);
directDataStream->recordPosition(&recorder);
directLengthEncoder->recordPosition(&recorder);
}
// get dictionary entries in insertion order
- std::vector<const SortedStringDictionary::DictEntry *> entries;
+ std::vector<const SortedStringDictionary::DictEntry*> entries;
dictionary.getEntriesInInsertionOrder(entries);
// store each length of the data into a vector
- const SortedStringDictionary::DictEntry * dictEntry = nullptr;
+ const SortedStringDictionary::DictEntry* dictEntry = nullptr;
for (uint64_t i = 0; i != dictionary.idxInDictBuffer.size(); ++i) {
// write one row data in direct encoding
dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer[i])];
@@ -1438,7 +1359,7 @@ namespace orc {
/**
* Counts how many utf-8 chars of the input data
*/
- static uint64_t charLength(const char * data, uint64_t length) {
+ static uint64_t charLength(const char* data, uint64_t length) {
uint64_t chars = 0;
for (uint64_t i = 0; i < length; i++) {
if (isUtfStartByte(data[i])) {
@@ -1458,9 +1379,7 @@ namespace orc {
* @param data the bytes of UTF-8
* @param length the length of data to truncate
*/
- static uint64_t truncateBytesTo(uint64_t maxCharLength,
- const char * data,
- uint64_t length) {
+ static uint64_t truncateBytesTo(uint64_t maxCharLength, const char* data, uint64_t length) {
uint64_t chars = 0;
if (length <= maxCharLength) {
return length;
@@ -1490,8 +1409,8 @@ namespace orc {
* @param from the first byte location
* @param until the last byte location
* @return the index of the last character
- */
- static uint64_t findLastCharacter(const char * text, uint64_t from, uint64_t until) {
+ */
+ static uint64_t findLastCharacter(const char* text, uint64_t from, uint64_t until) {
uint64_t posn = until;
/* we don't expect characters more than 5 bytes */
while (posn >= from) {
@@ -1501,36 +1420,29 @@ namespace orc {
posn -= 1;
}
/* beginning of a valid char not found */
- throw std::logic_error(
- "Could not truncate string, beginning of a valid char not found");
+ throw std::logic_error("Could not truncate string, beginning of a valid char not found");
}
};
class CharColumnWriter : public StringColumnWriter {
- public:
- CharColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- StringColumnWriter(type, factory, options),
- maxLength(type.getMaximumLength()),
- padBuffer(*options.getMemoryPool()) {
+ public:
+ CharColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options)
+ : StringColumnWriter(type, factory, options),
+ maxLength(type.getMaximumLength()),
+ padBuffer(*options.getMemoryPool()) {
// utf-8 is currently 4 bytes long, but it could be up to 6
padBuffer.resize(maxLength * 6);
}
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
- private:
+ private:
uint64_t maxLength;
DataBuffer<char> padBuffer;
};
- void CharColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void CharColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
if (charsBatch == nullptr) {
@@ -1547,26 +1459,24 @@ namespace orc {
char** data = charsBatch->data.data() + offset;
int64_t* length = charsBatch->length.data() + offset;
- const char* notNull = charsBatch->hasNulls ?
- charsBatch->notNull.data() + offset : nullptr;
+ const char* notNull = charsBatch->hasNulls ? charsBatch->notNull.data() + offset : nullptr;
uint64_t count = 0;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
- const char * charData = nullptr;
+ const char* charData = nullptr;
uint64_t originLength = static_cast<uint64_t>(length[i]);
uint64_t charLength = Utf8Utils::charLength(data[i], originLength);
if (charLength >= maxLength) {
charData = data[i];
- length[i] = static_cast<int64_t>(
- Utf8Utils::truncateBytesTo(maxLength, data[i], originLength));
+ length[i] =
+ static_cast<int64_t>(Utf8Utils::truncateBytesTo(maxLength, data[i], originLength));
} else {
charData = padBuffer.data();
// the padding is exactly 1 byte per char
length[i] = length[i] + static_cast<int64_t>(maxLength - charLength);
memcpy(padBuffer.data(), data[i], originLength);
- memset(padBuffer.data() + originLength,
- ' ',
+ memset(padBuffer.data() + originLength, ' ',
static_cast<size_t>(length[i]) - originLength);
}
@@ -1596,27 +1506,21 @@ namespace orc {
}
class VarCharColumnWriter : public StringColumnWriter {
- public:
- VarCharColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- StringColumnWriter(type, factory, options),
- maxLength(type.getMaximumLength()) {
+ public:
+ VarCharColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : StringColumnWriter(type, factory, options), maxLength(type.getMaximumLength()) {
// PASS
}
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
- private:
+ private:
uint64_t maxLength;
};
- void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
if (charsBatch == nullptr) {
@@ -1633,14 +1537,13 @@ namespace orc {
char* const* data = charsBatch->data.data() + offset;
int64_t* length = charsBatch->length.data() + offset;
- const char* notNull = charsBatch->hasNulls ?
- charsBatch->notNull.data() + offset : nullptr;
+ const char* notNull = charsBatch->hasNulls ? charsBatch->notNull.data() + offset : nullptr;
uint64_t count = 0;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
- uint64_t itemLength = Utf8Utils::truncateBytesTo(
- maxLength, data[i], static_cast<uint64_t>(length[i]));
+ uint64_t itemLength =
+ Utf8Utils::truncateBytesTo(maxLength, data[i], static_cast<uint64_t>(length[i]));
length[i] = static_cast<int64_t>(itemLength);
if (useDictionary) {
@@ -1669,23 +1572,18 @@ namespace orc {
}
class BinaryColumnWriter : public StringColumnWriter {
- public:
- BinaryColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- StringColumnWriter(type, factory, options) {
+ public:
+ BinaryColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : StringColumnWriter(type, factory, options) {
// PASS
}
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
};
- void BinaryColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void BinaryColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
StringVectorBatch* binBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
if (binBatch == nullptr) {
@@ -1702,8 +1600,7 @@ namespace orc {
char** data = binBatch->data.data() + offset;
int64_t* length = binBatch->length.data() + offset;
- const char* notNull = binBatch->hasNulls ?
- binBatch->notNull.data() + offset : nullptr;
+ const char* notNull = binBatch->hasNulls ? binBatch->notNull.data() + offset : nullptr;
uint64_t count = 0;
for (uint64_t i = 0; i < numValues; ++i) {
@@ -1726,60 +1623,43 @@ namespace orc {
}
class TimestampColumnWriter : public ColumnWriter {
- public:
- TimestampColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options,
- bool isInstantType);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ public:
+ TimestampColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options, bool isInstantType);
+
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
- protected:
+ protected:
std::unique_ptr<RleEncoder> secRleEncoder, nanoRleEncoder;
- private:
+ private:
RleVersion rleVersion;
- const Timezone& timezone;
+ const Timezone* timezone;
const bool isUTC;
};
- TimestampColumnWriter::TimestampColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options,
- bool isInstantType) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()),
- timezone(isInstantType ?
- getTimezoneByName("GMT") :
- options.getTimezone()),
- isUTC(isInstantType ||
- options.getTimezoneName() == "GMT") {
+ TimestampColumnWriter::TimestampColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options, bool isInstantType)
+ : ColumnWriter(type, factory, options),
+ rleVersion(options.getRleVersion()),
+ timezone(isInstantType ? &getTimezoneByName("GMT") : &options.getTimezone()),
+ isUTC(isInstantType || options.getTimezoneName() == "GMT") {
std::unique_ptr<BufferedOutputStream> dataStream =
factory.createStream(proto::Stream_Kind_DATA);
std::unique_ptr<BufferedOutputStream> secondaryStream =
factory.createStream(proto::Stream_Kind_SECONDARY);
- secRleEncoder = createRleEncoder(std::move(dataStream),
- true,
- rleVersion,
- memPool,
+ secRleEncoder = createRleEncoder(std::move(dataStream), true, rleVersion, memPool,
options.getAlignedBitpacking());
- nanoRleEncoder = createRleEncoder(std::move(secondaryStream),
- false,
- rleVersion,
- memPool,
+ nanoRleEncoder = createRleEncoder(std::move(secondaryStream), false, rleVersion, memPool,
options.getAlignedBitpacking());
if (enableIndex) {
@@ -1808,12 +1688,9 @@ namespace orc {
}
}
- void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
- TimestampVectorBatch* tsBatch =
- dynamic_cast<TimestampVectorBatch*>(&rowBatch);
+ TimestampVectorBatch* tsBatch = dynamic_cast<TimestampVectorBatch*>(&rowBatch);
if (tsBatch == nullptr) {
throw InvalidArgument("Failed to cast to TimestampVectorBatch");
}
@@ -1826,10 +1703,9 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- const char* notNull = tsBatch->hasNulls ?
- tsBatch->notNull.data() + offset : nullptr;
- int64_t *secs = tsBatch->data.data() + offset;
- int64_t *nanos = tsBatch->nanoseconds.data() + offset;
+ const char* notNull = tsBatch->hasNulls ? tsBatch->notNull.data() + offset : nullptr;
+ int64_t* secs = tsBatch->data.data() + offset;
+ int64_t* nanos = tsBatch->nanoseconds.data() + offset;
uint64_t count = 0;
for (uint64_t i = 0; i < numValues; ++i) {
@@ -1837,7 +1713,7 @@ namespace orc {
// TimestampVectorBatch already stores data in UTC
int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000;
if (!isUTC) {
- millsUTC = timezone.convertToUTC(secs[i]) * 1000 + nanos[i] / 1000000;
+ millsUTC = timezone->convertToUTC(secs[i]) * 1000 + nanos[i] / 1000000;
}
++count;
if (enableBloomFilter) {
@@ -1849,7 +1725,7 @@ namespace orc {
secs[i] += 1;
}
- secs[i] -= timezone.getEpoch();
+ secs[i] -= timezone->getEpoch();
nanos[i] = formatNano(nanos[i]);
}
}
@@ -1886,12 +1762,12 @@ namespace orc {
}
void TimestampColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(RleVersionMapper(rleVersion));
- encoding.set_dictionarysize(0);
+ encoding.set_dictionary_size(0);
if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
}
encodings.push_back(encoding);
}
@@ -1902,32 +1778,23 @@ namespace orc {
nanoRleEncoder->recordPosition(rowIndexPosition.get());
}
- class DateColumnWriter : public IntegerColumnWriter {
- public:
- DateColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
+ class DateColumnWriter : public IntegerColumnWriter<LongVectorBatch> {
+ public:
+ DateColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
};
- DateColumnWriter::DateColumnWriter(
- const Type &type,
- const StreamsFactory &factory,
- const WriterOptions &options) :
- IntegerColumnWriter(type, factory, options) {
+ DateColumnWriter::DateColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : IntegerColumnWriter<LongVectorBatch>(type, factory, options) {
// PASS
}
- void DateColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void DateColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
- const LongVectorBatch* longBatch =
- dynamic_cast<const LongVectorBatch*>(&rowBatch);
+ const LongVectorBatch* longBatch = dynamic_cast<const LongVectorBatch*>(&rowBatch);
if (longBatch == nullptr) {
throw InvalidArgument("Failed to cast to LongVectorBatch");
}
@@ -1941,8 +1808,7 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
const int64_t* data = longBatch->data.data() + offset;
- const char* notNull = longBatch->hasNulls ?
- longBatch->notNull.data() + offset : nullptr;
+ const char* notNull = longBatch->hasNulls ? longBatch->notNull.data() + offset : nullptr;
rleEncoder->add(data, numValues, notNull);
@@ -1963,55 +1829,45 @@ namespace orc {
}
class Decimal64ColumnWriter : public ColumnWriter {
- public:
+ public:
static const uint32_t MAX_PRECISION_64 = 18;
static const uint32_t MAX_PRECISION_128 = 38;
- Decimal64ColumnWriter(const Type& type,
- const StreamsFactory& factory,
+ Decimal64ColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
- protected:
+ protected:
RleVersion rleVersion;
uint64_t precision;
uint64_t scale;
std::unique_ptr<AppendOnlyBufferedStream> valueStream;
std::unique_ptr<RleEncoder> scaleEncoder;
- private:
+ private:
char buffer[10];
};
- Decimal64ColumnWriter::Decimal64ColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()),
- precision(type.getPrecision()),
- scale(type.getScale()) {
- valueStream.reset(new AppendOnlyBufferedStream(
- factory.createStream(proto::Stream_Kind_DATA)));
+ Decimal64ColumnWriter::Decimal64ColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options),
+ rleVersion(options.getRleVersion()),
+ precision(type.getPrecision()),
+ scale(type.getScale()) {
+ valueStream.reset(new AppendOnlyBufferedStream(factory.createStream(proto::Stream_Kind_DATA)));
std::unique_ptr<BufferedOutputStream> scaleStream =
factory.createStream(proto::Stream_Kind_SECONDARY);
- scaleEncoder = createRleEncoder(std::move(scaleStream),
- true,
- rleVersion,
- memPool,
+ scaleEncoder = createRleEncoder(std::move(scaleStream), true, rleVersion, memPool,
options.getAlignedBitpacking());
if (enableIndex) {
@@ -2019,26 +1875,22 @@ namespace orc {
}
}
- void Decimal64ColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void Decimal64ColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
- const Decimal64VectorBatch* decBatch =
- dynamic_cast<const Decimal64VectorBatch*>(&rowBatch);
+ const Decimal64VectorBatch* decBatch = dynamic_cast<const Decimal64VectorBatch*>(&rowBatch);
if (decBatch == nullptr) {
throw InvalidArgument("Failed to cast to Decimal64VectorBatch");
}
DecimalColumnStatisticsImpl* decStats =
- dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
+ dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
if (decStats == nullptr) {
throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
}
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- const char* notNull = decBatch->hasNulls ?
- decBatch->notNull.data() + offset : nullptr;
+ const char* notNull = decBatch->hasNulls ? decBatch->notNull.data() + offset : nullptr;
const int64_t* values = decBatch->values.data() + offset;
uint64_t count = 0;
@@ -2059,10 +1911,8 @@ namespace orc {
valueStream->write(buffer, static_cast<size_t>(data - buffer));
++count;
if (enableBloomFilter) {
- std::string decimal = Decimal(
- values[i], static_cast<int32_t>(scale)).toString(true);
- bloomFilter->addBytes(
- decimal.c_str(), static_cast<int64_t>(decimal.size()));
+ std::string decimal = Decimal(values[i], static_cast<int32_t>(scale)).toString(true);
+ bloomFilter->addBytes(decimal.c_str(), static_cast<int64_t>(decimal.size()));
}
decStats->update(Decimal(values[i], static_cast<int32_t>(scale)));
}
@@ -2099,12 +1949,12 @@ namespace orc {
}
void Decimal64ColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(RleVersionMapper(rleVersion));
- encoding.set_dictionarysize(0);
+ encoding.set_dictionary_size(0);
if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
}
encodings.push_back(encoding);
}
@@ -2116,44 +1966,35 @@ namespace orc {
}
class Decimal64ColumnWriterV2 : public ColumnWriter {
- public:
- Decimal64ColumnWriterV2(const Type& type,
- const StreamsFactory& factory,
+ public:
+ Decimal64ColumnWriterV2(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
- protected:
+ protected:
uint64_t precision;
uint64_t scale;
std::unique_ptr<RleEncoder> valueEncoder;
};
- Decimal64ColumnWriterV2::Decimal64ColumnWriterV2(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- precision(type.getPrecision()),
- scale(type.getScale()) {
+ Decimal64ColumnWriterV2::Decimal64ColumnWriterV2(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options),
+ precision(type.getPrecision()),
+ scale(type.getScale()) {
std::unique_ptr<BufferedOutputStream> dataStream =
factory.createStream(proto::Stream_Kind_DATA);
- valueEncoder = createRleEncoder(std::move(dataStream),
- true,
- RleVersion_2,
- memPool,
+ valueEncoder = createRleEncoder(std::move(dataStream), true, RleVersion_2, memPool,
options.getAlignedBitpacking());
if (enableIndex) {
@@ -2161,18 +2002,15 @@ namespace orc {
}
}
- void Decimal64ColumnWriterV2::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- const Decimal64VectorBatch* decBatch =
- dynamic_cast<const Decimal64VectorBatch*>(&rowBatch);
+ void Decimal64ColumnWriterV2::add(ColumnVectorBatch& rowBatch, uint64_t offset,
+ uint64_t numValues, const char* incomingMask) {
+ const Decimal64VectorBatch* decBatch = dynamic_cast<const Decimal64VectorBatch*>(&rowBatch);
if (decBatch == nullptr) {
throw InvalidArgument("Failed to cast to Decimal64VectorBatch");
}
DecimalColumnStatisticsImpl* decStats =
- dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
+ dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
if (decStats == nullptr) {
throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
}
@@ -2180,8 +2018,7 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
const int64_t* data = decBatch->values.data() + offset;
- const char* notNull = decBatch->hasNulls ?
- decBatch->notNull.data() + offset : nullptr;
+ const char* notNull = decBatch->hasNulls ? decBatch->notNull.data() + offset : nullptr;
valueEncoder->add(data, numValues, notNull);
@@ -2190,10 +2027,8 @@ namespace orc {
if (!notNull || notNull[i]) {
++count;
if (enableBloomFilter) {
- std::string decimal = Decimal(
- data[i], static_cast<int32_t>(scale)).toString(true);
- bloomFilter->addBytes(
- decimal.c_str(), static_cast<int64_t>(decimal.size()));
+ std::string decimal = Decimal(data[i], static_cast<int32_t>(scale)).toString(true);
+ bloomFilter->addBytes(decimal.c_str(), static_cast<int64_t>(decimal.size()));
}
decStats->update(Decimal(data[i], static_cast<int32_t>(scale)));
}
@@ -2221,12 +2056,12 @@ namespace orc {
}
void Decimal64ColumnWriterV2::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(RleVersionMapper(RleVersion_2));
- encoding.set_dictionarysize(0);
+ encoding.set_dictionary_size(0);
if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
}
encodings.push_back(encoding);
}
@@ -2237,25 +2072,20 @@ namespace orc {
}
class Decimal128ColumnWriter : public Decimal64ColumnWriter {
- public:
- Decimal128ColumnWriter(const Type& type,
- const StreamsFactory& factory,
+ public:
+ Decimal128ColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
- private:
+ private:
char buffer[20];
};
- Decimal128ColumnWriter::Decimal128ColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- Decimal64ColumnWriter(type, factory, options) {
+ Decimal128ColumnWriter::Decimal128ColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : Decimal64ColumnWriter(type, factory, options) {
// PASS
}
@@ -2272,26 +2102,22 @@ namespace orc {
return val;
}
- void Decimal128ColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void Decimal128ColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
- const Decimal128VectorBatch* decBatch =
- dynamic_cast<const Decimal128VectorBatch*>(&rowBatch);
+ const Decimal128VectorBatch* decBatch = dynamic_cast<const Decimal128VectorBatch*>(&rowBatch);
if (decBatch == nullptr) {
throw InvalidArgument("Failed to cast to Decimal128VectorBatch");
}
DecimalColumnStatisticsImpl* decStats =
- dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
+ dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
if (decStats == nullptr) {
throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
}
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- const char* notNull = decBatch->hasNulls ?
- decBatch->notNull.data() + offset : nullptr;
+ const char* notNull = decBatch->hasNulls ? decBatch->notNull.data() + offset : nullptr;
const Int128* values = decBatch->values.data() + offset;
// The current encoding of decimal columns stores the integer representation
@@ -2314,10 +2140,8 @@ namespace orc {
++count;
if (enableBloomFilter) {
- std::string decimal = Decimal(
- values[i], static_cast<int32_t>(scale)).toString(true);
- bloomFilter->addBytes(
- decimal.c_str(), static_cast<int64_t>(decimal.size()));
+ std::string decimal = Decimal(values[i], static_cast<int32_t>(scale)).toString(true);
+ bloomFilter->addBytes(decimal.c_str(), static_cast<int64_t>(decimal.size()));
}
decStats->update(Decimal(values[i], static_cast<int32_t>(scale)));
}
@@ -2331,29 +2155,22 @@ namespace orc {
}
class ListColumnWriter : public ColumnWriter {
- public:
- ListColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
+ public:
+ ListColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
~ListColumnWriter() override;
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
- virtual void getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
- virtual void getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
virtual void mergeStripeStatsIntoFileStats() override;
@@ -2361,8 +2178,7 @@ namespace orc {
virtual void createRowIndexEntry() override;
- virtual void writeIndex(
- std::vector<proto::Stream> &streams) const override;
+ virtual void writeIndex(std::vector<proto::Stream>& streams) const override;
virtual void recordPosition() const override;
@@ -2370,24 +2186,18 @@ namespace orc {
virtual void reset() override;
- private:
+ private:
std::unique_ptr<RleEncoder> lengthEncoder;
RleVersion rleVersion;
std::unique_ptr<ColumnWriter> child;
};
- ListColumnWriter::ListColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()){
-
+ ListColumnWriter::ListColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options), rleVersion(options.getRleVersion()) {
std::unique_ptr<BufferedOutputStream> lengthStream =
- factory.createStream(proto::Stream_Kind_LENGTH);
- lengthEncoder = createRleEncoder(std::move(lengthStream),
- false,
- rleVersion,
- memPool,
+ factory.createStream(proto::Stream_Kind_LENGTH);
+ lengthEncoder = createRleEncoder(std::move(lengthStream), false, rleVersion, memPool,
options.getAlignedBitpacking());
if (type.getSubtypeCount() == 1) {
@@ -2403,9 +2213,7 @@ namespace orc {
// PASS
}
- void ListColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void ListColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
ListVectorBatch* listBatch = dynamic_cast<ListVectorBatch*>(&rowBatch);
if (listBatch == nullptr) {
@@ -2420,8 +2228,7 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
int64_t* offsets = listBatch->offsets.data() + offset;
- const char* notNull = listBatch->hasNulls ?
- listBatch->notNull.data() + offset : nullptr;
+ const char* notNull = listBatch->hasNulls ? listBatch->notNull.data() + offset : nullptr;
uint64_t elemOffset = static_cast<uint64_t>(offsets[0]);
uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]);
@@ -2473,7 +2280,7 @@ namespace orc {
}
}
- void ListColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const {
+ void ListColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const {
ColumnWriter::writeIndex(streams);
if (child.get()) {
child->writeIndex(streams);
@@ -2489,13 +2296,12 @@ namespace orc {
return size;
}
- void ListColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ void ListColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(RleVersionMapper(rleVersion));
- encoding.set_dictionarysize(0);
+ encoding.set_dictionary_size(0);
if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
}
encodings.push_back(encoding);
if (child.get()) {
@@ -2503,8 +2309,7 @@ namespace orc {
}
}
- void ListColumnWriter::getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void ListColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getStripeStatistics(stats);
if (child.get()) {
child->getStripeStatistics(stats);
@@ -2518,15 +2323,14 @@ namespace orc {
}
}
- void ListColumnWriter::getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void ListColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getFileStatistics(stats);
if (child.get()) {
child->getFileStatistics(stats);
}
}
- void ListColumnWriter::mergeRowGroupStatsIntoStripeStats() {
+ void ListColumnWriter::mergeRowGroupStatsIntoStripeStats() {
ColumnWriter::mergeRowGroupStatsIntoStripeStats();
if (child.get()) {
child->mergeRowGroupStatsIntoStripeStats();
@@ -2559,29 +2363,22 @@ namespace orc {
}
class MapColumnWriter : public ColumnWriter {
- public:
- MapColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
+ public:
+ MapColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
~MapColumnWriter() override;
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
- virtual void getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
- virtual void getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
virtual void mergeStripeStatsIntoFileStats() override;
@@ -2589,8 +2386,7 @@ namespace orc {
virtual void createRowIndexEntry() override;
- virtual void writeIndex(
- std::vector<proto::Stream> &streams) const override;
+ virtual void writeIndex(std::vector<proto::Stream>& streams) const override;
virtual void recordPosition() const override;
@@ -2598,24 +2394,19 @@ namespace orc {
virtual void reset() override;
- private:
+ private:
std::unique_ptr<ColumnWriter> keyWriter;
std::unique_ptr<ColumnWriter> elemWriter;
std::unique_ptr<RleEncoder> lengthEncoder;
RleVersion rleVersion;
};
- MapColumnWriter::MapColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()){
+ MapColumnWriter::MapColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options), rleVersion(options.getRleVersion()) {
std::unique_ptr<BufferedOutputStream> lengthStream =
- factory.createStream(proto::Stream_Kind_LENGTH);
- lengthEncoder = createRleEncoder(std::move(lengthStream),
- false,
- rleVersion,
- memPool,
+ factory.createStream(proto::Stream_Kind_LENGTH);
+ lengthEncoder = createRleEncoder(std::move(lengthStream), false, rleVersion, memPool,
options.getAlignedBitpacking());
if (type.getSubtypeCount() > 0) {
@@ -2635,9 +2426,7 @@ namespace orc {
// PASS
}
- void MapColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void MapColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
MapVectorBatch* mapBatch = dynamic_cast<MapVectorBatch*>(&rowBatch);
if (mapBatch == nullptr) {
@@ -2652,8 +2441,7 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
int64_t* offsets = mapBatch->offsets.data() + offset;
- const char* notNull = mapBatch->hasNulls ?
- mapBatch->notNull.data() + offset : nullptr;
+ const char* notNull = mapBatch->hasNulls ? mapBatch->notNull.data() + offset : nullptr;
uint64_t elemOffset = static_cast<uint64_t>(offsets[0]);
uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]);
@@ -2712,8 +2500,7 @@ namespace orc {
}
}
- void MapColumnWriter::writeIndex(
- std::vector<proto::Stream> &streams) const {
+ void MapColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const {
ColumnWriter::writeIndex(streams);
if (keyWriter.get()) {
keyWriter->writeIndex(streams);
@@ -2735,13 +2522,12 @@ namespace orc {
return size;
}
- void MapColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ void MapColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(RleVersionMapper(rleVersion));
- encoding.set_dictionarysize(0);
+ encoding.set_dictionary_size(0);
if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
}
encodings.push_back(encoding);
if (keyWriter.get()) {
@@ -2752,8 +2538,7 @@ namespace orc {
}
}
- void MapColumnWriter::getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void MapColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getStripeStatistics(stats);
if (keyWriter.get()) {
keyWriter->getStripeStatistics(stats);
@@ -2773,8 +2558,7 @@ namespace orc {
}
}
- void MapColumnWriter::getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void MapColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getFileStatistics(stats);
if (keyWriter.get()) {
keyWriter->getFileStatistics(stats);
@@ -2784,7 +2568,7 @@ namespace orc {
}
}
- void MapColumnWriter::mergeRowGroupStatsIntoStripeStats() {
+ void MapColumnWriter::mergeRowGroupStatsIntoStripeStats() {
ColumnWriter::mergeRowGroupStatsIntoStripeStats();
if (keyWriter.get()) {
keyWriter->mergeRowGroupStatsIntoStripeStats();
@@ -2829,28 +2613,22 @@ namespace orc {
}
class UnionColumnWriter : public ColumnWriter {
- public:
- UnionColumnWriter(const Type& type,
- const StreamsFactory& factory,
+ public:
+ UnionColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
- virtual void getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
- virtual void getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
virtual void mergeStripeStatsIntoFileStats() override;
@@ -2858,8 +2636,7 @@ namespace orc {
virtual void createRowIndexEntry() override;
- virtual void writeIndex(
- std::vector<proto::Stream> &streams) const override;
+ virtual void writeIndex(std::vector<proto::Stream>& streams) const override;
virtual void recordPosition() const override;
@@ -2867,24 +2644,20 @@ namespace orc {
virtual void reset() override;
- private:
+ private:
std::unique_ptr<ByteRleEncoder> rleEncoder;
std::vector<std::unique_ptr<ColumnWriter>> children;
};
- UnionColumnWriter::UnionColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options) {
-
+ UnionColumnWriter::UnionColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options) {
std::unique_ptr<BufferedOutputStream> dataStream =
- factory.createStream(proto::Stream_Kind_DATA);
+ factory.createStream(proto::Stream_Kind_DATA);
rleEncoder = createByteRleEncoder(std::move(dataStream));
for (uint64_t i = 0; i != type.getSubtypeCount(); ++i) {
- children.push_back(buildWriter(*type.getSubtype(i),
- factory,
- options));
+ children.push_back(buildWriter(*type.getSubtype(i), factory, options));
}
if (enableIndex) {
@@ -2892,9 +2665,7 @@ namespace orc {
}
}
- void UnionColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void UnionColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
UnionVectorBatch* unionBatch = dynamic_cast<UnionVectorBatch*>(&rowBatch);
if (unionBatch == nullptr) {
@@ -2903,10 +2674,9 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- const char* notNull = unionBatch->hasNulls ?
- unionBatch->notNull.data() + offset : nullptr;
- unsigned char * tags = unionBatch->tags.data() + offset;
- uint64_t * offsets = unionBatch->offsets.data() + offset;
+ const char* notNull = unionBatch->hasNulls ? unionBatch->notNull.data() + offset : nullptr;
+ unsigned char* tags = unionBatch->tags.data() + offset;
+ uint64_t* offsets = unionBatch->offsets.data() + offset;
std::vector<int64_t> childOffset(children.size(), -1);
std::vector<uint64_t> childLength(children.size(), 0);
@@ -2922,8 +2692,7 @@ namespace orc {
for (uint32_t i = 0; i < children.size(); ++i) {
if (childLength[i] > 0) {
- children[i]->add(*unionBatch->children[i],
- static_cast<uint64_t>(childOffset[i]),
+ children[i]->add(*unionBatch->children[i], static_cast<uint64_t>(childOffset[i]),
childLength[i], nullptr);
}
}
@@ -2964,7 +2733,7 @@ namespace orc {
}
}
- void UnionColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const {
+ void UnionColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const {
ColumnWriter::writeIndex(streams);
for (uint32_t i = 0; i < children.size(); ++i) {
children[i]->writeIndex(streams);
@@ -2980,13 +2749,12 @@ namespace orc {
return size;
}
- void UnionColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ void UnionColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
- encoding.set_dictionarysize(0);
+ encoding.set_dictionary_size(0);
if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ encoding.set_bloom_encoding(BloomFilterVersion::UTF8);
}
encodings.push_back(encoding);
for (uint32_t i = 0; i < children.size(); ++i) {
@@ -2994,8 +2762,7 @@ namespace orc {
}
}
- void UnionColumnWriter::getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void UnionColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getStripeStatistics(stats);
for (uint32_t i = 0; i < children.size(); ++i) {
children[i]->getStripeStatistics(stats);
@@ -3009,15 +2776,14 @@ namespace orc {
}
}
- void UnionColumnWriter::getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void UnionColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getFileStatistics(stats);
for (uint32_t i = 0; i < children.size(); ++i) {
children[i]->getFileStatistics(stats);
}
}
- void UnionColumnWriter::mergeRowGroupStatsIntoStripeStats() {
+ void UnionColumnWriter::mergeRowGroupStatsIntoStripeStats() {
ColumnWriter::mergeRowGroupStatsIntoStripeStats();
for (uint32_t i = 0; i < children.size(); ++i) {
children[i]->mergeRowGroupStatsIntoStripeStats();
@@ -3049,140 +2815,80 @@ namespace orc {
}
}
- std::unique_ptr<ColumnWriter> buildWriter(
- const Type& type,
- const StreamsFactory& factory,
+ std::unique_ptr<ColumnWriter> buildWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options) {
switch (static_cast<int64_t>(type.getKind())) {
case STRUCT:
- return std::unique_ptr<ColumnWriter>(
- new StructColumnWriter(
- type,
- factory,
- options));
+ return std::make_unique<StructColumnWriter>(type, factory, options);
+ case SHORT:
+ if (options.getUseTightNumericVector()) {
+ return std::make_unique<IntegerColumnWriter<ShortVectorBatch>>(type, factory, options);
+ }
+ return std::make_unique<IntegerColumnWriter<LongVectorBatch>>(type, factory, options);
case INT:
+ if (options.getUseTightNumericVector()) {
+ return std::make_unique<IntegerColumnWriter<IntVectorBatch>>(type, factory, options);
+ }
+ return std::make_unique<IntegerColumnWriter<LongVectorBatch>>(type, factory, options);
case LONG:
- case SHORT:
- return std::unique_ptr<ColumnWriter>(
- new IntegerColumnWriter(
- type,
- factory,
- options));
+ return std::make_unique<IntegerColumnWriter<LongVectorBatch>>(type, factory, options);
case BYTE:
- return std::unique_ptr<ColumnWriter>(
- new ByteColumnWriter(
- type,
- factory,
- options));
+ if (options.getUseTightNumericVector()) {
+ return std::make_unique<ByteColumnWriter<ByteVectorBatch>>(type, factory, options);
+ }
+ return std::make_unique<ByteColumnWriter<LongVectorBatch>>(type, factory, options);
case BOOLEAN:
- return std::unique_ptr<ColumnWriter>(
- new BooleanColumnWriter(
- type,
- factory,
- options));
+ if (options.getUseTightNumericVector()) {
+ return std::make_unique<BooleanColumnWriter<ByteVectorBatch>>(type, factory, options);
+ }
+ return std::make_unique<BooleanColumnWriter<LongVectorBatch>>(type, factory, options);
case DOUBLE:
- return std::unique_ptr<ColumnWriter>(
- new DoubleColumnWriter(
- type,
- factory,
- options,
- false));
+ return std::make_unique<FloatingColumnWriter<double, DoubleVectorBatch>>(type, factory,
+ options, false);
case FLOAT:
- return std::unique_ptr<ColumnWriter>(
- new DoubleColumnWriter(
- type,
- factory,
- options,
- true));
+ if (options.getUseTightNumericVector()) {
+ return std::make_unique<FloatingColumnWriter<float, FloatVectorBatch>>(type, factory,
+ options, true);
+ }
+ return std::make_unique<FloatingColumnWriter<double, DoubleVectorBatch>>(type, factory,
+ options, true);
case BINARY:
- return std::unique_ptr<ColumnWriter>(
- new BinaryColumnWriter(
- type,
- factory,
- options));
+ return std::make_unique<BinaryColumnWriter>(type, factory, options);
case STRING:
- return std::unique_ptr<ColumnWriter>(
- new StringColumnWriter(
- type,
- factory,
- options));
+ return std::make_unique<StringColumnWriter>(type, factory, options);
case CHAR:
- return std::unique_ptr<ColumnWriter>(
- new CharColumnWriter(
- type,
- factory,
- options));
+ return std::make_unique<CharColumnWriter>(type, factory, options);
case VARCHAR:
- return std::unique_ptr<ColumnWriter>(
- new VarCharColumnWriter(
- type,
- factory,
- options));
+ return std::make_unique<VarCharColumnWriter>(type, factory, options);
case DATE:
- return std::unique_ptr<ColumnWriter>(
- new DateColumnWriter(
- type,
- factory,
- options));
+ return std::make_unique<DateColumnWriter>(type, factory, options);
case TIMESTAMP:
- return std::unique_ptr<ColumnWriter>(
- new TimestampColumnWriter(
- type,
- factory,
- options,
- false));
+ return std::make_unique<TimestampColumnWriter>(type, factory, options, false);
case TIMESTAMP_INSTANT:
- return std::unique_ptr<ColumnWriter>(
- new TimestampColumnWriter(
- type,
- factory,
- options,
- true));
+ return std::make_unique<TimestampColumnWriter>(type, factory, options, true);
case DECIMAL:
if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_64) {
if (options.getFileVersion() == FileVersion::UNSTABLE_PRE_2_0()) {
- return std::unique_ptr<ColumnWriter>(
- new Decimal64ColumnWriterV2(
- type,
- factory,
- options));
+ return std::make_unique<Decimal64ColumnWriterV2>(type, factory, options);
}
- return std::unique_ptr<ColumnWriter>(
- new Decimal64ColumnWriter(
- type,
- factory,
- options));
+ return std::make_unique<Decimal64ColumnWriter>(type, factory, options);
} else if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_128) {
- return std::unique_ptr<ColumnWriter>(
- new Decimal128ColumnWriter(
- type,
- factory,
- options));
+ return std::make_unique<Decimal128ColumnWriter>(type, factory, options);
} else {
- throw NotImplementedYet("Decimal precision more than 38 is not "
- "supported");
+ throw NotImplementedYet(
+ "Decimal precision more than 38 is not "
+ "supported");
}
case LIST:
- return std::unique_ptr<ColumnWriter>(
- new ListColumnWriter(
- type,
- factory,
- options));
+ return std::make_unique<ListColumnWriter>(type, factory, options);
case MAP:
- return std::unique_ptr<ColumnWriter>(
- new MapColumnWriter(
- type,
- factory,
- options));
+ return std::make_unique<MapColumnWriter>(type, factory, options);
case UNION:
- return std::unique_ptr<ColumnWriter>(
- new UnionColumnWriter(
- type,
- factory,
- options));
+ return std::make_unique<UnionColumnWriter>(type, factory, options);
default:
- throw NotImplementedYet("Type is not supported yet for creating "
- "ColumnWriter.");
+ throw NotImplementedYet(
+ "Type is not supported yet for creating "
+ "ColumnWriter.");
}
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/ColumnWriter.hh b/contrib/libs/apache/orc/c++/src/ColumnWriter.hh
index 20983774c4..f21ffd6f83 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnWriter.hh
+++ b/contrib/libs/apache/orc/c++/src/ColumnWriter.hh
@@ -24,15 +24,15 @@
#include "BloomFilter.hh"
#include "ByteRLE.hh"
#include "Compression.hh"
-#include "orc/Exceptions.hh"
#include "Statistics.hh"
+#include "orc/Exceptions.hh"
#include "wrap/orc-proto-wrapper.hh"
namespace orc {
class StreamsFactory {
- public:
+ public:
virtual ~StreamsFactory();
/**
@@ -40,29 +40,26 @@ namespace orc {
* @param kind the kind of the stream
* @return the buffered output stream
*/
- virtual std::unique_ptr<BufferedOutputStream>
- createStream(proto::Stream_Kind kind) const = 0;
+ virtual std::unique_ptr<BufferedOutputStream> createStream(proto::Stream_Kind kind) const = 0;
};
- std::unique_ptr<StreamsFactory> createStreamsFactory(
- const WriterOptions& options,
- OutputStream * outStream);
+ std::unique_ptr<StreamsFactory> createStreamsFactory(const WriterOptions& options,
+ OutputStream* outStream);
/**
* record stream positions for row index
*/
class RowIndexPositionRecorder : public PositionRecorder {
- public:
+ public:
virtual ~RowIndexPositionRecorder() override;
- RowIndexPositionRecorder(proto::RowIndexEntry& entry):
- rowIndexEntry(entry) {}
+ RowIndexPositionRecorder(proto::RowIndexEntry& entry) : rowIndexEntry(entry) {}
virtual void add(uint64_t pos) override {
rowIndexEntry.add_positions(pos);
}
- private:
+ private:
proto::RowIndexEntry& rowIndexEntry;
};
@@ -70,7 +67,7 @@ namespace orc {
* The interface for writing ORC data types.
*/
class ColumnWriter {
- protected:
+ protected:
std::unique_ptr<ByteRleEncoder> notNullEncoder;
uint64_t columnId;
std::unique_ptr<MutableColumnStatistics> colIndexStatistics;
@@ -88,9 +85,8 @@ namespace orc {
std::unique_ptr<BloomFilterImpl> bloomFilter;
std::unique_ptr<proto::BloomFilterIndex> bloomFilterIndex;
- public:
- ColumnWriter(const Type& type, const StreamsFactory& factory,
- const WriterOptions& options);
+ public:
+ ColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
virtual ~ColumnWriter();
@@ -103,10 +99,8 @@ namespace orc {
* a mask (with at least numValues bytes) for which
* values to write.
*/
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char * incomingMask);
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
+ const char* incomingMask);
/**
* Flush column writer output streams.
* @param streams vector to store streams generated by flush()
@@ -123,22 +117,19 @@ namespace orc {
* Get the encoding used by the writer for this column.
* @param encodings vector to store the returned ColumnEncoding info
*/
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const = 0;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const = 0;
/**
* Get the stripe statistics for this column.
* @param stats vector to store the returned stripe statistics
*/
- virtual void getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const;
+ virtual void getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const;
/**
* Get the file statistics for this column.
* @param stats vector to store the returned file statistics
*/
- virtual void getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const;
+ virtual void getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const;
/**
* Merge index stats into stripe stats and reset index stats.
@@ -167,7 +158,7 @@ namespace orc {
* Write row index streams for this column.
* @param streams output list of ROW_INDEX streams
*/
- virtual void writeIndex(std::vector<proto::Stream> &streams) const;
+ virtual void writeIndex(std::vector<proto::Stream>& streams) const;
/**
* Record positions for index.
@@ -188,22 +179,21 @@ namespace orc {
*/
virtual void writeDictionary();
- protected:
+ protected:
/**
* Utility function to translate ColumnStatistics into protobuf form and
* add it to output list.
* @param statsList output list for protobuf stats
* @param stats ColumnStatistics to be transformed and added
*/
- void getProtoBufStatistics(
- std::vector<proto::ColumnStatistics>& statsList,
- const MutableColumnStatistics* stats) const {
- proto::ColumnStatistics pbStats;
- stats->toProtoBuf(pbStats);
- statsList.push_back(pbStats);
- }
+ void getProtoBufStatistics(std::vector<proto::ColumnStatistics>& statsList,
+ const MutableColumnStatistics* stats) const {
+ proto::ColumnStatistics pbStats;
+ stats->toProtoBuf(pbStats);
+ statsList.push_back(pbStats);
+ }
- protected:
+ protected:
MemoryPool& memPool;
std::unique_ptr<BufferedOutputStream> indexStream;
std::unique_ptr<BufferedOutputStream> bloomFilterStream;
@@ -213,10 +203,8 @@ namespace orc {
/**
* Create a writer for the given type.
*/
- std::unique_ptr<ColumnWriter> buildWriter(
- const Type& type,
- const StreamsFactory& factory,
+ std::unique_ptr<ColumnWriter> buildWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
-}
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/src/Common.cc b/contrib/libs/apache/orc/c++/src/Common.cc
index 477bfd3b4c..cf2ff27ef1 100644
--- a/contrib/libs/apache/orc/c++/src/Common.cc
+++ b/contrib/libs/apache/orc/c++/src/Common.cc
@@ -82,6 +82,8 @@ namespace orc {
return "Scritchley Go";
case TRINO_WRITER:
return "Trino";
+ case CUDF_WRITER:
+ return "CUDF";
default: {
std::ostringstream buffer;
buffer << "Unknown(" << id << ")";
@@ -138,14 +140,14 @@ namespace orc {
ss << majorVersion << '.' << minorVersion;
return ss.str();
}
-
- const FileVersion& FileVersion::v_0_11(){
- static FileVersion version(0,11);
+
+ const FileVersion& FileVersion::v_0_11() {
+ static FileVersion version(0, 11);
return version;
}
-
- const FileVersion& FileVersion::v_0_12(){
- static FileVersion version(0,12);
+
+ const FileVersion& FileVersion::v_0_12() {
+ static FileVersion version(0, 12);
return version;
}
@@ -156,9 +158,9 @@ namespace orc {
* without providing any forward or backward compatibility.
*
* When 2.0 is released, this version identifier will be completely removed.
- */
+ */
const FileVersion& FileVersion::UNSTABLE_PRE_2_0() {
static FileVersion version(1, 9999);
return version;
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/Compression.cc b/contrib/libs/apache/orc/c++/src/Compression.cc
index ea10171507..94be774ab4 100644
--- a/contrib/libs/apache/orc/c++/src/Compression.cc
+++ b/contrib/libs/apache/orc/c++/src/Compression.cc
@@ -16,13 +16,15 @@
* limitations under the License.
*/
-#include "Adaptor.hh"
#include "Compression.hh"
-#include "orc/Exceptions.hh"
+#include "Adaptor.hh"
#include "LzoDecompressor.hh"
+#include "Utils.hh"
#include "lz4.h"
+#include "orc/Exceptions.hh"
#include <algorithm>
+#include <array>
#include <iomanip>
#include <iostream>
#include <sstream>
@@ -47,28 +49,30 @@
namespace orc {
- class CompressionStreamBase: public BufferedOutputStream {
- public:
- CompressionStreamBase(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool);
+ class CompressionStreamBase : public BufferedOutputStream {
+ public:
+ CompressionStreamBase(OutputStream* outStream, int compressionLevel, uint64_t capacity,
+ uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics);
- virtual bool Next(void** data, int*size) override = 0;
+ virtual bool Next(void** data, int* size) override = 0;
virtual void BackUp(int count) override;
virtual std::string getName() const override = 0;
virtual uint64_t flush() override;
+ virtual void suppress() override;
- virtual bool isCompressed() const override { return true; }
+ virtual bool isCompressed() const override {
+ return true;
+ }
virtual uint64_t getSize() const override;
- protected:
- void writeHeader(char * buffer, size_t compressedSize, bool original) {
- buffer[0] = static_cast<char>((compressedSize << 1) + (original ? 1 : 0));
- buffer[1] = static_cast<char>(compressedSize >> 7);
- buffer[2] = static_cast<char>(compressedSize >> 15);
+ protected:
+ void writeData(const unsigned char* data, int size);
+
+ void writeHeader(size_t compressedSize, bool original) {
+ *header[0] = static_cast<char>((compressedSize << 1) + (original ? 1 : 0));
+ *header[1] = static_cast<char>(compressedSize >> 7);
+ *header[2] = static_cast<char>(compressedSize >> 15);
}
// ensure enough room for compression block header
@@ -81,7 +85,7 @@ namespace orc {
int level;
// Compressed data output buffer
- char * outputBuffer;
+ char* outputBuffer;
// Size for compressionBuffer
int bufferSize;
@@ -91,24 +95,24 @@ namespace orc {
// Compress output buffer size
int outputSize;
+
+ // Compression block header pointer array
+ static const uint32_t HEADER_SIZE = 3;
+ std::array<char*, HEADER_SIZE> header;
};
- CompressionStreamBase::CompressionStreamBase(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool) :
- BufferedOutputStream(pool,
- outStream,
- capacity,
- blockSize),
- rawInputBuffer(pool, blockSize),
- level(compressionLevel),
- outputBuffer(nullptr),
- bufferSize(0),
- outputPosition(0),
- outputSize(0) {
- // PASS
+ CompressionStreamBase::CompressionStreamBase(OutputStream* outStream, int compressionLevel,
+ uint64_t capacity, uint64_t blockSize,
+ MemoryPool& pool, WriterMetrics* metrics)
+ : BufferedOutputStream(pool, outStream, capacity, blockSize, metrics),
+ rawInputBuffer(pool, blockSize),
+ level(compressionLevel),
+ outputBuffer(nullptr),
+ bufferSize(0),
+ outputPosition(0),
+ outputSize(0) {
+ // init header pointer array
+ header.fill(nullptr);
}
void CompressionStreamBase::BackUp(int count) {
@@ -119,7 +123,7 @@ namespace orc {
}
uint64_t CompressionStreamBase::flush() {
- void * data;
+ void* data;
int size;
if (!Next(&data, &size)) {
throw std::runtime_error("Failed to flush compression buffer.");
@@ -129,79 +133,91 @@ namespace orc {
return BufferedOutputStream::flush();
}
+ void CompressionStreamBase::suppress() {
+ outputBuffer = nullptr;
+ bufferSize = outputPosition = outputSize = 0;
+ BufferedOutputStream::suppress();
+ }
+
uint64_t CompressionStreamBase::getSize() const {
- return BufferedOutputStream::getSize() -
- static_cast<uint64_t>(outputSize - outputPosition);
+ return BufferedOutputStream::getSize() - static_cast<uint64_t>(outputSize - outputPosition);
+ }
+
+ // write the data content into outputBuffer
+ void CompressionStreamBase::writeData(const unsigned char* data, int size) {
+ int offset = 0;
+ while (offset < size) {
+ if (outputPosition == outputSize) {
+ if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) {
+ throw std::runtime_error("Failed to get next output buffer from output stream.");
+ }
+ outputPosition = 0;
+ } else if (outputPosition > outputSize) {
+ // for safety this will unlikely happen
+ throw std::logic_error("Write to an out-of-bound place during compression!");
+ }
+ int currentSize = std::min(outputSize - outputPosition, size - offset);
+ memcpy(outputBuffer + outputPosition, data + offset, static_cast<size_t>(currentSize));
+ offset += currentSize;
+ outputPosition += currentSize;
+ }
}
void CompressionStreamBase::ensureHeader() {
// adjust 3 bytes for the compression header
- if (outputPosition + 3 >= outputSize) {
- int newPosition = outputPosition + 3 - outputSize;
- if (!BufferedOutputStream::Next(
- reinterpret_cast<void **>(&outputBuffer),
- &outputSize)) {
- throw std::runtime_error(
- "Failed to get next output buffer from output stream.");
+ for (uint32_t i = 0; i < HEADER_SIZE; ++i) {
+ if (outputPosition >= outputSize) {
+ if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) {
+ throw std::runtime_error("Failed to get next output buffer from output stream.");
+ }
+ outputPosition = 0;
}
- outputPosition = newPosition;
- } else {
- outputPosition += 3;
+ header[i] = outputBuffer + outputPosition;
+ ++outputPosition;
}
}
/**
* Streaming compression base class
*/
- class CompressionStream: public CompressionStreamBase {
- public:
- CompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool);
-
- virtual bool Next(void** data, int*size) override;
+ class CompressionStream : public CompressionStreamBase {
+ public:
+ CompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
+ uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics);
+
+ virtual bool Next(void** data, int* size) override;
virtual std::string getName() const override = 0;
- protected:
+ protected:
// return total compressed size
virtual uint64_t doStreamingCompression() = 0;
};
- CompressionStream::CompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool) :
- CompressionStreamBase(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool) {
+ CompressionStream::CompressionStream(OutputStream* outStream, int compressionLevel,
+ uint64_t capacity, uint64_t blockSize, MemoryPool& pool,
+ WriterMetrics* metrics)
+ : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics) {
// PASS
}
- bool CompressionStream::Next(void** data, int*size) {
+ bool CompressionStream::Next(void** data, int* size) {
if (bufferSize != 0) {
ensureHeader();
+ uint64_t preSize = getSize();
uint64_t totalCompressedSize = doStreamingCompression();
-
- char * header = outputBuffer + outputPosition - totalCompressedSize - 3;
if (totalCompressedSize >= static_cast<unsigned long>(bufferSize)) {
- writeHeader(header, static_cast<size_t>(bufferSize), true);
- memcpy(
- header + 3,
- rawInputBuffer.data(),
- static_cast<size_t>(bufferSize));
-
- int backup = static_cast<int>(totalCompressedSize) - bufferSize;
- BufferedOutputStream::BackUp(backup);
- outputPosition -= backup;
- outputSize -= backup;
+ writeHeader(static_cast<size_t>(bufferSize), true);
+ // reset output buffer
+ outputBuffer = nullptr;
+ outputPosition = outputSize = 0;
+ uint64_t backup = getSize() - preSize;
+ BufferedOutputStream::BackUp(static_cast<int>(backup));
+
+ // copy raw input buffer into block buffer
+ writeData(rawInputBuffer.data(), bufferSize);
} else {
- writeHeader(header, totalCompressedSize, false);
+ writeHeader(totalCompressedSize, false);
}
}
@@ -212,13 +228,10 @@ namespace orc {
return true;
}
- class ZlibCompressionStream: public CompressionStream {
- public:
- ZlibCompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool);
+ class ZlibCompressionStream : public CompressionStream {
+ public:
+ ZlibCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
+ uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics);
virtual ~ZlibCompressionStream() override {
end();
@@ -226,26 +239,19 @@ namespace orc {
virtual std::string getName() const override;
- protected:
+ protected:
virtual uint64_t doStreamingCompression() override;
- private:
+ private:
void init();
void end();
z_stream strm;
};
- ZlibCompressionStream::ZlibCompressionStream(
- OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool)
- : CompressionStream(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool) {
+ ZlibCompressionStream::ZlibCompressionStream(OutputStream* outStream, int compressionLevel,
+ uint64_t capacity, uint64_t blockSize,
+ MemoryPool& pool, WriterMetrics* metrics)
+ : CompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) {
init();
}
@@ -259,18 +265,13 @@ namespace orc {
do {
if (outputPosition >= outputSize) {
- if (!BufferedOutputStream::Next(
- reinterpret_cast<void **>(&outputBuffer),
- &outputSize)) {
- throw std::runtime_error(
- "Failed to get next output buffer from output stream.");
+ if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) {
+ throw std::runtime_error("Failed to get next output buffer from output stream.");
}
outputPosition = 0;
}
- strm.next_out = reinterpret_cast<unsigned char *>
- (outputBuffer + outputPosition);
- strm.avail_out = static_cast<unsigned int>
- (outputSize - outputPosition);
+ strm.next_out = reinterpret_cast<unsigned char*>(outputBuffer + outputPosition);
+ strm.avail_out = static_cast<unsigned int>(outputSize - outputPosition);
int ret = deflate(&strm, Z_FINISH);
outputPosition = outputSize - static_cast<int>(strm.avail_out);
@@ -291,7 +292,7 @@ namespace orc {
return "ZlibCompressionStream";
}
-DIAGNOSTIC_PUSH
+ DIAGNOSTIC_PUSH
#if defined(__GNUC__) || defined(__clang__)
DIAGNOSTIC_IGNORE("-Wold-style-cast")
@@ -303,8 +304,7 @@ DIAGNOSTIC_PUSH
strm.opaque = nullptr;
strm.next_in = nullptr;
- if (deflateInit2(&strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY)
- != Z_OK) {
+ if (deflateInit2(&strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) {
throw std::runtime_error("Error while calling deflateInit2() for zlib.");
}
}
@@ -313,42 +313,46 @@ DIAGNOSTIC_PUSH
(void)deflateEnd(&strm);
}
-DIAGNOSTIC_PUSH
+ DIAGNOSTIC_PUSH
- enum DecompressState { DECOMPRESS_HEADER,
- DECOMPRESS_START,
- DECOMPRESS_CONTINUE,
- DECOMPRESS_ORIGINAL,
- DECOMPRESS_EOF};
+ enum DecompressState {
+ DECOMPRESS_HEADER,
+ DECOMPRESS_START,
+ DECOMPRESS_CONTINUE,
+ DECOMPRESS_ORIGINAL,
+ DECOMPRESS_EOF
+ };
std::string decompressStateToString(DecompressState state) {
switch (state) {
- case DECOMPRESS_HEADER: return "DECOMPRESS_HEADER";
- case DECOMPRESS_START: return "DECOMPRESS_START";
- case DECOMPRESS_CONTINUE: return "DECOMPRESS_CONTINUE";
- case DECOMPRESS_ORIGINAL: return "DECOMPRESS_ORIGINAL";
- case DECOMPRESS_EOF: return "DECOMPRESS_EOF";
+ case DECOMPRESS_HEADER:
+ return "DECOMPRESS_HEADER";
+ case DECOMPRESS_START:
+ return "DECOMPRESS_START";
+ case DECOMPRESS_CONTINUE:
+ return "DECOMPRESS_CONTINUE";
+ case DECOMPRESS_ORIGINAL:
+ return "DECOMPRESS_ORIGINAL";
+ case DECOMPRESS_EOF:
+ return "DECOMPRESS_EOF";
}
return "unknown";
}
class DecompressionStream : public SeekableInputStream {
- public:
- DecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t bufferSize,
- MemoryPool& pool);
+ public:
+ DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t bufferSize,
+ MemoryPool& pool, ReaderMetrics* metrics);
virtual ~DecompressionStream() override {}
- virtual bool Next(const void** data, int*size) override;
+ virtual bool Next(const void** data, int* size) override;
virtual void BackUp(int count) override;
virtual bool Skip(int count) override;
virtual int64_t ByteCount() const override;
virtual void seek(PositionProvider& position) override;
virtual std::string getName() const override = 0;
- protected:
- virtual void NextDecompress(const void** data,
- int*size,
- size_t availableSize) = 0;
+ protected:
+ virtual void NextDecompress(const void** data, int* size, size_t availableSize) = 0;
std::string getStreamName() const;
void readBuffer(bool failOnEof);
@@ -366,8 +370,8 @@ DIAGNOSTIC_PUSH
// The starting and current position of the buffer for the uncompressed
// data. It either points to the data buffer or the underlying input stream.
- const char *outputBufferStart;
- const char *outputBuffer;
+ const char* outputBufferStart;
+ const char* outputBuffer;
size_t outputBufferLength;
// The uncompressed buffer length. For compressed chunk, it's the original
// (ie. the overall) and the actual length of the decompressed data.
@@ -379,9 +383,9 @@ DIAGNOSTIC_PUSH
size_t remainingLength;
// the last buffer returned from the input
- const char *inputBufferStart;
- const char *inputBuffer;
- const char *inputBufferEnd;
+ const char* inputBufferStart;
+ const char* inputBuffer;
+ const char* inputBufferEnd;
// Variables for saving the position of the header and the start of the
// buffer. Used when we have to seek a position.
@@ -390,37 +394,38 @@ DIAGNOSTIC_PUSH
// roughly the number of bytes returned
off_t bytesReturned;
+
+ ReaderMetrics* metrics;
};
- DecompressionStream::DecompressionStream(
- std::unique_ptr<SeekableInputStream> inStream,
- size_t bufferSize,
- MemoryPool& _pool
- ) : pool(_pool),
- input(std::move(inStream)),
- outputDataBuffer(pool, bufferSize),
- state(DECOMPRESS_HEADER),
- outputBufferStart(nullptr),
- outputBuffer(nullptr),
- outputBufferLength(0),
- uncompressedBufferLength(0),
- remainingLength(0),
- inputBufferStart(nullptr),
- inputBuffer(nullptr),
- inputBufferEnd(nullptr),
- headerPosition(0),
- inputBufferStartPosition(0),
- bytesReturned(0) {
- }
+ DecompressionStream::DecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+ size_t bufferSize, MemoryPool& _pool,
+ ReaderMetrics* _metrics)
+ : pool(_pool),
+ input(std::move(inStream)),
+ outputDataBuffer(pool, bufferSize),
+ state(DECOMPRESS_HEADER),
+ outputBufferStart(nullptr),
+ outputBuffer(nullptr),
+ outputBufferLength(0),
+ uncompressedBufferLength(0),
+ remainingLength(0),
+ inputBufferStart(nullptr),
+ inputBuffer(nullptr),
+ inputBufferEnd(nullptr),
+ headerPosition(0),
+ inputBufferStartPosition(0),
+ bytesReturned(0),
+ metrics(_metrics) {}
std::string DecompressionStream::getStreamName() const {
return input->getName();
}
void DecompressionStream::readBuffer(bool failOnEof) {
+ SCOPED_MINUS_STOPWATCH(metrics, DecompressionLatencyUs);
int length;
- if (!input->Next(reinterpret_cast<const void**>(&inputBuffer),
- &length)) {
+ if (!input->Next(reinterpret_cast<const void**>(&inputBuffer), &length)) {
if (failOnEof) {
throw ParseError("Read past EOF in DecompressionStream::readBuffer");
}
@@ -430,8 +435,7 @@ DIAGNOSTIC_PUSH
inputBufferStart = nullptr;
} else {
inputBufferEnd = inputBuffer + length;
- inputBufferStartPosition
- = static_cast<size_t>(input->ByteCount() - length);
+ inputBufferStartPosition = static_cast<size_t>(input->ByteCount() - length);
inputBufferStart = inputBuffer;
}
}
@@ -462,7 +466,8 @@ DIAGNOSTIC_PUSH
}
}
- bool DecompressionStream::Next(const void** data, int*size) {
+ bool DecompressionStream::Next(const void** data, int* size) {
+ SCOPED_STOPWATCH(metrics, DecompressionLatencyUs, DecompressionCall);
// If we are starting a new header, we will have to store its positions
// after decompressing.
bool saveBufferPositions = false;
@@ -478,8 +483,8 @@ DIAGNOSTIC_PUSH
if (state == DECOMPRESS_HEADER || remainingLength == 0) {
readHeader();
// Here we already read the three bytes of the header.
- headerPosition = inputBufferStartPosition
- + static_cast<size_t>(inputBuffer - inputBufferStart) - 3;
+ headerPosition =
+ inputBufferStartPosition + static_cast<size_t>(inputBuffer - inputBufferStart) - 3;
saveBufferPositions = true;
}
if (state == DECOMPRESS_EOF) {
@@ -489,8 +494,7 @@ DIAGNOSTIC_PUSH
readBuffer(true);
}
size_t availableSize =
- std::min(static_cast<size_t>(inputBufferEnd - inputBuffer),
- remainingLength);
+ std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), remainingLength);
if (state == DECOMPRESS_ORIGINAL) {
*data = inputBuffer;
*size = static_cast<int>(availableSize);
@@ -501,8 +505,9 @@ DIAGNOSTIC_PUSH
} else if (state == DECOMPRESS_START) {
NextDecompress(data, size, availableSize);
} else {
- throw std::logic_error("Unknown compression state in "
- "DecompressionStream::Next");
+ throw std::logic_error(
+ "Unknown compression state in "
+ "DecompressionStream::Next");
}
bytesReturned += static_cast<off_t>(*size);
if (saveBufferPositions) {
@@ -530,7 +535,7 @@ DIAGNOSTIC_PUSH
// this is a stupid implementation for now.
// should skip entire blocks without decompressing
while (count > 0) {
- const void *ptr;
+ const void* ptr;
int len;
if (!Next(&ptr, &len)) {
return false;
@@ -560,10 +565,10 @@ DIAGNOSTIC_PUSH
// Case 1: the seeked position is in the current chunk and it's buffered and
// decompressed/uncompressed. Note that after the headerPosition comes the 3 bytes of
// the header.
- if (headerPosition == seekedHeaderPosition
- && inputBufferStartPosition <= headerPosition + 3 && inputBufferStart) {
- position.next(); // Skip the input level position, i.e. seekedHeaderPosition.
- size_t posInChunk = position.next(); // Chunk level position.
+ if (headerPosition == seekedHeaderPosition && inputBufferStartPosition <= headerPosition + 3 &&
+ inputBufferStart) {
+ position.next(); // Skip the input level position, i.e. seekedHeaderPosition.
+ size_t posInChunk = position.next(); // Chunk level position.
// Case 1.a: The position is in the decompressed/uncompressed buffer. Here we only
// need to set the output buffer's pointer to the seeked position.
if (uncompressedBufferLength >= posInChunk) {
@@ -575,9 +580,8 @@ DIAGNOSTIC_PUSH
// Skip bytes to seek.
if (!Skip(static_cast<int>(posInChunk - uncompressedBufferLength))) {
std::ostringstream ss;
- ss << "Bad seek to (chunkHeader=" << seekedHeaderPosition << ", posInChunk="
- << posInChunk << ") in " << getName() << ". DecompressionState: "
- << decompressStateToString(state);
+ ss << "Bad seek to (chunkHeader=" << seekedHeaderPosition << ", posInChunk=" << posInChunk
+ << ") in " << getName() << ". DecompressionState: " << decompressStateToString(state);
throw ParseError(ss.str());
}
return;
@@ -592,15 +596,14 @@ DIAGNOSTIC_PUSH
// Case 2: The input is buffered, but not yet decompressed. No need to
// force re-reading the inputBuffer, we just have to move it to the
// seeked position.
- position.next(); // Skip the input level position.
- inputBuffer
- = inputBufferStart + (seekedHeaderPosition - inputBufferStartPosition);
+ position.next(); // Skip the input level position.
+ inputBuffer = inputBufferStart + (seekedHeaderPosition - inputBufferStartPosition);
} else {
// Case 3: The seeked position is not in the input buffer, here we are
// forcing to read it.
inputBuffer = nullptr;
inputBufferEnd = nullptr;
- input->seek(position); // Actually use the input level position.
+ input->seek(position); // Actually use the input level position.
}
bytesReturned = static_cast<off_t>(input->ByteCount());
if (!Skip(static_cast<int>(position.next()))) {
@@ -609,33 +612,29 @@ DIAGNOSTIC_PUSH
}
class ZlibDecompressionStream : public DecompressionStream {
- public:
- ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& pool);
+ public:
+ ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
+ MemoryPool& pool, ReaderMetrics* metrics);
virtual ~ZlibDecompressionStream() override;
virtual std::string getName() const override;
- protected:
- virtual void NextDecompress(const void** data,
- int* size,
- size_t availableSize) override;
- private:
+ protected:
+ virtual void NextDecompress(const void** data, int* size, size_t availableSize) override;
+
+ private:
z_stream zstream;
};
-DIAGNOSTIC_PUSH
+ DIAGNOSTIC_PUSH
#if defined(__GNUC__) || defined(__clang__)
DIAGNOSTIC_IGNORE("-Wold-style-cast")
#endif
- ZlibDecompressionStream::ZlibDecompressionStream
- (std::unique_ptr<SeekableInputStream> inStream,
- size_t bufferSize,
- MemoryPool& _pool
- ): DecompressionStream
- (std::move(inStream), bufferSize, _pool) {
+ ZlibDecompressionStream::ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+ size_t bufferSize, MemoryPool& _pool,
+ ReaderMetrics* _metrics)
+ : DecompressionStream(std::move(inStream), bufferSize, _pool, _metrics) {
zstream.next_in = nullptr;
zstream.avail_in = 0;
zstream.zalloc = nullptr;
@@ -645,20 +644,20 @@ DIAGNOSTIC_PUSH
zstream.avail_out = static_cast<uInt>(outputDataBuffer.capacity());
int64_t result = inflateInit2(&zstream, -15);
switch (result) {
- case Z_OK:
- break;
- case Z_MEM_ERROR:
- throw std::logic_error("Memory error from inflateInit2");
- case Z_VERSION_ERROR:
- throw std::logic_error("Version error from inflateInit2");
- case Z_STREAM_ERROR:
- throw std::logic_error("Stream error from inflateInit2");
- default:
- throw std::logic_error("Unknown error from inflateInit2");
+ case Z_OK:
+ break;
+ case Z_MEM_ERROR:
+ throw std::logic_error("Memory error from inflateInit2");
+ case Z_VERSION_ERROR:
+ throw std::logic_error("Version error from inflateInit2");
+ case Z_STREAM_ERROR:
+ throw std::logic_error("Stream error from inflateInit2");
+ default:
+ throw std::logic_error("Unknown error from inflateInit2");
}
}
-DIAGNOSTIC_POP
+ DIAGNOSTIC_POP
ZlibDecompressionStream::~ZlibDecompressionStream() {
int64_t result = inflateEnd(&zstream);
@@ -668,49 +667,48 @@ DIAGNOSTIC_POP
}
}
- void ZlibDecompressionStream::NextDecompress(const void** data, int* size,
- size_t availableSize) {
- zstream.next_in =
- reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
+ void ZlibDecompressionStream::NextDecompress(const void** data, int* size, size_t availableSize) {
+ zstream.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
zstream.avail_in = static_cast<uInt>(availableSize);
outputBuffer = outputDataBuffer.data();
- zstream.next_out =
- reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer));
+ zstream.next_out = reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer));
zstream.avail_out = static_cast<uInt>(outputDataBuffer.capacity());
if (inflateReset(&zstream) != Z_OK) {
- throw std::logic_error("Bad inflateReset in "
- "ZlibDecompressionStream::NextDecompress");
+ throw std::logic_error(
+ "Bad inflateReset in "
+ "ZlibDecompressionStream::NextDecompress");
}
int64_t result;
do {
- result = inflate(&zstream, availableSize == remainingLength ? Z_FINISH :
- Z_SYNC_FLUSH);
+ result = inflate(&zstream, availableSize == remainingLength ? Z_FINISH : Z_SYNC_FLUSH);
switch (result) {
- case Z_OK:
- remainingLength -= availableSize;
- inputBuffer += availableSize;
- readBuffer(true);
- availableSize =
- std::min(static_cast<size_t>(inputBufferEnd - inputBuffer),
- remainingLength);
- zstream.next_in =
- reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
- zstream.avail_in = static_cast<uInt>(availableSize);
- break;
- case Z_STREAM_END:
- break;
- case Z_BUF_ERROR:
- throw std::logic_error("Buffer error in "
- "ZlibDecompressionStream::NextDecompress");
- case Z_DATA_ERROR:
- throw std::logic_error("Data error in "
- "ZlibDecompressionStream::NextDecompress");
- case Z_STREAM_ERROR:
- throw std::logic_error("Stream error in "
- "ZlibDecompressionStream::NextDecompress");
- default:
- throw std::logic_error("Unknown error in "
- "ZlibDecompressionStream::NextDecompress");
+ case Z_OK:
+ remainingLength -= availableSize;
+ inputBuffer += availableSize;
+ readBuffer(true);
+ availableSize =
+ std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), remainingLength);
+ zstream.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
+ zstream.avail_in = static_cast<uInt>(availableSize);
+ break;
+ case Z_STREAM_END:
+ break;
+ case Z_BUF_ERROR:
+ throw std::logic_error(
+ "Buffer error in "
+ "ZlibDecompressionStream::NextDecompress");
+ case Z_DATA_ERROR:
+ throw std::logic_error(
+ "Data error in "
+ "ZlibDecompressionStream::NextDecompress");
+ case Z_STREAM_ERROR:
+ throw std::logic_error(
+ "Stream error in "
+ "ZlibDecompressionStream::NextDecompress");
+ default:
+ throw std::logic_error(
+ "Unknown error in "
+ "ZlibDecompressionStream::NextDecompress");
}
} while (result != Z_STREAM_END);
*size = static_cast<int>(outputDataBuffer.capacity() - zstream.avail_out);
@@ -727,44 +725,38 @@ DIAGNOSTIC_POP
return result.str();
}
- class BlockDecompressionStream: public DecompressionStream {
- public:
- BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& pool);
+ class BlockDecompressionStream : public DecompressionStream {
+ public:
+ BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
+ MemoryPool& pool, ReaderMetrics* metrics);
virtual ~BlockDecompressionStream() override {}
virtual std::string getName() const override = 0;
- protected:
- virtual void NextDecompress(const void** data,
- int* size,
- size_t availableSize) override;
+ protected:
+ virtual void NextDecompress(const void** data, int* size, size_t availableSize) override;
+
+ virtual uint64_t decompress(const char* input, uint64_t length, char* output,
+ size_t maxOutputLength) = 0;
- virtual uint64_t decompress(const char *input, uint64_t length,
- char *output, size_t maxOutputLength) = 0;
- private:
+ private:
// may need to stitch together multiple input buffers;
// to give snappy a contiguous block
DataBuffer<char> inputDataBuffer;
};
- BlockDecompressionStream::BlockDecompressionStream
- (std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& _pool
- ) : DecompressionStream
- (std::move(inStream), blockSize, _pool),
- inputDataBuffer(pool, blockSize) {
- }
-
+ BlockDecompressionStream::BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+ size_t blockSize, MemoryPool& _pool,
+ ReaderMetrics* _metrics)
+ : DecompressionStream(std::move(inStream), blockSize, _pool, _metrics),
+ inputDataBuffer(pool, blockSize) {}
void BlockDecompressionStream::NextDecompress(const void** data, int* size,
- size_t availableSize) {
+ size_t availableSize) {
// Get contiguous bytes of compressed block.
- const char *compressed = inputBuffer;
+ const char* compressed = inputBuffer;
if (remainingLength == availableSize) {
- inputBuffer += availableSize;
+ inputBuffer += availableSize;
} else {
// Did not read enough from input.
if (inputDataBuffer.capacity() < remainingLength) {
@@ -774,19 +766,16 @@ DIAGNOSTIC_POP
inputBuffer += availableSize;
compressed = inputDataBuffer.data();
- for (size_t pos = availableSize; pos < remainingLength; ) {
+ for (size_t pos = availableSize; pos < remainingLength;) {
readBuffer(true);
size_t avail =
- std::min(static_cast<size_t>(inputBufferEnd -
- inputBuffer),
- remainingLength - pos);
+ std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), remainingLength - pos);
::memcpy(inputDataBuffer.data() + pos, inputBuffer, avail);
pos += avail;
inputBuffer += avail;
}
}
- outputBufferLength = decompress(compressed, remainingLength,
- outputDataBuffer.data(),
+ outputBufferLength = decompress(compressed, remainingLength, outputDataBuffer.data(),
outputDataBuffer.capacity());
remainingLength = 0;
state = DECOMPRESS_HEADER;
@@ -796,15 +785,11 @@ DIAGNOSTIC_POP
outputBufferLength = 0;
}
- class SnappyDecompressionStream: public BlockDecompressionStream {
- public:
- SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& _pool
- ): BlockDecompressionStream
- (std::move(inStream),
- blockSize,
- _pool) {
+ class SnappyDecompressionStream : public BlockDecompressionStream {
+ public:
+ SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
+ MemoryPool& _pool, ReaderMetrics* _metrics)
+ : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) {
// PASS
}
@@ -814,15 +799,12 @@ DIAGNOSTIC_POP
return result.str();
}
- protected:
- virtual uint64_t decompress(const char *input, uint64_t length,
- char *output, size_t maxOutputLength
- ) override;
+ protected:
+ virtual uint64_t decompress(const char* input, uint64_t length, char* output,
+ size_t maxOutputLength) override;
};
- uint64_t SnappyDecompressionStream::decompress(const char *_input,
- uint64_t length,
- char *output,
+ uint64_t SnappyDecompressionStream::decompress(const char* _input, uint64_t length, char* output,
size_t maxOutputLength) {
size_t outLength;
if (!snappy::GetUncompressedLength(_input, length, &outLength)) {
@@ -839,15 +821,11 @@ DIAGNOSTIC_POP
return outLength;
}
- class LzoDecompressionStream: public BlockDecompressionStream {
- public:
- LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& _pool
- ): BlockDecompressionStream
- (std::move(inStream),
- blockSize,
- _pool) {
+ class LzoDecompressionStream : public BlockDecompressionStream {
+ public:
+ LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
+ MemoryPool& _pool, ReaderMetrics* _metrics)
+ : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) {
// PASS
}
@@ -857,29 +835,21 @@ DIAGNOSTIC_POP
return result.str();
}
- protected:
- virtual uint64_t decompress(const char *input, uint64_t length,
- char *output, size_t maxOutputLength
- ) override;
+ protected:
+ virtual uint64_t decompress(const char* input, uint64_t length, char* output,
+ size_t maxOutputLength) override;
};
- uint64_t LzoDecompressionStream::decompress(const char *inputPtr,
- uint64_t length,
- char *output,
+ uint64_t LzoDecompressionStream::decompress(const char* inputPtr, uint64_t length, char* output,
size_t maxOutputLength) {
- return lzoDecompress(inputPtr, inputPtr + length, output,
- output + maxOutputLength);
+ return lzoDecompress(inputPtr, inputPtr + length, output, output + maxOutputLength);
}
- class Lz4DecompressionStream: public BlockDecompressionStream {
- public:
- Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& _pool
- ): BlockDecompressionStream
- (std::move(inStream),
- blockSize,
- _pool) {
+ class Lz4DecompressionStream : public BlockDecompressionStream {
+ public:
+ Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
+ MemoryPool& _pool, ReaderMetrics* _metrics)
+ : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) {
// PASS
}
@@ -889,15 +859,12 @@ DIAGNOSTIC_POP
return result.str();
}
- protected:
- virtual uint64_t decompress(const char *input, uint64_t length,
- char *output, size_t maxOutputLength
- ) override;
+ protected:
+ virtual uint64_t decompress(const char* input, uint64_t length, char* output,
+ size_t maxOutputLength) override;
};
- uint64_t Lz4DecompressionStream::decompress(const char *inputPtr,
- uint64_t length,
- char *output,
+ uint64_t Lz4DecompressionStream::decompress(const char* inputPtr, uint64_t length, char* output,
size_t maxOutputLength) {
int result = LZ4_decompress_safe(inputPtr, output, static_cast<int>(length),
static_cast<int>(maxOutputLength));
@@ -910,26 +877,20 @@ DIAGNOSTIC_POP
/**
* Block compression base class
*/
- class BlockCompressionStream: public CompressionStreamBase {
- public:
- BlockCompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool)
- : CompressionStreamBase(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool)
- , compressorBuffer(pool) {
+ class BlockCompressionStream : public CompressionStreamBase {
+ public:
+ BlockCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
+ uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics)
+ : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics),
+ compressorBuffer(pool) {
// PASS
}
- virtual bool Next(void** data, int*size) override;
+ virtual bool Next(void** data, int* size) override;
+ virtual void suppress() override;
virtual std::string getName() const override = 0;
- protected:
+ protected:
// compresses a block and returns the compressed size
virtual uint64_t doBlockCompression() = 0;
@@ -941,50 +902,27 @@ DIAGNOSTIC_POP
DataBuffer<unsigned char> compressorBuffer;
};
- bool BlockCompressionStream::Next(void** data, int*size) {
+ bool BlockCompressionStream::Next(void** data, int* size) {
if (bufferSize != 0) {
ensureHeader();
// perform compression
size_t totalCompressedSize = doBlockCompression();
- const unsigned char * dataToWrite = nullptr;
+ const unsigned char* dataToWrite = nullptr;
int totalSizeToWrite = 0;
- char * header = outputBuffer + outputPosition - 3;
if (totalCompressedSize >= static_cast<size_t>(bufferSize)) {
- writeHeader(header, static_cast<size_t>(bufferSize), true);
+ writeHeader(static_cast<size_t>(bufferSize), true);
dataToWrite = rawInputBuffer.data();
totalSizeToWrite = bufferSize;
} else {
- writeHeader(header, totalCompressedSize, false);
+ writeHeader(totalCompressedSize, false);
dataToWrite = compressorBuffer.data();
totalSizeToWrite = static_cast<int>(totalCompressedSize);
}
- char * dst = header + 3;
- while (totalSizeToWrite > 0) {
- if (outputPosition == outputSize) {
- if (!BufferedOutputStream::Next(reinterpret_cast<void **>(&outputBuffer),
- &outputSize)) {
- throw std::logic_error(
- "Failed to get next output buffer from output stream.");
- }
- outputPosition = 0;
- dst = outputBuffer;
- } else if (outputPosition > outputSize) {
- // this will unlikely happen, but we have seen a few on zstd v1.1.0
- throw std::logic_error("Write to an out-of-bound place!");
- }
-
- int sizeToWrite = std::min(totalSizeToWrite, outputSize - outputPosition);
- std::memcpy(dst, dataToWrite, static_cast<size_t>(sizeToWrite));
-
- outputPosition += sizeToWrite;
- dataToWrite += sizeToWrite;
- totalSizeToWrite -= sizeToWrite;
- dst += sizeToWrite;
- }
+ writeData(dataToWrite, totalSizeToWrite);
}
*data = rawInputBuffer.data();
@@ -995,52 +933,48 @@ DIAGNOSTIC_POP
return true;
}
+ void BlockCompressionStream::suppress() {
+ compressorBuffer.resize(0);
+ CompressionStreamBase::suppress();
+ }
+
/**
* LZ4 block compression
*/
- class Lz4CompressionSteam: public BlockCompressionStream {
- public:
- Lz4CompressionSteam(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool)
- : BlockCompressionStream(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool) {
+ class Lz4CompressionSteam : public BlockCompressionStream {
+ public:
+ Lz4CompressionSteam(OutputStream* outStream, int compressionLevel, uint64_t capacity,
+ uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics)
+ : BlockCompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) {
this->init();
}
virtual std::string getName() const override {
return "Lz4CompressionStream";
}
-
+
virtual ~Lz4CompressionSteam() override {
this->end();
}
- protected:
+ protected:
virtual uint64_t doBlockCompression() override;
virtual uint64_t estimateMaxCompressionSize() override {
return static_cast<uint64_t>(LZ4_compressBound(bufferSize));
}
- private:
+ private:
void init();
void end();
- LZ4_stream_t *state;
+ LZ4_stream_t* state;
};
uint64_t Lz4CompressionSteam::doBlockCompression() {
- int result = LZ4_compress_fast_extState(static_cast<void*>(state),
- reinterpret_cast<const char*>(rawInputBuffer.data()),
- reinterpret_cast<char*>(compressorBuffer.data()),
- bufferSize,
- static_cast<int>(compressorBuffer.size()),
- level);
+ int result = LZ4_compress_fast_extState(
+ static_cast<void*>(state), reinterpret_cast<const char*>(rawInputBuffer.data()),
+ reinterpret_cast<char*>(compressorBuffer.data()), bufferSize,
+ static_cast<int>(compressorBuffer.size()), level);
if (result == 0) {
throw std::runtime_error("Error during block compression using lz4.");
}
@@ -1062,34 +996,25 @@ DIAGNOSTIC_POP
/**
* Snappy block compression
*/
- class SnappyCompressionStream: public BlockCompressionStream {
- public:
- SnappyCompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool)
- : BlockCompressionStream(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool) {
- }
+ class SnappyCompressionStream : public BlockCompressionStream {
+ public:
+ SnappyCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
+ uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics)
+ : BlockCompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) {}
virtual std::string getName() const override {
return "SnappyCompressionStream";
}
-
+
virtual ~SnappyCompressionStream() override {
// PASS
}
- protected:
+ protected:
virtual uint64_t doBlockCompression() override;
virtual uint64_t estimateMaxCompressionSize() override {
- return static_cast<uint64_t>
- (snappy::MaxCompressedLength(static_cast<size_t>(bufferSize)));
+ return static_cast<uint64_t>(snappy::MaxCompressedLength(static_cast<size_t>(bufferSize)));
}
};
@@ -1097,92 +1022,75 @@ DIAGNOSTIC_POP
size_t compressedLength;
snappy::RawCompress(reinterpret_cast<const char*>(rawInputBuffer.data()),
static_cast<size_t>(bufferSize),
- reinterpret_cast<char*>(compressorBuffer.data()),
- &compressedLength);
+ reinterpret_cast<char*>(compressorBuffer.data()), &compressedLength);
return static_cast<uint64_t>(compressedLength);
}
/**
* ZSTD block compression
*/
- class ZSTDCompressionStream: public BlockCompressionStream {
- public:
- ZSTDCompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool)
- : BlockCompressionStream(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool) {
+ class ZSTDCompressionStream : public BlockCompressionStream {
+ public:
+ ZSTDCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
+ uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics)
+ : BlockCompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) {
this->init();
}
virtual std::string getName() const override {
return "ZstdCompressionStream";
}
-
+
virtual ~ZSTDCompressionStream() override {
this->end();
}
- protected:
+ protected:
virtual uint64_t doBlockCompression() override;
virtual uint64_t estimateMaxCompressionSize() override {
return ZSTD_compressBound(static_cast<size_t>(bufferSize));
}
-
- private:
+
+ private:
void init();
void end();
- ZSTD_CCtx *cctx;
+ ZSTD_CCtx* cctx;
};
uint64_t ZSTDCompressionStream::doBlockCompression() {
- return ZSTD_compressCCtx(cctx,
- compressorBuffer.data(),
- compressorBuffer.size(),
- rawInputBuffer.data(),
- static_cast<size_t>(bufferSize),
- level);
+ return ZSTD_compressCCtx(cctx, compressorBuffer.data(), compressorBuffer.size(),
+ rawInputBuffer.data(), static_cast<size_t>(bufferSize), level);
}
-
-DIAGNOSTIC_PUSH
+
+ DIAGNOSTIC_PUSH
#if defined(__GNUC__) || defined(__clang__)
DIAGNOSTIC_IGNORE("-Wold-style-cast")
#endif
void ZSTDCompressionStream::init() {
-
cctx = ZSTD_createCCtx();
if (!cctx) {
throw std::runtime_error("Error while calling ZSTD_createCCtx() for zstd.");
}
}
-
void ZSTDCompressionStream::end() {
(void)ZSTD_freeCCtx(cctx);
cctx = nullptr;
}
-DIAGNOSTIC_PUSH
+ DIAGNOSTIC_PUSH
/**
* ZSTD block decompression
*/
- class ZSTDDecompressionStream: public BlockDecompressionStream {
- public:
- ZSTDDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& _pool)
- : BlockDecompressionStream(std::move(inStream),
- blockSize,
- _pool) {
+ class ZSTDDecompressionStream : public BlockDecompressionStream {
+ public:
+ ZSTDDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
+ MemoryPool& _pool, ReaderMetrics* _metrics)
+ : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) {
this->init();
}
@@ -1196,127 +1104,106 @@ DIAGNOSTIC_PUSH
return result.str();
}
- protected:
- virtual uint64_t decompress(const char *input,
- uint64_t length,
- char *output,
+ protected:
+ virtual uint64_t decompress(const char* input, uint64_t length, char* output,
size_t maxOutputLength) override;
- private:
+ private:
void init();
void end();
- ZSTD_DCtx *dctx;
+ ZSTD_DCtx* dctx;
};
- uint64_t ZSTDDecompressionStream::decompress(const char *inputPtr,
- uint64_t length,
- char *output,
+ uint64_t ZSTDDecompressionStream::decompress(const char* inputPtr, uint64_t length, char* output,
size_t maxOutputLength) {
- return static_cast<uint64_t>(ZSTD_decompressDCtx(dctx,
- output,
- maxOutputLength,
- inputPtr,
- length));
+ return static_cast<uint64_t>(
+ ZSTD_decompressDCtx(dctx, output, maxOutputLength, inputPtr, length));
}
-DIAGNOSTIC_PUSH
+ DIAGNOSTIC_PUSH
#if defined(__GNUC__) || defined(__clang__)
DIAGNOSTIC_IGNORE("-Wold-style-cast")
#endif
void ZSTDDecompressionStream::init() {
-
dctx = ZSTD_createDCtx();
if (!dctx) {
throw std::runtime_error("Error while calling ZSTD_createDCtx() for zstd.");
}
}
-
void ZSTDDecompressionStream::end() {
(void)ZSTD_freeDCtx(dctx);
dctx = nullptr;
}
-DIAGNOSTIC_PUSH
+ DIAGNOSTIC_PUSH
- std::unique_ptr<BufferedOutputStream>
- createCompressor(
- CompressionKind kind,
- OutputStream * outStream,
- CompressionStrategy strategy,
- uint64_t bufferCapacity,
- uint64_t compressionBlockSize,
- MemoryPool& pool) {
+ std::unique_ptr<BufferedOutputStream> createCompressor(CompressionKind kind,
+ OutputStream* outStream,
+ CompressionStrategy strategy,
+ uint64_t bufferCapacity,
+ uint64_t compressionBlockSize,
+ MemoryPool& pool, WriterMetrics* metrics) {
switch (static_cast<int64_t>(kind)) {
- case CompressionKind_NONE: {
- return std::unique_ptr<BufferedOutputStream>
- (new BufferedOutputStream(
- pool, outStream, bufferCapacity, compressionBlockSize));
- }
- case CompressionKind_ZLIB: {
- int level = (strategy == CompressionStrategy_SPEED) ?
- Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION;
- return std::unique_ptr<BufferedOutputStream>
- (new ZlibCompressionStream(
- outStream, level, bufferCapacity, compressionBlockSize, pool));
- }
- case CompressionKind_ZSTD: {
- int level = (strategy == CompressionStrategy_SPEED) ?
- 1 : ZSTD_CLEVEL_DEFAULT;
- return std::unique_ptr<BufferedOutputStream>
- (new ZSTDCompressionStream(
- outStream, level, bufferCapacity, compressionBlockSize, pool));
- }
- case CompressionKind_LZ4: {
- int level = (strategy == CompressionStrategy_SPEED) ?
- LZ4_ACCELERATION_MAX : LZ4_ACCELERATION_DEFAULT;
- return std::unique_ptr<BufferedOutputStream>
- (new Lz4CompressionSteam(
- outStream, level, bufferCapacity, compressionBlockSize, pool));
- }
- case CompressionKind_SNAPPY: {
- int level = 0;
- return std::unique_ptr<BufferedOutputStream>
- (new SnappyCompressionStream(
- outStream, level, bufferCapacity, compressionBlockSize, pool));
- }
- case CompressionKind_LZO:
- default:
- throw NotImplementedYet("compression codec");
+ case CompressionKind_NONE: {
+ return std::make_unique<BufferedOutputStream>(pool, outStream, bufferCapacity,
+ compressionBlockSize, metrics);
+ }
+ case CompressionKind_ZLIB: {
+ int level =
+ (strategy == CompressionStrategy_SPEED) ? Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION;
+ return std::make_unique<ZlibCompressionStream>(outStream, level, bufferCapacity,
+ compressionBlockSize, pool, metrics);
+ }
+ case CompressionKind_ZSTD: {
+ int level = (strategy == CompressionStrategy_SPEED) ? 1 : ZSTD_CLEVEL_DEFAULT;
+ return std::make_unique<ZSTDCompressionStream>(outStream, level, bufferCapacity,
+ compressionBlockSize, pool, metrics);
+ }
+ case CompressionKind_LZ4: {
+ int level = (strategy == CompressionStrategy_SPEED) ? LZ4_ACCELERATION_MAX
+ : LZ4_ACCELERATION_DEFAULT;
+ return std::make_unique<Lz4CompressionSteam>(outStream, level, bufferCapacity,
+ compressionBlockSize, pool, metrics);
+ }
+ case CompressionKind_SNAPPY: {
+ int level = 0;
+ return std::make_unique<SnappyCompressionStream>(outStream, level, bufferCapacity,
+ compressionBlockSize, pool, metrics);
+ }
+ case CompressionKind_LZO:
+ default:
+ throw NotImplementedYet("compression codec");
}
}
- std::unique_ptr<SeekableInputStream>
- createDecompressor(CompressionKind kind,
- std::unique_ptr<SeekableInputStream> input,
- uint64_t blockSize,
- MemoryPool& pool) {
+ std::unique_ptr<SeekableInputStream> createDecompressor(
+ CompressionKind kind, std::unique_ptr<SeekableInputStream> input, uint64_t blockSize,
+ MemoryPool& pool, ReaderMetrics* metrics) {
switch (static_cast<int64_t>(kind)) {
- case CompressionKind_NONE:
- return REDUNDANT_MOVE(input);
- case CompressionKind_ZLIB:
- return std::unique_ptr<SeekableInputStream>
- (new ZlibDecompressionStream(std::move(input), blockSize, pool));
- case CompressionKind_SNAPPY:
- return std::unique_ptr<SeekableInputStream>
- (new SnappyDecompressionStream(std::move(input), blockSize, pool));
- case CompressionKind_LZO:
- return std::unique_ptr<SeekableInputStream>
- (new LzoDecompressionStream(std::move(input), blockSize, pool));
- case CompressionKind_LZ4:
- return std::unique_ptr<SeekableInputStream>
- (new Lz4DecompressionStream(std::move(input), blockSize, pool));
- case CompressionKind_ZSTD:
- return std::unique_ptr<SeekableInputStream>
- (new ZSTDDecompressionStream(std::move(input), blockSize, pool));
- default: {
- std::ostringstream buffer;
- buffer << "Unknown compression codec " << kind;
- throw NotImplementedYet(buffer.str());
- }
+ case CompressionKind_NONE:
+ return input;
+ case CompressionKind_ZLIB:
+ return std::make_unique<ZlibDecompressionStream>(std::move(input), blockSize, pool,
+ metrics);
+ case CompressionKind_SNAPPY:
+ return std::make_unique<SnappyDecompressionStream>(std::move(input), blockSize, pool,
+ metrics);
+ case CompressionKind_LZO:
+ return std::make_unique<LzoDecompressionStream>(std::move(input), blockSize, pool, metrics);
+ case CompressionKind_LZ4:
+ return std::make_unique<Lz4DecompressionStream>(std::move(input), blockSize, pool, metrics);
+ case CompressionKind_ZSTD:
+ return std::make_unique<ZSTDDecompressionStream>(std::move(input), blockSize, pool,
+ metrics);
+ default: {
+ std::ostringstream buffer;
+ buffer << "Unknown compression codec " << kind;
+ throw NotImplementedYet(buffer.str());
+ }
}
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/Compression.hh b/contrib/libs/apache/orc/c++/src/Compression.hh
index ff79377d83..55b152dd63 100644
--- a/contrib/libs/apache/orc/c++/src/Compression.hh
+++ b/contrib/libs/apache/orc/c++/src/Compression.hh
@@ -30,12 +30,11 @@ namespace orc {
* @param input the input stream that is the underlying source
* @param bufferSize the maximum size of the buffer
* @param pool the memory pool
+ * @param metrics the reader metrics
*/
- std::unique_ptr<SeekableInputStream>
- createDecompressor(CompressionKind kind,
- std::unique_ptr<SeekableInputStream> input,
- uint64_t bufferSize,
- MemoryPool& pool);
+ std::unique_ptr<SeekableInputStream> createDecompressor(
+ CompressionKind kind, std::unique_ptr<SeekableInputStream> input, uint64_t bufferSize,
+ MemoryPool& pool, ReaderMetrics* metrics);
/**
* Create a compressor for the given compression kind.
@@ -46,13 +45,12 @@ namespace orc {
* @param compressionBlockSize compression buffer block size
* @param pool the memory pool
*/
- std::unique_ptr<BufferedOutputStream>
- createCompressor(CompressionKind kind,
- OutputStream * outStream,
- CompressionStrategy strategy,
- uint64_t bufferCapacity,
- uint64_t compressionBlockSize,
- MemoryPool& pool);
-}
+ std::unique_ptr<BufferedOutputStream> createCompressor(CompressionKind kind,
+ OutputStream* outStream,
+ CompressionStrategy strategy,
+ uint64_t bufferCapacity,
+ uint64_t compressionBlockSize,
+ MemoryPool& pool, WriterMetrics* metrics);
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc b/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc
new file mode 100644
index 0000000000..459cafa1a0
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc
@@ -0,0 +1,1001 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+ // Assume that we are using tight numeric vector batch
+ using BooleanVectorBatch = ByteVectorBatch;
+
+ ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+ StripeStreams& stripe, bool _throwOnOverflow)
+ : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) {
+ reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true,
+ /*throwOnOverflow=*/false, /*convertToReadType=*/false);  // wrapped reader is built from fileType, not the read type
+ data =
+ fileType.createRowBatch(0, memoryPool, /*encoded=*/false, /*useTightNumericVector=*/true);  // intermediate batch of the file type, created with size argument 0
+ }
+
+ // Reads the next values with the wrapped file-type reader into the intermediate
+ // batch and mirrors that batch's size and null information onto rowBatch. The value
+ // conversion itself is left to the subclasses.
+ void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
+   reader->next(*data, numValues, notNull);
+
+   rowBatch.resize(data->capacity);
+   rowBatch.numElements = data->numElements;
+   rowBatch.hasNulls = data->hasNulls;
+
+   const auto maskBytes = data->notNull.size();
+   if (rowBatch.hasNulls) {
+     // take over the null mask produced by the wrapped reader
+     memcpy(rowBatch.notNull.data(), data->notNull.data(), maskBytes);
+   } else {
+     // no nulls: mark every element as present
+     memset(rowBatch.notNull.data(), 1, maskBytes);
+   }
+ }
+
+ uint64_t ConvertColumnReader::skip(uint64_t numValues) {
+ return reader->skip(numValues);  // no conversion involved; forwarded to the wrapped reader
+ }
+
+ void ConvertColumnReader::seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
+ reader->seekToRowGroup(positions);  // forwarded unchanged to the wrapped reader
+ }
+
+ // Tells whether a double can be converted to int64_t: it must lie less than 1.0
+ // below -2^63 and strictly below 2^63. NaN fails both comparisons.
+ static inline bool canFitInLong(double value) {
+   constexpr double kLowestLong = -0x1p63;
+   constexpr double kOnePastHighestLong = 0x1p63;
+   const bool notTooSmall = (kLowestLong - value) < 1.0;
+   const bool notTooLarge = value < kOnePastHighestLong;
+   return notTooSmall && notTooLarge;
+ }
+
+ // Reports an overflow for element idx of dstBatch: throws SchemaEvolutionError naming
+ // both types when shouldThrow is set, otherwise marks the element as null.
+ template <typename FileType, typename ReadType>
+ static inline void handleOverflow(ColumnVectorBatch& dstBatch, uint64_t idx, bool shouldThrow) {
+   if (shouldThrow) {
+     std::ostringstream message;
+     message << "Overflow when convert from " << typeid(FileType).name() << " to "
+             << typeid(ReadType).name();
+     throw SchemaEvolutionError(message.str());
+   }
+   dstBatch.notNull.data()[idx] = 0;
+   dstBatch.hasNulls = true;
+ }
+
+ // Stores inputLong in dstValue narrowed to ReadType.
+ // Returns false if the narrowing changed the value (overflow), true otherwise.
+ template <typename ReadType>
+ static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
+   dstValue = static_cast<ReadType>(inputLong);
+   if constexpr (std::is_same<ReadType, int64_t>::value) {
+     // same width: nothing can be lost
+     return true;
+   } else {
+     // round trip back to 64 bits to detect truncation
+     return static_cast<int64_t>(dstValue) == inputLong;
+   }
+ }
+
+ // dynamic_cast of a batch pointer to DestBatchPtrType that throws InvalidArgument
+ // (naming the target batch type) instead of returning nullptr.
+ template <typename DestBatchPtrType>
+ static inline DestBatchPtrType SafeCastBatchTo(ColumnVectorBatch* batch) {
+   if (auto casted = dynamic_cast<DestBatchPtrType>(batch)) {
+     return casted;
+   }
+   using Pointee = typename std::remove_pointer<DestBatchPtrType>::type;
+   using BareBatch = typename std::remove_const<Pointee>::type;
+   std::ostringstream message;
+   message << "Bad cast when convert from ColumnVectorBatch to " << typeid(BareBatch).name();
+   throw InvalidArgument(message.str());
+ }
+
+ // Converts one numeric value from the file type to the read type and stores it in
+ // destValue. When the value cannot be represented by ReadType, element idx of
+ // destBatch is set to null, or SchemaEvolutionError is thrown if shouldThrow is true.
+ template <typename ReadType, typename FileType>
+ static inline void convertNumericElement(const FileType& srcValue, ReadType& destValue,
+                                          ColumnVectorBatch& destBatch, uint64_t idx,
+                                          bool shouldThrow) {
+   constexpr bool isFileTypeFloatingPoint(std::is_floating_point<FileType>::value);
+   constexpr bool isReadTypeFloatingPoint(std::is_floating_point<ReadType>::value);
+   if constexpr (isFileTypeFloatingPoint) {
+     if constexpr (isReadTypeFloatingPoint) {
+       destValue = static_cast<ReadType>(srcValue);
+     } else {
+       // Converting an out-of-range or NaN floating point value to int64_t is
+       // undefined behaviour, so the cast is only evaluated once the range check
+       // has passed (short-circuit evaluation of ||).
+       if (!canFitInLong(static_cast<double>(srcValue)) ||
+           !downCastToInteger(destValue, static_cast<int64_t>(srcValue))) {
+         handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+       }
+     }
+   } else {
+     if constexpr (isReadTypeFloatingPoint) {
+       destValue = static_cast<ReadType>(srcValue);
+       if (destValue != destValue) {  // check is NaN
+         handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+       }
+     } else {
+       if (!downCastToInteger(destValue, static_cast<int64_t>(srcValue))) {
+         handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+       }
+     }
+   }
+ }
+
+ // Numeric-to-numeric conversion:
+ // { boolean, byte, short, int, long, float, double } ->
+ // { byte, short, int, long, float, double }
+ template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType>
+ class NumericConvertColumnReader : public ConvertColumnReader {
+  public:
+   NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                              bool _throwOnOverflow)
+       : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+   // Reads the file-type values and converts each non-null element into rowBatch.
+   void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+     ConvertColumnReader::next(rowBatch, numValues, notNull);
+     const auto& source = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+     auto& target = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+
+     // Capture the null state up front: an overflow may set hasNulls while looping,
+     // which must not change which elements are visited.
+     const bool checkNulls = rowBatch.hasNulls;
+     const uint64_t count = rowBatch.numElements;
+     for (uint64_t row = 0; row < count; ++row) {
+       if (checkNulls && !rowBatch.notNull[row]) {
+         continue;
+       }
+       convertNumericElement<ReadType>(source.data[row], target.data[row], rowBatch, row,
+                                       throwOnOverflow);
+     }
+   }
+ };
+
+ // Numeric-to-boolean conversion:
+ // { boolean, byte, short, int, long, float, double } -> { boolean }
+ template <typename FileTypeBatch>
+ class NumericConvertColumnReader<FileTypeBatch, BooleanVectorBatch, bool>
+     : public ConvertColumnReader {
+  public:
+   NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                              bool _throwOnOverflow)
+       : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+   // Writes 0 when the source value cast to int64_t is zero and 1 otherwise.
+   void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+     ConvertColumnReader::next(rowBatch, numValues, notNull);
+     const auto& source = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+     auto& target = *SafeCastBatchTo<BooleanVectorBatch*>(&rowBatch);
+
+     const uint64_t count = rowBatch.numElements;
+     for (uint64_t row = 0; row < count; ++row) {
+       if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
+         continue;
+       }
+       const bool isZero = static_cast<int64_t>(source.data[row]) == 0;
+       target.data[row] = (isZero ? 0 : 1);
+     }
+   }
+ };
+
+ class ConvertToStringVariantColumnReader : public ConvertColumnReader {  // base for readers whose next() fills a StringVectorBatch
+ public:
+ ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+ StripeStreams& stripe, bool _throwOnOverflow)
+ : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
+
+ virtual uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;  // fills strBuffer; next() uses the returned value as the blob size
+
+ protected:
+ std::vector<std::string> strBuffer;  // per-row converted strings; cleared at the end of next()
+ };
+
+ void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
+ ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+ // cache the converted strings in strBuffer; the returned total is used as the blob size
+ auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+ // concatenate the string values into the blob buffer of the vector batch
+ auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+ dstBatch.blob.resize(totalLength);
+ char* blob = dstBatch.blob.data();
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+ const auto size = strBuffer[i].size();
+ ::memcpy(blob, strBuffer[i].c_str(), size);
+ dstBatch.data[i] = blob;  // the element points at its bytes inside the blob
+ dstBatch.length[i] = static_cast<int32_t>(size);
+ blob += size;  // advance to where the next value is written
+ }
+ }
+ strBuffer.clear();  // drop the cached strings now that they are copied into the blob
+ }
+
+ // Converts boolean file values to "TRUE"/"FALSE" strings (STRING, CHAR or VARCHAR).
+ class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+  public:
+   // Prepares the two output strings. For CHAR/VARCHAR a maximum length below 5 is
+   // rejected with SchemaEvolutionError; for CHAR both strings are space-padded to the
+   // maximum length.
+   BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                      StripeStreams& stripe, bool _throwOnOverflow)
+       : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+     trueValue = "TRUE";
+     falseValue = "FALSE";
+
+     const auto kind = readType.getKind();
+     if (kind != CHAR && kind != VARCHAR) {
+       return;  // no length limit to honour
+     }
+     const auto maxLength = readType.getMaximumLength();
+     if (maxLength < 5) {
+       throw SchemaEvolutionError("Invalid maximum length for boolean type: " +
+                                  std::to_string(maxLength));
+     }
+     if (kind == CHAR) {
+       trueValue.resize(maxLength, ' ');
+       falseValue.resize(maxLength, ' ');
+     }
+   }
+
+   uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+
+  private:
+   std::string trueValue;
+   std::string falseValue;
+ };
+
+ // Fills strBuffer with the prepared true/false string for every non-null row and
+ // returns the summed length of the strings written.
+ uint64_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                 uint64_t numValues) {
+   uint64_t totalLength = 0;
+   strBuffer.resize(numValues);
+   const auto& source = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+   for (uint64_t row = 0; row < numValues; ++row) {
+     if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
+       continue;
+     }
+     // cast the bool value to string
+     const std::string& text = source.data[row] ? trueValue : falseValue;
+     strBuffer[row] = text;
+     totalLength += strBuffer[row].size();
+   }
+   return totalLength;
+ }
+
+ template <typename FileTypeBatch>
+ class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {  // numeric batch of type FileTypeBatch -> STRING/VARCHAR/CHAR
+ public:
+ NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+ StripeStreams& stripe, bool _throwOnOverflow)
+ : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+ uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;  // defined out of line below
+ };
+
+ // Converts every non-null numeric value to text with std::to_string and caches it in
+ // strBuffer. For VARCHAR and CHAR a string longer than the maximum length is reported
+ // through handleOverflow; CHAR values are space-padded to the maximum length.
+ // Returns the summed length of the strings that were accepted.
+ // Throws SchemaEvolutionError if the read type is not STRING, VARCHAR or CHAR.
+ template <typename FileTypeBatch>
+ uint64_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+     ColumnVectorBatch& rowBatch, uint64_t numValues) {
+   uint64_t totalLength = 0;
+   strBuffer.resize(numValues);
+   const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+
+   const auto kind = readType.getKind();
+   if (kind != STRING && kind != VARCHAR && kind != CHAR) {
+     throw SchemaEvolutionError("Invalid type for numeric to string conversion: " +
+                                readType.toString());
+   }
+
+   // STRING is unbounded, so the maximum length is only queried for VARCHAR/CHAR.
+   const bool bounded = (kind != STRING);
+   const auto maxLength =
+       bounded ? readType.getMaximumLength() : decltype(readType.getMaximumLength()){0};
+
+   for (uint64_t i = 0; i < numValues; ++i) {
+     if (rowBatch.hasNulls && !rowBatch.notNull[i]) {
+       continue;
+     }
+     std::string& text = strBuffer[i];
+     text = std::to_string(srcBatch.data[i]);
+     if (bounded && text.size() > maxLength) {
+       handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, i, throwOnOverflow);
+       continue;
+     }
+     if (kind == CHAR) {
+       text.resize(maxLength, ' ');
+     }
+     totalLength += text.size();
+   }
+   return totalLength;
+ }
+
+ template <typename FileTypeBatch, typename ReadTypeBatch, bool isFloatingFileType>
+ class NumericToDecimalColumnReader : public ConvertColumnReader {  // numeric file values -> decimal batch of type ReadTypeBatch
+ public:
+ NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+ bool _throwOnOverflow)
+ : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+ precision = static_cast<int32_t>(readType.getPrecision());
+ scale = static_cast<int32_t>(readType.getScale());
+ bool overflow = false;
+ upperBound = scaleUpInt128ByPowerOfTen(1, precision, overflow);  // NOTE(review): neither upperBound nor overflow is read anywhere else in this class
+ }
+
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+ ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+ const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+ auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+ dstBatch.precision = precision;  // the output batch carries the read type's precision and scale
+ dstBatch.scale = scale;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+ if constexpr (isFloatingFileType) {  // conversion routine is selected at compile time
+ convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]);
+ } else {
+ convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]);
+ }
+ }
+ }
+ }
+
+ private:
+ template <typename SrcType>
+ void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, SrcType value) {
+ const auto result = convertDecimal(value, precision, scale);  // result.first is treated as the overflow flag, result.second as the converted value
+ Int128 i128 = result.second;
+ if (result.first) {
+ handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+ return;
+ }
+
+ if constexpr (std::is_same<ReadTypeBatch, Decimal64VectorBatch>::value) {
+ if (!i128.fitsInLong()) {  // a 64-bit decimal batch cannot hold a wider result
+ handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+ } else {
+ dstBatch.values[idx] = i128.toLong();
+ }
+ } else {
+ dstBatch.values[idx] = i128;
+ }
+ }
+
+ template <typename SrcType>
+ void convertIntegerToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, SrcType value) {
+ int fromScale = 0;  // the integer source is passed to convertDecimal with scale 0
+ auto result = convertDecimal(value, fromScale, precision, scale);
+ if (result.first) {
+ handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+ } else {
+ if constexpr (std::is_same<ReadTypeBatch, Decimal64VectorBatch>::value) {
+ if (!result.second.fitsInLong()) {  // a 64-bit decimal batch cannot hold a wider result
+ handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+ } else {
+ dstBatch.values[idx] = result.second.toLong();
+ }
+ } else {
+ dstBatch.values[idx] = result.second;
+ }
+ }
+ }
+
+ int32_t precision;
+ int32_t scale;
+ int64_t scaleMultiplier;  // NOTE(review): never initialized or used in this class; candidate for removal
+ Int128 upperBound;  // NOTE(review): assigned in the constructor only
+ };
+
+ class ConvertToTimestampColumnReader : public ConvertColumnReader {  // base for conversions whose read type is a timestamp kind
+ public:
+ ConvertToTimestampColumnReader(const Type& _readType, const Type& fileType,
+ StripeStreams& stripe, bool _throwOnOverflow)
+ : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow),
+ readerTimezone(readType.getKind() == TIMESTAMP_INSTANT ? &getTimezoneByName("GMT")
+ : &stripe.getReaderTimezone()),  // TIMESTAMP_INSTANT uses "GMT", other kinds the stripe's reader timezone
+ needConvertTimezone(readerTimezone != &getTimezoneByName("GMT")) {}  // address comparison; presumably getTimezoneByName returns the same object per name (TODO confirm)
+
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
+
+ protected:
+ const orc::Timezone* readerTimezone;
+ const bool needConvertTimezone;
+ };
+
+ // defined out of line to avoid emitting the vtable in every translation unit
+ void ConvertToTimestampColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
+ ConvertColumnReader::next(rowBatch, numValues, notNull);  // adds nothing to the base behaviour
+ }
+
+ // Converts numeric file values into a TimestampVectorBatch.
+ template <typename FileTypeBatch>
+ class NumericToTimestampColumnReader : public ConvertToTimestampColumnReader {
+  public:
+   NumericToTimestampColumnReader(const Type& _readType, const Type& fileType,
+                                  StripeStreams& stripe, bool _throwOnOverflow)
+       : ConvertToTimestampColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+   // Reads the file-type values and converts every non-null one to a timestamp.
+   void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+     ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull);
+
+     const auto& source = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+     auto& target = *SafeCastBatchTo<TimestampVectorBatch*>(&rowBatch);
+     for (uint64_t row = 0; row < numValues; ++row) {
+       if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
+         continue;
+       }
+       convertToTimestamp(target, row, source.data[row]);
+     }
+   }
+
+  private:
+   // Converts a single value; defined out of line.
+   template <typename FileType>
+   void convertToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx, FileType value);
+ };
+
+ // Converts one numeric value into element idx of dstBatch: the integral part goes to
+ // data[idx] and, for floating point input, the fractional part scaled by 1e9 goes to
+ // nanoseconds[idx] (normalised to be non-negative). Floating point values that do not
+ // fit into int64_t are reported through handleOverflow. When needConvertTimezone is
+ // set, data[idx] is finally passed through readerTimezone->convertFromUTC.
+ template <typename FileTypeBatch>
+ template <typename FileType>
+ void NumericToTimestampColumnReader<FileTypeBatch>::convertToTimestamp(
+     TimestampVectorBatch& dstBatch, uint64_t idx, FileType value) {
+   if constexpr (std::is_floating_point<FileType>::value) {
+     // INT64_MAX is not representable in float/double and rounds up to 2^63, so a
+     // "value > max" test lets 2^63 (and NaN) through to an undefined float-to-int
+     // cast. canFitInLong rejects NaN and everything outside [-2^63, 2^63).
+     if (!canFitInLong(static_cast<double>(value))) {
+       handleOverflow<FileType, int64_t>(dstBatch, idx, throwOnOverflow);
+       return;
+     }
+     dstBatch.data[idx] = static_cast<int64_t>(value);
+     dstBatch.nanoseconds[idx] = static_cast<int32_t>(
+         static_cast<double>(value - static_cast<FileType>(dstBatch.data[idx])) * 1e9);
+     if (dstBatch.nanoseconds[idx] < 0) {
+       // borrow one second so that the nanosecond part is non-negative
+       dstBatch.data[idx] -= 1;
+       dstBatch.nanoseconds[idx] += static_cast<int32_t>(1e9);
+     }
+   } else {
+     dstBatch.data[idx] = value;
+     dstBatch.nanoseconds[idx] = 0;
+   }
+   if (needConvertTimezone) {
+     dstBatch.data[idx] = readerTimezone->convertFromUTC(dstBatch.data[idx]);
+   }
+ }
+
+ // Converts decimal file values to an integer or floating point read type.
+ template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType>
+ class DecimalToNumericColumnReader : public ConvertColumnReader {
+  public:
+   DecimalToNumericColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                bool _throwOnOverflow)
+       : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+     precision = fileType.getPrecision();
+     scale = fileType.getScale();
+     // 10^scale is kept as a double: in a signed 64-bit integer the product overflows
+     // (undefined behaviour) once scale exceeds 18. Powers of ten up to 10^22 are
+     // exact in a double, so results for scale <= 18 are unchanged.
+     factor = 1.0;
+     for (int i = 0; i < scale; i++) {
+       factor *= 10.0;
+     }
+   }
+
+   // Reads the decimal values and converts every non-null element into rowBatch.
+   void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+     ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+     const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+     auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+     for (uint64_t i = 0; i < numValues; ++i) {
+       if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+         if constexpr (std::is_floating_point_v<ReadType>) {
+           convertDecimalToDouble(dstBatch, i, srcBatch);
+         } else {
+           convertDecimalToInteger(dstBatch, i, srcBatch);
+         }
+       }
+     }
+   }
+
+  private:
+   // Scales the unscaled value down by `scale` digits and narrows it to ReadType;
+   // values that do not fit are reported through handleOverflow.
+   void convertDecimalToInteger(ReadTypeBatch& dstBatch, uint64_t idx,
+                                const FileTypeBatch& srcBatch) {
+     using FileType = decltype(srcBatch.values[idx]);
+     Int128 result = scaleDownInt128ByPowerOfTen(srcBatch.values[idx], scale);
+     if (!result.fitsInLong()) {
+       handleOverflow<FileType, ReadType>(dstBatch, idx, throwOnOverflow);
+       return;
+     }
+     convertNumericElement<ReadType, int64_t>(result.toLong(), dstBatch.data[idx], dstBatch, idx,
+                                              throwOnOverflow);
+   }
+
+   // Divides the unscaled value (as a floating point number) by 10^scale.
+   void convertDecimalToDouble(ReadTypeBatch& dstBatch, uint64_t idx,
+                               const FileTypeBatch& srcBatch) {
+     double doubleValue = Int128(srcBatch.values[idx]).toDouble();
+     dstBatch.data[idx] = static_cast<ReadType>(doubleValue) / static_cast<ReadType>(factor);
+   }
+
+   int32_t precision;
+   int32_t scale;
+   double factor;  // 10^scale
+ };
+
+ // Decimal-to-boolean conversion: writes 0 for a zero decimal value and 1 otherwise.
+ template <typename FileTypeBatch>
+ class DecimalToNumericColumnReader<FileTypeBatch, BooleanVectorBatch, bool>
+     : public ConvertColumnReader {
+  public:
+   DecimalToNumericColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                bool _throwOnOverflow)
+       : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+   void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+     ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+     const auto& source = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+     auto& target = *SafeCastBatchTo<BooleanVectorBatch*>(&rowBatch);
+     for (uint64_t row = 0; row < numValues; ++row) {
+       if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
+         continue;
+       }
+       const bool isZero = (source.values[row] == 0);
+       target.data[row] = isZero ? 0 : 1;
+     }
+   }
+ };
+
+ // Reader that converts a decimal column into a decimal read type with a (possibly)
+ // different precision and scale. FileTypeBatch / ReadTypeBatch select the Decimal64 or
+ // Decimal128 batch representation of the source and destination.
+ template <typename FileTypeBatch, typename ReadTypeBatch>
+ class DecimalConvertColumnReader : public ConvertColumnReader {
+  public:
+   DecimalConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                              bool _throwOnOverflow)
+       : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+     fromPrecision = fileType.getPrecision();
+     fromScale = fileType.getScale();
+     toPrecision = _readType.getPrecision();
+     toScale = _readType.getScale();
+   }
+
+   // Lets the base class fill `data`, then converts every non-null row into rowBatch.
+   void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+     ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+     const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+     auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+     for (uint64_t i = 0; i < numValues; ++i) {
+       if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+         convertDecimalToDecimal(dstBatch, i, srcBatch);
+       }
+     }
+   }
+
+  private:
+   // Rescales srcBatch.values[idx] from fromScale to toPrecision/toScale and stores it in
+   // dstBatch.values[idx]. Values that overflow the target are passed to handleOverflow and
+   // are not stored.
+   void convertDecimalToDecimal(ReadTypeBatch& dstBatch, uint64_t idx,
+                                const FileTypeBatch& srcBatch) {
+     using FileType = decltype(srcBatch.values[idx]);
+     using ReadType = decltype(dstBatch.values[idx]);
+
+     auto [overflows, resultI128] =
+         convertDecimal(srcBatch.values[idx], fromScale, toPrecision, toScale);
+     if (overflows) {
+       handleOverflow<FileType, ReadType>(dstBatch, idx, throwOnOverflow);
+       // The element has been dealt with: do not store a value for it and do not report
+       // the same overflow a second time below (mirrors the decimal-to-integer path).
+       return;
+     }
+     if constexpr (std::is_same_v<ReadTypeBatch, Decimal64VectorBatch>) {
+       if (!resultI128.fitsInLong()) {
+         handleOverflow<FileType, ReadType>(dstBatch, idx, throwOnOverflow);
+       } else {
+         dstBatch.values[idx] = resultI128.toLong();
+       }
+     } else {
+       dstBatch.values[idx] = resultI128;
+     }
+   }
+
+   int32_t fromPrecision;
+   int32_t fromScale;
+   int32_t toPrecision;
+   int32_t toScale;
+ };
+
+#define DEFINE_NUMERIC_CONVERT_READER(FROM, TO, TYPE) \
+ using FROM##To##TO##ColumnReader = \
+ NumericConvertColumnReader<FROM##VectorBatch, TO##VectorBatch, TYPE>;  // e.g. (Byte, Int, int32_t) declares ByteToIntColumnReader
+
+#define DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(FROM, TO) \
+ using FROM##To##TO##ColumnReader = NumericToStringVariantColumnReader<FROM##VectorBatch>;  // TO only affects the alias name; the reader class is the same
+
+#define DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(FROM, IS_FROM_FLOATING) \
+ using FROM##To##Decimal64##ColumnReader = \
+ NumericToDecimalColumnReader<FROM##VectorBatch, Decimal64VectorBatch, IS_FROM_FLOATING>; \
+ using FROM##To##Decimal128##ColumnReader = \
+ NumericToDecimalColumnReader<FROM##VectorBatch, Decimal128VectorBatch, IS_FROM_FLOATING>;  // declares both the Decimal64 and Decimal128 target aliases
+
+#define DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(FROM) \
+ using FROM##ToTimestampColumnReader = NumericToTimestampColumnReader<FROM##VectorBatch>;  // e.g. (Int) declares IntToTimestampColumnReader
+
+#define DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(TO, TYPE) \
+ using Decimal64##To##TO##ColumnReader = \
+ DecimalToNumericColumnReader<Decimal64VectorBatch, TO##VectorBatch, TYPE>; \
+ using Decimal128##To##TO##ColumnReader = \
+ DecimalToNumericColumnReader<Decimal128VectorBatch, TO##VectorBatch, TYPE>;  // declares both the Decimal64 and Decimal128 source aliases
+
+#define DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER(TO) \
+ using Decimal64##To##TO##ColumnReader = \
+ DecimalConvertColumnReader<Decimal64VectorBatch, TO##VectorBatch>; \
+ using Decimal128##To##TO##ColumnReader = \
+ DecimalConvertColumnReader<Decimal128VectorBatch, TO##VectorBatch>;  // TO is Decimal64 or Decimal128
+
+ DEFINE_NUMERIC_CONVERT_READER(Boolean, Byte, int8_t)  // integer/boolean widening, float->double, then narrowing conversions
+ DEFINE_NUMERIC_CONVERT_READER(Boolean, Short, int16_t)
+ DEFINE_NUMERIC_CONVERT_READER(Boolean, Int, int32_t)
+ DEFINE_NUMERIC_CONVERT_READER(Boolean, Long, int64_t)
+ DEFINE_NUMERIC_CONVERT_READER(Byte, Short, int16_t)
+ DEFINE_NUMERIC_CONVERT_READER(Byte, Int, int32_t)
+ DEFINE_NUMERIC_CONVERT_READER(Byte, Long, int64_t)
+ DEFINE_NUMERIC_CONVERT_READER(Short, Int, int32_t)
+ DEFINE_NUMERIC_CONVERT_READER(Short, Long, int64_t)
+ DEFINE_NUMERIC_CONVERT_READER(Int, Long, int64_t)
+ DEFINE_NUMERIC_CONVERT_READER(Float, Double, double)
+ DEFINE_NUMERIC_CONVERT_READER(Byte, Boolean, bool)
+ DEFINE_NUMERIC_CONVERT_READER(Short, Boolean, bool)
+ DEFINE_NUMERIC_CONVERT_READER(Short, Byte, int8_t)
+ DEFINE_NUMERIC_CONVERT_READER(Int, Boolean, bool)
+ DEFINE_NUMERIC_CONVERT_READER(Int, Byte, int8_t)
+ DEFINE_NUMERIC_CONVERT_READER(Int, Short, int16_t)
+ DEFINE_NUMERIC_CONVERT_READER(Long, Boolean, bool)
+ DEFINE_NUMERIC_CONVERT_READER(Long, Byte, int8_t)
+ DEFINE_NUMERIC_CONVERT_READER(Long, Short, int16_t)
+ DEFINE_NUMERIC_CONVERT_READER(Long, Int, int32_t)
+ DEFINE_NUMERIC_CONVERT_READER(Double, Float, float)
+ // Floating point to integer/boolean
+ DEFINE_NUMERIC_CONVERT_READER(Float, Boolean, bool)
+ DEFINE_NUMERIC_CONVERT_READER(Float, Byte, int8_t)
+ DEFINE_NUMERIC_CONVERT_READER(Float, Short, int16_t)
+ DEFINE_NUMERIC_CONVERT_READER(Float, Int, int32_t)
+ DEFINE_NUMERIC_CONVERT_READER(Float, Long, int64_t)
+ DEFINE_NUMERIC_CONVERT_READER(Double, Boolean, bool)
+ DEFINE_NUMERIC_CONVERT_READER(Double, Byte, int8_t)
+ DEFINE_NUMERIC_CONVERT_READER(Double, Short, int16_t)
+ DEFINE_NUMERIC_CONVERT_READER(Double, Int, int32_t)
+ DEFINE_NUMERIC_CONVERT_READER(Double, Long, int64_t)
+ // Integer/boolean to floating point
+ DEFINE_NUMERIC_CONVERT_READER(Boolean, Float, float)
+ DEFINE_NUMERIC_CONVERT_READER(Byte, Float, float)
+ DEFINE_NUMERIC_CONVERT_READER(Short, Float, float)
+ DEFINE_NUMERIC_CONVERT_READER(Int, Float, float)
+ DEFINE_NUMERIC_CONVERT_READER(Long, Float, float)
+ DEFINE_NUMERIC_CONVERT_READER(Boolean, Double, double)
+ DEFINE_NUMERIC_CONVERT_READER(Byte, Double, double)
+ DEFINE_NUMERIC_CONVERT_READER(Short, Double, double)
+ DEFINE_NUMERIC_CONVERT_READER(Int, Double, double)
+ DEFINE_NUMERIC_CONVERT_READER(Long, Double, double)
+
+ // Numeric to String/Char/Varchar (all three targets share one reader class per source type)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, String)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, String)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, String)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, String)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, String)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, String)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Char)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Char)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Char)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Char)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Char)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Char)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Varchar)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Varchar)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Varchar)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Varchar)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Varchar)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Varchar)
+ using BooleanToStringColumnReader = BooleanToStringVariantColumnReader;  // Boolean sources use a dedicated reader class instead of the numeric template
+ using BooleanToCharColumnReader = BooleanToStringVariantColumnReader;
+ using BooleanToVarcharColumnReader = BooleanToStringVariantColumnReader;
+
+ // Numeric to Decimal (second argument: whether the source type is floating point)
+ DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Boolean, false)
+ DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Byte, false)
+ DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Short, false)
+ DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Int, false)
+ DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Long, false)
+ DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Float, true)
+ DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Double, true)
+
+ // Numeric to Timestamp (the same aliases also serve TIMESTAMP_INSTANT in buildConvertReader)
+ DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Boolean)
+ DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Byte)
+ DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Short)
+ DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Int)
+ DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Long)
+ DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Float)
+ DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Double)
+
+ // Decimal to Numeric (each line declares the Decimal64 and the Decimal128 source variant)
+ DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Boolean, bool)
+ DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Byte, int8_t)
+ DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Short, int16_t)
+ DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Int, int32_t)
+ DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Long, int64_t)
+ DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Float, float)
+ DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Double, double)
+
+ // Decimal to Decimal (all four Decimal64/Decimal128 source/target combinations)
+ DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER(Decimal64)
+ DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER(Decimal128)
+
+#define CREATE_READER(NAME) \
+ return std::make_unique<NAME>(_readType, fileType, stripe, throwOnOverflow);  // expands to a return statement using the locals/parameters of buildConvertReader
+
+#define CASE_CREATE_READER(TYPE, CONVERT) \
+ case TYPE: \
+ CREATE_READER(CONVERT##ColumnReader)  // e.g. (BYTE, BooleanToByte) returns a BooleanToByteColumnReader for case BYTE
+
+ const static int32_t MAX_PRECISION_64 = 18;  // upper precision bound checked by isDecimal64()
+
+ // Returns true when the precision of `type` lies in the range (0, MAX_PRECISION_64].
+ static inline bool isDecimal64(const Type& type) {
+   const auto precision = type.getPrecision();
+   if (precision > 0) {
+     return precision <= MAX_PRECISION_64;
+   }
+   return false;
+ }
+
+#define CASE_CREATE_FROM_DECIMAL_READER(TYPE, TO) \
+ case TYPE: { \
+ if (isDecimal64(fileType)) { \
+ CREATE_READER(Decimal64To##TO##ColumnReader) \
+ } else { \
+ CREATE_READER(Decimal128To##TO##ColumnReader) \
+ } \
+ }  // decimal source: the file type's precision selects the Decimal64 or Decimal128 reader
+
+#define CASE_CREATE_DECIMAL_READER(FROM) \
+ case DECIMAL: { \
+ if (isDecimal64(_readType)) { \
+ CREATE_READER(FROM##ToDecimal64ColumnReader) \
+ } else { \
+ CREATE_READER(FROM##ToDecimal128ColumnReader) \
+ } \
+ }  // decimal target: the read type's precision selects the Decimal64 or Decimal128 reader
+
+#define CASE_EXCEPTION \
+ default: \
+ throw SchemaEvolutionError("Cannot convert from " + fileType.toString() + " to " + \
+ _readType.toString());  // supplies the default label, so preceding case labels fall into the throw
+
+ // Creates the converting reader for a column whose on-file type differs from the read
+ // type obtained from the stripe's SchemaEvolution.
+ //
+ // fileType               type of the column as stored in the file
+ // stripe                 stripe streams; also provides the SchemaEvolution
+ // useTightNumericVector  must be true, otherwise SchemaEvolutionError is thrown
+ // throwOnOverflow        forwarded to the created reader
+ //
+ // Supported sources are the numeric types (BOOLEAN .. DOUBLE) and DECIMAL. Every other
+ // file type, and every unsupported target for a supported source, throws
+ // SchemaEvolutionError ("Cannot convert from ... to ...").
+ std::unique_ptr<ColumnReader> buildConvertReader(const Type& fileType, StripeStreams& stripe,
+                                                  bool useTightNumericVector,
+                                                  bool throwOnOverflow) {
+   if (!useTightNumericVector) {
+     throw SchemaEvolutionError(
+         "SchemaEvolution only support tight vector, please create ColumnVectorBatch with "
+         "option useTightNumericVector");
+   }
+   const auto& _readType = *stripe.getSchemaEvolution()->getReadType(fileType);
+
+   // Every inner switch either returns a reader or throws through CASE_EXCEPTION, so
+   // control never runs from one outer case body into the next.
+   switch (fileType.getKind()) {
+     case BOOLEAN: {
+       switch (_readType.getKind()) {
+         CASE_CREATE_READER(BYTE, BooleanToByte)
+         CASE_CREATE_READER(SHORT, BooleanToShort)
+         CASE_CREATE_READER(INT, BooleanToInt)
+         CASE_CREATE_READER(LONG, BooleanToLong)
+         CASE_CREATE_READER(FLOAT, BooleanToFloat)
+         CASE_CREATE_READER(DOUBLE, BooleanToDouble)
+         CASE_CREATE_READER(STRING, BooleanToString)
+         CASE_CREATE_READER(CHAR, BooleanToChar)
+         CASE_CREATE_READER(VARCHAR, BooleanToVarchar)
+         CASE_CREATE_DECIMAL_READER(Boolean)
+         CASE_CREATE_READER(TIMESTAMP, BooleanToTimestamp)
+         CASE_CREATE_READER(TIMESTAMP_INSTANT, BooleanToTimestamp)
+         case BOOLEAN:
+         case BINARY:
+         case LIST:
+         case MAP:
+         case STRUCT:
+         case UNION:
+         case DATE:
+           CASE_EXCEPTION
+       }
+     }
+     case BYTE: {
+       switch (_readType.getKind()) {
+         CASE_CREATE_READER(BOOLEAN, ByteToBoolean)
+         CASE_CREATE_READER(SHORT, ByteToShort)
+         CASE_CREATE_READER(INT, ByteToInt)
+         CASE_CREATE_READER(LONG, ByteToLong)
+         CASE_CREATE_READER(FLOAT, ByteToFloat)
+         CASE_CREATE_READER(DOUBLE, ByteToDouble)
+         CASE_CREATE_READER(STRING, ByteToString)
+         CASE_CREATE_READER(CHAR, ByteToChar)
+         CASE_CREATE_READER(VARCHAR, ByteToVarchar)
+         CASE_CREATE_DECIMAL_READER(Byte)
+         CASE_CREATE_READER(TIMESTAMP, ByteToTimestamp)
+         CASE_CREATE_READER(TIMESTAMP_INSTANT, ByteToTimestamp)
+         case BYTE:
+         case BINARY:
+         case LIST:
+         case MAP:
+         case STRUCT:
+         case UNION:
+         case DATE:
+           CASE_EXCEPTION
+       }
+     }
+     case SHORT: {
+       switch (_readType.getKind()) {
+         CASE_CREATE_READER(BOOLEAN, ShortToBoolean)
+         CASE_CREATE_READER(BYTE, ShortToByte)
+         CASE_CREATE_READER(INT, ShortToInt)
+         CASE_CREATE_READER(LONG, ShortToLong)
+         CASE_CREATE_READER(FLOAT, ShortToFloat)
+         CASE_CREATE_READER(DOUBLE, ShortToDouble)
+         CASE_CREATE_READER(STRING, ShortToString)
+         CASE_CREATE_READER(CHAR, ShortToChar)
+         CASE_CREATE_READER(VARCHAR, ShortToVarchar)
+         CASE_CREATE_DECIMAL_READER(Short)
+         CASE_CREATE_READER(TIMESTAMP, ShortToTimestamp)
+         CASE_CREATE_READER(TIMESTAMP_INSTANT, ShortToTimestamp)
+         case SHORT:
+         case BINARY:
+         case LIST:
+         case MAP:
+         case STRUCT:
+         case UNION:
+         case DATE:
+           CASE_EXCEPTION
+       }
+     }
+     case INT: {
+       switch (_readType.getKind()) {
+         CASE_CREATE_READER(BOOLEAN, IntToBoolean)
+         CASE_CREATE_READER(BYTE, IntToByte)
+         CASE_CREATE_READER(SHORT, IntToShort)
+         CASE_CREATE_READER(LONG, IntToLong)
+         CASE_CREATE_READER(FLOAT, IntToFloat)
+         CASE_CREATE_READER(DOUBLE, IntToDouble)
+         CASE_CREATE_READER(STRING, IntToString)
+         CASE_CREATE_READER(CHAR, IntToChar)
+         CASE_CREATE_READER(VARCHAR, IntToVarchar)
+         CASE_CREATE_DECIMAL_READER(Int)
+         CASE_CREATE_READER(TIMESTAMP, IntToTimestamp)
+         CASE_CREATE_READER(TIMESTAMP_INSTANT, IntToTimestamp)
+         case INT:
+         case BINARY:
+         case LIST:
+         case MAP:
+         case STRUCT:
+         case UNION:
+         case DATE:
+           CASE_EXCEPTION
+       }
+     }
+     case LONG: {
+       switch (_readType.getKind()) {
+         CASE_CREATE_READER(BOOLEAN, LongToBoolean)
+         CASE_CREATE_READER(BYTE, LongToByte)
+         CASE_CREATE_READER(SHORT, LongToShort)
+         CASE_CREATE_READER(INT, LongToInt)
+         CASE_CREATE_READER(FLOAT, LongToFloat)
+         CASE_CREATE_READER(DOUBLE, LongToDouble)
+         CASE_CREATE_READER(STRING, LongToString)
+         CASE_CREATE_READER(CHAR, LongToChar)
+         CASE_CREATE_READER(VARCHAR, LongToVarchar)
+         CASE_CREATE_DECIMAL_READER(Long)
+         CASE_CREATE_READER(TIMESTAMP, LongToTimestamp)
+         CASE_CREATE_READER(TIMESTAMP_INSTANT, LongToTimestamp)
+         case LONG:
+         case BINARY:
+         case LIST:
+         case MAP:
+         case STRUCT:
+         case UNION:
+         case DATE:
+           CASE_EXCEPTION
+       }
+     }
+     case FLOAT: {
+       switch (_readType.getKind()) {
+         CASE_CREATE_READER(BOOLEAN, FloatToBoolean)
+         CASE_CREATE_READER(BYTE, FloatToByte)
+         CASE_CREATE_READER(SHORT, FloatToShort)
+         CASE_CREATE_READER(INT, FloatToInt)
+         CASE_CREATE_READER(LONG, FloatToLong)
+         CASE_CREATE_READER(DOUBLE, FloatToDouble)
+         CASE_CREATE_READER(STRING, FloatToString)
+         CASE_CREATE_READER(CHAR, FloatToChar)
+         CASE_CREATE_READER(VARCHAR, FloatToVarchar)
+         CASE_CREATE_DECIMAL_READER(Float)
+         CASE_CREATE_READER(TIMESTAMP, FloatToTimestamp)
+         CASE_CREATE_READER(TIMESTAMP_INSTANT, FloatToTimestamp)
+         case FLOAT:
+         case BINARY:
+         case LIST:
+         case MAP:
+         case STRUCT:
+         case UNION:
+         case DATE:
+           CASE_EXCEPTION
+       }
+     }
+     case DOUBLE: {
+       switch (_readType.getKind()) {
+         CASE_CREATE_READER(BOOLEAN, DoubleToBoolean)
+         CASE_CREATE_READER(BYTE, DoubleToByte)
+         CASE_CREATE_READER(SHORT, DoubleToShort)
+         CASE_CREATE_READER(INT, DoubleToInt)
+         CASE_CREATE_READER(LONG, DoubleToLong)
+         CASE_CREATE_READER(FLOAT, DoubleToFloat)
+         CASE_CREATE_READER(STRING, DoubleToString)
+         CASE_CREATE_READER(CHAR, DoubleToChar)
+         CASE_CREATE_READER(VARCHAR, DoubleToVarchar)
+         CASE_CREATE_DECIMAL_READER(Double)
+         CASE_CREATE_READER(TIMESTAMP, DoubleToTimestamp)
+         CASE_CREATE_READER(TIMESTAMP_INSTANT, DoubleToTimestamp)
+         case DOUBLE:
+         case BINARY:
+         case LIST:
+         case MAP:
+         case STRUCT:
+         case UNION:
+         case DATE:
+           CASE_EXCEPTION
+       }
+     }
+     case DECIMAL: {
+       switch (_readType.getKind()) {
+         CASE_CREATE_FROM_DECIMAL_READER(BOOLEAN, Boolean)
+         CASE_CREATE_FROM_DECIMAL_READER(BYTE, Byte)
+         CASE_CREATE_FROM_DECIMAL_READER(SHORT, Short)
+         CASE_CREATE_FROM_DECIMAL_READER(INT, Int)
+         CASE_CREATE_FROM_DECIMAL_READER(LONG, Long)
+         CASE_CREATE_FROM_DECIMAL_READER(FLOAT, Float)
+         CASE_CREATE_FROM_DECIMAL_READER(DOUBLE, Double)
+         case DECIMAL: {
+           if (isDecimal64(fileType)) {
+             if (isDecimal64(_readType)) {
+               CREATE_READER(Decimal64ToDecimal64ColumnReader)
+             } else {
+               CREATE_READER(Decimal64ToDecimal128ColumnReader)
+             }
+           } else {
+             if (isDecimal64(_readType)) {
+               CREATE_READER(Decimal128ToDecimal64ColumnReader)
+             } else {
+               CREATE_READER(Decimal128ToDecimal128ColumnReader)
+             }
+           }
+         }
+         case STRING:
+         case CHAR:
+         case VARCHAR:
+         case TIMESTAMP:
+         case TIMESTAMP_INSTANT:
+         case BINARY:
+         case LIST:
+         case MAP:
+         case STRUCT:
+         case UNION:
+         case DATE:
+           CASE_EXCEPTION
+       }
+     }
+     // File types without any conversion support. These labels must stay below the DECIMAL
+     // case: placed above it they would fall through and be decoded as decimals.
+     case STRING:
+     case BINARY:
+     case TIMESTAMP:
+     case LIST:
+     case MAP:
+     case STRUCT:
+     case UNION:
+     case DATE:
+     case VARCHAR:
+     case CHAR:
+     case TIMESTAMP_INSTANT:
+       CASE_EXCEPTION
+   }
+ }
+
+// Remove every helper macro introduced above so none of them leaks past this file.
+#undef DEFINE_NUMERIC_CONVERT_READER
+#undef DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER
+#undef DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER
+#undef DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER
+#undef DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER
+#undef DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER
+#undef CASE_CREATE_FROM_DECIMAL_READER
+#undef CASE_CREATE_DECIMAL_READER
+#undef CASE_CREATE_READER
+#undef CREATE_READER
+#undef CASE_EXCEPTION
+
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/ConvertColumnReader.hh b/contrib/libs/apache/orc/c++/src/ConvertColumnReader.hh
new file mode 100644
index 0000000000..6ed4d0170d
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/ConvertColumnReader.hh
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_CONVERT_COLUMN_READER_HH
+#define ORC_CONVERT_COLUMN_READER_HH
+
+#include "ColumnReader.hh"
+#include "SchemaEvolution.hh"
+
+namespace orc {
+
+ class ConvertColumnReader : public ColumnReader {  // common base of the schema-evolution readers that convert file-typed values to the read type
+ public:
+ ConvertColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe,
+ bool throwOnOverflow);
+
+ // Subclasses override next() to implement the conversion; they call this base version first and then read the source values from `data`.
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
+
+ uint64_t skip(uint64_t numValues) override;
+
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
+
+ protected:
+ bool useTightNumericVector;
+ const Type& readType;  // held by reference: the Type passed to the constructor must outlive this reader
+ std::unique_ptr<ColumnReader> reader;  // presumably the reader for the on-file type; confirm in ConvertColumnReader.cc
+ std::unique_ptr<ColumnVectorBatch> data;  // source batch that subclasses cast to their file-type batch after the base next()
+ const bool throwOnOverflow;  // passed by subclasses to handleOverflow / convertNumericElement
+ };
+
+ std::unique_ptr<ColumnReader> buildConvertReader(const Type& fileType, StripeStreams& stripe,
+ bool useTightNumericVector,
+ bool throwOnOverflow);  // throws SchemaEvolutionError if useTightNumericVector is false or the conversion is unsupported
+
+} // namespace orc
+
+#endif // ORC_CONVERT_COLUMN_READER_HH
diff --git a/contrib/libs/apache/orc/c++/src/CpuInfoUtil.cc b/contrib/libs/apache/orc/c++/src/CpuInfoUtil.cc
new file mode 100644
index 0000000000..7e6958deef
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/CpuInfoUtil.cc
@@ -0,0 +1,589 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CpuInfoUtil.cc is from Apache Arrow as of 2023-03-21
+ */
+
+#include "CpuInfoUtil.hh"
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#endif
+
+#ifndef _MSC_VER
+#include <unistd.h>
+#endif
+
+#ifdef _WIN32
+#define NOMINMAX
+#include <Windows.h>
+#include <intrin.h>
+#endif
+
+#include <algorithm>
+#include <array>
+#include <bitset>
+#include <cstdint>
+#include <fstream>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "orc/Exceptions.hh"
+
+#undef CPUINFO_ARCH_X86
+#undef CPUINFO_ARCH_ARM
+#undef CPUINFO_ARCH_PPC
+
+#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#define CPUINFO_ARCH_X86
+#ifndef ORC_HAVE_RUNTIME_AVX512
+#define UNUSED(x) (void)(x)
+#endif
+#elif defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__)
+#define CPUINFO_ARCH_ARM
+#elif defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__)
+#define CPUINFO_ARCH_PPC
+#endif
+
+namespace orc {
+
+ namespace {
+
+ constexpr int kCacheLevels = static_cast<int>(CpuInfo::CacheLevel::Last) + 1;  // size of the per-level cache arrays; assumes CacheLevel enumerators are zero-based and contiguous (TODO confirm)
+
+ //============================== OS Dependent ==============================//
+
+#if defined(_WIN32)
+ //------------------------------ WINDOWS ------------------------------//
+ // Records, for each cache level reported by GetLogicalProcessorInformation, the largest
+ // cache size seen. Levels that are not reported keep their current value.
+ // Throws ParseError when the API cannot be located or queried; returns without changes
+ // when the result buffer cannot be allocated.
+ void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cache_sizes) {
+   PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = nullptr;
+   PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer_position = nullptr;
+   DWORD buffer_size = 0;
+   size_t offset = 0;
+   typedef BOOL(WINAPI * GetLogicalProcessorInformationFuncPointer)(void*, void*);
+   GetLogicalProcessorInformationFuncPointer func_pointer =
+       (GetLogicalProcessorInformationFuncPointer)GetProcAddress(
+           GetModuleHandle("kernel32"), "GetLogicalProcessorInformation");
+
+   if (!func_pointer) {
+     throw ParseError("Failed to find procedure GetLogicalProcessorInformation");
+   }
+
+   // Get buffer size: the probing call with a null buffer is expected to fail with
+   // ERROR_INSUFFICIENT_BUFFER after storing the required size. Any other outcome
+   // (unexpected success or a different error) means the size is not trustworthy.
+   if (func_pointer(buffer, &buffer_size) || GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+     throw ParseError("Failed to get size of processor information buffer");
+   }
+
+   buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(buffer_size);
+   if (!buffer) {
+     return;
+   }
+
+   if (!func_pointer(buffer, &buffer_size)) {
+     free(buffer);
+     throw ParseError("Failed to get processor information");
+   }
+
+   // Walk the returned records and keep the maximum size per cache level.
+   buffer_position = buffer;
+   while (offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= buffer_size) {
+     if (RelationCache == buffer_position->Relationship) {
+       PCACHE_DESCRIPTOR cache = &buffer_position->Cache;
+       if (cache->Level >= 1 && cache->Level <= kCacheLevels) {
+         const int64_t current = (*cache_sizes)[cache->Level - 1];
+         (*cache_sizes)[cache->Level - 1] = std::max<int64_t>(current, cache->Size);
+       }
+     }
+     offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+     buffer_position++;
+   }
+
+   free(buffer);
+ }
+
+#if defined(CPUINFO_ARCH_X86)
+ // On x86, get CPU features by cpuid, https://en.wikipedia.org/wiki/CPUID
+
+#if defined(__MINGW64_VERSION_MAJOR) && __MINGW64_VERSION_MAJOR < 5
+ void __cpuidex(int CPUInfo[4], int function_id, int subfunction_id) {  // local definition, compiled only for MinGW-w64 versions below 5: runs CPUID with EAX=function_id, ECX=subfunction_id
+ __asm__ __volatile__("cpuid"
+ : "=a"(CPUInfo[0]), "=b"(CPUInfo[1]), "=c"(CPUInfo[2]), "=d"(CPUInfo[3])  // outputs: EAX, EBX, ECX, EDX in that order
+ : "a"(function_id), "c"(subfunction_id));
+ }
+
+ int64_t _xgetbv(int xcr) {  // local definition, compiled only for MinGW-w64 versions below 5: executes XGETBV with ECX=xcr
+ int out = 0;
+ __asm__ __volatile__("xgetbv" : "=a"(out) : "c"(xcr) : "%edx");  // only EAX (the low 32 bits) is captured; EDX is declared clobbered and discarded
+ return out;
+ }
+#endif // MINGW
+
+ // Detects the CPU vendor, model name and SIMD-related feature bits with CPUID.
+ // Detected features are OR-ed into *hardware_flags; *vendor and *model_name are written
+ // only when the corresponding information is available.
+ void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor,
+                        std::string* model_name) {
+   int register_EAX_id = 1;
+   int highest_valid_id = 0;
+   int highest_extended_valid_id = 0;
+   std::bitset<32> features_ECX;
+   std::array<int, 4> cpu_info;
+
+   // Get highest valid id
+   __cpuid(cpu_info.data(), 0);
+   highest_valid_id = cpu_info[0];
+   // HEX of "GenuineIntel": 47656E75 696E6549 6E74656C
+   // HEX of "AuthenticAMD": 41757468 656E7469 63414D44
+   if (cpu_info[1] == 0x756e6547 && cpu_info[3] == 0x49656e69 && cpu_info[2] == 0x6c65746e) {
+     *vendor = CpuInfo::Vendor::Intel;
+   } else if (cpu_info[1] == 0x68747541 && cpu_info[3] == 0x69746e65 &&
+              cpu_info[2] == 0x444d4163) {
+     *vendor = CpuInfo::Vendor::AMD;
+   }
+
+   // A leaf is usable when it does not exceed the highest supported standard leaf, so
+   // leaf 1 is only unavailable when the maximum is below 1.
+   if (highest_valid_id < register_EAX_id) {
+     return;
+   }
+
+   // EAX=1: Processor Info and Feature Bits
+   __cpuidex(cpu_info.data(), register_EAX_id, 0);
+   features_ECX = cpu_info[2];
+
+   // Get highest extended id
+   __cpuid(cpu_info.data(), 0x80000000);
+   highest_extended_valid_id = cpu_info[0];
+
+   // Retrieve CPU model name
+   if (highest_extended_valid_id >= static_cast<int>(0x80000004)) {
+     model_name->clear();
+     for (int i = 0x80000002; i <= static_cast<int>(0x80000004); ++i) {
+       __cpuidex(cpu_info.data(), i, 0);
+       *model_name += std::string(reinterpret_cast<char*>(cpu_info.data()), sizeof(cpu_info));
+     }
+   }
+
+   bool zmm_enabled = false;
+   if (features_ECX[27]) {  // OSXSAVE
+     // Query if the OS supports saving ZMM registers when switching contexts
+     int64_t xcr0 = _xgetbv(0);
+     zmm_enabled = (xcr0 & 0xE0) == 0xE0;
+   }
+
+   if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3;
+   if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1;
+   if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2;
+   if (features_ECX[23]) *hardware_flags |= CpuInfo::POPCNT;
+   if (features_ECX[28]) *hardware_flags |= CpuInfo::AVX;
+
+   // cpuid with EAX=7, ECX=0: Extended Features (available when the maximum leaf is >= 7)
+   register_EAX_id = 7;
+   if (highest_valid_id >= register_EAX_id) {
+     __cpuidex(cpu_info.data(), register_EAX_id, 0);
+     std::bitset<32> features_EBX = cpu_info[1];
+
+     if (features_EBX[3]) *hardware_flags |= CpuInfo::BMI1;
+     if (features_EBX[5]) *hardware_flags |= CpuInfo::AVX2;
+     if (features_EBX[8]) *hardware_flags |= CpuInfo::BMI2;
+     // AVX512 bits are only reported when the OS saves the ZMM state.
+     if (zmm_enabled) {
+       if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F;
+       if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ;
+       if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD;
+       if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW;
+       if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL;
+     }
+   }
+ }
+
+#elif defined(CPUINFO_ARCH_ARM)
+ // Windows on Arm
+ void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor,
+                        std::string* model_name) {
+   // TODO: vendor, model_name
+   (void)vendor;
+   (void)model_name;
+   // ASIMD is reported unconditionally on this platform.
+   *hardware_flags = *hardware_flags | CpuInfo::ASIMD;
+ }
+#endif
+
+#elif defined(__APPLE__)
+ //------------------------------ MACOS ------------------------------//
+ // Reads the integer sysctl called `name`.
+ // Returns the value on success and std::nullopt when the lookup fails with ENOENT,
+ // EINVAL or ENOTSUP; any other errno raises ParseError.
+ std::optional<int64_t> IntegerSysCtlByName(const char* name) {
+   int64_t value = 0;
+   size_t value_size = sizeof(int64_t);
+   const int rc = sysctlbyname(name, &value, &value_size, nullptr, 0);
+   if (rc == 0) {
+     return value;
+   }
+   // ENOENT is the official errno value for non-existing sysctl's,
+   // but EINVAL and ENOTSUP have been seen in the wild.
+   const bool treat_as_missing = (errno == ENOENT || errno == EINVAL || errno == ENOTSUP);
+   if (treat_as_missing) {
+     return std::nullopt;
+   }
+   std::ostringstream message;
+   message << "sysctlbyname failed for '" << name << "'";
+   throw ParseError(message.str());
+ }
+
+ // Queries the L1 data, L2 and L3 cache sizes through sysctl and stores each value that is
+ // available in cache_sizes[0..2]; unavailable levels are left untouched.
+ void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cache_sizes) {
+   static_assert(kCacheLevels >= 3, "");
+   const char* const sysctl_names[] = {"hw.l1dcachesize", "hw.l2cachesize", "hw.l3cachesize"};
+   size_t level = 0;
+   for (const char* sysctl_name : sysctl_names) {
+     const std::optional<int64_t> reported = IntegerSysCtlByName(sysctl_name);
+     if (reported.has_value()) {
+       (*cache_sizes)[level] = *reported;
+     }
+     ++level;
+   }
+ }
+
+ // Sets the hardware flag(s) of every listed "hw.optional.*" sysctl that reports a non-zero
+ // value. Vendor and model name are not detected and are set to Unknown / "Unknown".
+ void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor,
+                        std::string* model_name) {
+   // Pairs a sysctl name with the CpuInfo flag bits it stands for.
+   struct SysCtlCpuFeature {
+     const char* name;
+     int64_t flag;
+   };
+   std::vector<SysCtlCpuFeature> known_features = {
+#if defined(CPUINFO_ARCH_X86)
+       {"hw.optional.sse4_2",
+        CpuInfo::SSSE3 | CpuInfo::SSE4_1 | CpuInfo::SSE4_2 | CpuInfo::POPCNT},
+       {"hw.optional.avx1_0", CpuInfo::AVX},
+       {"hw.optional.avx2_0", CpuInfo::AVX2},
+       {"hw.optional.bmi1", CpuInfo::BMI1},
+       {"hw.optional.bmi2", CpuInfo::BMI2},
+       {"hw.optional.avx512f", CpuInfo::AVX512F},
+       {"hw.optional.avx512cd", CpuInfo::AVX512CD},
+       {"hw.optional.avx512dq", CpuInfo::AVX512DQ},
+       {"hw.optional.avx512bw", CpuInfo::AVX512BW},
+       {"hw.optional.avx512vl", CpuInfo::AVX512VL},
+#elif defined(CPUINFO_ARCH_ARM)
+       // ARM64 (note that this is exposed under Rosetta as well)
+       {"hw.optional.neon", CpuInfo::ASIMD},
+#endif
+   };
+   for (const SysCtlCpuFeature& entry : known_features) {
+     const std::optional<int64_t> reported = IntegerSysCtlByName(entry.name);
+     if (reported.has_value() && *reported != 0) {
+       *hardware_flags |= entry.flag;
+     }
+   }
+
+   // TODO: vendor, model_name
+   *vendor = CpuInfo::Vendor::Unknown;
+   *model_name = "Unknown";
+ }
+
+#else
+ //------------------------------ LINUX ------------------------------//
+ // Returns the size in bytes of the cache selected by `level` (0 = L1 data, 1 = L2,
+ // 2 = L3), or 0 when it cannot be determined. sysconf() is consulted first where the
+ // platform provides the constants; sysfs is the fallback.
+ int64_t LinuxGetCacheSize(int level) {
+#ifdef _SC_LEVEL1_DCACHE_SIZE
+   // get cache size by sysconf()
+   const int sysconf_names[] = {
+       _SC_LEVEL1_DCACHE_SIZE,
+       _SC_LEVEL2_CACHE_SIZE,
+       _SC_LEVEL3_CACHE_SIZE,
+   };
+   static_assert(sizeof(sysconf_names) / sizeof(sysconf_names[0]) == kCacheLevels, "");
+
+   errno = 0;
+   const int64_t reported = sysconf(sysconf_names[level]);
+   if (errno == 0 && reported > 0) {
+     return reported;
+   }
+#endif
+
+   // get cache size from sysfs if sysconf() fails or not supported
+   const char* sysfs_paths[] = {
+       "/sys/devices/system/cpu/cpu0/cache/index0/size",  // l1d (index1 is l1i)
+       "/sys/devices/system/cpu/cpu0/cache/index2/size",  // l2
+       "/sys/devices/system/cpu/cpu0/cache/index3/size",  // l3
+   };
+   static_assert(sizeof(sysfs_paths) / sizeof(sysfs_paths[0]) == kCacheLevels, "");
+
+   std::ifstream sysfs_entry(sysfs_paths[level], std::ios::in);
+   if (!sysfs_entry) {
+     return 0;
+   }
+   // The entry is one line like: 65536, 64K, 1M, etc.
+   uint64_t amount = 0;
+   char suffix = '\0';
+   sysfs_entry >> amount >> suffix;
+   switch (suffix) {
+     case '\0':  // plain byte count
+       break;
+     case 'K':
+       amount <<= 10;
+       break;
+     case 'M':
+       amount <<= 20;
+       break;
+     case 'G':
+       amount <<= 30;
+       break;
+     default:  // unknown unit
+       return 0;
+   }
+   return static_cast<int64_t>(amount);
+ }
+
+ // Helper function to parse hardware flags from /proc/cpuinfo.
+ // `values` contains a whitespace-separated list of flag names. Each name is compared as a
+ // whole token, so a longer flag such as "avx2" is never mistaken for the shorter "avx".
+ // Returns a bitmap of the CpuInfo flags that are present.
+ int64_t LinuxParseCpuFlags(const std::string& values) {
+   const struct {
+     std::string name;
+     int64_t flag;
+   } flag_mappings[] = {
+#if defined(CPUINFO_ARCH_X86)
+       {"ssse3", CpuInfo::SSSE3},
+       {"sse4_1", CpuInfo::SSE4_1},
+       {"sse4_2", CpuInfo::SSE4_2},
+       {"popcnt", CpuInfo::POPCNT},
+       {"avx", CpuInfo::AVX},
+       {"avx2", CpuInfo::AVX2},
+       {"avx512f", CpuInfo::AVX512F},
+       {"avx512cd", CpuInfo::AVX512CD},
+       {"avx512vl", CpuInfo::AVX512VL},
+       {"avx512dq", CpuInfo::AVX512DQ},
+       {"avx512bw", CpuInfo::AVX512BW},
+       {"bmi1", CpuInfo::BMI1},
+       {"bmi2", CpuInfo::BMI2},
+#elif defined(CPUINFO_ARCH_ARM)
+       {"asimd", CpuInfo::ASIMD},
+#endif
+   };
+   const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
+
+   int64_t flags = 0;
+   std::istringstream tokens(values);
+   std::string token;
+   while (tokens >> token) {
+     for (int64_t i = 0; i < num_flags; ++i) {
+       if (token == flag_mappings[i].name) {
+         flags |= flag_mappings[i].flag;
+         break;  // flag names are unique, no need to look further
+       }
+     }
+   }
+   return flags;
+ }
+
+ // Stores every cache size LinuxGetCacheSize() could determine; entries for levels that
+ // report 0 (unknown) are left untouched.
+ void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cache_sizes) {
+   int level = 0;
+   for (int64_t& slot : *cache_sizes) {
+     const int64_t detected = LinuxGetCacheSize(level);
+     if (detected > 0) {
+       slot = detected;
+     }
+     ++level;
+   }
+ }
+
+ // True for the two blank characters recognised when trimming: space and horizontal tab.
+ static constexpr bool IsWhitespace(char c) {
+   return !(c != ' ' && c != '\t');
+ }
+
+ // Returns `value` without leading and trailing spaces/tabs (as defined by IsWhitespace).
+ // A string consisting only of such characters yields an empty string.
+ std::string TrimString(std::string value) {
+   size_t first = 0;
+   size_t last = value.size();  // one past the final character that is kept
+   while (first < last && IsWhitespace(value[first])) {
+     ++first;
+   }
+   while (last > first && IsWhitespace(value[last - 1])) {
+     --last;
+   }
+   return value.substr(first, last - first);
+ }
+
+ // Read from /proc/cpuinfo
+ // Every "key : value" line is inspected: "flags" / "Features" lines contribute hardware
+ // flags, "model name" sets *model_name and "vendor_id" sets *vendor for the known vendors.
+ // Outputs stay untouched for keys that never appear or when the file cannot be opened.
+ void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor,
+                        std::string* model_name) {
+   std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in);
+   std::string line;
+   while (std::getline(cpuinfo, line)) {
+     const size_t colon = line.find(':');
+     if (colon == std::string::npos) {
+       continue;
+     }
+     // Take everything before the colon and let TrimString() remove the padding. Cutting
+     // at colon - 1 would drop a key character when no padding precedes the colon and
+     // would wrap around when the colon is the first character.
+     const std::string name = TrimString(line.substr(0, colon));
+     const std::string value = TrimString(line.substr(colon + 1, std::string::npos));
+     if (name.compare("flags") == 0 || name.compare("Features") == 0) {
+       *hardware_flags |= LinuxParseCpuFlags(value);
+     } else if (name.compare("model name") == 0) {
+       *model_name = value;
+     } else if (name.compare("vendor_id") == 0) {
+       if (value.compare("GenuineIntel") == 0) {
+         *vendor = CpuInfo::Vendor::Intel;
+       } else if (value.compare("AuthenticAMD") == 0) {
+         *vendor = CpuInfo::Vendor::AMD;
+       }
+     }
+   }
+ }
+#endif // WINDOWS, MACOS, LINUX
+
+ //============================== Arch Dependent ==============================//
+
+#if defined(CPUINFO_ARCH_X86)
+ //------------------------------ X86_64 ------------------------------//
+ // Applies the user-requested SIMD level (already upper-cased by the caller
+ // in this file) to *hardware_flags.
+ //   "AVX512" - accepted, flags left as detected;
+ //   "NONE"   - accepted, all AVX512 bits are cleared;
+ //   other    - rejected (returns false), flags untouched.
+ bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) {
+   if (simd_level == "AVX512") {
+     return true;
+   }
+   if (simd_level == "NONE") {
+     *hardware_flags &= ~CpuInfo::AVX512;
+     return true;
+   }
+   return false;
+ }
+
+ // When the build enables ORC_HAVE_RUNTIME_AVX512, throws ParseError unless
+ // the full AVX512 flag set was detected on this CPU. Otherwise does nothing.
+ void ArchVerifyCpuRequirements(const CpuInfo* ci) {
+#if !defined(ORC_HAVE_RUNTIME_AVX512)
+   UNUSED(ci);
+#else
+   const bool hasAvx512 = ci->isDetected(CpuInfo::AVX512);
+   if (!hasAvx512) {
+     throw ParseError("CPU does not support the Supplemental AVX512 instruction set");
+   }
+#endif
+ }
+
+#elif defined(CPUINFO_ARCH_ARM)
+ //------------------------------ AARCH64 ------------------------------//
+ // Only "NONE" is a valid level here: it clears the ASIMD bit from
+ // *hardware_flags and returns true. Any other value is rejected with false.
+ bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) {
+   const bool disableSimd = (simd_level == "NONE");
+   if (disableSimd) {
+     *hardware_flags &= ~CpuInfo::ASIMD;
+   }
+   return disableSimd;
+ }
+
+ // Throws ParseError unless the ASIMD flag was detected on this CPU.
+ void ArchVerifyCpuRequirements(const CpuInfo* ci) {
+   if (ci->isDetected(CpuInfo::ASIMD)) {
+     return;
+   }
+   throw ParseError("CPU does not support the Armv8 Neon instruction set");
+ }
+
+#else
+ //------------------------------ PPC, ... ------------------------------//
+ // Fallback for architectures without user-selectable SIMD levels: every
+ // value is accepted and *hardware_flags is left unchanged.
+ bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) {
+   // Mark the parameters as used, as the x86 branch of this file does, so the
+   // stub does not trigger unused-parameter diagnostics.
+   UNUSED(simd_level);
+   UNUSED(hardware_flags);
+   return true;
+ }
+
+ // Fallback for architectures with no minimum CPU requirement: never throws.
+ void ArchVerifyCpuRequirements(const CpuInfo* ci) {
+   // Mark the parameter as used to avoid unused-parameter diagnostics.
+   UNUSED(ci);
+ }
+
+#endif // X86, ARM, PPC
+
+ } // namespace
+
+ // Private state behind CpuInfo. All probing happens once, in the constructor.
+ struct CpuInfo::Impl {
+   // Flags after the user's SIMD level has been applied.
+   int64_t hardware_flags = 0;
+   int numCores = 0;
+   // Flags exactly as detected, before any user override.
+   int64_t original_hardware_flags = 0;
+   Vendor vendor = Vendor::Unknown;
+   std::string model_name = "Unknown";
+   std::array<int64_t, kCacheLevels> cache_sizes{};
+
+   // Throws ParseError when ORC_USER_SIMD_LEVEL holds a value the
+   // architecture-specific parser rejects.
+   Impl() {
+     OsRetrieveCacheSize(&cache_sizes);
+     OsRetrieveCpuInfo(&hardware_flags, &vendor, &model_name);
+     original_hardware_flags = hardware_flags;
+
+     // hardware_concurrency() may report 0; never store fewer than one core.
+     const int reported = static_cast<int>(std::thread::hardware_concurrency());
+     numCores = reported < 1 ? 1 : reported;
+
+     // An unset ORC_USER_SIMD_LEVEL is treated as "NONE".
+     std::string userSimdLevel = "NONE";
+     if (const char* requested = std::getenv("ORC_USER_SIMD_LEVEL")) {
+       userSimdLevel = requested;
+     }
+     // Upper-case so the level is matched case-insensitively.
+     for (char& ch : userSimdLevel) {
+       ch = static_cast<char>(std::toupper(static_cast<unsigned char>(ch)));
+     }
+     if (!ArchParseUserSimdLevel(userSimdLevel, &hardware_flags)) {
+       throw ParseError("Invalid value for ORC_USER_SIMD_LEVEL: " + userSimdLevel);
+     }
+   }
+ };
+
+ // Defaulted here, where CpuInfo::Impl is a complete type, so that the
+ // std::unique_ptr<Impl> member can destroy it.
+ CpuInfo::~CpuInfo() = default;
+
+ // Runs all CPU probing through Impl's constructor; propagates the ParseError
+ // it throws for an invalid ORC_USER_SIMD_LEVEL.
+ CpuInfo::CpuInfo() : impl_(new Impl) {}
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wexit-time-destructors"
+#endif
+
+ // Returns the address of a function-local static CpuInfo, which is constructed
+ // the first time this function is called. The clang pragmas around this
+ // function silence -Wexit-time-destructors for that static.
+ const CpuInfo* CpuInfo::getInstance() {
+ static CpuInfo cpu_info;
+ return &cpu_info;
+ }
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+ // Returns the feature flags after the user's SIMD level has been applied
+ // (Impl::hardware_flags, not the originally detected set).
+ int64_t CpuInfo::hardwareFlags() const {
+ return impl_->hardware_flags;
+ }
+
+ // Returns the stored core count, never less than 1.
+ int CpuInfo::numCores() const {
+   const int cores = impl_->numCores;
+   return cores > 0 ? cores : 1;
+ }
+
+ // Returns the vendor recorded at construction (Vendor::Unknown by default).
+ CpuInfo::Vendor CpuInfo::vendor() const {
+ return impl_->vendor;
+ }
+
+ // Returns the model name recorded at construction ("Unknown" by default).
+ // The reference stays valid for as long as this CpuInfo exists.
+ const std::string& CpuInfo::modelName() const {
+ return impl_->model_name;
+ }
+
+ // Returns the cache size for `level`.
+ //  - A positive detected size is returned as is.
+ //  - Otherwise L1 falls back to its built-in default.
+ //  - Otherwise L2/L3 return the larger of their built-in default and the
+ //    detected size of the level directly below.
+ int64_t CpuInfo::cacheSize(CacheLevel level) const {
+   constexpr int64_t kFallbackSizes[] = {
+       32 * 1024,    // Level 1: 32K
+       256 * 1024,   // Level 2: 256K
+       3072 * 1024,  // Level 3: 3M
+   };
+   static_assert(sizeof(kFallbackSizes) / sizeof(kFallbackSizes[0]) == kCacheLevels, "");
+   static_assert(static_cast<int>(CacheLevel::L1) == 0, "");
+
+   const int idx = static_cast<int>(level);
+   const int64_t detected = impl_->cache_sizes[idx];
+   if (detected > 0) {
+     return detected;
+   }
+   if (idx == 0) {
+     return kFallbackSizes[0];
+   }
+   // l3 may be not available, return maximum of l2 or default size
+   return std::max(kFallbackSizes[idx], impl_->cache_sizes[idx - 1]);
+ }
+
+ // True when every bit of `flags` is set in the user-adjusted flag set.
+ bool CpuInfo::isSupported(int64_t flags) const {
+   const int64_t missing = flags & ~impl_->hardware_flags;
+   return missing == 0;
+ }
+
+ // True when every bit of `flags` is set in the flag set as originally
+ // detected, regardless of any user override.
+ bool CpuInfo::isDetected(int64_t flags) const {
+   const int64_t missing = flags & ~impl_->original_hardware_flags;
+   return missing == 0;
+ }
+
+ // Delegates to the architecture-specific check, which throws ParseError when
+ // a required instruction set was not detected.
+ void CpuInfo::verifyCpuRequirements() const {
+   ArchVerifyCpuRequirements(this);
+ }
+
+} // namespace orc
+
+#undef CPUINFO_ARCH_X86
+#undef CPUINFO_ARCH_ARM
+#undef CPUINFO_ARCH_PPC
diff --git a/contrib/libs/apache/orc/c++/src/CpuInfoUtil.hh b/contrib/libs/apache/orc/c++/src/CpuInfoUtil.hh
new file mode 100644
index 0000000000..5637053e6d
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/CpuInfoUtil.hh
@@ -0,0 +1,113 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CpuInfoUtil.hh is from Apache Arrow as of 2023-03-21
+ */
+
+#ifndef ORC_CPUINFOUTIL_HH
+#define ORC_CPUINFOUTIL_HH
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+namespace orc {
+
+ /**
+ * CpuInfo is an interface to query for cpu information at runtime. The caller can
+ * ask for the sizes of the caches and what hardware features are supported.
+ * On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and
+ * /sys/devices)
+ *
+ * Instances cannot be created directly (the constructor is private); use
+ * getInstance().
+ */
+ class CpuInfo {
+ public:
+ ~CpuInfo();
+
+ // x86 features (one bit each; AVX512 is the union of the five AVX512* bits)
+ static constexpr int64_t SSSE3 = (1LL << 0);
+ static constexpr int64_t SSE4_1 = (1LL << 1);
+ static constexpr int64_t SSE4_2 = (1LL << 2);
+ static constexpr int64_t POPCNT = (1LL << 3);
+ static constexpr int64_t AVX = (1LL << 4);
+ static constexpr int64_t AVX2 = (1LL << 5);
+ static constexpr int64_t AVX512F = (1LL << 6);
+ static constexpr int64_t AVX512CD = (1LL << 7);
+ static constexpr int64_t AVX512VL = (1LL << 8);
+ static constexpr int64_t AVX512DQ = (1LL << 9);
+ static constexpr int64_t AVX512BW = (1LL << 10);
+ static constexpr int64_t AVX512 = AVX512F | AVX512CD | AVX512VL | AVX512DQ | AVX512BW;
+ static constexpr int64_t BMI1 = (1LL << 11);
+ static constexpr int64_t BMI2 = (1LL << 12);
+
+ /// Arm features
+ static constexpr int64_t ASIMD = (1LL << 32);
+
+ // Cache enums for L1 (data), L2 and L3
+ enum class CacheLevel { L1 = 0, L2, L3, Last = L3 };
+
+ // CPU vendors
+ enum class Vendor { Unknown, Intel, AMD };
+
+ // Returns the process-wide CpuInfo instance.
+ static const CpuInfo* getInstance();
+
+ // Returns the flags for this cpu after any user override has been applied
+ // (see isSupported()); use isDetected() to query the unmodified set.
+ int64_t hardwareFlags() const;
+
+ // Returns the number of cores (including hyper-threaded) on this machine.
+ // The result is always at least 1.
+ int numCores() const;
+
+ // Returns the vendor of the cpu.
+ Vendor vendor() const;
+
+ // Returns the model name of the cpu (e.g. Intel i7-2600)
+ const std::string& modelName() const;
+
+ // Returns the size of the cache at this cache level.
+ // NOTE(review): this comment used to say "in KB", but the built-in fallback
+ // values in CpuInfo.cc (32 * 1024 for L1, 256 * 1024 for L2, 3072 * 1024 for
+ // L3) are byte counts - confirm that the detected sizes are bytes as well.
+ int64_t cacheSize(CacheLevel level) const;
+
+ /**
+ * Returns whether or not the given feature is enabled.
+ * isSupported() is true if isDetected() is also true and the feature
+ * wasn't disabled by the user (for example by setting the ORC_USER_SIMD_LEVEL
+ * environment variable). An unset ORC_USER_SIMD_LEVEL is treated as "NONE".
+ */
+ bool isSupported(int64_t flags) const;
+
+ // Returns whether or not the given feature is available on the CPU.
+ bool isDetected(int64_t flags) const;
+
+ // Determine if the CPU meets the minimum CPU requirements and if not, throw
+ // ParseError.
+ void verifyCpuRequirements() const;
+
+ bool hasEfficientBmi2() const {
+ // BMI2 (pext, pdep) is only efficient on Intel X86 processors.
+ return vendor() == Vendor::Intel && isSupported(BMI2);
+ }
+
+ private:
+ CpuInfo();
+
+ // Implementation details are hidden behind this pointer; Impl is defined in
+ // CpuInfo.cc.
+ struct Impl;
+ std::unique_ptr<Impl> impl_;
+ };
+
+} // namespace orc
+
+#endif
diff --git a/contrib/libs/apache/orc/c++/src/Dispatch.hh b/contrib/libs/apache/orc/c++/src/Dispatch.hh
new file mode 100644
index 0000000000..489317b28a
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/Dispatch.hh
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_DISPATCH_HH
+#define ORC_DISPATCH_HH
+
+#include <utility>
+#include <vector>
+
+#include "CpuInfoUtil.hh"
+
+namespace orc {
+ // Instruction-set tiers an implementation can be registered for. The numeric
+ // order matters: DynamicDispatch compares levels with >= to pick the most
+ // preferred supported implementation.
+ enum class DispatchLevel : int {
+ // These dispatch levels, corresponding to instruction set features,
+ // are sorted in increasing order of preference.
+ NONE = 0,
+ AVX512,
+ // Upper bound of the enum; DynamicDispatch treats it like AVX512.
+ MAX
+ };
+
+ /**
+ * A facility for dynamic dispatch according to available DispatchLevel.
+ *
+ * Typical use:
+ *
+ * static void my_function_default(...);
+ * static void my_function_avx512(...);
+ *
+ * struct MyDynamicFunction {
+ * using FunctionType = decltype(&my_function_default);
+ *
+ * static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
+ * return {
+ * { DispatchLevel::NONE, my_function_default }
+ * #if defined(ORC_HAVE_RUNTIME_AVX512)
+ * , { DispatchLevel::AVX512, my_function_avx512 }
+ * #endif
+ * };
+ * }
+ * };
+ *
+ * void my_function(...) {
+ * static DynamicDispatch<MyDynamicFunction> dispatch;
+ * return dispatch.func(...);
+ * }
+ */
+ // Selects, at construction, one implementation from
+ // DynamicFunction::implementations() and exposes it through `func`.
+ // NOTE(review): InvalidArgument is not declared by any header this file
+ // includes; including files apparently have to provide it - confirm.
+ template <typename DynamicFunction>
+ class DynamicDispatch {
+  protected:
+   using FunctionType = typename DynamicFunction::FunctionType;
+   using Implementation = std::pair<DispatchLevel, FunctionType>;
+
+  public:
+   // Throws InvalidArgument when no registered implementation is usable.
+   DynamicDispatch() {
+     Resolve(DynamicFunction::implementations());
+   }
+
+   // The selected implementation.
+   FunctionType func = {};
+
+  protected:
+   // Use the Implementation with the highest DispatchLevel that the CPU
+   // supports; among equal levels the last one listed wins.
+   void Resolve(const std::vector<Implementation>& implementations) {
+     DispatchLevel bestLevel = DispatchLevel::NONE;
+     FunctionType bestFunc = {};
+
+     for (const Implementation& candidate : implementations) {
+       // Skip lower levels first so the CPU is only queried when needed.
+       if (candidate.first < bestLevel || !levelSupported(candidate.first)) {
+         continue;
+       }
+       bestLevel = candidate.first;
+       bestFunc = candidate.second;
+     }
+
+     if (!bestFunc) {
+       throw InvalidArgument("No appropriate implementation found");
+     }
+     func = bestFunc;
+   }
+
+  private:
+   // NONE is always usable; AVX512 and MAX require CpuInfo to report AVX512
+   // as supported; any other value is rejected.
+   bool levelSupported(DispatchLevel level) const {
+     static const auto cpu_info = CpuInfo::getInstance();
+
+     if (level == DispatchLevel::NONE) {
+       return true;
+     }
+     if (level == DispatchLevel::AVX512 || level == DispatchLevel::MAX) {
+       return cpu_info->isSupported(CpuInfo::AVX512);
+     }
+     return false;
+   }
+ };
+} // namespace orc
+
+#endif
diff --git a/contrib/libs/apache/orc/c++/src/Exceptions.cc b/contrib/libs/apache/orc/c++/src/Exceptions.cc
index 2077b27df4..23703ff324 100644
--- a/contrib/libs/apache/orc/c++/src/Exceptions.cc
+++ b/contrib/libs/apache/orc/c++/src/Exceptions.cc
@@ -20,59 +20,68 @@
namespace orc {
- NotImplementedYet::NotImplementedYet(const std::string& what_arg
- ) : logic_error(what_arg) {
+ NotImplementedYet::NotImplementedYet(const std::string& what_arg) : logic_error(what_arg) {
// PASS
}
- NotImplementedYet::NotImplementedYet(const char* what_arg
- ) :logic_error(what_arg) {
+ NotImplementedYet::NotImplementedYet(const char* what_arg) : logic_error(what_arg) {
// PASS
}
- NotImplementedYet::NotImplementedYet(const NotImplementedYet& error
- ): logic_error(error) {
+ NotImplementedYet::NotImplementedYet(const NotImplementedYet& error) : logic_error(error) {
// PASS
}
- NotImplementedYet::~NotImplementedYet() ORC_NOEXCEPT {
+ NotImplementedYet::~NotImplementedYet() noexcept {
// PASS
}
- ParseError::ParseError(const std::string& what_arg
- ): runtime_error(what_arg) {
+ ParseError::ParseError(const std::string& what_arg) : runtime_error(what_arg) {
// PASS
}
- ParseError::ParseError(const char* what_arg
- ): runtime_error(what_arg) {
+ ParseError::ParseError(const char* what_arg) : runtime_error(what_arg) {
// PASS
}
- ParseError::ParseError(const ParseError& error): runtime_error(error) {
+ ParseError::ParseError(const ParseError& error) : runtime_error(error) {
// PASS
}
- ParseError::~ParseError() ORC_NOEXCEPT {
+ ParseError::~ParseError() noexcept {
// PASS
}
- InvalidArgument::InvalidArgument(const std::string& what_arg
- ): runtime_error(what_arg) {
+ InvalidArgument::InvalidArgument(const std::string& what_arg) : runtime_error(what_arg) {
// PASS
}
- InvalidArgument::InvalidArgument(const char* what_arg
- ): runtime_error(what_arg) {
+ InvalidArgument::InvalidArgument(const char* what_arg) : runtime_error(what_arg) {
// PASS
}
- InvalidArgument::InvalidArgument(const InvalidArgument& error
- ): runtime_error(error) {
+ InvalidArgument::InvalidArgument(const InvalidArgument& error) : runtime_error(error) {
// PASS
}
- InvalidArgument::~InvalidArgument() ORC_NOEXCEPT {
+ InvalidArgument::~InvalidArgument() noexcept {
// PASS
}
-}
+
+ // Out-of-line special members of SchemaEvolutionError. Each constructor only
+ // forwards its argument to the logic_error base.
+ SchemaEvolutionError::SchemaEvolutionError(const std::string& what_arg) : logic_error(what_arg) {
+ // PASS
+ }
+
+ SchemaEvolutionError::SchemaEvolutionError(const char* what_arg) : logic_error(what_arg) {
+ // PASS
+ }
+
+ SchemaEvolutionError::SchemaEvolutionError(const SchemaEvolutionError& error)
+ : logic_error(error) {
+ // PASS
+ }
+
+ SchemaEvolutionError::~SchemaEvolutionError() noexcept {
+ // PASS
+ }
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/Int128.cc b/contrib/libs/apache/orc/c++/src/Int128.cc
index 4ff500fbac..3c159f3775 100644
--- a/contrib/libs/apache/orc/c++/src/Int128.cc
+++ b/contrib/libs/apache/orc/c++/src/Int128.cc
@@ -45,7 +45,7 @@ namespace orc {
size_t group = std::min(static_cast<size_t>(18), length - posn);
int64_t chunk = std::stoll(str.substr(posn, group));
int64_t multiple = 1;
- for(size_t i=0; i < group; ++i) {
+ for (size_t i = 0; i < group; ++i) {
multiple *= 10;
}
*this *= multiple;
@@ -58,7 +58,7 @@ namespace orc {
}
}
- Int128& Int128::operator*=(const Int128 &right) {
+ Int128& Int128::operator*=(const Int128& right) {
const uint64_t INT_MASK = 0xffffffff;
const uint64_t CARRY_BIT = INT_MASK + 1;
@@ -100,7 +100,7 @@ namespace orc {
* @param wasNegative a flag for whether the value was original negative
* @result the output length of the array
*/
- int64_t Int128::fillInArray(uint32_t* array, bool &wasNegative) const {
+ int64_t Int128::fillInArray(uint32_t* array, bool& wasNegative) const {
uint64_t high;
uint64_t low;
if (highbits < 0) {
@@ -140,7 +140,6 @@ namespace orc {
}
}
-
/**
* Find last set bit in a 32 bit integer. Bit 1 is the LSB and bit 32 is
* the MSB. We can replace this with bsrq asm instruction on x64.
@@ -162,10 +161,10 @@ namespace orc {
*/
void shiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) {
if (length > 0 && bits != 0) {
- for(int64_t i=0; i < length-1; ++i) {
- array[i] = (array[i] << bits) | (array[i+1] >> (32 - bits));
+ for (int64_t i = 0; i < length - 1; ++i) {
+ array[i] = (array[i] << bits) | (array[i + 1] >> (32 - bits));
}
- array[length-1] <<= bits;
+ array[length - 1] <<= bits;
}
}
@@ -177,8 +176,8 @@ namespace orc {
*/
void shiftArrayRight(uint32_t* array, int64_t length, int64_t bits) {
if (length > 0 && bits != 0) {
- for(int64_t i=length-1; i > 0; --i) {
- array[i] = (array[i] >> bits) | (array[i-1] << (32 - bits));
+ for (int64_t i = length - 1; i > 0; --i) {
+ array[i] = (array[i] >> bits) | (array[i - 1] << (32 - bits));
}
array[0] >>= bits;
}
@@ -188,8 +187,8 @@ namespace orc {
* Fix the signs of the result and remainder at the end of the division
* based on the signs of the dividend and divisor.
*/
- void fixDivisionSigns(Int128 &result, Int128 &remainder,
- bool dividendWasNegative, bool divisorWasNegative) {
+ void fixDivisionSigns(Int128& result, Int128& remainder, bool dividendWasNegative,
+ bool divisorWasNegative) {
if (dividendWasNegative != divisorWasNegative) {
result.negate();
}
@@ -203,44 +202,42 @@ namespace orc {
*/
void buildFromArray(Int128& value, uint32_t* array, int64_t length) {
switch (length) {
- case 0:
- value = 0;
- break;
- case 1:
- value = array[0];
- break;
- case 2:
- value = Int128(0, (static_cast<uint64_t>(array[0]) << 32) + array[1]);
- break;
- case 3:
- value = Int128(array[0],
- (static_cast<uint64_t>(array[1]) << 32) + array[2]);
- break;
- case 4:
- value = Int128((static_cast<int64_t>(array[0]) << 32) + array[1],
- (static_cast<uint64_t>(array[2]) << 32) + array[3]);
- break;
- case 5:
- if (array[0] != 0) {
- throw std::logic_error("Can't build Int128 with 5 ints.");
- }
- value = Int128((static_cast<int64_t>(array[1]) << 32) + array[2],
- (static_cast<uint64_t>(array[3]) << 32) + array[4]);
- break;
- default:
- throw std::logic_error("Unsupported length for building Int128");
+ case 0:
+ value = 0;
+ break;
+ case 1:
+ value = array[0];
+ break;
+ case 2:
+ value = Int128(0, (static_cast<uint64_t>(array[0]) << 32) + array[1]);
+ break;
+ case 3:
+ value = Int128(array[0], (static_cast<uint64_t>(array[1]) << 32) + array[2]);
+ break;
+ case 4:
+ value = Int128((static_cast<int64_t>(array[0]) << 32) + array[1],
+ (static_cast<uint64_t>(array[2]) << 32) + array[3]);
+ break;
+ case 5:
+ if (array[0] != 0) {
+ throw std::logic_error("Can't build Int128 with 5 ints.");
+ }
+ value = Int128((static_cast<int64_t>(array[1]) << 32) + array[2],
+ (static_cast<uint64_t>(array[3]) << 32) + array[4]);
+ break;
+ default:
+ throw std::logic_error("Unsupported length for building Int128");
}
}
/**
* Do a division where the divisor fits into a single 32 bit value.
*/
- Int128 singleDivide(uint32_t* dividend, int64_t dividendLength,
- uint32_t divisor, Int128& remainder,
- bool dividendWasNegative, bool divisorWasNegative) {
+ Int128 singleDivide(uint32_t* dividend, int64_t dividendLength, uint32_t divisor,
+ Int128& remainder, bool dividendWasNegative, bool divisorWasNegative) {
uint64_t r = 0;
uint32_t resultArray[5];
- for(int64_t j=0; j < dividendLength; j++) {
+ for (int64_t j = 0; j < dividendLength; j++) {
r <<= 32;
r += dividend[j];
resultArray[j] = static_cast<uint32_t>(r / divisor);
@@ -249,12 +246,11 @@ namespace orc {
Int128 result;
buildFromArray(result, resultArray, dividendLength);
remainder = static_cast<int64_t>(r);
- fixDivisionSigns(result, remainder, dividendWasNegative,
- divisorWasNegative);
+ fixDivisionSigns(result, remainder, dividendWasNegative, divisorWasNegative);
return result;
}
- Int128 Int128::divide(const Int128 &divisor, Int128 &remainder) const {
+ Int128 Int128::divide(const Int128& divisor, Int128& remainder) const {
// Split the dividend and divisor into integer pieces so that we can
// work on them.
uint32_t dividendArray[5];
@@ -263,7 +259,7 @@ namespace orc {
bool divisorWasNegative;
// leave an extra zero before the dividend
dividendArray[0] = 0;
- int64_t dividendLength = fillInArray(dividendArray + 1, dividendWasNegative)+1;
+ int64_t dividendLength = fillInArray(dividendArray + 1, dividendWasNegative) + 1;
int64_t divisorLength = divisor.fillInArray(divisorArray, divisorWasNegative);
// Handle some of the easy cases.
@@ -273,8 +269,8 @@ namespace orc {
} else if (divisorLength == 0) {
throw std::range_error("Division by 0 in Int128");
} else if (divisorLength == 1) {
- return singleDivide(dividendArray, dividendLength, divisorArray[0],
- remainder, dividendWasNegative, divisorWasNegative);
+ return singleDivide(dividendArray, dividendLength, divisorArray[0], remainder,
+ dividendWasNegative, divisorWasNegative);
}
int64_t resultLength = dividendLength - divisorLength;
@@ -288,11 +284,10 @@ namespace orc {
shiftArrayLeft(dividendArray, dividendLength, normalizeBits);
// compute each digit in the result
- for(int64_t j=0; j < resultLength; ++j) {
+ for (int64_t j = 0; j < resultLength; ++j) {
// Guess the next digit. At worst it is two too large
uint32_t guess = UINT32_MAX;
- uint64_t highDividend = static_cast<uint64_t>(dividendArray[j]) << 32 |
- dividendArray[j+1];
+ uint64_t highDividend = static_cast<uint64_t>(dividendArray[j]) << 32 | dividendArray[j + 1];
if (dividendArray[j] != divisorArray[0]) {
guess = static_cast<uint32_t>(highDividend / divisorArray[0]);
}
@@ -300,10 +295,9 @@ namespace orc {
// catch all of the cases where guess is two too large and most of the
// cases where it is one too large
uint32_t rhat =
- static_cast<uint32_t>(highDividend - guess *
- static_cast<uint64_t>(divisorArray[0]));
+ static_cast<uint32_t>(highDividend - guess * static_cast<uint64_t>(divisorArray[0]));
while (static_cast<uint64_t>(divisorArray[1]) * guess >
- (static_cast<uint64_t>(rhat) << 32) + dividendArray[j+2]) {
+ (static_cast<uint64_t>(rhat) << 32) + dividendArray[j + 2]) {
guess -= 1;
rhat += divisorArray[0];
if (static_cast<uint64_t>(rhat) < divisorArray[0]) {
@@ -313,12 +307,12 @@ namespace orc {
// subtract off the guess * divisor from the dividend
uint64_t mult = 0;
- for(int64_t i=divisorLength-1; i >= 0; --i) {
+ for (int64_t i = divisorLength - 1; i >= 0; --i) {
mult += static_cast<uint64_t>(guess) * divisorArray[i];
- uint32_t prev = dividendArray[j+i+1];
- dividendArray[j+i+1] -= static_cast<uint32_t>(mult);
+ uint32_t prev = dividendArray[j + i + 1];
+ dividendArray[j + i + 1] -= static_cast<uint32_t>(mult);
mult >>= 32;
- if (dividendArray[j+i+1] > prev) {
+ if (dividendArray[j + i + 1] > prev) {
mult += 1;
}
}
@@ -329,10 +323,9 @@ namespace orc {
if (dividendArray[j] > prev) {
guess -= 1;
uint32_t carry = 0;
- for(int64_t i=divisorLength-1; i >= 0; --i) {
- uint64_t sum = static_cast<uint64_t>(divisorArray[i]) +
- dividendArray[j+i+1] + carry;
- dividendArray[j+i+1] = static_cast<uint32_t>(sum);
+ for (int64_t i = divisorLength - 1; i >= 0; --i) {
+ uint64_t sum = static_cast<uint64_t>(divisorArray[i]) + dividendArray[j + i + 1] + carry;
+ dividendArray[j + i + 1] = static_cast<uint32_t>(sum);
carry = static_cast<uint32_t>(sum >> 32);
}
dividendArray[j] += carry;
@@ -348,8 +341,7 @@ namespace orc {
Int128 result;
buildFromArray(result, resultArray, resultLength);
buildFromArray(remainder, dividendArray, dividendLength);
- fixDivisionSigns(result, remainder,
- dividendWasNegative, divisorWasNegative);
+ fixDivisionSigns(result, remainder, dividendWasNegative, divisorWasNegative);
return result;
}
@@ -400,8 +392,7 @@ namespace orc {
int32_t len = static_cast<int32_t>(str.length());
if (len - 1 > scale) {
result = str.substr(0, static_cast<size_t>(len - scale)) + "." +
- str.substr(static_cast<size_t>(len - scale),
- static_cast<size_t>(len));
+ str.substr(static_cast<size_t>(len - scale), static_cast<size_t>(len));
} else if (len - 1 == scale) {
result = "-0." + str.substr(1, std::string::npos);
} else {
@@ -415,8 +406,7 @@ namespace orc {
int32_t len = static_cast<int32_t>(str.length());
if (len > scale) {
result = str.substr(0, static_cast<size_t>(len - scale)) + "." +
- str.substr(static_cast<size_t>(len - scale),
- static_cast<size_t>(len));
+ str.substr(static_cast<size_t>(len - scale), static_cast<size_t>(len));
} else if (len == scale) {
result = "0." + str;
} else {
@@ -440,37 +430,41 @@ namespace orc {
std::string Int128::toHexString() const {
std::stringstream buf;
- buf << std::hex << "0x"
- << std::setw(16) << std::setfill('0') << highbits
- << std::setw(16) << std::setfill('0') << lowbits;
+ buf << std::hex << "0x" << std::setw(16) << std::setfill('0') << highbits << std::setw(16)
+ << std::setfill('0') << lowbits;
return buf.str();
}
+ // Converts this value to double. Values that fit in a long are converted
+ // directly; otherwise the result is lowbits + highbits * 2^64.
+ double Int128::toDouble() const {
+   return fitsInLong()
+              ? static_cast<double>(toLong())
+              : static_cast<double>(lowbits) + std::ldexp(static_cast<double>(highbits), 64);
+ }
+
const static int32_t MAX_PRECISION_64 = 18;
- const static int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1] =
- {1,
- 10,
- 100,
- 1000,
- 10000,
- 100000,
- 1000000,
- 10000000,
- 100000000,
- 1000000000,
- 10000000000,
- 100000000000,
- 1000000000000,
- 10000000000000,
- 100000000000000,
- 1000000000000000,
- 10000000000000000,
- 100000000000000000,
- 1000000000000000000};
-
- Int128 scaleUpInt128ByPowerOfTen(Int128 value,
- int32_t power,
- bool &overflow) {
+ const static int32_t MAX_PRECISION_128 = 38;
+ const static int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1] = {1,
+ 10,
+ 100,
+ 1000,
+ 10000,
+ 100000,
+ 1000000,
+ 10000000,
+ 100000000,
+ 1000000000,
+ 10000000000,
+ 100000000000,
+ 1000000000000,
+ 10000000000000,
+ 100000000000000,
+ 1000000000000000,
+ 10000000000000000,
+ 100000000000000000,
+ 1000000000000000000};
+
+ Int128 scaleUpInt128ByPowerOfTen(Int128 value, int32_t power, bool& overflow) {
overflow = false;
Int128 remainder;
@@ -479,7 +473,8 @@ namespace orc {
if (value > 0 && Int128::maximumValue().divide(POWERS_OF_TEN[step], remainder) < value) {
overflow = true;
return Int128::maximumValue();
- } else if (value < 0 && Int128::minimumValue().divide(POWERS_OF_TEN[step], remainder) > value) {
+ } else if (value < 0 &&
+ Int128::minimumValue().divide(POWERS_OF_TEN[step], remainder) > value) {
overflow = true;
return Int128::minimumValue();
}
@@ -501,4 +496,100 @@ namespace orc {
return value;
}
-}
+ /**
+ * Rescale an unscaled decimal value from fromScale to toScale and check that
+ * the result fits into toPrecision digits.
+ *
+ * @param value the unscaled input value
+ * @param fromScale the scale of the input value
+ * @param toPrecision the maximum number of digits of the result
+ * @param toScale the scale of the result
+ * @param round when reducing the scale, round half away from zero instead
+ * of truncating
+ * @return a pair of (overflow, value); the value is only meaningful when
+ * overflow is false
+ * @throws std::invalid_argument if the precision/scale arguments are out
+ * of range
+ */
+ std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale, int32_t toPrecision,
+                                        int32_t toScale, bool round) {
+   if (toPrecision > MAX_PRECISION_128 || toPrecision < 1 || toScale < 0 ||
+       toScale > toPrecision || fromScale < 0 ||
+       std::abs(fromScale - toScale) > MAX_PRECISION_128) {
+     std::stringstream buf;
+     buf << "Invalid argument: fromScale=" << fromScale << ", toPrecision=" << toPrecision
+         << ", toScale=" << toScale;
+     throw std::invalid_argument(buf.str());
+   }
+   // Work on the magnitude; the sign is restored at the end.
+   std::pair<bool, Int128> result;
+   bool negative = value < 0;
+   result.second = value.abs();
+   result.first = false;
+
+   // Exclusive bound: toPrecision digits can hold at most 10^toPrecision - 1.
+   Int128 upperBound = scaleUpInt128ByPowerOfTen(1, toPrecision, result.first);
+   int8_t roundOffset = 0;
+   int32_t deltaScale = fromScale - toScale;
+
+   if (deltaScale > 0) {
+     // Dropping digits: divide, then decide whether to round up.
+     Int128 scale = scaleUpInt128ByPowerOfTen(1, deltaScale, result.first);
+     Int128 remainder;
+     result.second = result.second.divide(scale, remainder);
+     remainder *= 2;
+     if (round && remainder >= scale) {
+       // One will be added below, so tighten the bound by one up front.
+       upperBound -= 1;
+       roundOffset = 1;
+     }
+   } else if (deltaScale < 0) {
+     // Already too large before scaling up.
+     if (result.second >= upperBound) {
+       result.first = true;
+       return result;
+     }
+     result.second = scaleUpInt128ByPowerOfTen(result.second, -deltaScale, result.first);
+   }
+
+   // A magnitude equal to the bound needs toPrecision + 1 digits, so it must
+   // be reported as overflow too (hence >=, matching the floating-point
+   // overload of convertDecimal).
+   if (result.second >= upperBound) {
+     result.first = true;
+     return result;
+   }
+
+   result.second += roundOffset;
+   if (negative) {
+     result.second *= -1;
+   }
+   return result;
+ }
+
+ /**
+ * Convert a floating point value to an unscaled decimal with the given
+ * precision and scale, rounding the digits beyond the scale.
+ *
+ * @param value the floating point input
+ * @param precision the maximum number of digits of the result
+ * @param scale the number of fractional digits of the result
+ * @return a pair of (overflow, value); overflow is true for invalid
+ * precision/scale, NaN, magnitudes of 2^127 or more, and results
+ * that do not fit into the precision
+ */
+ template <typename T>
+ std::enable_if_t<std::is_floating_point_v<T>, std::pair<bool, Int128>> convertDecimal(
+     T value, int32_t precision, int32_t scale) {
+   const static T upperbound = std::ldexp(static_cast<T>(1), 127);
+   const static T lowerbound = -upperbound;
+
+   std::pair<bool, Int128> result = {false, 0};
+   if (precision > MAX_PRECISION_128 || precision < 1 || scale > precision || scale < 0) {
+     result.first = true;
+     return result;
+   }
+
+   if (std::isnan(value) || value <= lowerbound || value >= upperbound) {
+     result.first = true;
+     return result;
+   }
+
+   // Split the magnitude into an integral part (i128) and a fraction (value).
+   bool isNegative = (value < 0);
+   Int128 i128;
+   value = std::fabs(value);
+   if (value >= std::ldexp(static_cast<T>(1.0), 64)) {
+     int64_t hi = static_cast<int64_t>(std::ldexp(value, -64));
+     uint64_t lo = static_cast<uint64_t>(value - std::ldexp(static_cast<T>(hi), 64));
+     i128 = Int128(hi, lo);
+   } else {
+     i128 = Int128(0, static_cast<uint64_t>(value));
+   }
+   value = value - std::floor(value);
+
+   bool overflow = false;
+   i128 = scaleUpInt128ByPowerOfTen(i128, scale, overflow);
+   if (overflow) {
+     result.first = true;
+     return result;
+   }
+   // Exclusive bound: precision digits can hold at most 10^precision - 1.
+   bool boundOverflow = false;
+   const Int128 bound = scaleUpInt128ByPowerOfTen(1, precision, boundOverflow);
+   if (i128 >= bound) {
+     result.first = true;
+     return result;
+   }
+
+   value = value * static_cast<T>(pow(10, scale));
+   i128 += static_cast<int64_t>(std::round(value));
+   // Rounding the fraction can carry into a new leading digit (e.g. 99.999
+   // at scale 2 becomes 10000), so the bound has to be checked again.
+   if (i128 >= bound) {
+     result.first = true;
+     return result;
+   }
+   if (isNegative) {
+     i128 = i128.negate();
+   }
+   result.second = i128;
+   return result;
+ }
+
+ template std::pair<bool, Int128> convertDecimal(float value, int32_t precision, int32_t scale);
+
+ template std::pair<bool, Int128> convertDecimal(double value, int32_t precision, int32_t scale);
+
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc b/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc
index 21bf194fed..f494f4b651 100644
--- a/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc
+++ b/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc
@@ -1,15 +1,20 @@
/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
*/
#include "Adaptor.hh"
@@ -24,8 +29,8 @@ namespace orc {
static const int32_t DEC_64_TABLE[] = {0, 0, 0, -1, 0, 1, 2, 3};
static const int32_t SIZE_OF_SHORT = 2;
- static const int32_t SIZE_OF_INT = 4;
- static const int32_t SIZE_OF_LONG = 8;
+ static const int32_t SIZE_OF_INT = 4;
+ static const int32_t SIZE_OF_LONG = 8;
static std::string toHex(uint64_t val) {
std::ostringstream out;
@@ -39,45 +44,37 @@ namespace orc {
return out.str();
}
- class MalformedInputException: public ParseError {
- public:
- MalformedInputException(int64_t off
- ) :ParseError("MalformedInputException at " +
- toString(off)) {
- }
+ class MalformedInputException : public ParseError {
+ public:
+ MalformedInputException(int64_t off)
+ : ParseError("MalformedInputException at " + toString(off)) {}
- MalformedInputException(int64_t off, const std::string& msg
- ): ParseError("MalformedInputException " + msg +
- " at " + toString(off)) {
- }
+ MalformedInputException(int64_t off, const std::string& msg)
+ : ParseError("MalformedInputException " + msg + " at " + toString(off)) {}
- MalformedInputException(const MalformedInputException& other
- ): ParseError(other.what()) {
- }
+ MalformedInputException(const MalformedInputException& other) : ParseError(other.what()) {}
- virtual ~MalformedInputException() noexcept;
+ ~MalformedInputException() noexcept override;
};
MalformedInputException::~MalformedInputException() noexcept {
// PASS
}
- uint64_t lzoDecompress(const char *inputAddress,
- const char *inputLimit,
- char *outputAddress,
- char *outputLimit) {
+ uint64_t lzoDecompress(const char* inputAddress, const char* inputLimit, char* outputAddress,
+ char* outputLimit) {
// nothing compresses to nothing
if (inputAddress == inputLimit) {
return 0;
}
// maximum offset in buffers to which it's safe to write long-at-a-time
- char * const fastOutputLimit = outputLimit - SIZE_OF_LONG;
+ char* const fastOutputLimit = outputLimit - SIZE_OF_LONG;
// LZO can concat two blocks together so, decode until the input data is
// consumed
- const char *input = inputAddress;
- char *output = outputAddress;
+ const char* input = inputAddress;
+ char* output = outputAddress;
while (input < inputLimit) {
//
// Note: For safety some of the code below may stop decoding early or
@@ -127,8 +124,7 @@ namespace orc {
literalLength = 0xf;
uint32_t nextByte = 0;
- while (input < inputLimit &&
- (nextByte = *(input++) & 0xFF) == 0) {
+ while (input < inputLimit && (nextByte = *(input++) & 0xFF) == 0) {
literalLength += 0xff;
}
literalLength += nextByte;
@@ -191,8 +187,7 @@ namespace orc {
matchLength = 0x7;
int32_t nextByte = 0;
- while (input < inputLimit &&
- (nextByte = *(input++) & 0xFF) == 0) {
+ while (input < inputLimit && (nextByte = *(input++) & 0xFF) == 0) {
matchLength += 0xff;
}
matchLength += nextByte;
@@ -231,8 +226,7 @@ namespace orc {
matchLength = 0x1f;
int nextByte = 0;
- while (input < inputLimit &&
- (nextByte = *(input++) & 0xFF) == 0) {
+ while (input < inputLimit && (nextByte = *(input++) & 0xFF) == 0) {
matchLength += 0xff;
}
matchLength += nextByte;
@@ -276,8 +270,7 @@ namespace orc {
literalLength = (command & 0x3);
} else {
throw MalformedInputException(input - inputAddress - 1,
- "Invalid LZO command " +
- toHex(command));
+ "Invalid LZO command " + toHex(command));
}
firstCommand = false;
@@ -286,12 +279,11 @@ namespace orc {
// lzo encodes match offset minus one
matchOffset++;
- char *matchAddress = output - matchOffset;
- if (matchAddress < outputAddress ||
- output + matchLength > outputLimit) {
+ char* matchAddress = output - matchOffset;
+ if (matchAddress < outputAddress || output + matchLength > outputLimit) {
throw MalformedInputException(input - inputAddress);
}
- char *matchOutputLimit = output + matchLength;
+ char* matchOutputLimit = output + matchLength;
if (output > fastOutputLimit) {
// slow match copy
@@ -343,11 +335,11 @@ namespace orc {
}
}
}
- output = matchOutputLimit; // correction in case we over-copied
+ output = matchOutputLimit; // correction in case we over-copied
}
// copy literal
- char *literalOutputLimit = output + literalLength;
+ char* literalOutputLimit = output + literalLength;
if (literalOutputLimit > fastOutputLimit ||
input + literalLength > inputLimit - SIZE_OF_LONG) {
if (literalOutputLimit > outputLimit) {
@@ -373,8 +365,7 @@ namespace orc {
lastLiteralLength = literalLength;
}
- if (input + SIZE_OF_SHORT > inputLimit &&
- *reinterpret_cast<const int16_t*>(input) != 0) {
+ if (input + SIZE_OF_SHORT > inputLimit && *reinterpret_cast<const int16_t*>(input) != 0) {
throw MalformedInputException(input - inputAddress);
}
input += SIZE_OF_SHORT;
@@ -383,4 +374,4 @@ namespace orc {
return static_cast<uint64_t>(output - outputAddress);
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh b/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh
index 9de8537dd8..a37ce8e582 100644
--- a/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh
+++ b/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh
@@ -33,10 +33,8 @@ namespace orc {
* @param outputLimit one past the last byte of the output buffer
* @result the number of bytes decompressed
*/
- uint64_t lzoDecompress(const char *inputAddress,
- const char *inputLimit,
- char *outputAddress,
- char *outputLimit);
-}
+ uint64_t lzoDecompress(const char* inputAddress, const char* inputLimit, char* outputAddress,
+ char* outputLimit);
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/src/MemoryPool.cc b/contrib/libs/apache/orc/c++/src/MemoryPool.cc
index ecfb295bae..8c8837aa64 100644
--- a/contrib/libs/apache/orc/c++/src/MemoryPool.cc
+++ b/contrib/libs/apache/orc/c++/src/MemoryPool.cc
@@ -16,14 +16,14 @@
* limitations under the License.
*/
-#include "orc/Int128.hh"
#include "orc/MemoryPool.hh"
+#include "orc/Int128.hh"
#include "Adaptor.hh"
+#include <string.h>
#include <cstdlib>
#include <iostream>
-#include <string.h>
namespace orc {
@@ -31,8 +31,8 @@ namespace orc {
// PASS
}
- class MemoryPoolImpl: public MemoryPool {
- public:
+ class MemoryPoolImpl : public MemoryPool {
+ public:
virtual ~MemoryPoolImpl() override;
char* malloc(uint64_t size) override;
@@ -52,30 +52,26 @@ namespace orc {
}
template <class T>
- DataBuffer<T>::DataBuffer(MemoryPool& pool,
- uint64_t newSize
- ): memoryPool(pool),
- buf(nullptr),
- currentSize(0),
- currentCapacity(0) {
- resize(newSize);
+ DataBuffer<T>::DataBuffer(MemoryPool& pool, uint64_t newSize)
+ : memoryPool(pool), buf(nullptr), currentSize(0), currentCapacity(0) {
+ reserve(newSize);
+ currentSize = newSize;
}
template <class T>
- DataBuffer<T>::DataBuffer(DataBuffer<T>&& buffer
- ) noexcept:
- memoryPool(buffer.memoryPool),
- buf(buffer.buf),
- currentSize(buffer.currentSize),
- currentCapacity(buffer.currentCapacity) {
+ DataBuffer<T>::DataBuffer(DataBuffer<T>&& buffer) noexcept
+ : memoryPool(buffer.memoryPool),
+ buf(buffer.buf),
+ currentSize(buffer.currentSize),
+ currentCapacity(buffer.currentCapacity) {
buffer.buf = nullptr;
buffer.currentSize = 0;
buffer.currentCapacity = 0;
}
template <class T>
- DataBuffer<T>::~DataBuffer(){
- for(uint64_t i=currentSize; i > 0; --i) {
+ DataBuffer<T>::~DataBuffer() {
+ for (uint64_t i = currentSize; i > 0; --i) {
(buf + i - 1)->~T();
}
if (buf) {
@@ -87,11 +83,11 @@ namespace orc {
void DataBuffer<T>::resize(uint64_t newSize) {
reserve(newSize);
if (currentSize > newSize) {
- for(uint64_t i=currentSize; i > newSize; --i) {
+ for (uint64_t i = currentSize; i > newSize; --i) {
(buf + i - 1)->~T();
}
} else if (newSize > currentSize) {
- for(uint64_t i=currentSize; i < newSize; ++i) {
+ for (uint64_t i = currentSize; i < newSize; ++i) {
new (buf + i) T();
}
}
@@ -99,7 +95,7 @@ namespace orc {
}
template <class T>
- void DataBuffer<T>::reserve(uint64_t newCapacity){
+ void DataBuffer<T>::reserve(uint64_t newCapacity) {
if (newCapacity > currentCapacity || !buf) {
if (buf) {
T* buf_old = buf;
@@ -113,10 +109,23 @@ namespace orc {
}
}
+ template <class T>
+ void DataBuffer<T>::zeroOut() {
+ memset(buf, 0, sizeof(T) * currentCapacity);
+ }
+
+ // Specializations for Int128
+ template <>
+ void DataBuffer<Int128>::zeroOut() {
+ for (uint64_t i = 0; i < currentCapacity; ++i) {
+ new (buf + i) Int128();
+ }
+ }
+
// Specializations for char
template <>
- DataBuffer<char>::~DataBuffer(){
+ DataBuffer<char>::~DataBuffer() {
if (buf) {
memoryPool.free(reinterpret_cast<char*>(buf));
}
@@ -134,7 +143,7 @@ namespace orc {
// Specializations for char*
template <>
- DataBuffer<char*>::~DataBuffer(){
+ DataBuffer<char*>::~DataBuffer() {
if (buf) {
memoryPool.free(reinterpret_cast<char*>(buf));
}
@@ -152,7 +161,7 @@ namespace orc {
// Specializations for double
template <>
- DataBuffer<double>::~DataBuffer(){
+ DataBuffer<double>::~DataBuffer() {
if (buf) {
memoryPool.free(reinterpret_cast<char*>(buf));
}
@@ -167,10 +176,28 @@ namespace orc {
currentSize = newSize;
}
+ // Specializations for float
+
+ template <>
+ DataBuffer<float>::~DataBuffer() {
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<float>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(float));
+ }
+ currentSize = newSize;
+ }
+
// Specializations for int64_t
template <>
- DataBuffer<int64_t>::~DataBuffer(){
+ DataBuffer<int64_t>::~DataBuffer() {
if (buf) {
memoryPool.free(reinterpret_cast<char*>(buf));
}
@@ -185,10 +212,64 @@ namespace orc {
currentSize = newSize;
}
+ // Specializations for int32_t
+
+ template <>
+ DataBuffer<int32_t>::~DataBuffer() {
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<int32_t>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int32_t));
+ }
+ currentSize = newSize;
+ }
+
+ // Specializations for int16_t
+
+ template <>
+ DataBuffer<int16_t>::~DataBuffer() {
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<int16_t>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int16_t));
+ }
+ currentSize = newSize;
+ }
+
+ // Specializations for int8_t
+
+ template <>
+ DataBuffer<int8_t>::~DataBuffer() {
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<int8_t>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int8_t));
+ }
+ currentSize = newSize;
+ }
+
// Specializations for uint64_t
template <>
- DataBuffer<uint64_t>::~DataBuffer(){
+ DataBuffer<uint64_t>::~DataBuffer() {
if (buf) {
memoryPool.free(reinterpret_cast<char*>(buf));
}
@@ -206,7 +287,7 @@ namespace orc {
// Specializations for unsigned char
template <>
- DataBuffer<unsigned char>::~DataBuffer(){
+ DataBuffer<unsigned char>::~DataBuffer() {
if (buf) {
memoryPool.free(reinterpret_cast<char*>(buf));
}
@@ -221,24 +302,28 @@ namespace orc {
currentSize = newSize;
}
- #ifdef __clang__
- #pragma clang diagnostic ignored "-Wweak-template-vtables"
- #endif
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wweak-template-vtables"
+#endif
template class DataBuffer<char>;
template class DataBuffer<char*>;
template class DataBuffer<double>;
+ template class DataBuffer<float>;
template class DataBuffer<Int128>;
template class DataBuffer<int64_t>;
+ template class DataBuffer<int32_t>;
+ template class DataBuffer<int16_t>;
+ template class DataBuffer<int8_t>;
template class DataBuffer<uint64_t>;
template class DataBuffer<unsigned char>;
- #ifdef __clang__
- #pragma clang diagnostic ignored "-Wexit-time-destructors"
- #endif
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wexit-time-destructors"
+#endif
MemoryPool* getDefaultPool() {
static MemoryPoolImpl internal;
return &internal;
}
-} // namespace orc
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/Murmur3.cc b/contrib/libs/apache/orc/c++/src/Murmur3.cc
index b45bd6d492..518e5e6de5 100644
--- a/contrib/libs/apache/orc/c++/src/Murmur3.cc
+++ b/contrib/libs/apache/orc/c++/src/Murmur3.cc
@@ -16,14 +16,14 @@
* limitations under the License.
*/
-#include "Adaptor.hh"
#include "Murmur3.hh"
+#include "Adaptor.hh"
#define ROTL64(x, r) ((x << r) | (x >> (64 - r)))
namespace orc {
- inline uint64_t rotl64 ( uint64_t x, int8_t r ) {
+ inline uint64_t rotl64(uint64_t x, int8_t r) {
return (x << r) | (x >> (64 - r));
}
@@ -36,17 +36,17 @@ namespace orc {
return value;
}
- uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len) {
+ uint64_t Murmur3::hash64(const uint8_t* data, uint32_t len) {
return hash64(data, len, DEFAULT_SEED);
}
DIAGNOSTIC_PUSH
#if defined(__clang__)
- DIAGNOSTIC_IGNORE("-Wimplicit-fallthrough")
+ DIAGNOSTIC_IGNORE("-Wimplicit-fallthrough")
#endif
- uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len, uint32_t seed) {
+ uint64_t Murmur3::hash64(const uint8_t* data, uint32_t len, uint32_t seed) {
uint64_t h = seed;
uint32_t blocks = len >> 3;
@@ -69,16 +69,22 @@ namespace orc {
switch (len - idx) {
case 7:
k ^= static_cast<uint64_t>(data[idx + 6]) << 48;
+ [[fallthrough]];
case 6:
k ^= static_cast<uint64_t>(data[idx + 5]) << 40;
+ [[fallthrough]];
case 5:
k ^= static_cast<uint64_t>(data[idx + 4]) << 32;
+ [[fallthrough]];
case 4:
k ^= static_cast<uint64_t>(data[idx + 3]) << 24;
+ [[fallthrough]];
case 3:
k ^= static_cast<uint64_t>(data[idx + 2]) << 16;
+ [[fallthrough]];
case 2:
k ^= static_cast<uint64_t>(data[idx + 1]) << 8;
+ [[fallthrough]];
case 1:
k ^= static_cast<uint64_t>(data[idx + 0]);
@@ -95,4 +101,4 @@ namespace orc {
DIAGNOSTIC_POP
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/Murmur3.hh b/contrib/libs/apache/orc/c++/src/Murmur3.hh
index 02391811b0..e3db8654bf 100644
--- a/contrib/libs/apache/orc/c++/src/Murmur3.hh
+++ b/contrib/libs/apache/orc/c++/src/Murmur3.hh
@@ -24,17 +24,17 @@
namespace orc {
class Murmur3 {
- public:
+ public:
static const uint32_t DEFAULT_SEED = 104729;
static const uint64_t NULL_HASHCODE = 2862933555777941757LL;
- static uint64_t hash64(const uint8_t *data, uint32_t len);
+ static uint64_t hash64(const uint8_t* data, uint32_t len);
- private:
+ private:
static uint64_t fmix64(uint64_t value);
static uint64_t hash64(const uint8_t* data, uint32_t len, uint32_t seed);
};
-}
+} // namespace orc
-#endif //ORC_MURMUR3_HH
+#endif // ORC_MURMUR3_HH
diff --git a/contrib/libs/apache/orc/c++/src/Options.hh b/contrib/libs/apache/orc/c++/src/Options.hh
index d8331b3c0a..51cd8efd64 100644
--- a/contrib/libs/apache/orc/c++/src/Options.hh
+++ b/contrib/libs/apache/orc/c++/src/Options.hh
@@ -34,31 +34,30 @@ namespace orc {
ColumnSelection_TYPE_IDS = 3,
};
-/**
- * ReaderOptions Implementation
- */
+ /**
+ * ReaderOptions Implementation
+ */
struct ReaderOptionsPrivate {
uint64_t tailLocation;
std::ostream* errorStream;
MemoryPool* memoryPool;
std::string serializedTail;
+ ReaderMetrics* metrics;
ReaderOptionsPrivate() {
tailLocation = std::numeric_limits<uint64_t>::max();
errorStream = &std::cerr;
memoryPool = getDefaultPool();
+ metrics = nullptr;
}
};
- ReaderOptions::ReaderOptions():
- privateBits(std::unique_ptr<ReaderOptionsPrivate>
- (new ReaderOptionsPrivate())) {
+ ReaderOptions::ReaderOptions() : privateBits(std::make_unique<ReaderOptionsPrivate>()) {
// PASS
}
- ReaderOptions::ReaderOptions(const ReaderOptions& rhs):
- privateBits(std::unique_ptr<ReaderOptionsPrivate>
- (new ReaderOptionsPrivate(*(rhs.privateBits.get())))) {
+ ReaderOptions::ReaderOptions(const ReaderOptions& rhs)
+ : privateBits(std::make_unique<ReaderOptionsPrivate>(*(rhs.privateBits.get()))) {
// PASS
}
@@ -83,10 +82,19 @@ namespace orc {
return *this;
}
- MemoryPool* ReaderOptions::getMemoryPool() const{
+ MemoryPool* ReaderOptions::getMemoryPool() const {
return privateBits->memoryPool;
}
+ ReaderOptions& ReaderOptions::setReaderMetrics(ReaderMetrics* metrics) {
+ privateBits->metrics = metrics;
+ return *this;
+ }
+
+ ReaderMetrics* ReaderOptions::getReaderMetrics() const {
+ return privateBits->metrics;
+ }
+
ReaderOptions& ReaderOptions::setTailLocation(uint64_t offset) {
privateBits->tailLocation = offset;
return *this;
@@ -96,8 +104,7 @@ namespace orc {
return privateBits->tailLocation;
}
- ReaderOptions& ReaderOptions::setSerializedFileTail(const std::string& value
- ) {
+ ReaderOptions& ReaderOptions::setSerializedFileTail(const std::string& value) {
privateBits->serializedTail = value;
return *this;
}
@@ -115,9 +122,9 @@ namespace orc {
return privateBits->errorStream;
}
-/**
- * RowReaderOptions Implementation
- */
+ /**
+ * RowReaderOptions Implementation
+ */
struct RowReaderOptionsPrivate {
ColumnSelection selection;
@@ -131,6 +138,9 @@ namespace orc {
std::shared_ptr<SearchArgument> sargs;
std::string readerTimezone;
RowReaderOptions::IdReadIntentMap idReadIntentMap;
+ bool useTightNumericVector;
+ std::shared_ptr<Type> readType;
+ bool throwOnSchemaEvolutionOverflow;
RowReaderOptionsPrivate() {
selection = ColumnSelection_NONE;
@@ -140,18 +150,17 @@ namespace orc {
forcedScaleOnHive11Decimal = 6;
enableLazyDecoding = false;
readerTimezone = "GMT";
+ useTightNumericVector = false;
+ throwOnSchemaEvolutionOverflow = false;
}
};
- RowReaderOptions::RowReaderOptions():
- privateBits(std::unique_ptr<RowReaderOptionsPrivate>
- (new RowReaderOptionsPrivate())) {
+ RowReaderOptions::RowReaderOptions() : privateBits(std::make_unique<RowReaderOptionsPrivate>()) {
// PASS
}
- RowReaderOptions::RowReaderOptions(const RowReaderOptions& rhs):
- privateBits(std::unique_ptr<RowReaderOptionsPrivate>
- (new RowReaderOptionsPrivate(*(rhs.privateBits.get())))) {
+ RowReaderOptions::RowReaderOptions(const RowReaderOptions& rhs)
+ : privateBits(std::make_unique<RowReaderOptionsPrivate>(*(rhs.privateBits.get()))) {
// PASS
}
@@ -195,8 +204,8 @@ namespace orc {
return *this;
}
- RowReaderOptions&
- RowReaderOptions::includeTypesWithIntents(const IdReadIntentMap& idReadIntentMap) {
+ RowReaderOptions& RowReaderOptions::includeTypesWithIntents(
+ const IdReadIntentMap& idReadIntentMap) {
privateBits->selection = ColumnSelection_TYPE_IDS;
privateBits->includedColumnIndexes.clear();
privateBits->idReadIntentMap.clear();
@@ -242,7 +251,7 @@ namespace orc {
return privateBits->dataLength;
}
- RowReaderOptions& RowReaderOptions::throwOnHive11DecimalOverflow(bool shouldThrow){
+ RowReaderOptions& RowReaderOptions::throwOnHive11DecimalOverflow(bool shouldThrow) {
privateBits->throwOnHive11DecimalOverflow = shouldThrow;
return *this;
}
@@ -251,8 +260,16 @@ namespace orc {
return privateBits->throwOnHive11DecimalOverflow;
}
- RowReaderOptions& RowReaderOptions::forcedScaleOnHive11Decimal(int32_t forcedScale
- ) {
+ RowReaderOptions& RowReaderOptions::throwOnSchemaEvolutionOverflow(bool shouldThrow) {
+ privateBits->throwOnSchemaEvolutionOverflow = shouldThrow;
+ return *this;
+ }
+
+ bool RowReaderOptions::getThrowOnSchemaEvolutionOverflow() const {
+ return privateBits->throwOnSchemaEvolutionOverflow;
+ }
+
+ RowReaderOptions& RowReaderOptions::forcedScaleOnHive11Decimal(int32_t forcedScale) {
privateBits->forcedScaleOnHive11Decimal = forcedScale;
return *this;
}
@@ -288,10 +305,27 @@ namespace orc {
return privateBits->readerTimezone;
}
- const RowReaderOptions::IdReadIntentMap
- RowReaderOptions::getIdReadIntentMap() const {
+ const RowReaderOptions::IdReadIntentMap RowReaderOptions::getIdReadIntentMap() const {
return privateBits->idReadIntentMap;
}
-}
+
+ RowReaderOptions& RowReaderOptions::setUseTightNumericVector(bool useTightNumericVector) {
+ privateBits->useTightNumericVector = useTightNumericVector;
+ return *this;
+ }
+
+ bool RowReaderOptions::getUseTightNumericVector() const {
+ return privateBits->useTightNumericVector;
+ }
+
+ RowReaderOptions& RowReaderOptions::setReadType(std::shared_ptr<Type> type) {
+ privateBits->readType = std::move(type);
+ return *this;
+ }
+
+ std::shared_ptr<Type>& RowReaderOptions::getReadType() const {
+ return privateBits->readType;
+ }
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/src/OrcFile.cc b/contrib/libs/apache/orc/c++/src/OrcFile.cc
index a0158bbadf..d4b6a86e2f 100644
--- a/contrib/libs/apache/orc/c++/src/OrcFile.cc
+++ b/contrib/libs/apache/orc/c++/src/OrcFile.cc
@@ -16,15 +16,16 @@
* limitations under the License.
*/
-#include "Adaptor.hh"
#include "orc/OrcFile.hh"
+#include "Adaptor.hh"
+#include "Utils.hh"
#include "orc/Exceptions.hh"
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
-#include <sys/stat.h>
#include <string.h>
+#include <sys/stat.h>
#ifdef _MSC_VER
#include <io.h>
@@ -32,6 +33,7 @@
#define S_IWUSR _S_IWRITE
#define stat _stat64
#define fstat _fstat64
+#define fsync _commit
#else
#include <unistd.h>
#define O_BINARY 0
@@ -39,15 +41,22 @@
namespace orc {
+ DIAGNOSTIC_PUSH
+
+#ifdef __clang__
+ DIAGNOSTIC_IGNORE("-Wunused-private-field")
+#endif
+
class FileInputStream : public InputStream {
- private:
+ private:
std::string filename;
int file;
uint64_t totalLength;
+ ReaderMetrics* metrics;
- public:
- FileInputStream(std::string _filename) {
- filename = _filename;
+ public:
+ FileInputStream(std::string _filename, ReaderMetrics* _metrics)
+ : filename(_filename), metrics(_metrics) {
file = open(filename.c_str(), O_BINARY | O_RDONLY);
if (file == -1) {
throw ParseError("Can't open " + filename);
@@ -69,9 +78,8 @@ namespace orc {
return 128 * 1024;
}
- void read(void* buf,
- uint64_t length,
- uint64_t offset) override {
+ void read(void* buf, uint64_t length, uint64_t offset) override {
+ SCOPED_STOPWATCH(metrics, IOBlockingLatencyUs, IOCount);
if (!buf) {
throw ParseError("Buffer is null");
}
@@ -94,42 +102,41 @@ namespace orc {
close(file);
}
- std::unique_ptr<InputStream> readFile(const std::string& path) {
+ std::unique_ptr<InputStream> readFile(const std::string& path, ReaderMetrics* metrics) {
#ifdef BUILD_LIBHDFSPP
- if(strncmp (path.c_str(), "hdfs://", 7) == 0){
- return orc::readHdfsFile(std::string(path));
+ if (strncmp(path.c_str(), "hdfs://", 7) == 0) {
+ return orc::readHdfsFile(std::string(path), metrics);
} else {
#endif
- return orc::readLocalFile(std::string(path));
+ return orc::readLocalFile(std::string(path), metrics);
#ifdef BUILD_LIBHDFSPP
- }
+ }
#endif
}
- std::unique_ptr<InputStream> readLocalFile(const std::string& path) {
- return std::unique_ptr<InputStream>(new FileInputStream(path));
+ DIAGNOSTIC_POP
+
+ std::unique_ptr<InputStream> readLocalFile(const std::string& path, ReaderMetrics* metrics) {
+ return std::make_unique<FileInputStream>(path, metrics);
}
- OutputStream::~OutputStream() {
+ OutputStream::~OutputStream(){
// PASS
};
class FileOutputStream : public OutputStream {
- private:
+ private:
std::string filename;
int file;
uint64_t bytesWritten;
bool closed;
- public:
+ public:
FileOutputStream(std::string _filename) {
bytesWritten = 0;
filename = _filename;
closed = false;
- file = open(
- filename.c_str(),
- O_BINARY | O_CREAT | O_WRONLY | O_TRUNC,
- S_IRUSR | S_IWUSR);
+ file = open(filename.c_str(), O_BINARY | O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR);
if (file == -1) {
throw ParseError("Can't open " + filename);
}
@@ -169,6 +176,12 @@ namespace orc {
closed = true;
}
}
+
+ void flush() override {
+ if (!closed) {
+ ::fsync(file);
+ }
+ }
};
FileOutputStream::~FileOutputStream() {
@@ -179,6 +192,6 @@ namespace orc {
}
std::unique_ptr<OutputStream> writeLocalFile(const std::string& path) {
- return std::unique_ptr<OutputStream>(new FileOutputStream(path));
+ return std::make_unique<FileOutputStream>(path);
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/RLE.cc b/contrib/libs/apache/orc/c++/src/RLE.cc
index 21f9082216..89aca6a10e 100644
--- a/contrib/libs/apache/orc/c++/src/RLE.cc
+++ b/contrib/libs/apache/orc/c++/src/RLE.cc
@@ -1,20 +1,20 @@
/**
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
#include "RLEv1.hh"
#include "RLEv2.hh"
@@ -30,52 +30,53 @@ namespace orc {
// PASS
}
- std::unique_ptr<RleEncoder> createRleEncoder
- (std::unique_ptr<BufferedOutputStream> output,
- bool isSigned,
- RleVersion version,
- MemoryPool&,
- bool alignedBitpacking) {
+ std::unique_ptr<RleEncoder> createRleEncoder(std::unique_ptr<BufferedOutputStream> output,
+ bool isSigned, RleVersion version, MemoryPool&,
+ bool alignedBitpacking) {
switch (static_cast<int64_t>(version)) {
- case RleVersion_1:
- // We don't have std::make_unique() yet.
- return std::unique_ptr<RleEncoder>(new RleEncoderV1(std::move(output),
- isSigned));
- case RleVersion_2:
- return std::unique_ptr<RleEncoder>(new RleEncoderV2(std::move(output),
- isSigned, alignedBitpacking));
- default:
- throw NotImplementedYet("Not implemented yet");
+ case RleVersion_1:
+ return std::make_unique<RleEncoderV1>(std::move(output), isSigned);
+ case RleVersion_2:
+ return std::make_unique<RleEncoderV2>(std::move(output), isSigned, alignedBitpacking);
+ default:
+ throw NotImplementedYet("Not implemented yet");
}
}
- std::unique_ptr<RleDecoder> createRleDecoder
- (std::unique_ptr<SeekableInputStream> input,
- bool isSigned,
- RleVersion version,
- MemoryPool& pool) {
+ std::unique_ptr<RleDecoder> createRleDecoder(std::unique_ptr<SeekableInputStream> input,
+ bool isSigned, RleVersion version, MemoryPool& pool,
+ ReaderMetrics* metrics) {
switch (static_cast<int64_t>(version)) {
- case RleVersion_1:
- // We don't have std::make_unique() yet.
- return std::unique_ptr<RleDecoder>(new RleDecoderV1(std::move(input),
- isSigned));
- case RleVersion_2:
- return std::unique_ptr<RleDecoder>(new RleDecoderV2(std::move(input),
- isSigned, pool));
- default:
- throw NotImplementedYet("Not implemented yet");
+ case RleVersion_1:
+ return std::make_unique<RleDecoderV1>(std::move(input), isSigned, metrics);
+ case RleVersion_2:
+ return std::make_unique<RleDecoderV2>(std::move(input), isSigned, pool, metrics);
+ default:
+ throw NotImplementedYet("Not implemented yet");
}
}
- void RleEncoder::add(const int64_t* data, uint64_t numValues,
- const char* notNull) {
+ template <typename T>
+ void RleEncoder::add(const T* data, uint64_t numValues, const char* notNull) {
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
- write(data[i]);
+ write(static_cast<int64_t>(data[i]));
}
}
}
+ void RleEncoder::add(const int64_t* data, uint64_t numValues, const char* notNull) {
+ add<int64_t>(data, numValues, notNull);
+ }
+
+ void RleEncoder::add(const int32_t* data, uint64_t numValues, const char* notNull) {
+ add<int32_t>(data, numValues, notNull);
+ }
+
+ void RleEncoder::add(const int16_t* data, uint64_t numValues, const char* notNull) {
+ add<int16_t>(data, numValues, notNull);
+ }
+
void RleEncoder::writeVslong(int64_t val) {
writeVulong((val << 1) ^ (val >> 63));
}
@@ -96,7 +97,7 @@ namespace orc {
void RleEncoder::writeByte(char c) {
if (bufferPosition == bufferLength) {
int addedSize = 0;
- if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) {
+ if (!outputStream->Next(reinterpret_cast<void**>(&buffer), &addedSize)) {
throw std::bad_alloc();
}
bufferPosition = 0;
diff --git a/contrib/libs/apache/orc/c++/src/RLE.hh b/contrib/libs/apache/orc/c++/src/RLE.hh
index 6822bd812e..51f9b6f58a 100644
--- a/contrib/libs/apache/orc/c++/src/RLE.hh
+++ b/contrib/libs/apache/orc/c++/src/RLE.hh
@@ -35,20 +35,18 @@ namespace orc {
}
class RleEncoder {
- public:
+ public:
// must be non-inline!
virtual ~RleEncoder();
- RleEncoder(
- std::unique_ptr<BufferedOutputStream> outStream,
- bool hasSigned):
- outputStream(std::move(outStream)),
- bufferPosition(0),
- bufferLength(0),
- numLiterals(0),
- isSigned(hasSigned),
- buffer(nullptr){
- //pass
+ RleEncoder(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned)
+ : outputStream(std::move(outStream)),
+ bufferPosition(0),
+ bufferLength(0),
+ numLiterals(0),
+ isSigned(hasSigned),
+ buffer(nullptr) {
+ // pass
}
/**
@@ -58,14 +56,19 @@ namespace orc {
* @param notNull If the pointer is null, all values are read. If the
* pointer is not null, positions that are false are skipped.
*/
- virtual void add(const int64_t* data, uint64_t numValues,
- const char* notNull);
+ template <typename T>
+ void add(const T* data, uint64_t numValues, const char* notNull);
+ virtual void add(const int64_t* data, uint64_t numValues, const char* notNull);
+
+ virtual void add(const int32_t* data, uint64_t numValues, const char* notNull);
+
+ virtual void add(const int16_t* data, uint64_t numValues, const char* notNull);
/**
* Get size of buffer used so far.
*/
uint64_t getBufferSize() const {
- return outputStream->getSize();
+ return outputStream->getSize();
}
/**
@@ -81,7 +84,7 @@ namespace orc {
virtual void write(int64_t val) = 0;
- protected:
+ protected:
std::unique_ptr<BufferedOutputStream> outputStream;
size_t bufferPosition;
size_t bufferLength;
@@ -98,10 +101,14 @@ namespace orc {
};
class RleDecoder {
- public:
+ public:
// must be non-inline!
virtual ~RleDecoder();
+ RleDecoder(ReaderMetrics* _metrics) : metrics(_metrics) {
+ // pass
+ }
+
/**
* Seek to a particular spot.
*/
@@ -119,8 +126,14 @@ namespace orc {
* @param notNull If the pointer is null, all values are read. If the
* pointer is not null, positions that are false are skipped.
*/
- virtual void next(int64_t* data, uint64_t numValues,
- const char* notNull) = 0;
+ virtual void next(int64_t* data, uint64_t numValues, const char* notNull) = 0;
+
+ virtual void next(int32_t* data, uint64_t numValues, const char* notNull) = 0;
+
+ virtual void next(int16_t* data, uint64_t numValues, const char* notNull) = 0;
+
+ protected:
+ ReaderMetrics* metrics;
};
/**
@@ -130,12 +143,9 @@ namespace orc {
* @param version version of RLE decoding to do
* @param pool memory pool to use for allocation
*/
- std::unique_ptr<RleEncoder> createRleEncoder
- (std::unique_ptr<BufferedOutputStream> output,
- bool isSigned,
- RleVersion version,
- MemoryPool& pool,
- bool alignedBitpacking);
+ std::unique_ptr<RleEncoder> createRleEncoder(std::unique_ptr<BufferedOutputStream> output,
+ bool isSigned, RleVersion version, MemoryPool& pool,
+ bool alignedBitpacking);
/**
* Create an RLE decoder.
@@ -144,11 +154,9 @@ namespace orc {
* @param version version of RLE decoding to do
* @param pool memory pool to use for allocation
*/
- std::unique_ptr<RleDecoder> createRleDecoder
- (std::unique_ptr<SeekableInputStream> input,
- bool isSigned,
- RleVersion version,
- MemoryPool& pool);
+ std::unique_ptr<RleDecoder> createRleDecoder(std::unique_ptr<SeekableInputStream> input,
+ bool isSigned, RleVersion version, MemoryPool& pool,
+ ReaderMetrics* metrics);
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/RLEV2Util.cc b/contrib/libs/apache/orc/c++/src/RLEV2Util.cc
index 12e2d057cd..be2c6e2875 100644
--- a/contrib/libs/apache/orc/c++/src/RLEV2Util.cc
+++ b/contrib/libs/apache/orc/c++/src/RLEV2Util.cc
@@ -1,19 +1,20 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
- * distributed with option work for additional information
- * regarding copyright ownership. The ASF licenses option file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
- * "License"); you may not use option file except in compliance
+ * "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
*/
#include "RLEV2Util.hh"
@@ -21,50 +22,44 @@
namespace orc {
// Map FBS enum to bit width value.
- const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE] = {
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
- 26, 28, 30, 32, 40, 48, 56, 64
- };
+ const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 26, 28, 30, 32, 40, 48, 56, 64};
// Map bit length i to closest fixed bit width that can contain i bits.
const uint8_t ClosestFixedBitsMap[65] = {
- 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
- 26, 26, 28, 28, 30, 30, 32, 32,
- 40, 40, 40, 40, 40, 40, 40, 40,
- 48, 48, 48, 48, 48, 48, 48, 48,
- 56, 56, 56, 56, 56, 56, 56, 56,
- 64, 64, 64, 64, 64, 64, 64, 64
- };
+ 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+ 22, 23, 24, 26, 26, 28, 28, 30, 30, 32, 32, 40, 40, 40, 40, 40, 40, 40, 40, 48, 48, 48,
+ 48, 48, 48, 48, 48, 56, 56, 56, 56, 56, 56, 56, 56, 64, 64, 64, 64, 64, 64, 64, 64};
// Map bit length i to closest aligned fixed bit width that can contain i bits.
const uint8_t ClosestAlignedFixedBitsMap[65] = {
- 1, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16, 24, 24, 24, 24, 24, 24, 24, 24,
- 32, 32, 32, 32, 32, 32, 32, 32,
- 40, 40, 40, 40, 40, 40, 40, 40,
- 48, 48, 48, 48, 48, 48, 48, 48,
- 56, 56, 56, 56, 56, 56, 56, 56,
- 64, 64, 64, 64, 64, 64, 64, 64
- };
+ 1, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16, 24, 24, 24, 24, 24,
+ 24, 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 40, 40, 40, 48, 48, 48,
+ 48, 48, 48, 48, 48, 56, 56, 56, 56, 56, 56, 56, 56, 64, 64, 64, 64, 64, 64, 64, 64};
// Map bit width to FBS enum.
const uint8_t BitWidthToFBSMap[65] = {
- FixedBitSizes::ONE, FixedBitSizes::ONE, FixedBitSizes::TWO, FixedBitSizes::THREE, FixedBitSizes::FOUR,
- FixedBitSizes::FIVE, FixedBitSizes::SIX, FixedBitSizes::SEVEN, FixedBitSizes::EIGHT,
- FixedBitSizes::NINE, FixedBitSizes::TEN, FixedBitSizes::ELEVEN, FixedBitSizes::TWELVE,
- FixedBitSizes::THIRTEEN, FixedBitSizes::FOURTEEN, FixedBitSizes::FIFTEEN, FixedBitSizes::SIXTEEN,
- FixedBitSizes::SEVENTEEN, FixedBitSizes::EIGHTEEN, FixedBitSizes::NINETEEN, FixedBitSizes::TWENTY,
- FixedBitSizes::TWENTYONE, FixedBitSizes::TWENTYTWO, FixedBitSizes::TWENTYTHREE, FixedBitSizes::TWENTYFOUR,
- FixedBitSizes::TWENTYSIX, FixedBitSizes::TWENTYSIX,
- FixedBitSizes::TWENTYEIGHT, FixedBitSizes::TWENTYEIGHT,
- FixedBitSizes::THIRTY, FixedBitSizes::THIRTY,
- FixedBitSizes::THIRTYTWO, FixedBitSizes::THIRTYTWO,
- FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY,
- FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY,
- FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT,
- FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT,
- FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX,
- FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX,
- FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR,
- FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR
- };
-}
+ FixedBitSizes::ONE, FixedBitSizes::ONE, FixedBitSizes::TWO,
+ FixedBitSizes::THREE, FixedBitSizes::FOUR, FixedBitSizes::FIVE,
+ FixedBitSizes::SIX, FixedBitSizes::SEVEN, FixedBitSizes::EIGHT,
+ FixedBitSizes::NINE, FixedBitSizes::TEN, FixedBitSizes::ELEVEN,
+ FixedBitSizes::TWELVE, FixedBitSizes::THIRTEEN, FixedBitSizes::FOURTEEN,
+ FixedBitSizes::FIFTEEN, FixedBitSizes::SIXTEEN, FixedBitSizes::SEVENTEEN,
+ FixedBitSizes::EIGHTEEN, FixedBitSizes::NINETEEN, FixedBitSizes::TWENTY,
+ FixedBitSizes::TWENTYONE, FixedBitSizes::TWENTYTWO, FixedBitSizes::TWENTYTHREE,
+ FixedBitSizes::TWENTYFOUR, FixedBitSizes::TWENTYSIX, FixedBitSizes::TWENTYSIX,
+ FixedBitSizes::TWENTYEIGHT, FixedBitSizes::TWENTYEIGHT, FixedBitSizes::THIRTY,
+ FixedBitSizes::THIRTY, FixedBitSizes::THIRTYTWO, FixedBitSizes::THIRTYTWO,
+ FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY,
+ FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY,
+ FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTYEIGHT,
+ FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT,
+ FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT,
+ FixedBitSizes::FORTYEIGHT, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX,
+ FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX,
+ FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX,
+ FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR,
+ FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR,
+ FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR};
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/RLEV2Util.hh b/contrib/libs/apache/orc/c++/src/RLEV2Util.hh
index 95a6826eaa..89c6913400 100644
--- a/contrib/libs/apache/orc/c++/src/RLEV2Util.hh
+++ b/contrib/libs/apache/orc/c++/src/RLEV2Util.hh
@@ -1,20 +1,20 @@
/**
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
#ifndef ORC_RLEV2UTIL_HH
#define ORC_RLEV2UTIL_HH
@@ -74,8 +74,8 @@ namespace orc {
}
inline uint32_t RleEncoderV2::getOpCode(EncodingType encoding) {
- return static_cast<uint32_t >(encoding << 6);
+ return static_cast<uint32_t>(encoding << 6);
}
-}
+} // namespace orc
-#endif //ORC_RLEV2UTIL_HH
+#endif // ORC_RLEV2UTIL_HH
diff --git a/contrib/libs/apache/orc/c++/src/RLEv1.cc b/contrib/libs/apache/orc/c++/src/RLEv1.cc
index fe333978db..b221e8b8aa 100644
--- a/contrib/libs/apache/orc/c++/src/RLEv1.cc
+++ b/contrib/libs/apache/orc/c++/src/RLEv1.cc
@@ -16,287 +16,295 @@
* limitations under the License.
*/
+#include "RLEv1.hh"
#include "Adaptor.hh"
#include "Compression.hh"
+#include "Utils.hh"
#include "orc/Exceptions.hh"
-#include "RLEv1.hh"
#include <algorithm>
namespace orc {
-const uint64_t MINIMUM_REPEAT = 3;
-const uint64_t MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT;
+ const uint64_t MINIMUM_REPEAT = 3;
+ const uint64_t MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT;
-const int64_t BASE_128_MASK = 0x7f;
+ const int64_t BASE_128_MASK = 0x7f;
-const int64_t MAX_DELTA = 127;
-const int64_t MIN_DELTA = -128;
-const uint64_t MAX_LITERAL_SIZE = 128;
+ const int64_t MAX_DELTA = 127;
+ const int64_t MIN_DELTA = -128;
+ const uint64_t MAX_LITERAL_SIZE = 128;
-RleEncoderV1::RleEncoderV1(
- std::unique_ptr<BufferedOutputStream> outStream,
- bool hasSigned):
- RleEncoder(std::move(outStream), hasSigned) {
- literals = new int64_t[MAX_LITERAL_SIZE];
- delta = 0;
- repeat = false;
- tailRunLength = 0;
-}
+ RleEncoderV1::RleEncoderV1(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned)
+ : RleEncoder(std::move(outStream), hasSigned) {
+ literals = new int64_t[MAX_LITERAL_SIZE];
+ delta = 0;
+ repeat = false;
+ tailRunLength = 0;
+ }
-RleEncoderV1::~RleEncoderV1() {
- delete [] literals;
-}
+ RleEncoderV1::~RleEncoderV1() {
+ delete[] literals;
+ }
-void RleEncoderV1::writeValues() {
- if (numLiterals != 0) {
- if (repeat) {
- writeByte(static_cast<char>
- (static_cast<uint64_t>(numLiterals) - MINIMUM_REPEAT));
- writeByte(static_cast<char>(delta));
- if (isSigned) {
- writeVslong(literals[0]);
- } else {
- writeVulong(literals[0]);
- }
- } else {
- writeByte(static_cast<char>(-numLiterals));
- for(size_t i=0; i < numLiterals; ++i) {
+ void RleEncoderV1::writeValues() {
+ if (numLiterals != 0) {
+ if (repeat) {
+ writeByte(static_cast<char>(static_cast<uint64_t>(numLiterals) - MINIMUM_REPEAT));
+ writeByte(static_cast<char>(delta));
if (isSigned) {
- writeVslong(literals[i]);
+ writeVslong(literals[0]);
} else {
- writeVulong(literals[i]);
+ writeVulong(literals[0]);
+ }
+ } else {
+ writeByte(static_cast<char>(-numLiterals));
+ for (size_t i = 0; i < numLiterals; ++i) {
+ if (isSigned) {
+ writeVslong(literals[i]);
+ } else {
+ writeVulong(literals[i]);
+ }
}
}
+ repeat = false;
+ numLiterals = 0;
+ tailRunLength = 0;
}
- repeat = false;
- numLiterals = 0;
- tailRunLength = 0;
}
-}
-uint64_t RleEncoderV1::flush() {
- writeValues();
- outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition));
- uint64_t dataSize = outputStream->flush();
- bufferLength = bufferPosition = 0;
- return dataSize;
-}
+ uint64_t RleEncoderV1::flush() {
+ writeValues();
+ outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition));
+ uint64_t dataSize = outputStream->flush();
+ bufferLength = bufferPosition = 0;
+ return dataSize;
+ }
-void RleEncoderV1::write(int64_t value) {
- if (numLiterals == 0) {
- literals[numLiterals++] = value;
- tailRunLength = 1;
- } else if (repeat) {
- if (value == literals[0] + delta * static_cast<int64_t>(numLiterals)) {
- numLiterals += 1;
- if (numLiterals == MAXIMUM_REPEAT) {
- writeValues();
- }
- } else {
- writeValues();
+ void RleEncoderV1::write(int64_t value) {
+ if (numLiterals == 0) {
literals[numLiterals++] = value;
tailRunLength = 1;
- }
- } else {
- if (tailRunLength == 1) {
- delta = value - literals[numLiterals - 1];
- if (delta < MIN_DELTA || delta > MAX_DELTA) {
- tailRunLength = 1;
+ } else if (repeat) {
+ if (value == literals[0] + delta * static_cast<int64_t>(numLiterals)) {
+ numLiterals += 1;
+ if (numLiterals == MAXIMUM_REPEAT) {
+ writeValues();
+ }
} else {
- tailRunLength = 2;
+ writeValues();
+ literals[numLiterals++] = value;
+ tailRunLength = 1;
}
- } else if (value == literals[numLiterals - 1] + delta) {
- tailRunLength += 1;
} else {
- delta = value - literals[numLiterals - 1];
- if (delta < MIN_DELTA || delta > MAX_DELTA) {
- tailRunLength = 1;
+ if (tailRunLength == 1) {
+ delta = value - literals[numLiterals - 1];
+ if (delta < MIN_DELTA || delta > MAX_DELTA) {
+ tailRunLength = 1;
+ } else {
+ tailRunLength = 2;
+ }
+ } else if (value == literals[numLiterals - 1] + delta) {
+ tailRunLength += 1;
} else {
- tailRunLength = 2;
+ delta = value - literals[numLiterals - 1];
+ if (delta < MIN_DELTA || delta > MAX_DELTA) {
+ tailRunLength = 1;
+ } else {
+ tailRunLength = 2;
+ }
}
- }
- if (tailRunLength == MINIMUM_REPEAT) {
- if (numLiterals + 1 == MINIMUM_REPEAT) {
- repeat = true;
- numLiterals += 1;
+ if (tailRunLength == MINIMUM_REPEAT) {
+ if (numLiterals + 1 == MINIMUM_REPEAT) {
+ repeat = true;
+ numLiterals += 1;
+ } else {
+ numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1);
+ int64_t base = literals[numLiterals];
+ writeValues();
+ literals[0] = base;
+ repeat = true;
+ numLiterals = MINIMUM_REPEAT;
+ }
} else {
- numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1);
- int64_t base = literals[numLiterals];
- writeValues();
- literals[0] = base;
- repeat = true;
- numLiterals = MINIMUM_REPEAT;
+ literals[numLiterals++] = value;
+ if (numLiterals == MAX_LITERAL_SIZE) {
+ writeValues();
+ }
}
- } else {
- literals[numLiterals++] = value;
- if (numLiterals == MAX_LITERAL_SIZE) {
- writeValues();
+ }
+ }
+
+ signed char RleDecoderV1::readByte() {
+ SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs);
+ if (bufferStart == bufferEnd) {
+ int bufferLength;
+ const void* bufferPointer;
+ if (!inputStream->Next(&bufferPointer, &bufferLength)) {
+ throw ParseError("bad read in readByte");
}
+ bufferStart = static_cast<const char*>(bufferPointer);
+ bufferEnd = bufferStart + bufferLength;
}
+ return static_cast<signed char>(*(bufferStart++));
}
-}
-signed char RleDecoderV1::readByte() {
- if (bufferStart == bufferEnd) {
- int bufferLength;
- const void* bufferPointer;
- if (!inputStream->Next(&bufferPointer, &bufferLength)) {
- throw ParseError("bad read in readByte");
+ uint64_t RleDecoderV1::readLong() {
+ uint64_t result = 0;
+ int64_t offset = 0;
+ signed char ch = readByte();
+ if (ch >= 0) {
+ result = static_cast<uint64_t>(ch);
+ } else {
+ result = static_cast<uint64_t>(ch) & BASE_128_MASK;
+ while ((ch = readByte()) < 0) {
+ offset += 7;
+ result |= (static_cast<uint64_t>(ch) & BASE_128_MASK) << offset;
+ }
+ result |= static_cast<uint64_t>(ch) << (offset + 7);
}
- bufferStart = static_cast<const char*>(bufferPointer);
- bufferEnd = bufferStart + bufferLength;
+ return result;
}
- return *(bufferStart++);
-}
-uint64_t RleDecoderV1::readLong() {
- uint64_t result = 0;
- int64_t offset = 0;
- signed char ch = readByte();
- if (ch >= 0) {
- result = static_cast<uint64_t>(ch);
- } else {
- result = static_cast<uint64_t>(ch) & BASE_128_MASK;
- while ((ch = readByte()) < 0) {
- offset += 7;
- result |= (static_cast<uint64_t>(ch) & BASE_128_MASK) << offset;
+ void RleDecoderV1::skipLongs(uint64_t numValues) {
+ while (numValues > 0) {
+ if (readByte() >= 0) {
+ --numValues;
+ }
}
- result |= static_cast<uint64_t>(ch) << (offset + 7);
}
- return result;
-}
-void RleDecoderV1::skipLongs(uint64_t numValues) {
- while (numValues > 0) {
- if (readByte() >= 0) {
- --numValues;
+ void RleDecoderV1::readHeader() {
+ signed char ch = readByte();
+ if (ch < 0) {
+ remainingValues = static_cast<uint64_t>(-ch);
+ repeating = false;
+ } else {
+ remainingValues = static_cast<uint64_t>(ch) + MINIMUM_REPEAT;
+ repeating = true;
+ delta = readByte();
+ value = isSigned ? unZigZag(readLong()) : static_cast<int64_t>(readLong());
}
}
-}
-void RleDecoderV1::readHeader() {
- signed char ch = readByte();
- if (ch < 0) {
- remainingValues = static_cast<uint64_t>(-ch);
+ void RleDecoderV1::reset() {
+ remainingValues = 0;
+ value = 0;
+ bufferStart = nullptr;
+ bufferEnd = nullptr;
+ delta = 0;
repeating = false;
- } else {
- remainingValues = static_cast<uint64_t>(ch) + MINIMUM_REPEAT;
- repeating = true;
- delta = readByte();
- value = isSigned
- ? unZigZag(readLong())
- : static_cast<int64_t>(readLong());
}
-}
-
-RleDecoderV1::RleDecoderV1(std::unique_ptr<SeekableInputStream> input,
- bool hasSigned)
- : inputStream(std::move(input)),
- isSigned(hasSigned),
- remainingValues(0),
- value(0),
- bufferStart(nullptr),
- bufferEnd(bufferStart),
- delta(0),
- repeating(false) {
-}
-void RleDecoderV1::seek(PositionProvider& location) {
- // move the input stream
- inputStream->seek(location);
- // force a re-read from the stream
- bufferEnd = bufferStart;
- // read a new header
- readHeader();
- // skip ahead the given number of records
- skip(location.next());
-}
+ RleDecoderV1::RleDecoderV1(std::unique_ptr<SeekableInputStream> input, bool hasSigned,
+ ReaderMetrics* _metrics)
+ : RleDecoder(_metrics), inputStream(std::move(input)), isSigned(hasSigned) {
+ reset();
+ }
-void RleDecoderV1::skip(uint64_t numValues) {
- while (numValues > 0) {
- if (remainingValues == 0) {
- readHeader();
- }
- uint64_t count = std::min(numValues, remainingValues);
- remainingValues -= count;
- numValues -= count;
- if (repeating) {
- value += delta * static_cast<int64_t>(count);
- } else {
- skipLongs(count);
- }
+ void RleDecoderV1::seek(PositionProvider& location) {
+ // move the input stream
+ inputStream->seek(location);
+ // reset the decoder status and lazily call readHeader()
+ reset();
+ // skip ahead the given number of records
+ skip(location.next());
}
-}
-void RleDecoderV1::next(int64_t* const data,
- const uint64_t numValues,
- const char* const notNull) {
- uint64_t position = 0;
- // skipNulls()
- if (notNull) {
- // Skip over null values.
- while (position < numValues && !notNull[position]) {
- ++position;
+ void RleDecoderV1::skip(uint64_t numValues) {
+ while (numValues > 0) {
+ if (remainingValues == 0) {
+ readHeader();
+ }
+ uint64_t count = std::min(numValues, remainingValues);
+ remainingValues -= count;
+ numValues -= count;
+ if (repeating) {
+ value += delta * static_cast<int64_t>(count);
+ } else {
+ skipLongs(count);
+ }
}
}
- while (position < numValues) {
- // If we are out of values, read more.
- if (remainingValues == 0) {
- readHeader();
+
+ template <typename T>
+ void RleDecoderV1::next(T* const data, const uint64_t numValues, const char* const notNull) {
+ SCOPED_STOPWATCH(metrics, DecodingLatencyUs, DecodingCall);
+ uint64_t position = 0;
+ // skipNulls()
+ if (notNull) {
+ // Skip over null values.
+ while (position < numValues && !notNull[position]) {
+ ++position;
+ }
}
- // How many do we read out of this block?
- uint64_t count = std::min(numValues - position, remainingValues);
- uint64_t consumed = 0;
- if (repeating) {
- if (notNull) {
- for (uint64_t i = 0; i < count; ++i) {
- if (notNull[position + i]) {
- data[position + i] = value + static_cast<int64_t>(consumed) * delta;
- consumed += 1;
- }
- }
- } else {
- for (uint64_t i = 0; i < count; ++i) {
- data[position + i] = value + static_cast<int64_t>(i) * delta;
- }
- consumed = count;
+ while (position < numValues) {
+ // If we are out of values, read more.
+ if (remainingValues == 0) {
+ readHeader();
}
- value += static_cast<int64_t>(consumed) * delta;
- } else {
- if (notNull) {
- for (uint64_t i = 0 ; i < count; ++i) {
- if (notNull[position + i]) {
- data[position + i] = isSigned
- ? unZigZag(readLong())
- : static_cast<int64_t>(readLong());
- ++consumed;
+ // How many do we read out of this block?
+ uint64_t count = std::min(numValues - position, remainingValues);
+ uint64_t consumed = 0;
+ if (repeating) {
+ if (notNull) {
+ for (uint64_t i = 0; i < count; ++i) {
+ if (notNull[position + i]) {
+ data[position + i] = static_cast<T>(value + static_cast<int64_t>(consumed) * delta);
+ consumed += 1;
+ }
+ }
+ } else {
+ for (uint64_t i = 0; i < count; ++i) {
+ data[position + i] = static_cast<T>(value + static_cast<int64_t>(i) * delta);
}
+ consumed = count;
}
+ value += static_cast<int64_t>(consumed) * delta;
} else {
- if (isSigned) {
+ if (notNull) {
for (uint64_t i = 0; i < count; ++i) {
- data[position + i] = unZigZag(readLong());
+ if (notNull[position + i]) {
+ data[position + i] =
+ isSigned ? static_cast<T>(unZigZag(readLong())) : static_cast<T>(readLong());
+ ++consumed;
+ }
}
} else {
- for (uint64_t i = 0; i < count; ++i) {
- data[position + i] = static_cast<int64_t>(readLong());
+ if (isSigned) {
+ for (uint64_t i = 0; i < count; ++i) {
+ data[position + i] = static_cast<T>(unZigZag(readLong()));
+ }
+ } else {
+ for (uint64_t i = 0; i < count; ++i) {
+ data[position + i] = static_cast<T>(readLong());
+ }
}
+ consumed = count;
}
- consumed = count;
}
- }
- remainingValues -= consumed;
- position += count;
+ remainingValues -= consumed;
+ position += count;
- // skipNulls()
- if (notNull) {
- // Skip over null values.
- while (position < numValues && !notNull[position]) {
- ++position;
+ // skipNulls()
+ if (notNull) {
+ // Skip over null values.
+ while (position < numValues && !notNull[position]) {
+ ++position;
+ }
}
}
}
-}
+ void RleDecoderV1::next(int64_t* data, uint64_t numValues, const char* notNull) {
+ next<int64_t>(data, numValues, notNull);
+ }
+
+ void RleDecoderV1::next(int32_t* data, uint64_t numValues, const char* notNull) {
+ next<int32_t>(data, numValues, notNull);
+ }
+
+ void RleDecoderV1::next(int16_t* data, uint64_t numValues, const char* notNull) {
+ next<int16_t>(data, numValues, notNull);
+ }
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/RLEv1.hh b/contrib/libs/apache/orc/c++/src/RLEv1.hh
index 8e31d70873..fbe6b0f9c6 100644
--- a/contrib/libs/apache/orc/c++/src/RLEv1.hh
+++ b/contrib/libs/apache/orc/c++/src/RLEv1.hh
@@ -1,20 +1,20 @@
/**
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
#ifndef ORC_RLEV1_HH
#define ORC_RLEV1_HH
@@ -26,11 +26,10 @@
namespace orc {
-class RleEncoderV1 : public RleEncoder {
-public:
- RleEncoderV1(std::unique_ptr<BufferedOutputStream> outStream,
- bool hasSigned);
- ~RleEncoderV1() override ;
+ class RleEncoderV1 : public RleEncoder {
+ public:
+ RleEncoderV1(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned);
+ ~RleEncoderV1() override;
/**
* Flushing underlying BufferedOutputStream
@@ -39,36 +38,41 @@ public:
void write(int64_t val) override;
-private:
+ private:
int64_t delta;
bool repeat;
uint64_t tailRunLength;
void writeValues();
-};
+ };
-class RleDecoderV1 : public RleDecoder {
-public:
- RleDecoderV1(std::unique_ptr<SeekableInputStream> input,
- bool isSigned);
+ class RleDecoderV1 : public RleDecoder {
+ public:
+ RleDecoderV1(std::unique_ptr<SeekableInputStream> input, bool isSigned, ReaderMetrics* metrics);
/**
- * Seek to a particular spot.
- */
+ * Seek to a particular spot.
+ */
void seek(PositionProvider&) override;
/**
- * Seek over a given number of values.
- */
+ * Seek over a given number of values.
+ */
void skip(uint64_t numValues) override;
/**
- * Read a number of values into the batch.
- */
- void next(int64_t* data, uint64_t numValues,
- const char* notNull) override;
+ * Read a number of values into the batch.
+ */
+ template <typename T>
+ void next(T* data, uint64_t numValues, const char* notNull);
+
+ void next(int64_t* data, uint64_t numValues, const char* notNull) override;
-private:
+ void next(int32_t* data, uint64_t numValues, const char* notNull) override;
+
+ void next(int16_t* data, uint64_t numValues, const char* notNull) override;
+
+ private:
inline signed char readByte();
inline void readHeader();
@@ -77,15 +81,17 @@ private:
inline void skipLongs(uint64_t numValues);
+ inline void reset();
+
const std::unique_ptr<SeekableInputStream> inputStream;
const bool isSigned;
uint64_t remainingValues;
int64_t value;
- const char *bufferStart;
- const char *bufferEnd;
+ const char* bufferStart;
+ const char* bufferEnd;
int64_t delta;
bool repeating;
-};
+ };
} // namespace orc
#endif // ORC_RLEV1_HH
diff --git a/contrib/libs/apache/orc/c++/src/RLEv2.hh b/contrib/libs/apache/orc/c++/src/RLEv2.hh
index b1e68fb125..1cee59d0a6 100644
--- a/contrib/libs/apache/orc/c++/src/RLEv2.hh
+++ b/contrib/libs/apache/orc/c++/src/RLEv2.hh
@@ -1,27 +1,27 @@
/**
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
#ifndef ORC_RLEV2_HH
#define ORC_RLEV2_HH
#include "Adaptor.hh"
-#include "orc/Exceptions.hh"
#include "RLE.hh"
+#include "orc/Exceptions.hh"
#include <vector>
@@ -30,46 +30,76 @@
#define HIST_LEN 32
namespace orc {
-struct FixedBitSizes {
+ struct FixedBitSizes {
enum FBS {
- ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE,
- THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN,
- TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX,
- TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR, SIZE
+ ONE = 0,
+ TWO,
+ THREE,
+ FOUR,
+ FIVE,
+ SIX,
+ SEVEN,
+ EIGHT,
+ NINE,
+ TEN,
+ ELEVEN,
+ TWELVE,
+ THIRTEEN,
+ FOURTEEN,
+ FIFTEEN,
+ SIXTEEN,
+ SEVENTEEN,
+ EIGHTEEN,
+ NINETEEN,
+ TWENTY,
+ TWENTYONE,
+ TWENTYTWO,
+ TWENTYTHREE,
+ TWENTYFOUR,
+ TWENTYSIX,
+ TWENTYEIGHT,
+ THIRTY,
+ THIRTYTWO,
+ FORTY,
+ FORTYEIGHT,
+ FIFTYSIX,
+ SIXTYFOUR,
+ SIZE
};
-};
-
-enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 };
-
-struct EncodingOption {
- EncodingType encoding;
- int64_t fixedDelta;
- int64_t gapVsPatchListCount;
- int64_t zigzagLiteralsCount;
- int64_t baseRedLiteralsCount;
- int64_t adjDeltasCount;
- uint32_t zzBits90p;
- uint32_t zzBits100p;
- uint32_t brBits95p;
- uint32_t brBits100p;
- uint32_t bitsDeltaMax;
- uint32_t patchWidth;
- uint32_t patchGapWidth;
- uint32_t patchLength;
- int64_t min;
- bool isFixedDelta;
-};
-
-class RleEncoderV2 : public RleEncoder {
-public:
- RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, bool alignBitPacking = true);
+ };
+
+ enum EncodingType { SHORT_REPEAT = 0, DIRECT = 1, PATCHED_BASE = 2, DELTA = 3 };
+
+ struct EncodingOption {
+ EncodingType encoding;
+ int64_t fixedDelta;
+ int64_t gapVsPatchListCount;
+ int64_t zigzagLiteralsCount;
+ int64_t baseRedLiteralsCount;
+ int64_t adjDeltasCount;
+ uint32_t zzBits90p;
+ uint32_t zzBits100p;
+ uint32_t brBits95p;
+ uint32_t brBits100p;
+ uint32_t bitsDeltaMax;
+ uint32_t patchWidth;
+ uint32_t patchGapWidth;
+ uint32_t patchLength;
+ int64_t min;
+ bool isFixedDelta;
+ };
+
+ class RleEncoderV2 : public RleEncoder {
+ public:
+ RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned,
+ bool alignBitPacking = true);
~RleEncoderV2() override {
- delete [] literals;
- delete [] gapVsPatchList;
- delete [] zigzagLiterals;
- delete [] baseRedLiterals;
- delete [] adjDeltas;
+ delete[] literals;
+ delete[] gapVsPatchList;
+ delete[] zigzagLiterals;
+ delete[] baseRedLiterals;
+ delete[] adjDeltas;
}
/**
* Flushing underlying BufferedOutputStream
@@ -78,20 +108,19 @@ public:
void write(int64_t val) override;
-private:
-
+ private:
const bool alignedBitPacking;
uint32_t fixedRunLength;
uint32_t variableRunLength;
int64_t prevDelta;
int32_t histgram[HIST_LEN];
- // The four list below should actually belong to EncodingOption since it only holds temporal values in write(int64_t val),
- // it is move here for performance consideration.
+ // The four list below should actually belong to EncodingOption since it only holds temporal
+ // values in write(int64_t val), it is move here for performance consideration.
int64_t* gapVsPatchList;
- int64_t* zigzagLiterals;
- int64_t* baseRedLiterals;
- int64_t* adjDeltas;
+ int64_t* zigzagLiterals;
+ int64_t* baseRedLiterals;
+ int64_t* adjDeltas;
uint32_t getOpCode(EncodingType encoding);
int64_t* prepareForDirectOrPatchedBase(EncodingOption& option);
@@ -106,97 +135,155 @@ private:
void writeDirectValues(EncodingOption& option);
void writePatchedBasedValues(EncodingOption& option);
void writeDeltaValues(EncodingOption& option);
- uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist = false);
-};
-
-class RleDecoderV2 : public RleDecoder {
-public:
- RleDecoderV2(std::unique_ptr<SeekableInputStream> input,
- bool isSigned, MemoryPool& pool);
-
- /**
- * Seek to a particular spot.
- */
- void seek(PositionProvider&) override;
-
- /**
- * Seek over a given number of values.
- */
- void skip(uint64_t numValues) override;
-
- /**
- * Read a number of values into the batch.
- */
- void next(int64_t* data, uint64_t numValues,
- const char* notNull) override;
-
-private:
-
- /**
- * Decode the next gap and patch from 'unpackedPatch' and update the index on it.
- * Used by PATCHED_BASE.
- *
- * @param patchBitSize bit size of the patch value
- * @param patchMask mask for the patch value
- * @param resGap result of gap
- * @param resPatch result of patch
- * @param patchIdx current index in the 'unpackedPatch' buffer
- */
- void adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask,
- int64_t* resGap, int64_t* resPatch, uint64_t* patchIdx);
-
- void resetReadLongs() {
- bitsLeft = 0;
- curByte = 0;
- }
+ uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p,
+ bool reuseHist = false);
+ };
- void resetRun() {
- resetReadLongs();
- }
+ class RleDecoderV2 : public RleDecoder {
+ public:
+ RleDecoderV2(std::unique_ptr<SeekableInputStream> input, bool isSigned, MemoryPool& pool,
+ ReaderMetrics* metrics);
+
+ /**
+ * Seek to a particular spot.
+ */
+ void seek(PositionProvider&) override;
+
+ /**
+ * Seek over a given number of values.
+ */
+ void skip(uint64_t numValues) override;
+
+ /**
+ * Read a number of values into the batch.
+ */
+ template <typename T>
+ void next(T* data, uint64_t numValues, const char* notNull);
+
+ void next(int64_t* data, uint64_t numValues, const char* notNull) override;
+
+ void next(int32_t* data, uint64_t numValues, const char* notNull) override;
+
+ void next(int16_t* data, uint64_t numValues, const char* notNull) override;
+
+ unsigned char readByte();
+
+ void setBufStart(const char* start) {
+ bufferStart = const_cast<char*>(start);
+ }
- unsigned char readByte();
-
- int64_t readLongBE(uint64_t bsz);
- int64_t readVslong();
- uint64_t readVulong();
- void readLongs(int64_t *data, uint64_t offset, uint64_t len, uint64_t fbs);
- void plainUnpackLongs(int64_t *data, uint64_t offset, uint64_t len, uint64_t fbs);
-
- void unrolledUnpack4(int64_t *data, uint64_t offset, uint64_t len);
- void unrolledUnpack8(int64_t *data, uint64_t offset, uint64_t len);
- void unrolledUnpack16(int64_t *data, uint64_t offset, uint64_t len);
- void unrolledUnpack24(int64_t *data, uint64_t offset, uint64_t len);
- void unrolledUnpack32(int64_t *data, uint64_t offset, uint64_t len);
- void unrolledUnpack40(int64_t *data, uint64_t offset, uint64_t len);
- void unrolledUnpack48(int64_t *data, uint64_t offset, uint64_t len);
- void unrolledUnpack56(int64_t *data, uint64_t offset, uint64_t len);
- void unrolledUnpack64(int64_t *data, uint64_t offset, uint64_t len);
-
- uint64_t nextShortRepeats(int64_t* data, uint64_t offset, uint64_t numValues,
- const char* notNull);
- uint64_t nextDirect(int64_t* data, uint64_t offset, uint64_t numValues,
- const char* notNull);
- uint64_t nextPatched(int64_t* data, uint64_t offset, uint64_t numValues,
- const char* notNull);
- uint64_t nextDelta(int64_t* data, uint64_t offset, uint64_t numValues,
- const char* notNull);
-
- uint64_t copyDataFromBuffer(int64_t* data, uint64_t offset, uint64_t numValues,
- const char* notNull);
-
- const std::unique_ptr<SeekableInputStream> inputStream;
- const bool isSigned;
-
- unsigned char firstByte;
- uint64_t runLength; // Length of the current run
- uint64_t runRead; // Number of returned values of the current run
- const char *bufferStart;
- const char *bufferEnd;
- uint32_t bitsLeft; // Used by readLongs when bitSize < 8
- uint32_t curByte; // Used by anything that uses readLongs
- DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE
- DataBuffer<int64_t> literals; // Values of the current run
-};
+ char* getBufStart() {
+ return bufferStart;
+ }
+
+ void setBufEnd(const char* end) {
+ bufferEnd = const_cast<char*>(end);
+ }
+
+ char* getBufEnd() {
+ return bufferEnd;
+ }
+
+ uint64_t bufLength() {
+ return bufferEnd - bufferStart;
+ }
+
+ void setBitsLeft(const uint32_t bits) {
+ bitsLeft = bits;
+ }
+
+ void setCurByte(const uint32_t byte) {
+ curByte = byte;
+ }
+
+ uint32_t getBitsLeft() {
+ return bitsLeft;
+ }
+
+ uint32_t getCurByte() {
+ return curByte;
+ }
+
+ /**
+ * Most hotspot of this function locates in saving stack, so inline this function to have
+ * performance gain.
+ */
+ inline void resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupLen);
+
+ private:
+ /**
+ * Decode the next gap and patch from 'unpackedPatch' and update the index on it.
+ * Used by PATCHED_BASE.
+ *
+ * @param patchBitSize bit size of the patch value
+ * @param patchMask mask for the patch value
+ * @param resGap result of gap
+ * @param resPatch result of patch
+ * @param patchIdx current index in the 'unpackedPatch' buffer
+ */
+ void adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, int64_t* resGap,
+ int64_t* resPatch, uint64_t* patchIdx);
+
+ void resetReadLongs() {
+ bitsLeft = 0;
+ curByte = 0;
+ }
+
+ void resetRun() {
+ resetReadLongs();
+ }
+
+ int64_t readLongBE(uint64_t bsz);
+ int64_t readVslong();
+ uint64_t readVulong();
+ void readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs);
+
+ template <typename T>
+ uint64_t nextShortRepeats(T* data, uint64_t offset, uint64_t numValues, const char* notNull);
+ template <typename T>
+ uint64_t nextDirect(T* data, uint64_t offset, uint64_t numValues, const char* notNull);
+ template <typename T>
+ uint64_t nextPatched(T* data, uint64_t offset, uint64_t numValues, const char* notNull);
+ template <typename T>
+ uint64_t nextDelta(T* data, uint64_t offset, uint64_t numValues, const char* notNull);
+ template <typename T>
+ uint64_t copyDataFromBuffer(T* data, uint64_t offset, uint64_t numValues, const char* notNull);
+
+ const std::unique_ptr<SeekableInputStream> inputStream;
+ const bool isSigned;
+ unsigned char firstByte;
+ char* bufferStart;
+ char* bufferEnd;
+ uint64_t runLength; // Length of the current run
+ uint64_t runRead; // Number of returned values of the current run
+ uint32_t bitsLeft; // Used by readLongs when bitSize < 8
+ uint32_t curByte; // Used by anything that uses readLongs
+ DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE
+ DataBuffer<int64_t> literals; // Values of the current run
+ };
+
+ inline void RleDecoderV2::resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupByteLen) {
+ uint64_t remainingLen = bufLength();
+ int bufferLength = 0;
+ const void* bufferPointer = nullptr;
+
+ if (backupByteLen != 0) {
+ inputStream->BackUp(backupByteLen);
+ }
+
+ if (len >= remainingLen && resetBuf) {
+ if (!inputStream->Next(&bufferPointer, &bufferLength)) {
+ throw ParseError("bad read in RleDecoderV2::resetBufferStart");
+ }
+ }
+
+ if (bufferPointer == nullptr) {
+ bufferStart += len;
+ } else {
+ bufferStart = const_cast<char*>(static_cast<const char*>(bufferPointer));
+ bufferEnd = bufferStart + bufferLength;
+ }
+ }
} // namespace orc
#endif // ORC_RLEV2_HH
diff --git a/contrib/libs/apache/orc/c++/src/Reader.cc b/contrib/libs/apache/orc/c++/src/Reader.cc
index 6a9068f202..2cc88fbb80 100644
--- a/contrib/libs/apache/orc/c++/src/Reader.cc
+++ b/contrib/libs/apache/orc/c++/src/Reader.cc
@@ -16,43 +16,49 @@
* limitations under the License.
*/
+#include "Reader.hh"
#include "Adaptor.hh"
#include "BloomFilter.hh"
#include "Options.hh"
-#include "Reader.hh"
#include "Statistics.hh"
#include "StripeStream.hh"
+#include "Utils.hh"
#include "wrap/coded-stream-wrapper.h"
#include <algorithm>
#include <iostream>
+#include <iterator>
#include <memory>
+#include <set>
#include <sstream>
#include <string>
#include <vector>
-#include <iterator>
-#include <set>
namespace orc {
// ORC files writen by these versions of cpp writers have inconsistent bloom filter
// hashing. Bloom filters of them should not be used.
static const char* BAD_CPP_BLOOM_FILTER_VERSIONS[] = {
- "1.6.0", "1.6.1", "1.6.2", "1.6.3", "1.6.4", "1.6.5", "1.6.6", "1.6.7", "1.6.8",
- "1.6.9", "1.6.10", "1.6.11", "1.7.0"};
+ "1.6.0", "1.6.1", "1.6.2", "1.6.3", "1.6.4", "1.6.5", "1.6.6",
+ "1.6.7", "1.6.8", "1.6.9", "1.6.10", "1.6.11", "1.7.0"};
+
+ ReaderMetrics* getDefaultReaderMetrics() {
+ static ReaderMetrics internal;
+ return &internal;
+ }
const RowReaderOptions::IdReadIntentMap EMPTY_IDREADINTENTMAP() {
return {};
}
- const WriterVersionImpl &WriterVersionImpl::VERSION_HIVE_8732() {
+ const WriterVersionImpl& WriterVersionImpl::VERSION_HIVE_8732() {
static const WriterVersionImpl version(WriterVersion_HIVE_8732);
return version;
}
uint64_t getCompressionBlockSize(const proto::PostScript& ps) {
- if (ps.has_compressionblocksize()) {
- return ps.compressionblocksize();
+ if (ps.has_compression_block_size()) {
+ return ps.compression_block_size();
} else {
return 256 * 1024;
}
@@ -67,31 +73,29 @@ namespace orc {
}
std::string ColumnSelector::toDotColumnPath() {
- if (columns.empty()) {
- return std::string();
- }
- std::ostringstream columnStream;
- std::copy(columns.begin(), columns.end(),
+ if (columns.empty()) {
+ return std::string();
+ }
+ std::ostringstream columnStream;
+ std::copy(columns.begin(), columns.end(),
std::ostream_iterator<std::string>(columnStream, "."));
- std::string columnPath = columnStream.str();
- return columnPath.substr(0, columnPath.length() - 1);
+ std::string columnPath = columnStream.str();
+ return columnPath.substr(0, columnPath.length() - 1);
}
- WriterVersion getWriterVersionImpl(const FileContents * contents) {
- if (!contents->postscript->has_writerversion()) {
+ WriterVersion getWriterVersionImpl(const FileContents* contents) {
+ if (!contents->postscript->has_writer_version()) {
return WriterVersion_ORIGINAL;
}
- return static_cast<WriterVersion>(contents->postscript->writerversion());
+ return static_cast<WriterVersion>(contents->postscript->writer_version());
}
void ColumnSelector::selectChildren(std::vector<bool>& selectedColumns, const Type& type) {
return selectChildren(selectedColumns, type, EMPTY_IDREADINTENTMAP());
}
- void ColumnSelector::selectChildren(
- std::vector<bool> &selectedColumns,
- const Type &type,
- const RowReaderOptions::IdReadIntentMap& idReadIntentMap) {
+ void ColumnSelector::selectChildren(std::vector<bool>& selectedColumns, const Type& type,
+ const RowReaderOptions::IdReadIntentMap& idReadIntentMap) {
size_t id = static_cast<size_t>(type.getColumnId());
TypeKind kind = type.getKind();
if (!selectedColumns[id]) {
@@ -99,8 +103,7 @@ namespace orc {
bool selectChild = true;
if (kind == TypeKind::LIST || kind == TypeKind::MAP || kind == TypeKind::UNION) {
auto elem = idReadIntentMap.find(id);
- if (elem != idReadIntentMap.end() &&
- elem->second == ReadIntent_OFFSETS) {
+ if (elem != idReadIntentMap.end() && elem->second == ReadIntent_OFFSETS) {
selectChild = false;
}
}
@@ -121,7 +124,7 @@ namespace orc {
size_t id = static_cast<size_t>(type.getColumnId());
bool result = selectedColumns[id];
uint64_t numSubtypeSelected = 0;
- for(uint64_t c=0; c < type.getSubtypeCount(); ++c) {
+ for (uint64_t c = 0; c < type.getSubtypeCount(); ++c) {
if (selectParents(selectedColumns, *type.getSubtype(c))) {
result = true;
numSubtypeSelected++;
@@ -169,20 +172,19 @@ namespace orc {
const RowReaderOptions& options) {
selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
if (contents->schema->getKind() == STRUCT && options.getIndexesSet()) {
- for(std::list<uint64_t>::const_iterator field = options.getInclude().begin();
- field != options.getInclude().end(); ++field) {
+ for (std::list<uint64_t>::const_iterator field = options.getInclude().begin();
+ field != options.getInclude().end(); ++field) {
updateSelectedByFieldId(selectedColumns, *field);
}
} else if (contents->schema->getKind() == STRUCT && options.getNamesSet()) {
- for(std::list<std::string>::const_iterator field = options.getIncludeNames().begin();
- field != options.getIncludeNames().end(); ++field) {
+ for (std::list<std::string>::const_iterator field = options.getIncludeNames().begin();
+ field != options.getIncludeNames().end(); ++field) {
updateSelectedByName(selectedColumns, *field);
}
} else if (options.getTypeIdsSet()) {
- const RowReaderOptions::IdReadIntentMap idReadIntentMap =
- options.getIdReadIntentMap();
- for(std::list<uint64_t>::const_iterator typeId = options.getInclude().begin();
- typeId != options.getInclude().end(); ++typeId) {
+ const RowReaderOptions::IdReadIntentMap idReadIntentMap = options.getIdReadIntentMap();
+ for (std::list<uint64_t>::const_iterator typeId = options.getInclude().begin();
+ typeId != options.getInclude().end(); ++typeId) {
updateSelectedByTypeId(selectedColumns, *typeId, idReadIntentMap);
}
} else {
@@ -190,7 +192,7 @@ namespace orc {
std::fill(selectedColumns.begin(), selectedColumns.end(), true);
}
selectParents(selectedColumns, *contents->schema.get());
- selectedColumns[0] = true; // column 0 is selected by default
+ selectedColumns[0] = true; // column 0 is selected by default
}
void ColumnSelector::updateSelectedByFieldId(std::vector<bool>& selectedColumns,
@@ -210,16 +212,14 @@ namespace orc {
}
void ColumnSelector::updateSelectedByTypeId(
- std::vector<bool> &selectedColumns,
- uint64_t typeId,
+ std::vector<bool>& selectedColumns, uint64_t typeId,
const RowReaderOptions::IdReadIntentMap& idReadIntentMap) {
if (typeId < selectedColumns.size()) {
const Type& type = *idTypeMap[typeId];
selectChildren(selectedColumns, type, idReadIntentMap);
} else {
std::stringstream buffer;
- buffer << "Invalid type id selected " << typeId << " out of "
- << selectedColumns.size();
+ buffer << "Invalid type id selected " << typeId << " out of " << selectedColumns.size();
throw ParseError(buffer.str());
}
}
@@ -242,36 +242,39 @@ namespace orc {
}
}
- ColumnSelector::ColumnSelector(const FileContents* _contents): contents(_contents) {
+ ColumnSelector::ColumnSelector(const FileContents* _contents) : contents(_contents) {
buildTypeNameIdMap(contents->schema.get());
}
RowReaderImpl::RowReaderImpl(std::shared_ptr<FileContents> _contents,
- const RowReaderOptions& opts
- ): localTimezone(getLocalTimezone()),
- contents(_contents),
- throwOnHive11DecimalOverflow(opts.getThrowOnHive11DecimalOverflow()),
- forcedScaleOnHive11Decimal(opts.getForcedScaleOnHive11Decimal()),
- footer(contents->footer.get()),
- firstRowOfStripe(*contents->pool, 0),
- enableEncodedBlock(opts.getEnableLazyDecoding()),
- readerTimezone(getTimezoneByName(opts.getTimezoneName())) {
+ const RowReaderOptions& opts)
+ : localTimezone(getLocalTimezone()),
+ contents(_contents),
+ throwOnHive11DecimalOverflow(opts.getThrowOnHive11DecimalOverflow()),
+ forcedScaleOnHive11Decimal(opts.getForcedScaleOnHive11Decimal()),
+ footer(contents->footer.get()),
+ firstRowOfStripe(*contents->pool, 0),
+ enableEncodedBlock(opts.getEnableLazyDecoding()),
+ readerTimezone(getTimezoneByName(opts.getTimezoneName())),
+ schemaEvolution(opts.getReadType(), contents->schema.get()) {
uint64_t numberOfStripes;
numberOfStripes = static_cast<uint64_t>(footer->stripes_size());
currentStripe = numberOfStripes;
lastStripe = 0;
currentRowInStripe = 0;
rowsInCurrentStripe = 0;
+ numRowGroupsInStripeRange = 0;
+ useTightNumericVector = opts.getUseTightNumericVector();
+ throwOnSchemaEvolutionOverflow = opts.getThrowOnSchemaEvolutionOverflow();
uint64_t rowTotal = 0;
firstRowOfStripe.resize(numberOfStripes);
- for(size_t i=0; i < numberOfStripes; ++i) {
+ for (size_t i = 0; i < numberOfStripes; ++i) {
firstRowOfStripe[i] = rowTotal;
- proto::StripeInformation stripeInfo =
- footer->stripes(static_cast<int>(i));
- rowTotal += stripeInfo.numberofrows();
+ proto::StripeInformation stripeInfo = footer->stripes(static_cast<int>(i));
+ rowTotal += stripeInfo.number_of_rows();
bool isStripeInRange = stripeInfo.offset() >= opts.getOffset() &&
- stripeInfo.offset() < opts.getOffset() + opts.getLength();
+ stripeInfo.offset() < opts.getOffset() + opts.getLength();
if (isStripeInRange) {
if (i < currentStripe) {
currentStripe = i;
@@ -279,28 +282,33 @@ namespace orc {
if (i >= lastStripe) {
lastStripe = i + 1;
}
+ if (footer->row_index_stride() > 0) {
+ numRowGroupsInStripeRange +=
+ (stripeInfo.number_of_rows() + footer->row_index_stride() - 1) /
+ footer->row_index_stride();
+ }
}
}
firstStripe = currentStripe;
+ processingStripe = lastStripe;
if (currentStripe == 0) {
previousRow = (std::numeric_limits<uint64_t>::max)();
} else if (currentStripe == numberOfStripes) {
- previousRow = footer->numberofrows();
+ previousRow = footer->number_of_rows();
} else {
- previousRow = firstRowOfStripe[firstStripe]-1;
+ previousRow = firstRowOfStripe[firstStripe] - 1;
}
ColumnSelector column_selector(contents.get());
column_selector.updateSelected(selectedColumns, opts);
// prepare SargsApplier if SearchArgument is available
- if (opts.getSearchArgument() && footer->rowindexstride() > 0) {
+ if (opts.getSearchArgument() && footer->row_index_stride() > 0) {
sargs = opts.getSearchArgument();
- sargsApplier.reset(new SargsApplier(*contents->schema,
- sargs.get(),
- footer->rowindexstride(),
- getWriterVersionImpl(_contents.get())));
+ sargsApplier.reset(
+ new SargsApplier(*contents->schema, sargs.get(), footer->row_index_stride(),
+ getWriterVersionImpl(_contents.get()), contents->readerMetrics));
}
skipBloomFilters = hasBadBloomFilters();
@@ -314,9 +322,9 @@ namespace orc {
// 1.6.x releases before 1.6.11 won't have it. On the other side, the C++ writer
// supports writing bloom filters since 1.6.0. So files written by the C++ writer
// and with 'softwareVersion' unset would have bad bloom filters.
- if (!footer->has_softwareversion()) return true;
+ if (!footer->has_software_version()) return true;
- const std::string &fullVersion = footer->softwareversion();
+ const std::string& fullVersion = footer->software_version();
std::string version;
// Deal with snapshot versions, e.g. 1.6.12-SNAPSHOT.
if (fullVersion.find('-') != std::string::npos) {
@@ -324,7 +332,7 @@ namespace orc {
} else {
version = fullVersion;
}
- for (const char *v : BAD_CPP_BLOOM_FILTER_VERSIONS) {
+ for (const char* v : BAD_CPP_BLOOM_FILTER_VERSIONS) {
if (version == v) {
return true;
}
@@ -346,8 +354,7 @@ namespace orc {
const Type& RowReaderImpl::getSelectedType() const {
if (selectedSchema.get() == nullptr) {
- selectedSchema = buildSelectedType(contents->schema.get(),
- selectedColumns);
+ selectedSchema = buildSelectedType(contents->schema.get(), selectedColumns);
}
return *(selectedSchema.get());
}
@@ -369,49 +376,56 @@ namespace orc {
// seeking past lastStripe
uint64_t num_stripes = static_cast<uint64_t>(footer->stripes_size());
- if ( (lastStripe == num_stripes
- && rowNumber >= footer->numberofrows()) ||
- (lastStripe < num_stripes
- && rowNumber >= firstRowOfStripe[lastStripe]) ) {
+ if ((lastStripe == num_stripes && rowNumber >= footer->number_of_rows()) ||
+ (lastStripe < num_stripes && rowNumber >= firstRowOfStripe[lastStripe])) {
currentStripe = num_stripes;
- previousRow = footer->numberofrows();
+ previousRow = footer->number_of_rows();
return;
}
uint64_t seekToStripe = 0;
- while (seekToStripe+1 < lastStripe &&
- firstRowOfStripe[seekToStripe+1] <= rowNumber) {
+ while (seekToStripe + 1 < lastStripe && firstRowOfStripe[seekToStripe + 1] <= rowNumber) {
seekToStripe++;
}
// seeking before the first stripe
if (seekToStripe < firstStripe) {
currentStripe = num_stripes;
- previousRow = footer->numberofrows();
+ previousRow = footer->number_of_rows();
return;
}
- currentStripe = seekToStripe;
- currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe];
previousRow = rowNumber;
- startNextStripe();
+ auto rowIndexStride = footer->row_index_stride();
+ if (!isCurrentStripeInited() || currentStripe != seekToStripe || rowIndexStride == 0 ||
+ currentStripeInfo.index_length() == 0) {
+ // current stripe is not initialized or
+ // target stripe is not current stripe or
+ // current stripe doesn't have row indexes
+ currentStripe = seekToStripe;
+ currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe];
+ startNextStripe();
+ if (currentStripe >= lastStripe) {
+ return;
+ }
+ } else {
+ currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe];
+ if (sargsApplier) {
+ // advance to selected row group if predicate pushdown is enabled
+ currentRowInStripe =
+ advanceToNextRowGroup(currentRowInStripe, rowsInCurrentStripe,
+ footer->row_index_stride(), sargsApplier->getNextSkippedRows());
+ }
+ }
uint64_t rowsToSkip = currentRowInStripe;
- auto rowIndexStride = footer->rowindexstride();
// seek to the target row group if row indexes exists
- if (rowIndexStride > 0 && currentStripeInfo.indexlength() > 0) {
- // when predicate push down is enabled, above call to startNextStripe()
- // will move current row to 1st matching row group; here we only need
- // to deal with the case when PPD is not enabled.
- if (!sargsApplier) {
- if (rowIndexes.empty()) {
- loadStripeIndex();
- }
- auto rowGroupId = static_cast<uint32_t>(rowsToSkip / rowIndexStride);
- if (rowGroupId != 0) {
- seekToRowGroup(rowGroupId);
- }
+ if (rowIndexStride > 0 && currentStripeInfo.index_length() > 0) {
+ if (rowIndexes.empty()) {
+ loadStripeIndex();
}
+ // TODO(ORC-1175): process the failures of loadStripeIndex() call
+ seekToRowGroup(static_cast<uint32_t>(rowsToSkip / rowIndexStride));
// skip leading rows in the target row group
rowsToSkip %= rowIndexStride;
}
@@ -432,19 +446,14 @@ namespace orc {
for (int i = 0; i < currentStripeFooter.streams_size(); ++i) {
const proto::Stream& pbStream = currentStripeFooter.streams(i);
uint64_t colId = pbStream.column();
- if (selectedColumns[colId] && pbStream.has_kind()
- && (pbStream.kind() == proto::Stream_Kind_ROW_INDEX ||
- pbStream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8)) {
- std::unique_ptr<SeekableInputStream> inStream =
- createDecompressor(getCompression(),
- std::unique_ptr<SeekableInputStream>
- (new SeekableFileInputStream
- (contents->stream.get(),
- offset,
- pbStream.length(),
- *contents->pool)),
- getCompressionSize(),
- *contents->pool);
+ if (selectedColumns[colId] && pbStream.has_kind() &&
+ (pbStream.kind() == proto::Stream_Kind_ROW_INDEX ||
+ pbStream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8)) {
+ std::unique_ptr<SeekableInputStream> inStream = createDecompressor(
+ getCompression(),
+ std::unique_ptr<SeekableInputStream>(new SeekableFileInputStream(
+ contents->stream.get(), offset, pbStream.length(), *contents->pool)),
+ getCompressionSize(), *contents->pool, contents->readerMetrics);
if (pbStream.kind() == proto::Stream_Kind_ROW_INDEX) {
proto::RowIndex rowIndex;
@@ -452,17 +461,16 @@ namespace orc {
throw ParseError("Failed to parse the row index");
}
rowIndexes[colId] = rowIndex;
- } else if (!skipBloomFilters) { // Stream_Kind_BLOOM_FILTER_UTF8
+ } else if (!skipBloomFilters) { // Stream_Kind_BLOOM_FILTER_UTF8
proto::BloomFilterIndex pbBFIndex;
if (!pbBFIndex.ParseFromZeroCopyStream(inStream.get())) {
throw ParseError("Failed to parse bloom filter index");
}
BloomFilterIndex bfIndex;
- for (int j = 0; j < pbBFIndex.bloomfilter_size(); j++) {
+ for (int j = 0; j < pbBFIndex.bloom_filter_size(); j++) {
bfIndex.entries.push_back(BloomFilterUTF8Utils::deserialize(
- pbStream.kind(),
- currentStripeFooter.columns(static_cast<int>(pbStream.column())),
- pbBFIndex.bloomfilter(j)));
+ pbStream.kind(), currentStripeFooter.columns(static_cast<int>(pbStream.column())),
+ pbBFIndex.bloom_filter(j)));
}
// add bloom filters to result for one column
bloomFilterIndex[pbStream.column()] = bfIndex;
@@ -478,11 +486,10 @@ namespace orc {
// store position providers for selected colimns
std::unordered_map<uint64_t, PositionProvider> positionProviders;
- for (auto rowIndex = rowIndexes.cbegin();
- rowIndex != rowIndexes.cend(); ++rowIndex) {
+ for (auto rowIndex = rowIndexes.cbegin(); rowIndex != rowIndexes.cend(); ++rowIndex) {
uint64_t colId = rowIndex->first;
const proto::RowIndexEntry& entry =
- rowIndex->second.entry(static_cast<int32_t>(rowGroupEntryId));
+ rowIndex->second.entry(static_cast<int32_t>(rowGroupEntryId));
// copy index positions for a specific column
positions.push_back({});
@@ -514,22 +521,16 @@ namespace orc {
proto::StripeFooter getStripeFooter(const proto::StripeInformation& info,
const FileContents& contents) {
- uint64_t stripeFooterStart = info.offset() + info.indexlength() +
- info.datalength();
- uint64_t stripeFooterLength = info.footerlength();
- std::unique_ptr<SeekableInputStream> pbStream =
- createDecompressor(contents.compression,
- std::unique_ptr<SeekableInputStream>
- (new SeekableFileInputStream(contents.stream.get(),
- stripeFooterStart,
- stripeFooterLength,
- *contents.pool)),
- contents.blockSize,
- *contents.pool);
+ uint64_t stripeFooterStart = info.offset() + info.index_length() + info.data_length();
+ uint64_t stripeFooterLength = info.footer_length();
+ std::unique_ptr<SeekableInputStream> pbStream = createDecompressor(
+ contents.compression,
+ std::make_unique<SeekableFileInputStream>(contents.stream.get(), stripeFooterStart,
+ stripeFooterLength, *contents.pool),
+ contents.blockSize, *contents.pool, contents.readerMetrics);
proto::StripeFooter result;
if (!result.ParseFromZeroCopyStream(pbStream.get())) {
- throw ParseError(std::string("bad StripeFooter from ") +
- pbStream->getName());
+ throw ParseError(std::string("bad StripeFooter from ") + pbStream->getName());
}
// Verify StripeFooter in case it's corrupt
if (result.columns_size() != contents.footer->types_size()) {
@@ -541,31 +542,29 @@ namespace orc {
return result;
}
- ReaderImpl::ReaderImpl(std::shared_ptr<FileContents> _contents,
- const ReaderOptions& opts,
- uint64_t _fileLength,
- uint64_t _postscriptLength
- ): contents(std::move(_contents)),
- options(opts),
- fileLength(_fileLength),
- postscriptLength(_postscriptLength),
- footer(contents->footer.get()) {
+ ReaderImpl::ReaderImpl(std::shared_ptr<FileContents> _contents, const ReaderOptions& opts,
+ uint64_t _fileLength, uint64_t _postscriptLength)
+ : contents(std::move(_contents)),
+ options(opts),
+ fileLength(_fileLength),
+ postscriptLength(_postscriptLength),
+ footer(contents->footer.get()) {
isMetadataLoaded = false;
checkOrcVersion();
numberOfStripes = static_cast<uint64_t>(footer->stripes_size());
- contents->schema = REDUNDANT_MOVE(convertType(footer->types(0), *footer));
+ contents->schema = convertType(footer->types(0), *footer);
contents->blockSize = getCompressionBlockSize(*contents->postscript);
- contents->compression= convertCompressionKind(*contents->postscript);
+ contents->compression = convertCompressionKind(*contents->postscript);
}
std::string ReaderImpl::getSerializedFileTail() const {
proto::FileTail tail;
- proto::PostScript *mutable_ps = tail.mutable_postscript();
+ proto::PostScript* mutable_ps = tail.mutable_postscript();
mutable_ps->CopyFrom(*contents->postscript);
- proto::Footer *mutableFooter = tail.mutable_footer();
+ proto::Footer* mutableFooter = tail.mutable_footer();
mutableFooter->CopyFrom(*footer);
- tail.set_filelength(fileLength);
- tail.set_postscriptlength(postscriptLength);
+ tail.set_file_length(fileLength);
+ tail.set_postscript_length(postscriptLength);
TString result;
if (!tail.SerializeToString(&result)) {
throw ParseError("Failed to serialize file tail");
@@ -593,29 +592,21 @@ namespace orc {
if (!isMetadataLoaded) {
readMetadata();
}
- return contents->metadata == nullptr ? 0 :
- static_cast<uint64_t>(contents->metadata->stripestats_size());
+ return contents->metadata == nullptr
+ ? 0
+ : static_cast<uint64_t>(contents->metadata->stripe_stats_size());
}
- std::unique_ptr<StripeInformation>
- ReaderImpl::getStripe(uint64_t stripeIndex) const {
+ std::unique_ptr<StripeInformation> ReaderImpl::getStripe(uint64_t stripeIndex) const {
if (stripeIndex > getNumberOfStripes()) {
throw std::logic_error("stripe index out of range");
}
- proto::StripeInformation stripeInfo =
- footer->stripes(static_cast<int>(stripeIndex));
+ proto::StripeInformation stripeInfo = footer->stripes(static_cast<int>(stripeIndex));
- return std::unique_ptr<StripeInformation>
- (new StripeInformationImpl
- (stripeInfo.offset(),
- stripeInfo.indexlength(),
- stripeInfo.datalength(),
- stripeInfo.footerlength(),
- stripeInfo.numberofrows(),
- contents->stream.get(),
- *contents->pool,
- contents->compression,
- contents->blockSize));
+ return std::unique_ptr<StripeInformation>(new StripeInformationImpl(
+ stripeInfo.offset(), stripeInfo.index_length(), stripeInfo.data_length(),
+ stripeInfo.footer_length(), stripeInfo.number_of_rows(), contents->stream.get(),
+ *contents->pool, contents->compression, contents->blockSize, contents->readerMetrics));
}
FileVersion ReaderImpl::getFormatVersion() const {
@@ -626,16 +617,16 @@ namespace orc {
}
uint64_t ReaderImpl::getNumberOfRows() const {
- return footer->numberofrows();
+ return footer->number_of_rows();
}
WriterId ReaderImpl::getWriterId() const {
if (footer->has_writer()) {
uint32_t id = footer->writer();
- if (id > WriterId::TRINO_WRITER) {
+ if (id > WriterId::CUDF_WRITER) {
return WriterId::UNKNOWN_WRITER;
} else {
- return static_cast<WriterId>(id);
+ return static_cast<WriterId>(id);
}
}
return WriterId::ORC_JAVA_WRITER;
@@ -652,8 +643,8 @@ namespace orc {
std::string ReaderImpl::getSoftwareVersion() const {
std::ostringstream buffer;
buffer << writerIdToString(getWriterIdValue());
- if (footer->has_softwareversion()) {
- buffer << " " << footer->softwareversion();
+ if (footer->has_software_version()) {
+ buffer << " " << footer->software_version();
}
return buffer.str();
}
@@ -663,15 +654,15 @@ namespace orc {
}
uint64_t ReaderImpl::getContentLength() const {
- return footer->contentlength();
+ return footer->content_length();
}
uint64_t ReaderImpl::getStripeStatisticsLength() const {
- return contents->postscript->metadatalength();
+ return contents->postscript->metadata_length();
}
uint64_t ReaderImpl::getFileFooterLength() const {
- return contents->postscript->footerlength();
+ return contents->postscript->footer_length();
}
uint64_t ReaderImpl::getFilePostscriptLength() const {
@@ -683,7 +674,7 @@ namespace orc {
}
uint64_t ReaderImpl::getRowIndexStride() const {
- return footer->rowindexstride();
+ return footer->row_index_stride();
}
const std::string& ReaderImpl::getStreamName() const {
@@ -692,14 +683,14 @@ namespace orc {
std::list<std::string> ReaderImpl::getMetadataKeys() const {
std::list<std::string> result;
- for(int i=0; i < footer->metadata_size(); ++i) {
+ for (int i = 0; i < footer->metadata_size(); ++i) {
result.push_back(footer->metadata(i).name());
}
return result;
}
std::string ReaderImpl::getMetadataValue(const std::string& key) const {
- for(int i=0; i < footer->metadata_size(); ++i) {
+ for (int i = 0; i < footer->metadata_size(); ++i) {
if (footer->metadata(i).name() == TString(key)) {
return footer->metadata(i).value();
}
@@ -707,12 +698,13 @@ namespace orc {
throw std::range_error("key not found");
}
- void ReaderImpl::getRowIndexStatistics(const proto::StripeInformation& stripeInfo,
- uint64_t stripeIndex, const proto::StripeFooter& currentStripeFooter,
- std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const {
+ void ReaderImpl::getRowIndexStatistics(
+ const proto::StripeInformation& stripeInfo, uint64_t stripeIndex,
+ const proto::StripeFooter& currentStripeFooter,
+ std::vector<std::vector<proto::ColumnStatistics>>* indexStats) const {
int num_streams = currentStripeFooter.streams_size();
uint64_t offset = stripeInfo.offset();
- uint64_t indexEnd = stripeInfo.offset() + stripeInfo.indexlength();
+ uint64_t indexEnd = stripeInfo.offset() + stripeInfo.index_length();
for (int i = 0; i < num_streams; i++) {
const proto::Stream& stream = currentStripeFooter.streams(i);
StreamKind streamKind = static_cast<StreamKind>(stream.kind());
@@ -722,19 +714,15 @@ namespace orc {
std::stringstream msg;
msg << "Malformed RowIndex stream meta in stripe " << stripeIndex
<< ": streamOffset=" << offset << ", streamLength=" << length
- << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength="
- << stripeInfo.indexlength();
+ << ", stripeOffset=" << stripeInfo.offset()
+ << ", stripeIndexLength=" << stripeInfo.index_length();
throw ParseError(msg.str());
}
std::unique_ptr<SeekableInputStream> pbStream =
- createDecompressor(contents->compression,
- std::unique_ptr<SeekableInputStream>
- (new SeekableFileInputStream(contents->stream.get(),
- offset,
- length,
- *contents->pool)),
- contents->blockSize,
- *(contents->pool));
+ createDecompressor(contents->compression,
+ std::unique_ptr<SeekableInputStream>(new SeekableFileInputStream(
+ contents->stream.get(), offset, length, *contents->pool)),
+ contents->blockSize, *(contents->pool), contents->readerMetrics);
proto::RowIndex rowIndex;
if (!rowIndex.ParseFromZeroCopyStream(pbStream.get())) {
@@ -752,7 +740,7 @@ namespace orc {
}
bool ReaderImpl::hasMetadataValue(const std::string& key) const {
- for(int i=0; i < footer->metadata_size(); ++i) {
+ for (int i = 0; i < footer->metadata_size(); ++i) {
if (footer->metadata(i).name() == TString(key)) {
return true;
}
@@ -764,8 +752,7 @@ namespace orc {
return *(contents->schema.get());
}
- std::unique_ptr<StripeStatistics>
- ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const {
+ std::unique_ptr<StripeStatistics> ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const {
if (!isMetadataLoaded) {
readMetadata();
}
@@ -773,48 +760,40 @@ namespace orc {
throw std::logic_error("No stripe statistics in file");
}
size_t num_cols = static_cast<size_t>(
- contents->metadata->stripestats(
- static_cast<int>(stripeIndex)).colstats_size());
- std::vector<std::vector<proto::ColumnStatistics> > indexStats(num_cols);
+ contents->metadata->stripe_stats(static_cast<int>(stripeIndex)).col_stats_size());
+ std::vector<std::vector<proto::ColumnStatistics>> indexStats(num_cols);
- proto::StripeInformation currentStripeInfo =
- footer->stripes(static_cast<int>(stripeIndex));
- proto::StripeFooter currentStripeFooter =
- getStripeFooter(currentStripeInfo, *contents.get());
+ proto::StripeInformation currentStripeInfo = footer->stripes(static_cast<int>(stripeIndex));
+ proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get());
getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats);
- const Timezone& writerTZ =
- currentStripeFooter.has_writertimezone() ?
- getTimezoneByName(currentStripeFooter.writertimezone()) :
- getLocalTimezone();
+ const Timezone& writerTZ = currentStripeFooter.has_writer_timezone()
+ ? getTimezoneByName(currentStripeFooter.writer_timezone())
+ : getLocalTimezone();
StatContext statContext(hasCorrectStatistics(), &writerTZ);
- return std::unique_ptr<StripeStatistics>
- (new StripeStatisticsImpl(contents->metadata->stripestats(static_cast<int>(stripeIndex)),
- indexStats, statContext));
+ return std::make_unique<StripeStatisticsImpl>(
+ contents->metadata->stripe_stats(static_cast<int>(stripeIndex)), indexStats, statContext);
}
std::unique_ptr<Statistics> ReaderImpl::getStatistics() const {
StatContext statContext(hasCorrectStatistics());
- return std::unique_ptr<Statistics>
- (new StatisticsImpl(*footer, statContext));
+ return std::make_unique<StatisticsImpl>(*footer, statContext);
}
- std::unique_ptr<ColumnStatistics>
- ReaderImpl::getColumnStatistics(uint32_t index) const {
+ std::unique_ptr<ColumnStatistics> ReaderImpl::getColumnStatistics(uint32_t index) const {
if (index >= static_cast<uint64_t>(footer->statistics_size())) {
throw std::logic_error("column index out of range");
}
- proto::ColumnStatistics col =
- footer->statistics(static_cast<int32_t>(index));
+ proto::ColumnStatistics col = footer->statistics(static_cast<int32_t>(index));
StatContext statContext(hasCorrectStatistics());
- return std::unique_ptr<ColumnStatistics> (convertColumnStatistics(col, statContext));
+ return std::unique_ptr<ColumnStatistics>(convertColumnStatistics(col, statContext));
}
void ReaderImpl::readMetadata() const {
- uint64_t metadataSize = contents->postscript->metadatalength();
- uint64_t footerLength = contents->postscript->footerlength();
+ uint64_t metadataSize = contents->postscript->metadata_length();
+ uint64_t footerLength = contents->postscript->footer_length();
if (fileLength < metadataSize + footerLength + postscriptLength + 1) {
std::stringstream msg;
msg << "Invalid Metadata length: fileLength=" << fileLength
@@ -824,15 +803,11 @@ namespace orc {
}
uint64_t metadataStart = fileLength - metadataSize - footerLength - postscriptLength - 1;
if (metadataSize != 0) {
- std::unique_ptr<SeekableInputStream> pbStream =
- createDecompressor(contents->compression,
- std::unique_ptr<SeekableInputStream>
- (new SeekableFileInputStream(contents->stream.get(),
- metadataStart,
- metadataSize,
- *contents->pool)),
- contents->blockSize,
- *contents->pool);
+ std::unique_ptr<SeekableInputStream> pbStream = createDecompressor(
+ contents->compression,
+ std::make_unique<SeekableFileInputStream>(contents->stream.get(), metadataStart,
+ metadataSize, *contents->pool),
+ contents->blockSize, *contents->pool, contents->readerMetrics);
contents->metadata.reset(new proto::Metadata());
if (!contents->metadata->ParseFromZeroCopyStream(pbStream.get())) {
throw ParseError("Failed to parse the metadata");
@@ -848,10 +823,9 @@ namespace orc {
void ReaderImpl::checkOrcVersion() {
FileVersion version = getFormatVersion();
if (version != FileVersion(0, 11) && version != FileVersion(0, 12)) {
- *(options.getErrorStream())
- << "Warning: ORC file " << contents->stream->getName()
- << " was written in an unknown format version "
- << version.toString() << "\n";
+ *(options.getErrorStream()) << "Warning: ORC file " << contents->stream->getName()
+ << " was written in an unknown format version "
+ << version.toString() << "\n";
}
}
@@ -860,13 +834,12 @@ namespace orc {
return createRowReader(defaultOpts);
}
- std::unique_ptr<RowReader> ReaderImpl::createRowReader(
- const RowReaderOptions& opts) const {
+ std::unique_ptr<RowReader> ReaderImpl::createRowReader(const RowReaderOptions& opts) const {
if (opts.getSearchArgument() && !isMetadataLoaded) {
// load stripe statistics for PPD
readMetadata();
}
- return std::unique_ptr<RowReader>(new RowReaderImpl(contents, opts));
+ return std::make_unique<RowReaderImpl>(contents, opts);
}
uint64_t maxStreamsForType(const proto::Type& type) {
@@ -895,8 +868,8 @@ namespace orc {
case proto::Type_Kind_VARCHAR:
return 4;
default:
- return 0;
- }
+ return 0;
+ }
}
uint64_t ReaderImpl::getMemoryUse(int stripeIx) {
@@ -910,8 +883,8 @@ namespace orc {
selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
ColumnSelector column_selector(contents.get());
if (contents->schema->getKind() == STRUCT && include.begin() != include.end()) {
- for(std::list<uint64_t>::const_iterator field = include.begin();
- field != include.end(); ++field) {
+ for (std::list<uint64_t>::const_iterator field = include.begin(); field != include.end();
+ ++field) {
column_selector.updateSelectedByFieldId(selectedColumns, *field);
}
} else {
@@ -919,7 +892,7 @@ namespace orc {
std::fill(selectedColumns.begin(), selectedColumns.end(), true);
}
column_selector.selectParents(selectedColumns, *contents->schema.get());
- selectedColumns[0] = true; // column 0 is selected by default
+ selectedColumns[0] = true; // column 0 is selected by default
return getMemoryUse(stripeIx, selectedColumns);
}
@@ -928,8 +901,8 @@ namespace orc {
selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
ColumnSelector column_selector(contents.get());
if (contents->schema->getKind() == STRUCT && names.begin() != names.end()) {
- for(std::list<std::string>::const_iterator field = names.begin();
- field != names.end(); ++field) {
+ for (std::list<std::string>::const_iterator field = names.begin(); field != names.end();
+ ++field) {
column_selector.updateSelectedByName(selectedColumns, *field);
}
} else {
@@ -937,7 +910,7 @@ namespace orc {
std::fill(selectedColumns.begin(), selectedColumns.end(), true);
}
column_selector.selectParents(selectedColumns, *contents->schema.get());
- selectedColumns[0] = true; // column 0 is selected by default
+ selectedColumns[0] = true; // column 0 is selected by default
return getMemoryUse(stripeIx, selectedColumns);
}
@@ -946,8 +919,8 @@ namespace orc {
selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
ColumnSelector column_selector(contents.get());
if (include.begin() != include.end()) {
- for(std::list<uint64_t>::const_iterator field = include.begin();
- field != include.end(); ++field) {
+ for (std::list<uint64_t>::const_iterator field = include.begin(); field != include.end();
+ ++field) {
column_selector.updateSelectedByTypeId(selectedColumns, *field);
}
} else {
@@ -955,7 +928,7 @@ namespace orc {
std::fill(selectedColumns.begin(), selectedColumns.end(), true);
}
column_selector.selectParents(selectedColumns, *contents->schema.get());
- selectedColumns[0] = true; // column 0 is selected by default
+ selectedColumns[0] = true; // column 0 is selected by default
return getMemoryUse(stripeIx, selectedColumns);
}
@@ -963,13 +936,13 @@ namespace orc {
uint64_t maxDataLength = 0;
if (stripeIx >= 0 && stripeIx < footer->stripes_size()) {
- uint64_t stripe = footer->stripes(stripeIx).datalength();
+ uint64_t stripe = footer->stripes(stripeIx).data_length();
if (maxDataLength < stripe) {
maxDataLength = stripe;
}
} else {
- for (int i=0; i < footer->stripes_size(); i++) {
- uint64_t stripe = footer->stripes(i).datalength();
+ for (int i = 0; i < footer->stripes_size(); i++) {
+ uint64_t stripe = footer->stripes(i).data_length();
if (maxDataLength < stripe) {
maxDataLength = stripe;
}
@@ -978,10 +951,10 @@ namespace orc {
bool hasStringColumn = false;
uint64_t nSelectedStreams = 0;
- for (int i=0; !hasStringColumn && i < footer->types_size(); i++) {
+ for (int i = 0; !hasStringColumn && i < footer->types_size(); i++) {
if (selectedColumns[static_cast<size_t>(i)]) {
const proto::Type& type = footer->types(i);
- nSelectedStreams += maxStreamsForType(type) ;
+ nSelectedStreams += maxStreamsForType(type);
switch (static_cast<int64_t>(type.kind())) {
case proto::Type_Kind_CHAR:
case proto::Type_Kind_STRING:
@@ -997,22 +970,23 @@ namespace orc {
}
}
- /* If a string column is read, use stripe datalength as a memory estimate
+ /* If a string column is read, use stripe data_length as a memory estimate
* because we don't know the dictionary size. Multiply by 2 because
* a string column requires two buffers:
* in the input stream and in the seekable input stream.
* If no string column is read, estimate from the number of streams.
*/
- uint64_t memory = hasStringColumn ? 2 * maxDataLength :
- std::min(uint64_t(maxDataLength),
- nSelectedStreams * contents->stream->getNaturalReadSize());
+ uint64_t memory = hasStringColumn
+ ? 2 * maxDataLength
+ : std::min(uint64_t(maxDataLength),
+ nSelectedStreams * contents->stream->getNaturalReadSize());
// Do we need even more memory to read the footer or the metadata?
- if (memory < contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS) {
- memory = contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS;
+ if (memory < contents->postscript->footer_length() + DIRECTORY_SIZE_GUESS) {
+ memory = contents->postscript->footer_length() + DIRECTORY_SIZE_GUESS;
}
- if (memory < contents->postscript->metadatalength()) {
- memory = contents->postscript->metadatalength();
+ if (memory < contents->postscript->metadata_length()) {
+ memory = contents->postscript->metadata_length();
}
// Account for firstRowOfStripe.
@@ -1021,7 +995,7 @@ namespace orc {
// Decompressors need buffers for each stream
uint64_t decompressorMemory = 0;
if (contents->compression != CompressionKind_NONE) {
- for (int i=0; i < footer->types_size(); i++) {
+ for (int i = 0; i < footer->types_size(); i++) {
if (selectedColumns[static_cast<size_t>(i)]) {
const proto::Type& type = footer->types(i);
decompressorMemory += maxStreamsForType(type) * contents->blockSize;
@@ -1032,7 +1006,7 @@ namespace orc {
}
}
- return memory + decompressorMemory ;
+ return memory + decompressorMemory;
}
// Update fields to indicate we've reached the end of file
@@ -1045,17 +1019,17 @@ namespace orc {
previousRow = 0;
} else {
previousRow = firstRowOfStripe[lastStripe - 1] +
- footer->stripes(static_cast<int>(lastStripe - 1)).numberofrows();
+ footer->stripes(static_cast<int>(lastStripe - 1)).number_of_rows();
}
}
void RowReaderImpl::startNextStripe() {
- reader.reset(); // ColumnReaders use lots of memory; free old memory first
+ reader.reset(); // ColumnReaders use lots of memory; free old memory first
rowIndexes.clear();
bloomFilterIndex.clear();
// evaluate file statistics if it exists
- if (sargsApplier && !sargsApplier->evaluateFileStatistics(*footer)) {
+ if (sargsApplier && !sargsApplier->evaluateFileStatistics(*footer, numRowGroupsInStripeRange)) {
// skip the entire file
markEndOfFile();
return;
@@ -1064,25 +1038,32 @@ namespace orc {
do {
currentStripeInfo = footer->stripes(static_cast<int>(currentStripe));
uint64_t fileLength = contents->stream->getLength();
- if (currentStripeInfo.offset() + currentStripeInfo.indexlength() +
- currentStripeInfo.datalength() + currentStripeInfo.footerlength() >= fileLength) {
+ if (currentStripeInfo.offset() + currentStripeInfo.index_length() +
+ currentStripeInfo.data_length() + currentStripeInfo.footer_length() >=
+ fileLength) {
std::stringstream msg;
- msg << "Malformed StripeInformation at stripe index " << currentStripe << ": fileLength="
- << fileLength << ", StripeInfo=(offset=" << currentStripeInfo.offset() << ", indexLength="
- << currentStripeInfo.indexlength() << ", dataLength=" << currentStripeInfo.datalength()
- << ", footerLength=" << currentStripeInfo.footerlength() << ")";
+ msg << "Malformed StripeInformation at stripe index " << currentStripe
+ << ": fileLength=" << fileLength
+ << ", StripeInfo=(offset=" << currentStripeInfo.offset()
+ << ", indexLength=" << currentStripeInfo.index_length()
+ << ", dataLength=" << currentStripeInfo.data_length()
+ << ", footerLength=" << currentStripeInfo.footer_length() << ")";
throw ParseError(msg.str());
}
currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get());
- rowsInCurrentStripe = currentStripeInfo.numberofrows();
+ rowsInCurrentStripe = currentStripeInfo.number_of_rows();
+ processingStripe = currentStripe;
if (sargsApplier) {
bool isStripeNeeded = true;
if (contents->metadata) {
const auto& currentStripeStats =
- contents->metadata->stripestats(static_cast<int>(currentStripe));
+ contents->metadata->stripe_stats(static_cast<int>(currentStripe));
// skip this stripe after stats fail to satisfy sargs
- isStripeNeeded = sargsApplier->evaluateStripeStatistics(currentStripeStats);
+ uint64_t stripeRowGroupCount =
+ (rowsInCurrentStripe + footer->row_index_stride() - 1) / footer->row_index_stride();
+ isStripeNeeded =
+ sargsApplier->evaluateStripeStatistics(currentStripeStats, stripeRowGroupCount);
}
if (isStripeNeeded) {
@@ -1090,9 +1071,7 @@ namespace orc {
loadStripeIndex();
// select row groups to read in the current stripe
- sargsApplier->pickRowGroups(rowsInCurrentStripe,
- rowIndexes,
- bloomFilterIndex);
+ sargsApplier->pickRowGroups(rowsInCurrentStripe, rowIndexes, bloomFilterIndex);
if (sargsApplier->hasSelectedFrom(currentRowInStripe)) {
// current stripe has at least one row group matching the predicate
break;
@@ -1110,26 +1089,23 @@ namespace orc {
if (currentStripe < lastStripe) {
// get writer timezone info from stripe footer to help understand timestamp values.
const Timezone& writerTimezone =
- currentStripeFooter.has_writertimezone() ?
- getTimezoneByName(currentStripeFooter.writertimezone()) :
- localTimezone;
- StripeStreamsImpl stripeStreams(*this, currentStripe, currentStripeInfo,
- currentStripeFooter,
- currentStripeInfo.offset(),
- *contents->stream,
- writerTimezone,
+ currentStripeFooter.has_writer_timezone()
+ ? getTimezoneByName(currentStripeFooter.writer_timezone())
+ : localTimezone;
+ StripeStreamsImpl stripeStreams(*this, currentStripe, currentStripeInfo, currentStripeFooter,
+ currentStripeInfo.offset(), *contents->stream, writerTimezone,
readerTimezone);
- reader = buildReader(*contents->schema, stripeStreams);
+ reader = buildReader(*contents->schema, stripeStreams, useTightNumericVector,
+ throwOnSchemaEvolutionOverflow, /*convertToReadType=*/true);
if (sargsApplier) {
// move to the 1st selected row group when PPD is enabled.
- currentRowInStripe = advanceToNextRowGroup(currentRowInStripe,
- rowsInCurrentStripe,
- footer->rowindexstride(),
- sargsApplier->getNextSkippedRows());
+ currentRowInStripe =
+ advanceToNextRowGroup(currentRowInStripe, rowsInCurrentStripe,
+ footer->row_index_stride(), sargsApplier->getNextSkippedRows());
previousRow = firstRowOfStripe[currentStripe] + currentRowInStripe - 1;
if (currentRowInStripe > 0) {
- seekToRowGroup(static_cast<uint32_t>(currentRowInStripe / footer->rowindexstride()));
+ seekToRowGroup(static_cast<uint32_t>(currentRowInStripe / footer->row_index_stride()));
}
}
} else {
@@ -1139,6 +1115,7 @@ namespace orc {
}
bool RowReaderImpl::next(ColumnVectorBatch& data) {
+ SCOPED_STOPWATCH(contents->readerMetrics, ReaderInclusiveLatencyUs, ReaderCall);
if (currentStripe >= lastStripe) {
data.numElements = 0;
markEndOfFile();
@@ -1148,14 +1125,10 @@ namespace orc {
startNextStripe();
}
uint64_t rowsToRead =
- std::min(static_cast<uint64_t>(data.capacity),
- rowsInCurrentStripe - currentRowInStripe);
+ std::min(static_cast<uint64_t>(data.capacity), rowsInCurrentStripe - currentRowInStripe);
if (sargsApplier && rowsToRead > 0) {
- rowsToRead = computeBatchSize(rowsToRead,
- currentRowInStripe,
- rowsInCurrentStripe,
- footer->rowindexstride(),
- sargsApplier->getNextSkippedRows());
+ rowsToRead = computeBatchSize(rowsToRead, currentRowInStripe, rowsInCurrentStripe,
+ footer->row_index_stride(), sargsApplier->getNextSkippedRows());
}
data.numElements = rowsToRead;
if (rowsToRead == 0) {
@@ -1164,8 +1137,7 @@ namespace orc {
}
if (enableEncodedBlock) {
reader->nextEncoded(data, rowsToRead, nullptr);
- }
- else {
+ } else {
reader->next(data, rowsToRead, nullptr);
}
// update row number
@@ -1174,15 +1146,14 @@ namespace orc {
// check if we need to advance to next selected row group
if (sargsApplier) {
- uint64_t nextRowToRead = advanceToNextRowGroup(currentRowInStripe,
- rowsInCurrentStripe,
- footer->rowindexstride(),
- sargsApplier->getNextSkippedRows());
+ uint64_t nextRowToRead =
+ advanceToNextRowGroup(currentRowInStripe, rowsInCurrentStripe, footer->row_index_stride(),
+ sargsApplier->getNextSkippedRows());
if (currentRowInStripe != nextRowToRead) {
// it is guaranteed to be at start of a row group
currentRowInStripe = nextRowToRead;
if (currentRowInStripe < rowsInCurrentStripe) {
- seekToRowGroup(static_cast<uint32_t>(currentRowInStripe / footer->rowindexstride()));
+ seekToRowGroup(static_cast<uint32_t>(currentRowInStripe / footer->row_index_stride()));
}
}
}
@@ -1194,10 +1165,8 @@ namespace orc {
return rowsToRead != 0;
}
- uint64_t RowReaderImpl::computeBatchSize(uint64_t requestedSize,
- uint64_t currentRowInStripe,
- uint64_t rowsInCurrentStripe,
- uint64_t rowIndexStride,
+ uint64_t RowReaderImpl::computeBatchSize(uint64_t requestedSize, uint64_t currentRowInStripe,
+ uint64_t rowsInCurrentStripe, uint64_t rowIndexStride,
const std::vector<uint64_t>& nextSkippedRows) {
// In case of PPD, batch size should be aware of row group boundaries. If only a subset of row
// groups are selected then marker position is set to the end of range (subset of row groups
@@ -1240,18 +1209,39 @@ namespace orc {
return rowsInCurrentStripe;
}
- std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch
- (uint64_t capacity) const {
- return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock);
+ static void getColumnIds(const Type* type, std::set<uint64_t>& columnIds) {
+ columnIds.insert(type->getColumnId());
+ for (uint64_t i = 0; i < type->getSubtypeCount(); ++i) {
+ getColumnIds(type->getSubtype(i), columnIds);
+ }
}
- void ensureOrcFooter(InputStream* stream,
- DataBuffer<char> *buffer,
- uint64_t postscriptLength) {
+ std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch(uint64_t capacity) const {
+ // If the read type is specified, then check that the selected schema matches the read type
+ // on the first call to createRowBatch.
+ if (schemaEvolution.getReadType() && selectedSchema.get() == nullptr) {
+ auto fileSchema = &getSelectedType();
+ auto readType = schemaEvolution.getReadType();
+ std::set<uint64_t> readColumns, fileColumns;
+ getColumnIds(readType, readColumns);
+ getColumnIds(fileSchema, fileColumns);
+ if (readColumns != fileColumns) {
+ std::ostringstream ss;
+ ss << "The selected schema " << fileSchema->toString() << " doesn't match read type "
+ << readType->toString();
+ throw SchemaEvolutionError(ss.str());
+ }
+ }
+ const Type& readType =
+ schemaEvolution.getReadType() ? *schemaEvolution.getReadType() : getSelectedType();
+ return readType.createRowBatch(capacity, *contents->pool, enableEncodedBlock,
+ useTightNumericVector);
+ }
+ void ensureOrcFooter(InputStream* stream, DataBuffer<char>* buffer, uint64_t postscriptLength) {
const std::string MAGIC("ORC");
const uint64_t magicLength = MAGIC.length();
- const char * const bufferStart = buffer->data();
+ const char* const bufferStart = buffer->data();
const uint64_t bufferLength = buffer->size();
if (postscriptLength < magicLength || bufferLength < magicLength) {
@@ -1263,7 +1253,7 @@ namespace orc {
if (memcmp(magicStart, MAGIC.c_str(), magicLength) != 0) {
// If there is no magic string at the end, check the beginning.
// Only files written by Hive 0.11.0 don't have the tail ORC string.
- std::unique_ptr<char[]> frontBuffer( new char[magicLength] );
+ std::unique_ptr<char[]> frontBuffer(new char[magicLength]);
stream->read(frontBuffer.get(), magicLength, 0);
bool foundMatch = memcmp(frontBuffer.get(), MAGIC.c_str(), magicLength) == 0;
@@ -1279,28 +1269,25 @@ namespace orc {
* @param buffer the buffer with the tail of the file.
* @param postscriptSize the length of postscript in bytes
*/
- std::unique_ptr<proto::PostScript> readPostscript(InputStream *stream,
- DataBuffer<char> *buffer,
+ std::unique_ptr<proto::PostScript> readPostscript(InputStream* stream, DataBuffer<char>* buffer,
uint64_t postscriptSize) {
- char *ptr = buffer->data();
+ char* ptr = buffer->data();
uint64_t readSize = buffer->size();
ensureOrcFooter(stream, buffer, postscriptSize);
- std::unique_ptr<proto::PostScript> postscript =
- std::unique_ptr<proto::PostScript>(new proto::PostScript());
+ auto postscript = std::make_unique<proto::PostScript>();
if (readSize < 1 + postscriptSize) {
std::stringstream msg;
- msg << "Invalid ORC postscript length: " << postscriptSize << ", file length = "
- << stream->getLength();
+ msg << "Invalid ORC postscript length: " << postscriptSize
+ << ", file length = " << stream->getLength();
throw ParseError(msg.str());
}
if (!postscript->ParseFromArray(ptr + readSize - 1 - postscriptSize,
- static_cast<int>(postscriptSize))) {
- throw ParseError("Failed to parse the postscript from " +
- stream->getName());
+ static_cast<int>(postscriptSize))) {
+ throw ParseError("Failed to parse the postscript from " + stream->getName());
}
- return REDUNDANT_MOVE(postscript);
+ return postscript;
}
/**
@@ -1308,7 +1295,7 @@ namespace orc {
* so we won't crash when we convert the proto::Types to TypeImpls (ORC-317).
* For STRUCT types, fieldName size should match subTypes size (ORC-581).
*/
- void checkProtoTypes(const proto::Footer &footer) {
+ void checkProtoTypes(const proto::Footer& footer) {
std::stringstream msg;
int maxId = footer.types_size();
if (maxId <= 0) {
@@ -1316,17 +1303,16 @@ namespace orc {
}
for (int i = 0; i < maxId; ++i) {
const proto::Type& type = footer.types(i);
- if (type.kind() == proto::Type_Kind_STRUCT
- && type.subtypes_size() != type.fieldnames_size()) {
+ if (type.kind() == proto::Type_Kind_STRUCT &&
+ type.subtypes_size() != type.field_names_size()) {
msg << "Footer is corrupt: STRUCT type " << i << " has " << type.subtypes_size()
- << " subTypes, but has " << type.fieldnames_size() << " fieldNames";
+ << " subTypes, but has " << type.field_names_size() << " fieldNames";
throw ParseError(msg.str());
}
for (int j = 0; j < type.subtypes_size(); ++j) {
int subTypeId = static_cast<int>(type.subtypes(j));
if (subTypeId <= i) {
- msg << "Footer is corrupt: malformed link from type " << i << " to "
- << subTypeId;
+ msg << "Footer is corrupt: malformed link from type " << i << " to " << subTypeId;
throw ParseError(msg.str());
}
if (subTypeId >= maxId) {
@@ -1334,9 +1320,8 @@ namespace orc {
throw ParseError(msg.str());
}
if (j > 0 && static_cast<int>(type.subtypes(j - 1)) >= subTypeId) {
- msg << "Footer is corrupt: subType(" << (j-1) << ") >= subType(" << j
- << ") in types(" << i << "). (" << type.subtypes(j - 1) << " >= "
- << subTypeId << ")";
+ msg << "Footer is corrupt: subType(" << (j - 1) << ") >= subType(" << j << ") in types("
+ << i << "). (" << type.subtypes(j - 1) << " >= " << subTypeId << ")";
throw ParseError(msg.str());
}
}
@@ -1351,37 +1336,31 @@ namespace orc {
* @param ps the file's postscript
* @param memoryPool the memory pool to use
*/
- std::unique_ptr<proto::Footer> readFooter(InputStream* stream,
- const DataBuffer<char> *buffer,
- uint64_t footerOffset,
- const proto::PostScript& ps,
- MemoryPool& memoryPool) {
- const char *footerPtr = buffer->data() + footerOffset;
-
- std::unique_ptr<SeekableInputStream> pbStream =
- createDecompressor(convertCompressionKind(ps),
- std::unique_ptr<SeekableInputStream>
- (new SeekableArrayInputStream(footerPtr,
- ps.footerlength())),
- getCompressionBlockSize(ps),
- memoryPool);
-
- std::unique_ptr<proto::Footer> footer =
- std::unique_ptr<proto::Footer>(new proto::Footer());
+ std::unique_ptr<proto::Footer> readFooter(InputStream* stream, const DataBuffer<char>* buffer,
+ uint64_t footerOffset, const proto::PostScript& ps,
+ MemoryPool& memoryPool, ReaderMetrics* readerMetrics) {
+ const char* footerPtr = buffer->data() + footerOffset;
+
+ std::unique_ptr<SeekableInputStream> pbStream = createDecompressor(
+ convertCompressionKind(ps),
+ std::make_unique<SeekableArrayInputStream>(footerPtr, ps.footer_length()),
+ getCompressionBlockSize(ps), memoryPool, readerMetrics);
+
+ auto footer = std::make_unique<proto::Footer>();
if (!footer->ParseFromZeroCopyStream(pbStream.get())) {
- throw ParseError("Failed to parse the footer from " +
- stream->getName());
+ throw ParseError("Failed to parse the footer from " + stream->getName());
}
checkProtoTypes(*footer);
- return REDUNDANT_MOVE(footer);
+ return footer;
}
std::unique_ptr<Reader> createReader(std::unique_ptr<InputStream> stream,
const ReaderOptions& options) {
- std::shared_ptr<FileContents> contents = std::shared_ptr<FileContents>(new FileContents());
+ auto contents = std::make_shared<FileContents>();
contents->pool = options.getMemoryPool();
contents->errorStream = options.getErrorStream();
+ contents->readerMetrics = options.getReaderMetrics();
std::string serializedFooter = options.getSerializedFileTail();
uint64_t fileLength;
uint64_t postscriptLength;
@@ -1391,27 +1370,25 @@ namespace orc {
if (!tail.ParseFromString(TString(serializedFooter))) {
throw ParseError("Failed to parse the file tail from string");
}
- contents->postscript.reset(new proto::PostScript(tail.postscript()));
- contents->footer.reset(new proto::Footer(tail.footer()));
- fileLength = tail.filelength();
- postscriptLength = tail.postscriptlength();
+ contents->postscript = std::make_unique<proto::PostScript>(tail.postscript());
+ contents->footer = std::make_unique<proto::Footer>(tail.footer());
+ fileLength = tail.file_length();
+ postscriptLength = tail.postscript_length();
} else {
// figure out the size of the file using the option or filesystem
- fileLength = std::min(options.getTailLocation(),
- static_cast<uint64_t>(stream->getLength()));
+ fileLength = std::min(options.getTailLocation(), static_cast<uint64_t>(stream->getLength()));
- //read last bytes into buffer to get PostScript
+ // read last bytes into buffer to get PostScript
uint64_t readSize = std::min(fileLength, DIRECTORY_SIZE_GUESS);
if (readSize < 4) {
throw ParseError("File size too small");
}
- std::unique_ptr<DataBuffer<char>> buffer( new DataBuffer<char>(*contents->pool, readSize) );
+ auto buffer = std::make_unique<DataBuffer<char>>(*contents->pool, readSize);
stream->read(buffer->data(), readSize, fileLength - readSize);
postscriptLength = buffer->data()[readSize - 1] & 0xff;
- contents->postscript = REDUNDANT_MOVE(readPostscript(stream.get(),
- buffer.get(), postscriptLength));
- uint64_t footerSize = contents->postscript->footerlength();
+ contents->postscript = readPostscript(stream.get(), buffer.get(), postscriptLength);
+ uint64_t footerSize = contents->postscript->footer_length();
uint64_t tailSize = 1 + postscriptLength + footerSize;
if (tailSize >= fileLength) {
std::stringstream msg;
@@ -1428,8 +1405,8 @@ namespace orc {
footerOffset = readSize - tailSize;
}
- contents->footer = REDUNDANT_MOVE(readFooter(stream.get(), buffer.get(),
- footerOffset, *contents->postscript, *contents->pool));
+ contents->footer = readFooter(stream.get(), buffer.get(), footerOffset, *contents->postscript,
+ *contents->pool, contents->readerMetrics);
}
contents->isDecimalAsLong = false;
if (contents->postscript->version_size() == 2) {
@@ -1439,27 +1416,23 @@ namespace orc {
}
}
contents->stream = std::move(stream);
- return std::unique_ptr<Reader>(new ReaderImpl(std::move(contents),
- options,
- fileLength,
- postscriptLength));
+ return std::make_unique<ReaderImpl>(std::move(contents), options, fileLength, postscriptLength);
}
- std::map<uint32_t, BloomFilterIndex>
- ReaderImpl::getBloomFilters(uint32_t stripeIndex,
- const std::set<uint32_t>& included) const {
+ std::map<uint32_t, BloomFilterIndex> ReaderImpl::getBloomFilters(
+ uint32_t stripeIndex, const std::set<uint32_t>& included) const {
std::map<uint32_t, BloomFilterIndex> ret;
// find stripe info
if (stripeIndex >= static_cast<uint32_t>(footer->stripes_size())) {
- throw std::logic_error("Illegal stripe index: " + to_string(static_cast<int64_t>(stripeIndex)));
+ throw std::logic_error("Illegal stripe index: " +
+ to_string(static_cast<int64_t>(stripeIndex)));
}
const proto::StripeInformation currentStripeInfo =
- footer->stripes(static_cast<int>(stripeIndex));
- const proto::StripeFooter currentStripeFooter =
- getStripeFooter(currentStripeInfo, *contents);
+ footer->stripes(static_cast<int>(stripeIndex));
+ const proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents);
- // iterate stripe footer to get stream of bloomfilter
+ // iterate stripe footer to get stream of bloom_filter
uint64_t offset = static_cast<uint64_t>(currentStripeInfo.offset());
for (int i = 0; i < currentStripeFooter.streams_size(); i++) {
const proto::Stream& stream = currentStripeFooter.streams(i);
@@ -1469,16 +1442,11 @@ namespace orc {
// a bloom filter stream from a selected column is found
if (stream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8 &&
(included.empty() || included.find(column) != included.end())) {
-
std::unique_ptr<SeekableInputStream> pbStream =
- createDecompressor(contents->compression,
- std::unique_ptr<SeekableInputStream>
- (new SeekableFileInputStream(contents->stream.get(),
- offset,
- length,
- *contents->pool)),
- contents->blockSize,
- *(contents->pool));
+ createDecompressor(contents->compression,
+ std::make_unique<SeekableFileInputStream>(
+ contents->stream.get(), offset, length, *contents->pool),
+ contents->blockSize, *(contents->pool), contents->readerMetrics);
proto::BloomFilterIndex pbBFIndex;
if (!pbBFIndex.ParseFromZeroCopyStream(pbStream.get())) {
@@ -1486,11 +1454,10 @@ namespace orc {
}
BloomFilterIndex bfIndex;
- for (int j = 0; j < pbBFIndex.bloomfilter_size(); j++) {
- std::unique_ptr<BloomFilter> entry = BloomFilterUTF8Utils::deserialize(
- stream.kind(),
- currentStripeFooter.columns(static_cast<int>(stream.column())),
- pbBFIndex.bloomfilter(j));
+ for (int j = 0; j < pbBFIndex.bloom_filter_size(); j++) {
+ std::unique_ptr<BloomFilter> entry = BloomFilterUTF8Utils::deserialize(
+ stream.kind(), currentStripeFooter.columns(static_cast<int>(stream.column())),
+ pbBFIndex.bloom_filter(j));
bfIndex.entries.push_back(std::shared_ptr<BloomFilter>(std::move(entry)));
}
@@ -1512,10 +1479,8 @@ namespace orc {
// PASS
}
- InputStream::~InputStream() {
- // PASS
+ InputStream::~InputStream(){
+ // PASS
};
-
-
-}// namespace
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/Reader.hh b/contrib/libs/apache/orc/c++/src/Reader.hh
index ffaff4176e..a1367e4bd3 100644
--- a/contrib/libs/apache/orc/c++/src/Reader.hh
+++ b/contrib/libs/apache/orc/c++/src/Reader.hh
@@ -26,20 +26,22 @@
#include "ColumnReader.hh"
#include "RLE.hh"
-#include "sargs/SargsApplier.hh"
+#include "SchemaEvolution.hh"
#include "TypeImpl.hh"
+#include "sargs/SargsApplier.hh"
namespace orc {
static const uint64_t DIRECTORY_SIZE_GUESS = 16 * 1024;
/**
- * WriterVersion Implementation
- */
+ * WriterVersion Implementation
+ */
class WriterVersionImpl {
- private:
+ private:
WriterVersion version;
- public:
+
+ public:
// Known Versions with issues resolved
// The static method below is to fix global constructors Clang warning
static const WriterVersionImpl& VERSION_HIVE_8732();
@@ -52,8 +54,8 @@ namespace orc {
};
/**
- * State shared between Reader and Row Reader
- */
+ * State shared between Reader and Row Reader
+ */
struct FileContents {
std::unique_ptr<InputStream> stream;
std::unique_ptr<proto::PostScript> postscript;
@@ -61,12 +63,13 @@ namespace orc {
std::unique_ptr<Type> schema;
uint64_t blockSize;
CompressionKind compression;
- MemoryPool *pool;
- std::ostream *errorStream;
+ MemoryPool* pool;
+ std::ostream* errorStream;
/// Decimal64 in ORCv2 uses RLE to store values. This flag indicates whether
/// this new encoding is used.
bool isDecimalAsLong;
std::unique_ptr<proto::Metadata> metadata;
+ ReaderMetrics* readerMetrics;
};
proto::StripeFooter getStripeFooter(const proto::StripeInformation& info,
@@ -109,10 +112,10 @@ namespace orc {
// is selected.
bool selectParents(std::vector<bool>& selectedColumns, const Type& type);
- /**
- * Constructor that selects columns.
- * @param contents of the file
- */
+ /**
+ * Constructor that selects columns.
+ * @param contents of the file
+ */
ColumnSelector(const FileContents* contents);
// Select the columns from the RowReaderoptions object
@@ -122,9 +125,8 @@ namespace orc {
void updateSelected(std::vector<bool>& selectedColumns, const ReaderOptions& options);
};
-
class RowReaderImpl : public RowReader {
- private:
+ private:
const Timezone& localTimezone;
// contents
@@ -145,14 +147,19 @@ namespace orc {
uint64_t previousRow;
uint64_t firstStripe;
uint64_t currentStripe;
- uint64_t lastStripe; // the stripe AFTER the last one
+ uint64_t lastStripe; // the stripe AFTER the last one
+ uint64_t processingStripe;
uint64_t currentRowInStripe;
uint64_t rowsInCurrentStripe;
+ // number of row groups between first stripe and last stripe
+ uint64_t numRowGroupsInStripeRange;
proto::StripeInformation currentStripeInfo;
proto::StripeFooter currentStripeFooter;
std::unique_ptr<ColumnReader> reader;
bool enableEncodedBlock;
+ bool useTightNumericVector;
+ bool throwOnSchemaEvolutionOverflow;
// internal methods
void startNextStripe();
inline void markEndOfFile();
@@ -166,27 +173,32 @@ namespace orc {
// desired timezone to return data of timestamp types.
const Timezone& readerTimezone;
+ // match read and file types
+ SchemaEvolution schemaEvolution;
+
// load stripe index if not done so
void loadStripeIndex();
// In case of PPD, batch size should be aware of row group boundaries.
// If only a subset of row groups are selected then the next read should
// stop at the end of selected range.
- static uint64_t computeBatchSize(uint64_t requestedSize,
- uint64_t currentRowInStripe,
- uint64_t rowsInCurrentStripe,
- uint64_t rowIndexStride,
+ static uint64_t computeBatchSize(uint64_t requestedSize, uint64_t currentRowInStripe,
+ uint64_t rowsInCurrentStripe, uint64_t rowIndexStride,
const std::vector<uint64_t>& nextSkippedRows);
// Skip non-selected rows
- static uint64_t advanceToNextRowGroup(uint64_t currentRowInStripe,
- uint64_t rowsInCurrentStripe,
+ static uint64_t advanceToNextRowGroup(uint64_t currentRowInStripe, uint64_t rowsInCurrentStripe,
uint64_t rowIndexStride,
const std::vector<uint64_t>& nextSkippedRows);
friend class TestRowReader_advanceToNextRowGroup_Test;
friend class TestRowReader_computeBatchSize_Test;
+ // whether the current stripe is initialized
+ inline bool isCurrentStripeInited() const {
+ return currentStripe == processingStripe;
+ }
+
/**
* Seek to the start of a row group in the current stripe
* @param rowGroupEntryId the row group id to seek to
@@ -200,22 +212,20 @@ namespace orc {
*/
bool hasBadBloomFilters();
- public:
- /**
- * Constructor that lets the user specify additional options.
- * @param contents of the file
- * @param options options for reading
- */
- RowReaderImpl(std::shared_ptr<FileContents> contents,
- const RowReaderOptions& options);
+ public:
+ /**
+ * Constructor that lets the user specify additional options.
+ * @param contents of the file
+ * @param options options for reading
+ */
+ RowReaderImpl(std::shared_ptr<FileContents> contents, const RowReaderOptions& options);
// Select the columns from the options object
const std::vector<bool> getSelectedColumns() const override;
const Type& getSelectedType() const override;
- std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size
- ) const override;
+ std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size) const override;
bool next(ColumnVectorBatch& data) override;
@@ -231,6 +241,10 @@ namespace orc {
bool getThrowOnHive11DecimalOverflow() const;
bool getIsDecimalAsLong() const;
int32_t getForcedScaleOnHive11Decimal() const;
+
+ const SchemaEvolution* getSchemaEvolution() const {
+ return &schemaEvolution;
+ }
};
class ReaderImpl : public Reader {
@@ -251,12 +265,14 @@ namespace orc {
// internal methods
void readMetadata() const;
void checkOrcVersion();
- void getRowIndexStatistics(const proto::StripeInformation& stripeInfo, uint64_t stripeIndex,
- const proto::StripeFooter& currentStripeFooter,
- std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const;
+ void getRowIndexStatistics(
+ const proto::StripeInformation& stripeInfo, uint64_t stripeIndex,
+ const proto::StripeFooter& currentStripeFooter,
+ std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const;
// metadata
mutable bool isMetadataLoaded;
+
public:
/**
* Constructor that lets the user specify additional options.
@@ -265,10 +281,8 @@ namespace orc {
* @param fileLength the length of the file in bytes
* @param postscriptLength the length of the postscript in bytes
*/
- ReaderImpl(std::shared_ptr<FileContents> contents,
- const ReaderOptions& options,
- uint64_t fileLength,
- uint64_t postscriptLength);
+ ReaderImpl(std::shared_ptr<FileContents> contents, const ReaderOptions& options,
+ uint64_t fileLength, uint64_t postscriptLength);
const ReaderOptions& getReaderOptions() const;
@@ -298,20 +312,17 @@ namespace orc {
uint64_t getNumberOfStripes() const override;
- std::unique_ptr<StripeInformation> getStripe(uint64_t
- ) const override;
+ std::unique_ptr<StripeInformation> getStripe(uint64_t) const override;
uint64_t getNumberOfStripeStatistics() const override;
const std::string& getStreamName() const override;
- std::unique_ptr<StripeStatistics>
- getStripeStatistics(uint64_t stripeIndex) const override;
+ std::unique_ptr<StripeStatistics> getStripeStatistics(uint64_t stripeIndex) const override;
std::unique_ptr<RowReader> createRowReader() const override;
- std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options
- ) const override;
+ std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options) const override;
uint64_t getContentLength() const override;
uint64_t getStripeStatisticsLength() const override;
@@ -321,8 +332,7 @@ namespace orc {
std::unique_ptr<Statistics> getStatistics() const override;
- std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId
- ) const override;
+ std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId) const override;
std::string getSerializedFileTail() const override;
@@ -330,28 +340,41 @@ namespace orc {
bool hasCorrectStatistics() const override;
- const proto::PostScript* getPostscript() const {return contents->postscript.get();}
+ const ReaderMetrics* getReaderMetrics() const override {
+ return contents->readerMetrics;
+ }
- uint64_t getBlockSize() const {return contents->blockSize;}
+ const proto::PostScript* getPostscript() const {
+ return contents->postscript.get();
+ }
- const proto::Footer* getFooter() const {return contents->footer.get();}
+ uint64_t getBlockSize() const {
+ return contents->blockSize;
+ }
- const Type* getSchema() const {return contents->schema.get();}
+ const proto::Footer* getFooter() const {
+ return contents->footer.get();
+ }
- InputStream* getStream() const {return contents->stream.get();}
+ const Type* getSchema() const {
+ return contents->schema.get();
+ }
+
+ InputStream* getStream() const {
+ return contents->stream.get();
+ }
uint64_t getMemoryUse(int stripeIx = -1) override;
- uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) override;
+ uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx = -1) override;
- uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) override;
+ uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx = -1) override;
- uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) override;
+ uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx = -1) override;
- std::map<uint32_t, BloomFilterIndex>
- getBloomFilters(uint32_t stripeIndex, const std::set<uint32_t>& included) const override;
+ std::map<uint32_t, BloomFilterIndex> getBloomFilters(
+ uint32_t stripeIndex, const std::set<uint32_t>& included) const override;
};
-
-}// namespace
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc b/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc
index 8ab57b1f6e..ae05a70a36 100644
--- a/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc
+++ b/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc
@@ -17,731 +17,439 @@
*/
#include "Adaptor.hh"
+#include "BpackingDefault.hh"
+#if defined(ORC_HAVE_RUNTIME_AVX512)
+#error #include "BpackingAvx512.hh"
+#endif
#include "Compression.hh"
-#include "RLEv2.hh"
+#include "Dispatch.hh"
#include "RLEV2Util.hh"
+#include "RLEv2.hh"
+#include "Utils.hh"
namespace orc {
-unsigned char RleDecoderV2::readByte() {
- if (bufferStart == bufferEnd) {
- int bufferLength;
- const void* bufferPointer;
- if (!inputStream->Next(&bufferPointer, &bufferLength)) {
- throw ParseError("bad read in RleDecoderV2::readByte");
+ unsigned char RleDecoderV2::readByte() {
+ SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs);
+ if (bufferStart == bufferEnd) {
+ int bufferLength;
+ const void* bufferPointer;
+ if (!inputStream->Next(&bufferPointer, &bufferLength)) {
+ throw ParseError("bad read in RleDecoderV2::readByte");
+ }
+ bufferStart = const_cast<char*>(static_cast<const char*>(bufferPointer));
+ bufferEnd = bufferStart + bufferLength;
}
- bufferStart = static_cast<const char*>(bufferPointer);
- bufferEnd = bufferStart + bufferLength;
- }
- unsigned char result = static_cast<unsigned char>(*bufferStart++);
- return result;
-}
-
-int64_t RleDecoderV2::readLongBE(uint64_t bsz) {
- int64_t ret = 0, val;
- uint64_t n = bsz;
- while (n > 0) {
- n--;
- val = readByte();
- ret |= (val << (n * 8));
- }
- return ret;
-}
-
-inline int64_t RleDecoderV2::readVslong() {
- return unZigZag(readVulong());
-}
-
-uint64_t RleDecoderV2::readVulong() {
- uint64_t ret = 0, b;
- uint64_t offset = 0;
- do {
- b = readByte();
- ret |= (0x7f & b) << offset;
- offset += 7;
- } while (b >= 0x80);
- return ret;
-}
-
-void RleDecoderV2::readLongs(int64_t *data, uint64_t offset, uint64_t len, uint64_t fbs) {
- switch (fbs) {
- case 4:
- unrolledUnpack4(data, offset, len);
- return;
- case 8:
- unrolledUnpack8(data, offset, len);
- return;
- case 16:
- unrolledUnpack16(data, offset, len);
- return;
- case 24:
- unrolledUnpack24(data, offset, len);
- return;
- case 32:
- unrolledUnpack32(data, offset, len);
- return;
- case 40:
- unrolledUnpack40(data, offset, len);
- return;
- case 48:
- unrolledUnpack48(data, offset, len);
- return;
- case 56:
- unrolledUnpack56(data, offset, len);
- return;
- case 64:
- unrolledUnpack64(data, offset, len);
- return;
- default:
- // Fallback to the default implementation for deprecated bit size.
- plainUnpackLongs(data, offset, len, fbs);
- return;
+ unsigned char result = static_cast<unsigned char>(*bufferStart++);
+ return result;
}
-}
-
-void RleDecoderV2::unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len) {
- uint64_t curIdx = offset;
- while (curIdx < offset + len) {
- // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8.
- while (bitsLeft > 0 && curIdx < offset + len) {
- bitsLeft -= 4;
- data[curIdx++] = (curByte >> bitsLeft) & 15;
- }
- if (curIdx == offset + len) return;
-
- // Exhaust the buffer
- uint64_t numGroups = (offset + len - curIdx) / 2;
- numGroups = std::min(numGroups, static_cast<uint64_t>(bufferEnd - bufferStart));
- // Avoid updating 'bufferStart' inside the loop.
- const auto *buffer = reinterpret_cast<const unsigned char*>(bufferStart);
- uint32_t localByte;
- for (uint64_t i = 0; i < numGroups; ++i) {
- localByte = *buffer++;
- data[curIdx] = (localByte >> 4) & 15;
- data[curIdx + 1] = localByte & 15;
- curIdx += 2;
- }
- bufferStart = reinterpret_cast<const char*>(buffer);
- if (curIdx == offset + len) return;
- // readByte() will update 'bufferStart' and 'bufferEnd'
- curByte = readByte();
- bitsLeft = 8;
- }
-}
-
-void RleDecoderV2::unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len) {
- uint64_t curIdx = offset;
- while (curIdx < offset + len) {
- // Exhaust the buffer
- int64_t bufferNum = bufferEnd - bufferStart;
- bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
- // Avoid updating 'bufferStart' inside the loop.
- const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
- for (int i = 0; i < bufferNum; ++i) {
- data[curIdx++] = *buffer++;
+ int64_t RleDecoderV2::readLongBE(uint64_t bsz) {
+ int64_t ret = 0, val;
+ uint64_t n = bsz;
+ while (n > 0) {
+ n--;
+ val = readByte();
+ ret |= (val << (n * 8));
}
- bufferStart = reinterpret_cast<const char*>(buffer);
- if (curIdx == offset + len) return;
+ return ret;
+ }
- // readByte() will update 'bufferStart' and 'bufferEnd'.
- data[curIdx++] = readByte();
+ inline int64_t RleDecoderV2::readVslong() {
+ return unZigZag(readVulong());
}
-}
-
-void RleDecoderV2::unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len) {
- uint64_t curIdx = offset;
- while (curIdx < offset + len) {
- // Exhaust the buffer
- int64_t bufferNum = (bufferEnd - bufferStart) / 2;
- bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
- uint16_t b0, b1;
- // Avoid updating 'bufferStart' inside the loop.
- const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
- for (int i = 0; i < bufferNum; ++i) {
- b0 = static_cast<uint16_t>(*buffer);
- b1 = static_cast<uint16_t>(*(buffer + 1));
- buffer += 2;
- data[curIdx++] = (b0 << 8) | b1;
- }
- bufferStart = reinterpret_cast<const char*>(buffer);
- if (curIdx == offset + len) return;
- // One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
- b0 = readByte();
- b1 = readByte();
- data[curIdx++] = (b0 << 8) | b1;
+ uint64_t RleDecoderV2::readVulong() {
+ uint64_t ret = 0, b;
+ uint64_t offset = 0;
+ do {
+ b = readByte();
+ ret |= (0x7f & b) << offset;
+ offset += 7;
+ } while (b >= 0x80);
+ return ret;
}
-}
-
-void RleDecoderV2::unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len) {
- uint64_t curIdx = offset;
- while (curIdx < offset + len) {
- // Exhaust the buffer
- int64_t bufferNum = (bufferEnd - bufferStart) / 3;
- bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
- uint32_t b0, b1, b2;
- // Avoid updating 'bufferStart' inside the loop.
- const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
- for (int i = 0; i < bufferNum; ++i) {
- b0 = static_cast<uint32_t>(*buffer);
- b1 = static_cast<uint32_t>(*(buffer + 1));
- b2 = static_cast<uint32_t>(*(buffer + 2));
- buffer += 3;
- data[curIdx++] = static_cast<int64_t>((b0 << 16) | (b1 << 8) | b2);
+
+ struct UnpackDynamicFunction {
+ using FunctionType = decltype(&BitUnpack::readLongs);
+
+ static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
+#if defined(ORC_HAVE_RUNTIME_AVX512)
+ return {{DispatchLevel::NONE, BitUnpackDefault::readLongs},
+ {DispatchLevel::AVX512, BitUnpackAVX512::readLongs}};
+#else
+ return {{DispatchLevel::NONE, BitUnpackDefault::readLongs}};
+#endif
}
- bufferStart += bufferNum * 3;
- if (curIdx == offset + len) return;
-
- // One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
- b0 = readByte();
- b1 = readByte();
- b2 = readByte();
- data[curIdx++] = static_cast<int64_t>((b0 << 16) | (b1 << 8) | b2);
+ };
+
+ void RleDecoderV2::readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) {
+ static DynamicDispatch<UnpackDynamicFunction> dispatch;
+ return dispatch.func(this, data, offset, len, fbs);
}
-}
-
-void RleDecoderV2::unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len) {
- uint64_t curIdx = offset;
- while (curIdx < offset + len) {
- // Exhaust the buffer
- int64_t bufferNum = (bufferEnd - bufferStart) / 4;
- bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
- uint32_t b0, b1, b2, b3;
- // Avoid updating 'bufferStart' inside the loop.
- const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
- for (int i = 0; i < bufferNum; ++i) {
- b0 = static_cast<uint32_t>(*buffer);
- b1 = static_cast<uint32_t>(*(buffer + 1));
- b2 = static_cast<uint32_t>(*(buffer + 2));
- b3 = static_cast<uint32_t>(*(buffer + 3));
- buffer += 4;
- data[curIdx++] = static_cast<int64_t>((b0 << 24) | (b1 << 16) | (b2 << 8) | b3);
- }
- bufferStart = reinterpret_cast<const char*>(buffer);
- if (curIdx == offset + len) return;
-
- // One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
- b0 = readByte();
- b1 = readByte();
- b2 = readByte();
- b3 = readByte();
- data[curIdx++] = static_cast<int64_t>((b0 << 24) | (b1 << 16) | (b2 << 8) | b3);
+
+ RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input, bool _isSigned,
+ MemoryPool& pool, ReaderMetrics* _metrics)
+ : RleDecoder(_metrics),
+ inputStream(std::move(input)),
+ isSigned(_isSigned),
+ firstByte(0),
+ bufferStart(nullptr),
+ bufferEnd(bufferStart),
+ runLength(0),
+ runRead(0),
+ bitsLeft(0),
+ curByte(0),
+ unpackedPatch(pool, 0),
+ literals(pool, MAX_LITERAL_SIZE) {
+ // PASS
}
-}
-
-void RleDecoderV2::unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len) {
- uint64_t curIdx = offset;
- while (curIdx < offset + len) {
- // Exhaust the buffer
- int64_t bufferNum = (bufferEnd - bufferStart) / 5;
- bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
- uint64_t b0, b1, b2, b3, b4;
- // Avoid updating 'bufferStart' inside the loop.
- const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
- for (int i = 0; i < bufferNum; ++i) {
- b0 = static_cast<uint32_t>(*buffer);
- b1 = static_cast<uint32_t>(*(buffer + 1));
- b2 = static_cast<uint32_t>(*(buffer + 2));
- b3 = static_cast<uint32_t>(*(buffer + 3));
- b4 = static_cast<uint32_t>(*(buffer + 4));
- buffer += 5;
- data[curIdx++] = static_cast<int64_t>((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
- }
- bufferStart = reinterpret_cast<const char*>(buffer);
- if (curIdx == offset + len) return;
-
- // One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
- b0 = readByte();
- b1 = readByte();
- b2 = readByte();
- b3 = readByte();
- b4 = readByte();
- data[curIdx++] = static_cast<int64_t>((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
+
+ void RleDecoderV2::seek(PositionProvider& location) {
+ // move the input stream
+ inputStream->seek(location);
+ // clear state
+ bufferEnd = bufferStart = nullptr;
+ runRead = runLength = 0;
+ // skip ahead the given number of records
+ skip(location.next());
}
-}
-
-void RleDecoderV2::unrolledUnpack48(int64_t *data, uint64_t offset, uint64_t len) {
- uint64_t curIdx = offset;
- while (curIdx < offset + len) {
- // Exhaust the buffer
- int64_t bufferNum = (bufferEnd - bufferStart) / 6;
- bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
- uint64_t b0, b1, b2, b3, b4, b5;
- // Avoid updating 'bufferStart' inside the loop.
- const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
- for (int i = 0; i < bufferNum; ++i) {
- b0 = static_cast<uint32_t>(*buffer);
- b1 = static_cast<uint32_t>(*(buffer + 1));
- b2 = static_cast<uint32_t>(*(buffer + 2));
- b3 = static_cast<uint32_t>(*(buffer + 3));
- b4 = static_cast<uint32_t>(*(buffer + 4));
- b5 = static_cast<uint32_t>(*(buffer + 5));
- buffer += 6;
- data[curIdx++] = static_cast<int64_t>((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5);
+
+ void RleDecoderV2::skip(uint64_t numValues) {
+ // simple for now, until perf tests indicate something encoding specific is
+ // needed
+ const uint64_t N = 64;
+ int64_t dummy[N];
+
+ while (numValues) {
+ uint64_t nRead = std::min(N, numValues);
+ next(dummy, nRead, nullptr);
+ numValues -= nRead;
}
- bufferStart = reinterpret_cast<const char*>(buffer);
- if (curIdx == offset + len) return;
-
- // One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
- b0 = readByte();
- b1 = readByte();
- b2 = readByte();
- b3 = readByte();
- b4 = readByte();
- b5 = readByte();
- data[curIdx++] = static_cast<int64_t>((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5);
}
-}
-
-void RleDecoderV2::unrolledUnpack56(int64_t *data, uint64_t offset, uint64_t len) {
- uint64_t curIdx = offset;
- while (curIdx < offset + len) {
- // Exhaust the buffer
- int64_t bufferNum = (bufferEnd - bufferStart) / 7;
- bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
- uint64_t b0, b1, b2, b3, b4, b5, b6;
- // Avoid updating 'bufferStart' inside the loop.
- const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
- for (int i = 0; i < bufferNum; ++i) {
- b0 = static_cast<uint32_t>(*buffer);
- b1 = static_cast<uint32_t>(*(buffer + 1));
- b2 = static_cast<uint32_t>(*(buffer + 2));
- b3 = static_cast<uint32_t>(*(buffer + 3));
- b4 = static_cast<uint32_t>(*(buffer + 4));
- b5 = static_cast<uint32_t>(*(buffer + 5));
- b6 = static_cast<uint32_t>(*(buffer + 6));
- buffer += 7;
- data[curIdx++] = static_cast<int64_t>((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | (b4 << 16) | (b5 << 8) | b6);
+
+ template <typename T>
+ void RleDecoderV2::next(T* const data, const uint64_t numValues, const char* const notNull) {
+ SCOPED_STOPWATCH(metrics, DecodingLatencyUs, DecodingCall);
+ uint64_t nRead = 0;
+
+ while (nRead < numValues) {
+ // Skip any nulls before attempting to read first byte.
+ while (notNull && !notNull[nRead]) {
+ if (++nRead == numValues) {
+ return; // ended with null values
+ }
+ }
+
+ if (runRead == runLength) {
+ resetRun();
+ firstByte = readByte();
+ }
+
+ uint64_t offset = nRead, length = numValues - nRead;
+
+ EncodingType enc = static_cast<EncodingType>((firstByte >> 6) & 0x03);
+ switch (static_cast<int64_t>(enc)) {
+ case SHORT_REPEAT:
+ nRead += nextShortRepeats(data, offset, length, notNull);
+ break;
+ case DIRECT:
+ nRead += nextDirect(data, offset, length, notNull);
+ break;
+ case PATCHED_BASE:
+ nRead += nextPatched(data, offset, length, notNull);
+ break;
+ case DELTA:
+ nRead += nextDelta(data, offset, length, notNull);
+ break;
+ default:
+ throw ParseError("unknown encoding");
+ }
}
- bufferStart = reinterpret_cast<const char*>(buffer);
- if (curIdx == offset + len) return;
-
- // One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
- b0 = readByte();
- b1 = readByte();
- b2 = readByte();
- b3 = readByte();
- b4 = readByte();
- b5 = readByte();
- b6 = readByte();
- data[curIdx++] = static_cast<int64_t>((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | (b4 << 16) | (b5 << 8) | b6);
}
-}
-
-void RleDecoderV2::unrolledUnpack64(int64_t *data, uint64_t offset, uint64_t len) {
- uint64_t curIdx = offset;
- while (curIdx < offset + len) {
- // Exhaust the buffer
- int64_t bufferNum = (bufferEnd - bufferStart) / 8;
- bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx));
- uint64_t b0, b1, b2, b3, b4, b5, b6, b7;
- // Avoid updating 'bufferStart' inside the loop.
- const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart);
- for (int i = 0; i < bufferNum; ++i) {
- b0 = static_cast<uint32_t>(*buffer);
- b1 = static_cast<uint32_t>(*(buffer + 1));
- b2 = static_cast<uint32_t>(*(buffer + 2));
- b3 = static_cast<uint32_t>(*(buffer + 3));
- b4 = static_cast<uint32_t>(*(buffer + 4));
- b5 = static_cast<uint32_t>(*(buffer + 5));
- b6 = static_cast<uint32_t>(*(buffer + 6));
- b7 = static_cast<uint32_t>(*(buffer + 7));
- buffer += 8;
- data[curIdx++] = static_cast<int64_t>((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | (b4 << 24) | (b5 << 16) | (b6 << 8) | b7);
- }
- bufferStart = reinterpret_cast<const char*>(buffer);
- if (curIdx == offset + len) return;
-
- // One of the following readByte() will update 'bufferStart' and 'bufferEnd'.
- b0 = readByte();
- b1 = readByte();
- b2 = readByte();
- b3 = readByte();
- b4 = readByte();
- b5 = readByte();
- b6 = readByte();
- b7 = readByte();
- data[curIdx++] = static_cast<int64_t>((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | (b4 << 24) | (b5 << 16) | (b6 << 8) | b7);
+
+ void RleDecoderV2::next(int64_t* data, uint64_t numValues, const char* notNull) {
+ next<int64_t>(data, numValues, notNull);
}
-}
-
-void RleDecoderV2::plainUnpackLongs(int64_t *data, uint64_t offset, uint64_t len,
- uint64_t fbs) {
- for (uint64_t i = offset; i < (offset + len); i++) {
- uint64_t result = 0;
- uint64_t bitsLeftToRead = fbs;
- while (bitsLeftToRead > bitsLeft) {
- result <<= bitsLeft;
- result |= curByte & ((1 << bitsLeft) - 1);
- bitsLeftToRead -= bitsLeft;
- curByte = readByte();
- bitsLeft = 8;
- }
- // handle the left over bits
- if (bitsLeftToRead > 0) {
- result <<= bitsLeftToRead;
- bitsLeft -= static_cast<uint32_t>(bitsLeftToRead);
- result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1);
- }
- data[i] = static_cast<int64_t>(result);
+ void RleDecoderV2::next(int32_t* data, uint64_t numValues, const char* notNull) {
+ next<int32_t>(data, numValues, notNull);
}
-}
-
-RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input,
- bool _isSigned, MemoryPool& pool
- ): inputStream(std::move(input)),
- isSigned(_isSigned),
- firstByte(0),
- runLength(0),
- runRead(0),
- bufferStart(nullptr),
- bufferEnd(bufferStart),
- bitsLeft(0),
- curByte(0),
- unpackedPatch(pool, 0),
- literals(pool, MAX_LITERAL_SIZE) {
- // PASS
-}
-
-void RleDecoderV2::seek(PositionProvider& location) {
- // move the input stream
- inputStream->seek(location);
- // clear state
- bufferEnd = bufferStart = nullptr;
- runRead = runLength = 0;
- // skip ahead the given number of records
- skip(location.next());
-}
-
-void RleDecoderV2::skip(uint64_t numValues) {
- // simple for now, until perf tests indicate something encoding specific is
- // needed
- const uint64_t N = 64;
- int64_t dummy[N];
-
- while (numValues) {
- uint64_t nRead = std::min(N, numValues);
- next(dummy, nRead, nullptr);
- numValues -= nRead;
+
+ void RleDecoderV2::next(int16_t* data, uint64_t numValues, const char* notNull) {
+ next<int16_t>(data, numValues, notNull);
}
-}
-
-void RleDecoderV2::next(int64_t* const data,
- const uint64_t numValues,
- const char* const notNull) {
- uint64_t nRead = 0;
-
- while (nRead < numValues) {
- // Skip any nulls before attempting to read first byte.
- while (notNull && !notNull[nRead]) {
- if (++nRead == numValues) {
- return; // ended with null values
- }
- }
+ template <typename T>
+ uint64_t RleDecoderV2::nextShortRepeats(T* const data, uint64_t offset, uint64_t numValues,
+ const char* const notNull) {
if (runRead == runLength) {
- resetRun();
- firstByte = readByte();
- }
+ // extract the number of fixed bytes
+ uint64_t byteSize = (firstByte >> 3) & 0x07;
+ byteSize += 1;
- uint64_t offset = nRead, length = numValues - nRead;
-
- EncodingType enc = static_cast<EncodingType>
- ((firstByte >> 6) & 0x03);
- switch(static_cast<int64_t>(enc)) {
- case SHORT_REPEAT:
- nRead += nextShortRepeats(data, offset, length, notNull);
- break;
- case DIRECT:
- nRead += nextDirect(data, offset, length, notNull);
- break;
- case PATCHED_BASE:
- nRead += nextPatched(data, offset, length, notNull);
- break;
- case DELTA:
- nRead += nextDelta(data, offset, length, notNull);
- break;
- default:
- throw ParseError("unknown encoding");
- }
- }
-}
-
-uint64_t RleDecoderV2::nextShortRepeats(int64_t* const data,
- uint64_t offset,
- uint64_t numValues,
- const char* const notNull) {
- if (runRead == runLength) {
- // extract the number of fixed bytes
- uint64_t byteSize = (firstByte >> 3) & 0x07;
- byteSize += 1;
-
- runLength = firstByte & 0x07;
- // run lengths values are stored only after MIN_REPEAT value is met
- runLength += MIN_REPEAT;
- runRead = 0;
-
- // read the repeated value which is store using fixed bytes
- literals[0] = readLongBE(byteSize);
-
- if (isSigned) {
- literals[0] = unZigZag(static_cast<uint64_t>(literals[0]));
+ runLength = firstByte & 0x07;
+ // run lengths values are stored only after MIN_REPEAT value is met
+ runLength += MIN_REPEAT;
+ runRead = 0;
+
+ // read the repeated value which is store using fixed bytes
+ literals[0] = readLongBE(byteSize);
+
+ if (isSigned) {
+ literals[0] = unZigZag(static_cast<uint64_t>(literals[0]));
+ }
}
- }
- uint64_t nRead = std::min(runLength - runRead, numValues);
+ uint64_t nRead = std::min(runLength - runRead, numValues);
- if (notNull) {
- for(uint64_t pos = offset; pos < offset + nRead; ++pos) {
- if (notNull[pos]) {
- data[pos] = literals[0];
+ if (notNull) {
+ for (uint64_t pos = offset; pos < offset + nRead; ++pos) {
+ if (notNull[pos]) {
+ data[pos] = static_cast<T>(literals[0]);
+ ++runRead;
+ }
+ }
+ } else {
+ for (uint64_t pos = offset; pos < offset + nRead; ++pos) {
+ data[pos] = static_cast<T>(literals[0]);
++runRead;
}
}
- } else {
- for(uint64_t pos = offset; pos < offset + nRead; ++pos) {
- data[pos] = literals[0];
- ++runRead;
- }
+
+ return nRead;
}
- return nRead;
-}
-
-uint64_t RleDecoderV2::nextDirect(int64_t* const data,
- uint64_t offset,
- uint64_t numValues,
- const char* const notNull) {
- if (runRead == runLength) {
- // extract the number of fixed bits
- unsigned char fbo = (firstByte >> 1) & 0x1f;
- uint32_t bitSize = decodeBitWidth(fbo);
-
- // extract the run length
- runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
- runLength |= readByte();
- // runs are one off
- runLength += 1;
- runRead = 0;
-
- readLongs(literals.data(), 0, runLength, bitSize);
- if (isSigned) {
- for (uint64_t i = 0; i < runLength; ++i) {
- literals[i] = unZigZag(static_cast<uint64_t>(literals[i]));
+ template <typename T>
+ uint64_t RleDecoderV2::nextDirect(T* const data, uint64_t offset, uint64_t numValues,
+ const char* const notNull) {
+ if (runRead == runLength) {
+ // extract the number of fixed bits
+ unsigned char fbo = (firstByte >> 1) & 0x1f;
+ uint32_t bitSize = decodeBitWidth(fbo);
+
+ // extract the run length
+ runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
+ runLength |= readByte();
+ // runs are one off
+ runLength += 1;
+ runRead = 0;
+
+ readLongs(literals.data(), 0, runLength, bitSize);
+ if (isSigned) {
+ for (uint64_t i = 0; i < runLength; ++i) {
+ literals[i] = unZigZag(static_cast<uint64_t>(literals[i]));
+ }
}
}
+
+ return copyDataFromBuffer(data, offset, numValues, notNull);
}
- return copyDataFromBuffer(data, offset, numValues, notNull);
-}
-
-void RleDecoderV2::adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask,
- int64_t* resGap, int64_t* resPatch,
- uint64_t* patchIdx) {
- uint64_t idx = *patchIdx;
- uint64_t gap = static_cast<uint64_t>(unpackedPatch[idx]) >> patchBitSize;
- int64_t patch = unpackedPatch[idx] & patchMask;
- int64_t actualGap = 0;
-
- // special case: gap is >255 then patch value will be 0.
- // if gap is <=255 then patch value cannot be 0
- while (gap == 255 && patch == 0) {
- actualGap += 255;
- ++idx;
- gap = static_cast<uint64_t>(unpackedPatch[idx]) >> patchBitSize;
- patch = unpackedPatch[idx] & patchMask;
+ void RleDecoderV2::adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, int64_t* resGap,
+ int64_t* resPatch, uint64_t* patchIdx) {
+ uint64_t idx = *patchIdx;
+ uint64_t gap = static_cast<uint64_t>(unpackedPatch[idx]) >> patchBitSize;
+ int64_t patch = unpackedPatch[idx] & patchMask;
+ int64_t actualGap = 0;
+
+ // special case: gap is >255 then patch value will be 0.
+ // if gap is <=255 then patch value cannot be 0
+ while (gap == 255 && patch == 0) {
+ actualGap += 255;
+ ++idx;
+ gap = static_cast<uint64_t>(unpackedPatch[idx]) >> patchBitSize;
+ patch = unpackedPatch[idx] & patchMask;
+ }
+ // add the left over gap
+ actualGap += gap;
+
+ *resGap = actualGap;
+ *resPatch = patch;
+ *patchIdx = idx;
}
- // add the left over gap
- actualGap += gap;
- *resGap = actualGap;
- *resPatch = patch;
- *patchIdx = idx;
-}
+ template <typename T>
+ uint64_t RleDecoderV2::nextPatched(T* const data, uint64_t offset, uint64_t numValues,
+ const char* const notNull) {
+ if (runRead == runLength) {
+ // extract the number of fixed bits
+ unsigned char fbo = (firstByte >> 1) & 0x1f;
+ uint32_t bitSize = decodeBitWidth(fbo);
+
+ // extract the run length
+ runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
+ runLength |= readByte();
+ // runs are one off
+ runLength += 1;
+ runRead = 0;
+
+ // extract the number of bytes occupied by base
+ uint64_t thirdByte = readByte();
+ uint64_t byteSize = (thirdByte >> 5) & 0x07;
+ // base width is one off
+ byteSize += 1;
+
+ // extract patch width
+ uint32_t pwo = thirdByte & 0x1f;
+ uint32_t patchBitSize = decodeBitWidth(pwo);
+
+ // read fourth byte and extract patch gap width
+ uint64_t fourthByte = readByte();
+ uint32_t pgw = (fourthByte >> 5) & 0x07;
+ // patch gap width is one off
+ pgw += 1;
+
+ // extract the length of the patch list
+ size_t pl = fourthByte & 0x1f;
+ if (pl == 0) {
+ throw ParseError("Corrupt PATCHED_BASE encoded data (pl==0)!");
+ }
-uint64_t RleDecoderV2::nextPatched(int64_t* const data,
- uint64_t offset,
- uint64_t numValues,
- const char* const notNull) {
- if (runRead == runLength) {
- // extract the number of fixed bits
- unsigned char fbo = (firstByte >> 1) & 0x1f;
- uint32_t bitSize = decodeBitWidth(fbo);
-
- // extract the run length
- runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
- runLength |= readByte();
- // runs are one off
- runLength += 1;
- runRead = 0;
-
- // extract the number of bytes occupied by base
- uint64_t thirdByte = readByte();
- uint64_t byteSize = (thirdByte >> 5) & 0x07;
- // base width is one off
- byteSize += 1;
-
- // extract patch width
- uint32_t pwo = thirdByte & 0x1f;
- uint32_t patchBitSize = decodeBitWidth(pwo);
-
- // read fourth byte and extract patch gap width
- uint64_t fourthByte = readByte();
- uint32_t pgw = (fourthByte >> 5) & 0x07;
- // patch gap width is one off
- pgw += 1;
-
- // extract the length of the patch list
- size_t pl = fourthByte & 0x1f;
- if (pl == 0) {
- throw ParseError("Corrupt PATCHED_BASE encoded data (pl==0)!");
- }
+ // read the next base width number of bytes to extract base value
+ int64_t base = readLongBE(byteSize);
+ int64_t mask = (static_cast<int64_t>(1) << ((byteSize * 8) - 1));
+ // if mask of base value is 1 then base is negative value else positive
+ if ((base & mask) != 0) {
+ base = base & ~mask;
+ base = -base;
+ }
- // read the next base width number of bytes to extract base value
- int64_t base = readLongBE(byteSize);
- int64_t mask = (static_cast<int64_t>(1) << ((byteSize * 8) - 1));
- // if mask of base value is 1 then base is negative value else positive
- if ((base & mask) != 0) {
- base = base & ~mask;
- base = -base;
- }
+ readLongs(literals.data(), 0, runLength, bitSize);
+ // any remaining bits are thrown out
+ resetReadLongs();
+
+ // TODO: something more efficient than resize
+ unpackedPatch.resize(pl);
+ // TODO: Skip corrupt?
+ // if ((patchBitSize + pgw) > 64 && !skipCorrupt) {
+ if ((patchBitSize + pgw) > 64) {
+ throw ParseError(
+ "Corrupt PATCHED_BASE encoded data "
+ "(patchBitSize + pgw > 64)!");
+ }
+ uint32_t cfb = getClosestFixedBits(patchBitSize + pgw);
+ readLongs(unpackedPatch.data(), 0, pl, cfb);
+ // any remaining bits are thrown out
+ resetReadLongs();
- readLongs(literals.data(), 0, runLength, bitSize);
- // any remaining bits are thrown out
- resetReadLongs();
-
- // TODO: something more efficient than resize
- unpackedPatch.resize(pl);
- // TODO: Skip corrupt?
- // if ((patchBitSize + pgw) > 64 && !skipCorrupt) {
- if ((patchBitSize + pgw) > 64) {
- throw ParseError("Corrupt PATCHED_BASE encoded data "
- "(patchBitSize + pgw > 64)!");
- }
- uint32_t cfb = getClosestFixedBits(patchBitSize + pgw);
- readLongs(unpackedPatch.data(), 0, pl, cfb);
- // any remaining bits are thrown out
- resetReadLongs();
-
- // apply the patch directly when decoding the packed data
- int64_t patchMask = ((static_cast<int64_t>(1) << patchBitSize) - 1);
-
- int64_t gap = 0;
- int64_t patch = 0;
- uint64_t patchIdx = 0;
- adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx);
-
- for (uint64_t i = 0; i < runLength; ++i) {
- if (static_cast<int64_t>(i) != gap) {
- // no patching required. add base to unpacked value to get final value
- literals[i] += base;
- } else {
- // extract the patch value
- int64_t patchedVal = literals[i] | (patch << bitSize);
+ // apply the patch directly when decoding the packed data
+ int64_t patchMask = ((static_cast<int64_t>(1) << patchBitSize) - 1);
+
+ int64_t gap = 0;
+ int64_t patch = 0;
+ uint64_t patchIdx = 0;
+ adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx);
+
+ for (uint64_t i = 0; i < runLength; ++i) {
+ if (static_cast<int64_t>(i) != gap) {
+ // no patching required. add base to unpacked value to get final value
+ literals[i] += base;
+ } else {
+ // extract the patch value
+ int64_t patchedVal = literals[i] | (patch << bitSize);
- // add base to patched value
- literals[i] = base + patchedVal;
+ // add base to patched value
+ literals[i] = base + patchedVal;
- // increment the patch to point to next entry in patch list
- ++patchIdx;
+ // increment the patch to point to next entry in patch list
+ ++patchIdx;
- if (patchIdx < unpackedPatch.size()) {
- adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch,
- &patchIdx);
+ if (patchIdx < unpackedPatch.size()) {
+ adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx);
- // next gap is relative to the current gap
- gap += i;
+ // next gap is relative to the current gap
+ gap += i;
+ }
}
}
}
+
+ return copyDataFromBuffer(data, offset, numValues, notNull);
}
- return copyDataFromBuffer(data, offset, numValues, notNull);
-}
-
-uint64_t RleDecoderV2::nextDelta(int64_t* const data,
- uint64_t offset,
- uint64_t numValues,
- const char* const notNull) {
- if (runRead == runLength) {
- // extract the number of fixed bits
- unsigned char fbo = (firstByte >> 1) & 0x1f;
- uint32_t bitSize;
- if (fbo != 0) {
- bitSize = decodeBitWidth(fbo);
- } else {
- bitSize = 0;
- }
+ template <typename T>
+ uint64_t RleDecoderV2::nextDelta(T* const data, uint64_t offset, uint64_t numValues,
+ const char* const notNull) {
+ if (runRead == runLength) {
+ // extract the number of fixed bits
+ unsigned char fbo = (firstByte >> 1) & 0x1f;
+ uint32_t bitSize;
+ if (fbo != 0) {
+ bitSize = decodeBitWidth(fbo);
+ } else {
+ bitSize = 0;
+ }
- // extract the run length
- runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
- runLength |= readByte();
- ++runLength; // account for first value
- runRead = 0;
+ // extract the run length
+ runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
+ runLength |= readByte();
+ ++runLength; // account for first value
+ runRead = 0;
- int64_t prevValue;
- // read the first value stored as vint
- if (isSigned) {
- prevValue = readVslong();
- } else {
- prevValue = static_cast<int64_t>(readVulong());
- }
+ int64_t prevValue;
+ // read the first value stored as vint
+ if (isSigned) {
+ prevValue = readVslong();
+ } else {
+ prevValue = static_cast<int64_t>(readVulong());
+ }
- literals[0] = prevValue;
+ literals[0] = prevValue;
- // read the fixed delta value stored as vint (deltas can be negative even
- // if all number are positive)
- int64_t deltaBase = readVslong();
+ // read the fixed delta value stored as vint (deltas can be negative even
+ // if all number are positive)
+ int64_t deltaBase = readVslong();
- if (bitSize == 0) {
- // add fixed deltas to adjacent values
- for (uint64_t i = 1; i < runLength; ++i) {
- literals[i] = literals[i - 1] + deltaBase;
- }
- } else {
- prevValue = literals[1] = prevValue + deltaBase;
- if (runLength < 2) {
- std::stringstream ss;
- ss << "Illegal run length for delta encoding: " << runLength;
- throw ParseError(ss.str());
- }
- // write the unpacked values, add it to previous value and store final
- // value to result buffer. if the delta base value is negative then it
- // is a decreasing sequence else an increasing sequence.
- // read deltas using the literals buffer.
- readLongs(literals.data(), 2, runLength - 2, bitSize);
- if (deltaBase < 0) {
- for (uint64_t i = 2; i < runLength; ++i) {
- prevValue = literals[i] = prevValue - literals[i];
+ if (bitSize == 0) {
+ // add fixed deltas to adjacent values
+ for (uint64_t i = 1; i < runLength; ++i) {
+ literals[i] = literals[i - 1] + deltaBase;
}
} else {
- for (uint64_t i = 2; i < runLength; ++i) {
- prevValue = literals[i] = prevValue + literals[i];
+ prevValue = literals[1] = prevValue + deltaBase;
+ if (runLength < 2) {
+ std::stringstream ss;
+ ss << "Illegal run length for delta encoding: " << runLength;
+ throw ParseError(ss.str());
+ }
+ // write the unpacked values, add it to previous value and store final
+ // value to result buffer. if the delta base value is negative then it
+ // is a decreasing sequence else an increasing sequence.
+ // read deltas using the literals buffer.
+ readLongs(literals.data(), 2, runLength - 2, bitSize);
+ if (deltaBase < 0) {
+ for (uint64_t i = 2; i < runLength; ++i) {
+ prevValue = literals[i] = prevValue - literals[i];
+ }
+ } else {
+ for (uint64_t i = 2; i < runLength; ++i) {
+ prevValue = literals[i] = prevValue + literals[i];
+ }
}
}
}
- }
- return copyDataFromBuffer(data, offset, numValues, notNull);
-}
+ return copyDataFromBuffer(data, offset, numValues, notNull);
+ }
-uint64_t RleDecoderV2::copyDataFromBuffer(int64_t* data, uint64_t offset,
- uint64_t numValues, const char* notNull) {
- uint64_t nRead = std::min(runLength - runRead, numValues);
- if (notNull) {
- for (uint64_t i = offset; i < (offset + nRead); ++i) {
- if (notNull[i]) {
- data[i] = literals[runRead++];
+ template <typename T>
+ uint64_t RleDecoderV2::copyDataFromBuffer(T* data, uint64_t offset, uint64_t numValues,
+ const char* notNull) {
+ uint64_t nRead = std::min(runLength - runRead, numValues);
+ if (notNull) {
+ for (uint64_t i = offset; i < (offset + nRead); ++i) {
+ if (notNull[i]) {
+ data[i] = static_cast<T>(literals[runRead++]);
+ }
+ }
+ } else {
+ for (uint64_t i = offset; i < (offset + nRead); ++i) {
+ data[i] = static_cast<T>(literals[runRead++]);
}
}
- } else {
- memcpy(data + offset, literals.data() + runRead, nRead * sizeof(int64_t));
- runRead += nRead;
+ return nRead;
}
- return nRead;
-}
} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc
index 4e7a145a5a..a75aeac2eb 100644
--- a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc
+++ b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc
@@ -1,133 +1,135 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
- * distributed with option work for additional information
- * regarding copyright ownership. The ASF licenses option file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
- * "License"); you may not use option file except in compliance
+ * "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
*/
#include "Adaptor.hh"
#include "Compression.hh"
-#include "RLEv2.hh"
#include "RLEV2Util.hh"
+#include "RLEv2.hh"
#define MAX_SHORT_REPEAT_LENGTH 10
namespace orc {
-/**
- * Compute the bits required to represent pth percentile value
- * @param data - array
- * @param p - percentile value (>=0.0 to <=1.0)
- * @return pth percentile bits
- */
-uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) {
+ /**
+ * Compute the bits required to represent pth percentile value
+ * @param data - array
+ * @param p - percentile value (>=0.0 to <=1.0)
+ * @return pth percentile bits
+ */
+ uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p,
+ bool reuseHist) {
if ((p > 1.0) || (p <= 0.0)) {
- throw InvalidArgument("Invalid p value: " + to_string(p));
+ throw InvalidArgument("Invalid p value: " + to_string(p));
}
if (!reuseHist) {
- // histogram that store the encoded bit requirement for each values.
- // maximum number of bits that can encoded is 32 (refer FixedBitSizes)
- memset(histgram, 0, FixedBitSizes::SIZE * sizeof(int32_t));
- // compute the histogram
- for(size_t i = offset; i < (offset + length); i++) {
- uint32_t idx = encodeBitWidth(findClosestNumBits(data[i]));
- histgram[idx] += 1;
- }
+ // histogram that store the encoded bit requirement for each values.
+ // maximum number of bits that can encoded is 32 (refer FixedBitSizes)
+ memset(histgram, 0, FixedBitSizes::SIZE * sizeof(int32_t));
+ // compute the histogram
+ for (size_t i = offset; i < (offset + length); i++) {
+ uint32_t idx = encodeBitWidth(findClosestNumBits(data[i]));
+ histgram[idx] += 1;
+ }
}
int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * (1.0 - p));
// return the bits required by pth percentile length
- for(int32_t i = HIST_LEN - 1; i >= 0; i--) {
- perLen -= histgram[i];
- if (perLen < 0) {
- return decodeBitWidth(static_cast<uint32_t>(i));
- }
+ for (int32_t i = HIST_LEN - 1; i >= 0; i--) {
+ perLen -= histgram[i];
+ if (perLen < 0) {
+ return decodeBitWidth(static_cast<uint32_t>(i));
+ }
}
return 0;
-}
+ }
-RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream,
- bool hasSigned, bool alignBitPacking) :
- RleEncoder(std::move(outStream), hasSigned),
+ RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned,
+ bool alignBitPacking)
+ : RleEncoder(std::move(outStream), hasSigned),
alignedBitPacking(alignBitPacking),
- prevDelta(0){
+ prevDelta(0) {
literals = new int64_t[MAX_LITERAL_SIZE];
gapVsPatchList = new int64_t[MAX_LITERAL_SIZE];
zigzagLiterals = hasSigned ? new int64_t[MAX_LITERAL_SIZE] : nullptr;
baseRedLiterals = new int64_t[MAX_LITERAL_SIZE];
adjDeltas = new int64_t[MAX_LITERAL_SIZE];
-}
+ }
-void RleEncoderV2::write(int64_t val) {
- if(numLiterals == 0) {
- initializeLiterals(val);
- return;
+ void RleEncoderV2::write(int64_t val) {
+ if (numLiterals == 0) {
+ initializeLiterals(val);
+ return;
}
- if(numLiterals == 1) {
- prevDelta = val - literals[0];
- literals[numLiterals++] = val;
+ if (numLiterals == 1) {
+ prevDelta = val - literals[0];
+ literals[numLiterals++] = val;
- if(val == literals[0]) {
- fixedRunLength = 2;
- variableRunLength = 0;
- } else {
- fixedRunLength = 0;
- variableRunLength = 2;
- }
- return;
+ if (val == literals[0]) {
+ fixedRunLength = 2;
+ variableRunLength = 0;
+ } else {
+ fixedRunLength = 0;
+ variableRunLength = 2;
+ }
+ return;
}
int64_t currentDelta = val - literals[numLiterals - 1];
EncodingOption option = {};
if (prevDelta == 0 && currentDelta == 0) {
- // case 1: fixed delta run
- literals[numLiterals++] = val;
-
- if (variableRunLength > 0) {
- // if variable run is non-zero then we are seeing repeating
- // values at the end of variable run in which case fixed Run
- // length is 2
- fixedRunLength = 2;
- }
- fixedRunLength++;
-
- // if fixed run met the minimum condition and if variable
- // run is non-zero then flush the variable run and shift the
- // tail fixed runs to start of the buffer
- if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
- numLiterals -= MIN_REPEAT;
- variableRunLength -= (MIN_REPEAT - 1);
-
- determineEncoding(option);
- writeValues(option);
-
- // shift tail fixed runs to beginning of the buffer
- for (size_t i = 0; i < MIN_REPEAT; ++i) {
- literals[i] = val;
- }
- numLiterals = MIN_REPEAT;
- }
+ // case 1: fixed delta run
+ literals[numLiterals++] = val;
+
+ if (variableRunLength > 0) {
+ // if variable run is non-zero then we are seeing repeating
+ // values at the end of variable run in which case fixed Run
+ // length is 2
+ fixedRunLength = 2;
+ }
+ fixedRunLength++;
+
+ // if fixed run met the minimum condition and if variable
+ // run is non-zero then flush the variable run and shift the
+ // tail fixed runs to start of the buffer
+ if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
+ numLiterals -= MIN_REPEAT;
+ variableRunLength -= (MIN_REPEAT - 1);
+
+ determineEncoding(option);
+ writeValues(option);
- if (fixedRunLength == MAX_LITERAL_SIZE) {
- option.encoding = DELTA;
- option.isFixedDelta = true;
- writeValues(option);
+ // shift tail fixed runs to beginning of the buffer
+ for (size_t i = 0; i < MIN_REPEAT; ++i) {
+ literals[i] = val;
}
- return;
+ numLiterals = MIN_REPEAT;
+ }
+
+ if (fixedRunLength == MAX_LITERAL_SIZE) {
+ option.encoding = DELTA;
+ option.isFixedDelta = true;
+ writeValues(option);
+ }
+ return;
}
// case 2: variable delta run
@@ -136,45 +138,45 @@ void RleEncoderV2::write(int64_t val) {
// short repeat conditions then write the values as short repeats
// else use delta encoding
if (fixedRunLength >= MIN_REPEAT) {
- if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
- option.encoding = SHORT_REPEAT;
- } else {
- option.encoding = DELTA;
- option.isFixedDelta = true;
- }
- writeValues(option);
+ if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
+ option.encoding = SHORT_REPEAT;
+ } else {
+ option.encoding = DELTA;
+ option.isFixedDelta = true;
+ }
+ writeValues(option);
}
// if fixed run length is <MIN_REPEAT and current value is
// different from previous then treat it as variable run
if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) {
- variableRunLength = fixedRunLength;
- fixedRunLength = 0;
+ variableRunLength = fixedRunLength;
+ fixedRunLength = 0;
}
// after writing values re-initialize the variables
if (numLiterals == 0) {
- initializeLiterals(val);
+ initializeLiterals(val);
} else {
- prevDelta = val - literals[numLiterals - 1];
- literals[numLiterals++] = val;
- variableRunLength++;
+ prevDelta = val - literals[numLiterals - 1];
+ literals[numLiterals++] = val;
+ variableRunLength++;
- if (variableRunLength == MAX_LITERAL_SIZE) {
- determineEncoding(option);
- writeValues(option);
- }
+ if (variableRunLength == MAX_LITERAL_SIZE) {
+ determineEncoding(option);
+ writeValues(option);
+ }
}
-}
+ }
-void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) {
- assert (isSigned);
+ void RleEncoderV2::computeZigZagLiterals(EncodingOption& option) {
+ assert(isSigned);
for (size_t i = 0; i < numLiterals; i++) {
- zigzagLiterals[option.zigzagLiteralsCount++] = zigZag(literals[i]);
+ zigzagLiterals[option.zigzagLiteralsCount++] = zigZag(literals[i]);
}
-}
+ }
-void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
+ void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
// mask will be max value beyond which patch will be generated
int64_t mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1;
@@ -190,9 +192,9 @@ void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
// gap and patch together in a long. To make sure gap and patch can be
// packed together adjust the patch width
if (option.patchWidth == 64) {
- option.patchWidth = 56;
- option.brBits95p = 8;
- mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1;
+ option.patchWidth = 56;
+ option.brBits95p = 8;
+ mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1;
}
uint32_t gapIdx = 0;
@@ -203,27 +205,27 @@ void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
std::vector<int64_t> gapList;
std::vector<int64_t> patchList;
- for(size_t i = 0; i < numLiterals; i++) {
- // if value is above mask then create the patch and record the gap
- if (baseRedLiterals[i] > mask) {
- size_t gap = i - prev;
- if (gap > maxGap) {
- maxGap = gap;
- }
-
- // gaps are relative, so store the previous patched value index
- prev = i;
- gapList.push_back(static_cast<int64_t>(gap));
- gapIdx++;
-
- // extract the most significant bits that are over mask bits
- int64_t patch = baseRedLiterals[i] >> option.brBits95p;
- patchList.push_back(patch);
- patchIdx++;
-
- // strip off the MSB to enable safe bit packing
- baseRedLiterals[i] &= mask;
+ for (size_t i = 0; i < numLiterals; i++) {
+ // if value is above mask then create the patch and record the gap
+ if (baseRedLiterals[i] > mask) {
+ size_t gap = i - prev;
+ if (gap > maxGap) {
+ maxGap = gap;
}
+
+ // gaps are relative, so store the previous patched value index
+ prev = i;
+ gapList.push_back(static_cast<int64_t>(gap));
+ gapIdx++;
+
+ // extract the most significant bits that are over mask bits
+ int64_t patch = baseRedLiterals[i] >> option.brBits95p;
+ patchList.push_back(patch);
+ patchIdx++;
+
+ // strip off the MSB to enable safe bit packing
+ baseRedLiterals[i] &= mask;
+ }
}
// adjust the patch length to number of entries in gap list
@@ -232,9 +234,9 @@ void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
// if the element to be patched is the first and only element then
// max gap will be 0, but to store the gap as 0 we need atleast 1 bit
if (maxGap == 0 && option.patchLength != 0) {
- option.patchGapWidth = 1;
+ option.patchGapWidth = 1;
} else {
- option.patchGapWidth = findClosestNumBits(static_cast<int64_t>(maxGap));
+ option.patchGapWidth = findClosestNumBits(static_cast<int64_t>(maxGap));
}
// special case: if the patch gap width is greater than 256, then
@@ -250,58 +252,58 @@ void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
// 255 gap width => 0 for patch value
// 1 gap width => actual patch value
if (option.patchGapWidth > 8) {
- option.patchGapWidth = 8;
- // for gap = 511, we need two additional entries in patch list
- if (maxGap == 511) {
- option.patchLength += 2;
- } else {
- option.patchLength += 1;
- }
+ option.patchGapWidth = 8;
+ // for gap = 511, we need two additional entries in patch list
+ if (maxGap == 511) {
+ option.patchLength += 2;
+ } else {
+ option.patchLength += 1;
+ }
}
// create gap vs patch list
gapIdx = 0;
patchIdx = 0;
- for(size_t i = 0; i < option.patchLength; i++) {
- int64_t g = gapList[gapIdx++];
- int64_t p = patchList[patchIdx++];
- while (g > 255) {
- gapVsPatchList[option.gapVsPatchListCount++] = (255L << option.patchWidth);
- i++;
- g -= 255;
- }
+ for (size_t i = 0; i < option.patchLength; i++) {
+ int64_t g = gapList[gapIdx++];
+ int64_t p = patchList[patchIdx++];
+ while (g > 255) {
+ gapVsPatchList[option.gapVsPatchListCount++] = (255L << option.patchWidth);
+ i++;
+ g -= 255;
+ }
- // store patch value in LSBs and gap in MSBs
- gapVsPatchList[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p);
+ // store patch value in LSBs and gap in MSBs
+ gapVsPatchList[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p);
}
-}
+ }
-/**
- * Prepare for Direct or PatchedBase encoding
- * compute zigZagLiterals and zzBits100p (Max number of encoding bits required)
- * @return zigzagLiterals
- */
-int64_t* RleEncoderV2::prepareForDirectOrPatchedBase(EncodingOption& option) {
+ /**
+ * Prepare for Direct or PatchedBase encoding
+ * compute zigZagLiterals and zzBits100p (Max number of encoding bits required)
+ * @return zigzagLiterals
+ */
+ int64_t* RleEncoderV2::prepareForDirectOrPatchedBase(EncodingOption& option) {
if (isSigned) {
- computeZigZagLiterals(option);
+ computeZigZagLiterals(option);
}
int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
return currentZigzagLiterals;
-}
+ }
-void RleEncoderV2::determineEncoding(EncodingOption& option) {
+ void RleEncoderV2::determineEncoding(EncodingOption& option) {
// We need to compute zigzag values for DIRECT and PATCHED_BASE encodings,
// but not for SHORT_REPEAT or DELTA. So we only perform the zigzag
// computation when it's determined to be necessary.
// not a big win for shorter runs to determine encoding
if (numLiterals <= MIN_REPEAT) {
- // we need to compute zigzag values for DIRECT encoding if we decide to
- // break early for delta overflows or for shorter runs
- prepareForDirectOrPatchedBase(option);
- option.encoding = DIRECT;
- return;
+ // we need to compute zigzag values for DIRECT encoding if we decide to
+ // break early for delta overflows or for shorter runs
+ prepareForDirectOrPatchedBase(option);
+ option.encoding = DIRECT;
+ return;
}
// DELTA encoding check
@@ -319,29 +321,29 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) {
adjDeltas[option.adjDeltasCount++] = initialDelta;
for (size_t i = 1; i < numLiterals; i++) {
- const int64_t l1 = literals[i];
- const int64_t l0 = literals[i - 1];
- currDelta = l1 - l0;
- option.min = std::min(option.min, l1);
- max = std::max(max, l1);
-
- isIncreasing &= (l0 <= l1);
- isDecreasing &= (l0 >= l1);
-
- option.isFixedDelta &= (currDelta == initialDelta);
- if (i > 1) {
- adjDeltas[option.adjDeltasCount++] = std::abs(currDelta);
- deltaMax = std::max(deltaMax, adjDeltas[i - 1]);
- }
+ const int64_t l1 = literals[i];
+ const int64_t l0 = literals[i - 1];
+ currDelta = l1 - l0;
+ option.min = std::min(option.min, l1);
+ max = std::max(max, l1);
+
+ isIncreasing &= (l0 <= l1);
+ isDecreasing &= (l0 >= l1);
+
+ option.isFixedDelta &= (currDelta == initialDelta);
+ if (i > 1) {
+ adjDeltas[option.adjDeltasCount++] = std::abs(currDelta);
+ deltaMax = std::max(deltaMax, adjDeltas[i - 1]);
+ }
}
// it's faster to exit under delta overflow condition without checking for
// PATCHED_BASE condition as encoding using DIRECT is faster and has less
// overhead than PATCHED_BASE
if (!isSafeSubtract(max, option.min)) {
- prepareForDirectOrPatchedBase(option);
- option.encoding = DIRECT;
- return;
+ prepareForDirectOrPatchedBase(option);
+ option.encoding = DIRECT;
+ return;
}
// invariant - subtracting any number from any other in the literals after
@@ -350,42 +352,42 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) {
// if min is equal to max then the delta is 0, option condition happens for
// fixed values run >10 which cannot be encoded with SHORT_REPEAT
if (option.min == max) {
- if (!option.isFixedDelta) {
- throw InvalidArgument(to_string(option.min) + "==" +
- to_string(max) + ", isFixedDelta cannot be false");
- }
+ if (!option.isFixedDelta) {
+ throw InvalidArgument(to_string(option.min) + "==" + to_string(max) +
+ ", isFixedDelta cannot be false");
+ }
- if(currDelta != 0) {
- throw InvalidArgument(to_string(option.min) + "==" +
- to_string(max) + ", currDelta should be zero");
- }
- option.fixedDelta = 0;
- option.encoding = DELTA;
- return;
+ if (currDelta != 0) {
+ throw InvalidArgument(to_string(option.min) + "==" + to_string(max) +
+ ", currDelta should be zero");
+ }
+ option.fixedDelta = 0;
+ option.encoding = DELTA;
+ return;
}
if (option.isFixedDelta) {
- if (currDelta != initialDelta) {
- throw InvalidArgument("currDelta should be equal to initialDelta for fixed delta encoding");
- }
+ if (currDelta != initialDelta) {
+ throw InvalidArgument("currDelta should be equal to initialDelta for fixed delta encoding");
+ }
- option.encoding = DELTA;
- option.fixedDelta = currDelta;
- return;
+ option.encoding = DELTA;
+ option.fixedDelta = currDelta;
+ return;
}
// if initialDelta is 0 then we cannot delta encode as we cannot identify
// the sign of deltas (increasing or decreasing)
if (initialDelta != 0) {
- // stores the number of bits required for packing delta blob in
- // delta encoding
- option.bitsDeltaMax = findClosestNumBits(deltaMax);
-
- // monotonic condition
- if (isIncreasing || isDecreasing) {
- option.encoding = DELTA;
- return;
- }
+ // stores the number of bits required for packing delta blob in
+ // delta encoding
+ option.bitsDeltaMax = findClosestNumBits(deltaMax);
+
+ // monotonic condition
+ if (isIncreasing || isDecreasing) {
+ option.encoding = DELTA;
+ return;
+ }
}
// PATCHED_BASE encoding check
@@ -402,106 +404,105 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) {
// if the difference between 90th percentile and 100th percentile fixed
// bits is > 1 then we need patch the values
if (diffBitsLH > 1) {
+ // patching is done only on base reduced values.
+ // remove base from literals
+ for (size_t i = 0; i < numLiterals; i++) {
+ baseRedLiterals[option.baseRedLiteralsCount++] = (literals[i] - option.min);
+ }
- // patching is done only on base reduced values.
- // remove base from literals
- for (size_t i = 0; i < numLiterals; i++) {
- baseRedLiterals[option.baseRedLiteralsCount++] = (literals[i] - option.min);
- }
-
- // 95th percentile width is used to determine max allowed value
- // after which patching will be done
- option.brBits95p = percentileBits(baseRedLiterals, 0, numLiterals, 0.95);
-
- // 100th percentile is used to compute the max patch width
- option.brBits100p = percentileBits(baseRedLiterals, 0, numLiterals, 1.0, true);
-
- // after base reducing the values, if the difference in bits between
- // 95th percentile and 100th percentile value is zero then there
- // is no point in patching the values, in which case we will
- // fallback to DIRECT encoding.
- // The decision to use patched base was based on zigzag values, but the
- // actual patching is done on base reduced literals.
- if ((option.brBits100p - option.brBits95p) != 0) {
- option.encoding = PATCHED_BASE;
- preparePatchedBlob(option);
- return;
- } else {
- option.encoding = DIRECT;
- return;
- }
- } else {
- // if difference in bits between 95th percentile and 100th percentile is
- // 0, then patch length will become 0. Hence we will fallback to direct
+ // 95th percentile width is used to determine max allowed value
+ // after which patching will be done
+ option.brBits95p = percentileBits(baseRedLiterals, 0, numLiterals, 0.95);
+
+ // 100th percentile is used to compute the max patch width
+ option.brBits100p = percentileBits(baseRedLiterals, 0, numLiterals, 1.0, true);
+
+ // after base reducing the values, if the difference in bits between
+ // 95th percentile and 100th percentile value is zero then there
+ // is no point in patching the values, in which case we will
+ // fallback to DIRECT encoding.
+ // The decision to use patched base was based on zigzag values, but the
+ // actual patching is done on base reduced literals.
+ if ((option.brBits100p - option.brBits95p) != 0) {
+ option.encoding = PATCHED_BASE;
+ preparePatchedBlob(option);
+ return;
+ } else {
option.encoding = DIRECT;
return;
+ }
+ } else {
+ // if difference in bits between 95th percentile and 100th percentile is
+ // 0, then patch length will become 0. Hence we will fallback to direct
+ option.encoding = DIRECT;
+ return;
}
-}
+ }
-uint64_t RleEncoderV2::flush() {
+ uint64_t RleEncoderV2::flush() {
if (numLiterals != 0) {
- EncodingOption option = {};
- if (variableRunLength != 0) {
- determineEncoding(option);
- writeValues(option);
- } else if (fixedRunLength != 0) {
- if (fixedRunLength < MIN_REPEAT) {
- variableRunLength = fixedRunLength;
- fixedRunLength = 0;
- determineEncoding(option);
- writeValues(option);
- } else if (fixedRunLength >= MIN_REPEAT
- && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
- option.encoding = SHORT_REPEAT;
- writeValues(option);
- } else {
- option.encoding = DELTA;
- option.isFixedDelta = true;
- writeValues(option);
- }
+ EncodingOption option = {};
+ if (variableRunLength != 0) {
+ determineEncoding(option);
+ writeValues(option);
+ } else if (fixedRunLength != 0) {
+ if (fixedRunLength < MIN_REPEAT) {
+ variableRunLength = fixedRunLength;
+ fixedRunLength = 0;
+ determineEncoding(option);
+ writeValues(option);
+ } else if (fixedRunLength >= MIN_REPEAT && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
+ option.encoding = SHORT_REPEAT;
+ writeValues(option);
+ } else {
+ option.encoding = DELTA;
+ option.isFixedDelta = true;
+ writeValues(option);
}
+ }
}
outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition));
uint64_t dataSize = outputStream->flush();
bufferLength = bufferPosition = 0;
return dataSize;
-}
+ }
-void RleEncoderV2::writeValues(EncodingOption& option) {
+ void RleEncoderV2::writeValues(EncodingOption& option) {
if (numLiterals != 0) {
- switch (option.encoding) {
- case SHORT_REPEAT:
- writeShortRepeatValues(option);
- break;
- case DIRECT:
- writeDirectValues(option);
- break;
- case PATCHED_BASE:
- writePatchedBasedValues(option);
- break;
- case DELTA:
- writeDeltaValues(option);
- break;
- default:
- throw NotImplementedYet("Not implemented yet");
- }
+ switch (option.encoding) {
+ case SHORT_REPEAT:
+ writeShortRepeatValues(option);
+ break;
+ case DIRECT:
+ writeDirectValues(option);
+ break;
+ case PATCHED_BASE:
+ writePatchedBasedValues(option);
+ break;
+ case DELTA:
+ writeDeltaValues(option);
+ break;
+ default:
+ throw NotImplementedYet("Not implemented yet");
+ }
- numLiterals = 0;
- prevDelta = 0;
+ numLiterals = 0;
+ prevDelta = 0;
}
-}
+ }
-void RleEncoderV2::writeShortRepeatValues(EncodingOption&) {
+ void RleEncoderV2::writeShortRepeatValues(EncodingOption&) {
int64_t repeatVal;
if (isSigned) {
- repeatVal = zigZag(literals[0]);
+ repeatVal = zigZag(literals[0]);
} else {
- repeatVal = literals[0];
+ repeatVal = literals[0];
}
const uint32_t numBitsRepeatVal = findClosestNumBits(repeatVal);
- const uint32_t numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? (numBitsRepeatVal >> 3) : ((numBitsRepeatVal >> 3) + 1);
+ const uint32_t numBytesRepeatVal =
+ numBitsRepeatVal % 8 == 0 ? (numBitsRepeatVal >> 3) : ((numBitsRepeatVal >> 3) + 1);
uint32_t header = getOpCode(SHORT_REPEAT);
@@ -511,19 +512,19 @@ void RleEncoderV2::writeShortRepeatValues(EncodingOption&) {
writeByte(static_cast<char>(header));
- for(int32_t i = static_cast<int32_t>(numBytesRepeatVal - 1); i >= 0; i--) {
- int64_t b = ((repeatVal >> (i * 8)) & 0xff);
- writeByte(static_cast<char>(b));
+ for (int32_t i = static_cast<int32_t>(numBytesRepeatVal - 1); i >= 0; i--) {
+ int64_t b = ((repeatVal >> (i * 8)) & 0xff);
+ writeByte(static_cast<char>(b));
}
fixedRunLength = 0;
-}
+ }
-void RleEncoderV2::writeDirectValues(EncodingOption& option) {
+ void RleEncoderV2::writeDirectValues(EncodingOption& option) {
// write the number of fixed bits required in next 5 bits
uint32_t fb = option.zzBits100p;
if (alignedBitPacking) {
- fb = getClosestAlignedFixedBits(fb);
+ fb = getClosestAlignedFixedBits(fb);
}
const uint32_t efb = encodeBitWidth(fb) << 1;
@@ -550,9 +551,9 @@ void RleEncoderV2::writeDirectValues(EncodingOption& option) {
// reset run length
variableRunLength = 0;
-}
+ }
-void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) {
+ void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) {
// NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding
// because patch is applied to MSB bits. For example: If fixed bit width of
// base value is 7 bits and if patch is 3 bits, the actual value is
@@ -578,7 +579,7 @@ void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) {
// if the min value is negative toggle the sign
const bool isNegative = (option.min < 0);
if (isNegative) {
- option.min = -option.min;
+ option.min = -option.min;
}
// find the number of bytes required for base and shift it by 5 bits
@@ -590,7 +591,7 @@ void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) {
// if the base value is negative then set MSB to 1
if (isNegative) {
- option.min |= (1LL << ((baseBytes * 8) - 1));
+ option.min |= (1LL << ((baseBytes * 8) - 1));
}
// third byte contains 3 bits for number of bytes occupied by base
@@ -599,7 +600,8 @@ void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) {
// fourth byte contains 3 bits for page gap width and 5 bits for
// patch length
- const char headerFourthByte = static_cast<char>((option.patchGapWidth - 1) << 5 | option.patchLength);
+ const char headerFourthByte =
+ static_cast<char>((option.patchGapWidth - 1) << 5 | option.patchLength);
// write header
writeByte(headerFirstByte);
@@ -608,9 +610,9 @@ void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) {
writeByte(headerFourthByte);
// write the base value using fixed bytes in big endian order
- for(int32_t i = static_cast<int32_t>(baseBytes - 1); i >= 0; i--) {
- char b = static_cast<char>(((option.min >> (i * 8)) & 0xff));
- writeByte(b);
+ for (int32_t i = static_cast<int32_t>(baseBytes - 1); i >= 0; i--) {
+ char b = static_cast<char>(((option.min >> (i * 8)) & 0xff));
+ writeByte(b);
}
// base reduced literals are bit packed
@@ -625,39 +627,39 @@ void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) {
// reset run length
variableRunLength = 0;
-}
+ }
-void RleEncoderV2::writeDeltaValues(EncodingOption& option) {
+ void RleEncoderV2::writeDeltaValues(EncodingOption& option) {
uint32_t len = 0;
uint32_t fb = option.bitsDeltaMax;
uint32_t efb = 0;
if (alignedBitPacking) {
- fb = getClosestAlignedFixedBits(fb);
+ fb = getClosestAlignedFixedBits(fb);
}
if (option.isFixedDelta) {
- // if fixed run length is greater than threshold then it will be fixed
- // delta sequence with delta value 0 else fixed delta sequence with
- // non-zero delta value
- if (fixedRunLength > MIN_REPEAT) {
- // ex. sequence: 2 2 2 2 2 2 2 2
- len = fixedRunLength - 1;
- fixedRunLength = 0;
- } else {
- // ex. sequence: 4 6 8 10 12 14 16
- len = variableRunLength - 1;
- variableRunLength = 0;
- }
- } else {
- // fixed width 0 is used for long repeating values.
- // sequences that require only 1 bit to encode will have an additional bit
- if (fb == 1) {
- fb = 2;
- }
- efb = encodeBitWidth(fb) << 1;
+ // if fixed run length is greater than threshold then it will be fixed
+ // delta sequence with delta value 0 else fixed delta sequence with
+ // non-zero delta value
+ if (fixedRunLength > MIN_REPEAT) {
+ // ex. sequence: 2 2 2 2 2 2 2 2
+ len = fixedRunLength - 1;
+ fixedRunLength = 0;
+ } else {
+ // ex. sequence: 4 6 8 10 12 14 16
len = variableRunLength - 1;
variableRunLength = 0;
+ }
+ } else {
+ // fixed width 0 is used for long repeating values.
+ // sequences that require only 1 bit to encode will have an additional bit
+ if (fb == 1) {
+ fb = 2;
+ }
+ efb = encodeBitWidth(fb) << 1;
+ len = variableRunLength - 1;
+ variableRunLength = 0;
}
// extract the 9th bit of run length
@@ -675,106 +677,106 @@ void RleEncoderV2::writeDeltaValues(EncodingOption& option) {
// store the first value from zigzag literal array
if (isSigned) {
- writeVslong(literals[0]);
+ writeVslong(literals[0]);
} else {
- writeVulong(literals[0]);
+ writeVulong(literals[0]);
}
if (option.isFixedDelta) {
- // if delta is fixed then we don't need to store delta blob
- writeVslong(option.fixedDelta);
+ // if delta is fixed then we don't need to store delta blob
+ writeVslong(option.fixedDelta);
} else {
- // store the first value as delta value using zigzag encoding
- writeVslong(adjDeltas[0]);
+ // store the first value as delta value using zigzag encoding
+ writeVslong(adjDeltas[0]);
- // adjacent delta values are bit packed. The length of adjDeltas array is
- // always one less than the number of literals (delta difference for n
- // elements is n-1). We have already written one element, write the
- // remaining numLiterals - 2 elements here
- writeInts(adjDeltas, 1, numLiterals - 2, fb);
+ // adjacent delta values are bit packed. The length of adjDeltas array is
+ // always one less than the number of literals (delta difference for n
+ // elements is n-1). We have already written one element, write the
+ // remaining numLiterals - 2 elements here
+ writeInts(adjDeltas, 1, numLiterals - 2, fb);
}
-}
+ }
-void RleEncoderV2::writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize) {
- if(input == nullptr || len < 1 || bitSize < 1) {
+ void RleEncoderV2::writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize) {
+ if (input == nullptr || len < 1 || bitSize < 1) {
return;
- }
+ }
- if (getClosestAlignedFixedBits(bitSize) == bitSize) {
- uint32_t numBytes;
- uint32_t endOffSet = static_cast<uint32_t>(offset + len);
- if (bitSize < 8 ) {
- char bitMask = static_cast<char>((1 << bitSize) - 1);
- uint32_t numHops = 8 / bitSize;
- uint32_t remainder = static_cast<uint32_t>(len % numHops);
- uint32_t endUnroll = endOffSet - remainder;
- for (uint32_t i = offset; i < endUnroll; i+=numHops) {
- char toWrite = 0;
- for (uint32_t j = 0; j < numHops; ++j) {
- toWrite |= static_cast<char>((input[i+j] & bitMask) << (8 - (j + 1) * bitSize));
+ if (getClosestAlignedFixedBits(bitSize) == bitSize) {
+ uint32_t numBytes;
+ uint32_t endOffSet = static_cast<uint32_t>(offset + len);
+ if (bitSize < 8) {
+ char bitMask = static_cast<char>((1 << bitSize) - 1);
+ uint32_t numHops = 8 / bitSize;
+ uint32_t remainder = static_cast<uint32_t>(len % numHops);
+ uint32_t endUnroll = endOffSet - remainder;
+ for (uint32_t i = offset; i < endUnroll; i += numHops) {
+ char toWrite = 0;
+ for (uint32_t j = 0; j < numHops; ++j) {
+ toWrite |= static_cast<char>((input[i + j] & bitMask) << (8 - (j + 1) * bitSize));
+ }
+ writeByte(toWrite);
}
- writeByte(toWrite);
- }
- if (remainder > 0) {
- uint32_t startShift = 8 - bitSize;
- char toWrite = 0;
- for (uint32_t i = endUnroll; i < endOffSet; ++i) {
- toWrite |= static_cast<char>((input[i] & bitMask) << startShift);
- startShift -= bitSize;
+ if (remainder > 0) {
+ uint32_t startShift = 8 - bitSize;
+ char toWrite = 0;
+ for (uint32_t i = endUnroll; i < endOffSet; ++i) {
+ toWrite |= static_cast<char>((input[i] & bitMask) << startShift);
+ startShift -= bitSize;
+ }
+ writeByte(toWrite);
}
- writeByte(toWrite);
- }
- } else {
- numBytes = bitSize / 8;
+ } else {
+ numBytes = bitSize / 8;
- for (uint32_t i = offset; i < endOffSet; ++i) {
- for (uint32_t j = 0; j < numBytes; ++j) {
- char toWrite = static_cast<char>((input[i] >> (8 * (numBytes - j - 1))) & 255);
- writeByte(toWrite);
+ for (uint32_t i = offset; i < endOffSet; ++i) {
+ for (uint32_t j = 0; j < numBytes; ++j) {
+ char toWrite = static_cast<char>((input[i] >> (8 * (numBytes - j - 1))) & 255);
+ writeByte(toWrite);
+ }
}
}
- }
- return;
- }
+ return;
+ }
- // write for unaligned bit size
- uint32_t bitsLeft = 8;
- char current = 0;
- for(uint32_t i = offset; i < (offset + len); i++) {
- int64_t value = input[i];
- uint32_t bitsToWrite = bitSize;
- while (bitsToWrite > bitsLeft) {
- // add the bits to the bottom of the current word
- current |= static_cast<char>(value >> (bitsToWrite - bitsLeft));
- // subtract out the bits we just added
- bitsToWrite -= bitsLeft;
- // zero out the bits above bitsToWrite
- value &= (static_cast<uint64_t>(1) << bitsToWrite) - 1;
- writeByte(current);
- current = 0;
- bitsLeft = 8;
+ // write for unaligned bit size
+ uint32_t bitsLeft = 8;
+ char current = 0;
+ for (uint32_t i = offset; i < (offset + len); i++) {
+ int64_t value = input[i];
+ uint32_t bitsToWrite = bitSize;
+ while (bitsToWrite > bitsLeft) {
+ // add the bits to the bottom of the current word
+ current |= static_cast<char>(value >> (bitsToWrite - bitsLeft));
+ // subtract out the bits we just added
+ bitsToWrite -= bitsLeft;
+ // zero out the bits above bitsToWrite
+ value &= (static_cast<uint64_t>(1) << bitsToWrite) - 1;
+ writeByte(current);
+ current = 0;
+ bitsLeft = 8;
+ }
+ bitsLeft -= bitsToWrite;
+ current |= static_cast<char>(value << bitsLeft);
+ if (bitsLeft == 0) {
+ writeByte(current);
+ current = 0;
+ bitsLeft = 8;
+ }
}
- bitsLeft -= bitsToWrite;
- current |= static_cast<char>(value << bitsLeft);
- if (bitsLeft == 0) {
+
+ // flush
+ if (bitsLeft != 8) {
writeByte(current);
- current = 0;
- bitsLeft = 8;
}
}
- // flush
- if (bitsLeft != 8) {
- writeByte(current);
- }
-}
-
-void RleEncoderV2::initializeLiterals(int64_t val) {
+ void RleEncoderV2::initializeLiterals(int64_t val) {
literals[numLiterals++] = val;
fixedRunLength = 1;
variableRunLength = 1;
-}
-}
+ }
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc b/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc
new file mode 100644
index 0000000000..b8c4fd4048
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc
@@ -0,0 +1,255 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "SchemaEvolution.hh"
+#include "orc/Exceptions.hh"
+
+namespace orc {
+
+ SchemaEvolution::SchemaEvolution(const std::shared_ptr<Type>& _readType, const Type* fileType)
+ : readType(_readType) {
+ if (readType) {
+ buildConversion(readType.get(), fileType);
+ } else {
+ for (uint64_t i = 0; i <= fileType->getMaximumColumnId(); ++i) {
+ safePPDConversionMap.insert(i);
+ }
+ }
+ }
+
+ const Type* SchemaEvolution::getReadType(const Type& fileType) const {
+ auto ret = readTypeMap.find(fileType.getColumnId());
+ return ret == readTypeMap.cend() ? &fileType : ret->second;
+ }
+
+ inline void invalidConversion(const Type* readType, const Type* fileType) {
+ throw SchemaEvolutionError("Cannot convert from " + fileType->toString() + " to " +
+ readType->toString());
+ }
+
+ struct EnumClassHash {
+ template <typename T>
+ std::size_t operator()(T t) const {
+ return static_cast<std::size_t>(t);
+ }
+ };
+
+ bool isNumeric(const Type& type) {
+ auto kind = type.getKind();
+ return kind == BOOLEAN || kind == BYTE || kind == SHORT || kind == INT || kind == LONG ||
+ kind == FLOAT || kind == DOUBLE;
+ }
+
+ bool isStringVariant(const Type& type) {
+ auto kind = type.getKind();
+ return kind == STRING || kind == CHAR || kind == VARCHAR;
+ }
+
+ bool isDecimal(const Type& type) {
+ auto kind = type.getKind();
+ return kind == DECIMAL;
+ }
+
+ bool isTimestamp(const Type& type) {
+ auto kind = type.getKind();
+ return kind == TIMESTAMP || kind == TIMESTAMP_INSTANT;
+ }
+
+ struct ConversionCheckResult {
+ bool isValid;
+ bool needConvert;
+ };
+
+ ConversionCheckResult checkConversion(const Type& readType, const Type& fileType) {
+ ConversionCheckResult ret = {false, false};
+ if (readType.getKind() == fileType.getKind()) {
+ ret.isValid = true;
+ if (fileType.getKind() == CHAR || fileType.getKind() == VARCHAR) {
+ ret.isValid = readType.getMaximumLength() == fileType.getMaximumLength();
+ } else if (fileType.getKind() == DECIMAL) {
+ ret.needConvert = readType.getPrecision() != fileType.getPrecision() ||
+ readType.getScale() != fileType.getScale();
+ }
+ } else {
+ switch (fileType.getKind()) {
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case FLOAT:
+ case DOUBLE: {
+ ret.isValid = ret.needConvert = isNumeric(readType) || isStringVariant(readType) ||
+ isDecimal(readType) || isTimestamp(readType);
+ break;
+ }
+ case DECIMAL: {
+ ret.isValid = ret.needConvert = isNumeric(readType);
+ break;
+ }
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ case TIMESTAMP:
+ case TIMESTAMP_INSTANT:
+ case DATE:
+ case BINARY: {
+ // conversion from these file types is not supported; ret stays invalid
+ break;
+ }
+ case STRUCT:
+ case LIST:
+ case MAP:
+ case UNION: {
+ ret.isValid = ret.needConvert = false;
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ return ret;
+ }
+
+ void SchemaEvolution::buildConversion(const Type* _readType, const Type* fileType) {
+ if (fileType == nullptr) {
+ throw SchemaEvolutionError("File does not have " + _readType->toString());
+ }
+
+ auto [valid, convert] = checkConversion(*_readType, *fileType);
+ if (!valid) {
+ invalidConversion(_readType, fileType);
+ }
+ readTypeMap.emplace(_readType->getColumnId(), convert ? _readType : fileType);
+
+ // check whether PPD conversion is safe
+ buildSafePPDConversionMap(_readType, fileType);
+
+ for (uint64_t i = 0; i < _readType->getSubtypeCount(); ++i) {
+ auto subType = _readType->getSubtype(i);
+ if (subType) {
+ // a null subType means this sub column of a map/list type does not
+ // exist in the file, so only non-null subtypes are processed here.
+ buildConversion(subType, fileType->getTypeByColumnId(subType->getColumnId()));
+ }
+ }
+ }
+
+ bool SchemaEvolution::needConvert(const Type& fileType) const {
+ auto _readType = getReadType(fileType);
+ if (_readType == &fileType) {
+ return false;
+ }
+ // it does not check valid here as verified by buildConversion()
+ return checkConversion(*_readType, fileType).needConvert;
+ }
+
+ inline bool isPrimitive(const Type* type) {
+ auto kind = type->getKind();
+ return kind != STRUCT && kind != MAP && kind != LIST && kind != UNION;
+ }
+
+ // Records whether one column's evolution from fileType to _readType is safe
+ // for PPD.
+ //
+ // _readType: requested type of the column; nullptr is accepted and ignored.
+ // fileType: type of the same column in the file; nullptr is accepted and ignored.
+ //
+ // Nothing is recorded unless both types are primitive. When the evolution is
+ // judged safe, fileType->getColumnId() is inserted into safePPDConversionMap.
+ void SchemaEvolution::buildSafePPDConversionMap(const Type* _readType, const Type* fileType) {
+ if (_readType == nullptr || !isPrimitive(_readType) || fileType == nullptr ||
+ !isPrimitive(fileType)) {
+ return;
+ }
+
+ // Always consult the per-column _readType below: the readType member is the
+ // top-level read type handed to the constructor, not this column's type.
+ const auto readKind = _readType->getKind();
+ bool isSafe = false;
+ if (_readType == fileType) {
+ // short cut: both pointers refer to the same Type object
+ isSafe = true;
+ } else if (readKind == DECIMAL && fileType->getKind() == DECIMAL) {
+ // for decimals alone do equality check to not mess up with precision change
+ isSafe = fileType->getPrecision() == _readType->getPrecision() &&
+ fileType->getScale() == _readType->getScale();
+ } else {
+ // only integer and string evolutions are safe
+ // byte -> short -> int -> long
+ // string <-> varchar
+ // NOTE: Float to double evolution is not safe as floats are stored as
+ // doubles in ORC's internal index, but when doing predicate evaluation
+ // for queries like "select * from orc_float where f = 74.72" the constant
+ // on the filter is converted from string -> double so the precisions will
+ // be different and the comparison will fail.
+ // Soon, we should convert all sargs that compare equality between floats
+ // or doubles to range predicates.
+ // Similarly string -> char and varchar -> char and vice versa is impossible
+ // as ORC stores char with padded spaces in its internal index.
+ switch (fileType->getKind()) {
+ case BYTE: {
+ isSafe = readKind == SHORT || readKind == INT || readKind == LONG;
+ break;
+ }
+ case SHORT: {
+ isSafe = readKind == INT || readKind == LONG;
+ break;
+ }
+ case INT: {
+ isSafe = readKind == LONG;
+ break;
+ }
+ case STRING: {
+ isSafe = readKind == VARCHAR;
+ break;
+ }
+ case VARCHAR: {
+ isSafe = readKind == STRING;
+ break;
+ }
+ case BOOLEAN:
+ case LONG:
+ case FLOAT:
+ case DOUBLE:
+ case BINARY:
+ case TIMESTAMP:
+ case LIST:
+ case MAP:
+ case STRUCT:
+ case UNION:
+ case DECIMAL:
+ case DATE:
+ case CHAR:
+ case TIMESTAMP_INSTANT:
+ // no evolution from these file types is treated as safe
+ break;
+ }
+ }
+
+ // safePPDConversionMap is keyed by the file column id
+ if (isSafe) {
+ safePPDConversionMap.insert(fileType->getColumnId());
+ }
+ }
+
+ bool SchemaEvolution::isSafePPDConversion(uint64_t columnId) const {
+ return safePPDConversionMap.find(columnId) != safePPDConversionMap.cend();
+ }
+
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/SchemaEvolution.hh b/contrib/libs/apache/orc/c++/src/SchemaEvolution.hh
new file mode 100644
index 0000000000..ef9020eba4
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/SchemaEvolution.hh
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_SCHEMA_EVOLUTION_HH
+#define ORC_SCHEMA_EVOLUTION_HH
+
+#include "orc/Type.hh"
+
+#include <unordered_map>
+#include <unordered_set>
+
+namespace orc {
+
+ /**
+ * Utility class to compare read type and file type to match their columns
+ * and check type conversion.
+ */
+ class SchemaEvolution {
+ public:
+ SchemaEvolution(const std::shared_ptr<Type>& readType, const Type* fileType);
+
+ // Get the read type mapped to the file type's column id, or return the file
+ // type itself when no read type is recorded for that column.
+ const Type* getReadType(const Type& fileType) const;
+
+ // check if we need to convert file type to read type for primitive type.
+ bool needConvert(const Type& fileType) const;
+
+ // check if the PPD conversion is safe
+ bool isSafePPDConversion(uint64_t columnId) const;
+
+ // return the top-level read type given to the constructor (may be null)
+ const Type* getReadType() const {
+ return readType.get();
+ }
+
+ private:
+ void buildConversion(const Type* readType, const Type* fileType);
+ void buildSafePPDConversionMap(const Type* readType, const Type* fileType);
+
+ private:
+ const std::shared_ptr<Type> readType;
+ std::unordered_map<uint64_t, const Type*> readTypeMap;
+ std::unordered_set<uint64_t> safePPDConversionMap;
+ };
+
+} // namespace orc
+
+#endif // ORC_SCHEMA_EVOLUTION_HH
diff --git a/contrib/libs/apache/orc/c++/src/Statistics.cc b/contrib/libs/apache/orc/c++/src/Statistics.cc
index ccc54c291c..8ed29d0e7c 100644
--- a/contrib/libs/apache/orc/c++/src/Statistics.cc
+++ b/contrib/libs/apache/orc/c++/src/Statistics.cc
@@ -1,4 +1,4 @@
- /**
+/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -16,9 +16,9 @@
* limitations under the License.
*/
-#include "orc/Exceptions.hh"
-#include "RLE.hh"
#include "Statistics.hh"
+#include "RLE.hh"
+#include "orc/Exceptions.hh"
#include "wrap/coded-stream-wrapper.h"
@@ -26,23 +26,23 @@ namespace orc {
ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
const StatContext& statContext) {
- if (s.has_intstatistics()) {
+ if (s.has_int_statistics()) {
return new IntegerColumnStatisticsImpl(s);
- } else if (s.has_doublestatistics()) {
+ } else if (s.has_double_statistics()) {
return new DoubleColumnStatisticsImpl(s);
- } else if (s.has_collectionstatistics()) {
+ } else if (s.has_collection_statistics()) {
return new CollectionColumnStatisticsImpl(s);
- } else if (s.has_stringstatistics()) {
+ } else if (s.has_string_statistics()) {
return new StringColumnStatisticsImpl(s, statContext);
- } else if (s.has_bucketstatistics()) {
+ } else if (s.has_bucket_statistics()) {
return new BooleanColumnStatisticsImpl(s, statContext);
- } else if (s.has_decimalstatistics()) {
+ } else if (s.has_decimal_statistics()) {
return new DecimalColumnStatisticsImpl(s, statContext);
- } else if (s.has_timestampstatistics()) {
+ } else if (s.has_timestamp_statistics()) {
return new TimestampColumnStatisticsImpl(s, statContext);
- } else if (s.has_datestatistics()) {
+ } else if (s.has_date_statistics()) {
return new DateColumnStatisticsImpl(s, statContext);
- } else if (s.has_binarystatistics()) {
+ } else if (s.has_binary_statistics()) {
return new BinaryColumnStatisticsImpl(s, statContext);
} else {
return new ColumnStatisticsImpl(s);
@@ -51,24 +51,20 @@ namespace orc {
StatisticsImpl::StatisticsImpl(const proto::StripeStatistics& stripeStats,
const StatContext& statContext) {
- for(int i = 0; i < stripeStats.colstats_size(); i++) {
- colStats.push_back(
- convertColumnStatistics(stripeStats.colstats(i), statContext));
+ for (int i = 0; i < stripeStats.col_stats_size(); i++) {
+ colStats.push_back(convertColumnStatistics(stripeStats.col_stats(i), statContext));
}
}
- StatisticsImpl::StatisticsImpl(const proto::Footer& footer,
- const StatContext& statContext) {
- for(int i = 0; i < footer.statistics_size(); i++) {
- colStats.push_back(
- convertColumnStatistics(footer.statistics(i), statContext));
+ StatisticsImpl::StatisticsImpl(const proto::Footer& footer, const StatContext& statContext) {
+ for (int i = 0; i < footer.statistics_size(); i++) {
+ colStats.push_back(convertColumnStatistics(footer.statistics(i), statContext));
}
}
StatisticsImpl::~StatisticsImpl() {
- for(std::vector<ColumnStatistics*>::iterator ptr = colStats.begin();
- ptr != colStats.end();
- ++ptr) {
+ for (std::vector<ColumnStatistics*>::iterator ptr = colStats.begin(); ptr != colStats.end();
+ ++ptr) {
delete *ptr;
}
}
@@ -86,21 +82,19 @@ namespace orc {
}
StripeStatisticsImpl::StripeStatisticsImpl(
- const proto::StripeStatistics& stripeStats,
- std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
- const StatContext& statContext) {
- columnStats.reset(new StatisticsImpl(stripeStats, statContext));
+ const proto::StripeStatistics& stripeStats,
+ std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
+ const StatContext& statContext) {
+ columnStats = std::make_unique<StatisticsImpl>(stripeStats, statContext);
rowIndexStats.resize(indexStats.size());
- for(size_t i = 0; i < rowIndexStats.size(); i++) {
- for(size_t j = 0; j < indexStats[i].size(); j++) {
- rowIndexStats[i].push_back(
- std::shared_ptr<const ColumnStatistics>(
- convertColumnStatistics(indexStats[i][j], statContext)));
+ for (size_t i = 0; i < rowIndexStats.size(); i++) {
+ for (size_t j = 0; j < indexStats[i].size(); j++) {
+ rowIndexStats[i].push_back(std::shared_ptr<const ColumnStatistics>(
+ convertColumnStatistics(indexStats[i][j], statContext)));
}
}
}
-
ColumnStatistics::~ColumnStatistics() {
// PASS
}
@@ -185,59 +179,57 @@ namespace orc {
// PASS
}
- ColumnStatisticsImpl::ColumnStatisticsImpl
- (const proto::ColumnStatistics& pb) {
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
+ ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& pb) {
+ _stats.setNumberOfValues(pb.number_of_values());
+ _stats.setHasNull(pb.has_null());
}
- BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl
- (const proto::ColumnStatistics& pb, const StatContext& statContext){
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (pb.has_binarystatistics() && statContext.correctStats) {
- _stats.setHasTotalLength(pb.binarystatistics().has_sum());
- _stats.setTotalLength(
- static_cast<uint64_t>(pb.binarystatistics().sum()));
+ BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl(const proto::ColumnStatistics& pb,
+ const StatContext& statContext) {
+ _stats.setNumberOfValues(pb.number_of_values());
+ _stats.setHasNull(pb.has_null());
+ if (pb.has_binary_statistics() && statContext.correctStats) {
+ _stats.setHasTotalLength(pb.binary_statistics().has_sum());
+ _stats.setTotalLength(static_cast<uint64_t>(pb.binary_statistics().sum()));
}
}
- BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl
- (const proto::ColumnStatistics& pb, const StatContext& statContext){
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (pb.has_bucketstatistics() && statContext.correctStats) {
+ BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl(const proto::ColumnStatistics& pb,
+ const StatContext& statContext) {
+ _stats.setNumberOfValues(pb.number_of_values());
+ _stats.setHasNull(pb.has_null());
+ if (pb.has_bucket_statistics() && statContext.correctStats) {
_hasCount = true;
- _trueCount = pb.bucketstatistics().count(0);
+ _trueCount = pb.bucket_statistics().count(0);
} else {
_hasCount = false;
_trueCount = 0;
}
}
- DateColumnStatisticsImpl::DateColumnStatisticsImpl
- (const proto::ColumnStatistics& pb, const StatContext& statContext){
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (!pb.has_datestatistics() || !statContext.correctStats) {
+ DateColumnStatisticsImpl::DateColumnStatisticsImpl(const proto::ColumnStatistics& pb,
+ const StatContext& statContext) {
+ _stats.setNumberOfValues(pb.number_of_values());
+ _stats.setHasNull(pb.has_null());
+ if (!pb.has_date_statistics() || !statContext.correctStats) {
// hasMinimum_ is false by default;
// hasMaximum_ is false by default;
_stats.setMinimum(0);
_stats.setMaximum(0);
} else {
- _stats.setHasMinimum(pb.datestatistics().has_minimum());
- _stats.setHasMaximum(pb.datestatistics().has_maximum());
- _stats.setMinimum(pb.datestatistics().minimum());
- _stats.setMaximum(pb.datestatistics().maximum());
+ _stats.setHasMinimum(pb.date_statistics().has_minimum());
+ _stats.setHasMaximum(pb.date_statistics().has_maximum());
+ _stats.setMinimum(pb.date_statistics().minimum());
+ _stats.setMaximum(pb.date_statistics().maximum());
}
}
- DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl
- (const proto::ColumnStatistics& pb, const StatContext& statContext){
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (pb.has_decimalstatistics() && statContext.correctStats) {
- const proto::DecimalStatistics& stats = pb.decimalstatistics();
+ DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl(const proto::ColumnStatistics& pb,
+ const StatContext& statContext) {
+ _stats.setNumberOfValues(pb.number_of_values());
+ _stats.setHasNull(pb.has_null());
+ if (pb.has_decimal_statistics() && statContext.correctStats) {
+ const proto::DecimalStatistics& stats = pb.decimal_statistics();
_stats.setHasMinimum(stats.has_minimum());
_stats.setHasMaximum(stats.has_maximum());
_stats.setHasSum(stats.has_sum());
@@ -248,16 +240,15 @@ namespace orc {
}
}
- DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl
- (const proto::ColumnStatistics& pb){
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (!pb.has_doublestatistics()) {
+ DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl(const proto::ColumnStatistics& pb) {
+ _stats.setNumberOfValues(pb.number_of_values());
+ _stats.setHasNull(pb.has_null());
+ if (!pb.has_double_statistics()) {
_stats.setMinimum(0);
_stats.setMaximum(0);
_stats.setSum(0);
- }else{
- const proto::DoubleStatistics& stats = pb.doublestatistics();
+ } else {
+ const proto::DoubleStatistics& stats = pb.double_statistics();
_stats.setHasMinimum(stats.has_minimum());
_stats.setHasMaximum(stats.has_maximum());
_stats.setHasSum(stats.has_sum());
@@ -268,16 +259,15 @@ namespace orc {
}
}
- IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl
- (const proto::ColumnStatistics& pb){
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (!pb.has_intstatistics()) {
+ IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl(const proto::ColumnStatistics& pb) {
+ _stats.setNumberOfValues(pb.number_of_values());
+ _stats.setHasNull(pb.has_null());
+ if (!pb.has_int_statistics()) {
_stats.setMinimum(0);
_stats.setMaximum(0);
_stats.setSum(0);
- }else{
- const proto::IntegerStatistics& stats = pb.intstatistics();
+ } else {
+ const proto::IntegerStatistics& stats = pb.int_statistics();
_stats.setHasMinimum(stats.has_minimum());
_stats.setHasMaximum(stats.has_maximum());
_stats.setHasSum(stats.has_sum());
@@ -288,14 +278,14 @@ namespace orc {
}
}
- StringColumnStatisticsImpl::StringColumnStatisticsImpl
- (const proto::ColumnStatistics& pb, const StatContext& statContext){
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (!pb.has_stringstatistics() || !statContext.correctStats) {
+ StringColumnStatisticsImpl::StringColumnStatisticsImpl(const proto::ColumnStatistics& pb,
+ const StatContext& statContext) {
+ _stats.setNumberOfValues(pb.number_of_values());
+ _stats.setHasNull(pb.has_null());
+ if (!pb.has_string_statistics() || !statContext.correctStats) {
_stats.setTotalLength(0);
- }else{
- const proto::StringStatistics& stats = pb.stringstatistics();
+ } else {
+ const proto::StringStatistics& stats = pb.string_statistics();
_stats.setHasMinimum(stats.has_minimum());
_stats.setHasMaximum(stats.has_maximum());
_stats.setHasTotalLength(stats.has_sum());
@@ -306,46 +296,40 @@ namespace orc {
}
}
- TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl
- (const proto::ColumnStatistics& pb, const StatContext& statContext) {
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (!pb.has_timestampstatistics() || !statContext.correctStats) {
+ TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl(const proto::ColumnStatistics& pb,
+ const StatContext& statContext) {
+ _stats.setNumberOfValues(pb.number_of_values());
+ _stats.setHasNull(pb.has_null());
+ if (!pb.has_timestamp_statistics() || !statContext.correctStats) {
_stats.setMinimum(0);
_stats.setMaximum(0);
_lowerBound = 0;
_upperBound = 0;
_minimumNanos = DEFAULT_MIN_NANOS;
_maximumNanos = DEFAULT_MAX_NANOS;
- }else{
- const proto::TimestampStatistics& stats = pb.timestampstatistics();
- _stats.setHasMinimum(
- stats.has_minimumutc() ||
- (stats.has_minimum() && (statContext.writerTimezone != nullptr)));
- _stats.setHasMaximum(
- stats.has_maximumutc() ||
- (stats.has_maximum() && (statContext.writerTimezone != nullptr)));
- _hasLowerBound = stats.has_minimumutc() || stats.has_minimum();
- _hasUpperBound = stats.has_maximumutc() || stats.has_maximum();
- // to be consistent with java side, non-default minimumnanos and maximumnanos
+ } else {
+ const proto::TimestampStatistics& stats = pb.timestamp_statistics();
+ _stats.setHasMinimum(stats.has_minimum_utc() ||
+ (stats.has_minimum() && (statContext.writerTimezone != nullptr)));
+ _stats.setHasMaximum(stats.has_maximum_utc() ||
+ (stats.has_maximum() && (statContext.writerTimezone != nullptr)));
+ _hasLowerBound = stats.has_minimum_utc() || stats.has_minimum();
+ _hasUpperBound = stats.has_maximum_utc() || stats.has_maximum();
+ // to be consistent with java side, non-default minimum_nanos and maximum_nanos
// are added by one in their serialized form.
- _minimumNanos = stats.has_minimumnanos() ?
- stats.minimumnanos() - 1 : DEFAULT_MIN_NANOS;
- _maximumNanos = stats.has_maximumnanos() ?
- stats.maximumnanos() - 1 : DEFAULT_MAX_NANOS;
+ _minimumNanos = stats.has_minimum_nanos() ? stats.minimum_nanos() - 1 : DEFAULT_MIN_NANOS;
+ _maximumNanos = stats.has_maximum_nanos() ? stats.maximum_nanos() - 1 : DEFAULT_MAX_NANOS;
// Timestamp stats are stored in milliseconds
- if (stats.has_minimumutc()) {
- int64_t minimum = stats.minimumutc();
+ if (stats.has_minimum_utc()) {
+ int64_t minimum = stats.minimum_utc();
_stats.setMinimum(minimum);
_lowerBound = minimum;
} else if (statContext.writerTimezone) {
int64_t writerTimeSec = stats.minimum() / 1000;
// multiply the offset by 1000 to convert to millisecond
- int64_t minimum =
- stats.minimum() +
- (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset)
- * 1000;
+ int64_t minimum = stats.minimum() +
+ (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000;
_stats.setMinimum(minimum);
_lowerBound = minimum;
} else {
@@ -356,94 +340,82 @@ namespace orc {
}
// Timestamp stats are stored in milliseconds
- if (stats.has_maximumutc()) {
- int64_t maximum = stats.maximumutc();
+ if (stats.has_maximum_utc()) {
+ int64_t maximum = stats.maximum_utc();
_stats.setMaximum(maximum);
_upperBound = maximum;
} else if (statContext.writerTimezone) {
int64_t writerTimeSec = stats.maximum() / 1000;
// multiply the offset by 1000 to convert to millisecond
int64_t maximum = stats.maximum() +
- (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset)
- * 1000;
+ (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000;
_stats.setMaximum(maximum);
_upperBound = maximum;
} else {
_stats.setMaximum(0);
// add 1 day 1 hour (25 hours) in milliseconds to handle unknown
// TZ and daylight savings
- _upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000);
+ _upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000);
}
// Add 1 millisecond to account for microsecond precision of values
_upperBound += 1;
}
}
- CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl
- (const proto::ColumnStatistics& pb) {
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (!pb.has_collectionstatistics()) {
+ CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl(
+ const proto::ColumnStatistics& pb) {
+ _stats.setNumberOfValues(pb.number_of_values());
+ _stats.setHasNull(pb.has_null());
+ if (!pb.has_collection_statistics()) {
_stats.setMinimum(0);
_stats.setMaximum(0);
_stats.setSum(0);
} else {
- const proto::CollectionStatistics& stats = pb.collectionstatistics();
- _stats.setHasMinimum(stats.has_minchildren());
- _stats.setHasMaximum(stats.has_maxchildren());
- _stats.setHasSum(stats.has_totalchildren());
-
- _stats.setMinimum(stats.minchildren());
- _stats.setMaximum(stats.maxchildren());
- _stats.setSum(stats.totalchildren());
+ const proto::CollectionStatistics& stats = pb.collection_statistics();
+ _stats.setHasMinimum(stats.has_min_children());
+ _stats.setHasMaximum(stats.has_max_children());
+ _stats.setHasSum(stats.has_total_children());
+
+ _stats.setMinimum(stats.min_children());
+ _stats.setMaximum(stats.max_children());
+ _stats.setSum(stats.total_children());
}
}
- std::unique_ptr<MutableColumnStatistics> createColumnStatistics(
- const Type& type) {
+ std::unique_ptr<MutableColumnStatistics> createColumnStatistics(const Type& type) {
switch (static_cast<int64_t>(type.getKind())) {
case BOOLEAN:
- return std::unique_ptr<MutableColumnStatistics>(
- new BooleanColumnStatisticsImpl());
+ return std::make_unique<BooleanColumnStatisticsImpl>();
case BYTE:
case INT:
case LONG:
case SHORT:
- return std::unique_ptr<MutableColumnStatistics>(
- new IntegerColumnStatisticsImpl());
+ return std::make_unique<IntegerColumnStatisticsImpl>();
case MAP:
case LIST:
- return std::unique_ptr<MutableColumnStatistics>(
- new CollectionColumnStatisticsImpl());
+ return std::make_unique<CollectionColumnStatisticsImpl>();
case STRUCT:
case UNION:
- return std::unique_ptr<MutableColumnStatistics>(
- new ColumnStatisticsImpl());
+ return std::make_unique<ColumnStatisticsImpl>();
case FLOAT:
case DOUBLE:
- return std::unique_ptr<MutableColumnStatistics>(
- new DoubleColumnStatisticsImpl());
+ return std::make_unique<DoubleColumnStatisticsImpl>();
case BINARY:
- return std::unique_ptr<MutableColumnStatistics>(
- new BinaryColumnStatisticsImpl());
+ return std::make_unique<BinaryColumnStatisticsImpl>();
case STRING:
case CHAR:
case VARCHAR:
- return std::unique_ptr<MutableColumnStatistics>(
- new StringColumnStatisticsImpl());
+ return std::make_unique<StringColumnStatisticsImpl>();
case DATE:
- return std::unique_ptr<MutableColumnStatistics>(
- new DateColumnStatisticsImpl());
+ return std::make_unique<DateColumnStatisticsImpl>();
case TIMESTAMP:
case TIMESTAMP_INSTANT:
- return std::unique_ptr<MutableColumnStatistics>(
- new TimestampColumnStatisticsImpl());
+ return std::make_unique<TimestampColumnStatisticsImpl>();
case DECIMAL:
- return std::unique_ptr<MutableColumnStatistics>(
- new DecimalColumnStatisticsImpl());
+ return std::make_unique<DecimalColumnStatisticsImpl>();
default:
throw NotImplementedYet("Not supported type: " + type.toString());
}
}
-}// namespace
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/Statistics.hh b/contrib/libs/apache/orc/c++/src/Statistics.hh
index 8cb2283f13..b36e431a7f 100644
--- a/contrib/libs/apache/orc/c++/src/Statistics.hh
+++ b/contrib/libs/apache/orc/c++/src/Statistics.hh
@@ -29,25 +29,25 @@
namespace orc {
-/**
- * StatContext contains fields required to compute statistics
- */
+ /**
+ * StatContext contains fields required to compute statistics
+ */
struct StatContext {
const bool correctStats;
const Timezone* const writerTimezone;
StatContext() : correctStats(false), writerTimezone(nullptr) {}
- StatContext(bool cStat, const Timezone* const timezone = nullptr) :
- correctStats(cStat), writerTimezone(timezone) {}
+ StatContext(bool cStat, const Timezone* const timezone = nullptr)
+ : correctStats(cStat), writerTimezone(timezone) {}
};
-/**
- * Internal Statistics Implementation
- */
+ /**
+ * Internal Statistics Implementation
+ */
template <typename T>
class InternalStatisticsImpl {
- private:
+ private:
bool _hasNull;
bool _hasMinimum;
bool _hasMaximum;
@@ -58,7 +58,8 @@ namespace orc {
T _minimum;
T _maximum;
T _sum;
- public:
+
+ public:
InternalStatisticsImpl() {
_hasNull = false;
_hasMinimum = false;
@@ -72,52 +73,90 @@ namespace orc {
~InternalStatisticsImpl() {}
// GET / SET _totalLength
- bool hasTotalLength() const { return _hasTotalLength; }
+ bool hasTotalLength() const {
+ return _hasTotalLength;
+ }
void setHasTotalLength(bool hasTotalLength) {
_hasTotalLength = hasTotalLength;
}
- uint64_t getTotalLength() const { return _totalLength; }
+ uint64_t getTotalLength() const {
+ return _totalLength;
+ }
- void setTotalLength(uint64_t totalLength) { _totalLength = totalLength; }
+ void setTotalLength(uint64_t totalLength) {
+ _totalLength = totalLength;
+ }
// GET / SET _sum
- bool hasSum() const { return _hasSum; }
+ bool hasSum() const {
+ return _hasSum;
+ }
- void setHasSum(bool hasSum) { _hasSum = hasSum; }
+ void setHasSum(bool hasSum) {
+ _hasSum = hasSum;
+ }
- T getSum() const { return _sum; }
+ T getSum() const {
+ return _sum;
+ }
- void setSum(T sum) { _sum = sum; }
+ void setSum(T sum) {
+ _sum = sum;
+ }
// GET / SET _maximum
- bool hasMaximum() const { return _hasMaximum; }
+ bool hasMaximum() const {
+ return _hasMaximum;
+ }
- const T & getMaximum() const { return _maximum; }
+ const T& getMaximum() const {
+ return _maximum;
+ }
- void setHasMaximum(bool hasMax) { _hasMaximum = hasMax; }
+ void setHasMaximum(bool hasMax) {
+ _hasMaximum = hasMax;
+ }
- void setMaximum(T max) { _maximum = max; }
+ void setMaximum(T max) {
+ _maximum = max;
+ }
// GET / SET _minimum
- bool hasMinimum() const { return _hasMinimum; }
+ bool hasMinimum() const {
+ return _hasMinimum;
+ }
- void setHasMinimum(bool hasMin) { _hasMinimum = hasMin; }
+ void setHasMinimum(bool hasMin) {
+ _hasMinimum = hasMin;
+ }
- const T & getMinimum() const { return _minimum; }
+ const T& getMinimum() const {
+ return _minimum;
+ }
- void setMinimum(T min) { _minimum = min; }
+ void setMinimum(T min) {
+ _minimum = min;
+ }
// GET / SET _valueCount
- uint64_t getNumberOfValues() const { return _valueCount; }
+ uint64_t getNumberOfValues() const {
+ return _valueCount;
+ }
- void setNumberOfValues(uint64_t numValues) { _valueCount = numValues; }
+ void setNumberOfValues(uint64_t numValues) {
+ _valueCount = numValues;
+ }
// GET / SET _hasNullValue
- bool hasNull() const { return _hasNull; }
+ bool hasNull() const {
+ return _hasNull;
+ }
- void setHasNull(bool hasNull) { _hasNull = hasNull; }
+ void setHasNull(bool hasNull) {
+ _hasNull = hasNull;
+ }
void reset() {
_hasNull = false;
@@ -164,7 +203,7 @@ namespace orc {
_hasTotalLength = _hasTotalLength && other._hasTotalLength;
_totalLength += other._totalLength;
}
- };
+ };
typedef InternalStatisticsImpl<char> InternalCharStatistics;
typedef InternalStatisticsImpl<char> InternalBooleanStatistics;
@@ -179,7 +218,7 @@ namespace orc {
* Mutable column statistics for use by the writer.
*/
class MutableColumnStatistics {
- public:
+ public:
virtual ~MutableColumnStatistics();
virtual void increase(uint64_t count) = 0;
@@ -195,16 +234,18 @@ namespace orc {
virtual void toProtoBuf(proto::ColumnStatistics& pbStats) const = 0;
};
-/**
- * ColumnStatistics Implementation
- */
+ /**
+ * ColumnStatistics Implementation
+ */
- class ColumnStatisticsImpl: public ColumnStatistics,
- public MutableColumnStatistics {
- private:
+ class ColumnStatisticsImpl : public ColumnStatistics, public MutableColumnStatistics {
+ private:
InternalCharStatistics _stats;
- public:
- ColumnStatisticsImpl() { reset(); }
+
+ public:
+ ColumnStatisticsImpl() {
+ reset();
+ }
ColumnStatisticsImpl(const proto::ColumnStatistics& stats);
virtual ~ColumnStatisticsImpl() override;
@@ -237,25 +278,26 @@ namespace orc {
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
+ pbStats.set_has_null(_stats.hasNull());
+ pbStats.set_number_of_values(_stats.getNumberOfValues());
}
std::string toString() const override {
std::ostringstream buffer;
buffer << "Column has " << getNumberOfValues() << " values"
- << " and has null value: " << (hasNull() ? "yes" : "no")
- << std::endl;
+ << " and has null value: " << (hasNull() ? "yes" : "no") << std::endl;
return buffer.str();
}
};
- class BinaryColumnStatisticsImpl: public BinaryColumnStatistics,
- public MutableColumnStatistics {
- private:
+ class BinaryColumnStatisticsImpl : public BinaryColumnStatistics, public MutableColumnStatistics {
+ private:
InternalCharStatistics _stats;
- public:
- BinaryColumnStatisticsImpl() { reset(); }
+
+ public:
+ BinaryColumnStatisticsImpl() {
+ reset();
+ }
BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats,
const StatContext& statContext);
virtual ~BinaryColumnStatisticsImpl() override;
@@ -285,9 +327,9 @@ namespace orc {
}
uint64_t getTotalLength() const override {
- if(hasTotalLength()){
+ if (hasTotalLength()) {
return _stats.getTotalLength();
- }else{
+ } else {
throw ParseError("Total length is not defined.");
}
}
@@ -303,7 +345,7 @@ namespace orc {
void merge(const MutableColumnStatistics& other) override {
const BinaryColumnStatisticsImpl& binStats =
- dynamic_cast<const BinaryColumnStatisticsImpl&>(other);
+ dynamic_cast<const BinaryColumnStatisticsImpl&>(other);
_stats.merge(binStats._stats);
}
@@ -313,10 +355,10 @@ namespace orc {
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
+ pbStats.set_has_null(_stats.hasNull());
+ pbStats.set_number_of_values(_stats.getNumberOfValues());
- proto::BinaryStatistics* binStats = pbStats.mutable_binarystatistics();
+ proto::BinaryStatistics* binStats = pbStats.mutable_binary_statistics();
binStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
}
@@ -325,24 +367,26 @@ namespace orc {
buffer << "Data type: Binary" << std::endl
<< "Values: " << getNumberOfValues() << std::endl
<< "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasTotalLength()){
+ if (hasTotalLength()) {
buffer << "Total length: " << getTotalLength() << std::endl;
- }else{
+ } else {
buffer << "Total length: not defined" << std::endl;
}
return buffer.str();
}
};
- class BooleanColumnStatisticsImpl: public BooleanColumnStatistics,
- public MutableColumnStatistics {
- private:
+ class BooleanColumnStatisticsImpl : public BooleanColumnStatistics,
+ public MutableColumnStatistics {
+ private:
InternalBooleanStatistics _stats;
bool _hasCount;
uint64_t _trueCount;
- public:
- BooleanColumnStatisticsImpl() { reset(); }
+ public:
+ BooleanColumnStatisticsImpl() {
+ reset();
+ }
BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats,
const StatContext& statContext);
virtual ~BooleanColumnStatisticsImpl() override;
@@ -373,17 +417,17 @@ namespace orc {
}
uint64_t getFalseCount() const override {
- if(hasCount()){
+ if (hasCount()) {
return getNumberOfValues() - _trueCount;
- }else{
+ } else {
throw ParseError("False count is not defined.");
}
}
uint64_t getTrueCount() const override {
- if(hasCount()){
+ if (hasCount()) {
return _trueCount;
- }else{
+ } else {
throw ParseError("True count is not defined.");
}
}
@@ -401,7 +445,7 @@ namespace orc {
void merge(const MutableColumnStatistics& other) override {
const BooleanColumnStatisticsImpl& boolStats =
- dynamic_cast<const BooleanColumnStatisticsImpl&>(other);
+ dynamic_cast<const BooleanColumnStatisticsImpl&>(other);
_stats.merge(boolStats._stats);
_hasCount = _hasCount && boolStats._hasCount;
_trueCount += boolStats._trueCount;
@@ -413,10 +457,10 @@ namespace orc {
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
+ pbStats.set_has_null(_stats.hasNull());
+ pbStats.set_number_of_values(_stats.getNumberOfValues());
- proto::BucketStatistics* bucketStats = pbStats.mutable_bucketstatistics();
+ proto::BucketStatistics* bucketStats = pbStats.mutable_bucket_statistics();
if (_hasCount) {
bucketStats->add_count(_trueCount);
} else {
@@ -429,9 +473,8 @@ namespace orc {
buffer << "Data type: Boolean" << std::endl
<< "Values: " << getNumberOfValues() << std::endl
<< "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasCount()){
- buffer << "(true: " << getTrueCount() << "; false: "
- << getFalseCount() << ")" << std::endl;
+ if (hasCount()) {
+ buffer << "(true: " << getTrueCount() << "; false: " << getFalseCount() << ")" << std::endl;
} else {
buffer << "(true: not defined; false: not defined)" << std::endl;
buffer << "True and false counts are not defined" << std::endl;
@@ -440,14 +483,15 @@ namespace orc {
}
};
- class DateColumnStatisticsImpl: public DateColumnStatistics,
- public MutableColumnStatistics{
- private:
+ class DateColumnStatisticsImpl : public DateColumnStatistics, public MutableColumnStatistics {
+ private:
InternalDateStatistics _stats;
- public:
- DateColumnStatisticsImpl() { reset(); }
- DateColumnStatisticsImpl(const proto::ColumnStatistics& stats,
- const StatContext& statContext);
+
+ public:
+ DateColumnStatisticsImpl() {
+ reset();
+ }
+ DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext);
virtual ~DateColumnStatisticsImpl() override;
bool hasMinimum() const override {
@@ -479,17 +523,17 @@ namespace orc {
}
int32_t getMinimum() const override {
- if(hasMinimum()){
+ if (hasMinimum()) {
return _stats.getMinimum();
- }else{
+ } else {
throw ParseError("Minimum is not defined.");
}
}
int32_t getMaximum() const override {
- if(hasMaximum()){
+ if (hasMaximum()) {
return _stats.getMaximum();
- }else{
+ } else {
throw ParseError("Maximum is not defined.");
}
}
@@ -510,7 +554,7 @@ namespace orc {
void merge(const MutableColumnStatistics& other) override {
const DateColumnStatisticsImpl& dateStats =
- dynamic_cast<const DateColumnStatisticsImpl&>(other);
+ dynamic_cast<const DateColumnStatisticsImpl&>(other);
_stats.merge(dateStats._stats);
}
@@ -519,11 +563,10 @@ namespace orc {
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
+ pbStats.set_has_null(_stats.hasNull());
+ pbStats.set_number_of_values(_stats.getNumberOfValues());
- proto::DateStatistics* dateStatistics =
- pbStats.mutable_datestatistics();
+ proto::DateStatistics* dateStatistics = pbStats.mutable_date_statistics();
if (_stats.hasMinimum()) {
dateStatistics->set_maximum(_stats.getMaximum());
dateStatistics->set_minimum(_stats.getMinimum());
@@ -538,28 +581,30 @@ namespace orc {
buffer << "Data type: Date" << std::endl
<< "Values: " << getNumberOfValues() << std::endl
<< "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasMinimum()){
+ if (hasMinimum()) {
buffer << "Minimum: " << getMinimum() << std::endl;
- }else{
+ } else {
buffer << "Minimum: not defined" << std::endl;
}
- if(hasMaximum()){
+ if (hasMaximum()) {
buffer << "Maximum: " << getMaximum() << std::endl;
- }else{
+ } else {
buffer << "Maximum: not defined" << std::endl;
}
return buffer.str();
}
};
- class DecimalColumnStatisticsImpl: public DecimalColumnStatistics,
- public MutableColumnStatistics {
- private:
+ class DecimalColumnStatisticsImpl : public DecimalColumnStatistics,
+ public MutableColumnStatistics {
+ private:
InternalDecimalStatistics _stats;
- public:
- DecimalColumnStatisticsImpl() { reset(); }
+ public:
+ DecimalColumnStatisticsImpl() {
+ reset();
+ }
DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats,
const StatContext& statContext);
virtual ~DecimalColumnStatisticsImpl() override;
@@ -597,17 +642,17 @@ namespace orc {
}
Decimal getMinimum() const override {
- if(hasMinimum()){
+ if (hasMinimum()) {
return _stats.getMinimum();
- }else{
+ } else {
throw ParseError("Minimum is not defined.");
}
}
Decimal getMaximum() const override {
- if(hasMaximum()){
+ if (hasMaximum()) {
return _stats.getMaximum();
- }else{
+ } else {
throw ParseError("Maximum is not defined.");
}
}
@@ -623,9 +668,9 @@ namespace orc {
}
Decimal getSum() const override {
- if(hasSum()){
+ if (hasSum()) {
return _stats.getSum();
- }else{
+ } else {
throw ParseError("Sum is not defined.");
}
}
@@ -645,7 +690,7 @@ namespace orc {
void merge(const MutableColumnStatistics& other) override {
const DecimalColumnStatisticsImpl& decStats =
- dynamic_cast<const DecimalColumnStatisticsImpl&>(other);
+ dynamic_cast<const DecimalColumnStatisticsImpl&>(other);
_stats.merge(decStats._stats);
@@ -661,10 +706,10 @@ namespace orc {
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
+ pbStats.set_has_null(_stats.hasNull());
+ pbStats.set_number_of_values(_stats.getNumberOfValues());
- proto::DecimalStatistics* decStats = pbStats.mutable_decimalstatistics();
+ proto::DecimalStatistics* decStats = pbStats.mutable_decimal_statistics();
if (_stats.hasMinimum()) {
decStats->set_minimum(TString(_stats.getMinimum().toString(true)));
decStats->set_maximum(TString(_stats.getMaximum().toString(true)));
@@ -684,40 +729,36 @@ namespace orc {
buffer << "Data type: Decimal" << std::endl
<< "Values: " << getNumberOfValues() << std::endl
<< "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasMinimum()){
+ if (hasMinimum()) {
buffer << "Minimum: " << getMinimum().toString() << std::endl;
- }else{
+ } else {
buffer << "Minimum: not defined" << std::endl;
}
- if(hasMaximum()){
+ if (hasMaximum()) {
buffer << "Maximum: " << getMaximum().toString() << std::endl;
- }else{
+ } else {
buffer << "Maximum: not defined" << std::endl;
}
- if(hasSum()){
+ if (hasSum()) {
buffer << "Sum: " << getSum().toString() << std::endl;
- }else{
+ } else {
buffer << "Sum: not defined" << std::endl;
}
return buffer.str();
}
- private:
+ private:
void updateSum(Decimal value) {
if (_stats.hasSum()) {
bool overflow = false;
Decimal sum = _stats.getSum();
if (sum.scale > value.scale) {
- value.value = scaleUpInt128ByPowerOfTen(value.value,
- sum.scale - value.scale,
- overflow);
+ value.value = scaleUpInt128ByPowerOfTen(value.value, sum.scale - value.scale, overflow);
} else if (sum.scale < value.scale) {
- sum.value = scaleUpInt128ByPowerOfTen(sum.value,
- value.scale - sum.scale,
- overflow);
+ sum.value = scaleUpInt128ByPowerOfTen(sum.value, value.scale - sum.scale, overflow);
sum.scale = value.scale;
}
@@ -738,12 +779,14 @@ namespace orc {
}
};
- class DoubleColumnStatisticsImpl: public DoubleColumnStatistics,
- public MutableColumnStatistics {
- private:
+ class DoubleColumnStatisticsImpl : public DoubleColumnStatistics, public MutableColumnStatistics {
+ private:
InternalDoubleStatistics _stats;
- public:
- DoubleColumnStatisticsImpl() { reset(); }
+
+ public:
+ DoubleColumnStatisticsImpl() {
+ reset();
+ }
DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats);
virtual ~DoubleColumnStatisticsImpl() override;
@@ -780,17 +823,17 @@ namespace orc {
}
double getMinimum() const override {
- if(hasMinimum()){
+ if (hasMinimum()) {
return _stats.getMinimum();
- }else{
+ } else {
throw ParseError("Minimum is not defined.");
}
}
double getMaximum() const override {
- if(hasMaximum()){
+ if (hasMaximum()) {
return _stats.getMaximum();
- }else{
+ } else {
throw ParseError("Maximum is not defined.");
}
}
@@ -806,9 +849,9 @@ namespace orc {
}
double getSum() const override {
- if(hasSum()){
+ if (hasSum()) {
return _stats.getSum();
- }else{
+ } else {
throw ParseError("Sum is not defined.");
}
}
@@ -825,7 +868,7 @@ namespace orc {
void merge(const MutableColumnStatistics& other) override {
const DoubleColumnStatisticsImpl& doubleStats =
- dynamic_cast<const DoubleColumnStatisticsImpl&>(other);
+ dynamic_cast<const DoubleColumnStatisticsImpl&>(other);
_stats.merge(doubleStats._stats);
_stats.setHasSum(_stats.hasSum() && doubleStats.hasSum());
@@ -840,10 +883,10 @@ namespace orc {
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
+ pbStats.set_has_null(_stats.hasNull());
+ pbStats.set_number_of_values(_stats.getNumberOfValues());
- proto::DoubleStatistics* doubleStats = pbStats.mutable_doublestatistics();
+ proto::DoubleStatistics* doubleStats = pbStats.mutable_double_statistics();
if (_stats.hasMinimum()) {
doubleStats->set_minimum(_stats.getMinimum());
doubleStats->set_maximum(_stats.getMaximum());
@@ -863,33 +906,36 @@ namespace orc {
buffer << "Data type: Double" << std::endl
<< "Values: " << getNumberOfValues() << std::endl
<< "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasMinimum()){
+ if (hasMinimum()) {
buffer << "Minimum: " << getMinimum() << std::endl;
- }else{
+ } else {
buffer << "Minimum: not defined" << std::endl;
}
- if(hasMaximum()){
+ if (hasMaximum()) {
buffer << "Maximum: " << getMaximum() << std::endl;
- }else{
+ } else {
buffer << "Maximum: not defined" << std::endl;
}
- if(hasSum()){
+ if (hasSum()) {
buffer << "Sum: " << getSum() << std::endl;
- }else{
+ } else {
buffer << "Sum: not defined" << std::endl;
}
return buffer.str();
}
};
- class IntegerColumnStatisticsImpl: public IntegerColumnStatistics,
- public MutableColumnStatistics {
- private:
+ class IntegerColumnStatisticsImpl : public IntegerColumnStatistics,
+ public MutableColumnStatistics {
+ private:
InternalIntegerStatistics _stats;
- public:
- IntegerColumnStatisticsImpl() { reset(); }
+
+ public:
+ IntegerColumnStatisticsImpl() {
+ reset();
+ }
IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats);
virtual ~IntegerColumnStatisticsImpl() override;
@@ -926,17 +972,17 @@ namespace orc {
}
int64_t getMinimum() const override {
- if(hasMinimum()){
+ if (hasMinimum()) {
return _stats.getMinimum();
- }else{
+ } else {
throw ParseError("Minimum is not defined.");
}
}
int64_t getMaximum() const override {
- if(hasMaximum()){
+ if (hasMaximum()) {
return _stats.getMaximum();
- }else{
+ } else {
throw ParseError("Maximum is not defined.");
}
}
@@ -952,9 +998,9 @@ namespace orc {
}
int64_t getSum() const override {
- if(hasSum()){
+ if (hasSum()) {
return _stats.getSum();
- }else{
+ } else {
throw ParseError("Sum is not defined.");
}
}
@@ -984,7 +1030,7 @@ namespace orc {
void merge(const MutableColumnStatistics& other) override {
const IntegerColumnStatisticsImpl& intStats =
- dynamic_cast<const IntegerColumnStatisticsImpl&>(other);
+ dynamic_cast<const IntegerColumnStatisticsImpl&>(other);
_stats.merge(intStats._stats);
@@ -1005,10 +1051,10 @@ namespace orc {
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
+ pbStats.set_has_null(_stats.hasNull());
+ pbStats.set_number_of_values(_stats.getNumberOfValues());
- proto::IntegerStatistics* intStats = pbStats.mutable_intstatistics();
+ proto::IntegerStatistics* intStats = pbStats.mutable_int_statistics();
if (_stats.hasMinimum()) {
intStats->set_minimum(_stats.getMinimum());
intStats->set_maximum(_stats.getMaximum());
@@ -1028,33 +1074,32 @@ namespace orc {
buffer << "Data type: Integer" << std::endl
<< "Values: " << getNumberOfValues() << std::endl
<< "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasMinimum()){
+ if (hasMinimum()) {
buffer << "Minimum: " << getMinimum() << std::endl;
- }else{
+ } else {
buffer << "Minimum: not defined" << std::endl;
}
- if(hasMaximum()){
+ if (hasMaximum()) {
buffer << "Maximum: " << getMaximum() << std::endl;
- }else{
+ } else {
buffer << "Maximum: not defined" << std::endl;
}
- if(hasSum()){
+ if (hasSum()) {
buffer << "Sum: " << getSum() << std::endl;
- }else{
+ } else {
buffer << "Sum: not defined" << std::endl;
}
return buffer.str();
}
};
- class StringColumnStatisticsImpl: public StringColumnStatistics,
- public MutableColumnStatistics{
- private:
+ class StringColumnStatisticsImpl : public StringColumnStatistics, public MutableColumnStatistics {
+ private:
InternalStringStatistics _stats;
- public:
+ public:
StringColumnStatisticsImpl() {
reset();
}
@@ -1094,18 +1139,18 @@ namespace orc {
_stats.setHasNull(hasNull);
}
- const std::string & getMinimum() const override {
- if(hasMinimum()){
+ const std::string& getMinimum() const override {
+ if (hasMinimum()) {
return _stats.getMinimum();
- }else{
+ } else {
throw ParseError("Minimum is not defined.");
}
}
- const std::string & getMaximum() const override {
- if(hasMaximum()){
+ const std::string& getMaximum() const override {
+ if (hasMaximum()) {
return _stats.getMaximum();
- }else{
+ } else {
throw ParseError("Maximum is not defined.");
}
}
@@ -1121,9 +1166,9 @@ namespace orc {
}
uint64_t getTotalLength() const override {
- if(hasTotalLength()){
+ if (hasTotalLength()) {
return _stats.getTotalLength();
- }else{
+ } else {
throw ParseError("Total length is not defined.");
}
}
@@ -1141,20 +1186,16 @@ namespace orc {
setMaximum(tempStr);
} else {
// update min
- int minCmp = strncmp(_stats.getMinimum().c_str(),
- value,
+ int minCmp = strncmp(_stats.getMinimum().c_str(), value,
std::min(_stats.getMinimum().length(), length));
- if (minCmp > 0 ||
- (minCmp == 0 && length < _stats.getMinimum().length())) {
+ if (minCmp > 0 || (minCmp == 0 && length < _stats.getMinimum().length())) {
setMinimum(std::string(value, value + length));
}
// update max
- int maxCmp = strncmp(_stats.getMaximum().c_str(),
- value,
+ int maxCmp = strncmp(_stats.getMaximum().c_str(), value,
std::min(_stats.getMaximum().length(), length));
- if (maxCmp < 0 ||
- (maxCmp == 0 && length > _stats.getMaximum().length())) {
+ if (maxCmp < 0 || (maxCmp == 0 && length > _stats.getMaximum().length())) {
setMaximum(std::string(value, value + length));
}
}
@@ -1169,7 +1210,7 @@ namespace orc {
void merge(const MutableColumnStatistics& other) override {
const StringColumnStatisticsImpl& strStats =
- dynamic_cast<const StringColumnStatisticsImpl&>(other);
+ dynamic_cast<const StringColumnStatisticsImpl&>(other);
_stats.merge(strStats._stats);
}
@@ -1179,10 +1220,10 @@ namespace orc {
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
+ pbStats.set_has_null(_stats.hasNull());
+ pbStats.set_number_of_values(_stats.getNumberOfValues());
- proto::StringStatistics* strStats = pbStats.mutable_stringstatistics();
+ proto::StringStatistics* strStats = pbStats.mutable_string_statistics();
if (_stats.hasMinimum()) {
strStats->set_minimum(TString(_stats.getMinimum()));
strStats->set_maximum(TString(_stats.getMaximum()));
@@ -1202,42 +1243,44 @@ namespace orc {
buffer << "Data type: String" << std::endl
<< "Values: " << getNumberOfValues() << std::endl
<< "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasMinimum()){
+ if (hasMinimum()) {
buffer << "Minimum: " << getMinimum() << std::endl;
- }else{
+ } else {
buffer << "Minimum is not defined" << std::endl;
}
- if(hasMaximum()){
+ if (hasMaximum()) {
buffer << "Maximum: " << getMaximum() << std::endl;
- }else{
+ } else {
buffer << "Maximum is not defined" << std::endl;
}
- if(hasTotalLength()){
+ if (hasTotalLength()) {
buffer << "Total length: " << getTotalLength() << std::endl;
- }else{
+ } else {
buffer << "Total length is not defined" << std::endl;
}
return buffer.str();
}
};
- class TimestampColumnStatisticsImpl: public TimestampColumnStatistics,
- public MutableColumnStatistics {
- private:
+ class TimestampColumnStatisticsImpl : public TimestampColumnStatistics,
+ public MutableColumnStatistics {
+ private:
InternalIntegerStatistics _stats;
bool _hasLowerBound;
bool _hasUpperBound;
int64_t _lowerBound;
int64_t _upperBound;
- int32_t _minimumNanos; // last 6 digits of nanosecond of minimum timestamp
- int32_t _maximumNanos; // last 6 digits of nanosecond of maximum timestamp
+ int32_t _minimumNanos; // last 6 digits of nanosecond of minimum timestamp
+ int32_t _maximumNanos; // last 6 digits of nanosecond of maximum timestamp
static constexpr int32_t DEFAULT_MIN_NANOS = 0;
static constexpr int32_t DEFAULT_MAX_NANOS = 999999;
- public:
- TimestampColumnStatisticsImpl() { reset(); }
+ public:
+ TimestampColumnStatisticsImpl() {
+ reset();
+ }
TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats,
const StatContext& statContext);
virtual ~TimestampColumnStatisticsImpl() override;
@@ -1271,17 +1314,17 @@ namespace orc {
}
int64_t getMinimum() const override {
- if(hasMinimum()){
+ if (hasMinimum()) {
return _stats.getMinimum();
- }else{
+ } else {
throw ParseError("Minimum is not defined.");
}
}
int64_t getMaximum() const override {
- if(hasMaximum()){
+ if (hasMaximum()) {
return _stats.getMaximum();
- }else{
+ } else {
throw ParseError("Maximum is not defined.");
}
}
@@ -1326,7 +1369,7 @@ namespace orc {
void merge(const MutableColumnStatistics& other) override {
const TimestampColumnStatisticsImpl& tsStats =
- dynamic_cast<const TimestampColumnStatisticsImpl&>(other);
+ dynamic_cast<const TimestampColumnStatisticsImpl&>(other);
_stats.setHasNull(_stats.hasNull() || tsStats.hasNull());
_stats.setNumberOfValues(_stats.getNumberOfValues() + tsStats.getNumberOfValues());
@@ -1365,25 +1408,24 @@ namespace orc {
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
+ pbStats.set_has_null(_stats.hasNull());
+ pbStats.set_number_of_values(_stats.getNumberOfValues());
- proto::TimestampStatistics* tsStats =
- pbStats.mutable_timestampstatistics();
+ proto::TimestampStatistics* tsStats = pbStats.mutable_timestamp_statistics();
if (_stats.hasMinimum()) {
- tsStats->set_minimumutc(_stats.getMinimum());
- tsStats->set_maximumutc(_stats.getMaximum());
+ tsStats->set_minimum_utc(_stats.getMinimum());
+ tsStats->set_maximum_utc(_stats.getMaximum());
if (_minimumNanos != DEFAULT_MIN_NANOS) {
- tsStats->set_minimumnanos(_minimumNanos + 1);
+ tsStats->set_minimum_nanos(_minimumNanos + 1);
}
if (_maximumNanos != DEFAULT_MAX_NANOS) {
- tsStats->set_maximumnanos(_maximumNanos + 1);
+ tsStats->set_maximum_nanos(_maximumNanos + 1);
}
} else {
- tsStats->clear_minimumutc();
- tsStats->clear_maximumutc();
- tsStats->clear_minimumnanos();
- tsStats->clear_maximumnanos();
+ tsStats->clear_minimum_utc();
+ tsStats->clear_maximum_utc();
+ tsStats->clear_minimum_nanos();
+ tsStats->clear_maximum_nanos();
}
}
@@ -1396,43 +1438,39 @@ namespace orc {
buffer << "Data type: Timestamp" << std::endl
<< "Values: " << getNumberOfValues() << std::endl
<< "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasMinimum()){
+ if (hasMinimum()) {
secs = static_cast<time_t>(getMinimum() / 1000);
gmtime_r(&secs, &tmValue);
strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
- buffer << "Minimum: " << timeBuffer << "."
- << (getMinimum() % 1000) << std::endl;
- }else{
+ buffer << "Minimum: " << timeBuffer << "." << (getMinimum() % 1000) << std::endl;
+ } else {
buffer << "Minimum is not defined" << std::endl;
}
- if(hasLowerBound()){
+ if (hasLowerBound()) {
secs = static_cast<time_t>(getLowerBound() / 1000);
gmtime_r(&secs, &tmValue);
strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
- buffer << "LowerBound: " << timeBuffer << "."
- << (getLowerBound() % 1000) << std::endl;
- }else{
+ buffer << "LowerBound: " << timeBuffer << "." << (getLowerBound() % 1000) << std::endl;
+ } else {
buffer << "LowerBound is not defined" << std::endl;
}
- if(hasMaximum()){
- secs = static_cast<time_t>(getMaximum()/1000);
+ if (hasMaximum()) {
+ secs = static_cast<time_t>(getMaximum() / 1000);
gmtime_r(&secs, &tmValue);
strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
- buffer << "Maximum: " << timeBuffer << "."
- << (getMaximum() % 1000) << std::endl;
- }else{
+ buffer << "Maximum: " << timeBuffer << "." << (getMaximum() % 1000) << std::endl;
+ } else {
buffer << "Maximum is not defined" << std::endl;
}
- if(hasUpperBound()){
+ if (hasUpperBound()) {
secs = static_cast<time_t>(getUpperBound() / 1000);
gmtime_r(&secs, &tmValue);
strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
- buffer << "UpperBound: " << timeBuffer << "."
- << (getUpperBound() % 1000) << std::endl;
- }else{
+ buffer << "UpperBound: " << timeBuffer << "." << (getUpperBound() % 1000) << std::endl;
+ } else {
buffer << "UpperBound is not defined" << std::endl;
}
@@ -1448,17 +1486,17 @@ namespace orc {
}
int64_t getLowerBound() const override {
- if(hasLowerBound()){
+ if (hasLowerBound()) {
return _lowerBound;
- }else{
+ } else {
throw ParseError("LowerBound is not defined.");
}
}
int64_t getUpperBound() const override {
- if(hasUpperBound()){
+ if (hasUpperBound()) {
return _upperBound;
- }else{
+ } else {
throw ParseError("UpperBound is not defined.");
}
}
@@ -1482,12 +1520,14 @@ namespace orc {
class CollectionColumnStatisticsImpl : public CollectionColumnStatistics,
public MutableColumnStatistics {
- private:
+ private:
InternalCollectionStatistics _stats;
- public:
- CollectionColumnStatisticsImpl() { reset(); }
- CollectionColumnStatisticsImpl(const proto::ColumnStatistics &stats);
+ public:
+ CollectionColumnStatisticsImpl() {
+ reset();
+ }
+ CollectionColumnStatisticsImpl(const proto::ColumnStatistics& stats);
virtual ~CollectionColumnStatisticsImpl() override;
bool hasMinimumChildren() const override {
@@ -1523,7 +1563,7 @@ namespace orc {
}
uint64_t getMinimumChildren() const override {
- if(hasMinimumChildren()) {
+ if (hasMinimumChildren()) {
return _stats.getMinimum();
} else {
throw ParseError("MinimumChildren is not defined.");
@@ -1531,7 +1571,7 @@ namespace orc {
}
uint64_t getMaximumChildren() const override {
- if(hasMaximumChildren()) {
+ if (hasMaximumChildren()) {
return _stats.getMaximum();
} else {
throw ParseError("MaximumChildren is not defined.");
@@ -1539,7 +1579,7 @@ namespace orc {
}
uint64_t getTotalChildren() const override {
- if(hasTotalChildren()) {
+ if (hasTotalChildren()) {
return _stats.getSum();
} else {
throw ParseError("TotalChildren is not defined.");
@@ -1598,31 +1638,30 @@ namespace orc {
}
}
- void toProtoBuf(proto::ColumnStatistics &pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
+ void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
+ pbStats.set_has_null(_stats.hasNull());
+ pbStats.set_number_of_values(_stats.getNumberOfValues());
- proto::CollectionStatistics* collectionStats =
- pbStats.mutable_collectionstatistics();
+ proto::CollectionStatistics* collectionStats = pbStats.mutable_collection_statistics();
if (_stats.hasMinimum()) {
- collectionStats->set_minchildren(_stats.getMinimum());
- collectionStats->set_maxchildren(_stats.getMaximum());
+ collectionStats->set_min_children(_stats.getMinimum());
+ collectionStats->set_max_children(_stats.getMaximum());
} else {
- collectionStats->clear_minchildren();
- collectionStats->clear_maxchildren();
+ collectionStats->clear_min_children();
+ collectionStats->clear_max_children();
}
if (_stats.hasSum()) {
- collectionStats->set_totalchildren(_stats.getSum());
+ collectionStats->set_total_children(_stats.getSum());
} else {
- collectionStats->clear_totalchildren();
+ collectionStats->clear_total_children();
}
}
std::string toString() const override {
std::ostringstream buffer;
buffer << "Data type: Collection(LIST|MAP)" << std::endl
- << "Values: " << getNumberOfValues() << std::endl
- << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
if (hasMinimumChildren()) {
buffer << "MinChildren: " << getMinimumChildren() << std::endl;
} else {
@@ -1647,22 +1686,20 @@ namespace orc {
ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
const StatContext& statContext);
- class StatisticsImpl: public Statistics {
- private:
+ class StatisticsImpl : public Statistics {
+ private:
std::vector<ColumnStatistics*> colStats;
// DELIBERATELY NOT IMPLEMENTED
StatisticsImpl(const StatisticsImpl&);
StatisticsImpl& operator=(const StatisticsImpl&);
- public:
- StatisticsImpl(const proto::StripeStatistics& stripeStats,
- const StatContext& statContext);
+ public:
+ StatisticsImpl(const proto::StripeStatistics& stripeStats, const StatContext& statContext);
StatisticsImpl(const proto::Footer& footer, const StatContext& statContext);
- virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId
- ) const override {
+ virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override {
return colStats[columnId];
}
@@ -1673,24 +1710,21 @@ namespace orc {
}
};
- class StripeStatisticsImpl: public StripeStatistics {
- private:
+ class StripeStatisticsImpl : public StripeStatistics {
+ private:
std::unique_ptr<StatisticsImpl> columnStats;
- std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > >
- rowIndexStats;
+ std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > rowIndexStats;
// DELIBERATELY NOT IMPLEMENTED
StripeStatisticsImpl(const StripeStatisticsImpl&);
StripeStatisticsImpl& operator=(const StripeStatisticsImpl&);
- public:
- StripeStatisticsImpl(
- const proto::StripeStatistics& stripeStats,
- std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
- const StatContext& statContext);
+ public:
+ StripeStatisticsImpl(const proto::StripeStatistics& stripeStats,
+ std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
+ const StatContext& statContext);
- virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId
- ) const override {
+ virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override {
return columnStats->getColumnStatistics(columnId);
}
@@ -1699,8 +1733,7 @@ namespace orc {
}
virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId,
- uint32_t rowIndex
- ) const override {
+ uint32_t rowIndex) const override {
// check id indices are valid
return rowIndexStats[columnId][rowIndex].get();
}
@@ -1717,9 +1750,8 @@ namespace orc {
* @param type of column
* @return MutableColumnStatistics instances
*/
- std::unique_ptr<MutableColumnStatistics> createColumnStatistics(
- const Type& type);
+ std::unique_ptr<MutableColumnStatistics> createColumnStatistics(const Type& type);
-}// namespace
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/src/StripeStream.cc b/contrib/libs/apache/orc/c++/src/StripeStream.cc
index 6d6dda8328..8507e95767 100644
--- a/contrib/libs/apache/orc/c++/src/StripeStream.cc
+++ b/contrib/libs/apache/orc/c++/src/StripeStream.cc
@@ -16,10 +16,10 @@
* limitations under the License.
*/
-#include "orc/Exceptions.hh"
+#include "StripeStream.hh"
#include "RLE.hh"
#include "Reader.hh"
-#include "StripeStream.hh"
+#include "orc/Exceptions.hh"
#include "wrap/coded-stream-wrapper.h"
@@ -27,19 +27,17 @@ namespace orc {
StripeStreamsImpl::StripeStreamsImpl(const RowReaderImpl& _reader, uint64_t _index,
const proto::StripeInformation& _stripeInfo,
- const proto::StripeFooter& _footer,
- uint64_t _stripeStart,
- InputStream& _input,
- const Timezone& _writerTimezone,
- const Timezone& _readerTimezone
- ): reader(_reader),
- stripeInfo(_stripeInfo),
- footer(_footer),
- stripeIndex(_index),
- stripeStart(_stripeStart),
- input(_input),
- writerTimezone(_writerTimezone),
- readerTimezone(_readerTimezone) {
+ const proto::StripeFooter& _footer, uint64_t _stripeStart,
+ InputStream& _input, const Timezone& _writerTimezone,
+ const Timezone& _readerTimezone)
+ : reader(_reader),
+ stripeInfo(_stripeInfo),
+ footer(_footer),
+ stripeIndex(_index),
+ stripeStart(_stripeStart),
+ input(_input),
+ writerTimezone(_writerTimezone),
+ readerTimezone(_readerTimezone) {
// PASS
}
@@ -55,7 +53,6 @@ namespace orc {
// PASS
}
-
StreamInformationImpl::~StreamInformationImpl() {
// PASS
}
@@ -64,8 +61,7 @@ namespace orc {
return reader.getSelectedColumns();
}
- proto::ColumnEncoding StripeStreamsImpl::getEncoding(uint64_t columnId
- ) const {
+ proto::ColumnEncoding StripeStreamsImpl::getEncoding(uint64_t columnId) const {
return footer.columns(static_cast<int>(columnId));
}
@@ -81,48 +77,46 @@ namespace orc {
return reader.getFileContents().errorStream;
}
- std::unique_ptr<SeekableInputStream>
- StripeStreamsImpl::getStream(uint64_t columnId,
- proto::Stream_Kind kind,
- bool shouldStream) const {
+ std::unique_ptr<SeekableInputStream> StripeStreamsImpl::getStream(uint64_t columnId,
+ proto::Stream_Kind kind,
+ bool shouldStream) const {
uint64_t offset = stripeStart;
- uint64_t dataEnd = stripeInfo.offset() + stripeInfo.indexlength() + stripeInfo.datalength();
- MemoryPool *pool = reader.getFileContents().pool;
- for(int i = 0; i < footer.streams_size(); ++i) {
+ uint64_t dataEnd = stripeInfo.offset() + stripeInfo.index_length() + stripeInfo.data_length();
+ MemoryPool* pool = reader.getFileContents().pool;
+ for (int i = 0; i < footer.streams_size(); ++i) {
const proto::Stream& stream = footer.streams(i);
- if (stream.has_kind() &&
- stream.kind() == kind &&
+ if (stream.has_kind() && stream.kind() == kind &&
stream.column() == static_cast<uint64_t>(columnId)) {
uint64_t streamLength = stream.length();
- uint64_t myBlock = shouldStream ? input.getNaturalReadSize(): streamLength;
+ uint64_t myBlock = shouldStream ? input.getNaturalReadSize() : streamLength;
if (offset + streamLength > dataEnd) {
std::stringstream msg;
msg << "Malformed stream meta at stream index " << i << " in stripe " << stripeIndex
<< ": streamOffset=" << offset << ", streamLength=" << streamLength
- << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength="
- << stripeInfo.indexlength() << ", stripeDataLength=" << stripeInfo.datalength();
+ << ", stripeOffset=" << stripeInfo.offset()
+ << ", stripeIndexLength=" << stripeInfo.index_length()
+ << ", stripeDataLength=" << stripeInfo.data_length();
throw ParseError(msg.str());
}
return createDecompressor(reader.getCompression(),
- std::unique_ptr<SeekableInputStream>
- (new SeekableFileInputStream
- (&input,
- offset,
- stream.length(),
- *pool,
- myBlock)),
- reader.getCompressionSize(),
- *pool);
+ std::make_unique<SeekableFileInputStream>(
+ &input, offset, stream.length(), *pool, myBlock),
+ reader.getCompressionSize(), *pool,
+ reader.getFileContents().readerMetrics);
}
offset += stream.length();
}
- return std::unique_ptr<SeekableInputStream>();
+ return nullptr;
}
MemoryPool& StripeStreamsImpl::getMemoryPool() const {
return *reader.getFileContents().pool;
}
+ ReaderMetrics* StripeStreamsImpl::getReaderMetrics() const {
+ return reader.getFileContents().readerMetrics;
+ }
+
bool StripeStreamsImpl::getThrowOnHive11DecimalOverflow() const {
return reader.getThrowOnHive11DecimalOverflow();
}
@@ -135,37 +129,33 @@ namespace orc {
return reader.getForcedScaleOnHive11Decimal();
}
+ const SchemaEvolution* StripeStreamsImpl::getSchemaEvolution() const {
+ return reader.getSchemaEvolution();
+ }
+
void StripeInformationImpl::ensureStripeFooterLoaded() const {
if (stripeFooter.get() == nullptr) {
std::unique_ptr<SeekableInputStream> pbStream =
- createDecompressor(compression,
- std::unique_ptr<SeekableInputStream>
- (new SeekableFileInputStream(stream,
- offset +
- indexLength +
- dataLength,
- footerLength,
- memory)),
- blockSize,
- memory);
- stripeFooter.reset(new proto::StripeFooter());
+ createDecompressor(compression,
+ std::make_unique<SeekableFileInputStream>(
+ stream, offset + indexLength + dataLength, footerLength, memory),
+ blockSize, memory, metrics);
+ stripeFooter = std::make_unique<proto::StripeFooter>();
if (!stripeFooter->ParseFromZeroCopyStream(pbStream.get())) {
throw ParseError("Failed to parse the stripe footer");
}
}
}
- std::unique_ptr<StreamInformation>
- StripeInformationImpl::getStreamInformation(uint64_t streamId) const {
+ std::unique_ptr<StreamInformation> StripeInformationImpl::getStreamInformation(
+ uint64_t streamId) const {
ensureStripeFooterLoaded();
uint64_t streamOffset = offset;
- for(uint64_t s=0; s < streamId; ++s) {
+ for (uint64_t s = 0; s < streamId; ++s) {
streamOffset += stripeFooter->streams(static_cast<int>(s)).length();
}
- return ORC_UNIQUE_PTR<StreamInformation>
- (new StreamInformationImpl(streamOffset,
- stripeFooter->
- streams(static_cast<int>(streamId))));
+ return std::make_unique<StreamInformationImpl>(
+ streamOffset, stripeFooter->streams(static_cast<int>(streamId)));
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/StripeStream.hh b/contrib/libs/apache/orc/c++/src/StripeStream.hh
index 8d9fb06527..eae6ce0c31 100644
--- a/contrib/libs/apache/orc/c++/src/StripeStream.hh
+++ b/contrib/libs/apache/orc/c++/src/StripeStream.hh
@@ -23,6 +23,7 @@
#include "orc/OrcFile.hh"
#include "orc/Reader.hh"
+#include "ColumnReader.hh"
#include "Timezone.hh"
#include "TypeImpl.hh"
@@ -31,11 +32,11 @@ namespace orc {
class RowReaderImpl;
/**
- * StripeStream Implementation
- */
+ * StripeStream Implementation
+ */
- class StripeStreamsImpl: public StripeStreams {
- private:
+ class StripeStreamsImpl : public StripeStreams {
+ private:
const RowReaderImpl& reader;
const proto::StripeInformation& stripeInfo;
const proto::StripeFooter& footer;
@@ -45,29 +46,26 @@ namespace orc {
const Timezone& writerTimezone;
const Timezone& readerTimezone;
- public:
+ public:
StripeStreamsImpl(const RowReaderImpl& reader, uint64_t index,
- const proto::StripeInformation& stripeInfo,
- const proto::StripeFooter& footer,
- uint64_t stripeStart,
- InputStream& input,
- const Timezone& writerTimezone,
+ const proto::StripeInformation& stripeInfo, const proto::StripeFooter& footer,
+ uint64_t stripeStart, InputStream& input, const Timezone& writerTimezone,
const Timezone& readerTimezone);
virtual ~StripeStreamsImpl() override;
virtual const std::vector<bool> getSelectedColumns() const override;
- virtual proto::ColumnEncoding getEncoding(uint64_t columnId
- ) const override;
+ virtual proto::ColumnEncoding getEncoding(uint64_t columnId) const override;
- virtual std::unique_ptr<SeekableInputStream>
- getStream(uint64_t columnId,
- proto::Stream_Kind kind,
- bool shouldStream) const override;
+ virtual std::unique_ptr<SeekableInputStream> getStream(uint64_t columnId,
+ proto::Stream_Kind kind,
+ bool shouldStream) const override;
MemoryPool& getMemoryPool() const override;
+ ReaderMetrics* getReaderMetrics() const override;
+
const Timezone& getWriterTimezone() const override;
const Timezone& getReaderTimezone() const override;
@@ -79,25 +77,27 @@ namespace orc {
bool isDecimalAsLong() const override;
int32_t getForcedScaleOnHive11Decimal() const override;
+
+ const SchemaEvolution* getSchemaEvolution() const override;
};
- /**
- * StreamInformation Implementation
- */
+ /**
+ * StreamInformation Implementation
+ */
- class StreamInformationImpl: public StreamInformation {
- private:
+ class StreamInformationImpl : public StreamInformation {
+ private:
StreamKind kind;
uint64_t column;
uint64_t offset;
uint64_t length;
- public:
- StreamInformationImpl(uint64_t _offset,
- const proto::Stream& stream
- ): kind(static_cast<StreamKind>(stream.kind())),
- column(stream.column()),
- offset(_offset),
- length(stream.length()) {
+
+ public:
+ StreamInformationImpl(uint64_t _offset, const proto::Stream& stream)
+ : kind(static_cast<StreamKind>(stream.kind())),
+ column(stream.column()),
+ offset(_offset),
+ length(stream.length()) {
// PASS
}
@@ -120,9 +120,9 @@ namespace orc {
}
};
- /**
- * StripeInformation Implementation
- */
+ /**
+ * StripeInformation Implementation
+ */
class StripeInformationImpl : public StripeInformation {
uint64_t offset;
@@ -135,27 +135,24 @@ namespace orc {
CompressionKind compression;
uint64_t blockSize;
mutable std::unique_ptr<proto::StripeFooter> stripeFooter;
+ ReaderMetrics* metrics;
void ensureStripeFooterLoaded() const;
- public:
-
- StripeInformationImpl(uint64_t _offset,
- uint64_t _indexLength,
- uint64_t _dataLength,
- uint64_t _footerLength,
- uint64_t _numRows,
- InputStream* _stream,
- MemoryPool& _memory,
- CompressionKind _compression,
- uint64_t _blockSize
- ) : offset(_offset),
- indexLength(_indexLength),
- dataLength(_dataLength),
- footerLength(_footerLength),
- numRows(_numRows),
- stream(_stream),
- memory(_memory),
- compression(_compression),
- blockSize(_blockSize) {
+
+ public:
+ StripeInformationImpl(uint64_t _offset, uint64_t _indexLength, uint64_t _dataLength,
+ uint64_t _footerLength, uint64_t _numRows, InputStream* _stream,
+ MemoryPool& _memory, CompressionKind _compression, uint64_t _blockSize,
+ ReaderMetrics* _metrics)
+ : offset(_offset),
+ indexLength(_indexLength),
+ dataLength(_dataLength),
+ footerLength(_footerLength),
+ numRows(_numRows),
+ stream(_stream),
+ memory(_memory),
+ compression(_compression),
+ blockSize(_blockSize),
+ metrics(_metrics) {
// PASS
}
@@ -174,7 +171,7 @@ namespace orc {
return indexLength;
}
- uint64_t getDataLength()const override {
+ uint64_t getDataLength() const override {
return dataLength;
}
@@ -191,29 +188,25 @@ namespace orc {
return static_cast<uint64_t>(stripeFooter->streams_size());
}
- std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId
- ) const override;
+ std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId) const override;
ColumnEncodingKind getColumnEncoding(uint64_t colId) const override {
ensureStripeFooterLoaded();
- return static_cast<ColumnEncodingKind>(stripeFooter->
- columns(static_cast<int>(colId))
- .kind());
+ return static_cast<ColumnEncodingKind>(stripeFooter->columns(static_cast<int>(colId)).kind());
}
uint64_t getDictionarySize(uint64_t colId) const override {
ensureStripeFooterLoaded();
- return static_cast<ColumnEncodingKind>(stripeFooter->
- columns(static_cast<int>(colId))
- .dictionarysize());
+ return static_cast<ColumnEncodingKind>(
+ stripeFooter->columns(static_cast<int>(colId)).dictionary_size());
}
const std::string& getWriterTimezone() const override {
ensureStripeFooterLoaded();
- return stripeFooter->writertimezone();
+ return stripeFooter->writer_timezone();
}
};
-}
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/src/Timezone.cc b/contrib/libs/apache/orc/c++/src/Timezone.cc
index 318e5bcc12..27e14480d5 100644
--- a/contrib/libs/apache/orc/c++/src/Timezone.cc
+++ b/contrib/libs/apache/orc/c++/src/Timezone.cc
@@ -16,16 +16,17 @@
* limitations under the License.
*/
-#include "orc/OrcFile.hh"
#include "Timezone.hh"
+#include "orc/OrcFile.hh"
#include <errno.h>
-#include <map>
-#include <sstream>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
+#include <filesystem>
+#include <map>
+#include <sstream>
namespace orc {
@@ -35,25 +36,21 @@ namespace orc {
// location of a symlink to the local timezone
static const char LOCAL_TIMEZONE[] = "/etc/localtime";
- enum TransitionKind {
- TRANSITION_JULIAN,
- TRANSITION_DAY,
- TRANSITION_MONTH
- };
+ enum TransitionKind { TRANSITION_JULIAN, TRANSITION_DAY, TRANSITION_MONTH };
static const int64_t MONTHS_PER_YEAR = 12;
/**
* The number of days in each month in non-leap and leap years.
*/
- static const int64_t DAYS_PER_MONTH[2][MONTHS_PER_YEAR] =
- {{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
+ static const int64_t DAYS_PER_MONTH[2][MONTHS_PER_YEAR] = {
+ {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
{31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};
static const int64_t DAYS_PER_WEEK = 7;
// Leap years and day of the week repeat every 400 years, which makes it
// a good cycle length.
static const int64_t SECONDS_PER_400_YEARS =
- SECONDS_PER_DAY * (365 * (300 + 3) + 366 * (100 - 3));
+ SECONDS_PER_DAY * (365 * (300 + 3) + 366 * (100 - 3));
/**
* Is the given year a leap year?
@@ -68,7 +65,7 @@ namespace orc {
* @return -1 if the target < array[0] or array is empty or
* i if array[i] <= target and (i == n or array[i] < array[i+1])
*/
- int64_t binarySearch(const std::vector<int64_t> &array, int64_t target) {
+ int64_t binarySearch(const std::vector<int64_t>& array, int64_t target) {
uint64_t size = array.size();
if (size == 0) {
return -1;
@@ -103,18 +100,17 @@ namespace orc {
std::string toString() const {
std::stringstream buffer;
switch (kind) {
- case TRANSITION_JULIAN:
- buffer << "julian " << day;
- break;
- case TRANSITION_DAY:
- buffer << "day " << day;
- break;
- case TRANSITION_MONTH:
- buffer << "month " << month << " week " << week << " day " << day;
- break;
+ case TRANSITION_JULIAN:
+ buffer << "julian " << day;
+ break;
+ case TRANSITION_DAY:
+ buffer << "day " << day;
+ break;
+ case TRANSITION_MONTH:
+ buffer << "month " << month << " week " << week << " day " << day;
+ break;
}
- buffer << " at " << (time / (60 * 60)) << ":" << ((time / 60) % 60)
- << ":" << (time % 60);
+ buffer << " at " << (time / (60 * 60)) << ":" << ((time / 60) % 60) << ":" << (time % 60);
return buffer.str();
}
@@ -127,48 +123,48 @@ namespace orc {
int64_t getTime(int64_t year) const {
int64_t result = time;
switch (kind) {
- case TRANSITION_JULIAN:
- result += SECONDS_PER_DAY * day;
- if (day > 60 && isLeap(year)) {
- result += SECONDS_PER_DAY;
- }
- break;
- case TRANSITION_DAY:
- result += SECONDS_PER_DAY * day;
- break;
- case TRANSITION_MONTH: {
- bool inLeap = isLeap(year);
- int64_t adjustedMonth = (month + 9) % 12 + 1;
- int64_t adjustedYear = (month <= 2) ? (year - 1) : year;
- int64_t adjustedCentury = adjustedYear / 100;
- int64_t adjustedRemainder = adjustedYear % 100;
-
- // day of the week of the first day of month
- int64_t dayOfWeek = ((26 * adjustedMonth - 2) / 10 +
- 1 + adjustedRemainder + adjustedRemainder / 4 +
- adjustedCentury / 4 - 2 * adjustedCentury) % 7;
- if (dayOfWeek < 0) {
- dayOfWeek += DAYS_PER_WEEK;
- }
+ case TRANSITION_JULIAN:
+ result += SECONDS_PER_DAY * day;
+ if (day > 60 && isLeap(year)) {
+ result += SECONDS_PER_DAY;
+ }
+ break;
+ case TRANSITION_DAY:
+ result += SECONDS_PER_DAY * day;
+ break;
+ case TRANSITION_MONTH: {
+ bool inLeap = isLeap(year);
+ int64_t adjustedMonth = (month + 9) % 12 + 1;
+ int64_t adjustedYear = (month <= 2) ? (year - 1) : year;
+ int64_t adjustedCentury = adjustedYear / 100;
+ int64_t adjustedRemainder = adjustedYear % 100;
+
+ // day of the week of the first day of month
+ int64_t dayOfWeek = ((26 * adjustedMonth - 2) / 10 + 1 + adjustedRemainder +
+ adjustedRemainder / 4 + adjustedCentury / 4 - 2 * adjustedCentury) %
+ 7;
+ if (dayOfWeek < 0) {
+ dayOfWeek += DAYS_PER_WEEK;
+ }
- int64_t d = day - dayOfWeek;
- if (d < 0) {
- d += DAYS_PER_WEEK;
- }
- for (int w = 1; w < week; ++w) {
- if (d + DAYS_PER_WEEK >= DAYS_PER_MONTH[inLeap][month - 1]) {
- break;
+ int64_t d = day - dayOfWeek;
+ if (d < 0) {
+ d += DAYS_PER_WEEK;
}
- d += DAYS_PER_WEEK;
- }
- result += d * SECONDS_PER_DAY;
+ for (int w = 1; w < week; ++w) {
+ if (d + DAYS_PER_WEEK >= DAYS_PER_MONTH[inLeap][month - 1]) {
+ break;
+ }
+ d += DAYS_PER_WEEK;
+ }
+ result += d * SECONDS_PER_DAY;
- // Add in the time for the month
- for(int m=0; m < month - 1; ++m) {
- result += DAYS_PER_MONTH[inLeap][m] * SECONDS_PER_DAY;
+ // Add in the time for the month
+ for (int m = 0; m < month - 1; ++m) {
+ result += DAYS_PER_MONTH[inLeap][m] * SECONDS_PER_DAY;
+ }
+ break;
}
- break;
- }
}
return result;
}
@@ -187,7 +183,7 @@ namespace orc {
* daylight = <name><offset>,<start day>(/<offset>)?,<end day>(/<offset>)?
* day = J<day without 2/29>|<day with 2/29>|M<month>.<week>.<day of week>
*/
- class FutureRuleImpl: public FutureRule {
+ class FutureRuleImpl : public FutureRule {
std::string ruleString;
TimezoneVariant standard;
bool hasDst;
@@ -215,17 +211,17 @@ namespace orc {
offsets.resize(400 * 2 + 1);
startInStd = start.getTime(1970) < end.getTime(1970);
int64_t base = 0;
- for(int64_t year = 1970; year < 1970 + 400; ++year) {
+ for (int64_t year = 1970; year < 1970 + 400; ++year) {
if (startInStd) {
offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] =
- base + start.getTime(year) - standard.gmtOffset;
+ base + start.getTime(year) - standard.gmtOffset;
offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] =
- base + end.getTime(year) - dst.gmtOffset;
+ base + end.getTime(year) - dst.gmtOffset;
} else {
offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] =
- base + end.getTime(year) - dst.gmtOffset;
+ base + end.getTime(year) - dst.gmtOffset;
offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] =
- base + start.getTime(year) - standard.gmtOffset;
+ base + start.getTime(year) - standard.gmtOffset;
}
base += (isLeap(year) ? 366 : 365) * SECONDS_PER_DAY;
}
@@ -233,7 +229,7 @@ namespace orc {
offsets[0] = 0;
}
- public:
+ public:
virtual ~FutureRuleImpl() override;
bool isDefined() const override;
const TimezoneVariant& getVariant(int64_t clk) const override;
@@ -287,13 +283,9 @@ namespace orc {
* A parser for the future rule strings.
*/
class FutureRuleParser {
- public:
- FutureRuleParser(const std::string& str,
- FutureRuleImpl* rule
- ): ruleString(str),
- length(str.size()),
- position(0),
- output(*rule) {
+ public:
+ FutureRuleParser(const std::string& str, FutureRuleImpl* rule)
+ : ruleString(str), length(str.size()), position(0), output(*rule) {
output.ruleString = str;
if (position != length) {
parseName(output.standard.name);
@@ -318,14 +310,13 @@ namespace orc {
}
}
- private:
-
+ private:
const std::string& ruleString;
size_t length;
size_t position;
- FutureRuleImpl &output;
+ FutureRuleImpl& output;
- void throwError(const char *msg) {
+ void throwError(const char* msg) {
std::stringstream buffer;
buffer << msg << " at " << position << " in '" << ruleString << "'";
throw TimezoneError(buffer.str());
@@ -348,7 +339,7 @@ namespace orc {
if (position == length) {
throwError("missing close '>'");
}
- position +=1;
+ position += 1;
} else {
while (position < length) {
char ch = ruleString[position];
@@ -456,9 +447,8 @@ namespace orc {
* Parse the POSIX TZ string.
*/
std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString) {
- std::shared_ptr<FutureRule> result(new FutureRuleImpl());
- FutureRuleParser parser(ruleString,
- dynamic_cast<FutureRuleImpl*>(result.get()));
+ auto result = std::make_shared<FutureRuleImpl>();
+ FutureRuleParser parser(ruleString, dynamic_cast<FutureRuleImpl*>(result.get()));
return result;
}
@@ -475,7 +465,7 @@ namespace orc {
* An abstraction of the differences between versions.
*/
class VersionParser {
- public:
+ public:
virtual ~VersionParser();
/**
@@ -496,8 +486,7 @@ namespace orc {
/**
* Parse the future string
*/
- virtual std::string parseFutureString(const unsigned char *ptr,
- uint64_t offset,
+ virtual std::string parseFutureString(const unsigned char* ptr, uint64_t offset,
uint64_t length) const = 0;
};
@@ -506,14 +495,12 @@ namespace orc {
}
static uint32_t decode32(const unsigned char* ptr) {
- return static_cast<uint32_t>(ptr[0] << 24) |
- static_cast<uint32_t>(ptr[1] << 16) |
- static_cast<uint32_t>(ptr[2] << 8) |
- static_cast<uint32_t>(ptr[3]);
+ return static_cast<uint32_t>(ptr[0] << 24) | static_cast<uint32_t>(ptr[1] << 16) |
+ static_cast<uint32_t>(ptr[2] << 8) | static_cast<uint32_t>(ptr[3]);
}
- class Version1Parser: public VersionParser {
- public:
+ class Version1Parser : public VersionParser {
+ public:
virtual ~Version1Parser() override;
virtual uint64_t getVersion() const override {
@@ -535,9 +522,7 @@ namespace orc {
return static_cast<int32_t>(decode32(ptr));
}
- virtual std::string parseFutureString(const unsigned char *,
- uint64_t,
- uint64_t) const override {
+ virtual std::string parseFutureString(const unsigned char*, uint64_t, uint64_t) const override {
return "";
}
};
@@ -546,8 +531,8 @@ namespace orc {
// PASS
}
- class Version2Parser: public VersionParser {
- public:
+ class Version2Parser : public VersionParser {
+ public:
virtual ~Version2Parser() override;
virtual uint64_t getVersion() const override {
@@ -568,11 +553,9 @@ namespace orc {
return static_cast<int64_t>(decode32(ptr)) << 32 | decode32(ptr + 4);
}
- virtual std::string parseFutureString(const unsigned char *ptr,
- uint64_t offset,
+ virtual std::string parseFutureString(const unsigned char* ptr, uint64_t offset,
uint64_t length) const override {
- return std::string(reinterpret_cast<const char*>(ptr) + offset + 1,
- length - 2);
+ return std::string(reinterpret_cast<const char*>(ptr) + offset + 1, length - 2);
}
};
@@ -580,10 +563,9 @@ namespace orc {
// PASS
}
- class TimezoneImpl: public Timezone {
- public:
- TimezoneImpl(const std::string& name,
- const std::vector<unsigned char> bytes);
+ class TimezoneImpl : public Timezone {
+ public:
+ TimezoneImpl(const std::string& _filename, const std::vector<unsigned char>& buffer);
virtual ~TimezoneImpl() override;
/**
@@ -605,15 +587,16 @@ namespace orc {
return clk + getVariant(clk).gmtOffset;
}
- private:
- void parseTimeVariants(const unsigned char* ptr,
- uint64_t variantOffset,
- uint64_t variantCount,
- uint64_t nameOffset,
- uint64_t nameCount);
- void parseZoneFile(const unsigned char* ptr,
- uint64_t sectionOffset,
- uint64_t fileLength,
+ int64_t convertFromUTC(int64_t clk) const override {
+ int64_t adjustedTime = clk - getVariant(clk).gmtOffset;
+ const auto& adjustedReader = getVariant(adjustedTime);
+ return clk - adjustedReader.gmtOffset;
+ }
+
+ private:
+ void parseTimeVariants(const unsigned char* ptr, uint64_t variantOffset, uint64_t variantCount,
+ uint64_t nameOffset, uint64_t nameCount);
+ void parseZoneFile(const unsigned char* ptr, uint64_t sectionOffset, uint64_t fileLength,
const VersionParser& version);
// filename
std::string filename;
@@ -644,10 +627,10 @@ namespace orc {
};
DIAGNOSTIC_PUSH
- #ifdef __clang__
- DIAGNOSTIC_IGNORE("-Wglobal-constructors")
- DIAGNOSTIC_IGNORE("-Wexit-time-destructors")
- #endif
+#ifdef __clang__
+ DIAGNOSTIC_IGNORE("-Wglobal-constructors")
+ DIAGNOSTIC_IGNORE("-Wexit-time-destructors")
+#endif
static std::mutex timezone_mutex;
static std::map<std::string, std::shared_ptr<Timezone> > timezoneCache;
DIAGNOSTIC_POP
@@ -656,9 +639,8 @@ namespace orc {
// PASS
}
- TimezoneImpl::TimezoneImpl(const std::string& _filename,
- const std::vector<unsigned char> buffer
- ): filename(_filename) {
+ TimezoneImpl::TimezoneImpl(const std::string& _filename, const std::vector<unsigned char>& buffer)
+ : filename(_filename) {
parseZoneFile(&buffer[0], 0, buffer.size(), Version1Parser());
// Build the literal for the ORC epoch
// 2015 Jan 1 00:00:00
@@ -675,7 +657,7 @@ namespace orc {
}
const char* getTimezoneDirectory() {
- const char *dir = getenv("TZDIR");
+ const char* dir = getenv("TZDIR");
if (!dir) {
dir = DEFAULT_TZDIR;
}
@@ -689,18 +671,23 @@ namespace orc {
const Timezone& getTimezoneByFilename(const std::string& filename) {
// ORC-110
std::lock_guard<std::mutex> timezone_lock(timezone_mutex);
- std::map<std::string, std::shared_ptr<Timezone> >::iterator itr =
- timezoneCache.find(filename);
+ std::map<std::string, std::shared_ptr<Timezone> >::iterator itr = timezoneCache.find(filename);
if (itr != timezoneCache.end()) {
return *(itr->second).get();
}
+ if (!std::filesystem::exists(std::filesystem::path(filename))) {
+ std::stringstream ss;
+ ss << "Time zone file " << filename << " does not exist."
+ << " Please install IANA time zone database and set TZDIR env.";
+ throw TimezoneError(ss.str());
+ }
try {
- ORC_UNIQUE_PTR<InputStream> file = readFile(filename);
+ std::unique_ptr<InputStream> file = readFile(filename);
size_t size = static_cast<size_t>(file->getLength());
std::vector<unsigned char> buffer(size);
file->read(&buffer[0], size, 0);
- timezoneCache[filename] = std::shared_ptr<Timezone>(new TimezoneImpl(filename, buffer));
- } catch(ParseError& err) {
+ timezoneCache[filename] = std::make_shared<TimezoneImpl>(filename, buffer);
+ } catch (ParseError& err) {
throw TimezoneError(err.what());
}
return *timezoneCache[filename].get();
@@ -732,32 +719,30 @@ namespace orc {
* Parse a set of bytes as a timezone file as if they came from filename.
*/
std::unique_ptr<Timezone> getTimezone(const std::string& filename,
- const std::vector<unsigned char>& b){
- return std::unique_ptr<Timezone>(new TimezoneImpl(filename, b));
+ const std::vector<unsigned char>& b) {
+ return std::make_unique<TimezoneImpl>(filename, b);
}
TimezoneImpl::~TimezoneImpl() {
// PASS
}
- void TimezoneImpl::parseTimeVariants(const unsigned char* ptr,
- uint64_t variantOffset,
- uint64_t variantCount,
- uint64_t nameOffset,
+ void TimezoneImpl::parseTimeVariants(const unsigned char* ptr, uint64_t variantOffset,
+ uint64_t variantCount, uint64_t nameOffset,
uint64_t nameCount) {
- for(uint64_t variant=0; variant < variantCount; ++variant) {
+ for (uint64_t variant = 0; variant < variantCount; ++variant) {
variants[variant].gmtOffset =
- static_cast<int32_t>(decode32(ptr + variantOffset + 6 * variant));
+ static_cast<int32_t>(decode32(ptr + variantOffset + 6 * variant));
variants[variant].isDst = ptr[variantOffset + 6 * variant + 4] != 0;
uint64_t nameStart = ptr[variantOffset + 6 * variant + 5];
if (nameStart >= nameCount) {
std::stringstream buffer;
- buffer << "name out of range in variant " << variant
- << " - " << nameStart << " >= " << nameCount;
+ buffer << "name out of range in variant " << variant << " - " << nameStart
+ << " >= " << nameCount;
throw TimezoneError(buffer.str());
}
- variants[variant].name = std::string(reinterpret_cast<const char*>(ptr)
- + nameOffset + nameStart);
+ variants[variant].name =
+ std::string(reinterpret_cast<const char*>(ptr) + nameOffset + nameStart);
}
}
@@ -787,17 +772,14 @@ namespace orc {
* IsGmt
* FutureString
*/
- void TimezoneImpl::parseZoneFile(const unsigned char *ptr,
- uint64_t sectionOffset,
- uint64_t fileLength,
- const VersionParser& versionParser) {
- const uint64_t magicOffset = sectionOffset + 0;
+ void TimezoneImpl::parseZoneFile(const unsigned char* ptr, uint64_t sectionOffset,
+ uint64_t fileLength, const VersionParser& versionParser) {
+ const uint64_t magicOffset = sectionOffset + 0;
const uint64_t headerOffset = magicOffset + 20;
// check for validity before we start parsing
if (fileLength < headerOffset + 6 * 4 ||
- strncmp(reinterpret_cast<const char*>(ptr) + magicOffset, "TZif", 4)
- != 0) {
+ strncmp(reinterpret_cast<const char*>(ptr) + magicOffset, "TZif", 4) != 0) {
std::stringstream buffer;
buffer << "non-tzfile " << filename;
throw TimezoneError(buffer.str());
@@ -805,24 +787,23 @@ namespace orc {
const uint64_t isGmtCount = decode32(ptr + headerOffset + 0);
const uint64_t isStdCount = decode32(ptr + headerOffset + 4);
- const uint64_t leapCount = decode32(ptr + headerOffset + 8);
- const uint64_t timeCount = decode32(ptr + headerOffset + 12);
- const uint64_t variantCount = decode32(ptr + headerOffset + 16);
- const uint64_t nameCount = decode32(ptr + headerOffset + 20);
+ const uint64_t leapCount = decode32(ptr + headerOffset + 8);
+ const uint64_t timeCount = decode32(ptr + headerOffset + 12);
+ const uint64_t variantCount = decode32(ptr + headerOffset + 16);
+ const uint64_t nameCount = decode32(ptr + headerOffset + 20);
const uint64_t timeOffset = headerOffset + 24;
- const uint64_t timeVariantOffset =
- timeOffset + versionParser.getTimeSize() * timeCount;
+ const uint64_t timeVariantOffset = timeOffset + versionParser.getTimeSize() * timeCount;
const uint64_t variantOffset = timeVariantOffset + timeCount;
const uint64_t nameOffset = variantOffset + variantCount * 6;
- const uint64_t sectionLength = nameOffset + nameCount
- + (versionParser.getTimeSize() + 4) * leapCount
- + isGmtCount + isStdCount;
+ const uint64_t sectionLength = nameOffset + nameCount +
+ (versionParser.getTimeSize() + 4) * leapCount + isGmtCount +
+ isStdCount;
if (sectionLength > fileLength) {
std::stringstream buffer;
- buffer << "tzfile too short " << filename
- << " needs " << sectionLength << " and has " << fileLength;
+ buffer << "tzfile too short " << filename << " needs " << sectionLength << " and has "
+ << fileLength;
throw TimezoneError(buffer.str());
}
@@ -835,24 +816,19 @@ namespace orc {
variants.resize(variantCount);
transitions.resize(timeCount);
currentVariant.resize(timeCount);
- parseTimeVariants(ptr, variantOffset, variantCount, nameOffset,
- nameCount);
+ parseTimeVariants(ptr, variantOffset, variantCount, nameOffset, nameCount);
bool foundAncient = false;
- for(uint64_t t=0; t < timeCount; ++t) {
- transitions[t] =
- versionParser.parseTime(ptr + timeOffset +
- t * versionParser.getTimeSize());
+ for (uint64_t t = 0; t < timeCount; ++t) {
+ transitions[t] = versionParser.parseTime(ptr + timeOffset + t * versionParser.getTimeSize());
currentVariant[t] = ptr[timeVariantOffset + t];
if (currentVariant[t] >= variantCount) {
std::stringstream buffer;
- buffer << "tzfile rule out of range " << filename
- << " references rule " << currentVariant[t]
- << " of " << variantCount;
+ buffer << "tzfile rule out of range " << filename << " references rule "
+ << currentVariant[t] << " of " << variantCount;
throw TimezoneError(buffer.str());
}
// find the oldest standard time and use that as the ancient value
- if (!foundAncient &&
- !variants[currentVariant[t]].isDst) {
+ if (!foundAncient && !variants[currentVariant[t]].isDst) {
foundAncient = true;
ancientVariant = currentVariant[t];
}
@@ -860,9 +836,8 @@ namespace orc {
if (!foundAncient) {
ancientVariant = 0;
}
- futureRule = parseFutureRule(versionParser.parseFutureString
- (ptr, sectionLength,
- fileLength - sectionLength));
+ futureRule = parseFutureRule(
+ versionParser.parseFutureString(ptr, sectionLength, fileLength - sectionLength));
// find the lower bound for applying the future rule
if (futureRule->isDefined()) {
@@ -897,11 +872,10 @@ namespace orc {
out << "Timezone file: " << filename << "\n";
out << " Version: " << version << "\n";
futureRule->print(out);
- for(uint64_t r=0; r < variants.size(); ++r) {
- out << " Variant " << r << ": "
- << variants[r].toString() << "\n";
+ for (uint64_t r = 0; r < variants.size(); ++r) {
+ out << " Variant " << r << ": " << variants[r].toString() << "\n";
}
- for(uint64_t t=0; t < transitions.size(); ++t) {
+ for (uint64_t t = 0; t < transitions.size(); ++t) {
tm timeStruct;
tm* result = nullptr;
char buffer[25];
@@ -912,25 +886,21 @@ namespace orc {
strftime(buffer, sizeof(buffer), "%F %H:%M:%S", &timeStruct);
}
}
- std::cout << " Transition: " << (result == nullptr ? "null" : buffer)
- << " (" << transitions[t] << ") -> "
- << variants[currentVariant[t]].name
- << "\n";
+ out << " Transition: " << (result == nullptr ? "null" : buffer) << " (" << transitions[t]
+ << ") -> " << variants[currentVariant[t]].name << "\n";
}
}
- TimezoneError::TimezoneError(const std::string& what
- ): std::runtime_error(what) {
+ TimezoneError::TimezoneError(const std::string& what) : std::runtime_error(what) {
// PASS
}
- TimezoneError::TimezoneError(const TimezoneError& other
- ): std::runtime_error(other) {
+ TimezoneError::TimezoneError(const TimezoneError& other) : std::runtime_error(other) {
// PASS
}
- TimezoneError::~TimezoneError() ORC_NOEXCEPT {
+ TimezoneError::~TimezoneError() noexcept {
// PASS
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/Timezone.hh b/contrib/libs/apache/orc/c++/src/Timezone.hh
index 6c8b861259..0716c5a3f2 100644
--- a/contrib/libs/apache/orc/c++/src/Timezone.hh
+++ b/contrib/libs/apache/orc/c++/src/Timezone.hh
@@ -23,9 +23,9 @@
#include "Adaptor.hh"
+#include <stdint.h>
#include <memory>
#include <stdexcept>
-#include <stdint.h>
#include <string>
#include <vector>
@@ -55,7 +55,7 @@ namespace orc {
* city in the region (eg. America/Los_Angeles or America/Mexico_City).
*/
class Timezone {
- public:
+ public:
virtual ~Timezone();
/**
@@ -79,12 +79,17 @@ namespace orc {
/**
* Get the version of the zone file.
*/
- virtual uint64_t getVersion() const =0;
+ virtual uint64_t getVersion() const = 0;
/**
* Convert wall clock time of current timezone to UTC timezone
*/
virtual int64_t convertToUTC(int64_t clk) const = 0;
+
+ /**
+ * Convert UTC timezone to wall clock time of current timezone
+ */
+ virtual int64_t convertFromUTC(int64_t clk) const = 0;
};
/**
@@ -105,11 +110,11 @@ namespace orc {
std::unique_ptr<Timezone> getTimezone(const std::string& filename,
const std::vector<unsigned char>& b);
- class TimezoneError: public std::runtime_error {
- public:
- TimezoneError(const std::string& what);
- TimezoneError(const TimezoneError&);
- virtual ~TimezoneError() ORC_NOEXCEPT;
+ class TimezoneError : public std::runtime_error {
+ public:
+ explicit TimezoneError(const std::string& what);
+ explicit TimezoneError(const TimezoneError&);
+ ~TimezoneError() noexcept override;
};
/**
@@ -118,7 +123,7 @@ namespace orc {
* the future.
*/
class FutureRule {
- public:
+ public:
virtual ~FutureRule();
virtual bool isDefined() const = 0;
virtual const TimezoneVariant& getVariant(int64_t clk) const = 0;
@@ -129,6 +134,6 @@ namespace orc {
* Parse the POSIX TZ string.
*/
std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString);
-}
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.cc b/contrib/libs/apache/orc/c++/src/TypeImpl.cc
index 14517ce164..c427a962b5 100644
--- a/contrib/libs/apache/orc/c++/src/TypeImpl.cc
+++ b/contrib/libs/apache/orc/c++/src/TypeImpl.cc
@@ -16,9 +16,9 @@
* limitations under the License.
*/
+#include "TypeImpl.hh"
#include "Adaptor.hh"
#include "orc/Exceptions.hh"
-#include "TypeImpl.hh"
#include <iostream>
#include <sstream>
@@ -51,8 +51,7 @@ namespace orc {
subtypeCount = 0;
}
- TypeImpl::TypeImpl(TypeKind _kind, uint64_t _precision,
- uint64_t _scale) {
+ TypeImpl::TypeImpl(TypeKind _kind, uint64_t _precision, uint64_t _scale) {
parent = nullptr;
columnId = -1;
maximumColumnId = -1;
@@ -66,7 +65,7 @@ namespace orc {
uint64_t TypeImpl::assignIds(uint64_t root) const {
columnId = static_cast<int64_t>(root);
uint64_t current = root + 1;
- for(uint64_t i=0; i < subtypeCount; ++i) {
+ for (uint64_t i = 0; i < subtypeCount; ++i) {
current = dynamic_cast<TypeImpl*>(subTypes[i].get())->assignIds(current);
}
maximumColumnId = static_cast<int64_t>(current) - 1;
@@ -121,8 +120,7 @@ namespace orc {
return scale;
}
- Type& TypeImpl::setAttribute(const std::string& key,
- const std::string& value) {
+ Type& TypeImpl::setAttribute(const std::string& key, const std::string& value) {
attributes[key] = value;
return *this;
}
@@ -171,8 +169,7 @@ namespace orc {
subtypeCount += 1;
}
- Type* TypeImpl::addStructField(const std::string& fieldName,
- std::unique_ptr<Type> fieldType) {
+ Type* TypeImpl::addStructField(const std::string& fieldName, std::unique_ptr<Type> fieldType) {
addChildType(std::move(fieldType));
fieldNames.push_back(fieldName);
return this;
@@ -184,299 +181,301 @@ namespace orc {
}
bool isUnquotedFieldName(std::string fieldName) {
- for (auto &ch : fieldName) {
- if (!isalnum(ch) && ch != '_') {
- return false;
- }
+ for (auto& ch : fieldName) {
+ if (!isalnum(ch) && ch != '_') {
+ return false;
+ }
}
return true;
}
std::string TypeImpl::toString() const {
switch (static_cast<int64_t>(kind)) {
- case BOOLEAN:
- return "boolean";
- case BYTE:
- return "tinyint";
- case SHORT:
- return "smallint";
- case INT:
- return "int";
- case LONG:
- return "bigint";
- case FLOAT:
- return "float";
- case DOUBLE:
- return "double";
- case STRING:
- return "string";
- case BINARY:
- return "binary";
- case TIMESTAMP:
- return "timestamp";
- case TIMESTAMP_INSTANT:
- return "timestamp with local time zone";
- case LIST:
- return "array<" + (subTypes[0] ? subTypes[0]->toString() : "void") + ">";
- case MAP:
- return "map<" + (subTypes[0] ? subTypes[0]->toString() : "void") + "," +
- (subTypes[1] ? subTypes[1]->toString() : "void") + ">";
- case STRUCT: {
- std::string result = "struct<";
- for(size_t i=0; i < subTypes.size(); ++i) {
- if (i != 0) {
- result += ",";
- }
- if (isUnquotedFieldName(fieldNames[i])) {
- result += fieldNames[i];
- } else {
- std::string name(fieldNames[i]);
- size_t pos = 0;
- while ((pos = name.find("`", pos)) != std::string::npos) {
- name.replace(pos, 1, "``");
- pos += 2;
+ case BOOLEAN:
+ return "boolean";
+ case BYTE:
+ return "tinyint";
+ case SHORT:
+ return "smallint";
+ case INT:
+ return "int";
+ case LONG:
+ return "bigint";
+ case FLOAT:
+ return "float";
+ case DOUBLE:
+ return "double";
+ case STRING:
+ return "string";
+ case BINARY:
+ return "binary";
+ case TIMESTAMP:
+ return "timestamp";
+ case TIMESTAMP_INSTANT:
+ return "timestamp with local time zone";
+ case LIST:
+ return "array<" + (subTypes[0] ? subTypes[0]->toString() : "void") + ">";
+ case MAP:
+ return "map<" + (subTypes[0] ? subTypes[0]->toString() : "void") + "," +
+ (subTypes[1] ? subTypes[1]->toString() : "void") + ">";
+ case STRUCT: {
+ std::string result = "struct<";
+ for (size_t i = 0; i < subTypes.size(); ++i) {
+ if (i != 0) {
+ result += ",";
}
- result += "`";
- result += name;
- result += "`";
+ if (isUnquotedFieldName(fieldNames[i])) {
+ result += fieldNames[i];
+ } else {
+ std::string name(fieldNames[i]);
+ size_t pos = 0;
+ while ((pos = name.find("`", pos)) != std::string::npos) {
+ name.replace(pos, 1, "``");
+ pos += 2;
+ }
+ result += "`";
+ result += name;
+ result += "`";
+ }
+ result += ":";
+ result += subTypes[i]->toString();
}
- result += ":";
- result += subTypes[i]->toString();
+ result += ">";
+ return result;
}
- result += ">";
- return result;
- }
- case UNION: {
- std::string result = "uniontype<";
- for(size_t i=0; i < subTypes.size(); ++i) {
- if (i != 0) {
- result += ",";
+ case UNION: {
+ std::string result = "uniontype<";
+ for (size_t i = 0; i < subTypes.size(); ++i) {
+ if (i != 0) {
+ result += ",";
+ }
+ result += subTypes[i]->toString();
}
- result += subTypes[i]->toString();
+ result += ">";
+ return result;
}
- result += ">";
- return result;
- }
- case DECIMAL: {
- std::stringstream result;
- result << "decimal(" << precision << "," << scale << ")";
- return result.str();
- }
- case DATE:
- return "date";
- case VARCHAR: {
- std::stringstream result;
- result << "varchar(" << maxLength << ")";
- return result.str();
- }
- case CHAR: {
- std::stringstream result;
- result << "char(" << maxLength << ")";
- return result.str();
- }
- default:
- throw NotImplementedYet("Unknown type");
+ case DECIMAL: {
+ std::stringstream result;
+ result << "decimal(" << precision << "," << scale << ")";
+ return result.str();
+ }
+ case DATE:
+ return "date";
+ case VARCHAR: {
+ std::stringstream result;
+ result << "varchar(" << maxLength << ")";
+ return result.str();
+ }
+ case CHAR: {
+ std::stringstream result;
+ result << "char(" << maxLength << ")";
+ return result.str();
+ }
+ default:
+ throw NotImplementedYet("Unknown type");
}
}
- std::unique_ptr<ColumnVectorBatch>
- TypeImpl::createRowBatch(uint64_t capacity,
- MemoryPool& memoryPool,
- bool encoded) const {
+ std::unique_ptr<ColumnVectorBatch> TypeImpl::createRowBatch(uint64_t capacity,
+ MemoryPool& memoryPool,
+ bool encoded) const {
+ return createRowBatch(capacity, memoryPool, encoded, /*useTightNumericVector=*/false);
+ }
+
+ std::unique_ptr<ColumnVectorBatch> TypeImpl::createRowBatch(uint64_t capacity,
+ MemoryPool& memoryPool, bool encoded,
+ bool useTightNumericVector) const {
switch (static_cast<int64_t>(kind)) {
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- case DATE:
- return std::unique_ptr<ColumnVectorBatch>
- (new LongVectorBatch(capacity, memoryPool));
-
- case FLOAT:
- case DOUBLE:
- return std::unique_ptr<ColumnVectorBatch>
- (new DoubleVectorBatch(capacity, memoryPool));
-
- case STRING:
- case BINARY:
- case CHAR:
- case VARCHAR:
- return encoded ?
- std::unique_ptr<ColumnVectorBatch>
- (new EncodedStringVectorBatch(capacity, memoryPool))
- : std::unique_ptr<ColumnVectorBatch>
- (new StringVectorBatch(capacity, memoryPool));
-
- case TIMESTAMP:
- case TIMESTAMP_INSTANT:
- return std::unique_ptr<ColumnVectorBatch>
- (new TimestampVectorBatch(capacity, memoryPool));
-
- case STRUCT: {
- StructVectorBatch *result = new StructVectorBatch(capacity, memoryPool);
- std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result);
- for(uint64_t i=0; i < getSubtypeCount(); ++i) {
- result->fields.push_back(getSubtype(i)->
- createRowBatch(capacity,
- memoryPool, encoded).release());
+ case BOOLEAN:
+ if (useTightNumericVector) {
+ return std::make_unique<ByteVectorBatch>(capacity, memoryPool);
+ }
+ return std::make_unique<LongVectorBatch>(capacity, memoryPool);
+ case BYTE:
+ if (useTightNumericVector) {
+ return std::make_unique<ByteVectorBatch>(capacity, memoryPool);
+ }
+ return std::make_unique<LongVectorBatch>(capacity, memoryPool);
+ case SHORT:
+ if (useTightNumericVector) {
+ return std::make_unique<ShortVectorBatch>(capacity, memoryPool);
+ }
+ return std::make_unique<LongVectorBatch>(capacity, memoryPool);
+ case INT:
+ if (useTightNumericVector) {
+ return std::make_unique<IntVectorBatch>(capacity, memoryPool);
+ }
+ return std::make_unique<LongVectorBatch>(capacity, memoryPool);
+ case LONG:
+ case DATE:
+ return std::make_unique<LongVectorBatch>(capacity, memoryPool);
+
+ case FLOAT:
+ if (useTightNumericVector) {
+ return std::make_unique<FloatVectorBatch>(capacity, memoryPool);
+ }
+ return std::make_unique<DoubleVectorBatch>(capacity, memoryPool);
+ case DOUBLE:
+ return std::make_unique<DoubleVectorBatch>(capacity, memoryPool);
+
+ case STRING:
+ case BINARY:
+ case CHAR:
+ case VARCHAR:
+ return encoded ? std::make_unique<EncodedStringVectorBatch>(capacity, memoryPool)
+ : std::make_unique<StringVectorBatch>(capacity, memoryPool);
+
+ case TIMESTAMP:
+ case TIMESTAMP_INSTANT:
+ return std::make_unique<TimestampVectorBatch>(capacity, memoryPool);
+
+ case STRUCT: {
+ auto result = std::make_unique<StructVectorBatch>(capacity, memoryPool);
+ for (uint64_t i = 0; i < getSubtypeCount(); ++i) {
+ result->fields.push_back(
+ getSubtype(i)
+ ->createRowBatch(capacity, memoryPool, encoded, useTightNumericVector)
+ .release());
+ }
+ return result;
}
- return return_value;
- }
- case LIST: {
- ListVectorBatch* result = new ListVectorBatch(capacity, memoryPool);
- std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result);
- if (getSubtype(0) != nullptr) {
- result->elements = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded);
+ case LIST: {
+ auto result = std::make_unique<ListVectorBatch>(capacity, memoryPool);
+ if (getSubtype(0) != nullptr) {
+ result->elements =
+ getSubtype(0)->createRowBatch(capacity, memoryPool, encoded, useTightNumericVector);
+ }
+ return result;
}
- return return_value;
- }
- case MAP: {
- MapVectorBatch* result = new MapVectorBatch(capacity, memoryPool);
- std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result);
- if (getSubtype(0) != nullptr) {
- result->keys = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded);
- }
- if (getSubtype(1) != nullptr) {
- result->elements = getSubtype(1)->createRowBatch(capacity, memoryPool, encoded);
+ case MAP: {
+ auto result = std::make_unique<MapVectorBatch>(capacity, memoryPool);
+ if (getSubtype(0) != nullptr) {
+ result->keys =
+ getSubtype(0)->createRowBatch(capacity, memoryPool, encoded, useTightNumericVector);
+ }
+ if (getSubtype(1) != nullptr) {
+ result->elements =
+ getSubtype(1)->createRowBatch(capacity, memoryPool, encoded, useTightNumericVector);
+ }
+ return result;
}
- return return_value;
- }
- case DECIMAL: {
- if (getPrecision() == 0 || getPrecision() > 18) {
- return std::unique_ptr<ColumnVectorBatch>
- (new Decimal128VectorBatch(capacity, memoryPool));
- } else {
- return std::unique_ptr<ColumnVectorBatch>
- (new Decimal64VectorBatch(capacity, memoryPool));
+ case DECIMAL: {
+ if (getPrecision() == 0 || getPrecision() > 18) {
+ return std::make_unique<Decimal128VectorBatch>(capacity, memoryPool);
+ } else {
+ return std::make_unique<Decimal64VectorBatch>(capacity, memoryPool);
+ }
}
- }
- case UNION: {
- UnionVectorBatch *result = new UnionVectorBatch(capacity, memoryPool);
- std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result);
- for(uint64_t i=0; i < getSubtypeCount(); ++i) {
- result->children.push_back(getSubtype(i)->createRowBatch(capacity,
- memoryPool, encoded)
- .release());
+ case UNION: {
+ auto result = std::make_unique<UnionVectorBatch>(capacity, memoryPool);
+ for (uint64_t i = 0; i < getSubtypeCount(); ++i) {
+ result->children.push_back(
+ getSubtype(i)
+ ->createRowBatch(capacity, memoryPool, encoded, useTightNumericVector)
+ .release());
+ }
+ return result;
}
- return return_value;
- }
- default:
- throw NotImplementedYet("not supported yet");
+ default:
+ throw NotImplementedYet("not supported yet");
}
}
std::unique_ptr<Type> createPrimitiveType(TypeKind kind) {
- return std::unique_ptr<Type>(new TypeImpl(kind));
+ return std::make_unique<TypeImpl>(kind);
}
- std::unique_ptr<Type> createCharType(TypeKind kind,
- uint64_t maxLength) {
- return std::unique_ptr<Type>(new TypeImpl(kind, maxLength));
+ std::unique_ptr<Type> createCharType(TypeKind kind, uint64_t maxLength) {
+ return std::make_unique<TypeImpl>(kind, maxLength);
}
- std::unique_ptr<Type> createDecimalType(uint64_t precision,
- uint64_t scale) {
- return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale));
+ std::unique_ptr<Type> createDecimalType(uint64_t precision, uint64_t scale) {
+ return std::make_unique<TypeImpl>(DECIMAL, precision, scale);
}
std::unique_ptr<Type> createStructType() {
- return std::unique_ptr<Type>(new TypeImpl(STRUCT));
+ return std::make_unique<TypeImpl>(STRUCT);
}
std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements) {
- TypeImpl* result = new TypeImpl(LIST);
- std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result);
+ auto result = std::make_unique<TypeImpl>(LIST);
result->addChildType(std::move(elements));
- return return_value;
+ return result;
}
- std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key,
- std::unique_ptr<Type> value) {
- TypeImpl* result = new TypeImpl(MAP);
- std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result);
+ std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key, std::unique_ptr<Type> value) {
+ auto result = std::make_unique<TypeImpl>(MAP);
result->addChildType(std::move(key));
result->addChildType(std::move(value));
- return return_value;
+ return result;
}
std::unique_ptr<Type> createUnionType() {
- return std::unique_ptr<Type>(new TypeImpl(UNION));
+ return std::make_unique<TypeImpl>(UNION);
}
std::string printProtobufMessage(const google::protobuf::Message& message);
- std::unique_ptr<Type> convertType(const proto::Type& type,
- const proto::Footer& footer) {
+ std::unique_ptr<Type> convertType(const proto::Type& type, const proto::Footer& footer) {
std::unique_ptr<Type> ret;
switch (static_cast<int64_t>(type.kind())) {
-
- case proto::Type_Kind_BOOLEAN:
- case proto::Type_Kind_BYTE:
- case proto::Type_Kind_SHORT:
- case proto::Type_Kind_INT:
- case proto::Type_Kind_LONG:
- case proto::Type_Kind_FLOAT:
- case proto::Type_Kind_DOUBLE:
- case proto::Type_Kind_STRING:
- case proto::Type_Kind_BINARY:
- case proto::Type_Kind_TIMESTAMP:
- case proto::Type_Kind_TIMESTAMP_INSTANT:
- case proto::Type_Kind_DATE:
- ret = std::unique_ptr<Type>
- (new TypeImpl(static_cast<TypeKind>(type.kind())));
- break;
-
- case proto::Type_Kind_CHAR:
- case proto::Type_Kind_VARCHAR:
- ret = std::unique_ptr<Type>
- (new TypeImpl(static_cast<TypeKind>(type.kind()),
- type.maximumlength()));
- break;
-
- case proto::Type_Kind_DECIMAL:
- ret = std::unique_ptr<Type>
- (new TypeImpl(DECIMAL, type.precision(), type.scale()));
- break;
-
- case proto::Type_Kind_LIST:
- case proto::Type_Kind_MAP:
- case proto::Type_Kind_UNION: {
- TypeImpl* result = new TypeImpl(static_cast<TypeKind>(type.kind()));
- ret = std::unique_ptr<Type>(result);
- if (type.kind() == proto::Type_Kind_LIST && type.subtypes_size() != 1)
- throw ParseError("Illegal LIST type that doesn't contain one subtype");
- if (type.kind() == proto::Type_Kind_MAP && type.subtypes_size() != 2)
- throw ParseError("Illegal MAP type that doesn't contain two subtypes");
- if (type.kind() == proto::Type_Kind_UNION && type.subtypes_size() == 0)
- throw ParseError("Illegal UNION type that doesn't contain any subtypes");
- for(int i=0; i < type.subtypes_size(); ++i) {
- result->addUnionChild(convertType(footer.types(static_cast<int>
- (type.subtypes(i))),
- footer));
+ case proto::Type_Kind_BOOLEAN:
+ case proto::Type_Kind_BYTE:
+ case proto::Type_Kind_SHORT:
+ case proto::Type_Kind_INT:
+ case proto::Type_Kind_LONG:
+ case proto::Type_Kind_FLOAT:
+ case proto::Type_Kind_DOUBLE:
+ case proto::Type_Kind_STRING:
+ case proto::Type_Kind_BINARY:
+ case proto::Type_Kind_TIMESTAMP:
+ case proto::Type_Kind_TIMESTAMP_INSTANT:
+ case proto::Type_Kind_DATE:
+ ret = std::make_unique<TypeImpl>(static_cast<TypeKind>(type.kind()));
+ break;
+
+ case proto::Type_Kind_CHAR:
+ case proto::Type_Kind_VARCHAR:
+ ret = std::make_unique<TypeImpl>(static_cast<TypeKind>(type.kind()), type.maximum_length());
+ break;
+
+ case proto::Type_Kind_DECIMAL:
+ ret = std::make_unique<TypeImpl>(DECIMAL, type.precision(), type.scale());
+ break;
+
+ case proto::Type_Kind_LIST:
+ case proto::Type_Kind_MAP:
+ case proto::Type_Kind_UNION: {
+ ret = std::make_unique<TypeImpl>(static_cast<TypeKind>(type.kind()));
+ if (type.kind() == proto::Type_Kind_LIST && type.subtypes_size() != 1)
+ throw ParseError("Illegal LIST type that doesn't contain one subtype");
+ if (type.kind() == proto::Type_Kind_MAP && type.subtypes_size() != 2)
+ throw ParseError("Illegal MAP type that doesn't contain two subtypes");
+ if (type.kind() == proto::Type_Kind_UNION && type.subtypes_size() == 0)
+ throw ParseError("Illegal UNION type that doesn't contain any subtypes");
+ for (int i = 0; i < type.subtypes_size(); ++i) {
+ ret->addUnionChild(convertType(footer.types(static_cast<int>(type.subtypes(i))), footer));
+ }
+ break;
}
- break;
- }
-
- case proto::Type_Kind_STRUCT: {
- TypeImpl* result = new TypeImpl(STRUCT);
- ret = std::unique_ptr<Type>(result);
- if (type.subtypes_size() > type.fieldnames_size())
- throw ParseError("Illegal STRUCT type that contains less fieldnames than subtypes");
- for(int i=0; i < type.subtypes_size(); ++i) {
- result->addStructField(type.fieldnames(i),
- convertType(footer.types(static_cast<int>
- (type.subtypes(i))),
- footer));
+
+ case proto::Type_Kind_STRUCT: {
+ ret = std::make_unique<TypeImpl>(STRUCT);
+ if (type.subtypes_size() > type.field_names_size())
+ throw ParseError("Illegal STRUCT type that contains less field_names than subtypes");
+ for (int i = 0; i < type.subtypes_size(); ++i) {
+ ret->addStructField(
+ type.field_names(i),
+ convertType(footer.types(static_cast<int>(type.subtypes(i))), footer));
+ }
+ break;
}
- break;
- }
- default:
- throw NotImplementedYet("Unknown type kind");
+ default:
+ throw NotImplementedYet("Unknown type kind");
}
for (int i = 0; i < type.attributes_size(); ++i) {
const auto& attribute = type.attributes(i);
@@ -493,143 +492,126 @@ namespace orc {
* @param selected is each column by id selected
* @return a clone of the fileType filtered by the selection array
*/
- std::unique_ptr<Type> buildSelectedType(const Type *fileType,
- const std::vector<bool>& selected) {
+ std::unique_ptr<Type> buildSelectedType(const Type* fileType, const std::vector<bool>& selected) {
if (fileType == nullptr || !selected[fileType->getColumnId()]) {
- return std::unique_ptr<Type>();
+ return nullptr;
}
- TypeImpl* result;
+ std::unique_ptr<TypeImpl> result;
switch (static_cast<int>(fileType->getKind())) {
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- case FLOAT:
- case DOUBLE:
- case STRING:
- case BINARY:
- case TIMESTAMP:
- case TIMESTAMP_INSTANT:
- case DATE:
- result = new TypeImpl(fileType->getKind());
- break;
-
- case DECIMAL:
- result= new TypeImpl(fileType->getKind(),
- fileType->getPrecision(), fileType->getScale());
- break;
-
- case VARCHAR:
- case CHAR:
- result = new TypeImpl(fileType->getKind(), fileType->getMaximumLength());
- break;
-
- case LIST:
- result = new TypeImpl(fileType->getKind());
- result->addChildType(buildSelectedType(fileType->getSubtype(0),
- selected));
- break;
-
- case MAP:
- result = new TypeImpl(fileType->getKind());
- result->addChildType(buildSelectedType(fileType->getSubtype(0),
- selected));
- result->addChildType(buildSelectedType(fileType->getSubtype(1),
- selected));
- break;
-
- case STRUCT: {
- result = new TypeImpl(fileType->getKind());
- for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) {
- std::unique_ptr<Type> childType =
- buildSelectedType(fileType->getSubtype(child), selected);
- if (childType.get() != nullptr) {
- result->addStructField(fileType->getFieldName(child),
- std::move(childType));
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case FLOAT:
+ case DOUBLE:
+ case STRING:
+ case BINARY:
+ case TIMESTAMP:
+ case TIMESTAMP_INSTANT:
+ case DATE:
+ result = std::make_unique<TypeImpl>(fileType->getKind());
+ break;
+
+ case DECIMAL:
+ result = std::make_unique<TypeImpl>(fileType->getKind(), fileType->getPrecision(),
+ fileType->getScale());
+ break;
+
+ case VARCHAR:
+ case CHAR:
+ result = std::make_unique<TypeImpl>(fileType->getKind(), fileType->getMaximumLength());
+ break;
+
+ case LIST:
+ result = std::make_unique<TypeImpl>(fileType->getKind());
+ result->addChildType(buildSelectedType(fileType->getSubtype(0), selected));
+ break;
+
+ case MAP:
+ result = std::make_unique<TypeImpl>(fileType->getKind());
+ result->addChildType(buildSelectedType(fileType->getSubtype(0), selected));
+ result->addChildType(buildSelectedType(fileType->getSubtype(1), selected));
+ break;
+
+ case STRUCT: {
+ result = std::make_unique<TypeImpl>(fileType->getKind());
+ for (uint64_t child = 0; child < fileType->getSubtypeCount(); ++child) {
+ std::unique_ptr<Type> childType =
+ buildSelectedType(fileType->getSubtype(child), selected);
+ if (childType.get() != nullptr) {
+ result->addStructField(fileType->getFieldName(child), std::move(childType));
+ }
}
+ break;
}
- break;
- }
- case UNION: {
- result = new TypeImpl(fileType->getKind());
- for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) {
- std::unique_ptr<Type> childType =
- buildSelectedType(fileType->getSubtype(child), selected);
- if (childType.get() != nullptr) {
- result->addUnionChild(std::move(childType));
+ case UNION: {
+ result = std::make_unique<TypeImpl>(fileType->getKind());
+ for (uint64_t child = 0; child < fileType->getSubtypeCount(); ++child) {
+ std::unique_ptr<Type> childType =
+ buildSelectedType(fileType->getSubtype(child), selected);
+ if (childType.get() != nullptr) {
+ result->addUnionChild(std::move(childType));
+ }
}
+ break;
}
- break;
- }
- default:
- throw NotImplementedYet("Unknown type kind");
+ default:
+ throw NotImplementedYet("Unknown type kind");
}
result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId());
for (auto& key : fileType->getAttributeKeys()) {
const auto& value = fileType->getAttributeValue(key);
result->setAttribute(key, value);
}
- return std::unique_ptr<Type>(result);
+ return result;
}
- ORC_UNIQUE_PTR<Type> Type::buildTypeFromString(const std::string& input) {
+ std::unique_ptr<Type> Type::buildTypeFromString(const std::string& input) {
size_t size = input.size();
- std::pair<ORC_UNIQUE_PTR<Type>, size_t> res =
- TypeImpl::parseType(input, 0, size);
+ std::pair<std::unique_ptr<Type>, size_t> res = TypeImpl::parseType(input, 0, size);
if (res.second != size) {
throw std::logic_error("Invalid type string.");
}
return std::move(res.first);
}
- std::unique_ptr<Type> TypeImpl::parseArrayType(const std::string &input,
- size_t start,
+ std::unique_ptr<Type> TypeImpl::parseArrayType(const std::string& input, size_t start,
size_t end) {
- TypeImpl* arrayType = new TypeImpl(LIST);
- std::unique_ptr<Type> return_value = std::unique_ptr<Type>(arrayType);
+ auto result = std::make_unique<TypeImpl>(LIST);
if (input[start] != '<') {
throw std::logic_error("Missing < after array.");
}
- std::pair<ORC_UNIQUE_PTR<Type>, size_t> res =
- TypeImpl::parseType(input, start + 1, end);
+ std::pair<std::unique_ptr<Type>, size_t> res = TypeImpl::parseType(input, start + 1, end);
if (res.second != end) {
- throw std::logic_error(
- "Array type must contain exactly one sub type.");
+ throw std::logic_error("Array type must contain exactly one sub type.");
}
- arrayType->addChildType(std::move(res.first));
- return return_value;
+ result->addChildType(std::move(res.first));
+ return result;
}
- std::unique_ptr<Type> TypeImpl::parseMapType(const std::string &input,
- size_t start,
- size_t end) {
- TypeImpl* mapType = new TypeImpl(MAP);
- std::unique_ptr<Type> return_value = std::unique_ptr<Type>(mapType);
+ std::unique_ptr<Type> TypeImpl::parseMapType(const std::string& input, size_t start, size_t end) {
+ auto result = std::make_unique<TypeImpl>(MAP);
if (input[start] != '<') {
throw std::logic_error("Missing < after map.");
}
- std::pair<ORC_UNIQUE_PTR<Type>, size_t> key =
- TypeImpl::parseType(input, start + 1, end);
+ std::pair<std::unique_ptr<Type>, size_t> key = TypeImpl::parseType(input, start + 1, end);
if (input[key.second] != ',') {
throw std::logic_error("Missing comma after key.");
}
- std::pair<ORC_UNIQUE_PTR<Type>, size_t> val =
- TypeImpl::parseType(input, key.second + 1, end);
+ std::pair<std::unique_ptr<Type>, size_t> val = TypeImpl::parseType(input, key.second + 1, end);
if (val.second != end) {
- throw std::logic_error(
- "Map type must contain exactly two sub types.");
+ throw std::logic_error("Map type must contain exactly two sub types.");
}
- mapType->addChildType(std::move(key.first));
- mapType->addChildType(std::move(val.first));
- return return_value;
+ result->addChildType(std::move(key.first));
+ result->addChildType(std::move(val.first));
+ return result;
}
- std::pair<std::string, size_t> TypeImpl::parseName(const std::string &input,
- const size_t start,
+ std::pair<std::string, size_t> TypeImpl::parseName(const std::string& input, const size_t start,
const size_t end) {
size_t pos = start;
if (input[pos] == '`') {
@@ -638,7 +620,7 @@ namespace orc {
while (pos < end) {
char ch = input[++pos];
if (ch == '`') {
- if (pos < end && input[pos+1] == '`') {
+ if (pos < end && input[pos + 1] == '`') {
++pos;
oss.put('`');
} else {
@@ -667,11 +649,9 @@ namespace orc {
}
}
- std::unique_ptr<Type> TypeImpl::parseStructType(const std::string &input,
- size_t start,
+ std::unique_ptr<Type> TypeImpl::parseStructType(const std::string& input, size_t start,
size_t end) {
- TypeImpl* structType = new TypeImpl(STRUCT);
- std::unique_ptr<Type> return_value = std::unique_ptr<Type>(structType);
+ auto result = std::make_unique<TypeImpl>(STRUCT);
size_t pos = start + 1;
if (input[start] != '<') {
throw std::logic_error("Missing < after struct.");
@@ -682,9 +662,8 @@ namespace orc {
if (input[pos] != ':') {
throw std::logic_error("Invalid struct type. No field name set.");
}
- std::pair<ORC_UNIQUE_PTR<Type>, size_t> typeRes =
- TypeImpl::parseType(input, ++pos, end);
- structType->addStructField(nameRes.first, std::move(typeRes.first));
+ std::pair<std::unique_ptr<Type>, size_t> typeRes = TypeImpl::parseType(input, ++pos, end);
+ result->addStructField(nameRes.first, std::move(typeRes.first));
pos = typeRes.second;
if (pos != end && input[pos] != ',') {
throw std::logic_error("Missing comma after field.");
@@ -692,22 +671,19 @@ namespace orc {
++pos;
}
- return return_value;
+ return result;
}
- std::unique_ptr<Type> TypeImpl::parseUnionType(const std::string &input,
- size_t start,
+ std::unique_ptr<Type> TypeImpl::parseUnionType(const std::string& input, size_t start,
size_t end) {
- TypeImpl* unionType = new TypeImpl(UNION);
- std::unique_ptr<Type> return_value = std::unique_ptr<Type>(unionType);
+ auto result = std::make_unique<TypeImpl>(UNION);
size_t pos = start + 1;
if (input[start] != '<') {
throw std::logic_error("Missing < after uniontype.");
}
while (pos < end) {
- std::pair<ORC_UNIQUE_PTR<Type>, size_t> res =
- TypeImpl::parseType(input, pos, end);
- unionType->addChildType(std::move(res.first));
+ std::pair<std::unique_ptr<Type>, size_t> res = TypeImpl::parseType(input, pos, end);
+ result->addChildType(std::move(res.first));
pos = res.second;
if (pos != end && input[pos] != ',') {
throw std::logic_error("Missing comma after union sub type.");
@@ -715,11 +691,10 @@ namespace orc {
++pos;
}
- return return_value;
+ return result;
}
- std::unique_ptr<Type> TypeImpl::parseDecimalType(const std::string &input,
- size_t start,
+ std::unique_ptr<Type> TypeImpl::parseDecimalType(const std::string& input, size_t start,
size_t end) {
if (input[start] != '(') {
throw std::logic_error("Missing ( after decimal.");
@@ -729,61 +704,54 @@ namespace orc {
if (sep + 1 >= end || sep == std::string::npos) {
throw std::logic_error("Decimal type must specify precision and scale.");
}
- uint64_t precision =
- static_cast<uint64_t>(atoi(input.substr(pos, sep - pos).c_str()));
- uint64_t scale =
- static_cast<uint64_t>(atoi(input.substr(sep + 1, end - sep - 1).c_str()));
- return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale));
+ uint64_t precision = static_cast<uint64_t>(atoi(input.substr(pos, sep - pos).c_str()));
+ uint64_t scale = static_cast<uint64_t>(atoi(input.substr(sep + 1, end - sep - 1).c_str()));
+ return std::make_unique<TypeImpl>(DECIMAL, precision, scale);
}
- void validatePrimitiveType(std::string category,
- const std::string &input,
- const size_t pos) {
+ void validatePrimitiveType(std::string category, const std::string& input, const size_t pos) {
if (input[pos] == '<' || input[pos] == '(') {
std::ostringstream oss;
- oss << "Invalid " << input[pos] << " after "
- << category << " type.";
+ oss << "Invalid " << input[pos] << " after " << category << " type.";
throw std::logic_error(oss.str());
}
}
- std::unique_ptr<Type> TypeImpl::parseCategory(std::string category,
- const std::string &input,
- size_t start,
- size_t end) {
+ std::unique_ptr<Type> TypeImpl::parseCategory(std::string category, const std::string& input,
+ size_t start, size_t end) {
if (category == "boolean") {
validatePrimitiveType(category, input, start);
- return std::unique_ptr<Type>(new TypeImpl(BOOLEAN));
+ return std::make_unique<TypeImpl>(BOOLEAN);
} else if (category == "tinyint") {
validatePrimitiveType(category, input, start);
- return std::unique_ptr<Type>(new TypeImpl(BYTE));
+ return std::make_unique<TypeImpl>(BYTE);
} else if (category == "smallint") {
validatePrimitiveType(category, input, start);
- return std::unique_ptr<Type>(new TypeImpl(SHORT));
+ return std::make_unique<TypeImpl>(SHORT);
} else if (category == "int") {
validatePrimitiveType(category, input, start);
- return std::unique_ptr<Type>(new TypeImpl(INT));
+ return std::make_unique<TypeImpl>(INT);
} else if (category == "bigint") {
validatePrimitiveType(category, input, start);
- return std::unique_ptr<Type>(new TypeImpl(LONG));
+ return std::make_unique<TypeImpl>(LONG);
} else if (category == "float") {
validatePrimitiveType(category, input, start);
- return std::unique_ptr<Type>(new TypeImpl(FLOAT));
+ return std::make_unique<TypeImpl>(FLOAT);
} else if (category == "double") {
validatePrimitiveType(category, input, start);
- return std::unique_ptr<Type>(new TypeImpl(DOUBLE));
+ return std::make_unique<TypeImpl>(DOUBLE);
} else if (category == "string") {
validatePrimitiveType(category, input, start);
- return std::unique_ptr<Type>(new TypeImpl(STRING));
+ return std::make_unique<TypeImpl>(STRING);
} else if (category == "binary") {
validatePrimitiveType(category, input, start);
- return std::unique_ptr<Type>(new TypeImpl(BINARY));
+ return std::make_unique<TypeImpl>(BINARY);
} else if (category == "timestamp") {
validatePrimitiveType(category, input, start);
- return std::unique_ptr<Type>(new TypeImpl(TIMESTAMP));
+ return std::make_unique<TypeImpl>(TIMESTAMP);
} else if (category == "timestamp with local time zone") {
validatePrimitiveType(category, input, start);
- return std::unique_ptr<Type>(new TypeImpl(TIMESTAMP_INSTANT));
+ return std::make_unique<TypeImpl>(TIMESTAMP_INSTANT);
} else if (category == "array") {
return parseArrayType(input, start, end);
} else if (category == "map") {
@@ -796,27 +764,28 @@ namespace orc {
return parseDecimalType(input, start, end);
} else if (category == "date") {
validatePrimitiveType(category, input, start);
- return std::unique_ptr<Type>(new TypeImpl(DATE));
+ return std::make_unique<TypeImpl>(DATE);
} else if (category == "varchar") {
if (input[start] != '(') {
throw std::logic_error("Missing ( after varchar.");
}
- uint64_t maxLength = static_cast<uint64_t>(
- atoi(input.substr(start + 1, end - start + 1).c_str()));
- return std::unique_ptr<Type>(new TypeImpl(VARCHAR, maxLength));
+ uint64_t maxLength =
+ static_cast<uint64_t>(atoi(input.substr(start + 1, end - start + 1).c_str()));
+ return std::make_unique<TypeImpl>(VARCHAR, maxLength);
} else if (category == "char") {
if (input[start] != '(') {
throw std::logic_error("Missing ( after char.");
}
- uint64_t maxLength = static_cast<uint64_t>(
- atoi(input.substr(start + 1, end - start + 1).c_str()));
- return std::unique_ptr<Type>(new TypeImpl(CHAR, maxLength));
+ uint64_t maxLength =
+ static_cast<uint64_t>(atoi(input.substr(start + 1, end - start + 1).c_str()));
+ return std::make_unique<TypeImpl>(CHAR, maxLength);
} else {
throw std::logic_error("Unknown type " + category);
}
}
- std::pair<ORC_UNIQUE_PTR<Type>, size_t> TypeImpl::parseType(const std::string &input, size_t start, size_t end) {
+ std::pair<std::unique_ptr<Type>, size_t> TypeImpl::parseType(const std::string& input,
+ size_t start, size_t end) {
size_t pos = start;
while (pos < end && (isalpha(input[pos]) || input[pos] == ' ')) {
++pos;
@@ -854,4 +823,18 @@ namespace orc {
return std::make_pair(parseCategory(category, input, pos, nextPos), endPos);
}
-}
+ const Type* TypeImpl::getTypeByColumnId(uint64_t colIdx) const {
+ if (getColumnId() == colIdx) {
+ return this;
+ }
+
+ for (uint64_t i = 0; i != getSubtypeCount(); ++i) {
+ const Type* ret = getSubtype(i)->getTypeByColumnId(colIdx);
+ if (ret != nullptr) {
+ return ret;
+ }
+ }
+ return nullptr;
+ }
+
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.hh b/contrib/libs/apache/orc/c++/src/TypeImpl.hh
index 88c4737d18..6d0743793a 100644
--- a/contrib/libs/apache/orc/c++/src/TypeImpl.hh
+++ b/contrib/libs/apache/orc/c++/src/TypeImpl.hh
@@ -28,8 +28,8 @@
namespace orc {
- class TypeImpl: public Type {
- private:
+ class TypeImpl : public Type {
+ private:
TypeImpl* parent;
mutable int64_t columnId;
mutable int64_t maximumColumnId;
@@ -42,7 +42,7 @@ namespace orc {
uint64_t scale;
std::map<std::string, std::string> attributes;
- public:
+ public:
/**
* Create most of the primitive types.
*/
@@ -56,8 +56,7 @@ namespace orc {
/**
* Create decimal type.
*/
- TypeImpl(TypeKind kind, uint64_t precision,
- uint64_t scale);
+ TypeImpl(TypeKind kind, uint64_t precision, uint64_t scale);
uint64_t getColumnId() const override;
@@ -77,8 +76,7 @@ namespace orc {
uint64_t getScale() const override;
- Type& setAttribute(const std::string& key,
- const std::string& value) override;
+ Type& setAttribute(const std::string& key, const std::string& value) override;
bool hasAttributeKey(const std::string& key) const override;
@@ -90,14 +88,16 @@ namespace orc {
std::string toString() const override;
- Type* addStructField(const std::string& fieldName,
- std::unique_ptr<Type> fieldType) override;
+ const Type* getTypeByColumnId(uint64_t colIdx) const override;
+ Type* addStructField(const std::string& fieldName, std::unique_ptr<Type> fieldType) override;
Type* addUnionChild(std::unique_ptr<Type> fieldType) override;
- std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size,
- MemoryPool& memoryPool,
- bool encoded = false
- ) const override;
+ std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size, MemoryPool& memoryPool,
+ bool encoded = false) const override;
+
+ std::unique_ptr<ColumnVectorBatch> createRowBatch(
+ uint64_t size, MemoryPool& memoryPool, bool encoded = false,
+ bool useTightNumericVector = false) const override;
/**
* Explicitly set the column ids. Only for internal usage.
@@ -109,12 +109,10 @@ namespace orc {
*/
void addChildType(std::unique_ptr<Type> childType);
- static std::pair<ORC_UNIQUE_PTR<Type>, size_t> parseType(
- const std::string &input,
- size_t start,
- size_t end);
+ static std::pair<std::unique_ptr<Type>, size_t> parseType(const std::string& input,
+ size_t start, size_t end);
- private:
+ private:
/**
* Assign ids to this node and its children giving this
* node rootId.
@@ -133,9 +131,7 @@ namespace orc {
* @param start start position of the input string
* @param end end position of the input string
*/
- static std::unique_ptr<Type> parseArrayType(const std::string &input,
- size_t start,
- size_t end);
+ static std::unique_ptr<Type> parseArrayType(const std::string& input, size_t start, size_t end);
/**
* Parse map type from string
@@ -143,9 +139,7 @@ namespace orc {
* @param start start position of the input string
* @param end end position of the input string
*/
- static std::unique_ptr<Type> parseMapType(const std::string &input,
- size_t start,
- size_t end);
+ static std::unique_ptr<Type> parseMapType(const std::string& input, size_t start, size_t end);
/**
* Parse field name from string
@@ -153,8 +147,7 @@ namespace orc {
* @param start start position of the input string
* @param end end position of the input string
*/
- static std::pair<std::string, size_t> parseName(const std::string &input,
- const size_t start,
+ static std::pair<std::string, size_t> parseName(const std::string& input, const size_t start,
const size_t end);
/**
@@ -163,8 +156,7 @@ namespace orc {
* @param start start position of the input string
* @param end end position of the input string
*/
- static std::unique_ptr<Type> parseStructType(const std::string &input,
- size_t start,
+ static std::unique_ptr<Type> parseStructType(const std::string& input, size_t start,
size_t end);
/**
@@ -173,9 +165,7 @@ namespace orc {
* @param start start position of the input string
* @param end end position of the input string
*/
- static std::unique_ptr<Type> parseUnionType(const std::string &input,
- size_t start,
- size_t end);
+ static std::unique_ptr<Type> parseUnionType(const std::string& input, size_t start, size_t end);
/**
* Parse decimal type from string
@@ -183,8 +173,7 @@ namespace orc {
* @param start start position of the input string
* @param end end position of the input string
*/
- static std::unique_ptr<Type> parseDecimalType(const std::string &input,
- size_t start,
+ static std::unique_ptr<Type> parseDecimalType(const std::string& input, size_t start,
size_t end);
/**
@@ -194,14 +183,11 @@ namespace orc {
* @param start start position of the input string
* @param end end position of the input string
*/
- static std::unique_ptr<Type> parseCategory(std::string category,
- const std::string &input,
- size_t start,
- size_t end);
+ static std::unique_ptr<Type> parseCategory(std::string category, const std::string& input,
+ size_t start, size_t end);
};
- std::unique_ptr<Type> convertType(const proto::Type& type,
- const proto::Footer& footer);
+ std::unique_ptr<Type> convertType(const proto::Type& type, const proto::Footer& footer);
/**
* Build a clone of the file type, projecting columns from the selected
@@ -211,8 +197,7 @@ namespace orc {
* @param selected is each column by id selected
* @return a clone of the fileType filtered by the selection array
*/
- std::unique_ptr<Type> buildSelectedType(const Type *fileType,
- const std::vector<bool>& selected);
-}
+ std::unique_ptr<Type> buildSelectedType(const Type* fileType, const std::vector<bool>& selected);
+} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/src/Utils.hh b/contrib/libs/apache/orc/c++/src/Utils.hh
new file mode 100644
index 0000000000..751c09b205
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/Utils.hh
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_UTILS_HH
+#define ORC_UTILS_HH
+
+#include <atomic>
+#include <chrono>
+
+namespace orc {
+
+ class AutoStopwatch {
+ std::chrono::high_resolution_clock::time_point start;
+ std::atomic<uint64_t>* latencyUs;
+ std::atomic<uint64_t>* count;
+ bool minus;
+
+ public:
+ AutoStopwatch(std::atomic<uint64_t>* _latencyUs, std::atomic<uint64_t>* _count,
+ bool _minus = false)
+ : latencyUs(_latencyUs), count(_count), minus(_minus) {
+ if (latencyUs) {
+ start = std::chrono::high_resolution_clock::now();
+ }
+ }
+
+ ~AutoStopwatch() {
+ if (latencyUs) {
+ std::chrono::microseconds elapsedTime =
+ std::chrono::duration_cast<std::chrono::microseconds>(
+ std::chrono::high_resolution_clock::now() - start);
+ if (!minus) {
+ latencyUs->fetch_add(static_cast<uint64_t>(elapsedTime.count()));
+ } else {
+ latencyUs->fetch_sub(static_cast<uint64_t>(elapsedTime.count()));
+ }
+ }
+
+ if (count) {
+ count->fetch_add(1);
+ }
+ }
+ };
+
+#if ENABLE_METRICS
+#define SCOPED_STOPWATCH(METRICS_PTR, LATENCY_VAR, COUNT_VAR) \
+ AutoStopwatch measure((METRICS_PTR == nullptr ? nullptr : &METRICS_PTR->LATENCY_VAR), \
+ (METRICS_PTR == nullptr ? nullptr : &METRICS_PTR->COUNT_VAR))
+
+#define SCOPED_MINUS_STOPWATCH(METRICS_PTR, LATENCY_VAR) \
+ AutoStopwatch measure((METRICS_PTR == nullptr ? nullptr : &METRICS_PTR->LATENCY_VAR), nullptr, \
+ true)
+#else
+#define SCOPED_STOPWATCH(METRICS_PTR, LATENCY_VAR, COUNT_VAR)
+#define SCOPED_MINUS_STOPWATCH(METRICS_PTR, LATENCY_VAR)
+#endif
+
+} // namespace orc
+
+#endif
diff --git a/contrib/libs/apache/orc/c++/src/Vector.cc b/contrib/libs/apache/orc/c++/src/Vector.cc
index fefaaad4b1..b9e2854586 100644
--- a/contrib/libs/apache/orc/c++/src/Vector.cc
+++ b/contrib/libs/apache/orc/c++/src/Vector.cc
@@ -20,21 +20,21 @@
#include "Adaptor.hh"
#include "orc/Exceptions.hh"
+#include "orc/MemoryPool.hh"
+#include <cstdlib>
#include <iostream>
#include <sstream>
-#include <cstdlib>
namespace orc {
- ColumnVectorBatch::ColumnVectorBatch(uint64_t cap,
- MemoryPool& pool
- ): capacity(cap),
- numElements(0),
- notNull(pool, cap),
- hasNulls(false),
- isEncoded(false),
- memoryPool(pool) {
+ ColumnVectorBatch::ColumnVectorBatch(uint64_t cap, MemoryPool& pool)
+ : capacity(cap),
+ numElements(0),
+ notNull(pool, cap),
+ hasNulls(false),
+ isEncoded(false),
+ memoryPool(pool) {
std::memset(notNull.data(), 1, capacity);
}
@@ -61,81 +61,13 @@ namespace orc {
return false;
}
- LongVectorBatch::LongVectorBatch(uint64_t _capacity, MemoryPool& pool
- ): ColumnVectorBatch(_capacity, pool),
- data(pool, _capacity) {
- // PASS
- }
-
- LongVectorBatch::~LongVectorBatch() {
- // PASS
- }
-
- std::string LongVectorBatch::toString() const {
- std::ostringstream buffer;
- buffer << "Long vector <" << numElements << " of " << capacity << ">";
- return buffer.str();
- }
-
- void LongVectorBatch::resize(uint64_t cap) {
- if (capacity < cap) {
- ColumnVectorBatch::resize(cap);
- data.resize(cap);
- }
- }
-
- void LongVectorBatch::clear() {
- numElements = 0;
- }
-
- uint64_t LongVectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage() +
- static_cast<uint64_t>(data.capacity() * sizeof(int64_t));
- }
-
- DoubleVectorBatch::DoubleVectorBatch(uint64_t _capacity, MemoryPool& pool
- ): ColumnVectorBatch(_capacity, pool),
- data(pool, _capacity) {
- // PASS
- }
-
- DoubleVectorBatch::~DoubleVectorBatch() {
- // PASS
- }
-
- std::string DoubleVectorBatch::toString() const {
- std::ostringstream buffer;
- buffer << "Double vector <" << numElements << " of " << capacity << ">";
- return buffer.str();
- }
-
- void DoubleVectorBatch::resize(uint64_t cap) {
- if (capacity < cap) {
- ColumnVectorBatch::resize(cap);
- data.resize(cap);
- }
- }
-
- void DoubleVectorBatch::clear() {
- numElements = 0;
- }
-
- uint64_t DoubleVectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(data.capacity() * sizeof(double));
- }
-
StringDictionary::StringDictionary(MemoryPool& pool)
- : dictionaryBlob(pool),
- dictionaryOffset(pool) {
+ : dictionaryBlob(pool), dictionaryOffset(pool) {
// PASS
}
- EncodedStringVectorBatch::EncodedStringVectorBatch(uint64_t _capacity,
- MemoryPool& pool)
- : StringVectorBatch(_capacity, pool),
- dictionary(),
- index(pool, _capacity) {
+ EncodedStringVectorBatch::EncodedStringVectorBatch(uint64_t _capacity, MemoryPool& pool)
+ : StringVectorBatch(_capacity, pool), dictionary(), index(pool, _capacity) {
// PASS
}
@@ -156,11 +88,11 @@ namespace orc {
}
}
- StringVectorBatch::StringVectorBatch(uint64_t _capacity, MemoryPool& pool
- ): ColumnVectorBatch(_capacity, pool),
- data(pool, _capacity),
- length(pool, _capacity),
- blob(pool) {
+ StringVectorBatch::StringVectorBatch(uint64_t _capacity, MemoryPool& pool)
+ : ColumnVectorBatch(_capacity, pool),
+ data(pool, _capacity),
+ length(pool, _capacity),
+ blob(pool) {
// PASS
}
@@ -187,28 +119,27 @@ namespace orc {
}
uint64_t StringVectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(data.capacity() * sizeof(char*)
- + length.capacity() * sizeof(int64_t));
+ return ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>(data.capacity() * sizeof(char*) +
+ length.capacity() * sizeof(int64_t));
}
- StructVectorBatch::StructVectorBatch(uint64_t cap, MemoryPool& pool
- ): ColumnVectorBatch(cap, pool) {
+ StructVectorBatch::StructVectorBatch(uint64_t cap, MemoryPool& pool)
+ : ColumnVectorBatch(cap, pool) {
// PASS
}
StructVectorBatch::~StructVectorBatch() {
- for (uint64_t i=0; i<this->fields.size(); i++) {
+ for (uint64_t i = 0; i < this->fields.size(); i++) {
delete this->fields[i];
}
}
std::string StructVectorBatch::toString() const {
std::ostringstream buffer;
- buffer << "Struct vector <" << numElements << " of " << capacity
- << "; ";
- for(std::vector<ColumnVectorBatch*>::const_iterator ptr=fields.begin();
- ptr != fields.end(); ++ptr) {
+ buffer << "Struct vector <" << numElements << " of " << capacity << "; ";
+ for (std::vector<ColumnVectorBatch*>::const_iterator ptr = fields.begin(); ptr != fields.end();
+ ++ptr) {
buffer << (*ptr)->toString() << "; ";
}
buffer << ">";
@@ -220,7 +151,7 @@ namespace orc {
}
void StructVectorBatch::clear() {
- for(size_t i=0; i < fields.size(); i++) {
+ for (size_t i = 0; i < fields.size(); i++) {
fields[i]->clear();
}
numElements = 0;
@@ -228,14 +159,14 @@ namespace orc {
uint64_t StructVectorBatch::getMemoryUsage() {
uint64_t memory = ColumnVectorBatch::getMemoryUsage();
- for (unsigned int i=0; i < fields.size(); i++) {
+ for (unsigned int i = 0; i < fields.size(); i++) {
memory += fields[i]->getMemoryUsage();
}
return memory;
}
bool StructVectorBatch::hasVariableLength() {
- for (unsigned int i=0; i < fields.size(); i++) {
+ for (unsigned int i = 0; i < fields.size(); i++) {
if (fields[i]->hasVariableLength()) {
return true;
}
@@ -243,10 +174,9 @@ namespace orc {
return false;
}
- ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool
- ): ColumnVectorBatch(cap, pool),
- offsets(pool, cap+1) {
- // PASS
+ ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool)
+ : ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) {
+ offsets.zeroOut();
}
ListVectorBatch::~ListVectorBatch() {
@@ -255,8 +185,8 @@ namespace orc {
std::string ListVectorBatch::toString() const {
std::ostringstream buffer;
- buffer << "List vector <" << elements->toString() << " with "
- << numElements << " of " << capacity << ">";
+ buffer << "List vector <" << elements->toString() << " with " << numElements << " of "
+ << capacity << ">";
return buffer.str();
}
@@ -273,19 +203,17 @@ namespace orc {
}
uint64_t ListVectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t))
- + elements->getMemoryUsage();
+ return ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) + elements->getMemoryUsage();
}
bool ListVectorBatch::hasVariableLength() {
return true;
}
- MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool
- ): ColumnVectorBatch(cap, pool),
- offsets(pool, cap+1) {
- // PASS
+ MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool)
+ : ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) {
+ offsets.zeroOut();
}
MapVectorBatch::~MapVectorBatch() {
@@ -294,9 +222,9 @@ namespace orc {
std::string MapVectorBatch::toString() const {
std::ostringstream buffer;
- buffer << "Map vector <" << (keys ? keys->toString(): "key not selected") << ", "
- << (elements ? elements->toString(): "value not selected") << " with "
- << numElements << " of " << capacity << ">";
+ buffer << "Map vector <" << (keys ? keys->toString() : "key not selected") << ", "
+ << (elements ? elements->toString() : "value not selected") << " with " << numElements
+ << " of " << capacity << ">";
return buffer.str();
}
@@ -314,25 +242,23 @@ namespace orc {
}
uint64_t MapVectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t))
- + (keys ? keys->getMemoryUsage() : 0)
- + (elements ? elements->getMemoryUsage() : 0);
+ return ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) +
+ (keys ? keys->getMemoryUsage() : 0) + (elements ? elements->getMemoryUsage() : 0);
}
bool MapVectorBatch::hasVariableLength() {
return true;
}
- UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool
- ): ColumnVectorBatch(cap, pool),
- tags(pool, cap),
- offsets(pool, cap) {
- // PASS
+ UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool)
+ : ColumnVectorBatch(cap, pool), tags(pool, cap), offsets(pool, cap) {
+ tags.zeroOut();
+ offsets.zeroOut();
}
UnionVectorBatch::~UnionVectorBatch() {
- for (uint64_t i=0; i < children.size(); i++) {
+ for (uint64_t i = 0; i < children.size(); i++) {
delete children[i];
}
}
@@ -340,7 +266,7 @@ namespace orc {
std::string UnionVectorBatch::toString() const {
std::ostringstream buffer;
buffer << "Union vector <";
- for(size_t i=0; i < children.size(); ++i) {
+ for (size_t i = 0; i < children.size(); ++i) {
if (i != 0) {
buffer << ", ";
}
@@ -359,24 +285,24 @@ namespace orc {
}
void UnionVectorBatch::clear() {
- for(size_t i=0; i < children.size(); i++) {
+ for (size_t i = 0; i < children.size(); i++) {
children[i]->clear();
}
numElements = 0;
}
uint64_t UnionVectorBatch::getMemoryUsage() {
- uint64_t memory = ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(tags.capacity() * sizeof(unsigned char)
- + offsets.capacity() * sizeof(uint64_t));
- for(size_t i=0; i < children.size(); ++i) {
+ uint64_t memory = ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>(tags.capacity() * sizeof(unsigned char) +
+ offsets.capacity() * sizeof(uint64_t));
+ for (size_t i = 0; i < children.size(); ++i) {
memory += children[i]->getMemoryUsage();
}
return memory;
}
bool UnionVectorBatch::hasVariableLength() {
- for(size_t i=0; i < children.size(); ++i) {
+ for (size_t i = 0; i < children.size(); ++i) {
if (children[i]->hasVariableLength()) {
return true;
}
@@ -384,12 +310,12 @@ namespace orc {
return false;
}
- Decimal64VectorBatch::Decimal64VectorBatch(uint64_t cap, MemoryPool& pool
- ): ColumnVectorBatch(cap, pool),
- precision(0),
- scale(0),
- values(pool, cap),
- readScales(pool, cap) {
+ Decimal64VectorBatch::Decimal64VectorBatch(uint64_t cap, MemoryPool& pool)
+ : ColumnVectorBatch(cap, pool),
+ precision(0),
+ scale(0),
+ values(pool, cap),
+ readScales(pool, cap) {
// PASS
}
@@ -399,8 +325,7 @@ namespace orc {
std::string Decimal64VectorBatch::toString() const {
std::ostringstream buffer;
- buffer << "Decimal64 vector with "
- << numElements << " of " << capacity << ">";
+ buffer << "Decimal64 vector with " << numElements << " of " << capacity << ">";
return buffer.str();
}
@@ -417,17 +342,16 @@ namespace orc {
}
uint64_t Decimal64VectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(
- (values.capacity() + readScales.capacity()) * sizeof(int64_t));
+ return ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>((values.capacity() + readScales.capacity()) * sizeof(int64_t));
}
- Decimal128VectorBatch::Decimal128VectorBatch(uint64_t cap, MemoryPool& pool
- ): ColumnVectorBatch(cap, pool),
- precision(0),
- scale(0),
- values(pool, cap),
- readScales(pool, cap) {
+ Decimal128VectorBatch::Decimal128VectorBatch(uint64_t cap, MemoryPool& pool)
+ : ColumnVectorBatch(cap, pool),
+ precision(0),
+ scale(0),
+ values(pool, cap),
+ readScales(pool, cap) {
// PASS
}
@@ -437,8 +361,7 @@ namespace orc {
std::string Decimal128VectorBatch::toString() const {
std::ostringstream buffer;
- buffer << "Decimal128 vector with "
- << numElements << " of " << capacity << ">";
+ buffer << "Decimal128 vector with " << numElements << " of " << capacity << ">";
return buffer.str();
}
@@ -455,23 +378,22 @@ namespace orc {
}
uint64_t Decimal128VectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(values.capacity() * sizeof(Int128)
- + readScales.capacity() * sizeof(int64_t));
+ return ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>(values.capacity() * sizeof(Int128) +
+ readScales.capacity() * sizeof(int64_t));
}
- Decimal::Decimal(const Int128& _value,
- int32_t _scale): value(_value), scale(_scale) {
+ Decimal::Decimal(const Int128& _value, int32_t _scale) : value(_value), scale(_scale) {
// PASS
}
Decimal::Decimal(const std::string& str) {
std::size_t foundPoint = str.find(".");
// no decimal point, it is int
- if(foundPoint == std::string::npos){
+ if (foundPoint == std::string::npos) {
value = Int128(str);
scale = 0;
- }else{
+ } else {
std::string copy(str);
scale = static_cast<int32_t>(str.length() - foundPoint - 1);
value = Int128(copy.replace(foundPoint, 1, ""));
@@ -486,12 +408,8 @@ namespace orc {
return value.toDecimalString(scale, trimTrailingZeros);
}
- TimestampVectorBatch::TimestampVectorBatch(uint64_t _capacity,
- MemoryPool& pool
- ): ColumnVectorBatch(_capacity,
- pool),
- data(pool, _capacity),
- nanoseconds(pool, _capacity) {
+ TimestampVectorBatch::TimestampVectorBatch(uint64_t _capacity, MemoryPool& pool)
+ : ColumnVectorBatch(_capacity, pool), data(pool, _capacity), nanoseconds(pool, _capacity) {
// PASS
}
@@ -518,8 +436,7 @@ namespace orc {
}
uint64_t TimestampVectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(
- (data.capacity() + nanoseconds.capacity()) * sizeof(int64_t));
+ return ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>((data.capacity() + nanoseconds.capacity()) * sizeof(int64_t));
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/Writer.cc b/contrib/libs/apache/orc/c++/src/Writer.cc
index 8a7d10ba81..19b71190a3 100644
--- a/contrib/libs/apache/orc/c++/src/Writer.cc
+++ b/contrib/libs/apache/orc/c++/src/Writer.cc
@@ -21,6 +21,7 @@
#include "ColumnWriter.hh"
#include "Timezone.hh"
+#include "Utils.hh"
#include <memory>
@@ -42,37 +43,41 @@ namespace orc {
double bloomFilterFalsePositiveProb;
BloomFilterVersion bloomFilterVersion;
std::string timezone;
+ WriterMetrics* metrics;
+ bool useTightNumericVector;
+ uint64_t outputBufferCapacity;
- WriterOptionsPrivate() :
- fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12
- stripeSize = 64 * 1024 * 1024; // 64M
- compressionBlockSize = 64 * 1024; // 64K
+ WriterOptionsPrivate() : fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12
+ stripeSize = 64 * 1024 * 1024; // 64M
+ compressionBlockSize = 64 * 1024; // 64K
rowIndexStride = 10000;
- compression = CompressionKind_ZLIB;
+ compression = CompressionKind_ZSTD;
compressionStrategy = CompressionStrategy_SPEED;
memoryPool = getDefaultPool();
paddingTolerance = 0.0;
errorStream = &std::cerr;
dictionaryKeySizeThreshold = 0.0;
enableIndex = true;
- bloomFilterFalsePositiveProb = 0.05;
+ bloomFilterFalsePositiveProb = 0.01;
bloomFilterVersion = UTF8;
- //Writer timezone uses "GMT" by default to get rid of potential issues
- //introduced by moving timestamps between different timezones.
- //Explictly set the writer timezone if the use case depends on it.
+ // Writer timezone uses "GMT" by default to get rid of potential issues
+ // introduced by moving timestamps between different timezones.
+ // Explictly set the writer timezone if the use case depends on it.
timezone = "GMT";
+ metrics = nullptr;
+ useTightNumericVector = false;
+ outputBufferCapacity = 1024 * 1024;
}
};
- WriterOptions::WriterOptions():
- privateBits(std::unique_ptr<WriterOptionsPrivate>
- (new WriterOptionsPrivate())) {
+ WriterOptions::WriterOptions()
+ : privateBits(std::unique_ptr<WriterOptionsPrivate>(new WriterOptionsPrivate())) {
// PASS
}
- WriterOptions::WriterOptions(const WriterOptions& rhs):
- privateBits(std::unique_ptr<WriterOptionsPrivate>
- (new WriterOptionsPrivate(*(rhs.privateBits.get())))) {
+ WriterOptions::WriterOptions(const WriterOptions& rhs)
+ : privateBits(std::unique_ptr<WriterOptionsPrivate>(
+ new WriterOptionsPrivate(*(rhs.privateBits.get())))) {
// PASS
}
@@ -92,8 +97,7 @@ namespace orc {
// PASS
}
RleVersion WriterOptions::getRleVersion() const {
- if(privateBits->fileVersion == FileVersion::v_0_11())
- {
+ if (privateBits->fileVersion == FileVersion::v_0_11()) {
return RleVersion_1;
}
@@ -110,6 +114,9 @@ namespace orc {
}
WriterOptions& WriterOptions::setCompressionBlockSize(uint64_t size) {
+ if (size >= (1 << 23)) {
+ throw std::invalid_argument("Compression block size cannot be greater or equal than 8M");
+ }
privateBits->compressionBlockSize = size;
return *this;
}
@@ -167,8 +174,7 @@ namespace orc {
return privateBits->compression;
}
- WriterOptions& WriterOptions::setCompressionStrategy(
- CompressionStrategy strategy) {
+ WriterOptions& WriterOptions::setCompressionStrategy(CompressionStrategy strategy) {
privateBits->compressionStrategy = strategy;
return *this;
}
@@ -216,8 +222,7 @@ namespace orc {
return privateBits->dictionaryKeySizeThreshold > 0.0;
}
- WriterOptions& WriterOptions::setColumnsUseBloomFilter(
- const std::set<uint64_t>& columns) {
+ WriterOptions& WriterOptions::setColumnsUseBloomFilter(const std::set<uint64_t>& columns) {
privateBits->columnsUseBloomFilter = columns;
return *this;
}
@@ -255,12 +260,39 @@ namespace orc {
return *this;
}
+ WriterMetrics* WriterOptions::getWriterMetrics() const {
+ return privateBits->metrics;
+ }
+
+ WriterOptions& WriterOptions::setWriterMetrics(WriterMetrics* metrics) {
+ privateBits->metrics = metrics;
+ return *this;
+ }
+
+ WriterOptions& WriterOptions::setUseTightNumericVector(bool useTightNumericVector) {
+ privateBits->useTightNumericVector = useTightNumericVector;
+ return *this;
+ }
+
+ bool WriterOptions::getUseTightNumericVector() const {
+ return privateBits->useTightNumericVector;
+ }
+
+ WriterOptions& WriterOptions::setOutputBufferCapacity(uint64_t capacity) {
+ privateBits->outputBufferCapacity = capacity;
+ return *this;
+ }
+
+ uint64_t WriterOptions::getOutputBufferCapacity() const {
+ return privateBits->outputBufferCapacity;
+ }
+
Writer::~Writer() {
// PASS
}
class WriterImpl : public Writer {
- private:
+ private:
std::unique_ptr<ColumnWriter> columnWriter;
std::unique_ptr<BufferedOutputStream> compressionStream;
std::unique_ptr<BufferedOutputStream> bufferedStream;
@@ -277,23 +309,24 @@ namespace orc {
static const char* magicId;
static const WriterId writerId;
+ bool useTightNumericVector;
+ int32_t stripesAtLastFlush;
+ uint64_t lastFlushOffset;
- public:
- WriterImpl(
- const Type& type,
- OutputStream* stream,
- const WriterOptions& options);
+ public:
+ WriterImpl(const Type& type, OutputStream* stream, const WriterOptions& options);
- std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size)
- const override;
+ std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size) const override;
void add(ColumnVectorBatch& rowsToAdd) override;
void close() override;
- void addUserMetadata(const std::string name, const std::string value) override;
+ void addUserMetadata(const std::string& name, const std::string& value) override;
- private:
+ uint64_t writeIntermediateFooter() override;
+
+ private:
void init();
void initStripe();
void writeStripe();
@@ -301,48 +334,41 @@ namespace orc {
void writeFileFooter();
void writePostscript();
void buildFooterType(const Type& t, proto::Footer& footer, uint32_t& index);
- static proto::CompressionKind convertCompressionKind(
- const CompressionKind& kind);
+ static proto::CompressionKind convertCompressionKind(const CompressionKind& kind);
};
- const char * WriterImpl::magicId = "ORC";
+ const char* WriterImpl::magicId = "ORC";
const WriterId WriterImpl::writerId = WriterId::ORC_CPP_WRITER;
- WriterImpl::WriterImpl(
- const Type& t,
- OutputStream* stream,
- const WriterOptions& opts) :
- outStream(stream),
- options(opts),
- type(t) {
+ WriterImpl::WriterImpl(const Type& t, OutputStream* stream, const WriterOptions& opts)
+ : outStream(stream), options(opts), type(t) {
streamsFactory = createStreamsFactory(options, outStream);
columnWriter = buildWriter(type, *streamsFactory, options);
stripeRows = totalRows = indexRows = 0;
currentOffset = 0;
+ stripesAtLastFlush = 0;
+ lastFlushOffset = 0;
+
+ useTightNumericVector = opts.getUseTightNumericVector();
// compression stream for stripe footer, file footer and metadata
- compressionStream = createCompressor(
- options.getCompression(),
- outStream,
- options.getCompressionStrategy(),
- 1 * 1024 * 1024, // buffer capacity: 1M
- options.getCompressionBlockSize(),
- *options.getMemoryPool());
+ compressionStream =
+ createCompressor(options.getCompression(), outStream, options.getCompressionStrategy(),
+ options.getOutputBufferCapacity(), options.getCompressionBlockSize(),
+ *options.getMemoryPool(), options.getWriterMetrics());
// uncompressed stream for post script
- bufferedStream.reset(new BufferedOutputStream(
- *options.getMemoryPool(),
- outStream,
- 1024, // buffer capacity: 1024 bytes
- options.getCompressionBlockSize()));
+ bufferedStream.reset(new BufferedOutputStream(*options.getMemoryPool(), outStream,
+ 1024, // buffer capacity: 1024 bytes
+ options.getCompressionBlockSize(),
+ options.getWriterMetrics()));
init();
}
- std::unique_ptr<ColumnVectorBatch> WriterImpl::createRowBatch(uint64_t size)
- const {
- return type.createRowBatch(size, *options.getMemoryPool());
+ std::unique_ptr<ColumnVectorBatch> WriterImpl::createRowBatch(uint64_t size) const {
+ return type.createRowBatch(size, *options.getMemoryPool(), false, useTightNumericVector);
}
void WriterImpl::add(ColumnVectorBatch& rowsToAdd) {
@@ -351,8 +377,7 @@ namespace orc {
uint64_t chunkSize = 0;
uint64_t rowIndexStride = options.getRowIndexStride();
while (pos < rowsToAdd.numElements) {
- chunkSize = std::min(rowsToAdd.numElements - pos,
- rowIndexStride - indexRows);
+ chunkSize = std::min(rowsToAdd.numElements - pos, rowIndexStride - indexRows);
columnWriter->add(rowsToAdd, pos, chunkSize, nullptr);
pos += chunkSize;
@@ -384,7 +409,25 @@ namespace orc {
outStream->close();
}
- void WriterImpl::addUserMetadata(const std::string name, const std::string value){
+ uint64_t WriterImpl::writeIntermediateFooter() {
+ if (stripeRows > 0) {
+ writeStripe();
+ }
+ if (stripesAtLastFlush != fileFooter.stripes_size()) {
+ writeMetadata();
+ writeFileFooter();
+ writePostscript();
+ stripesAtLastFlush = fileFooter.stripes_size();
+ outStream->flush();
+ lastFlushOffset = outStream->getLength();
+ currentOffset = lastFlushOffset;
+ // init stripe now that we adjusted the currentOffset
+ initStripe();
+ }
+ return lastFlushOffset;
+ }
+
+ void WriterImpl::addUserMetadata(const std::string& name, const std::string& value) {
proto::UserMetadataItem* userMetadataItem = fileFooter.add_metadata();
userMetadataItem->set_name(TString(name));
userMetadataItem->set_value(TString(value));
@@ -393,31 +436,32 @@ namespace orc {
void WriterImpl::init() {
// Write file header
const static size_t magicIdLength = strlen(WriterImpl::magicId);
- outStream->write(WriterImpl::magicId, magicIdLength);
+ {
+ SCOPED_STOPWATCH(options.getWriterMetrics(), IOBlockingLatencyUs, IOCount);
+ outStream->write(WriterImpl::magicId, magicIdLength);
+ }
currentOffset += magicIdLength;
// Initialize file footer
- fileFooter.set_headerlength(currentOffset);
- fileFooter.set_contentlength(0);
- fileFooter.set_numberofrows(0);
- fileFooter.set_rowindexstride(
- static_cast<uint32_t>(options.getRowIndexStride()));
+ fileFooter.set_header_length(currentOffset);
+ fileFooter.set_content_length(0);
+ fileFooter.set_number_of_rows(0);
+ fileFooter.set_row_index_stride(static_cast<uint32_t>(options.getRowIndexStride()));
fileFooter.set_writer(writerId);
- fileFooter.set_softwareversion(ORC_VERSION);
+ fileFooter.set_software_version(ORC_VERSION);
uint32_t index = 0;
buildFooterType(type, fileFooter, index);
// Initialize post script
- postScript.set_footerlength(0);
- postScript.set_compression(
- WriterImpl::convertCompressionKind(options.getCompression()));
- postScript.set_compressionblocksize(options.getCompressionBlockSize());
+ postScript.set_footer_length(0);
+ postScript.set_compression(WriterImpl::convertCompressionKind(options.getCompression()));
+ postScript.set_compression_block_size(options.getCompressionBlockSize());
postScript.add_version(options.getFileVersion().getMajor());
postScript.add_version(options.getFileVersion().getMinor());
- postScript.set_writerversion(WriterVersion_ORC_135);
+ postScript.set_writer_version(WriterVersion_ORC_135);
postScript.set_magic("ORC");
// Initialize first stripe
@@ -426,10 +470,10 @@ namespace orc {
void WriterImpl::initStripe() {
stripeInfo.set_offset(currentOffset);
- stripeInfo.set_indexlength(0);
- stripeInfo.set_datalength(0);
- stripeInfo.set_footerlength(0);
- stripeInfo.set_numberofrows(0);
+ stripeInfo.set_index_length(0);
+ stripeInfo.set_data_length(0);
+ stripeInfo.set_footer_length(0);
+ stripeInfo.set_number_of_rows(0);
stripeRows = indexRows = 0;
}
@@ -466,14 +510,14 @@ namespace orc {
*stripeFooter.add_columns() = encodings[i];
}
- stripeFooter.set_writertimezone(TString(options.getTimezoneName()));
+ stripeFooter.set_writer_timezone(TString(options.getTimezoneName()));
// add stripe statistics to metadata
- proto::StripeStatistics* stripeStats = metadata.add_stripestats();
+ proto::StripeStatistics* stripeStats = metadata.add_stripe_stats();
std::vector<proto::ColumnStatistics> colStats;
columnWriter->getStripeStatistics(colStats);
for (uint32_t i = 0; i != colStats.size(); ++i) {
- *stripeStats->add_colstats() = colStats[i];
+ *stripeStats->add_col_stats() = colStats[i];
}
// merge stripe stats into file stats and clear stripe stats
columnWriter->mergeStripeStatsIntoFileStats();
@@ -496,10 +540,10 @@ namespace orc {
}
// update stripe info
- stripeInfo.set_indexlength(indexLength);
- stripeInfo.set_datalength(dataLength);
- stripeInfo.set_footerlength(footerLength);
- stripeInfo.set_numberofrows(stripeRows);
+ stripeInfo.set_index_length(indexLength);
+ stripeInfo.set_data_length(dataLength);
+ stripeInfo.set_footer_length(footerLength);
+ stripeInfo.set_number_of_rows(stripeRows);
*fileFooter.add_stripes() = stripeInfo;
@@ -515,16 +559,17 @@ namespace orc {
if (!metadata.SerializeToZeroCopyStream(compressionStream.get())) {
throw std::logic_error("Failed to write metadata.");
}
- postScript.set_metadatalength(compressionStream.get()->flush());
+ postScript.set_metadata_length(compressionStream.get()->flush());
}
void WriterImpl::writeFileFooter() {
- fileFooter.set_contentlength(currentOffset - fileFooter.headerlength());
- fileFooter.set_numberofrows(totalRows);
+ fileFooter.set_content_length(currentOffset - fileFooter.header_length());
+ fileFooter.set_number_of_rows(totalRows);
// update file statistics
std::vector<proto::ColumnStatistics> colStats;
columnWriter->getFileStatistics(colStats);
+ fileFooter.clear_statistics();
for (uint32_t i = 0; i != colStats.size(); ++i) {
*fileFooter.add_statistics() = colStats[i];
}
@@ -532,106 +577,103 @@ namespace orc {
if (!fileFooter.SerializeToZeroCopyStream(compressionStream.get())) {
throw std::logic_error("Failed to write file footer.");
}
- postScript.set_footerlength(compressionStream->flush());
+ postScript.set_footer_length(compressionStream->flush());
}
void WriterImpl::writePostscript() {
if (!postScript.SerializeToZeroCopyStream(bufferedStream.get())) {
throw std::logic_error("Failed to write post script.");
}
- unsigned char psLength =
- static_cast<unsigned char>(bufferedStream->flush());
+ unsigned char psLength = static_cast<unsigned char>(bufferedStream->flush());
+ SCOPED_STOPWATCH(options.getWriterMetrics(), IOBlockingLatencyUs, IOCount);
outStream->write(&psLength, sizeof(unsigned char));
}
- void WriterImpl::buildFooterType(
- const Type& t,
- proto::Footer& footer,
- uint32_t & index) {
+ void WriterImpl::buildFooterType(const Type& t, proto::Footer& footer, uint32_t& index) {
proto::Type protoType;
- protoType.set_maximumlength(static_cast<uint32_t>(t.getMaximumLength()));
+ protoType.set_maximum_length(static_cast<uint32_t>(t.getMaximumLength()));
protoType.set_precision(static_cast<uint32_t>(t.getPrecision()));
protoType.set_scale(static_cast<uint32_t>(t.getScale()));
switch (t.getKind()) {
- case BOOLEAN: {
- protoType.set_kind(proto::Type_Kind_BOOLEAN);
- break;
- }
- case BYTE: {
- protoType.set_kind(proto::Type_Kind_BYTE);
- break;
- }
- case SHORT: {
- protoType.set_kind(proto::Type_Kind_SHORT);
- break;
- }
- case INT: {
- protoType.set_kind(proto::Type_Kind_INT);
- break;
- }
- case LONG: {
- protoType.set_kind(proto::Type_Kind_LONG);
- break;
- }
- case FLOAT: {
- protoType.set_kind(proto::Type_Kind_FLOAT);
- break;
- }
- case DOUBLE: {
- protoType.set_kind(proto::Type_Kind_DOUBLE);
- break;
- }
- case STRING: {
- protoType.set_kind(proto::Type_Kind_STRING);
- break;
- }
- case BINARY: {
- protoType.set_kind(proto::Type_Kind_BINARY);
- break;
- }
- case TIMESTAMP: {
- protoType.set_kind(proto::Type_Kind_TIMESTAMP);
- break;
- }
- case TIMESTAMP_INSTANT: {
- protoType.set_kind(proto::Type_Kind_TIMESTAMP_INSTANT);
- break;
- }
- case LIST: {
- protoType.set_kind(proto::Type_Kind_LIST);
- break;
- }
- case MAP: {
- protoType.set_kind(proto::Type_Kind_MAP);
- break;
- }
- case STRUCT: {
- protoType.set_kind(proto::Type_Kind_STRUCT);
- break;
- }
- case UNION: {
- protoType.set_kind(proto::Type_Kind_UNION);
- break;
- }
- case DECIMAL: {
- protoType.set_kind(proto::Type_Kind_DECIMAL);
- break;
- }
- case DATE: {
- protoType.set_kind(proto::Type_Kind_DATE);
- break;
- }
- case VARCHAR: {
- protoType.set_kind(proto::Type_Kind_VARCHAR);
- break;
- }
- case CHAR: {
- protoType.set_kind(proto::Type_Kind_CHAR);
- break;
- }
- default:
- throw std::logic_error("Unknown type.");
+ case BOOLEAN: {
+ protoType.set_kind(proto::Type_Kind_BOOLEAN);
+ break;
+ }
+ case BYTE: {
+ protoType.set_kind(proto::Type_Kind_BYTE);
+ break;
+ }
+ case SHORT: {
+ protoType.set_kind(proto::Type_Kind_SHORT);
+ break;
+ }
+ case INT: {
+ protoType.set_kind(proto::Type_Kind_INT);
+ break;
+ }
+ case LONG: {
+ protoType.set_kind(proto::Type_Kind_LONG);
+ break;
+ }
+ case FLOAT: {
+ protoType.set_kind(proto::Type_Kind_FLOAT);
+ break;
+ }
+ case DOUBLE: {
+ protoType.set_kind(proto::Type_Kind_DOUBLE);
+ break;
+ }
+ case STRING: {
+ protoType.set_kind(proto::Type_Kind_STRING);
+ break;
+ }
+ case BINARY: {
+ protoType.set_kind(proto::Type_Kind_BINARY);
+ break;
+ }
+ case TIMESTAMP: {
+ protoType.set_kind(proto::Type_Kind_TIMESTAMP);
+ break;
+ }
+ case TIMESTAMP_INSTANT: {
+ protoType.set_kind(proto::Type_Kind_TIMESTAMP_INSTANT);
+ break;
+ }
+ case LIST: {
+ protoType.set_kind(proto::Type_Kind_LIST);
+ break;
+ }
+ case MAP: {
+ protoType.set_kind(proto::Type_Kind_MAP);
+ break;
+ }
+ case STRUCT: {
+ protoType.set_kind(proto::Type_Kind_STRUCT);
+ break;
+ }
+ case UNION: {
+ protoType.set_kind(proto::Type_Kind_UNION);
+ break;
+ }
+ case DECIMAL: {
+ protoType.set_kind(proto::Type_Kind_DECIMAL);
+ break;
+ }
+ case DATE: {
+ protoType.set_kind(proto::Type_Kind_DATE);
+ break;
+ }
+ case VARCHAR: {
+ protoType.set_kind(proto::Type_Kind_VARCHAR);
+ break;
+ }
+ case CHAR: {
+ protoType.set_kind(proto::Type_Kind_CHAR);
+ break;
+ }
+ default:
+ throw std::logic_error("Unknown type.");
}
for (auto& key : t.getAttributeKeys()) {
@@ -647,28 +689,20 @@ namespace orc {
for (uint64_t i = 0; i < t.getSubtypeCount(); ++i) {
// only add subtypes' field names if this type is STRUCT
if (t.getKind() == STRUCT) {
- footer.mutable_types(pos)->add_fieldnames(TString(t.getFieldName(i)));
+ footer.mutable_types(pos)->add_field_names(TString(t.getFieldName(i)));
}
footer.mutable_types(pos)->add_subtypes(++index);
buildFooterType(*t.getSubtype(i), footer, index);
}
}
- proto::CompressionKind WriterImpl::convertCompressionKind(
- const CompressionKind& kind) {
+ proto::CompressionKind WriterImpl::convertCompressionKind(const CompressionKind& kind) {
return static_cast<proto::CompressionKind>(kind);
}
- std::unique_ptr<Writer> createWriter(
- const Type& type,
- OutputStream* stream,
+ std::unique_ptr<Writer> createWriter(const Type& type, OutputStream* stream,
const WriterOptions& options) {
- return std::unique_ptr<Writer>(
- new WriterImpl(
- type,
- stream,
- options));
+ return std::unique_ptr<Writer>(new WriterImpl(type, stream, options));
}
-}
-
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.cc b/contrib/libs/apache/orc/c++/src/io/InputStream.cc
index ec798d4ed7..3bf1781747 100644
--- a/contrib/libs/apache/orc/c++/src/io/InputStream.cc
+++ b/contrib/libs/apache/orc/c++/src/io/InputStream.cc
@@ -16,26 +16,22 @@
* limitations under the License.
*/
-#include "orc/Exceptions.hh"
#include "InputStream.hh"
+#include "orc/Exceptions.hh"
#include <algorithm>
#include <iomanip>
namespace orc {
- void printBuffer(std::ostream& out,
- const char *buffer,
- uint64_t length) {
+ void printBuffer(std::ostream& out, const char* buffer, uint64_t length) {
const uint64_t width = 24;
out << std::hex;
- for(uint64_t line = 0; line < (length + width - 1) / width; ++line) {
+ for (uint64_t line = 0; line < (length + width - 1) / width; ++line) {
out << std::setfill('0') << std::setw(7) << (line * width);
- for(uint64_t byte = 0;
- byte < width && line * width + byte < length; ++byte) {
+ for (uint64_t byte = 0; byte < width && line * width + byte < length; ++byte) {
out << " " << std::setfill('0') << std::setw(2)
- << static_cast<uint64_t>(0xff & buffer[line * width +
- byte]);
+ << static_cast<uint64_t>(0xff & buffer[line * width + byte]);
}
out << "\n";
}
@@ -64,26 +60,23 @@ namespace orc {
// PASS
}
- SeekableArrayInputStream::SeekableArrayInputStream
- (const unsigned char* values,
- uint64_t size,
- uint64_t blkSize
- ): data(reinterpret_cast<const char*>(values)) {
+ SeekableArrayInputStream::SeekableArrayInputStream(const unsigned char* values, uint64_t size,
+ uint64_t blkSize)
+ : data(reinterpret_cast<const char*>(values)) {
length = size;
position = 0;
blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize);
}
- SeekableArrayInputStream::SeekableArrayInputStream(const char* values,
- uint64_t size,
- uint64_t blkSize
- ): data(values) {
+ SeekableArrayInputStream::SeekableArrayInputStream(const char* values, uint64_t size,
+ uint64_t blkSize)
+ : data(values) {
length = size;
position = 0;
blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize);
}
- bool SeekableArrayInputStream::Next(const void** buffer, int*size) {
+ bool SeekableArrayInputStream::Next(const void** buffer, int* size) {
uint64_t currentSize = std::min(length - position, blockSize);
if (currentSize > 0) {
*buffer = data + position;
@@ -137,19 +130,14 @@ namespace orc {
return std::min(length, request == 0 ? 256 * 1024 : request);
}
- SeekableFileInputStream::SeekableFileInputStream(InputStream* stream,
- uint64_t offset,
- uint64_t byteCount,
- MemoryPool& _pool,
- uint64_t _blockSize
- ):pool(_pool),
- input(stream),
- start(offset),
- length(byteCount),
- blockSize(computeBlock
- (_blockSize,
- length)) {
-
+ SeekableFileInputStream::SeekableFileInputStream(InputStream* stream, uint64_t offset,
+ uint64_t byteCount, MemoryPool& _pool,
+ uint64_t _blockSize)
+ : pool(_pool),
+ input(stream),
+ start(offset),
+ length(byteCount),
+ blockSize(computeBlock(_blockSize, length)) {
position = 0;
buffer.reset(new DataBuffer<char>(pool));
pushBack = 0;
@@ -159,7 +147,7 @@ namespace orc {
// PASS
}
- bool SeekableFileInputStream::Next(const void** data, int*size) {
+ bool SeekableFileInputStream::Next(const void** data, int* size) {
uint64_t bytesRead;
if (pushBack != 0) {
*data = buffer->data() + (buffer->size() - pushBack);
@@ -168,7 +156,7 @@ namespace orc {
bytesRead = std::min(length - position, blockSize);
buffer->resize(bytesRead);
if (bytesRead > 0) {
- input->read(buffer->data(), bytesRead, start+position);
+ input->read(buffer->data(), bytesRead, start + position);
*data = static_cast<void*>(buffer->data());
}
}
@@ -218,9 +206,8 @@ namespace orc {
std::string SeekableFileInputStream::getName() const {
std::ostringstream result;
- result << input->getName() << " from " << start << " for "
- << length;
+ result << input->getName() << " from " << start << " for " << length;
return result.str();
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.hh b/contrib/libs/apache/orc/c++/src/io/InputStream.hh
index ab7ecedb44..33c64f8809 100644
--- a/contrib/libs/apache/orc/c++/src/io/InputStream.hh
+++ b/contrib/libs/apache/orc/c++/src/io/InputStream.hh
@@ -23,22 +23,21 @@
#include "orc/OrcFile.hh"
#include "wrap/zero-copy-stream-wrapper.h"
-#include <list>
#include <fstream>
#include <iostream>
+#include <list>
#include <sstream>
#include <vector>
namespace orc {
- void printBuffer(std::ostream& out,
- const char *buffer,
- uint64_t length);
+ void printBuffer(std::ostream& out, const char* buffer, uint64_t length);
class PositionProvider {
- private:
+ private:
std::list<uint64_t>::const_iterator position;
- public:
+
+ public:
PositionProvider(const std::list<uint64_t>& positions);
uint64_t next();
uint64_t current();
@@ -49,9 +48,9 @@ namespace orc {
* By extending Google's class, we get the ability to pass it directly
* to the protobuf readers.
*/
- class SeekableInputStream: public google::protobuf::io::ZeroCopyInputStream {
- public:
- virtual ~SeekableInputStream();
+ class SeekableInputStream : public google::protobuf::io::ZeroCopyInputStream {
+ public:
+ ~SeekableInputStream() override;
virtual void seek(PositionProvider& position) = 0;
virtual std::string getName() const = 0;
};
@@ -59,22 +58,18 @@ namespace orc {
/**
* Create a seekable input stream based on a memory range.
*/
- class SeekableArrayInputStream: public SeekableInputStream {
- private:
+ class SeekableArrayInputStream : public SeekableInputStream {
+ private:
const char* data;
uint64_t length;
uint64_t position;
uint64_t blockSize;
- public:
- SeekableArrayInputStream(const unsigned char* list,
- uint64_t length,
- uint64_t block_size = 0);
- SeekableArrayInputStream(const char* list,
- uint64_t length,
- uint64_t block_size = 0);
+ public:
+ SeekableArrayInputStream(const unsigned char* list, uint64_t length, uint64_t block_size = 0);
+ SeekableArrayInputStream(const char* list, uint64_t length, uint64_t block_size = 0);
virtual ~SeekableArrayInputStream() override;
- virtual bool Next(const void** data, int*size) override;
+ virtual bool Next(const void** data, int* size) override;
virtual void BackUp(int count) override;
virtual bool Skip(int count) override;
virtual int64_t ByteCount() const override;
@@ -85,8 +80,8 @@ namespace orc {
/**
* Create a seekable input stream based on an input stream.
*/
- class SeekableFileInputStream: public SeekableInputStream {
- private:
+ class SeekableFileInputStream : public SeekableInputStream {
+ private:
MemoryPool& pool;
InputStream* const input;
const uint64_t start;
@@ -96,15 +91,12 @@ namespace orc {
uint64_t position;
uint64_t pushBack;
- public:
- SeekableFileInputStream(InputStream* input,
- uint64_t offset,
- uint64_t byteCount,
- MemoryPool& pool,
- uint64_t blockSize = 0);
+ public:
+ SeekableFileInputStream(InputStream* input, uint64_t offset, uint64_t byteCount,
+ MemoryPool& pool, uint64_t blockSize = 0);
virtual ~SeekableFileInputStream() override;
- virtual bool Next(const void** data, int*size) override;
+ virtual bool Next(const void** data, int* size) override;
virtual void BackUp(int count) override;
virtual bool Skip(int count) override;
virtual int64_t ByteCount() const override;
@@ -112,6 +104,6 @@ namespace orc {
virtual std::string getName() const override;
};
-}
+} // namespace orc
-#endif //ORC_INPUTSTREAM_HH
+#endif // ORC_INPUTSTREAM_HH
diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc
index 14d5e5e7c4..7d9fb92206 100644
--- a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc
+++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc
@@ -16,8 +16,9 @@
* limitations under the License.
*/
-#include "orc/Exceptions.hh"
#include "OutputStream.hh"
+#include "Utils.hh"
+#include "orc/Exceptions.hh"
#include <sstream>
@@ -27,14 +28,11 @@ namespace orc {
// PASS
}
- BufferedOutputStream::BufferedOutputStream(
- MemoryPool& pool,
- OutputStream * outStream,
- uint64_t capacity_,
- uint64_t blockSize_)
- : outputStream(outStream),
- blockSize(blockSize_) {
- dataBuffer.reset(new DataBuffer<char>(pool));
+ BufferedOutputStream::BufferedOutputStream(MemoryPool& pool, OutputStream* outStream,
+ uint64_t capacity_, uint64_t blockSize_,
+ WriterMetrics* metrics_)
+ : outputStream(outStream), blockSize(blockSize_), metrics(metrics_) {
+ dataBuffer.reset(new BlockBuffer(pool, blockSize));
dataBuffer->reserve(capacity_);
}
@@ -43,16 +41,12 @@ namespace orc {
}
bool BufferedOutputStream::Next(void** buffer, int* size) {
- *size = static_cast<int>(blockSize);
- uint64_t oldSize = dataBuffer->size();
- uint64_t newSize = oldSize + blockSize;
- uint64_t newCapacity = dataBuffer->capacity();
- while (newCapacity < newSize) {
- newCapacity += dataBuffer->capacity();
+ auto block = dataBuffer->getNextBlock();
+ if (block.data == nullptr) {
+ throw std::logic_error("Failed to get next buffer from block buffer.");
}
- dataBuffer->reserve(newCapacity);
- dataBuffer->resize(newSize);
- *buffer = dataBuffer->data() + oldSize;
+ *buffer = block.data;
+ *size = static_cast<int>(block.size);
return true;
}
@@ -71,7 +65,7 @@ namespace orc {
return static_cast<google::protobuf::int64>(dataBuffer->size());
}
- bool BufferedOutputStream::WriteAliasedRaw(const void *, int) {
+ bool BufferedOutputStream::WriteAliasedRaw(const void*, int) {
throw NotImplementedYet("WriteAliasedRaw is not supported.");
}
@@ -81,8 +75,7 @@ namespace orc {
std::string BufferedOutputStream::getName() const {
std::ostringstream result;
- result << "BufferedOutputStream " << dataBuffer->size() << " of "
- << dataBuffer->capacity();
+ result << "BufferedOutputStream " << dataBuffer->size() << " of " << dataBuffer->capacity();
return result.str();
}
@@ -92,7 +85,11 @@ namespace orc {
uint64_t BufferedOutputStream::flush() {
uint64_t dataSize = dataBuffer->size();
- outputStream->write(dataBuffer->data(), dataSize);
+ // flush data buffer into outputStream
+ if (dataSize > 0) {
+ SCOPED_STOPWATCH(metrics, IOBlockingLatencyUs, IOCount);
+ dataBuffer->writeTo(outputStream, metrics);
+ }
dataBuffer->resize(0);
return dataSize;
}
@@ -101,20 +98,16 @@ namespace orc {
dataBuffer->resize(0);
}
- void AppendOnlyBufferedStream::write(const char * data, size_t size) {
+ void AppendOnlyBufferedStream::write(const char* data, size_t size) {
size_t dataOffset = 0;
while (size > 0) {
if (bufferOffset == bufferLength) {
- if (!outStream->Next(
- reinterpret_cast<void **>(&buffer),
- &bufferLength)) {
+ if (!outStream->Next(reinterpret_cast<void**>(&buffer), &bufferLength)) {
throw std::logic_error("Failed to allocate buffer.");
}
bufferOffset = 0;
}
- size_t len = std::min(
- static_cast<size_t>(bufferLength - bufferOffset),
- size);
+ size_t len = std::min(static_cast<size_t>(bufferLength - bufferOffset), size);
memcpy(buffer + bufferOffset, data + dataOffset, len);
bufferOffset += static_cast<int>(len);
dataOffset += len;
@@ -148,4 +141,4 @@ namespace orc {
}
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh
index 0fb92465e9..d8bc21ce6d 100644
--- a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh
+++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh
@@ -20,6 +20,7 @@
#define ORC_OUTPUTSTREAM_HH
#include "Adaptor.hh"
+#include "BlockBuffer.hh"
#include "orc/OrcFile.hh"
#include "wrap/zero-copy-stream-wrapper.h"
@@ -27,36 +28,41 @@ namespace orc {
/**
* Record write position for creating index stream
- */
+ */
class PositionRecorder {
- public:
+ public:
virtual ~PositionRecorder();
virtual void add(uint64_t pos) = 0;
};
+ DIAGNOSTIC_PUSH
+
+#ifdef __clang__
+ DIAGNOSTIC_IGNORE("-Wunused-private-field")
+#endif
+ struct WriterMetrics;
/**
* A subclass of Google's ZeroCopyOutputStream that supports output to memory
* buffer, and flushing to OutputStream.
* By extending Google's class, we get the ability to pass it directly
* to the protobuf writers.
*/
- class BufferedOutputStream: public google::protobuf::io::ZeroCopyOutputStream {
- private:
- OutputStream * outputStream;
- std::unique_ptr<DataBuffer<char> > dataBuffer;
+ class BufferedOutputStream : public google::protobuf::io::ZeroCopyOutputStream {
+ private:
+ OutputStream* outputStream;
+ std::unique_ptr<BlockBuffer> dataBuffer;
uint64_t blockSize;
+ WriterMetrics* metrics;
- public:
- BufferedOutputStream(MemoryPool& pool,
- OutputStream * outStream,
- uint64_t capacity,
- uint64_t block_size);
+ public:
+ BufferedOutputStream(MemoryPool& pool, OutputStream* outStream, uint64_t capacity,
+ uint64_t block_size, WriterMetrics* metrics);
virtual ~BufferedOutputStream() override;
- virtual bool Next(void** data, int*size) override;
+ virtual bool Next(void** data, int* size) override;
virtual void BackUp(int count) override;
virtual int64_t ByteCount() const override;
- virtual bool WriteAliasedRaw(const void * data, int size) override;
+ virtual bool WriteAliasedRaw(const void* data, int size) override;
virtual bool AllowsAliasing() const override;
virtual std::string getName() const;
@@ -64,8 +70,11 @@ namespace orc {
virtual uint64_t flush();
virtual void suppress();
- virtual bool isCompressed() const { return false; }
+ virtual bool isCompressed() const {
+ return false;
+ }
};
+ DIAGNOSTIC_POP
/**
* An append only buffered stream that allows
@@ -74,24 +83,24 @@ namespace orc {
* to the protobuf writers.
*/
class AppendOnlyBufferedStream {
- private:
+ private:
std::unique_ptr<BufferedOutputStream> outStream;
- char * buffer;
+ char* buffer;
int bufferOffset, bufferLength;
- public:
- AppendOnlyBufferedStream(std::unique_ptr<BufferedOutputStream> _outStream) :
- outStream(std::move(_outStream)) {
+ public:
+ AppendOnlyBufferedStream(std::unique_ptr<BufferedOutputStream> _outStream)
+ : outStream(std::move(_outStream)) {
buffer = nullptr;
bufferOffset = bufferLength = 0;
}
- void write(const char * data, size_t size);
+ void write(const char* data, size_t size);
uint64_t getSize() const;
uint64_t flush();
void recordPosition(PositionRecorder* recorder) const;
};
-}
+} // namespace orc
-#endif // ORC_OUTPUTSTREAM_HH
+#endif // ORC_OUTPUTSTREAM_HH
diff --git a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc
index e7d87083d8..9176c1f6c3 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc
+++ b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc
@@ -24,41 +24,28 @@
namespace orc {
ExpressionTree::ExpressionTree(Operator op)
- : mOperator(op)
- , mLeaf(UNUSED_LEAF)
- , mConstant(TruthValue::YES_NO_NULL) {
- }
-
+ : mOperator(op), mLeaf(UNUSED_LEAF), mConstant(TruthValue::YES_NO_NULL) {}
- ExpressionTree::ExpressionTree(Operator op,
- std::initializer_list<TreeNode> children)
- : mOperator(op)
- , mChildren(children.begin(), children.end())
- , mLeaf(UNUSED_LEAF)
- , mConstant(TruthValue::YES_NO_NULL) {
+ ExpressionTree::ExpressionTree(Operator op, std::initializer_list<TreeNode> children)
+ : mOperator(op),
+ mChildren(children.begin(), children.end()),
+ mLeaf(UNUSED_LEAF),
+ mConstant(TruthValue::YES_NO_NULL) {
// PASS
}
ExpressionTree::ExpressionTree(size_t leaf)
- : mOperator(Operator::LEAF)
- , mChildren()
- , mLeaf(leaf)
- , mConstant(TruthValue::YES_NO_NULL) {
+ : mOperator(Operator::LEAF), mChildren(), mLeaf(leaf), mConstant(TruthValue::YES_NO_NULL) {
// PASS
}
ExpressionTree::ExpressionTree(TruthValue constant)
- : mOperator(Operator::CONSTANT)
- , mChildren()
- , mLeaf(UNUSED_LEAF)
- , mConstant(constant) {
+ : mOperator(Operator::CONSTANT), mChildren(), mLeaf(UNUSED_LEAF), mConstant(constant) {
// PASS
}
ExpressionTree::ExpressionTree(const ExpressionTree& other)
- : mOperator(other.mOperator)
- , mLeaf(other.mLeaf)
- , mConstant(other.mConstant) {
+ : mOperator(other.mOperator), mLeaf(other.mLeaf), mConstant(other.mConstant) {
for (TreeNode child : other.mChildren) {
mChildren.emplace_back(std::make_shared<ExpressionTree>(*child));
}
@@ -74,7 +61,7 @@ namespace orc {
std::vector<TreeNode>& ExpressionTree::getChildren() {
return const_cast<std::vector<TreeNode>&>(
- const_cast<const ExpressionTree *>(this)->getChildren());
+ const_cast<const ExpressionTree*>(this)->getChildren());
}
const TreeNode ExpressionTree::getChild(size_t i) const {
@@ -83,7 +70,7 @@ namespace orc {
TreeNode ExpressionTree::getChild(size_t i) {
return std::const_pointer_cast<ExpressionTree>(
- const_cast<const ExpressionTree *>(this)->getChild(i));
+ const_cast<const ExpressionTree*>(this)->getChild(i));
}
TruthValue ExpressionTree::getConstant() const {
@@ -105,20 +92,17 @@ namespace orc {
mChildren.push_back(child);
}
- TruthValue ExpressionTree::evaluate(
- const std::vector<TruthValue>& leaves) const {
+ TruthValue ExpressionTree::evaluate(const std::vector<TruthValue>& leaves) const {
TruthValue result;
switch (mOperator) {
- case Operator::OR:
- {
+ case Operator::OR: {
result = mChildren.at(0)->evaluate(leaves);
for (size_t i = 1; i < mChildren.size() && !isNeeded(result); ++i) {
result = mChildren.at(i)->evaluate(leaves) || result;
}
return result;
}
- case Operator::AND:
- {
+ case Operator::AND: {
result = mChildren.at(0)->evaluate(leaves);
for (size_t i = 1; i < mChildren.size() && isNeeded(result); ++i) {
result = mChildren.at(i)->evaluate(leaves) && result;
@@ -189,4 +173,4 @@ namespace orc {
return sstream.str();
}
-} // namespace orc
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh
index bb3d16e924..3e0b331a2d 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh
+++ b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh
@@ -40,7 +40,7 @@ namespace orc {
* the SearchArgument into an internal form.
*/
class ExpressionTree {
- public:
+ public:
enum class Operator { OR, AND, NOT, LEAF, CONSTANT };
ExpressionTree(Operator op);
@@ -73,13 +73,13 @@ namespace orc {
TruthValue evaluate(const std::vector<TruthValue>& leaves) const;
- private:
+ private:
Operator mOperator;
std::vector<TreeNode> mChildren;
size_t mLeaf;
TruthValue mConstant;
};
-} // namespace orc
+} // namespace orc
-#endif //ORC_EXPRESSIONTREE_HH
+#endif // ORC_EXPRESSIONTREE_HH
diff --git a/contrib/libs/apache/orc/c++/src/sargs/Literal.cc b/contrib/libs/apache/orc/c++/src/sargs/Literal.cc
index da4cdd0d47..c0cdd62201 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/Literal.cc
+++ b/contrib/libs/apache/orc/c++/src/sargs/Literal.cc
@@ -78,7 +78,7 @@ namespace orc {
mHashCode = hashCode();
}
- Literal::Literal(const char * str, size_t size) {
+ Literal::Literal(const char* str, size_t size) {
mType = PredicateDataType::STRING;
mValue.Buffer = new char[size];
memcpy(mValue.Buffer, str, size);
@@ -110,10 +110,8 @@ namespace orc {
mHashCode = hashCode();
}
- Literal::Literal(const Literal& r): mType(r.mType)
- , mSize(r.mSize)
- , mIsNull(r.mIsNull)
- , mHashCode(r.mHashCode) {
+ Literal::Literal(const Literal& r)
+ : mType(r.mType), mSize(r.mSize), mIsNull(r.mIsNull), mHashCode(r.mHashCode) {
if (mType == PredicateDataType::STRING) {
mValue.Buffer = new char[r.mSize];
memcpy(mValue.Buffer, r.mValue.Buffer, r.mSize);
@@ -134,7 +132,7 @@ namespace orc {
Literal::~Literal() {
if (mType == PredicateDataType::STRING && mValue.Buffer) {
- delete [] mValue.Buffer;
+ delete[] mValue.Buffer;
mValue.Buffer = nullptr;
}
}
@@ -142,7 +140,7 @@ namespace orc {
Literal& Literal::operator=(const Literal& r) {
if (this != &r) {
if (mType == PredicateDataType::STRING && mValue.Buffer) {
- delete [] mValue.Buffer;
+ delete[] mValue.Buffer;
mValue.Buffer = nullptr;
}
@@ -178,8 +176,7 @@ namespace orc {
sstream << mValue.DateVal;
break;
case PredicateDataType::TIMESTAMP:
- sstream << mValue.TimeStampVal.second << "."
- << mValue.TimeStampVal.nanos;
+ sstream << mValue.TimeStampVal.second << "." << mValue.TimeStampVal.nanos;
break;
case PredicateDataType::FLOAT:
sstream << mValue.DoubleVal;
@@ -209,14 +206,13 @@ namespace orc {
return std::hash<int64_t>{}(mValue.DateVal);
case PredicateDataType::TIMESTAMP:
return std::hash<int64_t>{}(mValue.TimeStampVal.second) * 17 +
- std::hash<int32_t>{}(mValue.TimeStampVal.nanos);
+ std::hash<int32_t>{}(mValue.TimeStampVal.nanos);
case PredicateDataType::FLOAT:
return std::hash<double>{}(mValue.DoubleVal);
case PredicateDataType::BOOLEAN:
return std::hash<bool>{}(mValue.BooleanVal);
case PredicateDataType::STRING:
- return std::hash<std::string>{}(
- std::string(mValue.Buffer, mSize));
+ return std::hash<std::string>{}(std::string(mValue.Buffer, mSize));
case PredicateDataType::DECIMAL:
// current glibc does not support hash<int128_t>
return std::hash<int64_t>{}(mValue.IntVal);
@@ -246,12 +242,11 @@ namespace orc {
return mValue.TimeStampVal == r.mValue.TimeStampVal;
case PredicateDataType::FLOAT:
return std::fabs(mValue.DoubleVal - r.mValue.DoubleVal) <
- std::numeric_limits<double>::epsilon();
+ std::numeric_limits<double>::epsilon();
case PredicateDataType::BOOLEAN:
return mValue.BooleanVal == r.mValue.BooleanVal;
case PredicateDataType::STRING:
- return mSize == r.mSize && memcmp(
- mValue.Buffer, r.mValue.Buffer, mSize) == 0;
+ return mSize == r.mSize && memcmp(mValue.Buffer, r.mValue.Buffer, mSize) == 0;
case PredicateDataType::DECIMAL:
return mValue.DecimalVal == r.mValue.DecimalVal;
default:
@@ -263,8 +258,7 @@ namespace orc {
return !(*this == r);
}
- inline void validate(const bool& isNull,
- const PredicateDataType& type,
+ inline void validate(const bool& isNull, const PredicateDataType& type,
const PredicateDataType& expected) {
if (isNull) {
throw std::logic_error("cannot get value when it is null!");
@@ -309,4 +303,4 @@ namespace orc {
return Decimal(mValue.DecimalVal, mScale);
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc
index 3b012cece4..5fceedd854 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc
+++ b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc
@@ -16,10 +16,10 @@
* limitations under the License.
*/
+#include "PredicateLeaf.hh"
#include "orc/BloomFilter.hh"
#include "orc/Common.hh"
#include "orc/Type.hh"
-#include "PredicateLeaf.hh"
#include <algorithm>
#include <functional>
@@ -28,81 +28,62 @@
namespace orc {
- PredicateLeaf::PredicateLeaf(Operator op,
- PredicateDataType type,
- const std::string& colName,
+ PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName,
Literal literal)
- : mOperator(op)
- , mType(type)
- , mColumnName(colName)
- , mHasColumnName(true)
- , mColumnId(0) {
+ : mOperator(op), mType(type), mColumnName(colName), mHasColumnName(true), mColumnId(0) {
mLiterals.emplace_back(literal);
mHashCode = hashCode();
validate();
}
- PredicateLeaf::PredicateLeaf(Operator op,
- PredicateDataType type,
- uint64_t columnId,
+ PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId,
Literal literal)
- : mOperator(op)
- , mType(type)
- , mHasColumnName(false)
- , mColumnId(columnId) {
+ : mOperator(op), mType(type), mHasColumnName(false), mColumnId(columnId) {
mLiterals.emplace_back(literal);
mHashCode = hashCode();
validate();
}
- PredicateLeaf::PredicateLeaf(Operator op,
- PredicateDataType type,
- const std::string& colName,
+ PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName,
const std::initializer_list<Literal>& literals)
- : mOperator(op)
- , mType(type)
- , mColumnName(colName)
- , mHasColumnName(true)
- , mLiterals(literals.begin(), literals.end()) {
+ : mOperator(op),
+ mType(type),
+ mColumnName(colName),
+ mHasColumnName(true),
+ mLiterals(literals.begin(), literals.end()) {
mHashCode = hashCode();
validate();
}
- PredicateLeaf::PredicateLeaf(Operator op,
- PredicateDataType type,
- uint64_t columnId,
+ PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId,
const std::initializer_list<Literal>& literals)
- : mOperator(op)
- , mType(type)
- , mHasColumnName(false)
- , mColumnId(columnId)
- , mLiterals(literals.begin(), literals.end()) {
+ : mOperator(op),
+ mType(type),
+ mHasColumnName(false),
+ mColumnId(columnId),
+ mLiterals(literals.begin(), literals.end()) {
mHashCode = hashCode();
validate();
}
- PredicateLeaf::PredicateLeaf(Operator op,
- PredicateDataType type,
- const std::string& colName,
+ PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName,
const std::vector<Literal>& literals)
- : mOperator(op)
- , mType(type)
- , mColumnName(colName)
- , mHasColumnName(true)
- , mLiterals(literals.begin(), literals.end()) {
+ : mOperator(op),
+ mType(type),
+ mColumnName(colName),
+ mHasColumnName(true),
+ mLiterals(literals.begin(), literals.end()) {
mHashCode = hashCode();
validate();
}
- PredicateLeaf::PredicateLeaf(Operator op,
- PredicateDataType type,
- uint64_t columnId,
+ PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId,
const std::vector<Literal>& literals)
- : mOperator(op)
- , mType(type)
- , mHasColumnName(false)
- , mColumnId(columnId)
- , mLiterals(literals.begin(), literals.end()) {
+ : mOperator(op),
+ mType(type),
+ mHasColumnName(false),
+ mColumnId(columnId),
+ mLiterals(literals.begin(), literals.end()) {
mHashCode = hashCode();
validate();
}
@@ -131,8 +112,7 @@ namespace orc {
if (mLiterals.size() != 1) {
throw std::invalid_argument("One literal is required!");
}
- if (static_cast<int>(mLiterals.at(0).getType()) !=
- static_cast<int>(mType)) {
+ if (static_cast<int>(mLiterals.at(0).getType()) != static_cast<int>(mType)) {
throw std::invalid_argument("leaf and literal types do not match!");
}
break;
@@ -232,8 +212,7 @@ namespace orc {
sstream << columnDebugString() << " = " << getLiteralString(mLiterals);
break;
case Operator::NULL_SAFE_EQUALS:
- sstream << columnDebugString() << " null_safe_= "
- << getLiteralString(mLiterals);
+ sstream << columnDebugString() << " null_safe_= " << getLiteralString(mLiterals);
break;
case Operator::LESS_THAN:
sstream << columnDebugString() << " < " << getLiteralString(mLiterals);
@@ -248,9 +227,8 @@ namespace orc {
sstream << columnDebugString() << " between " << getLiteralsString(mLiterals);
break;
default:
- sstream << "unknown operator, column: "
- << columnDebugString() << ", literals: "
- << getLiteralsString(mLiterals);
+ sstream << "unknown operator, column: " << columnDebugString()
+ << ", literals: " << getLiteralsString(mLiterals);
}
sstream << ')';
return sstream.str();
@@ -259,16 +237,11 @@ namespace orc {
size_t PredicateLeaf::hashCode() const {
size_t value = 0;
std::for_each(mLiterals.cbegin(), mLiterals.cend(),
- [&](const Literal& literal) {
- value = value * 17 + literal.getHashCode();
- });
- auto colHash = mHasColumnName ?
- std::hash<std::string>{}(mColumnName) :
- std::hash<uint64_t>{}(mColumnId);
- return value * 103 * 101 * 3 * 17 +
- std::hash<int>{}(static_cast<int>(mOperator)) +
- std::hash<int>{}(static_cast<int>(mType)) * 17 +
- colHash * 3 * 17;
+ [&](const Literal& literal) { value = value * 17 + literal.getHashCode(); });
+ auto colHash =
+ mHasColumnName ? std::hash<std::string>{}(mColumnName) : std::hash<uint64_t>{}(mColumnId);
+ return value * 103 * 101 * 3 * 17 + std::hash<int>{}(static_cast<int>(mOperator)) +
+ std::hash<int>{}(static_cast<int>(mType)) * 17 + colHash * 3 * 17;
}
bool PredicateLeaf::operator==(const PredicateLeaf& r) const {
@@ -289,9 +262,7 @@ namespace orc {
}
// enum to mark the position of predicate in the range
- enum class Location {
- BEFORE, MIN, MIDDLE, MAX, AFTER
- };
+ enum class Location { BEFORE, MIN, MIDDLE, MAX, AFTER };
DIAGNOSTIC_PUSH
DIAGNOSTIC_IGNORE("-Wfloat-equal")
@@ -331,11 +302,8 @@ namespace orc {
* @return the TruthValue result of the test
*/
template <typename T>
- TruthValue evaluatePredicateRange(const PredicateLeaf::Operator op,
- const std::vector<T>& values,
- const T& minValue,
- const T& maxValue,
- bool hasNull) {
+ TruthValue evaluatePredicateRange(const PredicateLeaf::Operator op, const std::vector<T>& values,
+ const T& minValue, const T& maxValue, bool hasNull) {
Location loc;
switch (op) {
case PredicateLeaf::Operator::NULL_SAFE_EQUALS:
@@ -387,8 +355,7 @@ namespace orc {
// are all of the values outside of the range?
for (auto& value : values) {
loc = compareToRange(value, minValue, maxValue);
- if (loc == Location::MIN || loc == Location::MIDDLE ||
- loc == Location::MAX) {
+ if (loc == Location::MIN || loc == Location::MIDDLE || loc == Location::MAX) {
return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO;
}
}
@@ -423,19 +390,17 @@ namespace orc {
DIAGNOSTIC_POP
- static TruthValue evaluateBoolPredicate(
- const PredicateLeaf::Operator op,
- const std::vector<Literal>& literals,
- const proto::ColumnStatistics& stats) {
- bool hasNull = stats.hasnull();
- if (!stats.has_bucketstatistics() ||
- stats.bucketstatistics().count_size() == 0) {
+ static TruthValue evaluateBoolPredicate(const PredicateLeaf::Operator op,
+ const std::vector<Literal>& literals,
+ const proto::ColumnStatistics& stats) {
+ bool hasNull = stats.has_null();
+ if (!stats.has_bucket_statistics() || stats.bucket_statistics().count_size() == 0) {
// does not have bool stats
return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO;
}
- auto trueCount = stats.bucketstatistics().count(0);
- auto falseCount = stats.numberofvalues() - trueCount;
+ auto trueCount = stats.bucket_statistics().count(0);
+ auto falseCount = stats.number_of_values() - trueCount;
switch (op) {
case PredicateLeaf::Operator::IS_NULL:
return hasNull ? TruthValue::YES_NO : TruthValue::NO;
@@ -500,8 +465,7 @@ namespace orc {
return result;
}
- static std::vector<Literal::Timestamp> literal2Timestamp(
- const std::vector<Literal>& values) {
+ static std::vector<Literal::Timestamp> literal2Timestamp(const std::vector<Literal>& values) {
std::vector<Literal::Timestamp> result;
std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) {
if (!val.isNull()) {
@@ -511,8 +475,7 @@ namespace orc {
return result;
}
- static std::vector<Decimal> literal2Decimal(
- const std::vector<Literal>& values) {
+ static std::vector<Decimal> literal2Decimal(const std::vector<Literal>& values) {
std::vector<Decimal> result;
std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) {
if (!val.isNull()) {
@@ -522,8 +485,7 @@ namespace orc {
return result;
}
- static std::vector<double> literal2Double(
- const std::vector<Literal>& values) {
+ static std::vector<double> literal2Double(const std::vector<Literal>& values) {
std::vector<double> result;
std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) {
if (!val.isNull()) {
@@ -533,8 +495,7 @@ namespace orc {
return result;
}
- static std::vector<TString> literal2String(
- const std::vector<Literal>& values) {
+ static std::vector<TString> literal2String(const std::vector<Literal>& values) {
std::vector<TString> result;
std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) {
if (!val.isNull()) {
@@ -544,114 +505,84 @@ namespace orc {
return result;
}
- TruthValue PredicateLeaf::evaluatePredicateMinMax(
- const proto::ColumnStatistics& colStats) const {
+ TruthValue PredicateLeaf::evaluatePredicateMinMax(const proto::ColumnStatistics& colStats) const {
TruthValue result = TruthValue::YES_NO_NULL;
switch (mType) {
case PredicateDataType::LONG: {
- if (colStats.has_intstatistics() &&
- colStats.intstatistics().has_minimum() &&
- colStats.intstatistics().has_maximum()) {
- const auto& stats = colStats.intstatistics();
- result = evaluatePredicateRange(
- mOperator,
- literal2Long(mLiterals),
- stats.minimum(),
- stats.maximum(),
- colStats.hasnull());
+ if (colStats.has_int_statistics() && colStats.int_statistics().has_minimum() &&
+ colStats.int_statistics().has_maximum()) {
+ const auto& stats = colStats.int_statistics();
+ result = evaluatePredicateRange(mOperator, literal2Long(mLiterals), stats.minimum(),
+ stats.maximum(), colStats.has_null());
}
break;
}
case PredicateDataType::FLOAT: {
- if (colStats.has_doublestatistics() &&
- colStats.doublestatistics().has_minimum() &&
- colStats.doublestatistics().has_maximum()) {
- const auto& stats = colStats.doublestatistics();
+ if (colStats.has_double_statistics() && colStats.double_statistics().has_minimum() &&
+ colStats.double_statistics().has_maximum()) {
+ const auto& stats = colStats.double_statistics();
if (!std::isfinite(stats.sum())) {
- result = colStats.hasnull() ?
- TruthValue::YES_NO_NULL : TruthValue::YES_NO;
+ result = colStats.has_null() ? TruthValue::YES_NO_NULL : TruthValue::YES_NO;
} else {
- result = evaluatePredicateRange(
- mOperator,
- literal2Double(mLiterals),
- stats.minimum(),
- stats.maximum(),
- colStats.hasnull());
+ result = evaluatePredicateRange(mOperator, literal2Double(mLiterals), stats.minimum(),
+ stats.maximum(), colStats.has_null());
}
}
break;
}
case PredicateDataType::STRING: {
- ///TODO: check lowerBound and upperBound as well
- if (colStats.has_stringstatistics() &&
- colStats.stringstatistics().has_minimum() &&
- colStats.stringstatistics().has_maximum()) {
- const auto& stats = colStats.stringstatistics();
- result = evaluatePredicateRange(
- mOperator,
- literal2String(mLiterals),
- stats.minimum(),
- stats.maximum(),
- colStats.hasnull());
+ /// TODO: check lowerBound and upperBound as well
+ if (colStats.has_string_statistics() && colStats.string_statistics().has_minimum() &&
+ colStats.string_statistics().has_maximum()) {
+ const auto& stats = colStats.string_statistics();
+ result = evaluatePredicateRange(mOperator, literal2String(mLiterals), stats.minimum(),
+ stats.maximum(), colStats.has_null());
}
break;
}
case PredicateDataType::DATE: {
- if (colStats.has_datestatistics() &&
- colStats.datestatistics().has_minimum() &&
- colStats.datestatistics().has_maximum()) {
- const auto& stats = colStats.datestatistics();
- result = evaluatePredicateRange(
- mOperator,
- literal2Date(mLiterals),
- stats.minimum(),
- stats.maximum(),
- colStats.hasnull());
+ if (colStats.has_date_statistics() && colStats.date_statistics().has_minimum() &&
+ colStats.date_statistics().has_maximum()) {
+ const auto& stats = colStats.date_statistics();
+ result = evaluatePredicateRange(mOperator, literal2Date(mLiterals), stats.minimum(),
+ stats.maximum(), colStats.has_null());
}
break;
}
case PredicateDataType::TIMESTAMP: {
- if (colStats.has_timestampstatistics() &&
- colStats.timestampstatistics().has_minimumutc() &&
- colStats.timestampstatistics().has_maximumutc()) {
- const auto& stats = colStats.timestampstatistics();
+ if (colStats.has_timestamp_statistics() &&
+ colStats.timestamp_statistics().has_minimum_utc() &&
+ colStats.timestamp_statistics().has_maximum_utc()) {
+ const auto& stats = colStats.timestamp_statistics();
constexpr int32_t DEFAULT_MIN_NANOS = 0;
constexpr int32_t DEFAULT_MAX_NANOS = 999999;
- int32_t minNano = stats.has_minimumnanos() ?
- stats.minimumnanos() - 1 : DEFAULT_MIN_NANOS;
- int32_t maxNano = stats.has_maximumnanos() ?
- stats.maximumnanos() - 1 : DEFAULT_MAX_NANOS;
+ int32_t minNano =
+ stats.has_minimum_nanos() ? stats.minimum_nanos() - 1 : DEFAULT_MIN_NANOS;
+ int32_t maxNano =
+ stats.has_maximum_nanos() ? stats.maximum_nanos() - 1 : DEFAULT_MAX_NANOS;
Literal::Timestamp minTimestamp(
- stats.minimumutc() / 1000,
- static_cast<int32_t>((stats.minimumutc() % 1000) * 1000000) + minNano);
+ stats.minimum_utc() / 1000,
+ static_cast<int32_t>((stats.minimum_utc() % 1000) * 1000000) + minNano);
Literal::Timestamp maxTimestamp(
- stats.maximumutc() / 1000,
- static_cast<int32_t>((stats.maximumutc() % 1000) * 1000000) + maxNano);
- result = evaluatePredicateRange(
- mOperator,
- literal2Timestamp(mLiterals),
- minTimestamp,
- maxTimestamp,
- colStats.hasnull());
+ stats.maximum_utc() / 1000,
+ static_cast<int32_t>((stats.maximum_utc() % 1000) * 1000000) + maxNano);
+ result = evaluatePredicateRange(mOperator, literal2Timestamp(mLiterals), minTimestamp,
+ maxTimestamp, colStats.has_null());
}
break;
}
case PredicateDataType::DECIMAL: {
- if (colStats.has_decimalstatistics() &&
- colStats.decimalstatistics().has_minimum() &&
- colStats.decimalstatistics().has_maximum()) {
- const auto& stats = colStats.decimalstatistics();
- result = evaluatePredicateRange(
- mOperator,
- literal2Decimal(mLiterals),
- Decimal(stats.minimum()),
- Decimal(stats.maximum()),
- colStats.hasnull());
+ if (colStats.has_decimal_statistics() && colStats.decimal_statistics().has_minimum() &&
+ colStats.decimal_statistics().has_maximum()) {
+ const auto& stats = colStats.decimal_statistics();
+ result = evaluatePredicateRange(mOperator, literal2Decimal(mLiterals),
+ Decimal(stats.minimum()), Decimal(stats.maximum()),
+ colStats.has_null());
}
break;
}
- case PredicateDataType::BOOLEAN: {
- if (colStats.has_bucketstatistics()) {
+ case PredicateDataType::BOOLEAN: {
+ if (colStats.has_bucket_statistics()) {
result = evaluateBoolPredicate(mOperator, mLiterals, colStats);
}
break;
@@ -661,7 +592,7 @@ namespace orc {
}
// make sure null literal is respected for IN operator
- if (mOperator == Operator::IN && colStats.hasnull()) {
+ if (mOperator == Operator::IN && colStats.has_null()) {
for (const auto& literal : mLiterals) {
if (literal.isNull()) {
result = TruthValue::YES_NO_NULL;
@@ -673,29 +604,24 @@ namespace orc {
return result;
}
- static bool shouldEvaluateBloomFilter(PredicateLeaf::Operator op,
- TruthValue result,
- const BloomFilter * bloomFilter) {
+ static bool shouldEvaluateBloomFilter(PredicateLeaf::Operator op, TruthValue result,
+ const BloomFilter* bloomFilter) {
// evaluate bloom filter only when
// 1) Bloom filter is available
// 2) Min/Max evaluation yield YES or MAYBE
// 3) Predicate is EQUALS or IN list
// 4) Decimal type stores its string representation
// but has inconsistency in trailing zeros
- if (bloomFilter != nullptr
- && result != TruthValue::NO_NULL && result != TruthValue::NO
- && (op == PredicateLeaf::Operator::EQUALS
- || op == PredicateLeaf::Operator::NULL_SAFE_EQUALS
- || op == PredicateLeaf::Operator::IN)) {
+ if (bloomFilter != nullptr && result != TruthValue::NO_NULL && result != TruthValue::NO &&
+ (op == PredicateLeaf::Operator::EQUALS || op == PredicateLeaf::Operator::NULL_SAFE_EQUALS ||
+ op == PredicateLeaf::Operator::IN)) {
return true;
}
return false;
}
- static TruthValue checkInBloomFilter(PredicateLeaf::Operator,
- PredicateDataType type,
- const Literal& literal,
- const BloomFilter * bf,
+ static TruthValue checkInBloomFilter(PredicateLeaf::Operator, PredicateDataType type,
+ const Literal& literal, const BloomFilter* bf,
bool hasNull) {
TruthValue result = hasNull ? TruthValue::NO_NULL : TruthValue::NO;
if (literal.isNull()) {
@@ -715,7 +641,7 @@ namespace orc {
}
} else if (type == PredicateDataType::DECIMAL) {
std::string decimal = literal.getDecimal().toString(true);
- if (bf->testBytes(decimal.c_str(), static_cast<int64_t>(decimal.size()))) {
+ if (bf->testBytes(decimal.c_str(), static_cast<int64_t>(decimal.size()))) {
result = TruthValue::YES_NO_NULL;
}
} else if (type == PredicateDataType::TIMESTAMP) {
@@ -737,25 +663,20 @@ namespace orc {
return result;
}
- TruthValue PredicateLeaf::evaluatePredicateBloomFiter(const BloomFilter * bf,
- bool hasNull) const {
+ TruthValue PredicateLeaf::evaluatePredicateBloomFiter(const BloomFilter* bf, bool hasNull) const {
switch (mOperator) {
case Operator::NULL_SAFE_EQUALS:
// null safe equals does not return *_NULL variant.
// So set hasNull to false
- return checkInBloomFilter(
- mOperator, mType, mLiterals.front(), bf, false);
+ return checkInBloomFilter(mOperator, mType, mLiterals.front(), bf, false);
case Operator::EQUALS:
- return checkInBloomFilter(
- mOperator, mType, mLiterals.front(), bf, hasNull);
+ return checkInBloomFilter(mOperator, mType, mLiterals.front(), bf, hasNull);
case Operator::IN:
- for (const auto &literal : mLiterals) {
+ for (const auto& literal : mLiterals) {
// if at least one value in IN list exist in bloom filter,
// qualify the row group/stripe
- TruthValue result = checkInBloomFilter(
- mOperator, mType, literal, bf, hasNull);
- if (result == TruthValue::YES_NO_NULL ||
- result == TruthValue::YES_NO) {
+ TruthValue result = checkInBloomFilter(mOperator, mType, literal, bf, hasNull);
+ if (result == TruthValue::YES_NO_NULL || result == TruthValue::YES_NO) {
return result;
}
}
@@ -771,7 +692,7 @@ namespace orc {
TruthValue PredicateLeaf::evaluate(const WriterVersion writerVersion,
const proto::ColumnStatistics& colStats,
- const BloomFilter * bloomFilter) const {
+ const BloomFilter* bloomFilter) const {
// files written before ORC-135 stores timestamp wrt to local timezone
// causing issues with PPD. disable PPD for timestamp for all old files
if (mType == PredicateDataType::TIMESTAMP) {
@@ -780,14 +701,13 @@ namespace orc {
}
}
- bool allNull = colStats.hasnull() && colStats.numberofvalues() == 0;
- if (mOperator == Operator::IS_NULL || ((
- mOperator == Operator::EQUALS ||
- mOperator == Operator::NULL_SAFE_EQUALS) &&
- mLiterals.at(0).isNull())) {
+ bool allNull = colStats.has_null() && colStats.number_of_values() == 0;
+ if (mOperator == Operator::IS_NULL ||
+ ((mOperator == Operator::EQUALS || mOperator == Operator::NULL_SAFE_EQUALS) &&
+ mLiterals.at(0).isNull())) {
// IS_NULL operator does not need to check min/max stats and bloom filter
- return allNull ? TruthValue::YES :
- (colStats.hasnull() ? TruthValue::YES_NO : TruthValue::NO);
+ return allNull ? TruthValue::YES
+ : (colStats.has_null() ? TruthValue::YES_NO : TruthValue::NO);
} else if (allNull) {
// if we don't have any value, everything must have been null
return TruthValue::IS_NULL;
@@ -795,10 +715,10 @@ namespace orc {
TruthValue result = evaluatePredicateMinMax(colStats);
if (shouldEvaluateBloomFilter(mOperator, result, bloomFilter)) {
- return evaluatePredicateBloomFiter(bloomFilter, colStats.hasnull());
+ return evaluatePredicateBloomFiter(bloomFilter, colStats.has_null());
} else {
return result;
}
}
-} // namespace orc
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh
index 99791cf976..21ed456155 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh
+++ b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh
@@ -19,18 +19,17 @@
#ifndef ORC_PREDICATELEAF_HH
#define ORC_PREDICATELEAF_HH
-#include "wrap/orc-proto-wrapper.hh"
#include "orc/Common.hh"
#include "orc/sargs/Literal.hh"
#include "orc/sargs/TruthValue.hh"
+#include "wrap/orc-proto-wrapper.hh"
#include <string>
#include <vector>
namespace orc {
- static constexpr uint64_t INVALID_COLUMN_ID =
- std::numeric_limits<uint64_t>::max();
+ static constexpr uint64_t INVALID_COLUMN_ID = std::numeric_limits<uint64_t>::max();
class BloomFilter;
@@ -38,7 +37,7 @@ namespace orc {
* The primitive predicates that form a SearchArgument.
*/
class PredicateLeaf {
- public:
+ public:
/**
* The possible operators for predicates. To get the opposites, construct
* an expression with a not operator.
@@ -55,9 +54,9 @@ namespace orc {
// The possible types for sargs.
enum class Type {
- LONG = 0, // all of the integer types
- FLOAT, // float and double
- STRING, // string, char, varchar
+ LONG = 0, // all of the integer types
+ FLOAT, // float and double
+ STRING, // string, char, varchar
DATE,
DECIMAL,
TIMESTAMP,
@@ -66,34 +65,20 @@ namespace orc {
PredicateLeaf() = default;
- PredicateLeaf(Operator op,
- PredicateDataType type,
- const std::string& colName,
- Literal literal);
+ PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName, Literal literal);
- PredicateLeaf(Operator op,
- PredicateDataType type,
- uint64_t columnId,
- Literal literal);
+ PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId, Literal literal);
- PredicateLeaf(Operator op,
- PredicateDataType type,
- const std::string& colName,
+ PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName,
const std::initializer_list<Literal>& literalList);
- PredicateLeaf(Operator op,
- PredicateDataType type,
- uint64_t columnId,
+ PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId,
const std::initializer_list<Literal>& literalList);
- PredicateLeaf(Operator op,
- PredicateDataType type,
- const std::string& colName,
+ PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName,
const std::vector<Literal>& literalList);
- PredicateLeaf(Operator op,
- PredicateDataType type,
- uint64_t columnId,
+ PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId,
const std::vector<Literal>& literalList);
/**
@@ -134,17 +119,18 @@ namespace orc {
/**
* Evaluate current PredicateLeaf based on ColumnStatistics and BloomFilter
*/
- TruthValue evaluate(const WriterVersion writerVersion,
- const proto::ColumnStatistics& colStats,
- const BloomFilter * bloomFilter) const;
+ TruthValue evaluate(const WriterVersion writerVersion, const proto::ColumnStatistics& colStats,
+ const BloomFilter* bloomFilter) const;
std::string toString() const;
bool operator==(const PredicateLeaf& r) const;
- size_t getHashCode() const { return mHashCode; }
+ size_t getHashCode() const {
+ return mHashCode;
+ }
- private:
+ private:
size_t hashCode() const;
void validate() const;
@@ -152,13 +138,11 @@ namespace orc {
std::string columnDebugString() const;
- TruthValue evaluatePredicateMinMax(
- const proto::ColumnStatistics& colStats) const;
+ TruthValue evaluatePredicateMinMax(const proto::ColumnStatistics& colStats) const;
- TruthValue evaluatePredicateBloomFiter(const BloomFilter * bloomFilter,
- bool hasNull) const;
+ TruthValue evaluatePredicateBloomFiter(const BloomFilter* bloomFilter, bool hasNull) const;
- private:
+ private:
Operator mOperator;
PredicateDataType mType;
std::string mColumnName;
@@ -180,6 +164,6 @@ namespace orc {
}
};
-} // namespace orc
+} // namespace orc
-#endif //ORC_PREDICATELEAF_HH
+#endif // ORC_PREDICATELEAF_HH
diff --git a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc
index 42a554f5ca..7032a88126 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc
+++ b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc
@@ -22,8 +22,7 @@
namespace orc {
// find column id from column name
- uint64_t SargsApplier::findColumn(const Type& type,
- const std::string& colName) {
+ uint64_t SargsApplier::findColumn(const Type& type, const std::string& colName) {
for (uint64_t i = 0; i != type.getSubtypeCount(); ++i) {
// Only STRUCT type has field names
if (type.getKind() == STRUCT && type.getFieldName(i) == colName) {
@@ -38,19 +37,18 @@ namespace orc {
return INVALID_COLUMN_ID;
}
- SargsApplier::SargsApplier(const Type& type,
- const SearchArgument * searchArgument,
- uint64_t rowIndexStride,
- WriterVersion writerVersion)
- : mType(type)
- , mSearchArgument(searchArgument)
- , mRowIndexStride(rowIndexStride)
- , mWriterVersion(writerVersion)
- , mStats(0, 0)
- , mHasEvaluatedFileStats(false)
- , mFileStatsEvalResult(true) {
- const SearchArgumentImpl * sargs =
- dynamic_cast<const SearchArgumentImpl *>(mSearchArgument);
+ SargsApplier::SargsApplier(const Type& type, const SearchArgument* searchArgument,
+ uint64_t rowIndexStride, WriterVersion writerVersion,
+ ReaderMetrics* metrics, const SchemaEvolution* schemaEvolution)
+ : mType(type),
+ mSearchArgument(searchArgument),
+ mSchemaEvolution(schemaEvolution),
+ mRowIndexStride(rowIndexStride),
+ mWriterVersion(writerVersion),
+ mHasEvaluatedFileStats(false),
+ mFileStatsEvalResult(true),
+ mMetrics(metrics) {
+ const SearchArgumentImpl* sargs = dynamic_cast<const SearchArgumentImpl*>(mSearchArgument);
// find the mapping from predicate leaves to columns
const std::vector<PredicateLeaf>& leaves = sargs->getLeaves();
@@ -64,13 +62,11 @@ namespace orc {
}
}
- bool SargsApplier::pickRowGroups(
- uint64_t rowsInStripe,
- const std::unordered_map<uint64_t, proto::RowIndex>& rowIndexes,
- const std::map<uint32_t, BloomFilterIndex>& bloomFilters) {
+ bool SargsApplier::pickRowGroups(uint64_t rowsInStripe,
+ const std::unordered_map<uint64_t, proto::RowIndex>& rowIndexes,
+ const std::map<uint32_t, BloomFilterIndex>& bloomFilters) {
// init state of each row group
- uint64_t groupsInStripe =
- (rowsInStripe + mRowIndexStride - 1) / mRowIndexStride;
+ uint64_t groupsInStripe = (rowsInStripe + mRowIndexStride - 1) / mRowIndexStride;
mNextSkippedRows.resize(groupsInStripe);
mTotalRowsInStripe = rowsInStripe;
@@ -79,10 +75,8 @@ namespace orc {
return true;
}
- const auto& leaves =
- dynamic_cast<const SearchArgumentImpl *>(mSearchArgument)->getLeaves();
- std::vector<TruthValue> leafValues(
- leaves.size(), TruthValue::YES_NO_NULL);
+ const auto& leaves = dynamic_cast<const SearchArgumentImpl*>(mSearchArgument)->getLeaves();
+ std::vector<TruthValue> leafValues(leaves.size(), TruthValue::YES_NO_NULL);
mHasSelected = false;
mHasSkipped = false;
uint64_t nextSkippedRowGroup = groupsInStripe;
@@ -95,10 +89,13 @@ namespace orc {
if (columnIdx == INVALID_COLUMN_ID || rowIndexIter == rowIndexes.cend()) {
// this column does not exist in current file
leafValues[pred] = TruthValue::YES_NO_NULL;
+ } else if (mSchemaEvolution && !mSchemaEvolution->isSafePPDConversion(columnIdx)) {
+ // cannot evaluate predicate when ppd is not safe
+ leafValues[pred] = TruthValue::YES_NO_NULL;
} else {
// get column statistics
const proto::ColumnStatistics& statistics =
- rowIndexIter->second.entry(static_cast<int>(rowGroup)).statistics();
+ rowIndexIter->second.entry(static_cast<int>(rowGroup)).statistics();
// get bloom filter
std::shared_ptr<BloomFilter> bloomFilter;
@@ -107,9 +104,7 @@ namespace orc {
bloomFilter = iter->second.entries.at(rowGroup);
}
- leafValues[pred] = leaves[pred].evaluate(mWriterVersion,
- statistics,
- bloomFilter.get());
+ leafValues[pred] = leaves[pred].evaluate(mWriterVersion, statistics, bloomFilter.get());
}
}
@@ -118,69 +113,76 @@ namespace orc {
mNextSkippedRows[rowGroup] = 0;
nextSkippedRowGroup = rowGroup;
} else {
- mNextSkippedRows[rowGroup] = (nextSkippedRowGroup == groupsInStripe) ?
- rowsInStripe : (nextSkippedRowGroup * mRowIndexStride);
+ mNextSkippedRows[rowGroup] = (nextSkippedRowGroup == groupsInStripe)
+ ? rowsInStripe
+ : (nextSkippedRowGroup * mRowIndexStride);
}
mHasSelected |= needed;
mHasSkipped |= !needed;
} while (rowGroup != 0);
// update stats
- mStats.first = std::accumulate(
- mNextSkippedRows.cbegin(), mNextSkippedRows.cend(), mStats.first,
- [](bool rg, uint64_t s) { return rg ? 1 : 0 + s; });
- mStats.second += groupsInStripe;
+ uint64_t selectedRGs = std::accumulate(
+ mNextSkippedRows.cbegin(), mNextSkippedRows.cend(), 0UL,
+ [](uint64_t initVal, uint64_t rg) { return rg > 0 ? initVal + 1 : initVal; });
+ if (mMetrics != nullptr) {
+ mMetrics->SelectedRowGroupCount.fetch_add(selectedRGs);
+ mMetrics->EvaluatedRowGroupCount.fetch_add(groupsInStripe);
+ }
return mHasSelected;
}
- bool SargsApplier::evaluateColumnStatistics(
- const PbColumnStatistics& colStats) const {
- const SearchArgumentImpl * sargs =
- dynamic_cast<const SearchArgumentImpl *>(mSearchArgument);
+ bool SargsApplier::evaluateColumnStatistics(const PbColumnStatistics& colStats) const {
+ const SearchArgumentImpl* sargs = dynamic_cast<const SearchArgumentImpl*>(mSearchArgument);
if (sargs == nullptr) {
throw InvalidArgument("Failed to cast to SearchArgumentImpl");
}
const std::vector<PredicateLeaf>& leaves = sargs->getLeaves();
- std::vector<TruthValue> leafValues(
- leaves.size(), TruthValue::YES_NO_NULL);
+ std::vector<TruthValue> leafValues(leaves.size(), TruthValue::YES_NO_NULL);
for (size_t pred = 0; pred != leaves.size(); ++pred) {
uint64_t columnId = mFilterColumns[pred];
- if (columnId != INVALID_COLUMN_ID &&
- colStats.size() > static_cast<int>(columnId)) {
- leafValues[pred] = leaves[pred].evaluate(
- mWriterVersion, colStats.Get(static_cast<int>(columnId)), nullptr);
+ if (columnId != INVALID_COLUMN_ID && colStats.size() > static_cast<int>(columnId)) {
+ leafValues[pred] = leaves[pred].evaluate(mWriterVersion,
+ colStats.Get(static_cast<int>(columnId)), nullptr);
}
}
return isNeeded(mSearchArgument->evaluate(leafValues));
}
- bool SargsApplier::evaluateStripeStatistics(
- const proto::StripeStatistics& stripeStats) {
- if (stripeStats.colstats_size() == 0) {
+ bool SargsApplier::evaluateStripeStatistics(const proto::StripeStatistics& stripeStats,
+ uint64_t stripeRowGroupCount) {
+ if (stripeStats.col_stats_size() == 0) {
return true;
}
- bool ret = evaluateColumnStatistics(stripeStats.colstats());
+ bool ret = evaluateColumnStatistics(stripeStats.col_stats());
if (!ret) {
// reset mNextSkippedRows when the current stripe does not satisfy the PPD
mNextSkippedRows.clear();
+ if (mMetrics != nullptr) {
+ mMetrics->EvaluatedRowGroupCount.fetch_add(stripeRowGroupCount);
+ }
}
return ret;
}
- bool SargsApplier::evaluateFileStatistics(const proto::Footer& footer) {
+ bool SargsApplier::evaluateFileStatistics(const proto::Footer& footer,
+ uint64_t numRowGroupsInStripeRange) {
if (!mHasEvaluatedFileStats) {
if (footer.statistics_size() == 0) {
mFileStatsEvalResult = true;
} else {
mFileStatsEvalResult = evaluateColumnStatistics(footer.statistics());
+ if (!mFileStatsEvalResult && mMetrics != nullptr) {
+ mMetrics->EvaluatedRowGroupCount.fetch_add(numRowGroupsInStripeRange);
+ }
}
mHasEvaluatedFileStats = true;
}
return mFileStatsEvalResult;
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh
index d8bdf852d0..73703dcf6b 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh
+++ b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh
@@ -19,62 +19,78 @@
#ifndef ORC_SARGSAPPLIER_HH
#define ORC_SARGSAPPLIER_HH
-#include "wrap/orc-proto-wrapper.hh"
#include <orc/Common.hh>
#include "orc/BloomFilter.hh"
+#include "orc/Reader.hh"
#include "orc/Type.hh"
+#include "wrap/orc-proto-wrapper.hh"
#include "sargs/SearchArgument.hh"
+#include "SchemaEvolution.hh"
+
#include <unordered_map>
namespace orc {
class SargsApplier {
- public:
- SargsApplier(const Type& type,
- const SearchArgument * searchArgument,
- uint64_t rowIndexStride,
- WriterVersion writerVersion);
+ public:
+ SargsApplier(const Type& type, const SearchArgument* searchArgument, uint64_t rowIndexStride,
+ WriterVersion writerVersion, ReaderMetrics* metrics,
+ const SchemaEvolution* schemaEvolution = nullptr);
/**
* Evaluate search argument on file statistics
+ * If file statistics don't satisfy the sargs,
+ * the EvaluatedRowGroupCount of Reader Metrics will be updated.
+ * Otherwise, Reader Metrics will not be updated and
+ * will require further evaluation.
* @return true if file statistics satisfy the sargs
*/
- bool evaluateFileStatistics(const proto::Footer& footer);
+ bool evaluateFileStatistics(const proto::Footer& footer, uint64_t numRowGroupsInStripeRange);
/**
* Evaluate search argument on stripe statistics
+ * If stripe statistics don't satisfy the sargs,
+ * the EvaluatedRowGroupCount of Reader Metrics will be updated.
+ * Otherwise, Reader Metrics will not be updated and
+ * will require further evaluation.
* @return true if stripe statistics satisfy the sargs
*/
- bool evaluateStripeStatistics(const proto::StripeStatistics& stripeStats);
+ bool evaluateStripeStatistics(const proto::StripeStatistics& stripeStats,
+ uint64_t stripeRowGroupCount);
/**
* TODO: use proto::RowIndex and proto::BloomFilter to do the evaluation
* Pick the row groups that we need to load from the current stripe.
* @return true if any row group is selected
*/
- bool pickRowGroups(
- uint64_t rowsInStripe,
- const std::unordered_map<uint64_t, proto::RowIndex>& rowIndexes,
- const std::map<uint32_t, BloomFilterIndex>& bloomFilters);
+ bool pickRowGroups(uint64_t rowsInStripe,
+ const std::unordered_map<uint64_t, proto::RowIndex>& rowIndexes,
+ const std::map<uint32_t, BloomFilterIndex>& bloomFilters);
/**
* Return a vector of the next skipped row for each RowGroup. Each value is the row id
* in stripe. 0 means the current RowGroup is entirely skipped.
* Only valid after invoking pickRowGroups().
*/
- const std::vector<uint64_t>& getNextSkippedRows() const { return mNextSkippedRows; }
+ const std::vector<uint64_t>& getNextSkippedRows() const {
+ return mNextSkippedRows;
+ }
/**
* Indicate whether any row group is selected in the last evaluation
*/
- bool hasSelected() const { return mHasSelected; }
+ bool hasSelected() const {
+ return mHasSelected;
+ }
/**
* Indicate whether any row group is skipped in the last evaluation
*/
- bool hasSkipped() const { return mHasSkipped; }
+ bool hasSkipped() const {
+ return mHasSkipped;
+ }
/**
* Whether any row group from current row in the stripe matches PPD.
@@ -90,13 +106,17 @@ namespace orc {
}
std::pair<uint64_t, uint64_t> getStats() const {
- return mStats;
+ if (mMetrics != nullptr) {
+ return std::make_pair(mMetrics->SelectedRowGroupCount.load(),
+ mMetrics->EvaluatedRowGroupCount.load());
+ } else {
+ return {0, 0};
+ }
}
- private:
+ private:
// evaluate column statistics in the form of protobuf::RepeatedPtrField
- typedef ::google::protobuf::RepeatedPtrField<proto::ColumnStatistics>
- PbColumnStatistics;
+ typedef ::google::protobuf::RepeatedPtrField<proto::ColumnStatistics> PbColumnStatistics;
bool evaluateColumnStatistics(const PbColumnStatistics& colStats) const;
friend class TestSargsApplier_findColumnTest_Test;
@@ -104,9 +124,10 @@ namespace orc {
friend class TestSargsApplier_findMapColumnTest_Test;
static uint64_t findColumn(const Type& type, const std::string& colName);
- private:
+ private:
const Type& mType;
- const SearchArgument * mSearchArgument;
+ const SearchArgument* mSearchArgument;
+ const SchemaEvolution* mSchemaEvolution;
uint64_t mRowIndexStride;
WriterVersion mWriterVersion;
// column ids for each predicate leaf in the search argument
@@ -119,13 +140,14 @@ namespace orc {
uint64_t mTotalRowsInStripe;
bool mHasSelected;
bool mHasSkipped;
- // keep stats of selected RGs and evaluated RGs
- std::pair<uint64_t, uint64_t> mStats;
// store result of file stats evaluation
bool mHasEvaluatedFileStats;
bool mFileStatsEvalResult;
+ // use the SelectedRowGroupCount and EvaluatedRowGroupCount to
+ // keep stats of selected RGs and evaluated RGs
+ ReaderMetrics* mMetrics;
};
-}
+} // namespace orc
-#endif //ORC_SARGSAPPLIER_HH
+#endif // ORC_SARGSAPPLIER_HH
diff --git a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc
index f6abb316b5..806727f0a0 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc
+++ b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc
@@ -33,14 +33,12 @@ namespace orc {
return mLeaves;
}
- const ExpressionTree * SearchArgumentImpl::getExpression() const {
+ const ExpressionTree* SearchArgumentImpl::getExpression() const {
return mExpressionTree.get();
}
- TruthValue SearchArgumentImpl::evaluate(
- const std::vector<TruthValue>& leaves) const {
- return mExpressionTree == nullptr ?
- TruthValue::YES : mExpressionTree->evaluate(leaves);
+ TruthValue SearchArgumentImpl::evaluate(const std::vector<TruthValue>& leaves) const {
+ return mExpressionTree == nullptr ? TruthValue::YES : mExpressionTree->evaluate(leaves);
}
std::string SearchArgumentImpl::toString() const {
@@ -61,8 +59,7 @@ namespace orc {
mCurrTree.push_back(mRoot);
}
- SearchArgumentBuilder&
- SearchArgumentBuilderImpl::start(ExpressionTree::Operator op) {
+ SearchArgumentBuilder& SearchArgumentBuilderImpl::start(ExpressionTree::Operator op) {
TreeNode node = std::make_shared<ExpressionTree>(op);
mCurrTree.front()->addChild(node);
mCurrTree.push_front(node);
@@ -84,13 +81,13 @@ namespace orc {
SearchArgumentBuilder& SearchArgumentBuilderImpl::end() {
TreeNode& current = mCurrTree.front();
if (current->getChildren().empty()) {
- throw std::invalid_argument("Cannot create expression " +
- mRoot->toString() + " with no children.");
+ throw std::invalid_argument("Cannot create expression " + mRoot->toString() +
+ " with no children.");
}
if (current->getOperator() == ExpressionTree::Operator::NOT &&
current->getChildren().size() != 1) {
- throw std::invalid_argument("Can't create NOT expression " +
- current->toString() + " with more than 1 child.");
+ throw std::invalid_argument("Can't create NOT expression " + current->toString() +
+ " with more than 1 child.");
}
mCurrTree.pop_front();
return *this;
@@ -110,16 +107,14 @@ namespace orc {
return columnId == INVALID_COLUMN_ID;
}
- template<typename T>
- SearchArgumentBuilder&
- SearchArgumentBuilderImpl::compareOperator(PredicateLeaf::Operator op,
- T column,
- PredicateDataType type,
- Literal literal) {
+ template <typename T>
+ SearchArgumentBuilder& SearchArgumentBuilderImpl::compareOperator(PredicateLeaf::Operator op,
+ T column,
+ PredicateDataType type,
+ Literal literal) {
TreeNode parent = mCurrTree.front();
if (isInvalidColumn(column)) {
- parent->addChild(
- std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL));
+ parent->addChild(std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL));
} else {
PredicateLeaf leaf(op, type, column, literal);
parent->addChild(std::make_shared<ExpressionTree>(addLeaf(leaf)));
@@ -130,29 +125,25 @@ namespace orc {
SearchArgumentBuilder& SearchArgumentBuilderImpl::lessThan(const std::string& column,
PredicateDataType type,
Literal literal) {
- return compareOperator(
- PredicateLeaf::Operator::LESS_THAN, column, type, literal);
+ return compareOperator(PredicateLeaf::Operator::LESS_THAN, column, type, literal);
}
SearchArgumentBuilder& SearchArgumentBuilderImpl::lessThan(uint64_t columnId,
PredicateDataType type,
Literal literal) {
- return compareOperator(
- PredicateLeaf::Operator::LESS_THAN, columnId, type, literal);
+ return compareOperator(PredicateLeaf::Operator::LESS_THAN, columnId, type, literal);
}
SearchArgumentBuilder& SearchArgumentBuilderImpl::lessThanEquals(const std::string& column,
PredicateDataType type,
Literal literal) {
- return compareOperator(
- PredicateLeaf::Operator::LESS_THAN_EQUALS, column, type, literal);
+ return compareOperator(PredicateLeaf::Operator::LESS_THAN_EQUALS, column, type, literal);
}
SearchArgumentBuilder& SearchArgumentBuilderImpl::lessThanEquals(uint64_t columnId,
PredicateDataType type,
Literal literal) {
- return compareOperator(
- PredicateLeaf::Operator::LESS_THAN_EQUALS, columnId, type, literal);
+ return compareOperator(PredicateLeaf::Operator::LESS_THAN_EQUALS, columnId, type, literal);
}
SearchArgumentBuilder& SearchArgumentBuilderImpl::equals(const std::string& column,
@@ -161,8 +152,7 @@ namespace orc {
if (literal.isNull()) {
return isNull(column, type);
} else {
- return compareOperator(
- PredicateLeaf::Operator::EQUALS, column, type, literal);
+ return compareOperator(PredicateLeaf::Operator::EQUALS, column, type, literal);
}
}
@@ -172,54 +162,46 @@ namespace orc {
if (literal.isNull()) {
return isNull(columnId, type);
} else {
- return compareOperator(
- PredicateLeaf::Operator::EQUALS, columnId, type, literal);
+ return compareOperator(PredicateLeaf::Operator::EQUALS, columnId, type, literal);
}
}
SearchArgumentBuilder& SearchArgumentBuilderImpl::nullSafeEquals(const std::string& column,
PredicateDataType type,
Literal literal) {
- return compareOperator(
- PredicateLeaf::Operator::NULL_SAFE_EQUALS, column, type, literal);
+ return compareOperator(PredicateLeaf::Operator::NULL_SAFE_EQUALS, column, type, literal);
}
SearchArgumentBuilder& SearchArgumentBuilderImpl::nullSafeEquals(uint64_t columnId,
PredicateDataType type,
Literal literal) {
- return compareOperator(
- PredicateLeaf::Operator::NULL_SAFE_EQUALS, columnId, type, literal);
+ return compareOperator(PredicateLeaf::Operator::NULL_SAFE_EQUALS, columnId, type, literal);
}
- template<typename T, typename CONTAINER>
- SearchArgumentBuilder& SearchArgumentBuilderImpl::addChildForIn(T column,
- PredicateDataType type,
- const CONTAINER& literals) {
- TreeNode &parent = mCurrTree.front();
+ template <typename T, typename CONTAINER>
+ SearchArgumentBuilder& SearchArgumentBuilderImpl::addChildForIn(T column, PredicateDataType type,
+ const CONTAINER& literals) {
+ TreeNode& parent = mCurrTree.front();
if (isInvalidColumn(column)) {
- parent->addChild(
- std::make_shared<ExpressionTree>((TruthValue::YES_NO_NULL)));
+ parent->addChild(std::make_shared<ExpressionTree>((TruthValue::YES_NO_NULL)));
} else {
if (literals.size() == 0) {
- throw std::invalid_argument(
- "Can't create in expression with no arguments");
+ throw std::invalid_argument("Can't create in expression with no arguments");
}
- PredicateLeaf leaf(
- PredicateLeaf::Operator::IN, type, column, literals);
+ PredicateLeaf leaf(PredicateLeaf::Operator::IN, type, column, literals);
parent->addChild(std::make_shared<ExpressionTree>(addLeaf(leaf)));
}
return *this;
}
- SearchArgumentBuilder& SearchArgumentBuilderImpl::in(const std::string& column,
- PredicateDataType type,
- const std::initializer_list<Literal>& literals) {
+ SearchArgumentBuilder& SearchArgumentBuilderImpl::in(
+ const std::string& column, PredicateDataType type,
+ const std::initializer_list<Literal>& literals) {
return addChildForIn(column, type, literals);
}
- SearchArgumentBuilder& SearchArgumentBuilderImpl::in(uint64_t columnId,
- PredicateDataType type,
- const std::initializer_list<Literal>& literals) {
+ SearchArgumentBuilder& SearchArgumentBuilderImpl::in(
+ uint64_t columnId, PredicateDataType type, const std::initializer_list<Literal>& literals) {
return addChildForIn(columnId, type, literals);
}
@@ -229,23 +211,19 @@ namespace orc {
return addChildForIn(column, type, literals);
}
- SearchArgumentBuilder& SearchArgumentBuilderImpl::in(uint64_t columnId,
- PredicateDataType type,
+ SearchArgumentBuilder& SearchArgumentBuilderImpl::in(uint64_t columnId, PredicateDataType type,
const std::vector<Literal>& literals) {
return addChildForIn(columnId, type, literals);
}
- template<typename T>
- SearchArgumentBuilder& SearchArgumentBuilderImpl::addChildForIsNull(T column, PredicateDataType type) {
+ template <typename T>
+ SearchArgumentBuilder& SearchArgumentBuilderImpl::addChildForIsNull(T column,
+ PredicateDataType type) {
TreeNode& parent = mCurrTree.front();
if (isInvalidColumn(column)) {
- parent->addChild(
- std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL));
+ parent->addChild(std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL));
} else {
- PredicateLeaf leaf(PredicateLeaf::Operator::IS_NULL,
- type,
- column,
- {});
+ PredicateLeaf leaf(PredicateLeaf::Operator::IS_NULL, type, column, {});
parent->addChild(std::make_shared<ExpressionTree>(addLeaf(leaf)));
}
return *this;
@@ -261,34 +239,29 @@ namespace orc {
return addChildForIsNull(columnId, type);
}
- template<typename T>
+ template <typename T>
SearchArgumentBuilder& SearchArgumentBuilderImpl::addChildForBetween(T column,
PredicateDataType type,
- Literal lower, Literal upper) {
+ Literal lower,
+ Literal upper) {
TreeNode& parent = mCurrTree.front();
if (isInvalidColumn(column)) {
- parent->addChild(
- std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL));
+ parent->addChild(std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL));
} else {
- PredicateLeaf leaf(PredicateLeaf::Operator::BETWEEN,
- type,
- column,
- { lower, upper });
+ PredicateLeaf leaf(PredicateLeaf::Operator::BETWEEN, type, column, {lower, upper});
parent->addChild(std::make_shared<ExpressionTree>(addLeaf(leaf)));
}
return *this;
}
SearchArgumentBuilder& SearchArgumentBuilderImpl::between(const std::string& column,
- PredicateDataType type,
- Literal lower,
+ PredicateDataType type, Literal lower,
Literal upper) {
return addChildForBetween(column, type, lower, upper);
}
SearchArgumentBuilder& SearchArgumentBuilderImpl::between(uint64_t columnId,
- PredicateDataType type,
- Literal lower,
+ PredicateDataType type, Literal lower,
Literal upper) {
return addChildForBetween(columnId, type, lower, upper);
}
@@ -307,9 +280,7 @@ namespace orc {
* @param leafReorder buffer for leaf reorder
* @return the next available leaf id
*/
- static size_t compactLeaves(const TreeNode& tree,
- size_t next,
- size_t leafReorder[]) {
+ static size_t compactLeaves(const TreeNode& tree, size_t next, size_t leafReorder[]) {
if (tree->getOperator() == ExpressionTree::Operator::LEAF) {
size_t oldLeaf = tree->getLeaf();
if (leafReorder[oldLeaf] == UNUSED_LEAF) {
@@ -378,18 +349,16 @@ namespace orc {
case ExpressionTree::Operator::AND: {
TreeNode result(new ExpressionTree(ExpressionTree::Operator::OR));
for (auto& kid : child->getChildren()) {
- result->addChild(pushDownNot(std::make_shared<ExpressionTree>(
- ExpressionTree::Operator::NOT, NodeList{ kid })
- ));
+ result->addChild(pushDownNot(
+ std::make_shared<ExpressionTree>(ExpressionTree::Operator::NOT, NodeList{kid})));
}
return result;
}
case ExpressionTree::Operator::OR: {
TreeNode result(new ExpressionTree(ExpressionTree::Operator::AND));
for (auto& kid : child->getChildren()) {
- result->addChild(pushDownNot(std::make_shared<ExpressionTree>(
- ExpressionTree::Operator::NOT, NodeList{ kid })
- ));
+ result->addChild(pushDownNot(
+ std::make_shared<ExpressionTree>(ExpressionTree::Operator::NOT, NodeList{kid})));
}
return result;
}
@@ -432,8 +401,7 @@ namespace orc {
case ExpressionTree::Operator::LEAF:
case ExpressionTree::Operator::CONSTANT:
default:
- throw std::invalid_argument(
- "Got a maybe as child of " + expr->toString());
+ throw std::invalid_argument("Got a maybe as child of " + expr->toString());
}
} else {
expr->getChildren()[i] = child;
@@ -444,8 +412,9 @@ namespace orc {
if (!children.empty()) {
// eliminate removed maybe nodes from expr
std::vector<TreeNode> nodes;
- std::for_each(children.begin(), children.end(),
- [&](const TreeNode& node){ if (node) nodes.emplace_back(node); });
+ std::for_each(children.begin(), children.end(), [&](const TreeNode& node) {
+ if (node) nodes.emplace_back(node);
+ });
std::swap(children, nodes);
if (children.empty()) {
return std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL);
@@ -462,7 +431,7 @@ namespace orc {
* @return the flattened expression, which will always be root with
* potentially modified children.
*/
- TreeNode SearchArgumentBuilderImpl::flatten(TreeNode root) {
+ TreeNode SearchArgumentBuilderImpl::flatten(TreeNode root) {
if (root) {
std::vector<TreeNode> nodes;
for (size_t i = 0; i != root->getChildren().size(); ++i) {
@@ -524,10 +493,8 @@ namespace orc {
}
}
if (andList.size() > 1) {
- generateAllCombinations(
- result,
- std::vector<TreeNode>(andList.cbegin() + 1, andList.cend()),
- nonAndList);
+ generateAllCombinations(result, std::vector<TreeNode>(andList.cbegin() + 1, andList.cend()),
+ nonAndList);
}
}
@@ -576,8 +543,7 @@ namespace orc {
}
if (!andList.empty()) {
if (checkCombinationsThreshold(andList)) {
- root = std::make_shared<ExpressionTree>(
- ExpressionTree::Operator::AND);
+ root = std::make_shared<ExpressionTree>(ExpressionTree::Operator::AND);
generateAllCombinations(root->getChildren(), andList, nonAndList);
} else {
root = std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL);
@@ -588,17 +554,15 @@ namespace orc {
return root;
}
- SearchArgumentImpl::SearchArgumentImpl(TreeNode root,
- const std::vector<PredicateLeaf>& leaves)
- : mExpressionTree(root)
- , mLeaves(leaves) {
+ SearchArgumentImpl::SearchArgumentImpl(TreeNode root, const std::vector<PredicateLeaf>& leaves)
+ : mExpressionTree(root), mLeaves(leaves) {
// PASS
}
std::unique_ptr<SearchArgument> SearchArgumentBuilderImpl::build() {
if (mCurrTree.size() != 1) {
- throw std::invalid_argument("Failed to end " +
- std::to_string(mCurrTree.size()) + " operations.");
+ throw std::invalid_argument("Failed to end " + std::to_string(mCurrTree.size()) +
+ " operations.");
}
mRoot = pushDownNot(mRoot);
mRoot = foldMaybe(mRoot);
@@ -612,18 +576,17 @@ namespace orc {
std::vector<PredicateLeaf> leafList(newLeafCount, PredicateLeaf());
// build the new list
- for (auto & leaf : mLeaves) {
+ for (auto& leaf : mLeaves) {
size_t newLoc = leafReorder[leaf.second];
if (newLoc != UNUSED_LEAF) {
leafList[newLoc] = leaf.first;
}
}
- return std::unique_ptr<SearchArgument>(
- new SearchArgumentImpl(mRoot, leafList));
+ return std::make_unique<SearchArgumentImpl>(mRoot, leafList);
}
std::unique_ptr<SearchArgumentBuilder> SearchArgumentFactory::newBuilder() {
- return std::unique_ptr<SearchArgumentBuilder>(new SearchArgumentBuilderImpl());
+ return std::make_unique<SearchArgumentBuilderImpl>();
}
-} // namespace orc
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh
index 57d765e1df..4b74b28743 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh
+++ b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh
@@ -19,10 +19,10 @@
#ifndef ORC_SRC_SEARCHARGUMENT_HH
#define ORC_SRC_SEARCHARGUMENT_HH
-#include "wrap/orc-proto-wrapper.hh"
#include "ExpressionTree.hh"
#include "orc/sargs/SearchArgument.hh"
#include "sargs/PredicateLeaf.hh"
+#include "wrap/orc-proto-wrapper.hh"
#include <deque>
#include <stdexcept>
@@ -40,7 +40,7 @@ namespace orc {
* (<a href="http://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF</a>).
*/
class SearchArgumentImpl : public SearchArgument {
- public:
+ public:
SearchArgumentImpl(TreeNode root, const std::vector<PredicateLeaf>& leaves);
/**
@@ -54,7 +54,7 @@ namespace orc {
* Get the expression tree. This should only needed for file formats that
* need to translate the expression to an internal form.
*/
- const ExpressionTree * getExpression() const;
+ const ExpressionTree* getExpression() const;
/**
* Evaluate the entire predicate based on the values for the leaf predicates.
@@ -65,7 +65,7 @@ namespace orc {
std::string toString() const override;
- private:
+ private:
std::shared_ptr<ExpressionTree> mExpressionTree;
std::vector<PredicateLeaf> mLeaves;
};
@@ -75,7 +75,7 @@ namespace orc {
* must call startOr, startAnd, or startNot before adding any leaves.
*/
class SearchArgumentBuilderImpl : public SearchArgumentBuilder {
- public:
+ public:
SearchArgumentBuilderImpl();
/**
@@ -110,8 +110,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- SearchArgumentBuilder& lessThan(const std::string& column,
- PredicateDataType type,
+ SearchArgumentBuilder& lessThan(const std::string& column, PredicateDataType type,
Literal literal) override;
/**
@@ -121,8 +120,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- SearchArgumentBuilder& lessThan(uint64_t columnId,
- PredicateDataType type,
+ SearchArgumentBuilder& lessThan(uint64_t columnId, PredicateDataType type,
Literal literal) override;
/**
@@ -132,8 +130,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- SearchArgumentBuilder& lessThanEquals(const std::string& column,
- PredicateDataType type,
+ SearchArgumentBuilder& lessThanEquals(const std::string& column, PredicateDataType type,
Literal literal) override;
/**
@@ -143,8 +140,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- SearchArgumentBuilder& lessThanEquals(uint64_t columnId,
- PredicateDataType type,
+ SearchArgumentBuilder& lessThanEquals(uint64_t columnId, PredicateDataType type,
Literal literal) override;
/**
@@ -154,8 +150,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- SearchArgumentBuilder& equals(const std::string& column,
- PredicateDataType type,
+ SearchArgumentBuilder& equals(const std::string& column, PredicateDataType type,
Literal literal) override;
/**
@@ -165,8 +160,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- SearchArgumentBuilder& equals(uint64_t columnId,
- PredicateDataType type,
+ SearchArgumentBuilder& equals(uint64_t columnId, PredicateDataType type,
Literal literal) override;
/**
@@ -176,8 +170,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- SearchArgumentBuilder& nullSafeEquals(const std::string& column,
- PredicateDataType type,
+ SearchArgumentBuilder& nullSafeEquals(const std::string& column, PredicateDataType type,
Literal literal) override;
/**
@@ -187,8 +180,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- SearchArgumentBuilder& nullSafeEquals(uint64_t columnId,
- PredicateDataType type,
+ SearchArgumentBuilder& nullSafeEquals(uint64_t columnId, PredicateDataType type,
Literal literal) override;
/**
@@ -198,8 +190,7 @@ namespace orc {
* @param literals the literals
* @return this
*/
- SearchArgumentBuilder& in(const std::string& column,
- PredicateDataType type,
+ SearchArgumentBuilder& in(const std::string& column, PredicateDataType type,
const std::initializer_list<Literal>& literals) override;
/**
@@ -209,8 +200,7 @@ namespace orc {
* @param literals the literals
* @return this
*/
- SearchArgumentBuilder& in(uint64_t columnId,
- PredicateDataType type,
+ SearchArgumentBuilder& in(uint64_t columnId, PredicateDataType type,
const std::initializer_list<Literal>& literals) override;
/**
@@ -220,8 +210,7 @@ namespace orc {
* @param literals the literals
* @return this
*/
- SearchArgumentBuilder& in(const std::string& column,
- PredicateDataType type,
+ SearchArgumentBuilder& in(const std::string& column, PredicateDataType type,
const std::vector<Literal>& literals) override;
/**
@@ -231,8 +220,7 @@ namespace orc {
* @param literals the literals
* @return this
*/
- SearchArgumentBuilder& in(uint64_t columnId,
- PredicateDataType type,
+ SearchArgumentBuilder& in(uint64_t columnId, PredicateDataType type,
const std::vector<Literal>& literals) override;
/**
@@ -241,8 +229,7 @@ namespace orc {
* @param type the type of the expression
* @return this
*/
- SearchArgumentBuilder& isNull(const std::string& column,
- PredicateDataType type) override;
+ SearchArgumentBuilder& isNull(const std::string& column, PredicateDataType type) override;
/**
* Add an is null leaf to the current item on the stack.
@@ -250,8 +237,7 @@ namespace orc {
* @param type the type of the expression
* @return this
*/
- SearchArgumentBuilder& isNull(uint64_t columnId,
- PredicateDataType type) override;
+ SearchArgumentBuilder& isNull(uint64_t columnId, PredicateDataType type) override;
/**
* Add a between leaf to the current item on the stack.
@@ -261,9 +247,7 @@ namespace orc {
* @param upper the literal
* @return this
*/
- SearchArgumentBuilder& between(const std::string& column,
- PredicateDataType type,
- Literal lower,
+ SearchArgumentBuilder& between(const std::string& column, PredicateDataType type, Literal lower,
Literal upper) override;
/**
@@ -274,9 +258,7 @@ namespace orc {
* @param upper the literal
* @return this
*/
- SearchArgumentBuilder& between(uint64_t columnId,
- PredicateDataType type,
- Literal lower,
+ SearchArgumentBuilder& between(uint64_t columnId, PredicateDataType type, Literal lower,
Literal upper) override;
/**
@@ -293,49 +275,40 @@ namespace orc {
*/
std::unique_ptr<SearchArgument> build() override;
- private:
+ private:
SearchArgumentBuilder& start(ExpressionTree::Operator op);
size_t addLeaf(PredicateLeaf leaf);
static bool isInvalidColumn(const std::string& column);
static bool isInvalidColumn(uint64_t columnId);
- template<typename T>
- SearchArgumentBuilder& compareOperator(PredicateLeaf::Operator op,
- T column,
- PredicateDataType type,
- Literal literal);
+ template <typename T>
+ SearchArgumentBuilder& compareOperator(PredicateLeaf::Operator op, T column,
+ PredicateDataType type, Literal literal);
- template<typename T, typename CONTAINER>
- SearchArgumentBuilder& addChildForIn(T column,
- PredicateDataType type,
+ template <typename T, typename CONTAINER>
+ SearchArgumentBuilder& addChildForIn(T column, PredicateDataType type,
const CONTAINER& literals);
- template<typename T>
- SearchArgumentBuilder& addChildForIsNull(T column,
- PredicateDataType type);
+ template <typename T>
+ SearchArgumentBuilder& addChildForIsNull(T column, PredicateDataType type);
- template<typename T>
- SearchArgumentBuilder& addChildForBetween(T column,
- PredicateDataType type,
- Literal lower,
+ template <typename T>
+ SearchArgumentBuilder& addChildForBetween(T column, PredicateDataType type, Literal lower,
Literal upper);
- public:
+ public:
static TreeNode pushDownNot(TreeNode root);
static TreeNode foldMaybe(TreeNode expr);
static TreeNode flatten(TreeNode root);
static TreeNode convertToCNF(TreeNode root);
- private:
+ private:
std::deque<TreeNode> mCurrTree;
- std::unordered_map<PredicateLeaf,
- size_t,
- PredicateLeafHash,
- PredicateLeafComparator> mLeaves;
+ std::unordered_map<PredicateLeaf, size_t, PredicateLeafHash, PredicateLeafComparator> mLeaves;
std::shared_ptr<ExpressionTree> mRoot;
};
-} // namespace orc
+} // namespace orc
-#endif //ORC_SRC_SEARCHARGUMENT_HH
+#endif // ORC_SRC_SEARCHARGUMENT_HH
diff --git a/contrib/libs/apache/orc/c++/src/sargs/TruthValue.cc b/contrib/libs/apache/orc/c++/src/sargs/TruthValue.cc
index fe00ed9472..4b3eda7e90 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/TruthValue.cc
+++ b/contrib/libs/apache/orc/c++/src/sargs/TruthValue.cc
@@ -122,4 +122,4 @@ namespace orc {
}
}
-}
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h
index 605fbf826c..1373c18924 100644
--- a/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h
+++ b/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h
@@ -1,15 +1,20 @@
/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
*/
#ifndef CODED_STREAM_WRAPPER_HH
@@ -20,12 +25,12 @@
DIAGNOSTIC_PUSH
#ifdef __clang__
- DIAGNOSTIC_IGNORE("-Wshorten-64-to-32")
- DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
+DIAGNOSTIC_IGNORE("-Wshorten-64-to-32")
+DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
#endif
#if defined(__GNUC__) || defined(__clang__)
- DIAGNOSTIC_IGNORE("-Wconversion")
+DIAGNOSTIC_IGNORE("-Wconversion")
#endif
#include <google/protobuf/io/coded_stream.h>
diff --git a/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh b/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh
index 5c161660cc..014c7d6570 100644
--- a/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh
+++ b/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh
@@ -1,15 +1,20 @@
/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
*/
#ifndef ORC_PROTO_WRAPPER_HH
@@ -20,27 +25,27 @@
DIAGNOSTIC_PUSH
#if defined(__GNUC__) || defined(__clang__)
- DIAGNOSTIC_IGNORE("-Wconversion")
- DIAGNOSTIC_IGNORE("-Wdeprecated")
- DIAGNOSTIC_IGNORE("-Wsign-conversion")
- DIAGNOSTIC_IGNORE("-Wunused-parameter")
+DIAGNOSTIC_IGNORE("-Wconversion")
+DIAGNOSTIC_IGNORE("-Wdeprecated")
+DIAGNOSTIC_IGNORE("-Wsign-conversion")
+DIAGNOSTIC_IGNORE("-Wunused-parameter")
#endif
#ifdef __clang__
- DIAGNOSTIC_IGNORE("-Wnested-anon-types")
- DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
- DIAGNOSTIC_IGNORE("-Wshorten-64-to-32")
- DIAGNOSTIC_IGNORE("-Wunknown-warning-option")
- DIAGNOSTIC_IGNORE("-Wweak-vtables")
- DIAGNOSTIC_IGNORE("-Wzero-as-null-pointer-constant")
+DIAGNOSTIC_IGNORE("-Wnested-anon-types")
+DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
+DIAGNOSTIC_IGNORE("-Wshorten-64-to-32")
+DIAGNOSTIC_IGNORE("-Wunknown-warning-option")
+DIAGNOSTIC_IGNORE("-Wweak-vtables")
+DIAGNOSTIC_IGNORE("-Wzero-as-null-pointer-constant")
#endif
#if defined(_MSC_VER)
- DIAGNOSTIC_IGNORE(4146) // unary minus operator applied to unsigned type, result still unsigned
- DIAGNOSTIC_IGNORE(4800) // forcing value to bool 'true' or 'false'
+DIAGNOSTIC_IGNORE(4146) // unary minus operator applied to unsigned type, result still unsigned
+DIAGNOSTIC_IGNORE(4800) // forcing value to bool 'true' or 'false'
#endif
-#include "contrib/libs/apache/orc/proto/orc_proto.pb.h"
+#include "orc_proto.pb.h"
DIAGNOSTIC_POP
diff --git a/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h
index aeab0f0033..18166f7200 100644
--- a/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h
+++ b/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h
@@ -1,15 +1,20 @@
/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
*/
#ifndef SNAPPY_WRAPPER_HH
@@ -20,7 +25,7 @@
DIAGNOSTIC_PUSH
#ifdef __clang__
- DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
+DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
#endif
#include <snappy.h>
diff --git a/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h
index 1af0bd002d..0a42daaf84 100644
--- a/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h
+++ b/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h
@@ -1,15 +1,20 @@
/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
*/
#ifndef ZERO_COPY_STREAM_WRAPPER_HH
@@ -20,13 +25,13 @@
DIAGNOSTIC_PUSH
#if defined(__GNUC__) || defined(__clang__)
- DIAGNOSTIC_IGNORE("-Wdeprecated")
- DIAGNOSTIC_IGNORE("-Wpadded")
- DIAGNOSTIC_IGNORE("-Wunused-parameter")
+DIAGNOSTIC_IGNORE("-Wdeprecated")
+DIAGNOSTIC_IGNORE("-Wpadded")
+DIAGNOSTIC_IGNORE("-Wunused-parameter")
#endif
#ifdef __clang__
- DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
+DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
#endif
#include <google/protobuf/io/zero_copy_stream.h>
diff --git a/contrib/libs/apache/orc/proto/orc_proto.proto b/contrib/libs/apache/orc/proto/orc_proto.proto
deleted file mode 100644
index ff05657a54..0000000000
--- a/contrib/libs/apache/orc/proto/orc_proto.proto
+++ /dev/null
@@ -1,451 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-syntax = "proto2";
-
-package orc.proto;
-
-option java_package = "org.apache.orc";
-
-message IntegerStatistics {
- optional sint64 minimum = 1;
- optional sint64 maximum = 2;
- optional sint64 sum = 3;
-}
-
-message DoubleStatistics {
- optional double minimum = 1;
- optional double maximum = 2;
- optional double sum = 3;
-}
-
-message StringStatistics {
- optional string minimum = 1;
- optional string maximum = 2;
- // sum will store the total length of all strings in a stripe
- optional sint64 sum = 3;
- // If the minimum or maximum value was longer than 1024 bytes, store a lower or upper
- // bound instead of the minimum or maximum values above.
- optional string lowerBound = 4;
- optional string upperBound = 5;
-}
-
-message BucketStatistics {
- repeated uint64 count = 1 [packed=true];
-}
-
-message DecimalStatistics {
- optional string minimum = 1;
- optional string maximum = 2;
- optional string sum = 3;
-}
-
-message DateStatistics {
- // min,max values saved as days since epoch
- optional sint32 minimum = 1;
- optional sint32 maximum = 2;
-}
-
-message TimestampStatistics {
- // min,max values saved as milliseconds since epoch
- optional sint64 minimum = 1;
- optional sint64 maximum = 2;
- optional sint64 minimumUtc = 3;
- optional sint64 maximumUtc = 4;
- // store the lower 6 TS digits for min/max to achieve nanosecond precision
- optional int32 minimumNanos = 5;
- optional int32 maximumNanos = 6;
-}
-
-message BinaryStatistics {
- // sum will store the total binary blob length in a stripe
- optional sint64 sum = 1;
-}
-
-// Statistics for list and map
-message CollectionStatistics {
- optional uint64 minChildren = 1;
- optional uint64 maxChildren = 2;
- optional uint64 totalChildren = 3;
-}
-
-message ColumnStatistics {
- optional uint64 numberOfValues = 1;
- optional IntegerStatistics intStatistics = 2;
- optional DoubleStatistics doubleStatistics = 3;
- optional StringStatistics stringStatistics = 4;
- optional BucketStatistics bucketStatistics = 5;
- optional DecimalStatistics decimalStatistics = 6;
- optional DateStatistics dateStatistics = 7;
- optional BinaryStatistics binaryStatistics = 8;
- optional TimestampStatistics timestampStatistics = 9;
- optional bool hasNull = 10;
- optional uint64 bytesOnDisk = 11;
- optional CollectionStatistics collectionStatistics = 12;
-}
-
-message RowIndexEntry {
- repeated uint64 positions = 1 [packed=true];
- optional ColumnStatistics statistics = 2;
-}
-
-message RowIndex {
- repeated RowIndexEntry entry = 1;
-}
-
-message BloomFilter {
- optional uint32 numHashFunctions = 1;
- repeated fixed64 bitset = 2;
- optional bytes utf8bitset = 3;
-}
-
-message BloomFilterIndex {
- repeated BloomFilter bloomFilter = 1;
-}
-
-message Stream {
- // if you add new index stream kinds, you need to make sure to update
- // StreamName to ensure it is added to the stripe in the right area
- enum Kind {
- PRESENT = 0;
- DATA = 1;
- LENGTH = 2;
- DICTIONARY_DATA = 3;
- DICTIONARY_COUNT = 4;
- SECONDARY = 5;
- ROW_INDEX = 6;
- BLOOM_FILTER = 7;
- BLOOM_FILTER_UTF8 = 8;
- // Virtual stream kinds to allocate space for encrypted index and data.
- ENCRYPTED_INDEX = 9;
- ENCRYPTED_DATA = 10;
-
- // stripe statistics streams
- STRIPE_STATISTICS = 100;
- // A virtual stream kind that is used for setting the encryption IV.
- FILE_STATISTICS = 101;
- }
- optional Kind kind = 1;
- optional uint32 column = 2;
- optional uint64 length = 3;
-}
-
-message ColumnEncoding {
- enum Kind {
- DIRECT = 0;
- DICTIONARY = 1;
- DIRECT_V2 = 2;
- DICTIONARY_V2 = 3;
- }
- optional Kind kind = 1;
- optional uint32 dictionarySize = 2;
-
- // The encoding of the bloom filters for this column:
- // 0 or missing = none or original
- // 1 = ORC-135 (utc for timestamps)
- optional uint32 bloomEncoding = 3;
-}
-
-message StripeEncryptionVariant {
- repeated Stream streams = 1;
- repeated ColumnEncoding encoding = 2;
-}
-
-// each stripe looks like:
-// index streams
-// unencrypted
-// variant 1..N
-// data streams
-// unencrypted
-// variant 1..N
-// footer
-
-message StripeFooter {
- repeated Stream streams = 1;
- repeated ColumnEncoding columns = 2;
- optional string writerTimezone = 3;
- // one for each column encryption variant
- repeated StripeEncryptionVariant encryption = 4;
-}
-
-// the file tail looks like:
-// encrypted stripe statistics: ColumnarStripeStatistics (order by variant)
-// stripe statistics: Metadata
-// footer: Footer
-// postscript: PostScript
-// psLen: byte
-
-message StringPair {
- optional string key = 1;
- optional string value = 2;
-}
-
-message Type {
- enum Kind {
- BOOLEAN = 0;
- BYTE = 1;
- SHORT = 2;
- INT = 3;
- LONG = 4;
- FLOAT = 5;
- DOUBLE = 6;
- STRING = 7;
- BINARY = 8;
- TIMESTAMP = 9;
- LIST = 10;
- MAP = 11;
- STRUCT = 12;
- UNION = 13;
- DECIMAL = 14;
- DATE = 15;
- VARCHAR = 16;
- CHAR = 17;
- TIMESTAMP_INSTANT = 18;
- }
- optional Kind kind = 1;
- repeated uint32 subtypes = 2 [packed=true];
- repeated string fieldNames = 3;
- optional uint32 maximumLength = 4;
- optional uint32 precision = 5;
- optional uint32 scale = 6;
- repeated StringPair attributes = 7;
-}
-
-message StripeInformation {
- // the global file offset of the start of the stripe
- optional uint64 offset = 1;
- // the number of bytes of index
- optional uint64 indexLength = 2;
- // the number of bytes of data
- optional uint64 dataLength = 3;
- // the number of bytes in the stripe footer
- optional uint64 footerLength = 4;
- // the number of rows in this stripe
- optional uint64 numberOfRows = 5;
- // If this is present, the reader should use this value for the encryption
- // stripe id for setting the encryption IV. Otherwise, the reader should
- // use one larger than the previous stripe's encryptStripeId.
- // For unmerged ORC files, the first stripe will use 1 and the rest of the
- // stripes won't have it set. For merged files, the stripe information
- // will be copied from their original files and thus the first stripe of
- // each of the input files will reset it to 1.
- // Note that 1 was choosen, because protobuf v3 doesn't serialize
- // primitive types that are the default (eg. 0).
- optional uint64 encryptStripeId = 6;
- // For each encryption variant, the new encrypted local key to use
- // until we find a replacement.
- repeated bytes encryptedLocalKeys = 7;
-}
-
-message UserMetadataItem {
- optional string name = 1;
- optional bytes value = 2;
-}
-
-// StripeStatistics (1 per a stripe), which each contain the
-// ColumnStatistics for each column.
-// This message type is only used in ORC v0 and v1.
-message StripeStatistics {
- repeated ColumnStatistics colStats = 1;
-}
-
-// This message type is only used in ORC v0 and v1.
-message Metadata {
- repeated StripeStatistics stripeStats = 1;
-}
-
-// In ORC v2 (and for encrypted columns in v1), each column has
-// their column statistics written separately.
-message ColumnarStripeStatistics {
- // one value for each stripe in the file
- repeated ColumnStatistics colStats = 1;
-}
-
-enum EncryptionAlgorithm {
- UNKNOWN_ENCRYPTION = 0; // used for detecting future algorithms
- AES_CTR_128 = 1;
- AES_CTR_256 = 2;
-}
-
-message FileStatistics {
- repeated ColumnStatistics column = 1;
-}
-
-// How was the data masked? This isn't necessary for reading the file, but
-// is documentation about how the file was written.
-message DataMask {
- // the kind of masking, which may include third party masks
- optional string name = 1;
- // parameters for the mask
- repeated string maskParameters = 2;
- // the unencrypted column roots this mask was applied to
- repeated uint32 columns = 3 [packed = true];
-}
-
-// Information about the encryption keys.
-message EncryptionKey {
- optional string keyName = 1;
- optional uint32 keyVersion = 2;
- optional EncryptionAlgorithm algorithm = 3;
-}
-
-// The description of an encryption variant.
-// Each variant is a single subtype that is encrypted with a single key.
-message EncryptionVariant {
- // the column id of the root
- optional uint32 root = 1;
- // The master key that was used to encrypt the local key, referenced as
- // an index into the Encryption.key list.
- optional uint32 key = 2;
- // the encrypted key for the file footer
- optional bytes encryptedKey = 3;
- // the stripe statistics for this variant
- repeated Stream stripeStatistics = 4;
- // encrypted file statistics as a FileStatistics
- optional bytes fileStatistics = 5;
-}
-
-// Which KeyProvider encrypted the local keys.
-enum KeyProviderKind {
- UNKNOWN = 0;
- HADOOP = 1;
- AWS = 2;
- GCP = 3;
- AZURE = 4;
-}
-
-message Encryption {
- // all of the masks used in this file
- repeated DataMask mask = 1;
- // all of the keys used in this file
- repeated EncryptionKey key = 2;
- // The encrypted variants.
- // Readers should prefer the first variant that the user has access to
- // the corresponding key. If they don't have access to any of the keys,
- // they should get the unencrypted masked data.
- repeated EncryptionVariant variants = 3;
- // How are the local keys encrypted?
- optional KeyProviderKind keyProvider = 4;
-}
-
-enum CalendarKind {
- UNKNOWN_CALENDAR = 0;
- // A hybrid Julian/Gregorian calendar with a cutover point in October 1582.
- JULIAN_GREGORIAN = 1;
- // A calendar that extends the Gregorian calendar back forever.
- PROLEPTIC_GREGORIAN = 2;
-}
-
-message Footer {
- optional uint64 headerLength = 1;
- optional uint64 contentLength = 2;
- repeated StripeInformation stripes = 3;
- repeated Type types = 4;
- repeated UserMetadataItem metadata = 5;
- optional uint64 numberOfRows = 6;
- repeated ColumnStatistics statistics = 7;
- optional uint32 rowIndexStride = 8;
-
- // Each implementation that writes ORC files should register for a code
- // 0 = ORC Java
- // 1 = ORC C++
- // 2 = Presto
- // 3 = Scritchley Go from https://github.com/scritchley/orc
- // 4 = Trino
- optional uint32 writer = 9;
-
- // information about the encryption in this file
- optional Encryption encryption = 10;
- optional CalendarKind calendar = 11;
-
- // informative description about the version of the software that wrote
- // the file. It is assumed to be within a given writer, so for example
- // ORC 1.7.2 = "1.7.2". It may include suffixes, such as "-SNAPSHOT".
- optional string softwareVersion = 12;
-}
-
-enum CompressionKind {
- NONE = 0;
- ZLIB = 1;
- SNAPPY = 2;
- LZO = 3;
- LZ4 = 4;
- ZSTD = 5;
-}
-
-// Serialized length must be less that 255 bytes
-message PostScript {
- optional uint64 footerLength = 1;
- optional CompressionKind compression = 2;
- optional uint64 compressionBlockSize = 3;
- // the version of the file format
- // [0, 11] = Hive 0.11
- // [0, 12] = Hive 0.12
- repeated uint32 version = 4 [packed = true];
- optional uint64 metadataLength = 5;
-
- // The version of the writer that wrote the file. This number is
- // updated when we make fixes or large changes to the writer so that
- // readers can detect whether a given bug is present in the data.
- //
- // Only the Java ORC writer may use values under 6 (or missing) so that
- // readers that predate ORC-202 treat the new writers correctly. Each
- // writer should assign their own sequence of versions starting from 6.
- //
- // Version of the ORC Java writer:
- // 0 = original
- // 1 = HIVE-8732 fixed (fixed stripe/file maximum statistics &
- // string statistics use utf8 for min/max)
- // 2 = HIVE-4243 fixed (use real column names from Hive tables)
- // 3 = HIVE-12055 added (vectorized writer implementation)
- // 4 = HIVE-13083 fixed (decimals write present stream correctly)
- // 5 = ORC-101 fixed (bloom filters use utf8 consistently)
- // 6 = ORC-135 fixed (timestamp statistics use utc)
- // 7 = ORC-517 fixed (decimal64 min/max incorrect)
- // 8 = ORC-203 added (trim very long string statistics)
- // 9 = ORC-14 added (column encryption)
- //
- // Version of the ORC C++ writer:
- // 6 = original
- //
- // Version of the Presto writer:
- // 6 = original
- //
- // Version of the Scritchley Go writer:
- // 6 = original
- //
- // Version of the Trino writer:
- // 6 = original
- //
- optional uint32 writerVersion = 6;
-
- // the number of bytes in the encrypted stripe statistics
- optional uint64 stripeStatisticsLength = 7;
-
- // Leave this last in the record
- optional string magic = 8000;
-}
-
-// The contents of the file tail that must be serialized.
-// This gets serialized as part of OrcSplit, also used by footer cache.
-message FileTail {
- optional PostScript postscript = 1;
- optional Footer footer = 2;
- optional uint64 fileLength = 3;
- optional uint64 postscriptLength = 4;
-}
diff --git a/contrib/libs/apache/orc/ya.make b/contrib/libs/apache/orc/ya.make
index be3b4d5a01..ec4d745340 100644
--- a/contrib/libs/apache/orc/ya.make
+++ b/contrib/libs/apache/orc/ya.make
@@ -6,11 +6,12 @@ LICENSE(Apache-2.0)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-VERSION(1.8.0)
+VERSION(2.0.0)
-ORIGINAL_SOURCE(https://github.com/apache/orc/archive/rel/release-1.8.0.tar.gz)
+ORIGINAL_SOURCE(https://github.com/apache/orc/archive/rel/release-2.0.0.tar.gz)
PEERDIR(
+ contrib/libs/apache/orc-format
contrib/libs/lz4
contrib/libs/snappy
contrib/libs/zlib
@@ -20,7 +21,6 @@ PEERDIR(
ADDINCL(
GLOBAL contrib/libs/apache/orc/c++/include
contrib/libs/apache/orc/c++/src
- contrib/libs/apache/orc/proto
contrib/libs/lz4
contrib/libs/zstd/include
)
@@ -29,15 +29,23 @@ NO_COMPILER_WARNINGS()
NO_UTIL()
+CFLAGS(
+ -DENABLE_METRICS=0
+)
+
SRCS(
c++/src/Adaptor.cc
+ c++/src/BlockBuffer.cc
c++/src/BloomFilter.cc
+ c++/src/BpackingDefault.cc
c++/src/ByteRLE.cc
c++/src/ColumnPrinter.cc
c++/src/ColumnReader.cc
c++/src/ColumnWriter.cc
c++/src/Common.cc
c++/src/Compression.cc
+ c++/src/ConvertColumnReader.cc
+ c++/src/CpuInfoUtil.cc
c++/src/Exceptions.cc
c++/src/Int128.cc
c++/src/LzoDecompressor.cc
@@ -50,6 +58,7 @@ SRCS(
c++/src/Reader.cc
c++/src/RleDecoderV2.cc
c++/src/RleEncoderV2.cc
+ c++/src/SchemaEvolution.cc
c++/src/Statistics.cc
c++/src/StripeStream.cc
c++/src/Timezone.cc
@@ -64,7 +73,6 @@ SRCS(
c++/src/sargs/SargsApplier.cc
c++/src/sargs/SearchArgument.cc
c++/src/sargs/TruthValue.cc
- proto/orc_proto.proto
)
END()