diff options
author | thegeorg <thegeorg@yandex-team.com> | 2024-03-17 04:47:32 +0300 |
---|---|---|
committer | thegeorg <thegeorg@yandex-team.com> | 2024-03-17 04:57:12 +0300 |
commit | 0816a937aebb4bb8ff5d68730c625cb1c99c9b4b (patch) | |
tree | 45dd2b2d18017590838384a1a7687279ac280444 /contrib/libs/apache/orc | |
parent | 6d5eb3aff8e43031b7dcb8be42d649799cd8a6c3 (diff) | |
download | ydb-0816a937aebb4bb8ff5d68730c625cb1c99c9b4b.tar.gz |
Update contrib/libs/apache/orc to 2.0.0
28031d32eb02ad8a790abc416b7db3264738c474
Diffstat (limited to 'contrib/libs/apache/orc')
94 files changed, 10066 insertions, 8196 deletions
diff --git a/contrib/libs/apache/orc/README.md b/contrib/libs/apache/orc/README.md index a7d959247e..60b0da5fcb 100644 --- a/contrib/libs/apache/orc/README.md +++ b/contrib/libs/apache/orc/README.md @@ -18,9 +18,9 @@ lists, maps, and unions. This project includes both a Java library and a C++ library for reading and writing the _Optimized Row Columnar_ (ORC) file format. The C++ and Java libraries are completely independent of each other and will each read all versions of ORC files. Releases: -* Latest: <a href="http://orc.apache.org/releases">Apache ORC releases</a> -* Maven Central: <a href="http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.orc%22">![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.apache.orc/orc/badge.svg)</a> -* Downloads: <a href="http://orc.apache.org/downloads">Apache ORC downloads</a> +* Latest: <a href="https://orc.apache.org/releases">Apache ORC releases</a> +* Maven Central: <a href="https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.orc%22">![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.apache.orc/orc/badge.svg)</a> +* Downloads: <a href="https://orc.apache.org/downloads">Apache ORC downloads</a> * Release tags: <a href="https://github.com/apache/orc/releases">Apache ORC release tags</a> * Plan: <a href="https://github.com/apache/orc/milestones">Apache ORC future release plan</a> @@ -28,7 +28,7 @@ The current build status: * Main branch <a href="https://github.com/apache/orc/actions/workflows/build_and_test.yml?query=branch%3Amain"> ![main build status](https://github.com/apache/orc/actions/workflows/build_and_test.yml/badge.svg?branch=main)</a> -Bug tracking: <a href="http://orc.apache.org/bugs">Apache Jira</a> +Bug tracking: <a href="https://orc.apache.org/bugs">Apache Jira</a> The subdirectories are: @@ -37,15 +37,14 @@ The subdirectories are: * docker - docker scripts to build and test on various linuxes * examples - various ORC example files that are used to test 
compatibility * java - the java reader and writer -* proto - the protocol buffer definition for the ORC metadata * site - the website and documentation * tools - the c++ tools for reading and inspecting ORC files ### Building -* Install java 1.8 or higher -* Install maven 3.8.6 or higher -* Install cmake +* Install java 17 or higher +* Install maven 3.9.6 or higher +* Install cmake 3.12 or higher To build a release version with debug information: ```shell @@ -93,3 +92,18 @@ To build only the C++ library: % make test-out ``` + +To build the C++ library with AVX512 enabled: +```shell +export ORC_USER_SIMD_LEVEL=AVX512 +% mkdir build +% cd build +% cmake .. -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON +% make package +% make test-out +``` +Cmake option BUILD_ENABLE_AVX512 can be set to "ON" or (default value)"OFF" at the compile time. At compile time, it defines the SIMD level(AVX512) to be compiled into the binaries. + +Environment variable ORC_USER_SIMD_LEVEL can be set to "AVX512" or (default value)"NONE" at the run time. At run time, it defines the SIMD level to dispatch the code which can apply SIMD optimization. + +Note that if ORC_USER_SIMD_LEVEL is set to "NONE" at run time, AVX512 will not take effect at run time even if BUILD_ENABLE_AVX512 is set to "ON" at compile time. 
diff --git a/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh b/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh index 91277392c7..d08f6deac7 100644 --- a/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh +++ b/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh @@ -27,11 +27,11 @@ namespace orc { class BloomFilter { - public: + public: virtual ~BloomFilter(); // test if the element exists in BloomFilter - virtual bool testBytes(const char * data, int64_t length) const = 0; + virtual bool testBytes(const char* data, int64_t length) const = 0; virtual bool testLong(int64_t data) const = 0; virtual bool testDouble(double data) const = 0; }; @@ -40,6 +40,6 @@ namespace orc { std::vector<std::shared_ptr<BloomFilter>> entries; }; -} +} // namespace orc -#endif //ORC_BLOOMFILTER_HH +#endif // ORC_BLOOMFILTER_HH diff --git a/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh b/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh index aa19214738..328c0e84b6 100644 --- a/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh +++ b/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh @@ -19,12 +19,11 @@ #ifndef ORC_COLUMN_PRINTER_HH #define ORC_COLUMN_PRINTER_HH -#include "orc/orc-config.hh" #include "orc/OrcFile.hh" #include "orc/Vector.hh" +#include "orc/orc-config.hh" #include <stdio.h> -#include <string> #include <memory> #include <string> #include <vector> @@ -32,12 +31,12 @@ namespace orc { class ColumnPrinter { - protected: - std::string &buffer; - bool hasNulls ; + protected: + std::string& buffer; + bool hasNulls; const char* notNull; - public: + public: ColumnPrinter(std::string&); virtual ~ColumnPrinter(); virtual void printRow(uint64_t rowId) = 0; @@ -45,7 +44,6 @@ namespace orc { virtual void reset(const ColumnVectorBatch& batch); }; - ORC_UNIQUE_PTR<ColumnPrinter> createColumnPrinter(std::string&, - const Type* type); -} + std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string&, const Type* type); +} // namespace orc 
#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Common.hh b/contrib/libs/apache/orc/c++/include/orc/Common.hh index e51e37e710..9da67a3f19 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Common.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Common.hh @@ -19,47 +19,45 @@ #ifndef ORC_COMMON_HH #define ORC_COMMON_HH -#include "orc/Vector.hh" -#include "orc/Type.hh" #include "orc/Exceptions.hh" +#include "orc/Type.hh" +#include "orc/Vector.hh" #include <string> namespace orc { class FileVersion { - private: + private: uint32_t majorVersion; uint32_t minorVersion; - public: + + public: static const FileVersion& v_0_11(); static const FileVersion& v_0_12(); static const FileVersion& UNSTABLE_PRE_2_0(); - FileVersion(uint32_t major, uint32_t minor) : - majorVersion(major), minorVersion(minor) { - } + FileVersion(uint32_t major, uint32_t minor) : majorVersion(major), minorVersion(minor) {} /** * Get major version */ uint32_t getMajor() const { - return this->majorVersion; + return this->majorVersion; } /** * Get minor version */ uint32_t getMinor() const { - return this->minorVersion; + return this->minorVersion; } - bool operator == (const FileVersion & right) const { - return this->majorVersion == right.getMajor() && - this->minorVersion == right.getMinor(); + bool operator==(const FileVersion& right) const { + return this->majorVersion == right.getMajor() && this->minorVersion == right.getMinor(); } - bool operator != (const FileVersion & right) const { + bool operator!=(const FileVersion& right) const { return !(*this == right); } @@ -72,6 +70,7 @@ namespace orc { PRESTO_WRITER = 2, SCRITCHLEY_GO = 3, TRINO_WRITER = 4, + CUDF_WRITER = 5, UNKNOWN_WRITER = INT32_MAX }; @@ -140,7 +139,7 @@ namespace orc { std::string streamKindToString(StreamKind kind); class StreamInformation { - public: + public: virtual ~StreamInformation(); virtual StreamKind getKind() const = 0; @@ -159,7 +158,7 @@ namespace orc { std::string 
columnEncodingKindToString(ColumnEncodingKind kind); class StripeInformation { - public: + public: virtual ~StripeInformation(); /** @@ -184,7 +183,7 @@ namespace orc { * Get the length of the stripe's data. * @return the number of bytes in the stripe */ - virtual uint64_t getDataLength()const = 0; + virtual uint64_t getDataLength() const = 0; /** * Get the length of the stripe's tail section, which contains its index. @@ -206,8 +205,7 @@ namespace orc { /** * Get the StreamInformation for the given stream. */ - virtual ORC_UNIQUE_PTR<StreamInformation> - getStreamInformation(uint64_t streamId) const = 0; + virtual std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId) const = 0; /** * Get the column encoding for the given column. @@ -238,10 +236,8 @@ namespace orc { template <> inline bool compare(Decimal val1, Decimal val2) { // compare integral parts - Int128 integral1 = scaleDownInt128ByPowerOfTen(val1.value, - val1.scale); - Int128 integral2 = scaleDownInt128ByPowerOfTen(val2.value, - val2.scale); + Int128 integral1 = scaleDownInt128ByPowerOfTen(val1.value, val1.scale); + Int128 integral2 = scaleDownInt128ByPowerOfTen(val2.value, val2.scale); if (integral1 < integral2) { return true; @@ -253,25 +249,17 @@ namespace orc { // unnecessary to check overflow here because the scaled number will not // exceed original ones bool overflow = false, positive = val1.value >= 0; - val1.value -= scaleUpInt128ByPowerOfTen(integral1, - val1.scale, - overflow); - val2.value -= scaleUpInt128ByPowerOfTen(integral2, - val2.scale, - overflow); + val1.value -= scaleUpInt128ByPowerOfTen(integral1, val1.scale, overflow); + val2.value -= scaleUpInt128ByPowerOfTen(integral2, val2.scale, overflow); int32_t diff = val1.scale - val2.scale; if (diff > 0) { - val2.value = scaleUpInt128ByPowerOfTen(val2.value, - diff, - overflow); + val2.value = scaleUpInt128ByPowerOfTen(val2.value, diff, overflow); if (overflow) { return positive ? 
true : false; } } else { - val1.value = scaleUpInt128ByPowerOfTen(val1.value, - -diff, - overflow); + val1.value = scaleUpInt128ByPowerOfTen(val1.value, -diff, overflow); if (overflow) { return positive ? false : true; } @@ -317,6 +305,6 @@ namespace orc { return !(lhs != rhs); } -} +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh b/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh index 9765d4fd6b..0536dbd164 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh @@ -26,35 +26,47 @@ namespace orc { - class NotImplementedYet: public std::logic_error { - public: + class NotImplementedYet : public std::logic_error { + public: explicit NotImplementedYet(const std::string& what_arg); explicit NotImplementedYet(const char* what_arg); - virtual ~NotImplementedYet() ORC_NOEXCEPT; + ~NotImplementedYet() noexcept override; NotImplementedYet(const NotImplementedYet&); - private: + + private: NotImplementedYet& operator=(const NotImplementedYet&); }; - class ParseError: public std::runtime_error { - public: + class ParseError : public std::runtime_error { + public: explicit ParseError(const std::string& what_arg); explicit ParseError(const char* what_arg); - virtual ~ParseError() ORC_NOEXCEPT; + ~ParseError() noexcept override; ParseError(const ParseError&); - private: + + private: ParseError& operator=(const ParseError&); }; - class InvalidArgument: public std::runtime_error { - public: + class InvalidArgument : public std::runtime_error { + public: explicit InvalidArgument(const std::string& what_arg); explicit InvalidArgument(const char* what_arg); - virtual ~InvalidArgument() ORC_NOEXCEPT; + ~InvalidArgument() noexcept override; InvalidArgument(const InvalidArgument&); - private: + + private: InvalidArgument& operator=(const InvalidArgument&); }; -} + + class SchemaEvolutionError : public std::logic_error { + public: + explicit 
SchemaEvolutionError(const std::string& what_arg); + explicit SchemaEvolutionError(const char* what_arg); + virtual ~SchemaEvolutionError() noexcept override; + SchemaEvolutionError(const SchemaEvolutionError&); + SchemaEvolutionError& operator=(const SchemaEvolutionError&) = delete; + }; +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Int128.hh b/contrib/libs/apache/orc/c++/include/orc/Int128.hh index 1f68b2b119..bcb4a58e22 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Int128.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Int128.hh @@ -35,7 +35,7 @@ namespace orc { * */ class Int128 { - public: + public: Int128() { highbits = 0; lowbits = 0; @@ -110,7 +110,7 @@ namespace orc { * @param right the number to add * @return *this */ - Int128& operator+=(const Int128 &right) { + Int128& operator+=(const Int128& right) { uint64_t sum = lowbits + right.lowbits; highbits += right.highbits; if (sum < lowbits) { @@ -125,7 +125,7 @@ namespace orc { * @param right the number to subtract * @return *this */ - Int128& operator-=(const Int128 &right) { + Int128& operator-=(const Int128& right) { uint64_t diff = lowbits - right.lowbits; highbits -= right.highbits; if (diff > lowbits) { @@ -140,7 +140,7 @@ namespace orc { * @param right the number to multiply by * @return *this */ - Int128& operator*=(const Int128 &right); + Int128& operator*=(const Int128& right); /** * Divide this number by right and return the result. This operation is @@ -154,14 +154,14 @@ namespace orc { * @param right the number to divide by * @param remainder the remainder after the division */ - Int128 divide(const Int128 &right, Int128& remainder) const; + Int128 divide(const Int128& right, Int128& remainder) const; /** * Logical or between two Int128. 
* @param right the number to or in * @return *this */ - Int128& operator|=(const Int128 &right) { + Int128& operator|=(const Int128& right) { lowbits |= right.lowbits; highbits |= right.highbits; return *this; @@ -172,7 +172,7 @@ namespace orc { * @param right the number to and in * @return *this */ - Int128& operator&=(const Int128 &right) { + Int128& operator&=(const Int128& right) { lowbits &= right.lowbits; highbits &= right.highbits; return *this; @@ -183,7 +183,7 @@ namespace orc { * @param right the number to and in * @return logical and result */ - Int128 operator&(const Int128 &right) { + Int128 operator&(const Int128& right) { Int128 value = *this; value &= right; return value; @@ -219,8 +219,7 @@ namespace orc { if (bits < 64) { lowbits >>= bits; lowbits |= static_cast<uint64_t>(highbits << (64 - bits)); - highbits = static_cast<int64_t> - (static_cast<uint64_t>(highbits) >> bits); + highbits = static_cast<int64_t>(static_cast<uint64_t>(highbits) >> bits); } else if (bits < 128) { lowbits = static_cast<uint64_t>(highbits >> (bits - 64)); highbits = highbits >= 0 ? 
0 : -1l; @@ -240,7 +239,7 @@ namespace orc { return highbits != right.highbits || lowbits != right.lowbits; } - bool operator<(const Int128 &right) const { + bool operator<(const Int128& right) const { if (highbits == right.highbits) { return lowbits < right.lowbits; } else { @@ -248,7 +247,7 @@ namespace orc { } } - bool operator<=(const Int128 &right) const { + bool operator<=(const Int128& right) const { if (highbits == right.highbits) { return lowbits <= right.lowbits; } else { @@ -256,7 +255,7 @@ namespace orc { } } - bool operator>(const Int128 &right) const { + bool operator>(const Int128& right) const { if (highbits == right.highbits) { return lowbits > right.lowbits; } else { @@ -264,7 +263,7 @@ namespace orc { } } - bool operator>=(const Int128 &right) const { + bool operator>=(const Int128& right) const { if (highbits == right.highbits) { return lowbits >= right.lowbits; } else { @@ -273,10 +272,8 @@ namespace orc { } uint32_t hash() const { - return static_cast<uint32_t>(highbits >> 32) ^ - static_cast<uint32_t>(highbits) ^ - static_cast<uint32_t>(lowbits >> 32) ^ - static_cast<uint32_t>(lowbits); + return static_cast<uint32_t>(highbits >> 32) ^ static_cast<uint32_t>(highbits) ^ + static_cast<uint32_t>(lowbits >> 32) ^ static_cast<uint32_t>(lowbits); } /** @@ -284,17 +281,17 @@ namespace orc { */ bool fitsInLong() const { switch (highbits) { - case 0: - return 0 == (lowbits & LONG_SIGN_BIT); - case -1: - return 0 != (lowbits & LONG_SIGN_BIT); - default: - return false; + case 0: + return 0 == (lowbits & LONG_SIGN_BIT); + case -1: + return 0 != (lowbits & LONG_SIGN_BIT); + default: + return false; } } /** - * Convert the value to a long and + * Convert the value to a long and throw std::range_error on overflow. */ int64_t toLong() const { if (fitsInLong()) { @@ -304,6 +301,11 @@ namespace orc { } /** + * Convert the value to a double, the return value may not be precise. 
+ */ + double toDouble() const; + + /** * Return the base 10 string representation of the integer. */ std::string toString() const; @@ -316,8 +318,7 @@ namespace orc { * @param trimTrailingZeros whether or not to trim trailing zeros * @return converted string representation */ - std::string toDecimalString(int32_t scale = 0, - bool trimTrailingZeros = false) const; + std::string toDecimalString(int32_t scale = 0, bool trimTrailingZeros = false) const; /** * Return the base 16 string representation of the two's complement with @@ -329,14 +330,14 @@ namespace orc { /** * Get the high bits of the twos complement representation of the number. */ - int64_t getHighBits() { + int64_t getHighBits() const { return highbits; } /** * Get the low bits of the twos complement representation of the number. */ - uint64_t getLowBits() { + uint64_t getLowBits() const { return lowbits; } @@ -347,15 +348,14 @@ namespace orc { * @param wasNegative set to true if the original number was negative * @return the number of elements that were set in the array (1 to 4) */ - int64_t fillInArray(uint32_t* array, bool &wasNegative) const; + int64_t fillInArray(uint32_t* array, bool& wasNegative) const; - private: + private: static const uint64_t LONG_SIGN_BIT = 0x8000000000000000u; int64_t highbits; uint64_t lowbits; }; - /** * Scales up an Int128 value * @param value the Int128 value to scale @@ -363,9 +363,7 @@ namespace orc { * @param overflow returns whether the result overflows or not * @return the scaled value */ - Int128 scaleUpInt128ByPowerOfTen(Int128 value, - int32_t power, - bool &overflow); + Int128 scaleUpInt128ByPowerOfTen(Int128 value, int32_t power, bool& overflow); /** * Scales down an Int128 value * @param value the Int128 value to scale @@ -373,5 +371,35 @@ namespace orc { * @return the scaled value */ Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power); -} + + /** + * Converts decimal value to different precision/scale + * @param value the Int128 value to convert 
+ * @param fromScale the scale of the value + * @param toPrecision the precision to convert to + * @param toScale the scale to convert to + * @param round whether to round the value or truncate + * @return whether the conversion overflows and the converted value if does not overflow + */ + std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale, int32_t toPrecision, + int32_t toScale, bool round = true); + + /** + * Converts a float value to decimal + * @param value the float value to convert + * @param precision the precision of the decimal + * @param scale the scale of the decimal + * @return whether the conversion overflows and the converted value if does not overflow + */ + template <typename T> + std::enable_if_t<std::is_floating_point_v<T>, std::pair<bool, Int128>> convertDecimal( + T value, int32_t precision, int32_t scale); + + extern template std::pair<bool, Int128> convertDecimal<float>(float value, int32_t precision, + int32_t scale); + + extern template std::pair<bool, Int128> convertDecimal<double>(double value, int32_t precision, + int32_t scale); + +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh b/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh index 71d76c438a..6d999d3aa8 100644 --- a/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh +++ b/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh @@ -19,15 +19,13 @@ #ifndef MEMORYPOOL_HH_ #define MEMORYPOOL_HH_ -#include "orc/orc-config.hh" -#include "orc/Int128.hh" - #include <memory> - +#include "orc/Int128.hh" +#include "orc/orc-config.hh" namespace orc { class MemoryPool { - public: + public: virtual ~MemoryPool(); virtual char* malloc(uint64_t size) = 0; @@ -37,7 +35,7 @@ namespace orc { template <class T> class DataBuffer { - private: + private: MemoryPool& memoryPool; T* buf; // current size @@ -49,10 +47,10 @@ namespace orc { DataBuffer(DataBuffer& buffer); DataBuffer& operator=(DataBuffer& buffer); - public: + public: 
DataBuffer(MemoryPool& pool, uint64_t _size = 0); - DataBuffer(DataBuffer<T>&& buffer) ORC_NOEXCEPT; + DataBuffer(DataBuffer<T>&& buffer) noexcept; virtual ~DataBuffer(); @@ -64,20 +62,25 @@ namespace orc { return buf; } - uint64_t size() { + uint64_t size() const { return currentSize; } - uint64_t capacity() { + uint64_t capacity() const { return currentCapacity; } + const T& operator[](uint64_t i) const { + return buf[i]; + } + T& operator[](uint64_t i) { return buf[i]; } void reserve(uint64_t _size); void resize(uint64_t _size); + void zeroOut(); }; // Specializations for char @@ -104,6 +107,14 @@ namespace orc { template <> void DataBuffer<double>::resize(uint64_t newSize); + // Specializations for float + + template <> + DataBuffer<float>::~DataBuffer(); + + template <> + void DataBuffer<float>::resize(uint64_t newSize); + // Specializations for int64_t template <> @@ -112,6 +123,30 @@ namespace orc { template <> void DataBuffer<int64_t>::resize(uint64_t newSize); + // Specializations for int32_t + + template <> + DataBuffer<int32_t>::~DataBuffer(); + + template <> + void DataBuffer<int32_t>::resize(uint64_t newSize); + + // Specializations for int16_t + + template <> + DataBuffer<int16_t>::~DataBuffer(); + + template <> + void DataBuffer<int16_t>::resize(uint64_t newSize); + + // Specializations for int8_t + + template <> + DataBuffer<int8_t>::~DataBuffer(); + + template <> + void DataBuffer<int8_t>::resize(uint64_t newSize); + // Specializations for uint64_t template <> @@ -128,23 +163,31 @@ namespace orc { template <> void DataBuffer<unsigned char>::resize(uint64_t newSize); - #ifdef __clang__ - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wweak-template-vtables" - #endif + // Specializations for Int128 + + template <> + void DataBuffer<Int128>::zeroOut(); + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wweak-template-vtables" +#endif extern template class DataBuffer<char>; extern template class 
DataBuffer<char*>; extern template class DataBuffer<double>; + extern template class DataBuffer<float>; extern template class DataBuffer<Int128>; extern template class DataBuffer<int64_t>; + extern template class DataBuffer<int32_t>; + extern template class DataBuffer<int16_t>; + extern template class DataBuffer<int8_t>; extern template class DataBuffer<uint64_t>; extern template class DataBuffer<unsigned char>; - #ifdef __clang__ - #pragma clang diagnostic pop - #endif -} // namespace orc - +#ifdef __clang__ +#pragma clang diagnostic pop +#endif +} // namespace orc #endif /* MEMORYPOOL_HH_ */ diff --git a/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh b/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh index c64853168a..6e4a07bf7c 100644 --- a/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh +++ b/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh @@ -21,9 +21,9 @@ #include <string> -#include "orc/orc-config.hh" #include "orc/Reader.hh" #include "orc/Writer.hh" +#include "orc/orc-config.hh" /** /file orc/OrcFile.hh @brief The top level interface to ORC. @@ -35,7 +35,7 @@ namespace orc { * An abstract interface for providing ORC readers a stream of bytes. */ class InputStream { - public: + public: virtual ~InputStream(); /** @@ -56,9 +56,7 @@ namespace orc { * @param length the number of bytes to read. * @param offset the position in the stream to read from. */ - virtual void read(void* buf, - uint64_t length, - uint64_t offset) = 0; + virtual void read(void* buf, uint64_t length, uint64_t offset) = 0; /** * Get the name of the stream for error messages. @@ -70,7 +68,7 @@ namespace orc { * An abstract interface for providing ORC writer a stream of bytes. */ class OutputStream { - public: + public: virtual ~OutputStream(); /** @@ -100,38 +98,50 @@ namespace orc { * Close the stream and flush any pending data to the disk. */ virtual void close() = 0; + + /** + * Flush any pending data to the disk. 
+ */ + virtual void flush() { + throw NotImplementedYet("Not supported"); + } }; /** * Create a stream to a local file or HDFS file if path begins with "hdfs://" * @param path the name of the file in the local file system or HDFS + * @param metrics the metrics of the reader */ - ORC_UNIQUE_PTR<InputStream> readFile(const std::string& path); + std::unique_ptr<InputStream> readFile(const std::string& path, ReaderMetrics* metrics = nullptr); /** * Create a stream to a local file. * @param path the name of the file in the local file system + * @param metrics the metrics of the reader */ - ORC_UNIQUE_PTR<InputStream> readLocalFile(const std::string& path); + std::unique_ptr<InputStream> readLocalFile(const std::string& path, + ReaderMetrics* metrics = nullptr); /** * Create a stream to an HDFS file. * @param path the uri of the file in HDFS + * @param metrics the metrics of the reader */ - ORC_UNIQUE_PTR<InputStream> readHdfsFile(const std::string& path); + std::unique_ptr<InputStream> readHdfsFile(const std::string& path, + ReaderMetrics* metrics = nullptr); /** * Create a reader to read the ORC file. * @param stream the stream to read * @param options the options for reading the file */ - ORC_UNIQUE_PTR<Reader> createReader(ORC_UNIQUE_PTR<InputStream> stream, - const ReaderOptions& options); + std::unique_ptr<Reader> createReader(std::unique_ptr<InputStream> stream, + const ReaderOptions& options); /** * Create a stream to write to a local file. * @param path the name of the file in the local file system */ - ORC_UNIQUE_PTR<OutputStream> writeLocalFile(const std::string& path); + std::unique_ptr<OutputStream> writeLocalFile(const std::string& path); /** * Create a writer to write the ORC file. 
@@ -139,10 +149,8 @@ namespace orc { * @param stream the stream to write to * @param options the options for writing the file */ - ORC_UNIQUE_PTR<Writer> createWriter( - const Type& type, - OutputStream* stream, - const WriterOptions& options); -} + std::unique_ptr<Writer> createWriter(const Type& type, OutputStream* stream, + const WriterOptions& options); +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Reader.hh b/contrib/libs/apache/orc/c++/include/orc/Reader.hh index ddc8b55055..b631c2c6ea 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Reader.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Reader.hh @@ -21,12 +21,13 @@ #include "orc/BloomFilter.hh" #include "orc/Common.hh" -#include "orc/orc-config.hh" #include "orc/Statistics.hh" -#include "orc/sargs/SearchArgument.hh" #include "orc/Type.hh" #include "orc/Vector.hh" +#include "orc/orc-config.hh" +#include "orc/sargs/SearchArgument.hh" +#include <atomic> #include <map> #include <memory> #include <set> @@ -40,13 +41,35 @@ namespace orc { struct RowReaderOptionsPrivate; /** + * Expose the reader metrics including the latency and + * number of calls of the decompression/decoding/IO modules. + */ + struct ReaderMetrics { + std::atomic<uint64_t> ReaderCall{0}; + // ReaderInclusiveLatencyUs contains the latency of + // the decompression/decoding/IO modules. + std::atomic<uint64_t> ReaderInclusiveLatencyUs{0}; + std::atomic<uint64_t> DecompressionCall{0}; + std::atomic<uint64_t> DecompressionLatencyUs{0}; + std::atomic<uint64_t> DecodingCall{0}; + std::atomic<uint64_t> DecodingLatencyUs{0}; + std::atomic<uint64_t> ByteDecodingCall{0}; + std::atomic<uint64_t> ByteDecodingLatencyUs{0}; + std::atomic<uint64_t> IOCount{0}; + std::atomic<uint64_t> IOBlockingLatencyUs{0}; + std::atomic<uint64_t> SelectedRowGroupCount{0}; + std::atomic<uint64_t> EvaluatedRowGroupCount{0}; + }; + ReaderMetrics* getDefaultReaderMetrics(); + + /** * Options for creating a Reader. 
*/ class ReaderOptions { - private: - ORC_UNIQUE_PTR<ReaderOptionsPrivate> privateBits; + private: + std::unique_ptr<ReaderOptionsPrivate> privateBits; - public: + public: ReaderOptions(); ReaderOptions(const ReaderOptions&); ReaderOptions(ReaderOptions&); @@ -77,6 +100,14 @@ namespace orc { ReaderOptions& setMemoryPool(MemoryPool& pool); /** + * Set the reader metrics. + * + * Defaults to nullptr. + * When set to nullptr, the reader metrics will be disabled. + */ + ReaderOptions& setReaderMetrics(ReaderMetrics* metrics); + + /** * Set the location of the tail as defined by the logical length of the * file. */ @@ -102,16 +133,21 @@ namespace orc { * Get the memory allocator. */ MemoryPool* getMemoryPool() const; + + /** + * Get the reader metrics. + */ + ReaderMetrics* getReaderMetrics() const; }; /** * Options for creating a RowReader. */ class RowReaderOptions { - private: - ORC_UNIQUE_PTR<RowReaderOptionsPrivate> privateBits; + private: + std::unique_ptr<RowReaderOptionsPrivate> privateBits; - public: + public: RowReaderOptions(); RowReaderOptions(const RowReaderOptions&); RowReaderOptions(RowReaderOptions&); @@ -164,8 +200,7 @@ namespace orc { * @param idReadIntentMap a map of IdReadIntentMap. * @return this */ - RowReaderOptions& - includeTypesWithIntents(const IdReadIntentMap& idReadIntentMap); + RowReaderOptions& includeTypesWithIntents(const IdReadIntentMap& idReadIntentMap); /** * Set the section of the file to process. @@ -289,8 +324,39 @@ namespace orc { * Get the IdReadIntentMap map that was supplied by client. */ const IdReadIntentMap getIdReadIntentMap() const; - }; + /** + * Set whether use fixed width numeric vectorBatch or not, such as int32_t / int16_t / int8_t / + * float vectorBatch. + */ + RowReaderOptions& setUseTightNumericVector(bool useTightNumericVector); + + /** + * Get whether or not to use fixed width numeric columnVectorBatch. 
+ * @return if not set, the default is false + */ + bool getUseTightNumericVector() const; + + /** + * Set read type for schema evolution + */ + RowReaderOptions& setReadType(std::shared_ptr<Type> type); + + /** + * Get read type for schema evolution + */ + std::shared_ptr<Type>& getReadType() const; + + /** + * Set whether reader throws or returns null when value overflows for schema evolution. + */ + RowReaderOptions& throwOnSchemaEvolutionOverflow(bool shouldThrow); + + /** + * Whether reader throws or returns null when value overflows for schema evolution. + */ + bool getThrowOnSchemaEvolutionOverflow() const; + }; class RowReader; @@ -299,7 +365,7 @@ namespace orc { * This is an an abstract class that will be subclassed as necessary. */ class Reader { - public: + public: virtual ~Reader(); /** @@ -389,8 +455,7 @@ namespace orc { * @param stripeIndex the index of the stripe (0 to N-1) to get information about * @return the information about that stripe */ - virtual ORC_UNIQUE_PTR<StripeInformation> - getStripe(uint64_t stripeIndex) const = 0; + virtual std::unique_ptr<StripeInformation> getStripe(uint64_t stripeIndex) const = 0; /** * Get the number of stripe statistics in the file. @@ -403,8 +468,7 @@ namespace orc { * @param stripeIndex the index of the stripe (0 to N-1) to get statistics about * @return the statistics about that stripe */ - virtual ORC_UNIQUE_PTR<StripeStatistics> - getStripeStatistics(uint64_t stripeIndex) const = 0; + virtual std::unique_ptr<StripeStatistics> getStripeStatistics(uint64_t stripeIndex) const = 0; /** * Get the length of the data stripes in the file. @@ -440,15 +504,14 @@ namespace orc { * Get the statistics about the columns in the file. * @return the information about the column */ - virtual ORC_UNIQUE_PTR<Statistics> getStatistics() const = 0; + virtual std::unique_ptr<Statistics> getStatistics() const = 0; /** * Get the statistics about a single column in the file. 
* @param columnId id of the column * @return the information about the column */ - virtual ORC_UNIQUE_PTR<ColumnStatistics> - getColumnStatistics(uint32_t columnId) const = 0; + virtual std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId) const = 0; /** * Check if the file has correct column statistics. @@ -456,6 +519,12 @@ namespace orc { virtual bool hasCorrectStatistics() const = 0; /** + * Get metrics of the reader + * @return the accumulated reader metrics to current state. + */ + virtual const ReaderMetrics* getReaderMetrics() const = 0; + + /** * Get the serialized file tail. * Usefull if another reader of the same file wants to avoid re-reading * the file tail. See ReaderOptions.setSerializedFileTail(). @@ -474,14 +543,14 @@ namespace orc { * Create a RowReader based on this reader with the default options. * @return a RowReader to read the rows */ - virtual ORC_UNIQUE_PTR<RowReader> createRowReader() const = 0; + virtual std::unique_ptr<RowReader> createRowReader() const = 0; /** * Create a RowReader based on this reader. * @param options RowReader Options * @return a RowReader to read the rows */ - virtual ORC_UNIQUE_PTR<RowReader> createRowReader(const RowReaderOptions& options) const = 0; + virtual std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options) const = 0; /** * Get the name of the input stream. @@ -493,13 +562,13 @@ namespace orc { * based on the information in the file footer. * The bound is less tight if only few columns are read or compression is * used. - */ + */ /** * @param stripeIx index of the stripe to be read (if not specified, * all stripes are considered). * @return upper bound on memory use by all columns */ - virtual uint64_t getMemoryUse(int stripeIx=-1) = 0; + virtual uint64_t getMemoryUse(int stripeIx = -1) = 0; /** * @param include Column Field Ids @@ -507,7 +576,8 @@ namespace orc { * all stripes are considered). 
* @return upper bound on memory use by selected columns */ - virtual uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) = 0; + virtual uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, + int stripeIx = -1) = 0; /** * @param names Column Names @@ -515,7 +585,7 @@ namespace orc { * all stripes are considered). * @return upper bound on memory use by selected columns */ - virtual uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) = 0; + virtual uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx = -1) = 0; /** * @param include Column Type Ids @@ -523,7 +593,8 @@ namespace orc { * all stripes are considered). * @return upper bound on memory use by selected columns */ - virtual uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) = 0; + virtual uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, + int stripeIx = -1) = 0; /** * Get BloomFiters of all selected columns in the specified stripe @@ -532,8 +603,8 @@ namespace orc { * all columns that have bloom filters are considered). * @return map of bloom filters with the key standing for the index of column. */ - virtual std::map<uint32_t, BloomFilterIndex> - getBloomFilters(uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0; + virtual std::map<uint32_t, BloomFilterIndex> getBloomFilters( + uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0; }; /** @@ -541,7 +612,7 @@ namespace orc { * This is an an abstract class that will be subclassed as necessary. */ class RowReader { - public: + public: virtual ~RowReader(); /** * Get the selected type of the rows in the file. 
The file's row type @@ -563,8 +634,7 @@ namespace orc { * @param size the number of rows to read * @return a new ColumnVectorBatch to read into */ - virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size - ) const = 0; + virtual std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size) const = 0; /** * Read the next row batch from the current position. @@ -587,8 +657,7 @@ namespace orc { * @param rowNumber the next row the reader should return */ virtual void seekToRow(uint64_t rowNumber) = 0; - }; -} +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Statistics.hh b/contrib/libs/apache/orc/c++/include/orc/Statistics.hh index 4d7caeab3d..4ba8c35f7d 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Statistics.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Statistics.hh @@ -19,9 +19,11 @@ #ifndef ORC_STATISTICS_HH #define ORC_STATISTICS_HH -#include "orc/orc-config.hh" #include "orc/Type.hh" #include "orc/Vector.hh" +#include "orc/orc-config.hh" + +#include <sstream> namespace orc { @@ -29,7 +31,7 @@ namespace orc { * Statistics that are available for all types of columns. */ class ColumnStatistics { - public: + public: virtual ~ColumnStatistics(); /** @@ -54,9 +56,9 @@ namespace orc { /** * Statistics for binary columns. */ - class BinaryColumnStatistics: public ColumnStatistics { - public: - virtual ~BinaryColumnStatistics(); + class BinaryColumnStatistics : public ColumnStatistics { + public: + ~BinaryColumnStatistics() override; /** * Check whether column has total length. @@ -70,9 +72,9 @@ namespace orc { /** * Statistics for boolean columns. */ - class BooleanColumnStatistics: public ColumnStatistics { - public: - virtual ~BooleanColumnStatistics(); + class BooleanColumnStatistics : public ColumnStatistics { + public: + ~BooleanColumnStatistics() override; /** * Check whether column has true/false count. @@ -87,9 +89,9 @@ namespace orc { /** * Statistics for date columns. 
*/ - class DateColumnStatistics: public ColumnStatistics { - public: - virtual ~DateColumnStatistics(); + class DateColumnStatistics : public ColumnStatistics { + public: + ~DateColumnStatistics() override; /** * Check whether column has minimum. @@ -119,9 +121,9 @@ namespace orc { /** * Statistics for decimal columns. */ - class DecimalColumnStatistics: public ColumnStatistics { - public: - virtual ~DecimalColumnStatistics(); + class DecimalColumnStatistics : public ColumnStatistics { + public: + ~DecimalColumnStatistics() override; /** * Check whether column has minimum. @@ -163,9 +165,9 @@ namespace orc { /** * Statistics for float and double columns. */ - class DoubleColumnStatistics: public ColumnStatistics { - public: - virtual ~DoubleColumnStatistics(); + class DoubleColumnStatistics : public ColumnStatistics { + public: + ~DoubleColumnStatistics() override; /** * Check whether column has minimum. @@ -210,9 +212,9 @@ namespace orc { * Statistics for all of the integer columns, such as byte, short, int, and * long. */ - class IntegerColumnStatistics: public ColumnStatistics { - public: - virtual ~IntegerColumnStatistics(); + class IntegerColumnStatistics : public ColumnStatistics { + public: + ~IntegerColumnStatistics() override; /** * Check whether column has minimum. @@ -256,9 +258,9 @@ namespace orc { /** * Statistics for string columns. */ - class StringColumnStatistics: public ColumnStatistics { - public: - virtual ~StringColumnStatistics(); + class StringColumnStatistics : public ColumnStatistics { + public: + ~StringColumnStatistics() override; /** * Check whether column has minimum. @@ -282,13 +284,13 @@ namespace orc { * Get the minimum value for the column. * @return minimum value */ - virtual const std::string & getMinimum() const = 0; + virtual const std::string& getMinimum() const = 0; /** * Get the maximum value for the column. 
* @return maximum value */ - virtual const std::string & getMaximum() const = 0; + virtual const std::string& getMaximum() const = 0; /** * Get the total length of all values. @@ -300,9 +302,9 @@ namespace orc { /** * Statistics for timestamp columns. */ - class TimestampColumnStatistics: public ColumnStatistics { - public: - virtual ~TimestampColumnStatistics(); + class TimestampColumnStatistics : public ColumnStatistics { + public: + ~TimestampColumnStatistics() override; /** * Check whether minimum timestamp exists. @@ -366,7 +368,7 @@ namespace orc { }; class Statistics { - public: + public: virtual ~Statistics(); /** @@ -374,8 +376,7 @@ namespace orc { * @param colId id of the column * @return one column's statistics */ - virtual const ColumnStatistics* getColumnStatistics(uint32_t colId - ) const = 0; + virtual const ColumnStatistics* getColumnStatistics(uint32_t colId) const = 0; /** * Get the number of columns. @@ -388,8 +389,8 @@ namespace orc { * Statistics for all of collections such as Map and List. */ class CollectionColumnStatistics : public ColumnStatistics { - public: - virtual ~CollectionColumnStatistics(); + public: + ~CollectionColumnStatistics() override; /** * check whether column has minimum number of children @@ -453,8 +454,8 @@ namespace orc { }; class StripeStatistics : public Statistics { - public: - virtual ~StripeStatistics(); + public: + ~StripeStatistics() override; /** * Get the statistics of a given RowIndex entry in a given column. @@ -462,9 +463,8 @@ namespace orc { * @param rowIndexId RowIndex entry id * @return statistics of the given RowIndex entry */ - virtual const ColumnStatistics* - getRowIndexStatistics( - uint32_t columnId, uint32_t rowIndexId) const = 0; + virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId, + uint32_t rowIndexId) const = 0; /** * Get the number of RowIndex statistics in a given column. 
@@ -473,6 +473,6 @@ namespace orc { */ virtual uint32_t getNumberOfRowIndexStats(uint32_t columnId) const = 0; }; -} +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Type.hh b/contrib/libs/apache/orc/c++/include/orc/Type.hh index a7df8307e6..82e0e3cc86 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Type.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Type.hh @@ -19,9 +19,9 @@ #ifndef ORC_TYPE_HH #define ORC_TYPE_HH -#include "orc/orc-config.hh" -#include "orc/Vector.hh" #include "MemoryPool.hh" +#include "orc/Vector.hh" +#include "orc/orc-config.hh" namespace orc { @@ -48,7 +48,7 @@ namespace orc { }; class Type { - public: + public: virtual ~Type(); virtual uint64_t getColumnId() const = 0; virtual uint64_t getMaximumColumnId() const = 0; @@ -59,21 +59,28 @@ namespace orc { virtual uint64_t getMaximumLength() const = 0; virtual uint64_t getPrecision() const = 0; virtual uint64_t getScale() const = 0; - virtual Type& setAttribute(const std::string& key, - const std::string& value) = 0; + virtual Type& setAttribute(const std::string& key, const std::string& value) = 0; virtual bool hasAttributeKey(const std::string& key) const = 0; virtual Type& removeAttribute(const std::string& key) = 0; virtual std::vector<std::string> getAttributeKeys() const = 0; virtual std::string getAttributeValue(const std::string& key) const = 0; virtual std::string toString() const = 0; + /** + * Get the Type with the given column ID + * @param colId the column ID + * @return the type corresponding to the column Id, nullptr if not exists + */ + virtual const Type* getTypeByColumnId(uint64_t colId) const = 0; /** * Create a row batch for this type. 
*/ - virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size, - MemoryPool& pool, - bool encoded = false - ) const = 0; + virtual std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size, MemoryPool& pool, + bool encoded = false) const = 0; + + virtual std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size, MemoryPool& pool, + bool encoded, + bool useTightNumericVector) const = 0; /** * Add a new field to a struct type. @@ -81,38 +88,33 @@ namespace orc { * @param fieldType the type of the new field * @return a reference to the struct type */ - virtual Type* addStructField(const std::string& fieldName, - ORC_UNIQUE_PTR<Type> fieldType) = 0; + virtual Type* addStructField(const std::string& fieldName, std::unique_ptr<Type> fieldType) = 0; /** * Add a new child to a union type. * @param fieldType the type of the new field * @return a reference to the union type */ - virtual Type* addUnionChild(ORC_UNIQUE_PTR<Type> fieldType) = 0; + virtual Type* addUnionChild(std::unique_ptr<Type> fieldType) = 0; /** * Build a Type object from string text representation. 
*/ - static ORC_UNIQUE_PTR<Type> buildTypeFromString(const std::string& input); + static std::unique_ptr<Type> buildTypeFromString(const std::string& input); }; const int64_t DEFAULT_DECIMAL_SCALE = 18; const int64_t DEFAULT_DECIMAL_PRECISION = 38; - ORC_UNIQUE_PTR<Type> createPrimitiveType(TypeKind kind); - ORC_UNIQUE_PTR<Type> createCharType(TypeKind kind, - uint64_t maxLength); - ORC_UNIQUE_PTR<Type> - createDecimalType(uint64_t precision= - DEFAULT_DECIMAL_PRECISION, - uint64_t scale=DEFAULT_DECIMAL_SCALE); + std::unique_ptr<Type> createPrimitiveType(TypeKind kind); + std::unique_ptr<Type> createCharType(TypeKind kind, uint64_t maxLength); + std::unique_ptr<Type> createDecimalType(uint64_t precision = DEFAULT_DECIMAL_PRECISION, + uint64_t scale = DEFAULT_DECIMAL_SCALE); - ORC_UNIQUE_PTR<Type> createStructType(); - ORC_UNIQUE_PTR<Type> createListType(ORC_UNIQUE_PTR<Type> elements); - ORC_UNIQUE_PTR<Type> createMapType(ORC_UNIQUE_PTR<Type> key, - ORC_UNIQUE_PTR<Type> value); - ORC_UNIQUE_PTR<Type> createUnionType(); + std::unique_ptr<Type> createStructType(); + std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements); + std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key, std::unique_ptr<Type> value); + std::unique_ptr<Type> createUnionType(); -} +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Vector.hh b/contrib/libs/apache/orc/c++/include/orc/Vector.hh index 752e1af78a..0dfe926965 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Vector.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Vector.hh @@ -19,17 +19,17 @@ #ifndef ORC_VECTOR_HH #define ORC_VECTOR_HH -#include "orc/orc-config.hh" -#include "MemoryPool.hh" #include "Int128.hh" +#include "MemoryPool.hh" +#include "orc/orc-config.hh" +#include <cstdlib> +#include <cstring> #include <list> #include <memory> -#include <cstring> -#include <vector> +#include <sstream> #include <stdexcept> -#include <cstdlib> -#include <iostream> +#include <vector> 
namespace orc { @@ -37,6 +37,11 @@ namespace orc { * The base class for each of the column vectors. This class handles * the generic attributes such as number of elements, capacity, and * notNull vector. + * Note: If hasNull is false, the values in the notNull buffer are not required. + * On the writer side, it does not read values from notNull buffer so users are + * not expected to write notNull buffer if hasNull is false. On the reader side, + * it does not set notNull buffer if hasNull is false, meaning that it is undefined + * behavior to consume values from notNull buffer in this case by downstream users. */ struct ColumnVectorBatch { ColumnVectorBatch(uint64_t capacity, MemoryPool& pool); @@ -83,40 +88,128 @@ namespace orc { */ virtual bool hasVariableLength(); - private: + private: ColumnVectorBatch(const ColumnVectorBatch&); ColumnVectorBatch& operator=(const ColumnVectorBatch&); }; - struct LongVectorBatch: public ColumnVectorBatch { - LongVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~LongVectorBatch(); + template <typename ValueType> + struct IntegerVectorBatch : public ColumnVectorBatch { + IntegerVectorBatch(uint64_t cap, MemoryPool& pool) + : ColumnVectorBatch(cap, pool), data(pool, cap) { + // PASS + } + + ~IntegerVectorBatch() override = default; - DataBuffer<int64_t> data; - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); + inline std::string toString() const override; + + void resize(uint64_t cap) override { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + data.resize(cap); + } + } + + void clear() override { + numElements = 0; + } + + uint64_t getMemoryUsage() override { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(data.capacity() * sizeof(ValueType)); + } + + DataBuffer<ValueType> data; }; - struct DoubleVectorBatch: public ColumnVectorBatch { - DoubleVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~DoubleVectorBatch(); - 
std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); + using LongVectorBatch = IntegerVectorBatch<int64_t>; + using IntVectorBatch = IntegerVectorBatch<int32_t>; + using ShortVectorBatch = IntegerVectorBatch<int16_t>; + using ByteVectorBatch = IntegerVectorBatch<int8_t>; + + template <> + inline std::string LongVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Long vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + template <> + inline std::string IntVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Int vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + template <> + inline std::string ShortVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Short vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + template <> + inline std::string ByteVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Byte vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + template <typename FloatType> + struct FloatingVectorBatch : public ColumnVectorBatch { + FloatingVectorBatch(uint64_t cap, MemoryPool& pool) + : ColumnVectorBatch(cap, pool), data(pool, cap) { + // PASS + } - DataBuffer<double> data; + ~FloatingVectorBatch() override = default; + + inline std::string toString() const override; + + void resize(uint64_t cap) override { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + data.resize(cap); + } + } + + void clear() override { + numElements = 0; + } + + uint64_t getMemoryUsage() override { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(data.capacity() * sizeof(FloatType)); + } + + DataBuffer<FloatType> data; }; - struct StringVectorBatch: public ColumnVectorBatch { + using DoubleVectorBatch = FloatingVectorBatch<double>; + using FloatVectorBatch = 
FloatingVectorBatch<float>; + + template <> + inline std::string DoubleVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Double vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + template <> + inline std::string FloatVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Float vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + struct StringVectorBatch : public ColumnVectorBatch { StringVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~StringVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); + ~StringVectorBatch() override; + std::string toString() const override; + void resize(uint64_t capacity) override; + void clear() override; + uint64_t getMemoryUsage() override; // pointers to the start of each string DataBuffer<char*> data; @@ -152,35 +245,35 @@ namespace orc { */ struct EncodedStringVectorBatch : public StringVectorBatch { EncodedStringVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~EncodedStringVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); + ~EncodedStringVectorBatch() override; + std::string toString() const override; + void resize(uint64_t capacity) override; std::shared_ptr<StringDictionary> dictionary; // index for dictionary entry DataBuffer<int64_t> index; }; - struct StructVectorBatch: public ColumnVectorBatch { + struct StructVectorBatch : public ColumnVectorBatch { StructVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~StructVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - bool hasVariableLength(); + ~StructVectorBatch() override; + std::string toString() const override; + void resize(uint64_t capacity) override; + void clear() override; + uint64_t getMemoryUsage() override; + bool hasVariableLength() override; 
std::vector<ColumnVectorBatch*> fields; }; - struct ListVectorBatch: public ColumnVectorBatch { + struct ListVectorBatch : public ColumnVectorBatch { ListVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~ListVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - bool hasVariableLength(); + ~ListVectorBatch() override; + std::string toString() const override; + void resize(uint64_t capacity) override; + void clear() override; + uint64_t getMemoryUsage() override; + bool hasVariableLength() override; /** * The offset of the first element of each list. @@ -189,17 +282,17 @@ namespace orc { DataBuffer<int64_t> offsets; // the concatenated elements - ORC_UNIQUE_PTR<ColumnVectorBatch> elements; + std::unique_ptr<ColumnVectorBatch> elements; }; - struct MapVectorBatch: public ColumnVectorBatch { + struct MapVectorBatch : public ColumnVectorBatch { MapVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~MapVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - bool hasVariableLength(); + ~MapVectorBatch() override; + std::string toString() const override; + void resize(uint64_t capacity) override; + void clear() override; + uint64_t getMemoryUsage() override; + bool hasVariableLength() override; /** * The offset of the first element of each map. 
@@ -208,19 +301,19 @@ namespace orc { DataBuffer<int64_t> offsets; // the concatenated keys - ORC_UNIQUE_PTR<ColumnVectorBatch> keys; + std::unique_ptr<ColumnVectorBatch> keys; // the concatenated elements - ORC_UNIQUE_PTR<ColumnVectorBatch> elements; + std::unique_ptr<ColumnVectorBatch> elements; }; - struct UnionVectorBatch: public ColumnVectorBatch { + struct UnionVectorBatch : public ColumnVectorBatch { UnionVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~UnionVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - bool hasVariableLength(); + ~UnionVectorBatch() override; + std::string toString() const override; + void resize(uint64_t capacity) override; + void clear() override; + uint64_t getMemoryUsage() override; + bool hasVariableLength() override; /** * For each value, which element of children has the value. @@ -246,13 +339,13 @@ namespace orc { int32_t scale; }; - struct Decimal64VectorBatch: public ColumnVectorBatch { + struct Decimal64VectorBatch : public ColumnVectorBatch { Decimal64VectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~Decimal64VectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); + ~Decimal64VectorBatch() override; + std::string toString() const override; + void resize(uint64_t capacity) override; + void clear() override; + uint64_t getMemoryUsage() override; // total number of digits int32_t precision; @@ -262,7 +355,7 @@ namespace orc { // the numeric values DataBuffer<int64_t> values; - protected: + protected: /** * Contains the scales that were read from the file. Should NOT be * used. 
@@ -272,13 +365,13 @@ namespace orc { friend class Decimal64ColumnWriter; }; - struct Decimal128VectorBatch: public ColumnVectorBatch { + struct Decimal128VectorBatch : public ColumnVectorBatch { Decimal128VectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~Decimal128VectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); + ~Decimal128VectorBatch() override; + std::string toString() const override; + void resize(uint64_t capacity) override; + void clear() override; + uint64_t getMemoryUsage() override; // total number of digits int32_t precision; @@ -288,7 +381,7 @@ namespace orc { // the numeric values DataBuffer<Int128> values; - protected: + protected: /** * Contains the scales that were read from the file. Should NOT be * used. @@ -304,13 +397,13 @@ namespace orc { * The timestamps are stored split into the time_t value (seconds since * 1 Jan 1970 00:00:00) and the nanoseconds within the time_t value. */ - struct TimestampVectorBatch: public ColumnVectorBatch { + struct TimestampVectorBatch : public ColumnVectorBatch { TimestampVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~TimestampVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); + ~TimestampVectorBatch() override; + std::string toString() const override; + void resize(uint64_t capacity) override; + void clear() override; + uint64_t getMemoryUsage() override; // the number of seconds past 1 Jan 1970 00:00 UTC (aka time_t) // Note that we always assume data is in GMT timezone; therefore it is @@ -322,6 +415,6 @@ namespace orc { DataBuffer<int64_t> nanoseconds; }; -} +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Writer.hh b/contrib/libs/apache/orc/c++/include/orc/Writer.hh index 78b0b97d25..047ee9ffc5 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Writer.hh +++ 
b/contrib/libs/apache/orc/c++/include/orc/Writer.hh @@ -20,10 +20,11 @@ #define ORC_WRITER_HH #include "orc/Common.hh" -#include "orc/orc-config.hh" #include "orc/Type.hh" #include "orc/Vector.hh" +#include "orc/orc-config.hh" +#include <atomic> #include <memory> #include <set> #include <string> @@ -34,26 +35,29 @@ namespace orc { // classes that hold data members so we can maintain binary compatibility struct WriterOptionsPrivate; - enum CompressionStrategy { - CompressionStrategy_SPEED = 0, - CompressionStrategy_COMPRESSION - }; + enum CompressionStrategy { CompressionStrategy_SPEED = 0, CompressionStrategy_COMPRESSION }; - enum RleVersion { - RleVersion_1 = 0, - RleVersion_2 = 1 - }; + enum RleVersion { RleVersion_1 = 0, RleVersion_2 = 1 }; class Timezone; /** + * Expose the IO metrics for write operation. + */ + struct WriterMetrics { + // Record the number of IO requests written to the output file + std::atomic<uint64_t> IOCount{0}; + // Record the latency of IO blocking + std::atomic<uint64_t> IOBlockingLatencyUs{0}; + }; + /** * Options for creating a Writer. */ class WriterOptions { - private: - ORC_UNIQUE_PTR<WriterOptionsPrivate> privateBits; + private: + std::unique_ptr<WriterOptionsPrivate> privateBits; - public: + public: WriterOptions(); WriterOptions(const WriterOptions&); WriterOptions(WriterOptions&); @@ -73,6 +77,8 @@ namespace orc { /** * Set the data compression block size. + * Should be less than 1 << 23 bytes (8M), which is limited by the + * 3-byte size of the compression block header (1 bit for isOriginal and 23 bits for length) */ WriterOptions& setCompressionBlockSize(uint64_t size); @@ -83,7 +89,8 @@ namespace orc { uint64_t getCompressionBlockSize() const; /** - * Set row index stride (the number of rows per an entry in the row index). Use value 0 to disable row index. + * Set row index stride (the number of rows per an entry in the row index). Use value 0 to + * disable row index.
*/ WriterOptions& setRowIndexStride(uint64_t stride); @@ -157,13 +164,13 @@ namespace orc { /** * Set the memory pool. */ - WriterOptions& setMemoryPool(MemoryPool * memoryPool); + WriterOptions& setMemoryPool(MemoryPool* memoryPool); /** * Get the memory pool. * @return if not set, return default memory pool. */ - MemoryPool * getMemoryPool() const; + MemoryPool* getMemoryPool() const; /** * Set the error stream. @@ -174,7 +181,7 @@ namespace orc { * Get the error stream. * @return if not set, return std::err. */ - std::ostream * getErrorStream() const; + std::ostream* getErrorStream() const; /** * Get the RLE version. @@ -235,10 +242,45 @@ namespace orc { * @param zone writer timezone name */ WriterOptions& setTimezoneName(const std::string& zone); + + /** + * Set the writer metrics. + */ + WriterOptions& setWriterMetrics(WriterMetrics* metrics); + + /** + * Get the writer metrics. + * @return if not set, return nullptr. + */ + WriterMetrics* getWriterMetrics() const; + + /** + * Set use tight numeric vectorBatch or not. + */ + WriterOptions& setUseTightNumericVector(bool useTightNumericVector); + + /** + * Get whether or not to use dedicated columnVectorBatch + * @return if not set, the default is false + */ + bool getUseTightNumericVector() const; + + /** + * Set the initial capacity of output buffer in the class BufferedOutputStream. + * Each column contains one or more BufferOutputStream depending on its type, + * and these buffers will automatically expand when more memory is required. + */ + WriterOptions& setOutputBufferCapacity(uint64_t capacity); + + /** + * Get the initial capacity of output buffer in the class BufferedOutputStream. + * @return if not set, return default value which is 1 MB. + */ + uint64_t getOutputBufferCapacity() const; }; class Writer { - public: + public: virtual ~Writer(); /** @@ -246,8 +288,7 @@ namespace orc { * @param size the number of rows to write. * @return a new ColumnVectorBatch to write into. 
*/ - virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size - ) const = 0; + virtual std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size) const = 0; /** * Add a row batch into current writer. @@ -263,8 +304,15 @@ namespace orc { /** * Add user metadata to the writer. */ - virtual void addUserMetadata(const std::string name, const std::string value) = 0; + virtual void addUserMetadata(const std::string& name, const std::string& value) = 0; + + /** + * Write an intermediate footer on the file such that if the file is + * truncated to the returned offset, it would be a valid ORC file. + * @return the offset that would be a valid end location for an ORC file + */ + virtual uint64_t writeIntermediateFooter() = 0; }; -} +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/include/orc/orc-config.hh b/contrib/libs/apache/orc/c++/include/orc/orc-config.hh index b8fb9fbd4e..ab1e16fa15 100644 --- a/contrib/libs/apache/orc/c++/include/orc/orc-config.hh +++ b/contrib/libs/apache/orc/c++/include/orc/orc-config.hh @@ -1,7 +1,11 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * @@ -15,14 +19,9 @@ #ifndef ORC_CONFIG_HH #define ORC_CONFIG_HH -#define ORC_VERSION "1.8.0" +#define ORC_VERSION "2.0.0" #define ORC_CXX_HAS_CSTDINT -#define ORC_CXX_HAS_INITIALIZER_LIST -#define ORC_CXX_HAS_NOEXCEPT -#define ORC_CXX_HAS_NULLPTR -#define ORC_CXX_HAS_OVERRIDE -#define ORC_CXX_HAS_UNIQUE_PTR #ifdef ORC_CXX_HAS_CSTDINT #include <cstdint> @@ -30,49 +29,10 @@ #include <stdint.h> #endif -#ifdef ORC_CXX_HAS_NOEXCEPT - #define ORC_NOEXCEPT noexcept -#else - #define ORC_NOEXCEPT throw () -#endif - -#ifdef ORC_CXX_HAS_NULLPTR - #define ORC_NULLPTR nullptr -#else - namespace orc { - class nullptr_t { - public: - template<class T> - operator T*() const { - return 0; - } - - template<class C, class T> - operator T C::*() const { - return 0; - } - private: - void operator&() const; // whose address can't be taken - }; - const nullptr_t nullptr = {}; - } - #define ORC_NULLPTR orc::nullptr -#endif - -#ifdef ORC_CXX_HAS_OVERRIDE - #define ORC_OVERRIDE override -#else - #define ORC_OVERRIDE -#endif - -#ifdef ORC_CXX_HAS_UNIQUE_PTR - #define ORC_UNIQUE_PTR std::unique_ptr -#else - #define ORC_UNIQUE_PTR std::auto_ptr - namespace std { - template<typename T> - inline T move(T& x) { return x; } - } -#endif +// The following macros should be kept for backward compatibility.
+#define ORC_NOEXCEPT noexcept +#define ORC_NULLPTR nullptr +#define ORC_OVERRIDE override +#define ORC_UNIQUE_PTR std::unique_ptr #endif diff --git a/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh b/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh index 36c9b37e3f..9ce958302d 100644 --- a/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh +++ b/contrib/libs/apache/orc/c++/include/orc/sargs/Literal.hh @@ -27,21 +27,19 @@ namespace orc { /** * Possible data types for predicates */ - enum class PredicateDataType { - LONG = 0, FLOAT, STRING, DATE, DECIMAL, TIMESTAMP, BOOLEAN - }; + enum class PredicateDataType { LONG = 0, FLOAT, STRING, DATE, DECIMAL, TIMESTAMP, BOOLEAN }; /** * Represents a literal value in a predicate */ class Literal { - public: + public: struct Timestamp { Timestamp() = default; Timestamp(const Timestamp&) = default; Timestamp(Timestamp&&) = default; ~Timestamp() = default; - Timestamp(int64_t second_, int32_t nanos_): second(second_), nanos(nanos_) { + Timestamp(int64_t second_, int32_t nanos_) : second(second_), nanos(nanos_) { // PASS } Timestamp& operator=(const Timestamp&) = default; @@ -55,15 +53,23 @@ namespace orc { bool operator<=(const Timestamp& r) const { return second < r.second || (second == r.second && nanos <= r.nanos); } - bool operator!=(const Timestamp& r) const { return !(*this == r); } - bool operator>(const Timestamp& r) const { return r < *this; } - bool operator>=(const Timestamp& r) const { return r <= *this; } - int64_t getMillis() const { return second * 1000 + nanos / 1000000; } + bool operator!=(const Timestamp& r) const { + return !(*this == r); + } + bool operator>(const Timestamp& r) const { + return r < *this; + } + bool operator>=(const Timestamp& r) const { + return r <= *this; + } + int64_t getMillis() const { + return second * 1000 + nanos / 1000000; + } int64_t second; int32_t nanos; }; - Literal(const Literal &r); + Literal(const Literal& r); ~Literal(); Literal& operator=(const 
Literal& r); bool operator==(const Literal& r) const; @@ -102,7 +108,7 @@ namespace orc { /** * Create a literal of STRING type */ - Literal(const char * str, size_t size); + Literal(const char* str, size_t size); /** * Create a literal of DECIMAL type @@ -123,38 +129,44 @@ namespace orc { /** * Check if a literal is null */ - bool isNull() const { return mIsNull; } + bool isNull() const { + return mIsNull; + } - PredicateDataType getType() const { return mType; } + PredicateDataType getType() const { + return mType; + } std::string toString() const; - size_t getHashCode() const { return mHashCode; } + size_t getHashCode() const { + return mHashCode; + } - private: + private: size_t hashCode() const; union LiteralVal { int64_t IntVal; double DoubleVal; int64_t DateVal; - char * Buffer; + char* Buffer; Timestamp TimeStampVal; Int128 DecimalVal; bool BooleanVal; // explicitly define default constructor - LiteralVal(): DecimalVal(0) {} + LiteralVal() : DecimalVal(0) {} }; - private: - LiteralVal mValue; // data value for this literal if not null - PredicateDataType mType; // data type of the literal - size_t mSize; // size of mValue if it is Buffer - int32_t mPrecision; // precision of decimal type - int32_t mScale; // scale of decimal type - bool mIsNull; // whether this literal is null - size_t mHashCode; // precomputed hash code for the literal + private: + LiteralVal mValue; // data value for this literal if not null + PredicateDataType mType; // data type of the literal + size_t mSize; // size of mValue if it is Buffer + int32_t mPrecision; // precision of decimal type + int32_t mScale; // scale of decimal type + bool mIsNull; // whether this literal is null + size_t mHashCode; // precomputed hash code for the literal }; -} // namespace orc +} // namespace orc -#endif //ORC_LITERAL_HH +#endif // ORC_LITERAL_HH diff --git a/contrib/libs/apache/orc/c++/include/orc/sargs/SearchArgument.hh b/contrib/libs/apache/orc/c++/include/orc/sargs/SearchArgument.hh index 
44fde8f5e9..6493840a92 100644 --- a/contrib/libs/apache/orc/c++/include/orc/sargs/SearchArgument.hh +++ b/contrib/libs/apache/orc/c++/include/orc/sargs/SearchArgument.hh @@ -34,7 +34,7 @@ namespace orc { * (<a href="http://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF</a>). */ class SearchArgument { - public: + public: virtual ~SearchArgument(); /** @@ -52,7 +52,7 @@ namespace orc { * must call startOr, startAnd, or startNot before adding any leaves. */ class SearchArgumentBuilder { - public: + public: virtual ~SearchArgumentBuilder(); /** @@ -87,8 +87,7 @@ namespace orc { * @param literal the literal * @return this */ - virtual SearchArgumentBuilder& lessThan(const std::string& column, - PredicateDataType type, + virtual SearchArgumentBuilder& lessThan(const std::string& column, PredicateDataType type, Literal literal) = 0; /** @@ -98,8 +97,7 @@ namespace orc { * @param literal the literal * @return this */ - virtual SearchArgumentBuilder& lessThan(uint64_t columnId, - PredicateDataType type, + virtual SearchArgumentBuilder& lessThan(uint64_t columnId, PredicateDataType type, Literal literal) = 0; /** @@ -109,8 +107,7 @@ namespace orc { * @param literal the literal * @return this */ - virtual SearchArgumentBuilder& lessThanEquals(const std::string& column, - PredicateDataType type, + virtual SearchArgumentBuilder& lessThanEquals(const std::string& column, PredicateDataType type, Literal literal) = 0; /** @@ -120,8 +117,7 @@ namespace orc { * @param literal the literal * @return this */ - virtual SearchArgumentBuilder& lessThanEquals(uint64_t columnId, - PredicateDataType type, + virtual SearchArgumentBuilder& lessThanEquals(uint64_t columnId, PredicateDataType type, Literal literal) = 0; /** @@ -131,8 +127,7 @@ namespace orc { * @param literal the literal * @return this */ - virtual SearchArgumentBuilder& equals(const std::string& column, - PredicateDataType type, + virtual SearchArgumentBuilder& equals(const std::string& column, PredicateDataType type, 
Literal literal) = 0; /** @@ -142,8 +137,7 @@ namespace orc { * @param literal the literal * @return this */ - virtual SearchArgumentBuilder& equals(uint64_t columnId, - PredicateDataType type, + virtual SearchArgumentBuilder& equals(uint64_t columnId, PredicateDataType type, Literal literal) = 0; /** @@ -153,8 +147,7 @@ namespace orc { * @param literal the literal * @return this */ - virtual SearchArgumentBuilder& nullSafeEquals(const std::string& column, - PredicateDataType type, + virtual SearchArgumentBuilder& nullSafeEquals(const std::string& column, PredicateDataType type, Literal literal) = 0; /** @@ -164,8 +157,7 @@ namespace orc { * @param literal the literal * @return this */ - virtual SearchArgumentBuilder& nullSafeEquals(uint64_t columnId, - PredicateDataType type, + virtual SearchArgumentBuilder& nullSafeEquals(uint64_t columnId, PredicateDataType type, Literal literal) = 0; /** @@ -175,8 +167,7 @@ namespace orc { * @param literals the literals * @return this */ - virtual SearchArgumentBuilder& in(const std::string& column, - PredicateDataType type, + virtual SearchArgumentBuilder& in(const std::string& column, PredicateDataType type, const std::initializer_list<Literal>& literals) = 0; /** @@ -186,8 +177,7 @@ namespace orc { * @param literals the literals * @return this */ - virtual SearchArgumentBuilder& in(uint64_t columnId, - PredicateDataType type, + virtual SearchArgumentBuilder& in(uint64_t columnId, PredicateDataType type, const std::initializer_list<Literal>& literals) = 0; /** @@ -197,8 +187,7 @@ namespace orc { * @param literals the literals * @return this */ - virtual SearchArgumentBuilder& in(const std::string& column, - PredicateDataType type, + virtual SearchArgumentBuilder& in(const std::string& column, PredicateDataType type, const std::vector<Literal>& literals) = 0; /** @@ -208,8 +197,7 @@ namespace orc { * @param literals the literals * @return this */ - virtual SearchArgumentBuilder& in(uint64_t columnId, - PredicateDataType type, 
+ virtual SearchArgumentBuilder& in(uint64_t columnId, PredicateDataType type, const std::vector<Literal>& literals) = 0; /** @@ -218,8 +206,7 @@ namespace orc { * @param type the type of the expression * @return this */ - virtual SearchArgumentBuilder& isNull(const std::string& column, - PredicateDataType type) = 0; + virtual SearchArgumentBuilder& isNull(const std::string& column, PredicateDataType type) = 0; /** * Add an is null leaf to the current item on the stack. @@ -227,8 +214,7 @@ namespace orc { * @param type the type of the expression * @return this */ - virtual SearchArgumentBuilder& isNull(uint64_t columnId, - PredicateDataType type) = 0; + virtual SearchArgumentBuilder& isNull(uint64_t columnId, PredicateDataType type) = 0; /** * Add a between leaf to the current item on the stack. @@ -238,10 +224,8 @@ namespace orc { * @param upper the literal * @return this */ - virtual SearchArgumentBuilder& between(const std::string& column, - PredicateDataType type, - Literal lower, - Literal upper) = 0; + virtual SearchArgumentBuilder& between(const std::string& column, PredicateDataType type, + Literal lower, Literal upper) = 0; /** * Add a between leaf to the current item on the stack. 
@@ -251,9 +235,7 @@ namespace orc { * @param upper the literal * @return this */ - virtual SearchArgumentBuilder& between(uint64_t columnId, - PredicateDataType type, - Literal lower, + virtual SearchArgumentBuilder& between(uint64_t columnId, PredicateDataType type, Literal lower, Literal upper) = 0; /** @@ -275,10 +257,10 @@ namespace orc { * Factory to create SearchArgumentBuilder which builds SearchArgument */ class SearchArgumentFactory { - public: + public: static std::unique_ptr<SearchArgumentBuilder> newBuilder(); }; -} // namespace orc +} // namespace orc -#endif //ORC_SEARCHARGUMENT_HH +#endif // ORC_SEARCHARGUMENT_HH diff --git a/contrib/libs/apache/orc/c++/include/orc/sargs/TruthValue.hh b/contrib/libs/apache/orc/c++/include/orc/sargs/TruthValue.hh index b3ea6b76ce..fa3dce06f8 100644 --- a/contrib/libs/apache/orc/c++/include/orc/sargs/TruthValue.hh +++ b/contrib/libs/apache/orc/c++/include/orc/sargs/TruthValue.hh @@ -25,13 +25,13 @@ namespace orc { * The potential result sets of logical operations. */ enum class TruthValue { - YES, // all rows satisfy the predicate - NO, // all rows dissatisfy the predicate - IS_NULL, // all rows are null value - YES_NULL, // null values exist, not-null rows satisfy the predicate - NO_NULL, // null values exist, not-null rows dissatisfy the predicate - YES_NO, // some rows satisfy the predicate and the others not - YES_NO_NULL // null values exist, some rows satisfy predicate and some not + YES, // all rows satisfy the predicate + NO, // all rows dissatisfy the predicate + IS_NULL, // all rows are null value + YES_NULL, // null values exist, not-null rows satisfy the predicate + NO_NULL, // null values exist, not-null rows dissatisfy the predicate + YES_NO, // some rows satisfy the predicate and the others not + YES_NO_NULL // null values exist, some rows satisfy predicate and some not }; // Compute logical or between the two values. 
@@ -46,6 +46,6 @@ namespace orc { // Do we need to read the data based on the TruthValue? bool isNeeded(TruthValue val); -} // namespace orc +} // namespace orc -#endif //ORC_TRUTHVALUE_HH +#endif // ORC_TRUTHVALUE_HH diff --git a/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh b/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh index 625c1befb2..b11cdf74cd 100644 --- a/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh +++ b/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh @@ -19,20 +19,15 @@ #ifndef ADAPTER_HH #define ADAPTER_HH -/* #undef INT64_IS_LL */ -#define HAS_CONSTEXPR #define HAS_PREAD #define HAS_STRPTIME -#define HAS_STOLL #define HAS_DIAGNOSTIC_PUSH #define HAS_DOUBLE_TO_STRING #define HAS_INT64_TO_STRING #define HAS_PRE_1970 #define HAS_POST_2038 #define HAS_STD_ISNAN -#define HAS_STD_MUTEX #define HAS_BUILTIN_OVERFLOW_CHECK -/* #undef NEEDS_REDUNDANT_MOVE */ /* #undef NEEDS_Z_PREFIX */ #include "orc/orc-config.hh" @@ -46,13 +41,6 @@ typedef SSIZE_T ssize_t; #define asctime_r(tm, buf) (asctime_s(buf, 26, tm) ? 
NULL : buf) #endif -#ifndef HAS_STOLL - // A poor man's stoll that converts str to a long long int base 10 - namespace std { - int64_t stoll(std::string str); - } -#endif - #ifndef HAS_STRPTIME char* strptime(const char* buf, const char* format, struct tm* tm); #endif @@ -61,20 +49,6 @@ typedef SSIZE_T ssize_t; ssize_t pread(int fd, void* buf, size_t count, off_t offset); #endif -#ifdef INT64_IS_LL - #define INT64_FORMAT_STRING "ll" -#else - #define INT64_FORMAT_STRING "l" -#endif - -#ifndef ORC_CXX_HAS_NOEXCEPT - #define noexcept ORC_NOEXCEPT -#endif - -#ifndef ORC_CXX_HAS_OVERRIDE - #define override ORC_OVERRIDE -#endif - #ifdef HAS_DIAGNOSTIC_PUSH #ifdef __clang__ #define DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") @@ -105,10 +79,6 @@ typedef SSIZE_T ssize_t; #define DIAGNOSTIC_IGNORE(XXX) #endif -#ifndef ORC_CXX_HAS_UNIQUE_PTR - #define unique_ptr auto_ptr -#endif - #ifndef UINT32_MAX #define UINT32_MAX 0xffffffff #endif @@ -123,12 +93,6 @@ typedef SSIZE_T ssize_t; #define GTEST_LANG_CXX11 0 -#ifdef NEEDS_REDUNDANT_MOVE - #define REDUNDANT_MOVE(XXX) std::move(XXX) -#else - #define REDUNDANT_MOVE(XXX) XXX -#endif - #ifndef HAS_STD_ISNAN #include <math.h> #define std::isnan(XXX) isnan(XXX) @@ -136,34 +100,7 @@ typedef SSIZE_T ssize_t; #include <cmath> #endif -#ifndef HAS_STD_MUTEX - #include <pthread.h> - namespace orc { - /** - * Lock guard for pthread_mutex_t object using RAII - * The Lock is automatically release when exiting current scope. 
- */ - class LockORC { - public: - explicit LockORC(pthread_mutex_t& mutex) : mutex_ref_(mutex) { - pthread_mutex_lock(&mutex_ref_); - } - ~LockORC() { pthread_mutex_unlock(&mutex_ref_); } - private: - // no default constructor - LockORC(); - // prohibit copying - LockORC(const LockORC&); - LockORC& operator=(const LockORC&); - - pthread_mutex_t& mutex_ref_; - }; - } - #define std::mutex pthread_mutex_t - #define std::lock_guard<std::mutex> LockORC -#else - #include <mutex> -#endif +#include <mutex> #ifdef NEEDS_Z_PREFIX #define Z_PREFIX 1 @@ -208,8 +145,4 @@ namespace orc { } #endif -#ifndef HAS_CONSTEXPR -#define constexpr const -#endif - #endif /* ADAPTER_HH */ diff --git a/contrib/libs/apache/orc/c++/src/Adaptor.cc b/contrib/libs/apache/orc/c++/src/Adaptor.cc index bf3a3e181b..d9390131b6 100644 --- a/contrib/libs/apache/orc/c++/src/Adaptor.cc +++ b/contrib/libs/apache/orc/c++/src/Adaptor.cc @@ -1,36 +1,24 @@ /** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include "Adaptor.hh" -#include <sstream> #include <iomanip> - -#ifndef HAS_STOLL -namespace std { - int64_t std::stoll(std::string str) { - int64_t val = 0; - stringstream ss; - ss << str; - ss >> val; - return val; - } -} -#endif +#include <sstream> #ifndef HAS_STRPTIME char* strptime(const char* s, const char* f, struct tm* tm) { @@ -43,7 +31,7 @@ char* strptime(const char* s, const char* f, struct tm* tm) { #endif #ifndef HAS_PREAD - #ifdef _WIN32 +#ifdef _WIN32 #include <Windows.h> #include <io.h> ssize_t pread(int fd, void* buf, size_t size, off_t offset) { @@ -60,9 +48,9 @@ ssize_t pread(int fd, void* buf, size_t size, off_t offset) { } return static_cast<ssize_t>(rt); } - #else - #error("pread() undefined: unknown environment") - #endif +#else +#error("pread() undefined: unknown environment") +#endif #endif namespace orc { @@ -85,4 +73,4 @@ namespace orc { return std::to_string(static_cast<long long int>(val)); } #endif -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/BlockBuffer.cc b/contrib/libs/apache/orc/c++/src/BlockBuffer.cc new file mode 100644 index 0000000000..1f7843fad7 --- /dev/null +++ b/contrib/libs/apache/orc/c++/src/BlockBuffer.cc @@ -0,0 +1,131 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BlockBuffer.hh" +#include "orc/OrcFile.hh" +#include "orc/Writer.hh" + +#include <algorithm> + +namespace orc { + + BlockBuffer::BlockBuffer(MemoryPool& pool, uint64_t _blockSize) + : memoryPool(pool), currentSize(0), currentCapacity(0), blockSize(_blockSize) { + if (blockSize == 0) { + throw std::logic_error("Block size cannot be zero"); + } + reserve(blockSize); + } + + BlockBuffer::~BlockBuffer() { + for (size_t i = 0; i < blocks.size(); ++i) { + memoryPool.free(blocks[i]); + } + blocks.clear(); + currentSize = currentCapacity = 0; + } + + BlockBuffer::Block BlockBuffer::getBlock(uint64_t blockIndex) const { + if (blockIndex >= getBlockNumber()) { + throw std::out_of_range("Block index out of range"); + } + return Block(blocks[blockIndex], std::min(currentSize - blockIndex * blockSize, blockSize)); + } + + BlockBuffer::Block BlockBuffer::getNextBlock() { + if (currentSize < currentCapacity) { + Block emptyBlock(blocks[currentSize / blockSize] + currentSize % blockSize, + blockSize - currentSize % blockSize); + currentSize = (currentSize / blockSize + 1) * blockSize; + return emptyBlock; + } else { + resize(currentSize + blockSize); + return Block(blocks.back(), blockSize); + } + } + + void BlockBuffer::resize(uint64_t size) { + reserve(size); + if 
(currentCapacity >= size) { + currentSize = size; + } else { + throw std::logic_error("Block buffer resize error"); + } + } + + void BlockBuffer::reserve(uint64_t newCapacity) { + while (currentCapacity < newCapacity) { + char* newBlockPtr = memoryPool.malloc(blockSize); + if (newBlockPtr != nullptr) { + blocks.push_back(newBlockPtr); + currentCapacity += blockSize; + } else { + break; + } + } + } + + void BlockBuffer::writeTo(OutputStream* output, WriterMetrics* metrics) { + if (currentSize == 0) { + return; + } + static uint64_t MAX_CHUNK_SIZE = 1024 * 1024 * 1024; + uint64_t chunkSize = std::min(output->getNaturalWriteSize(), MAX_CHUNK_SIZE); + if (chunkSize == 0) { + throw std::logic_error("Natural write size cannot be zero"); + } + uint64_t ioCount = 0; + uint64_t blockNumber = getBlockNumber(); + // if only exists one block, currentSize is equal to first block size + if (blockNumber == 1 && currentSize <= chunkSize) { + Block block = getBlock(0); + output->write(block.data, block.size); + ++ioCount; + } else { + char* chunk = memoryPool.malloc(chunkSize); + uint64_t chunkOffset = 0; + for (uint64_t i = 0; i < blockNumber; ++i) { + Block block = getBlock(i); + uint64_t blockOffset = 0; + while (blockOffset < block.size) { + // copy current block into chunk + uint64_t copySize = std::min(chunkSize - chunkOffset, block.size - blockOffset); + memcpy(chunk + chunkOffset, block.data + blockOffset, copySize); + chunkOffset += copySize; + blockOffset += copySize; + + // chunk is full + if (chunkOffset >= chunkSize) { + output->write(chunk, chunkSize); + chunkOffset = 0; + ++ioCount; + } + } + } + if (chunkOffset != 0) { + output->write(chunk, chunkOffset); + ++ioCount; + } + memoryPool.free(chunk); + } + + if (metrics != nullptr) { + metrics->IOCount.fetch_add(ioCount); + } + } +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/BlockBuffer.hh b/contrib/libs/apache/orc/c++/src/BlockBuffer.hh new file mode 100644 index 0000000000..0f5f78e3fe --- /dev/null 
+++ b/contrib/libs/apache/orc/c++/src/BlockBuffer.hh @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_BLOCK_BUFFER_HH +#define ORC_BLOCK_BUFFER_HH + +#include "orc/MemoryPool.hh" + +#include <vector> + +namespace orc { + + class OutputStream; + struct WriterMetrics; + /** + * BlockBuffer implements a memory allocation policy based on + * equal-length blocks. BlockBuffer will reserve multiple blocks + * for allocation. 
+ */ + class BlockBuffer { + private: + MemoryPool& memoryPool; + // current buffer size + uint64_t currentSize; + // maximal capacity (actual allocated memory) + uint64_t currentCapacity; + // unit for buffer expansion + const uint64_t blockSize; + // pointers to the start of each block + std::vector<char*> blocks; + + // non-copy-constructible + BlockBuffer(BlockBuffer& buffer) = delete; + BlockBuffer& operator=(BlockBuffer& buffer) = delete; + BlockBuffer(BlockBuffer&& buffer) = delete; + BlockBuffer& operator=(BlockBuffer&& buffer) = delete; + + public: + BlockBuffer(MemoryPool& pool, uint64_t blockSize); + + ~BlockBuffer(); + + /** + * Block points to a section of memory allocated by BlockBuffer, + * containing the corresponding physical memory address and available size. + */ + struct Block { + // the start of block + char* data; + // number of bytes available at data + uint64_t size; + + Block() : data(nullptr), size(0) {} + Block(char* _data, uint64_t _size) : data(_data), size(_size) {} + Block(const Block& block) = default; + ~Block() = default; + }; + + /** + * Get the allocated block object. + * The last allocated block size may be less than blockSize, + * and the rest of the blocks are all of size blockSize. + * @param blockIndex the index of blocks + * @return the allocated block object + */ + Block getBlock(uint64_t blockIndex) const; + + /** + * Get an empty block or allocate a new block to write. + * If the last allocated block size is less than blockSize, + * the size of empty block is equal to blockSize minus the size of + * the last allocated block size. Otherwise, the size of + * the empty block is equal to blockSize.
+ * @return a empty block object + */ + Block getNextBlock(); + + /** + * Get the number of blocks that are fully or partially occupied + */ + uint64_t getBlockNumber() const { + return (currentSize + blockSize - 1) / blockSize; + } + + uint64_t size() const { + return currentSize; + } + + uint64_t capacity() const { + return currentCapacity; + } + + void resize(uint64_t size); + /** + * Requests the BlockBuffer to contain at least newCapacity bytes. + * Reallocation happens if there is need of more space. + * @param newCapacity new capacity of BlockBuffer + */ + void reserve(uint64_t newCapacity); + /** + * Write the BlockBuffer content into OutputStream + * @param output the output stream to write to + * @param metrics the metrics of the writer + */ + void writeTo(OutputStream* output, WriterMetrics* metrics); + }; +} // namespace orc + +#endif diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.cc b/contrib/libs/apache/orc/c++/src/BloomFilter.cc index 8a1f1880e7..882c6f4252 100644 --- a/contrib/libs/apache/orc/c++/src/BloomFilter.cc +++ b/contrib/libs/apache/orc/c++/src/BloomFilter.cc @@ -22,11 +22,14 @@ namespace orc { constexpr uint64_t BITS_OF_LONG = 64; - constexpr uint8_t SHIFT_6_BITS = 6; - constexpr uint8_t SHIFT_3_BITS = 3; + constexpr uint8_t SHIFT_6_BITS = 6; + constexpr uint8_t SHIFT_3_BITS = 3; static bool isLittleEndian() { - static union { uint32_t i; char c[4]; } num = { 0x01020304 }; + static union { + uint32_t i; + char c[4]; + } num = {0x01020304}; return num.c[0] == 4; } @@ -34,11 +37,10 @@ namespace orc { * Implementation of BitSet */ BitSet::BitSet(uint64_t numBits) { - mData.resize(static_cast<size_t>(ceil( - static_cast<double>(numBits) / BITS_OF_LONG)), 0); + mData.resize(static_cast<size_t>(ceil(static_cast<double>(numBits) / BITS_OF_LONG)), 0); } - BitSet::BitSet(const uint64_t * bits, uint64_t numBits) { + BitSet::BitSet(const uint64_t* bits, uint64_t numBits) { // caller should make sure numBits is multiple of 64 
mData.resize(numBits >> SHIFT_6_BITS, 0); memcpy(mData.data(), bits, numBits >> SHIFT_3_BITS); @@ -59,8 +61,8 @@ namespace orc { void BitSet::merge(const BitSet& other) { if (mData.size() != other.mData.size()) { std::stringstream ss; - ss << "BitSet must be of equal length (" - << mData.size() << " != " << other.mData.size() << ")"; + ss << "BitSet must be of equal length (" << mData.size() << " != " << other.mData.size() + << ")"; throw std::logic_error(ss.str()); } @@ -73,7 +75,7 @@ namespace orc { memset(mData.data(), 0, sizeof(uint64_t) * mData.size()); } - const uint64_t * BitSet::getData() const { + const uint64_t* BitSet::getData() const { return mData.data(); } @@ -92,8 +94,8 @@ namespace orc { int32_t optimalNumOfHashFunctions(uint64_t expectedEntries, uint64_t numBits) { double n = static_cast<double>(expectedEntries); - return std::max<int32_t>(1, static_cast<int32_t>( - std::round(static_cast<double>(numBits) / n * std::log(2.0)))); + return std::max<int32_t>( + 1, static_cast<int32_t>(std::round(static_cast<double>(numBits) / n * std::log(2.0)))); } int32_t optimalNumOfBits(uint64_t expectedEntries, double fpp) { @@ -108,23 +110,20 @@ namespace orc { // probability' // Lets split up 64-bit hashcode into two 32-bit hash codes and employ // the technique mentioned in the above paper - inline uint64_t getBytesHash(const char * data, int64_t length) { + inline uint64_t getBytesHash(const char* data, int64_t length) { if (data == nullptr) { return Murmur3::NULL_HASHCODE; } - return Murmur3::hash64(reinterpret_cast<const uint8_t *>(data), - static_cast<uint32_t>(length)); + return Murmur3::hash64(reinterpret_cast<const uint8_t*>(data), static_cast<uint32_t>(length)); } /** * Implementation of BloomFilter */ BloomFilterImpl::BloomFilterImpl(uint64_t expectedEntries, double fpp) { - checkArgument(expectedEntries > 0, - "expectedEntries should be > 0"); - checkArgument(fpp > 0.0 && fpp < 1.0, - "False positive probability should be > 0.0 & < 1.0"); + 
checkArgument(expectedEntries > 0, "expectedEntries should be > 0"); + checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0"); uint64_t nb = static_cast<uint64_t>(optimalNumOfBits(expectedEntries, fpp)); // make 'mNumBits' multiple of 64 @@ -133,7 +132,7 @@ namespace orc { mBitSet.reset(new BitSet(mNumBits)); } - void BloomFilterImpl::addBytes(const char * data, int64_t length) { + void BloomFilterImpl::addBytes(const char* data, int64_t length) { uint64_t hash64 = getBytesHash(data, length); addHash(static_cast<int64_t>(hash64)); } @@ -142,7 +141,7 @@ namespace orc { addHash(getLongHash(data)); } - bool BloomFilterImpl::testBytes(const char * data, int64_t length) const { + bool BloomFilterImpl::testBytes(const char* data, int64_t length) const { uint64_t hash64 = getBytesHash(data, length); return testHash(static_cast<int64_t>(hash64)); } @@ -176,13 +175,13 @@ namespace orc { // caller should make sure input proto::BloomFilter is valid since // no check will be performed in the following constructor BloomFilterImpl::BloomFilterImpl(const proto::BloomFilter& bloomFilter) { - mNumHashFunctions = static_cast<int32_t>(bloomFilter.numhashfunctions()); + mNumHashFunctions = static_cast<int32_t>(bloomFilter.num_hash_functions()); const std::string& bitsetStr = bloomFilter.utf8bitset(); mNumBits = bitsetStr.size() << SHIFT_3_BITS; checkArgument(mNumBits % BITS_OF_LONG == 0, "numBits should be multiple of 64!"); - const uint64_t * bitset = reinterpret_cast<const uint64_t *>(bitsetStr.data()); + const uint64_t* bitset = reinterpret_cast<const uint64_t*>(bitsetStr.data()); if (isLittleEndian()) { mBitSet.reset(new BitSet(bitset, mNumBits)); } else { @@ -204,7 +203,7 @@ namespace orc { addLong(reinterpret_cast<int64_t&>(data)); } - bool BloomFilterImpl::testDouble(double data) const{ + bool BloomFilterImpl::testDouble(double data) const { return testLong(reinterpret_cast<int64_t&>(data)); } @@ -227,7 +226,7 @@ namespace orc { } } - bool 
BloomFilterImpl::testHash(int64_t hash64) const{ + bool BloomFilterImpl::testHash(int64_t hash64) const { int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff); // In Java codes, we use "hash64 >>> 32" which is an unsigned shift op. // So we cast hash64 to uint64_t here for an unsigned right shift. @@ -251,10 +250,8 @@ namespace orc { if (mNumBits != other.mNumBits || mNumHashFunctions != other.mNumHashFunctions) { std::stringstream ss; ss << "BloomFilters are not compatible for merging: " - << "this: numBits:" << mNumBits - << ",numHashFunctions:" << mNumHashFunctions - << ", that: numBits:" << other.mNumBits - << ",numHashFunctions:" << other.mNumHashFunctions; + << "this: numBits:" << mNumBits << ",numHashFunctions:" << mNumHashFunctions + << ", that: numBits:" << other.mNumBits << ",numHashFunctions:" << other.mNumHashFunctions; throw std::logic_error(ss.str()); } @@ -266,17 +263,17 @@ namespace orc { } void BloomFilterImpl::serialize(proto::BloomFilter& bloomFilter) const { - bloomFilter.set_numhashfunctions(static_cast<uint32_t>(mNumHashFunctions)); + bloomFilter.set_num_hash_functions(static_cast<uint32_t>(mNumHashFunctions)); // According to ORC standard, the encoding is a sequence of bytes with // a little endian encoding in the utf8bitset field. 
if (isLittleEndian()) { // bytes are already organized in little endian; thus no conversion needed - const char * bitset = reinterpret_cast<const char *>(mBitSet->getData()); + const char* bitset = reinterpret_cast<const char*>(mBitSet->getData()); bloomFilter.set_utf8bitset(bitset, sizeInBytes()); } else { std::vector<uint64_t> bitset(sizeInBytes() / sizeof(uint64_t), 0); - const uint64_t * longs = mBitSet->getData(); + const uint64_t* longs = mBitSet->getData(); for (size_t i = 0; i != bitset.size(); ++i) { uint64_t& dst = bitset[i]; const uint64_t src = longs[i]; @@ -290,8 +287,7 @@ namespace orc { } bool BloomFilterImpl::operator==(const BloomFilterImpl& other) const { - return mNumBits == other.mNumBits && - mNumHashFunctions == other.mNumHashFunctions && + return mNumBits == other.mNumBits && mNumHashFunctions == other.mNumHashFunctions && *mBitSet == *other.mBitSet; } @@ -300,29 +296,24 @@ namespace orc { } std::unique_ptr<BloomFilter> BloomFilterUTF8Utils::deserialize( - const proto::Stream_Kind& streamKind, - const proto::ColumnEncoding& encoding, - const proto::BloomFilter& bloomFilter) { - - std::unique_ptr<BloomFilter> ret(nullptr); - + const proto::Stream_Kind& streamKind, const proto::ColumnEncoding& encoding, + const proto::BloomFilter& bloomFilter) { // only BLOOM_FILTER_UTF8 is supported if (streamKind != proto::Stream_Kind_BLOOM_FILTER_UTF8) { - return ret; + return nullptr; } // make sure we don't use unknown encodings or original timestamp encodings - if (!encoding.has_bloomencoding() || encoding.bloomencoding() != 1) { - return ret; + if (!encoding.has_bloom_encoding() || encoding.bloom_encoding() != 1) { + return nullptr; } // make sure all required fields exist - if (!bloomFilter.has_numhashfunctions() || !bloomFilter.has_utf8bitset()) { - return ret; + if (!bloomFilter.has_num_hash_functions() || !bloomFilter.has_utf8bitset()) { + return nullptr; } - ret.reset(new BloomFilterImpl(bloomFilter)); - return ret; + return 
std::make_unique<BloomFilterImpl>(bloomFilter); } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.hh b/contrib/libs/apache/orc/c++/src/BloomFilter.hh index cf18a46fd9..d72961a83c 100644 --- a/contrib/libs/apache/orc/c++/src/BloomFilter.hh +++ b/contrib/libs/apache/orc/c++/src/BloomFilter.hh @@ -33,7 +33,7 @@ namespace orc { * for index bounds nor expand the bit set size if the specified index is greater than the size. */ class BitSet { - public: + public: /** * Creates an empty BitSet * @@ -47,7 +47,7 @@ namespace orc { * @param bits - serialized uint64_t buffer of bitset * @param numBits - number of bits used */ - BitSet(const uint64_t * bits, uint64_t numBits); + BitSet(const uint64_t* bits, uint64_t numBits); /** * Sets the bit at specified index. @@ -82,14 +82,14 @@ namespace orc { /** * Gets underlying raw data */ - const uint64_t * getData() const; + const uint64_t* getData() const; /** * Compares two BitSets */ bool operator==(const BitSet& other) const; - private: + private: std::vector<uint64_t> mData; }; @@ -120,14 +120,14 @@ namespace orc { * BloomFilterUtf8, which always uses UTF8 for the encoding. 
*/ class BloomFilterImpl : public BloomFilter { - public: + public: /** * Creates an empty BloomFilter * * @param expectedEntries - number of entries it will hold * @param fpp - false positive probability */ - BloomFilterImpl(uint64_t expectedEntries, double fpp=DEFAULT_FPP); + BloomFilterImpl(uint64_t expectedEntries, double fpp = DEFAULT_FPP); /** * Creates a BloomFilter by deserializing the proto-buf version @@ -139,14 +139,14 @@ namespace orc { /** * Adds a new element to the BloomFilter */ - void addBytes(const char * data, int64_t length); + void addBytes(const char* data, int64_t length); void addLong(int64_t data); void addDouble(double data); /** * Test if the element exists in BloomFilter */ - bool testBytes(const char * data, int64_t length) const override; + bool testBytes(const char* data, int64_t length) const override; bool testLong(int64_t data) const override; bool testDouble(double data) const override; @@ -160,7 +160,7 @@ namespace orc { bool operator==(const BloomFilterImpl& other) const; - private: + private: friend struct BloomFilterUTF8Utils; friend class TestBloomFilter_testBloomFilterBasicOperations_Test; @@ -172,7 +172,7 @@ namespace orc { void serialize(proto::BloomFilter& bloomFilter) const; - private: + private: static constexpr double DEFAULT_FPP = 0.05; uint64_t mNumBits; int32_t mNumHashFunctions; @@ -186,25 +186,24 @@ namespace orc { } // deserialize BloomFilter from protobuf - static std::unique_ptr<BloomFilter> - deserialize(const proto::Stream_Kind& streamKind, - const proto::ColumnEncoding& columnEncoding, - const proto::BloomFilter& bloomFilter); + static std::unique_ptr<BloomFilter> deserialize(const proto::Stream_Kind& streamKind, + const proto::ColumnEncoding& columnEncoding, + const proto::BloomFilter& bloomFilter); }; // Thomas Wang's integer hash function // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm // Put this in header file so tests can use it as well. 
inline int64_t getLongHash(int64_t key) { - key = (~key) + (key << 21); // key = (key << 21) - key - 1; + key = (~key) + (key << 21); // key = (key << 21) - key - 1; key = key ^ (key >> 24); - key = (key + (key << 3)) + (key << 8); // key * 265 + key = (key + (key << 3)) + (key << 8); // key * 265 key = key ^ (key >> 14); - key = (key + (key << 2)) + (key << 4); // key * 21 + key = (key + (key << 2)) + (key << 4); // key * 21 key = key ^ (key >> 28); key = key + (key << 31); return key; } -} +} // namespace orc -#endif //ORC_BLOOMFILTER_IMPL_HH +#endif // ORC_BLOOMFILTER_IMPL_HH diff --git a/contrib/libs/apache/orc/c++/src/Bpacking.hh b/contrib/libs/apache/orc/c++/src/Bpacking.hh new file mode 100644 index 0000000000..f55e986d8d --- /dev/null +++ b/contrib/libs/apache/orc/c++/src/Bpacking.hh @@ -0,0 +1,34 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_BPACKING_HH +#define ORC_BPACKING_HH + +#include <cstdint> + +namespace orc { + class RleDecoderV2; + + class BitUnpack { + public: + static void readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs); + }; +} // namespace orc + +#endif diff --git a/contrib/libs/apache/orc/c++/src/BpackingDefault.cc b/contrib/libs/apache/orc/c++/src/BpackingDefault.cc new file mode 100644 index 0000000000..5a80bc6fb1 --- /dev/null +++ b/contrib/libs/apache/orc/c++/src/BpackingDefault.cc @@ -0,0 +1,368 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BpackingDefault.hh" +#include "RLEv2.hh" +#include "Utils.hh" + +namespace orc { + + UnpackDefault::UnpackDefault(RleDecoderV2* dec) : decoder(dec) { + // PASS + } + + UnpackDefault::~UnpackDefault() { + // PASS + } + + void UnpackDefault::unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8. 
+ while (decoder->getBitsLeft() > 0 && curIdx < offset + len) { + decoder->setBitsLeft(decoder->getBitsLeft() - 4); + data[curIdx++] = (decoder->getCurByte() >> decoder->getBitsLeft()) & 15; + } + if (curIdx == offset + len) return; + + // Exhaust the buffer + uint64_t numGroups = (offset + len - curIdx) / 2; + numGroups = std::min(numGroups, static_cast<uint64_t>(decoder->bufLength())); + // Avoid updating 'bufferStart' inside the loop. + auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + uint32_t localByte; + for (uint64_t i = 0; i < numGroups; ++i) { + localByte = *buffer++; + data[curIdx] = (localByte >> 4) & 15; + data[curIdx + 1] = localByte & 15; + curIdx += 2; + } + decoder->setBufStart(reinterpret_cast<char*>(buffer)); + if (curIdx == offset + len) return; + + // readByte() will update 'bufferStart' and 'bufferEnd' + decoder->setCurByte(decoder->readByte()); + decoder->setBitsLeft(8); + } + } + + void UnpackDefault::unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength(); + bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); + // Avoid updating 'bufferStart' inside the loop. + auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + data[curIdx++] = *buffer++; + } + decoder->setBufStart(reinterpret_cast<char*>(buffer)); + if (curIdx == offset + len) return; + + // readByte() will update 'bufferStart' and 'bufferEnd'. 
+ data[curIdx++] = decoder->readByte(); + } + } + + void UnpackDefault::unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength() / 2; + bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); + uint16_t b0, b1; + // Avoid updating 'bufferStart' inside the loop. + auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast<uint16_t>(*buffer); + b1 = static_cast<uint16_t>(*(buffer + 1)); + buffer += 2; + data[curIdx++] = (b0 << 8) | b1; + } + decoder->setBufStart(reinterpret_cast<char*>(buffer)); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = decoder->readByte(); + b1 = decoder->readByte(); + data[curIdx++] = (b0 << 8) | b1; + } + } + + void UnpackDefault::unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength() / 3; + bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); + uint32_t b0, b1, b2; + // Avoid updating 'bufferStart' inside the loop. + auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast<uint32_t>(*buffer); + b1 = static_cast<uint32_t>(*(buffer + 1)); + b2 = static_cast<uint32_t>(*(buffer + 2)); + buffer += 3; + data[curIdx++] = static_cast<int64_t>((b0 << 16) | (b1 << 8) | b2); + } + //////decoder->bufferStart += bufferNum * 3; + decoder->setBufStart(reinterpret_cast<char*>(buffer)); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
+ b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + data[curIdx++] = static_cast<int64_t>((b0 << 16) | (b1 << 8) | b2); + } + } + + void UnpackDefault::unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength() / 4; + bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); + uint32_t b0, b1, b2, b3; + // Avoid updating 'bufferStart' inside the loop. + auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast<uint32_t>(*buffer); + b1 = static_cast<uint32_t>(*(buffer + 1)); + b2 = static_cast<uint32_t>(*(buffer + 2)); + b3 = static_cast<uint32_t>(*(buffer + 3)); + buffer += 4; + data[curIdx++] = static_cast<int64_t>((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); + } + decoder->setBufStart(reinterpret_cast<char*>(buffer)); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + b3 = decoder->readByte(); + data[curIdx++] = static_cast<int64_t>((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); + } + } + + void UnpackDefault::unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength() / 5; + bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4; + // Avoid updating 'bufferStart' inside the loop. 
+ auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast<uint32_t>(*buffer); + b1 = static_cast<uint32_t>(*(buffer + 1)); + b2 = static_cast<uint32_t>(*(buffer + 2)); + b3 = static_cast<uint32_t>(*(buffer + 3)); + b4 = static_cast<uint32_t>(*(buffer + 4)); + buffer += 5; + data[curIdx++] = + static_cast<int64_t>((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); + } + decoder->setBufStart(reinterpret_cast<char*>(buffer)); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + b3 = decoder->readByte(); + b4 = decoder->readByte(); + data[curIdx++] = static_cast<int64_t>((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); + } + } + + void UnpackDefault::unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength() / 6; + bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4, b5; + // Avoid updating 'bufferStart' inside the loop. + auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast<uint32_t>(*buffer); + b1 = static_cast<uint32_t>(*(buffer + 1)); + b2 = static_cast<uint32_t>(*(buffer + 2)); + b3 = static_cast<uint32_t>(*(buffer + 3)); + b4 = static_cast<uint32_t>(*(buffer + 4)); + b5 = static_cast<uint32_t>(*(buffer + 5)); + buffer += 6; + data[curIdx++] = static_cast<int64_t>((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | + (b4 << 8) | b5); + } + decoder->setBufStart(reinterpret_cast<char*>(buffer)); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
+ b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + b3 = decoder->readByte(); + b4 = decoder->readByte(); + b5 = decoder->readByte(); + data[curIdx++] = + static_cast<int64_t>((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); + } + } + + void UnpackDefault::unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength() / 7; + bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4, b5, b6; + // Avoid updating 'bufferStart' inside the loop. + auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast<uint32_t>(*buffer); + b1 = static_cast<uint32_t>(*(buffer + 1)); + b2 = static_cast<uint32_t>(*(buffer + 2)); + b3 = static_cast<uint32_t>(*(buffer + 3)); + b4 = static_cast<uint32_t>(*(buffer + 4)); + b5 = static_cast<uint32_t>(*(buffer + 5)); + b6 = static_cast<uint32_t>(*(buffer + 6)); + buffer += 7; + data[curIdx++] = static_cast<int64_t>((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | + (b4 << 16) | (b5 << 8) | b6); + } + decoder->setBufStart(reinterpret_cast<char*>(buffer)); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
+ b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + b3 = decoder->readByte(); + b4 = decoder->readByte(); + b5 = decoder->readByte(); + b6 = decoder->readByte(); + data[curIdx++] = static_cast<int64_t>((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | + (b4 << 16) | (b5 << 8) | b6); + } + } + + void UnpackDefault::unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength() / 8; + bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4, b5, b6, b7; + // Avoid updating 'bufferStart' inside the loop. + auto* buffer = reinterpret_cast<unsigned char*>(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast<uint32_t>(*buffer); + b1 = static_cast<uint32_t>(*(buffer + 1)); + b2 = static_cast<uint32_t>(*(buffer + 2)); + b3 = static_cast<uint32_t>(*(buffer + 3)); + b4 = static_cast<uint32_t>(*(buffer + 4)); + b5 = static_cast<uint32_t>(*(buffer + 5)); + b6 = static_cast<uint32_t>(*(buffer + 6)); + b7 = static_cast<uint32_t>(*(buffer + 7)); + buffer += 8; + data[curIdx++] = static_cast<int64_t>((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | + (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); + } + decoder->setBufStart(reinterpret_cast<char*>(buffer)); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
+ b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + b3 = decoder->readByte(); + b4 = decoder->readByte(); + b5 = decoder->readByte(); + b6 = decoder->readByte(); + b7 = decoder->readByte(); + data[curIdx++] = static_cast<int64_t>((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | + (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); + } + } + + void UnpackDefault::plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { + for (uint64_t i = offset; i < (offset + len); i++) { + uint64_t result = 0; + uint64_t bitsLeftToRead = fbs; + while (bitsLeftToRead > decoder->getBitsLeft()) { + result <<= decoder->getBitsLeft(); + result |= decoder->getCurByte() & ((1 << decoder->getBitsLeft()) - 1); + bitsLeftToRead -= decoder->getBitsLeft(); + decoder->setCurByte(decoder->readByte()); + decoder->setBitsLeft(8); + } + + // handle the left over bits + if (bitsLeftToRead > 0) { + result <<= bitsLeftToRead; + decoder->setBitsLeft(decoder->getBitsLeft() - static_cast<uint32_t>(bitsLeftToRead)); + result |= (decoder->getCurByte() >> decoder->getBitsLeft()) & ((1 << bitsLeftToRead) - 1); + } + data[i] = static_cast<int64_t>(result); + } + } + + void BitUnpackDefault::readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, + uint64_t len, uint64_t fbs) { + UnpackDefault unpackDefault(decoder); + switch (fbs) { + case 4: + unpackDefault.unrolledUnpack4(data, offset, len); + break; + case 8: + unpackDefault.unrolledUnpack8(data, offset, len); + break; + case 16: + unpackDefault.unrolledUnpack16(data, offset, len); + break; + case 24: + unpackDefault.unrolledUnpack24(data, offset, len); + break; + case 32: + unpackDefault.unrolledUnpack32(data, offset, len); + break; + case 40: + unpackDefault.unrolledUnpack40(data, offset, len); + break; + case 48: + unpackDefault.unrolledUnpack48(data, offset, len); + break; + case 56: + unpackDefault.unrolledUnpack56(data, offset, len); + break; + case 64: + unpackDefault.unrolledUnpack64(data, 
offset, len); + break; + default: + // Fallback to the default implementation for deprecated bit size. + unpackDefault.plainUnpackLongs(data, offset, len, fbs); + break; + } + } + +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/BpackingDefault.hh b/contrib/libs/apache/orc/c++/src/BpackingDefault.hh new file mode 100644 index 0000000000..0a58234495 --- /dev/null +++ b/contrib/libs/apache/orc/c++/src/BpackingDefault.hh @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_BPACKINGDEFAULT_HH +#define ORC_BPACKINGDEFAULT_HH + +#include <cstdint> +#include <cstdlib> + +#include "Bpacking.hh" + +namespace orc { + class RleDecoderV2; + + class UnpackDefault { + public: + UnpackDefault(RleDecoderV2* dec); + ~UnpackDefault(); + + void unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len); + + void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); + + private: + RleDecoderV2* decoder; + }; + + class BitUnpackDefault : public BitUnpack { + public: + static void readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs); + }; + +} // namespace orc + +#endif diff --git a/contrib/libs/apache/orc/c++/src/ByteRLE.cc b/contrib/libs/apache/orc/c++/src/ByteRLE.cc index 1c4a645167..b81d282e35 100644 --- a/contrib/libs/apache/orc/c++/src/ByteRLE.cc +++ b/contrib/libs/apache/orc/c++/src/ByteRLE.cc @@ -16,12 +16,13 @@ * limitations under the License. 
*/ +#include <string.h> #include <algorithm> #include <iostream> -#include <string.h> #include <utility> #include "ByteRLE.hh" +#include "Utils.hh" #include "orc/Exceptions.hh" namespace orc { @@ -35,7 +36,7 @@ namespace orc { } class ByteRleEncoderImpl : public ByteRleEncoder { - public: + public: ByteRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output); virtual ~ByteRleEncoderImpl() override; @@ -46,8 +47,7 @@ namespace orc { * @param notNull If the pointer is null, all values are read. If the * pointer is not null, positions that are false are skipped. */ - virtual void add(const char* data, uint64_t numValues, - const char* notNull) override; + virtual void add(const char* data, uint64_t numValues, const char* notNull) override; /** * Get size of buffer used so far. @@ -68,7 +68,7 @@ namespace orc { */ void reset(); - protected: + protected: std::unique_ptr<BufferedOutputStream> outputStream; char* literals; int numLiterals; @@ -83,22 +83,21 @@ namespace orc { void write(char c); }; - ByteRleEncoderImpl::ByteRleEncoderImpl( - std::unique_ptr<BufferedOutputStream> output) - : outputStream(std::move(output)) { + ByteRleEncoderImpl::ByteRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output) + : outputStream(std::move(output)) { literals = new char[MAX_LITERAL_SIZE]; reset(); } ByteRleEncoderImpl::~ByteRleEncoderImpl() { // PASS - delete [] literals; + delete[] literals; } void ByteRleEncoderImpl::writeByte(char c) { if (bufferPosition == bufferLength) { int addedSize = 0; - if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) { + if (!outputStream->Next(reinterpret_cast<void**>(&buffer), &addedSize)) { throw std::bad_alloc(); } bufferPosition = 0; @@ -107,10 +106,7 @@ namespace orc { buffer[bufferPosition++] = c; } - void ByteRleEncoderImpl::add( - const char* data, - uint64_t numValues, - const char* notNull) { + void ByteRleEncoderImpl::add(const char* data, uint64_t numValues, const char* notNull) { for (uint64_t i = 0; i < 
numValues; ++i) { if (!notNull || notNull[i]) { write(data[i]); @@ -121,8 +117,7 @@ namespace orc { void ByteRleEncoderImpl::writeValues() { if (numLiterals != 0) { if (repeat) { - writeByte( - static_cast<char>(numLiterals - static_cast<int>(MINIMUM_REPEAT))); + writeByte(static_cast<char>(numLiterals - static_cast<int>(MINIMUM_REPEAT))); writeByte(literals[0]); } else { writeByte(static_cast<char>(-numLiterals)); @@ -189,7 +184,7 @@ namespace orc { return outputStream->getSize(); } - void ByteRleEncoderImpl::recordPosition(PositionRecorder *recorder) const { + void ByteRleEncoderImpl::recordPosition(PositionRecorder* recorder) const { uint64_t flushedSize = outputStream->getSize(); uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition); if (outputStream->isCompressed()) { @@ -220,14 +215,13 @@ namespace orc { reset(); } - std::unique_ptr<ByteRleEncoder> createByteRleEncoder - (std::unique_ptr<BufferedOutputStream> output) { - return std::unique_ptr<ByteRleEncoder>(new ByteRleEncoderImpl - (std::move(output))); + std::unique_ptr<ByteRleEncoder> createByteRleEncoder( + std::unique_ptr<BufferedOutputStream> output) { + return std::make_unique<ByteRleEncoderImpl>(std::move(output)); } class BooleanRleEncoderImpl : public ByteRleEncoderImpl { - public: + public: BooleanRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output); virtual ~BooleanRleEncoderImpl() override; @@ -238,8 +232,7 @@ namespace orc { * @param notNull If the pointer is null, all values are read. If the * pointer is not null, positions that are false are skipped. 
*/ - virtual void add(const char* data, uint64_t numValues, - const char* notNull) override; + virtual void add(const char* data, uint64_t numValues, const char* notNull) override; /** * Flushing underlying BufferedOutputStream @@ -248,15 +241,15 @@ namespace orc { virtual void recordPosition(PositionRecorder* recorder) const override; - private: + virtual void suppress() override; + + private: int bitsRemained; char current; - }; - BooleanRleEncoderImpl::BooleanRleEncoderImpl( - std::unique_ptr<BufferedOutputStream> output) - : ByteRleEncoderImpl(std::move(output)) { + BooleanRleEncoderImpl::BooleanRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output) + : ByteRleEncoderImpl(std::move(output)) { bitsRemained = 8; current = static_cast<char>(0); } @@ -265,10 +258,7 @@ namespace orc { // PASS } - void BooleanRleEncoderImpl::add( - const char* data, - uint64_t numValues, - const char* notNull) { + void BooleanRleEncoderImpl::add(const char* data, uint64_t numValues, const char* notNull) { for (uint64_t i = 0; i < numValues; ++i) { if (bitsRemained == 0) { write(current); @@ -277,8 +267,7 @@ namespace orc { } if (!notNull || notNull[i]) { if (!data || data[i]) { - current = - static_cast<char>(current | (0x80 >> (8 - bitsRemained))); + current = static_cast<char>(current | (0x80 >> (8 - bitsRemained))); } --bitsRemained; } @@ -304,43 +293,49 @@ namespace orc { recorder->add(static_cast<uint64_t>(8 - bitsRemained)); } - std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder - (std::unique_ptr<BufferedOutputStream> output) { - BooleanRleEncoderImpl* encoder = - new BooleanRleEncoderImpl(std::move(output)) ; - return std::unique_ptr<ByteRleEncoder>( - reinterpret_cast<ByteRleEncoder*>(encoder)); + void BooleanRleEncoderImpl::suppress() { + ByteRleEncoderImpl::suppress(); + bitsRemained = 8; + current = static_cast<char>(0); + } + + std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder( + std::unique_ptr<BufferedOutputStream> output) { + BooleanRleEncoderImpl* 
encoder = new BooleanRleEncoderImpl(std::move(output)); + return std::unique_ptr<ByteRleEncoder>(reinterpret_cast<ByteRleEncoder*>(encoder)); } ByteRleDecoder::~ByteRleDecoder() { // PASS } - class ByteRleDecoderImpl: public ByteRleDecoder { - public: - ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input); + class ByteRleDecoderImpl : public ByteRleDecoder { + public: + ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input, ReaderMetrics* metrics); - virtual ~ByteRleDecoderImpl(); + ~ByteRleDecoderImpl() override; /** * Seek to a particular spot. */ - virtual void seek(PositionProvider&); + virtual void seek(PositionProvider&) override; /** * Seek over a given number of values. */ - virtual void skip(uint64_t numValues); + virtual void skip(uint64_t numValues) override; /** * Read a number of values into the batch. */ - virtual void next(char* data, uint64_t numValues, char* notNull); + virtual void next(char* data, uint64_t numValues, char* notNull) override; - protected: + protected: + void nextInternal(char* data, uint64_t numValues, char* notNull); inline void nextBuffer(); inline signed char readByte(); inline void readHeader(); + inline void reset(); std::unique_ptr<SeekableInputStream> inputStream; size_t remainingValues; @@ -348,9 +343,11 @@ namespace orc { const char* bufferStart; const char* bufferEnd; bool repeating; + ReaderMetrics* metrics; }; void ByteRleDecoderImpl::nextBuffer() { + SCOPED_MINUS_STOPWATCH(metrics, ByteDecodingLatencyUs); int bufferLength; const void* bufferPointer; bool result = inputStream->Next(&bufferPointer, &bufferLength); @@ -365,7 +362,7 @@ namespace orc { if (bufferStart == bufferEnd) { nextBuffer(); } - return *(bufferStart++); + return static_cast<signed char>(*(bufferStart++)); } void ByteRleDecoderImpl::readHeader() { @@ -376,13 +373,11 @@ namespace orc { } else { remainingValues = static_cast<size_t>(ch) + MINIMUM_REPEAT; repeating = true; - value = readByte(); + value = static_cast<char>(readByte()); } 
} - ByteRleDecoderImpl::ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> - input) { - inputStream = std::move(input); + void ByteRleDecoderImpl::reset() { repeating = false; remainingValues = 0; value = 0; @@ -390,6 +385,13 @@ namespace orc { bufferEnd = nullptr; } + ByteRleDecoderImpl::ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input, + ReaderMetrics* _metrics) + : metrics(_metrics) { + inputStream = std::move(input); + reset(); + } + ByteRleDecoderImpl::~ByteRleDecoderImpl() { // PASS } @@ -397,15 +399,14 @@ namespace orc { void ByteRleDecoderImpl::seek(PositionProvider& location) { // move the input stream inputStream->seek(location); - // force a re-read from the stream - bufferEnd = bufferStart; - // read a new header - readHeader(); + // reset the decoder status and lazily call readHeader() + reset(); // skip ahead the given number of records ByteRleDecoderImpl::skip(location.next()); } void ByteRleDecoderImpl::skip(uint64_t numValues) { + SCOPED_STOPWATCH(metrics, ByteDecodingLatencyUs, ByteDecodingCall); while (numValues > 0) { if (remainingValues == 0) { readHeader(); @@ -422,8 +423,7 @@ namespace orc { nextBuffer(); } size_t skipSize = std::min(static_cast<size_t>(consumedBytes), - static_cast<size_t>(bufferEnd - - bufferStart)); + static_cast<size_t>(bufferEnd - bufferStart)); bufferStart += skipSize; consumedBytes -= skipSize; } @@ -431,8 +431,12 @@ namespace orc { } } - void ByteRleDecoderImpl::next(char* data, uint64_t numValues, - char* notNull) { + void ByteRleDecoderImpl::next(char* data, uint64_t numValues, char* notNull) { + SCOPED_STOPWATCH(metrics, ByteDecodingLatencyUs, ByteDecodingCall); + nextInternal(data, numValues, notNull); + } + + void ByteRleDecoderImpl::nextInternal(char* data, uint64_t numValues, char* notNull) { uint64_t position = 0; // skip over null values while (notNull && position < numValues && !notNull[position]) { @@ -444,12 +448,11 @@ namespace orc { readHeader(); } // how many do we read out of this 
block? - size_t count = std::min(static_cast<size_t>(numValues - position), - remainingValues); + size_t count = std::min(static_cast<size_t>(numValues - position), remainingValues); uint64_t consumed = 0; if (repeating) { if (notNull) { - for(uint64_t i=0; i < count; ++i) { + for (uint64_t i = 0; i < count; ++i) { if (notNull[position + i]) { data[position + i] = value; consumed += 1; @@ -461,9 +464,9 @@ namespace orc { } } else { if (notNull) { - for(uint64_t i=0; i < count; ++i) { + for (uint64_t i = 0; i < count; ++i) { if (notNull[position + i]) { - data[position + i] = readByte(); + data[position + i] = static_cast<char>(readByte()); consumed += 1; } } @@ -473,9 +476,8 @@ namespace orc { if (bufferStart == bufferEnd) { nextBuffer(); } - uint64_t copyBytes = - std::min(static_cast<uint64_t>(count - i), - static_cast<uint64_t>(bufferEnd - bufferStart)); + uint64_t copyBytes = std::min(static_cast<uint64_t>(count - i), + static_cast<uint64_t>(bufferEnd - bufferStart)); memcpy(data + position + i, bufferStart, copyBytes); bufferStart += copyBytes; i += copyBytes; @@ -492,41 +494,40 @@ namespace orc { } } - std::unique_ptr<ByteRleDecoder> createByteRleDecoder - (std::unique_ptr<SeekableInputStream> input) { - return std::unique_ptr<ByteRleDecoder>(new ByteRleDecoderImpl - (std::move(input))); + std::unique_ptr<ByteRleDecoder> createByteRleDecoder(std::unique_ptr<SeekableInputStream> input, + ReaderMetrics* metrics) { + return std::make_unique<ByteRleDecoderImpl>(std::move(input), metrics); } - class BooleanRleDecoderImpl: public ByteRleDecoderImpl { - public: - BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input); + class BooleanRleDecoderImpl : public ByteRleDecoderImpl { + public: + BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input, ReaderMetrics* metrics); - virtual ~BooleanRleDecoderImpl(); + ~BooleanRleDecoderImpl() override; /** * Seek to a particular spot. 
*/ - virtual void seek(PositionProvider&); + virtual void seek(PositionProvider&) override; /** * Seek over a given number of values. */ - virtual void skip(uint64_t numValues); + virtual void skip(uint64_t numValues) override; /** * Read a number of values into the batch. */ - virtual void next(char* data, uint64_t numValues, char* notNull); + virtual void next(char* data, uint64_t numValues, char* notNull) override; - protected: + protected: size_t remainingBits; char lastByte; }; - BooleanRleDecoderImpl::BooleanRleDecoderImpl - (std::unique_ptr<SeekableInputStream> input - ): ByteRleDecoderImpl(std::move(input)) { + BooleanRleDecoderImpl::BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input, + ReaderMetrics* _metrics) + : ByteRleDecoderImpl(std::move(input), _metrics) { remainingBits = 0; lastByte = 0; } @@ -564,35 +565,33 @@ namespace orc { } } - void BooleanRleDecoderImpl::next(char* data, uint64_t numValues, - char* notNull) { + void BooleanRleDecoderImpl::next(char* data, uint64_t numValues, char* notNull) { + SCOPED_STOPWATCH(metrics, ByteDecodingLatencyUs, ByteDecodingCall); // next spot to fill in uint64_t position = 0; // use up any remaining bits if (notNull) { - while(remainingBits > 0 && position < numValues) { + while (remainingBits > 0 && position < numValues) { if (notNull[position]) { remainingBits -= 1; - data[position] = (static_cast<unsigned char>(lastByte) >> - remainingBits) & 0x1; + data[position] = (static_cast<unsigned char>(lastByte) >> remainingBits) & 0x1; } else { data[position] = 0; } position += 1; } } else { - while(remainingBits > 0 && position < numValues) { + while (remainingBits > 0 && position < numValues) { remainingBits -= 1; - data[position++] = (static_cast<unsigned char>(lastByte) >> - remainingBits) & 0x1; + data[position++] = (static_cast<unsigned char>(lastByte) >> remainingBits) & 0x1; } } // count the number of nonNulls remaining uint64_t nonNulls = numValues - position; if (notNull) { - for(uint64_t 
i=position; i < numValues; ++i) { + for (uint64_t i = position; i < numValues; ++i) { if (!notNull[i]) { nonNulls -= 1; } @@ -607,14 +606,14 @@ namespace orc { } else if (position < numValues) { // read the new bytes into the array uint64_t bytesRead = (nonNulls + 7) / 8; - ByteRleDecoderImpl::next(data + position, bytesRead, nullptr); + ByteRleDecoderImpl::nextInternal(data + position, bytesRead, nullptr); lastByte = data[position + bytesRead - 1]; remainingBits = bytesRead * 8 - nonNulls; // expand the array backwards so that we don't clobber the data uint64_t bitsLeft = bytesRead * 8 - remainingBits; if (notNull) { - for(int64_t i=static_cast<int64_t>(numValues) - 1; - i >= static_cast<int64_t>(position); --i) { + for (int64_t i = static_cast<int64_t>(numValues) - 1; i >= static_cast<int64_t>(position); + --i) { if (notNull[i]) { uint64_t shiftPosn = (-bitsLeft) % 8; data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1; @@ -624,8 +623,8 @@ namespace orc { } } } else { - for(int64_t i=static_cast<int64_t>(numValues) - 1; - i >= static_cast<int64_t>(position); --i, --bitsLeft) { + for (int64_t i = static_cast<int64_t>(numValues) - 1; i >= static_cast<int64_t>(position); + --i, --bitsLeft) { uint64_t shiftPosn = (-bitsLeft) % 8; data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1; } @@ -633,11 +632,8 @@ namespace orc { } } - std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder - (std::unique_ptr<SeekableInputStream> input) { - BooleanRleDecoderImpl* decoder = - new BooleanRleDecoderImpl(std::move(input)); - return std::unique_ptr<ByteRleDecoder>( - reinterpret_cast<ByteRleDecoder*>(decoder)); + std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder( + std::unique_ptr<SeekableInputStream> input, ReaderMetrics* metrics) { + return std::make_unique<BooleanRleDecoderImpl>(std::move(input), metrics); } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/ByteRLE.hh b/contrib/libs/apache/orc/c++/src/ByteRLE.hh index 
2f6e2eb4df..bd19f52ecc 100644 --- a/contrib/libs/apache/orc/c++/src/ByteRLE.hh +++ b/contrib/libs/apache/orc/c++/src/ByteRLE.hh @@ -27,7 +27,7 @@ namespace orc { class ByteRleEncoder { - public: + public: virtual ~ByteRleEncoder(); /** @@ -37,8 +37,7 @@ namespace orc { * @param notNull If the pointer is null, all values are read. If the * pointer is not null, positions that are false are skipped. */ - virtual void add(const char* data, uint64_t numValues, - const char* notNull) = 0; + virtual void add(const char* data, uint64_t numValues, const char* notNull) = 0; /** * Get size of buffer used so far. @@ -63,7 +62,7 @@ namespace orc { }; class ByteRleDecoder { - public: + public: virtual ~ByteRleDecoder(); /** @@ -90,22 +89,23 @@ namespace orc { * Create a byte RLE encoder. * @param output the output stream to write to */ - std::unique_ptr<ByteRleEncoder> createByteRleEncoder - (std::unique_ptr<BufferedOutputStream> output); + std::unique_ptr<ByteRleEncoder> createByteRleEncoder( + std::unique_ptr<BufferedOutputStream> output); /** * Create a boolean RLE encoder. * @param output the output stream to write to */ - std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder - (std::unique_ptr<BufferedOutputStream> output); + std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder( + std::unique_ptr<BufferedOutputStream> output); /** * Create a byte RLE decoder. * @param input the input stream to read from + * @param metrics the metrics of the decoder */ - std::unique_ptr<ByteRleDecoder> createByteRleDecoder - (std::unique_ptr<SeekableInputStream> input); + std::unique_ptr<ByteRleDecoder> createByteRleDecoder(std::unique_ptr<SeekableInputStream> input, + ReaderMetrics* metrics); /** * Create a boolean RLE decoder. @@ -114,9 +114,10 @@ namespace orc { * if the value is masked by notNull. This is required for the notNull stream * processing to properly apply multiple masks from nested types. 
* @param input the input stream to read from + * @param metrics the metrics of the decoder */ - std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder - (std::unique_ptr<SeekableInputStream> input); -} + std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder( + std::unique_ptr<SeekableInputStream> input, ReaderMetrics* metrics); +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc index ab6b690c57..5297f80371 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc +++ b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc @@ -21,167 +21,174 @@ #include "Adaptor.hh" +#include <time.h> #include <limits> #include <sstream> #include <stdexcept> -#include <time.h> #include <typeinfo> #ifdef __clang__ - #pragma clang diagnostic ignored "-Wformat-security" +#pragma clang diagnostic ignored "-Wformat-security" #endif namespace orc { - class VoidColumnPrinter: public ColumnPrinter { - public: + class VoidColumnPrinter : public ColumnPrinter { + public: VoidColumnPrinter(std::string&); ~VoidColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; }; - class BooleanColumnPrinter: public ColumnPrinter { - private: + class BooleanColumnPrinter : public ColumnPrinter { + private: const int64_t* data; - public: + + public: BooleanColumnPrinter(std::string&); ~BooleanColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; }; - class LongColumnPrinter: public ColumnPrinter { - private: + class LongColumnPrinter : public ColumnPrinter { + private: const int64_t* data; - public: + + public: LongColumnPrinter(std::string&); ~LongColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; }; - class DoubleColumnPrinter: public ColumnPrinter { - private: + class DoubleColumnPrinter : public ColumnPrinter { + 
private: const double* data; const bool isFloat; - public: + public: DoubleColumnPrinter(std::string&, const Type& type); virtual ~DoubleColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; }; - class TimestampColumnPrinter: public ColumnPrinter { - private: + class TimestampColumnPrinter : public ColumnPrinter { + private: const int64_t* seconds; const int64_t* nanoseconds; - public: + public: TimestampColumnPrinter(std::string&); ~TimestampColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; }; - class DateColumnPrinter: public ColumnPrinter { - private: + class DateColumnPrinter : public ColumnPrinter { + private: const int64_t* data; - public: + public: DateColumnPrinter(std::string&); ~DateColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; }; - class Decimal64ColumnPrinter: public ColumnPrinter { - private: + class Decimal64ColumnPrinter : public ColumnPrinter { + private: const int64_t* data; int32_t scale; - public: + + public: Decimal64ColumnPrinter(std::string&); ~Decimal64ColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; }; - class Decimal128ColumnPrinter: public ColumnPrinter { - private: + class Decimal128ColumnPrinter : public ColumnPrinter { + private: const Int128* data; int32_t scale; - public: + + public: Decimal128ColumnPrinter(std::string&); ~Decimal128ColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; }; - class StringColumnPrinter: public ColumnPrinter { - private: - const char* const * start; + class StringColumnPrinter : public ColumnPrinter { + private: + const char* const* start; const int64_t* length; - public: + + public: StringColumnPrinter(std::string&); virtual ~StringColumnPrinter() override {} 
void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; }; - class BinaryColumnPrinter: public ColumnPrinter { - private: - const char* const * start; + class BinaryColumnPrinter : public ColumnPrinter { + private: + const char* const* start; const int64_t* length; - public: + + public: BinaryColumnPrinter(std::string&); virtual ~BinaryColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; }; - class ListColumnPrinter: public ColumnPrinter { - private: + class ListColumnPrinter : public ColumnPrinter { + private: const int64_t* offsets; std::unique_ptr<ColumnPrinter> elementPrinter; - public: + public: ListColumnPrinter(std::string&, const Type& type); virtual ~ListColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; }; - class MapColumnPrinter: public ColumnPrinter { - private: + class MapColumnPrinter : public ColumnPrinter { + private: const int64_t* offsets; std::unique_ptr<ColumnPrinter> keyPrinter; std::unique_ptr<ColumnPrinter> elementPrinter; - public: + public: MapColumnPrinter(std::string&, const Type& type); virtual ~MapColumnPrinter() override {} void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; }; - class UnionColumnPrinter: public ColumnPrinter { - private: - const unsigned char *tags; + class UnionColumnPrinter : public ColumnPrinter { + private: + const unsigned char* tags; const uint64_t* offsets; std::vector<std::unique_ptr<ColumnPrinter>> fieldPrinter; - public: + public: UnionColumnPrinter(std::string&, const Type& type); void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; }; - class StructColumnPrinter: public ColumnPrinter { - private: + class StructColumnPrinter : public ColumnPrinter { + private: std::vector<std::unique_ptr<ColumnPrinter>> fieldPrinter; std::vector<std::string> fieldNames; - 
public: + + public: StructColumnPrinter(std::string&, const Type& type); void printRow(uint64_t rowId) override; void reset(const ColumnVectorBatch& batch) override; @@ -191,13 +198,12 @@ namespace orc { file += ch; } - void writeString(std::string& file, const char *ptr) { + void writeString(std::string& file, const char* ptr) { size_t len = strlen(ptr); file.append(ptr, len); } - ColumnPrinter::ColumnPrinter(std::string& _buffer - ): buffer(_buffer) { + ColumnPrinter::ColumnPrinter(std::string& _buffer) : buffer(_buffer) { notNull = nullptr; hasNulls = false; } @@ -211,89 +217,87 @@ namespace orc { if (hasNulls) { notNull = batch.notNull.data(); } else { - notNull = nullptr ; + notNull = nullptr; } } - std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer, - const Type* type) { - ColumnPrinter *result = nullptr; + std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer, const Type* type) { + std::unique_ptr<ColumnPrinter> result; if (type == nullptr) { - result = new VoidColumnPrinter(buffer); + result = std::make_unique<VoidColumnPrinter>(buffer); } else { - switch(static_cast<int64_t>(type->getKind())) { - case BOOLEAN: - result = new BooleanColumnPrinter(buffer); - break; - - case BYTE: - case SHORT: - case INT: - case LONG: - result = new LongColumnPrinter(buffer); - break; - - case FLOAT: - case DOUBLE: - result = new DoubleColumnPrinter(buffer, *type); - break; - - case STRING: - case VARCHAR : - case CHAR: - result = new StringColumnPrinter(buffer); - break; - - case BINARY: - result = new BinaryColumnPrinter(buffer); - break; - - case TIMESTAMP: - case TIMESTAMP_INSTANT: - result = new TimestampColumnPrinter(buffer); - break; - - case LIST: - result = new ListColumnPrinter(buffer, *type); - break; - - case MAP: - result = new MapColumnPrinter(buffer, *type); - break; - - case STRUCT: - result = new StructColumnPrinter(buffer, *type); - break; - - case DECIMAL: - if (type->getPrecision() == 0 || type->getPrecision() > 18) { 
- result = new Decimal128ColumnPrinter(buffer); - } else { - result = new Decimal64ColumnPrinter(buffer); - } - break; + switch (static_cast<int64_t>(type->getKind())) { + case BOOLEAN: + result = std::make_unique<BooleanColumnPrinter>(buffer); + break; + + case BYTE: + case SHORT: + case INT: + case LONG: + result = std::make_unique<LongColumnPrinter>(buffer); + break; + + case FLOAT: + case DOUBLE: + result = std::make_unique<DoubleColumnPrinter>(buffer, *type); + break; - case DATE: - result = new DateColumnPrinter(buffer); - break; + case STRING: + case VARCHAR: + case CHAR: + result = std::make_unique<StringColumnPrinter>(buffer); + break; - case UNION: - result = new UnionColumnPrinter(buffer, *type); - break; + case BINARY: + result = std::make_unique<BinaryColumnPrinter>(buffer); + break; + + case TIMESTAMP: + case TIMESTAMP_INSTANT: + result = std::make_unique<TimestampColumnPrinter>(buffer); + break; + + case LIST: + result = std::make_unique<ListColumnPrinter>(buffer, *type); + break; + + case MAP: + result = std::make_unique<MapColumnPrinter>(buffer, *type); + break; + + case STRUCT: + result = std::make_unique<StructColumnPrinter>(buffer, *type); + break; + + case DECIMAL: + if (type->getPrecision() == 0 || type->getPrecision() > 18) { + result = std::make_unique<Decimal128ColumnPrinter>(buffer); + } else { + result = std::make_unique<Decimal64ColumnPrinter>(buffer); + } + break; + + case DATE: + result = std::make_unique<DateColumnPrinter>(buffer); + break; + + case UNION: + result = std::make_unique<UnionColumnPrinter>(buffer, *type); + break; - default: - throw std::logic_error("unknown batch type"); + default: + throw std::logic_error("unknown batch type"); } } - return std::unique_ptr<ColumnPrinter>(result); + return result; } - VoidColumnPrinter::VoidColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer) { + VoidColumnPrinter::VoidColumnPrinter(std::string& _buffer) : ColumnPrinter(_buffer) { // PASS } - void 
VoidColumnPrinter::reset(const ColumnVectorBatch&) { + void VoidColumnPrinter::reset(const ColumnVectorBatch&) { // PASS } @@ -301,13 +305,12 @@ namespace orc { writeString(buffer, "null"); } - LongColumnPrinter::LongColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - data(nullptr) { + LongColumnPrinter::LongColumnPrinter(std::string& _buffer) + : ColumnPrinter(_buffer), data(nullptr) { // PASS } - void LongColumnPrinter::reset(const ColumnVectorBatch& batch) { + void LongColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); } @@ -316,22 +319,17 @@ namespace orc { if (hasNulls && !notNull[rowId]) { writeString(buffer, "null"); } else { - char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d", - static_cast<int64_t >(data[rowId])); - writeString(buffer, numBuffer); + const auto numBuffer = std::to_string(static_cast<int64_t>(data[rowId])); + writeString(buffer, numBuffer.c_str()); } } - DoubleColumnPrinter::DoubleColumnPrinter(std::string& _buffer, - const Type& type - ): ColumnPrinter(_buffer), - data(nullptr), - isFloat(type.getKind() == FLOAT){ + DoubleColumnPrinter::DoubleColumnPrinter(std::string& _buffer, const Type& type) + : ColumnPrinter(_buffer), data(nullptr), isFloat(type.getKind() == FLOAT) { // PASS } - void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) { + void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); data = dynamic_cast<const DoubleVectorBatch&>(batch).data.data(); } @@ -341,20 +339,17 @@ namespace orc { writeString(buffer, "null"); } else { char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), isFloat ? "%.7g" : "%.14g", - data[rowId]); + snprintf(numBuffer, sizeof(numBuffer), isFloat ? 
"%.7g" : "%.14g", data[rowId]); writeString(buffer, numBuffer); } } - Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - data(nullptr), - scale(0) { + Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& _buffer) + : ColumnPrinter(_buffer), data(nullptr), scale(0) { // PASS } - void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) { + void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); data = dynamic_cast<const Decimal64VectorBatch&>(batch).values.data(); scale = dynamic_cast<const Decimal64VectorBatch&>(batch).scale; @@ -376,13 +371,12 @@ namespace orc { int32_t len = static_cast<int32_t>(str.length()); if (len > scale) { return sign + str.substr(0, static_cast<size_t>(len - scale)) + "." + - str.substr(static_cast<size_t>(len - scale), - static_cast<size_t>(scale)); + str.substr(static_cast<size_t>(len - scale), static_cast<size_t>(scale)); } else if (len == scale) { return sign + "0." 
+ str; } else { std::string result = sign + "0."; - for(int32_t i=0; i < scale - len; ++i) { + for (int32_t i = 0; i < scale - len; ++i) { result += "0"; } return result + str; @@ -397,31 +391,27 @@ namespace orc { } } - Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - data(nullptr), - scale(0) { - // PASS - } - - void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - data = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data(); - scale = dynamic_cast<const Decimal128VectorBatch&>(batch).scale; - } - - void Decimal128ColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeString(buffer, data[rowId].toDecimalString(scale).c_str()); - } - } - - StringColumnPrinter::StringColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - start(nullptr), - length(nullptr) { + Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& _buffer) + : ColumnPrinter(_buffer), data(nullptr), scale(0) { + // PASS + } + + void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + data = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data(); + scale = dynamic_cast<const Decimal128VectorBatch&>(batch).scale; + } + + void Decimal128ColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeString(buffer, data[rowId].toDecimalString(scale).c_str()); + } + } + + StringColumnPrinter::StringColumnPrinter(std::string& _buffer) + : ColumnPrinter(_buffer), start(nullptr), length(nullptr) { // PASS } @@ -436,51 +426,48 @@ namespace orc { writeString(buffer, "null"); } else { writeChar(buffer, '"'); - for(int64_t i=0; i < length[rowId]; ++i) { + for (int64_t i = 0; i < length[rowId]; ++i) { char ch = static_cast<char>(start[rowId][i]); switch (ch) { - case '\\': - writeString(buffer, 
"\\\\"); - break; - case '\b': - writeString(buffer, "\\b"); - break; - case '\f': - writeString(buffer, "\\f"); - break; - case '\n': - writeString(buffer, "\\n"); - break; - case '\r': - writeString(buffer, "\\r"); - break; - case '\t': - writeString(buffer, "\\t"); - break; - case '"': - writeString(buffer, "\\\""); - break; - default: - writeChar(buffer, ch); - break; + case '\\': + writeString(buffer, "\\\\"); + break; + case '\b': + writeString(buffer, "\\b"); + break; + case '\f': + writeString(buffer, "\\f"); + break; + case '\n': + writeString(buffer, "\\n"); + break; + case '\r': + writeString(buffer, "\\r"); + break; + case '\t': + writeString(buffer, "\\t"); + break; + case '"': + writeString(buffer, "\\\""); + break; + default: + writeChar(buffer, ch); + break; } } writeChar(buffer, '"'); } } - ListColumnPrinter::ListColumnPrinter(std::string& _buffer, - const Type& type - ): ColumnPrinter(_buffer), - offsets(nullptr) { + ListColumnPrinter::ListColumnPrinter(std::string& _buffer, const Type& type) + : ColumnPrinter(_buffer), offsets(nullptr) { elementPrinter = createColumnPrinter(buffer, type.getSubtype(0)); } - void ListColumnPrinter::reset(const ColumnVectorBatch& batch) { + void ListColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); offsets = dynamic_cast<const ListVectorBatch&>(batch).offsets.data(); - elementPrinter->reset(*dynamic_cast<const ListVectorBatch&>(batch). 
- elements); + elementPrinter->reset(*dynamic_cast<const ListVectorBatch&>(batch).elements); } void ListColumnPrinter::printRow(uint64_t rowId) { @@ -488,7 +475,7 @@ namespace orc { writeString(buffer, "null"); } else { writeChar(buffer, '['); - for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) { + for (int64_t i = offsets[rowId]; i < offsets[rowId + 1]; ++i) { if (i != offsets[rowId]) { writeString(buffer, ", "); } @@ -498,15 +485,13 @@ namespace orc { } } - MapColumnPrinter::MapColumnPrinter(std::string& _buffer, - const Type& type - ): ColumnPrinter(_buffer), - offsets(nullptr) { + MapColumnPrinter::MapColumnPrinter(std::string& _buffer, const Type& type) + : ColumnPrinter(_buffer), offsets(nullptr) { keyPrinter = createColumnPrinter(buffer, type.getSubtype(0)); elementPrinter = createColumnPrinter(buffer, type.getSubtype(1)); } - void MapColumnPrinter::reset(const ColumnVectorBatch& batch) { + void MapColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); const MapVectorBatch& myBatch = dynamic_cast<const MapVectorBatch&>(batch); offsets = myBatch.offsets.data(); @@ -519,7 +504,7 @@ namespace orc { writeString(buffer, "null"); } else { writeChar(buffer, '['); - for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) { + for (int64_t i = offsets[rowId]; i < offsets[rowId + 1]; ++i) { if (i != offsets[rowId]) { writeString(buffer, ", "); } @@ -533,23 +518,19 @@ namespace orc { } } - UnionColumnPrinter::UnionColumnPrinter(std::string& _buffer, - const Type& type - ): ColumnPrinter(_buffer), - tags(nullptr), - offsets(nullptr) { - for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { + UnionColumnPrinter::UnionColumnPrinter(std::string& _buffer, const Type& type) + : ColumnPrinter(_buffer), tags(nullptr), offsets(nullptr) { + for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) { fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i))); } } void UnionColumnPrinter::reset(const ColumnVectorBatch& batch) { 
ColumnPrinter::reset(batch); - const UnionVectorBatch& unionBatch = - dynamic_cast<const UnionVectorBatch&>(batch); + const UnionVectorBatch& unionBatch = dynamic_cast<const UnionVectorBatch&>(batch); tags = unionBatch.tags.data(); offsets = unionBatch.offsets.data(); - for(size_t i=0; i < fieldPrinter.size(); ++i) { + for (size_t i = 0; i < fieldPrinter.size(); ++i) { fieldPrinter[i]->reset(*(unionBatch.children[i])); } } @@ -559,20 +540,17 @@ namespace orc { writeString(buffer, "null"); } else { writeString(buffer, "{\"tag\": "); - char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d", - static_cast<int64_t>(tags[rowId])); - writeString(buffer, numBuffer); + const auto numBuffer = std::to_string(static_cast<int64_t>(tags[rowId])); + writeString(buffer, numBuffer.c_str()); writeString(buffer, ", \"value\": "); fieldPrinter[tags[rowId]]->printRow(offsets[rowId]); writeChar(buffer, '}'); } } - StructColumnPrinter::StructColumnPrinter(std::string& _buffer, - const Type& type - ): ColumnPrinter(_buffer) { - for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { + StructColumnPrinter::StructColumnPrinter(std::string& _buffer, const Type& type) + : ColumnPrinter(_buffer) { + for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) { fieldNames.push_back(type.getFieldName(i)); fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i))); } @@ -580,9 +558,8 @@ namespace orc { void StructColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); - const StructVectorBatch& structBatch = - dynamic_cast<const StructVectorBatch&>(batch); - for(size_t i=0; i < fieldPrinter.size(); ++i) { + const StructVectorBatch& structBatch = dynamic_cast<const StructVectorBatch&>(batch); + for (size_t i = 0; i < fieldPrinter.size(); ++i) { fieldPrinter[i]->reset(*(structBatch.fields[i])); } } @@ -592,7 +569,7 @@ namespace orc { writeString(buffer, "null"); } else { writeChar(buffer, '{'); - for(unsigned int i=0; i < 
fieldPrinter.size(); ++i) { + for (unsigned int i = 0; i < fieldPrinter.size(); ++i) { if (i != 0) { writeString(buffer, ", "); } @@ -605,9 +582,8 @@ namespace orc { } } - DateColumnPrinter::DateColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - data(nullptr) { + DateColumnPrinter::DateColumnPrinter(std::string& _buffer) + : ColumnPrinter(_buffer), data(nullptr) { // PASS } @@ -631,9 +607,8 @@ namespace orc { data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); } - BooleanColumnPrinter::BooleanColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - data(nullptr) { + BooleanColumnPrinter::BooleanColumnPrinter(std::string& _buffer) + : ColumnPrinter(_buffer), data(nullptr) { // PASS } @@ -650,10 +625,8 @@ namespace orc { data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); } - BinaryColumnPrinter::BinaryColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - start(nullptr), - length(nullptr) { + BinaryColumnPrinter::BinaryColumnPrinter(std::string& _buffer) + : ColumnPrinter(_buffer), start(nullptr), length(nullptr) { // PASS } @@ -662,14 +635,12 @@ namespace orc { writeString(buffer, "null"); } else { writeChar(buffer, '['); - for(int64_t i=0; i < length[rowId]; ++i) { + for (int64_t i = 0; i < length[rowId]; ++i) { if (i != 0) { writeString(buffer, ", "); } - char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), "%d", - (static_cast<const int>(start[rowId][i]) & 0xff)); - writeString(buffer, numBuffer); + const auto numBuffer = std::to_string(static_cast<int>(start[rowId][i]) & 0xff); + writeString(buffer, numBuffer.c_str()); } writeChar(buffer, ']'); } @@ -681,10 +652,8 @@ namespace orc { length = dynamic_cast<const StringVectorBatch&>(batch).length.data(); } - TimestampColumnPrinter::TimestampColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - seconds(nullptr), - nanoseconds(nullptr) { + TimestampColumnPrinter::TimestampColumnPrinter(std::string& _buffer) + : ColumnPrinter(_buffer), 
seconds(nullptr), nanoseconds(nullptr) { // PASS } @@ -712,20 +681,20 @@ namespace orc { zeroDigits += 1; } } - char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), - "%0*" INT64_FORMAT_STRING "d\"", - static_cast<int>(NANO_DIGITS - zeroDigits), - static_cast<int64_t >(nanos)); - writeString(buffer, numBuffer); + const auto numBuffer = std::to_string(static_cast<int64_t>(nanos)); + const int64_t padDigits = NANO_DIGITS - zeroDigits - static_cast<int64_t>(numBuffer.size()); + for (int i = 0; i < padDigits; ++i) { + writeChar(buffer, '0'); + } + writeString(buffer, numBuffer.c_str()); + writeChar(buffer, '"'); } } void TimestampColumnPrinter::reset(const ColumnVectorBatch& batch) { ColumnPrinter::reset(batch); - const TimestampVectorBatch& ts = - dynamic_cast<const TimestampVectorBatch&>(batch); + const TimestampVectorBatch& ts = dynamic_cast<const TimestampVectorBatch&>(batch); seconds = ts.data.data(); nanoseconds = ts.nanoseconds.data(); } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/ColumnReader.cc b/contrib/libs/apache/orc/c++/src/ColumnReader.cc index 873b54c618..a6bbdabedc 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnReader.cc +++ b/contrib/libs/apache/orc/c++/src/ColumnReader.cc @@ -21,8 +21,10 @@ #include "Adaptor.hh" #include "ByteRLE.hh" #include "ColumnReader.hh" -#include "orc/Exceptions.hh" +#include "ConvertColumnReader.hh" #include "RLE.hh" +#include "SchemaEvolution.hh" +#include "orc/Exceptions.hh" #include <math.h> #include <iostream> @@ -35,25 +37,25 @@ namespace orc { inline RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) { switch (static_cast<int64_t>(kind)) { - case proto::ColumnEncoding_Kind_DIRECT: - case proto::ColumnEncoding_Kind_DICTIONARY: - return RleVersion_1; - case proto::ColumnEncoding_Kind_DIRECT_V2: - case proto::ColumnEncoding_Kind_DICTIONARY_V2: - return RleVersion_2; - default: - throw ParseError("Unknown encoding in convertRleVersion"); + case proto::ColumnEncoding_Kind_DIRECT: 
+ case proto::ColumnEncoding_Kind_DICTIONARY: + return RleVersion_1; + case proto::ColumnEncoding_Kind_DIRECT_V2: + case proto::ColumnEncoding_Kind_DICTIONARY_V2: + return RleVersion_2; + default: + throw ParseError("Unknown encoding in convertRleVersion"); } } - ColumnReader::ColumnReader(const Type& type, - StripeStreams& stripe - ): columnId(type.getColumnId()), - memoryPool(stripe.getMemoryPool()) { + ColumnReader::ColumnReader(const Type& type, StripeStreams& stripe) + : columnId(type.getColumnId()), + memoryPool(stripe.getMemoryPool()), + metrics(stripe.getReaderMetrics()) { std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_PRESENT, true); + stripe.getStream(columnId, proto::Stream_Kind_PRESENT, true); if (stream.get()) { - notNullDecoder = createBooleanRleDecoder(std::move(stream)); + notNullDecoder = createBooleanRleDecoder(std::move(stream), metrics); } } @@ -67,17 +69,14 @@ namespace orc { // page through the values that we want to skip // and count how many are non-null const size_t MAX_BUFFER_SIZE = 32768; - size_t bufferSize = std::min(MAX_BUFFER_SIZE, - static_cast<size_t>(numValues)); + size_t bufferSize = std::min(MAX_BUFFER_SIZE, static_cast<size_t>(numValues)); char buffer[MAX_BUFFER_SIZE]; uint64_t remaining = numValues; while (remaining > 0) { - uint64_t chunkSize = - std::min(remaining, - static_cast<uint64_t>(bufferSize)); + uint64_t chunkSize = std::min(remaining, static_cast<uint64_t>(bufferSize)); decoder->next(buffer, chunkSize, nullptr); remaining -= chunkSize; - for(uint64_t i=0; i < chunkSize; ++i) { + for (uint64_t i = 0; i < chunkSize; ++i) { if (!buffer[i]) { numValues -= 1; } @@ -87,9 +86,7 @@ namespace orc { return numValues; } - void ColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* incomingMask) { + void ColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* incomingMask) { if (numValues > rowBatch.capacity) { rowBatch.resize(numValues); } 
@@ -99,7 +96,7 @@ namespace orc { char* notNullArray = rowBatch.notNull.data(); decoder->next(notNullArray, numValues, incomingMask); // check to see if there are nulls in this batch - for(uint64_t i=0; i < numValues; ++i) { + for (uint64_t i = 0; i < numValues; ++i) { if (!notNullArray[i]) { rowBatch.hasNulls = true; return; @@ -114,240 +111,195 @@ namespace orc { rowBatch.hasNulls = false; } - void ColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { + void ColumnReader::seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) { if (notNullDecoder.get()) { notNullDecoder->seek(positions.at(columnId)); } } /** - * Expand an array of bytes in place to the corresponding array of longs. + * Expand an array of bytes in place to the corresponding array of integer. * Has to work backwards so that they data isn't clobbered during the * expansion. * @param buffer the array of chars and array of longs that need to be * expanded * @param numValues the number of bytes to convert to longs */ - void expandBytesToLongs(int64_t* buffer, uint64_t numValues) { - for(size_t i=numValues - 1; i < numValues; --i) { - buffer[i] = reinterpret_cast<char *>(buffer)[i]; + template <typename T> + void expandBytesToIntegers(T* buffer, uint64_t numValues) { + if (sizeof(T) == sizeof(int8_t)) { + return; + } + for (uint64_t i = 0UL; i < numValues; ++i) { + buffer[numValues - 1 - i] = reinterpret_cast<int8_t*>(buffer)[numValues - 1 - i]; } } - class BooleanColumnReader: public ColumnReader { - private: + template <typename BatchType> + class BooleanColumnReader : public ColumnReader { + private: std::unique_ptr<orc::ByteRleDecoder> rle; - public: + public: BooleanColumnReader(const Type& type, StripeStreams& stipe); ~BooleanColumnReader() override; uint64_t skip(uint64_t numValues) override; - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; + void next(ColumnVectorBatch& rowBatch, uint64_t 
numValues, char* notNull) override; - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; + void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override; }; - BooleanColumnReader::BooleanColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe){ + template <typename BatchType> + BooleanColumnReader<BatchType>::BooleanColumnReader(const Type& type, StripeStreams& stripe) + : ColumnReader(type, stripe) { std::unique_ptr<SeekableInputStream> stream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (stream == nullptr) - throw ParseError("DATA stream not found in Boolean column"); - rle = createBooleanRleDecoder(std::move(stream)); + if (stream == nullptr) throw ParseError("DATA stream not found in Boolean column"); + rle = createBooleanRleDecoder(std::move(stream), metrics); } - BooleanColumnReader::~BooleanColumnReader() { + template <typename BatchType> + BooleanColumnReader<BatchType>::~BooleanColumnReader() { // PASS } - uint64_t BooleanColumnReader::skip(uint64_t numValues) { + template <typename BatchType> + uint64_t BooleanColumnReader<BatchType>::skip(uint64_t numValues) { numValues = ColumnReader::skip(numValues); rle->skip(numValues); return numValues; } - void BooleanColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + template <typename BatchType> + void BooleanColumnReader<BatchType>::next(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { ColumnReader::next(rowBatch, numValues, notNull); - // Since the byte rle places the output in a char* instead of long*, - // we cheat here and use the long* and then expand it in a second pass. - int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data(); - rle->next(reinterpret_cast<char*>(ptr), - numValues, rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr); - expandBytesToLongs(ptr, numValues); + // Since the byte rle places the output in a char* and BatchType here may be + // LongVectorBatch with long*. We cheat here in that case and use the long* + // and then expand it in a second pass.. + auto* ptr = dynamic_cast<BatchType&>(rowBatch).data.data(); + rle->next(reinterpret_cast<char*>(ptr), numValues, + rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr); + expandBytesToIntegers(ptr, numValues); } - void BooleanColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { + template <typename BatchType> + void BooleanColumnReader<BatchType>::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); rle->seek(positions.at(columnId)); } - class ByteColumnReader: public ColumnReader { - private: + template <typename BatchType> + class ByteColumnReader : public ColumnReader { + private: std::unique_ptr<orc::ByteRleDecoder> rle; - public: - ByteColumnReader(const Type& type, StripeStreams& stipe); - ~ByteColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - - ByteColumnReader::ByteColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe){ - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (stream == nullptr) - throw ParseError("DATA stream not found in Byte column"); - rle = createByteRleDecoder(std::move(stream)); - } + public: + ByteColumnReader(const Type& type, StripeStreams& stripe) : ColumnReader(type, stripe) { + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (stream == nullptr) throw ParseError("DATA stream not found in 
Byte column"); + rle = createByteRleDecoder(std::move(stream), metrics); + } - ByteColumnReader::~ByteColumnReader() { - // PASS - } + ~ByteColumnReader() override = default; - uint64_t ByteColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - rle->skip(numValues); - return numValues; - } + uint64_t skip(uint64_t numValues) override { + numValues = ColumnReader::skip(numValues); + rle->skip(numValues); + return numValues; + } - void ByteColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - // Since the byte rle places the output in a char* instead of long*, - // we cheat here and use the long* and then expand it in a second pass. - int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data(); - rle->next(reinterpret_cast<char*>(ptr), - numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr); - expandBytesToLongs(ptr, numValues); - } + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ColumnReader::next(rowBatch, numValues, notNull); + // Since the byte rle places the output in a char* instead of long*, + // we cheat here and use the long* and then expand it in a second pass. + auto* ptr = dynamic_cast<BatchType&>(rowBatch).data.data(); + rle->next(reinterpret_cast<char*>(ptr), numValues, + rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr); + expandBytesToIntegers(ptr, numValues); + } - void ByteColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - } + void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + } + }; - class IntegerColumnReader: public ColumnReader { - protected: + template <typename BatchType> + class IntegerColumnReader : public ColumnReader { + protected: std::unique_ptr<orc::RleDecoder> rle; - public: - IntegerColumnReader(const Type& type, StripeStreams& stripe); - ~IntegerColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - - IntegerColumnReader::IntegerColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (stream == nullptr) - throw ParseError("DATA stream not found in Integer column"); - rle = createRleDecoder(std::move(stream), true, vers, memoryPool); - } + public: + IntegerColumnReader(const Type& type, StripeStreams& stripe) : ColumnReader(type, stripe) { + RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (stream == nullptr) throw ParseError("DATA stream not found in Integer column"); + rle = createRleDecoder(std::move(stream), true, vers, memoryPool, metrics); + } - IntegerColumnReader::~IntegerColumnReader() { - // PASS - } + 
~IntegerColumnReader() override { + // PASS + } - uint64_t IntegerColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - rle->skip(numValues); - return numValues; - } + uint64_t skip(uint64_t numValues) override { + numValues = ColumnReader::skip(numValues); + rle->skip(numValues); + return numValues; + } - void IntegerColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - rle->next(dynamic_cast<LongVectorBatch&>(rowBatch).data.data(), - numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr); - } + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ColumnReader::next(rowBatch, numValues, notNull); + rle->next(dynamic_cast<BatchType&>(rowBatch).data.data(), numValues, + rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr); + } - void IntegerColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - } + void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + } + }; - class TimestampColumnReader: public ColumnReader { - private: + class TimestampColumnReader : public ColumnReader { + private: std::unique_ptr<orc::RleDecoder> secondsRle; std::unique_ptr<orc::RleDecoder> nanoRle; - const Timezone& writerTimezone; - const Timezone& readerTimezone; + const Timezone* writerTimezone; + const Timezone* readerTimezone; const int64_t epochOffset; const bool sameTimezone; - public: - TimestampColumnReader(const Type& type, - StripeStreams& stripe, - bool isInstantType); + public: + TimestampColumnReader(const Type& type, StripeStreams& stripe, bool isInstantType); ~TimestampColumnReader() override; uint64_t skip(uint64_t numValues) override; - void next(ColumnVectorBatch& rowBatch, - 
uint64_t numValues, - char* notNull) override; + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; + void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override; }; - - TimestampColumnReader::TimestampColumnReader(const Type& type, - StripeStreams& stripe, - bool isInstantType - ): ColumnReader(type, stripe), - writerTimezone(isInstantType ? - getTimezoneByName("GMT") : - stripe.getWriterTimezone()), - readerTimezone(isInstantType ? - getTimezoneByName("GMT") : - stripe.getReaderTimezone()), - epochOffset(writerTimezone.getEpoch()), - sameTimezone(&writerTimezone == &readerTimezone){ + TimestampColumnReader::TimestampColumnReader(const Type& type, StripeStreams& stripe, + bool isInstantType) + : ColumnReader(type, stripe), + writerTimezone(isInstantType ? &getTimezoneByName("GMT") : &stripe.getWriterTimezone()), + readerTimezone(isInstantType ? 
&getTimezoneByName("GMT") : &stripe.getReaderTimezone()), + epochOffset(writerTimezone->getEpoch()), + sameTimezone(writerTimezone == readerTimezone) { RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); std::unique_ptr<SeekableInputStream> stream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (stream == nullptr) - throw ParseError("DATA stream not found in Timestamp column"); - secondsRle = createRleDecoder(std::move(stream), true, vers, memoryPool); + if (stream == nullptr) throw ParseError("DATA stream not found in Timestamp column"); + secondsRle = createRleDecoder(std::move(stream), true, vers, memoryPool, metrics); stream = stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true); - if (stream == nullptr) - throw ParseError("SECONDARY stream not found in Timestamp column"); - nanoRle = createRleDecoder(std::move(stream), false, vers, memoryPool); + if (stream == nullptr) throw ParseError("SECONDARY stream not found in Timestamp column"); + nanoRle = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics); } TimestampColumnReader::~TimestampColumnReader() { @@ -361,25 +313,22 @@ namespace orc { return numValues; } - void TimestampColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + void TimestampColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) { ColumnReader::next(rowBatch, numValues, notNull); notNull = rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr; - TimestampVectorBatch& timestampBatch = - dynamic_cast<TimestampVectorBatch&>(rowBatch); - int64_t *secsBuffer = timestampBatch.data.data(); + TimestampVectorBatch& timestampBatch = dynamic_cast<TimestampVectorBatch&>(rowBatch); + int64_t* secsBuffer = timestampBatch.data.data(); secondsRle->next(secsBuffer, numValues, notNull); - int64_t *nanoBuffer = timestampBatch.nanoseconds.data(); + int64_t* nanoBuffer = timestampBatch.nanoseconds.data(); nanoRle->next(nanoBuffer, numValues, notNull); // Construct the values - for(uint64_t i=0; i < numValues; i++) { + for (uint64_t i = 0; i < numValues; i++) { if (notNull == nullptr || notNull[i]) { uint64_t zeros = nanoBuffer[i] & 0x7; nanoBuffer[i] >>= 3; if (zeros != 0) { - for(uint64_t j = 0; j <= zeros; ++j) { + for (uint64_t j = 0; j <= zeros; ++j) { nanoBuffer[i] *= 10; } } @@ -387,13 +336,13 @@ namespace orc { if (!sameTimezone) { // adjust timestamp value to same wall clock time if writer and reader // time zones have different rules, which is required for Apache Orc. - const auto& wv = writerTimezone.getVariant(writerTime); - const auto& rv = readerTimezone.getVariant(writerTime); + const auto& wv = writerTimezone->getVariant(writerTime); + const auto& rv = readerTimezone->getVariant(writerTime); if (!wv.hasSameTzRule(rv)) { // If the timezone adjustment moves the millis across a DST boundary, // we need to reevaluate the offsets. 
int64_t adjustedTime = writerTime + wv.gmtOffset - rv.gmtOffset; - const auto& adjustedReader = readerTimezone.getVariant(adjustedTime); + const auto& adjustedReader = readerTimezone->getVariant(adjustedTime); writerTime = writerTime + wv.gmtOffset - adjustedReader.gmtOffset; } } @@ -406,38 +355,34 @@ namespace orc { } void TimestampColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { + std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); secondsRle->seek(positions.at(columnId)); nanoRle->seek(positions.at(columnId)); } - template<TypeKind columnKind, bool isLittleEndian> - class DoubleColumnReader: public ColumnReader { - public: + template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType> + class DoubleColumnReader : public ColumnReader { + public: DoubleColumnReader(const Type& type, StripeStreams& stripe); ~DoubleColumnReader() override {} uint64_t skip(uint64_t numValues) override; - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; + void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override; - private: + private: std::unique_ptr<SeekableInputStream> inputStream; const uint64_t bytesPerValue = (columnKind == FLOAT) ? 
4 : 8; - const char *bufferPointer; - const char *bufferEnd; + const char* bufferPointer; + const char* bufferEnd; unsigned char readByte() { if (bufferPointer == bufferEnd) { int length; - if (!inputStream->Next - (reinterpret_cast<const void**>(&bufferPointer), &length)) { + if (!inputStream->Next(reinterpret_cast<const void**>(&bufferPointer), &length)) { throw ParseError("bad read in DoubleColumnReader::next()"); } bufferEnd = bufferPointer + length; @@ -445,7 +390,8 @@ namespace orc { return static_cast<unsigned char>(*(bufferPointer++)); } - double readDouble() { + template <typename FloatType> + FloatType readDouble() { int64_t bits = 0; if (bufferEnd - bufferPointer >= 8) { if (isLittleEndian) { @@ -466,11 +412,12 @@ namespace orc { bits |= static_cast<int64_t>(readByte()) << (i * 8); } } - double *result = reinterpret_cast<double*>(&bits); + FloatType* result = reinterpret_cast<FloatType*>(&bits); return *result; } - double readFloat() { + template <typename FloatType> + FloatType readFloat() { int32_t bits = 0; if (bufferEnd - bufferPointer >= 4) { if (isLittleEndian) { @@ -487,33 +434,32 @@ namespace orc { bits |= readByte() << (i * 8); } } - float *result = reinterpret_cast<float*>(&bits); - return static_cast<double>(*result); + float* result = reinterpret_cast<float*>(&bits); + if (!result) { + std::cerr << "read float empty." 
<< std::endl; + } + return static_cast<FloatType>(*result); } }; - template<TypeKind columnKind, bool isLittleEndian> - DoubleColumnReader<columnKind, isLittleEndian>::DoubleColumnReader( - const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe), - bufferPointer(nullptr), - bufferEnd(nullptr) { + template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType> + DoubleColumnReader<columnKind, isLittleEndian, ValueType, BatchType>::DoubleColumnReader( + const Type& type, StripeStreams& stripe) + : ColumnReader(type, stripe), bufferPointer(nullptr), bufferEnd(nullptr) { inputStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (inputStream == nullptr) - throw ParseError("DATA stream not found in Double column"); + if (inputStream == nullptr) throw ParseError("DATA stream not found in Double column"); } - template<TypeKind columnKind, bool isLittleEndian> - uint64_t DoubleColumnReader<columnKind, isLittleEndian>::skip(uint64_t numValues) { + template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType> + uint64_t DoubleColumnReader<columnKind, isLittleEndian, ValueType, BatchType>::skip( + uint64_t numValues) { numValues = ColumnReader::skip(numValues); - if (static_cast<size_t>(bufferEnd - bufferPointer) >= - bytesPerValue * numValues) { + if (static_cast<size_t>(bufferEnd - bufferPointer) >= bytesPerValue * numValues) { bufferPointer += bytesPerValue * numValues; } else { - size_t sizeToSkip = bytesPerValue * numValues - - static_cast<size_t>(bufferEnd - bufferPointer); + size_t sizeToSkip = + bytesPerValue * numValues - static_cast<size_t>(bufferEnd - bufferPointer); const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max()); while (sizeToSkip != 0) { size_t step = sizeToSkip > cap ? 
cap : sizeToSkip; @@ -527,33 +473,32 @@ namespace orc { return numValues; } - template<TypeKind columnKind, bool isLittleEndian> - void DoubleColumnReader<columnKind, isLittleEndian>::next( - ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType> + void DoubleColumnReader<columnKind, isLittleEndian, ValueType, BatchType>::next( + ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) { ColumnReader::next(rowBatch, numValues, notNull); // update the notNull from the parent class notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - double* outArray = dynamic_cast<DoubleVectorBatch&>(rowBatch).data.data(); + ValueType* outArray = + reinterpret_cast<ValueType*>(dynamic_cast<BatchType&>(rowBatch).data.data()); - if (columnKind == FLOAT) { + if constexpr (columnKind == FLOAT) { if (notNull) { - for(size_t i=0; i < numValues; ++i) { + for (size_t i = 0; i < numValues; ++i) { if (notNull[i]) { - outArray[i] = readFloat(); + outArray[i] = readFloat<ValueType>(); } } } else { - for(size_t i=0; i < numValues; ++i) { - outArray[i] = readFloat(); + for (size_t i = 0; i < numValues; ++i) { + outArray[i] = readFloat<ValueType>(); } } } else { if (notNull) { - for(size_t i=0; i < numValues; ++i) { + for (size_t i = 0; i < numValues; ++i) { if (notNull[i]) { - outArray[i] = readDouble(); + outArray[i] = readDouble<ValueType>(); } } } else { @@ -561,25 +506,23 @@ namespace orc { // Only viable when the machine is little-endian. 
uint64_t bufferNum = 0; if (isLittleEndian) { - bufferNum = std::min(numValues, - static_cast<size_t>(bufferEnd - bufferPointer) / bytesPerValue); + bufferNum = + std::min(numValues, static_cast<size_t>(bufferEnd - bufferPointer) / bytesPerValue); uint64_t bufferBytes = bufferNum * bytesPerValue; - if (bufferPointer && bufferBytes) { + if (bufferBytes > 0) { memcpy(outArray, bufferPointer, bufferBytes); bufferPointer += bufferBytes; - } else { - bufferNum = 0; } } for (size_t i = bufferNum; i < numValues; ++i) { - outArray[i] = readDouble(); + outArray[i] = readDouble<ValueType>(); } } } } - template<TypeKind columnKind, bool isLittleEndian> - void DoubleColumnReader<columnKind, isLittleEndian>::seekToRowGroup( + template <TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType> + void DoubleColumnReader<columnKind, isLittleEndian, ValueType, BatchType>::seekToRowGroup( std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); inputStream->seek(positions.at(columnId)); @@ -604,54 +547,46 @@ namespace orc { } } - class StringDictionaryColumnReader: public ColumnReader { - private: + class StringDictionaryColumnReader : public ColumnReader { + private: std::shared_ptr<StringDictionary> dictionary; std::unique_ptr<RleDecoder> rle; - public: + public: StringDictionaryColumnReader(const Type& type, StripeStreams& stipe); ~StringDictionaryColumnReader() override; uint64_t skip(uint64_t numValues) override; - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; - void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; + void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; + void 
seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override; }; - StringDictionaryColumnReader::StringDictionaryColumnReader - (const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe), - dictionary(new StringDictionary(stripe.getMemoryPool())) { - RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId) - .kind()); - uint32_t dictSize = stripe.getEncoding(columnId).dictionarysize(); + StringDictionaryColumnReader::StringDictionaryColumnReader(const Type& type, + StripeStreams& stripe) + : ColumnReader(type, stripe), dictionary(new StringDictionary(stripe.getMemoryPool())) { + RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId).kind()); + uint32_t dictSize = stripe.getEncoding(columnId).dictionary_size(); std::unique_ptr<SeekableInputStream> stream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); if (stream == nullptr) { throw ParseError("DATA stream not found in StringDictionaryColumn"); } - rle = createRleDecoder(std::move(stream), false, rleVersion, memoryPool); + rle = createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics); stream = stripe.getStream(columnId, proto::Stream_Kind_LENGTH, false); if (dictSize > 0 && stream == nullptr) { throw ParseError("LENGTH stream not found in StringDictionaryColumn"); } std::unique_ptr<RleDecoder> lengthDecoder = - createRleDecoder(std::move(stream), false, rleVersion, memoryPool); + createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics); dictionary->dictionaryOffset.resize(dictSize + 1); int64_t* lengthArray = dictionary->dictionaryOffset.data(); lengthDecoder->next(lengthArray + 1, dictSize, nullptr); lengthArray[0] = 0; - for(uint32_t i = 1; i < dictSize + 1; ++i) { + for (uint32_t i = 1; i < dictSize + 1; ++i) { if (lengthArray[i] < 0) { throw ParseError("Negative dictionary entry length"); } @@ -660,10 +595,9 @@ namespace orc { int64_t blobSize = lengthArray[dictSize]; 
dictionary->dictionaryBlob.resize(static_cast<uint64_t>(blobSize)); std::unique_ptr<SeekableInputStream> blobStream = - stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false); + stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false); if (blobSize > 0 && blobStream == nullptr) { - throw ParseError( - "DICTIONARY_DATA stream not found in StringDictionaryColumn"); + throw ParseError("DICTIONARY_DATA stream not found in StringDictionaryColumn"); } readFully(dictionary->dictionaryBlob.data(), blobSize, blobStream.get()); } @@ -678,47 +612,43 @@ namespace orc { return numValues; } - void StringDictionaryColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + void StringDictionaryColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { ColumnReader::next(rowBatch, numValues, notNull); // update the notNull from the parent class notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch); - char *blob = dictionary->dictionaryBlob.data(); - int64_t *dictionaryOffsets = dictionary->dictionaryOffset.data(); - char **outputStarts = byteBatch.data.data(); - int64_t *outputLengths = byteBatch.length.data(); + char* blob = dictionary->dictionaryBlob.data(); + int64_t* dictionaryOffsets = dictionary->dictionaryOffset.data(); + char** outputStarts = byteBatch.data.data(); + int64_t* outputLengths = byteBatch.length.data(); rle->next(outputLengths, numValues, notNull); uint64_t dictionaryCount = dictionary->dictionaryOffset.size() - 1; if (notNull) { - for(uint64_t i=0; i < numValues; ++i) { + for (uint64_t i = 0; i < numValues; ++i) { if (notNull[i]) { int64_t entry = outputLengths[i]; - if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount ) { + if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount) { throw ParseError("Entry index out of range in StringDictionaryColumn"); } 
outputStarts[i] = blob + dictionaryOffsets[entry]; - outputLengths[i] = dictionaryOffsets[entry+1] - - dictionaryOffsets[entry]; + outputLengths[i] = dictionaryOffsets[entry + 1] - dictionaryOffsets[entry]; } } } else { - for(uint64_t i=0; i < numValues; ++i) { + for (uint64_t i = 0; i < numValues; ++i) { int64_t entry = outputLengths[i]; if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount) { throw ParseError("Entry index out of range in StringDictionaryColumn"); } outputStarts[i] = blob + dictionaryOffsets[entry]; - outputLengths[i] = dictionaryOffsets[entry+1] - - dictionaryOffsets[entry]; + outputLengths[i] = dictionaryOffsets[entry + 1] - dictionaryOffsets[entry]; } } } - void StringDictionaryColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) { + void StringDictionaryColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { ColumnReader::next(rowBatch, numValues, notNull); notNull = rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr; rowBatch.isEncoded = true; @@ -731,17 +661,16 @@ namespace orc { } void StringDictionaryColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { + std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); rle->seek(positions.at(columnId)); } - - class StringDirectColumnReader: public ColumnReader { - private: + class StringDirectColumnReader : public ColumnReader { + private: std::unique_ptr<RleDecoder> lengthRle; std::unique_ptr<SeekableInputStream> blobStream; - const char *lastBuffer; + const char* lastBuffer; size_t lastBufferLength; /** @@ -751,38 +680,28 @@ namespace orc { * @param numValues the lengths of the arrays * @return the total number of bytes for the non-null values */ - size_t computeSize(const int64_t *lengths, const char *notNull, - uint64_t numValues); + size_t computeSize(const int64_t* lengths, const char* notNull, uint64_t numValues); - public: + public: StringDirectColumnReader(const Type& type, StripeStreams& stipe); ~StringDirectColumnReader() override; uint64_t skip(uint64_t numValues) override; - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; + void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override; }; - StringDirectColumnReader::StringDirectColumnReader - (const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId) - .kind()); + StringDirectColumnReader::StringDirectColumnReader(const Type& type, StripeStreams& stripe) + : ColumnReader(type, stripe) { + RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId).kind()); std::unique_ptr<SeekableInputStream> stream = 
stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); - if (stream == nullptr) - throw ParseError("LENGTH stream not found in StringDirectColumn"); - lengthRle = createRleDecoder( - std::move(stream), false, rleVersion, memoryPool); + if (stream == nullptr) throw ParseError("LENGTH stream not found in StringDirectColumn"); + lengthRle = createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics); blobStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (blobStream == nullptr) - throw ParseError("DATA stream not found in StringDirectColumn"); + if (blobStream == nullptr) throw ParseError("DATA stream not found in StringDirectColumn"); lastBuffer = nullptr; lastBufferLength = 0; } @@ -799,8 +718,7 @@ namespace orc { size_t totalBytes = 0; // read the lengths, so we know haw many bytes to skip while (done < numValues) { - uint64_t step = std::min(BUFFER_SIZE, - static_cast<size_t>(numValues - done)); + uint64_t step = std::min(BUFFER_SIZE, static_cast<size_t>(numValues - done)); lengthRle->next(buffer, step, nullptr); totalBytes += computeSize(buffer, nullptr, step); done += step; @@ -824,33 +742,31 @@ namespace orc { return numValues; } - size_t StringDirectColumnReader::computeSize(const int64_t* lengths, - const char* notNull, + size_t StringDirectColumnReader::computeSize(const int64_t* lengths, const char* notNull, uint64_t numValues) { size_t totalLength = 0; if (notNull) { - for(size_t i=0; i < numValues; ++i) { + for (size_t i = 0; i < numValues; ++i) { if (notNull[i]) { totalLength += static_cast<size_t>(lengths[i]); } } } else { - for(size_t i=0; i < numValues; ++i) { + for (size_t i = 0; i < numValues; ++i) { totalLength += static_cast<size_t>(lengths[i]); } } return totalLength; } - void StringDirectColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + void StringDirectColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { 
ColumnReader::next(rowBatch, numValues, notNull); // update the notNull from the parent class notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch); - char **startPtr = byteBatch.data.data(); - int64_t *lengthPtr = byteBatch.length.data(); + char** startPtr = byteBatch.data.data(); + int64_t* lengthPtr = byteBatch.length.data(); // read the length vector lengthRle->next(lengthPtr, numValues, notNull); @@ -862,7 +778,7 @@ namespace orc { // to get the rest directly out of the stream's buffer. size_t bytesBuffered = 0; byteBatch.blob.resize(totalLength); - char *ptr= byteBatch.blob.data(); + char* ptr = byteBatch.blob.data(); while (bytesBuffered + lastBufferLength < totalLength) { memcpy(ptr + bytesBuffered, lastBuffer, lastBufferLength); bytesBuffered += lastBufferLength; @@ -902,7 +818,7 @@ namespace orc { } void StringDirectColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { + std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); blobStream->seek(positions.at(columnId)); lengthRle->seek(positions.at(columnId)); @@ -911,145 +827,130 @@ namespace orc { lastBufferLength = 0; } - class StructColumnReader: public ColumnReader { - private: + class StructColumnReader : public ColumnReader { + private: std::vector<std::unique_ptr<ColumnReader>> children; - public: - StructColumnReader(const Type& type, StripeStreams& stipe); + public: + StructColumnReader(const Type& type, StripeStreams& stripe, bool useTightNumericVector = false, + bool throwOnSchemaEvolutionOverflow = false); uint64_t skip(uint64_t numValues) override; - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; - void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; + void 
nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; + void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override; - private: - template<bool encoded> - void nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull); + private: + template <bool encoded> + void nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull); }; - StructColumnReader::StructColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { + StructColumnReader::StructColumnReader(const Type& type, StripeStreams& stripe, + bool useTightNumericVector, + bool throwOnSchemaEvolutionOverflow) + : ColumnReader(type, stripe) { // count the number of selected sub-columns const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); switch (static_cast<int64_t>(stripe.getEncoding(columnId).kind())) { - case proto::ColumnEncoding_Kind_DIRECT: - for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { - const Type& child = *type.getSubtype(i); - if (selectedColumns[static_cast<uint64_t>(child.getColumnId())]) { - children.push_back(buildReader(child, stripe)); + case proto::ColumnEncoding_Kind_DIRECT: + for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) { + const Type& child = *type.getSubtype(i); + if (selectedColumns[static_cast<uint64_t>(child.getColumnId())]) { + children.push_back( + buildReader(child, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow)); + } } - } - break; - case proto::ColumnEncoding_Kind_DIRECT_V2: - case proto::ColumnEncoding_Kind_DICTIONARY: - case proto::ColumnEncoding_Kind_DICTIONARY_V2: - default: - throw ParseError("Unknown encoding for StructColumnReader"); + break; + case proto::ColumnEncoding_Kind_DIRECT_V2: + case proto::ColumnEncoding_Kind_DICTIONARY: + case proto::ColumnEncoding_Kind_DICTIONARY_V2: + default: + throw 
ParseError("Unknown encoding for StructColumnReader"); } } uint64_t StructColumnReader::skip(uint64_t numValues) { numValues = ColumnReader::skip(numValues); - for(auto& ptr : children) { + for (auto& ptr : children) { ptr->skip(numValues); } return numValues; } - void StructColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + void StructColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) { nextInternal<false>(rowBatch, numValues, notNull); } - void StructColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + void StructColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { nextInternal<true>(rowBatch, numValues, notNull); } - template<bool encoded> - void StructColumnReader::nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + template <bool encoded> + void StructColumnReader::nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { ColumnReader::next(rowBatch, numValues, notNull); - uint64_t i=0; - notNull = rowBatch.hasNulls? rowBatch.notNull.data() : nullptr; - for(auto iter = children.begin(); iter != children.end(); ++iter, ++i) { + uint64_t i = 0; + notNull = rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr; + for (auto iter = children.begin(); iter != children.end(); ++iter, ++i) { if (encoded) { - (*iter)->nextEncoded(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), - numValues, notNull); + (*iter)->nextEncoded(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), numValues, + notNull); } else { - (*iter)->next(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), - numValues, notNull); + (*iter)->next(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), numValues, notNull); } } } void StructColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { + std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); - for(auto& ptr : children) { + for (auto& ptr : children) { ptr->seekToRowGroup(positions); } } - class ListColumnReader: public ColumnReader { - private: + class ListColumnReader : public ColumnReader { + private: std::unique_ptr<ColumnReader> child; std::unique_ptr<RleDecoder> rle; - public: - ListColumnReader(const Type& type, StripeStreams& stipe); + public: + ListColumnReader(const Type& type, StripeStreams& stipe, bool useTightNumericVector = false, + bool throwOnSchemaEvolutionOverflow = false); ~ListColumnReader() override; uint64_t skip(uint64_t numValues) override; - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; - void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; + void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; + void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override; - private: - template<bool encoded> - void nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char 
*notNull); + private: + template <bool encoded> + void nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull); }; - ListColumnReader::ListColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { + ListColumnReader::ListColumnReader(const Type& type, StripeStreams& stripe, + bool useTightNumericVector, + bool throwOnSchemaEvolutionOverflow) + : ColumnReader(type, stripe) { // count the number of selected sub-columns const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); std::unique_ptr<SeekableInputStream> stream = stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); - if (stream == nullptr) - throw ParseError("LENGTH stream not found in List column"); - rle = createRleDecoder(std::move(stream), false, vers, memoryPool); + if (stream == nullptr) throw ParseError("LENGTH stream not found in List column"); + rle = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics); const Type& childType = *type.getSubtype(0); if (selectedColumns[static_cast<uint64_t>(childType.getColumnId())]) { - child = buildReader(childType, stripe); + child = buildReader(childType, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow); } } @@ -1059,7 +960,7 @@ namespace orc { uint64_t ListColumnReader::skip(uint64_t numValues) { numValues = ColumnReader::skip(numValues); - ColumnReader *childReader = child.get(); + ColumnReader* childReader = child.get(); if (childReader) { const uint64_t BUFFER_SIZE = 1024; int64_t buffer[BUFFER_SIZE]; @@ -1068,7 +969,7 @@ namespace orc { while (lengthsRead < numValues) { uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); rle->next(buffer, chunk, nullptr); - for(size_t i=0; i < chunk; ++i) { + for (size_t i = 0; i < chunk; ++i) { childrenElements += static_cast<size_t>(buffer[i]); } lengthsRead += chunk; @@ -1080,30 +981,26 @@ namespace orc { return numValues; } - void 
ListColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + void ListColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) { nextInternal<false>(rowBatch, numValues, notNull); } - void ListColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + void ListColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { nextInternal<true>(rowBatch, numValues, notNull); } - template<bool encoded> - void ListColumnReader::nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + template <bool encoded> + void ListColumnReader::nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { ColumnReader::next(rowBatch, numValues, notNull); - ListVectorBatch &listBatch = dynamic_cast<ListVectorBatch&>(rowBatch); + ListVectorBatch& listBatch = dynamic_cast<ListVectorBatch&>(rowBatch); int64_t* offsets = listBatch.offsets.data(); notNull = listBatch.hasNulls ? 
listBatch.notNull.data() : nullptr; rle->next(offsets, numValues, notNull); uint64_t totalChildren = 0; if (notNull) { - for(size_t i=0; i < numValues; ++i) { + for (size_t i = 0; i < numValues; ++i) { if (notNull[i]) { uint64_t tmp = static_cast<uint64_t>(offsets[i]); offsets[i] = static_cast<int64_t>(totalChildren); @@ -1113,14 +1010,14 @@ namespace orc { } } } else { - for(size_t i=0; i < numValues; ++i) { + for (size_t i = 0; i < numValues; ++i) { uint64_t tmp = static_cast<uint64_t>(offsets[i]); offsets[i] = static_cast<int64_t>(totalChildren); totalChildren += tmp; } } offsets[numValues] = static_cast<int64_t>(totalChildren); - ColumnReader *childReader = child.get(); + ColumnReader* childReader = child.get(); if (childReader) { if (encoded) { childReader->nextEncoded(*(listBatch.elements.get()), totalChildren, nullptr); @@ -1130,8 +1027,7 @@ namespace orc { } } - void ListColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { + void ListColumnReader::seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); rle->seek(positions.at(columnId)); if (child.get()) { @@ -1139,54 +1035,49 @@ namespace orc { } } - class MapColumnReader: public ColumnReader { - private: + class MapColumnReader : public ColumnReader { + private: std::unique_ptr<ColumnReader> keyReader; std::unique_ptr<ColumnReader> elementReader; std::unique_ptr<RleDecoder> rle; - public: - MapColumnReader(const Type& type, StripeStreams& stipe); + public: + MapColumnReader(const Type& type, StripeStreams& stipe, bool useTightNumericVector = false, + bool throwOnSchemaEvolutionOverflow = false); ~MapColumnReader() override; uint64_t skip(uint64_t numValues) override; - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; - void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t 
numValues, - char *notNull) override; + void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; + void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override; - private: - template<bool encoded> - void nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull); + private: + template <bool encoded> + void nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull); }; - MapColumnReader::MapColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { + MapColumnReader::MapColumnReader(const Type& type, StripeStreams& stripe, + bool useTightNumericVector, bool throwOnSchemaEvolutionOverflow) + : ColumnReader(type, stripe) { // Determine if the key and/or value columns are selected const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); std::unique_ptr<SeekableInputStream> stream = stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); - if (stream == nullptr) - throw ParseError("LENGTH stream not found in Map column"); - rle = createRleDecoder(std::move(stream), false, vers, memoryPool); + if (stream == nullptr) throw ParseError("LENGTH stream not found in Map column"); + rle = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics); const Type& keyType = *type.getSubtype(0); if (selectedColumns[static_cast<uint64_t>(keyType.getColumnId())]) { - keyReader = buildReader(keyType, stripe); + keyReader = + buildReader(keyType, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow); } const Type& elementType = *type.getSubtype(1); if (selectedColumns[static_cast<uint64_t>(elementType.getColumnId())]) { - elementReader = buildReader(elementType, stripe); + elementReader = + buildReader(elementType, stripe, useTightNumericVector, 
throwOnSchemaEvolutionOverflow); } } @@ -1196,8 +1087,8 @@ namespace orc { uint64_t MapColumnReader::skip(uint64_t numValues) { numValues = ColumnReader::skip(numValues); - ColumnReader *rawKeyReader = keyReader.get(); - ColumnReader *rawElementReader = elementReader.get(); + ColumnReader* rawKeyReader = keyReader.get(); + ColumnReader* rawElementReader = elementReader.get(); if (rawKeyReader || rawElementReader) { const uint64_t BUFFER_SIZE = 1024; int64_t buffer[BUFFER_SIZE]; @@ -1206,7 +1097,7 @@ namespace orc { while (lengthsRead < numValues) { uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); rle->next(buffer, chunk, nullptr); - for(size_t i=0; i < chunk; ++i) { + for (size_t i = 0; i < chunk; ++i) { childrenElements += static_cast<size_t>(buffer[i]); } lengthsRead += chunk; @@ -1223,32 +1114,26 @@ namespace orc { return numValues; } - void MapColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) - { + void MapColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) { nextInternal<false>(rowBatch, numValues, notNull); } - void MapColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) - { + void MapColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { nextInternal<true>(rowBatch, numValues, notNull); } - template<bool encoded> - void MapColumnReader::nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + template <bool encoded> + void MapColumnReader::nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { ColumnReader::next(rowBatch, numValues, notNull); - MapVectorBatch &mapBatch = dynamic_cast<MapVectorBatch&>(rowBatch); + MapVectorBatch& mapBatch = dynamic_cast<MapVectorBatch&>(rowBatch); int64_t* offsets = mapBatch.offsets.data(); notNull = mapBatch.hasNulls ? 
mapBatch.notNull.data() : nullptr; rle->next(offsets, numValues, notNull); uint64_t totalChildren = 0; if (notNull) { - for(size_t i=0; i < numValues; ++i) { + for (size_t i = 0; i < numValues; ++i) { if (notNull[i]) { uint64_t tmp = static_cast<uint64_t>(offsets[i]); offsets[i] = static_cast<int64_t>(totalChildren); @@ -1258,14 +1143,14 @@ namespace orc { } } } else { - for(size_t i=0; i < numValues; ++i) { + for (size_t i = 0; i < numValues; ++i) { uint64_t tmp = static_cast<uint64_t>(offsets[i]); offsets[i] = static_cast<int64_t>(totalChildren); totalChildren += tmp; } } offsets[numValues] = static_cast<int64_t>(totalChildren); - ColumnReader *rawKeyReader = keyReader.get(); + ColumnReader* rawKeyReader = keyReader.get(); if (rawKeyReader) { if (encoded) { rawKeyReader->nextEncoded(*(mapBatch.keys.get()), totalChildren, nullptr); @@ -1273,7 +1158,7 @@ namespace orc { rawKeyReader->next(*(mapBatch.keys.get()), totalChildren, nullptr); } } - ColumnReader *rawElementReader = elementReader.get(); + ColumnReader* rawElementReader = elementReader.get(); if (rawElementReader) { if (encoded) { rawElementReader->nextEncoded(*(mapBatch.elements.get()), totalChildren, nullptr); @@ -1283,8 +1168,7 @@ namespace orc { } } - void MapColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { + void MapColumnReader::seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); rle->seek(positions.at(columnId)); if (keyReader.get()) { @@ -1295,54 +1179,49 @@ namespace orc { } } - class UnionColumnReader: public ColumnReader { - private: + class UnionColumnReader : public ColumnReader { + private: std::unique_ptr<ByteRleDecoder> rle; std::vector<std::unique_ptr<ColumnReader>> childrenReader; std::vector<int64_t> childrenCounts; uint64_t numChildren; - public: - UnionColumnReader(const Type& type, StripeStreams& stipe); + public: + UnionColumnReader(const Type& type, StripeStreams& stipe, bool 
useTightNumericVector = false, + bool throwOnSchemaEvolutionOverflow = false); uint64_t skip(uint64_t numValues) override; - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; - void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; + void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; + void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override; - private: - template<bool encoded> - void nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull); + private: + template <bool encoded> + void nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull); }; - UnionColumnReader::UnionColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { + UnionColumnReader::UnionColumnReader(const Type& type, StripeStreams& stripe, + bool useTightNumericVector, + bool throwOnSchemaEvolutionOverflow) + : ColumnReader(type, stripe) { numChildren = type.getSubtypeCount(); childrenReader.resize(numChildren); childrenCounts.resize(numChildren); std::unique_ptr<SeekableInputStream> stream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (stream == nullptr) - throw ParseError("LENGTH stream not found in Union column"); - rle = createByteRleDecoder(std::move(stream)); + if (stream == nullptr) throw ParseError("LENGTH stream not found in Union column"); + rle = createByteRleDecoder(std::move(stream), metrics); // figure out which types are selected const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); - for(unsigned int i=0; i < numChildren; ++i) { - const Type &child = *type.getSubtype(i); + for (unsigned int i = 0; i < numChildren; ++i) { + const Type& child = 
*type.getSubtype(i); if (selectedColumns[static_cast<size_t>(child.getColumnId())]) { - childrenReader[i] = buildReader(child, stripe); + childrenReader[i] = + buildReader(child, stripe, useTightNumericVector, throwOnSchemaEvolutionOverflow); } } } @@ -1352,17 +1231,17 @@ namespace orc { const uint64_t BUFFER_SIZE = 1024; char buffer[BUFFER_SIZE]; uint64_t lengthsRead = 0; - int64_t *counts = childrenCounts.data(); + int64_t* counts = childrenCounts.data(); memset(counts, 0, sizeof(int64_t) * numChildren); while (lengthsRead < numValues) { uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); rle->next(buffer, chunk, nullptr); - for(size_t i=0; i < chunk; ++i) { + for (size_t i = 0; i < chunk; ++i) { counts[static_cast<size_t>(buffer[i])] += 1; } lengthsRead += chunk; } - for(size_t i=0; i < numChildren; ++i) { + for (size_t i = 0; i < numChildren; ++i) { if (counts[i] != 0 && childrenReader[i] != nullptr) { childrenReader[i]->skip(static_cast<uint64_t>(counts[i])); } @@ -1370,63 +1249,57 @@ namespace orc { return numValues; } - void UnionColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + void UnionColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) { nextInternal<false>(rowBatch, numValues, notNull); } - void UnionColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + void UnionColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { nextInternal<true>(rowBatch, numValues, notNull); } - template<bool encoded> - void UnionColumnReader::nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + template <bool encoded> + void UnionColumnReader::nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { ColumnReader::next(rowBatch, numValues, notNull); - UnionVectorBatch &unionBatch = dynamic_cast<UnionVectorBatch&>(rowBatch); + UnionVectorBatch& unionBatch = 
dynamic_cast<UnionVectorBatch&>(rowBatch); uint64_t* offsets = unionBatch.offsets.data(); int64_t* counts = childrenCounts.data(); memset(counts, 0, sizeof(int64_t) * numChildren); unsigned char* tags = unionBatch.tags.data(); notNull = unionBatch.hasNulls ? unionBatch.notNull.data() : nullptr; - rle->next(reinterpret_cast<char *>(tags), numValues, notNull); + rle->next(reinterpret_cast<char*>(tags), numValues, notNull); // set the offsets for each row if (notNull) { - for(size_t i=0; i < numValues; ++i) { + for (size_t i = 0; i < numValues; ++i) { if (notNull[i]) { - offsets[i] = - static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++); + offsets[i] = static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++); } } } else { - for(size_t i=0; i < numValues; ++i) { - offsets[i] = - static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++); + for (size_t i = 0; i < numValues; ++i) { + offsets[i] = static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++); } } // read the right number of each child column - for(size_t i=0; i < numChildren; ++i) { + for (size_t i = 0; i < numChildren; ++i) { if (childrenReader[i] != nullptr) { if (encoded) { childrenReader[i]->nextEncoded(*(unionBatch.children[i]), - static_cast<uint64_t>(counts[i]), nullptr); + static_cast<uint64_t>(counts[i]), nullptr); } else { - childrenReader[i]->next(*(unionBatch.children[i]), - static_cast<uint64_t>(counts[i]), nullptr); + childrenReader[i]->next(*(unionBatch.children[i]), static_cast<uint64_t>(counts[i]), + nullptr); } } } } void UnionColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { + std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); rle->seek(positions.at(columnId)); - for(size_t i = 0; i < numChildren; ++i) { + for (size_t i = 0; i < numChildren; ++i) { if (childrenReader[i] != nullptr) { childrenReader[i]->seekToRowGroup(positions); } @@ -1446,13 +1319,13 @@ namespace orc { } } - 
class Decimal64ColumnReader: public ColumnReader { - public: + class Decimal64ColumnReader : public ColumnReader { + public: static const uint32_t MAX_PRECISION_64 = 18; static const uint32_t MAX_PRECISION_128 = 38; static const int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1]; - protected: + protected: std::unique_ptr<SeekableInputStream> valueStream; int32_t precision; int32_t scale; @@ -1467,9 +1340,8 @@ namespace orc { void readBuffer() { while (buffer == bufferEnd) { int length; - if (!valueStream->Next(reinterpret_cast<const void**>(&buffer), - &length)) { - throw ParseError("Read past end of stream in Decimal64ColumnReader "+ + if (!valueStream->Next(reinterpret_cast<const void**>(&buffer), &length)) { + throw ParseError("Read past end of stream in Decimal64ColumnReader " + valueStream->getName()); } bufferEnd = buffer + length; @@ -1489,69 +1361,61 @@ namespace orc { } } value = unZigZag(static_cast<uint64_t>(value)); - if (scale > currentScale && - static_cast<uint64_t>(scale - currentScale) <= MAX_PRECISION_64) { + if (scale > currentScale && static_cast<uint64_t>(scale - currentScale) <= MAX_PRECISION_64) { value *= POWERS_OF_TEN[scale - currentScale]; } else if (scale < currentScale && - static_cast<uint64_t>(currentScale - scale) <= MAX_PRECISION_64) { + static_cast<uint64_t>(currentScale - scale) <= MAX_PRECISION_64) { value /= POWERS_OF_TEN[currentScale - scale]; } else if (scale != currentScale) { throw ParseError("Decimal scale out of range"); } } - public: + public: Decimal64ColumnReader(const Type& type, StripeStreams& stipe); ~Decimal64ColumnReader() override; uint64_t skip(uint64_t numValues) override; - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; + void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) 
override; }; const uint32_t Decimal64ColumnReader::MAX_PRECISION_64; const uint32_t Decimal64ColumnReader::MAX_PRECISION_128; - const int64_t Decimal64ColumnReader::POWERS_OF_TEN[MAX_PRECISION_64 + 1]= - {1, - 10, - 100, - 1000, - 10000, - 100000, - 1000000, - 10000000, - 100000000, - 1000000000, - 10000000000, - 100000000000, - 1000000000000, - 10000000000000, - 100000000000000, - 1000000000000000, - 10000000000000000, - 100000000000000000, - 1000000000000000000}; - - Decimal64ColumnReader::Decimal64ColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { + const int64_t Decimal64ColumnReader::POWERS_OF_TEN[MAX_PRECISION_64 + 1] = {1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000, + 100000000000, + 1000000000000, + 10000000000000, + 100000000000000, + 1000000000000000, + 10000000000000000, + 100000000000000000, + 1000000000000000000}; + + Decimal64ColumnReader::Decimal64ColumnReader(const Type& type, StripeStreams& stripe) + : ColumnReader(type, stripe) { scale = static_cast<int32_t>(type.getScale()); precision = static_cast<int32_t>(type.getPrecision()); valueStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (valueStream == nullptr) - throw ParseError("DATA stream not found in Decimal64Column"); + if (valueStream == nullptr) throw ParseError("DATA stream not found in Decimal64Column"); buffer = nullptr; bufferEnd = nullptr; RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); std::unique_ptr<SeekableInputStream> stream = stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true); - if (stream == nullptr) - throw ParseError("SECONDARY stream not found in Decimal64Column"); - scaleDecoder = createRleDecoder(std::move(stream), true, vers, memoryPool); + if (stream == nullptr) throw ParseError("SECONDARY stream not found in Decimal64Column"); + scaleDecoder = createRleDecoder(std::move(stream), true, vers, memoryPool, metrics); } 
Decimal64ColumnReader::~Decimal64ColumnReader() { @@ -1571,13 +1435,10 @@ namespace orc { return numValues; } - void Decimal64ColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + void Decimal64ColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) { ColumnReader::next(rowBatch, numValues, notNull); notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - Decimal64VectorBatch &batch = - dynamic_cast<Decimal64VectorBatch&>(rowBatch); + Decimal64VectorBatch& batch = dynamic_cast<Decimal64VectorBatch&>(rowBatch); int64_t* values = batch.values.data(); // read the next group of scales int64_t* scaleBuffer = batch.readScales.data(); @@ -1585,13 +1446,13 @@ namespace orc { batch.precision = precision; batch.scale = scale; if (notNull) { - for(size_t i=0; i < numValues; ++i) { + for (size_t i = 0; i < numValues; ++i) { if (notNull[i]) { readInt64(values[i], static_cast<int32_t>(scaleBuffer[i])); } } } else { - for(size_t i=0; i < numValues; ++i) { + for (size_t i = 0; i < numValues; ++i) { readInt64(values[i], static_cast<int32_t>(scaleBuffer[i])); } } @@ -1599,28 +1460,25 @@ namespace orc { void scaleInt128(Int128& value, uint32_t scale, uint32_t currentScale) { if (scale > currentScale) { - while(scale > currentScale) { + while (scale > currentScale) { uint32_t scaleAdjust = - std::min(Decimal64ColumnReader::MAX_PRECISION_64, - scale - currentScale); + std::min(Decimal64ColumnReader::MAX_PRECISION_64, scale - currentScale); value *= Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust]; currentScale += scaleAdjust; } } else if (scale < currentScale) { Int128 remainder; - while(currentScale > scale) { + while (currentScale > scale) { uint32_t scaleAdjust = - std::min(Decimal64ColumnReader::MAX_PRECISION_64, - currentScale - scale); - value = value.divide(Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust], - remainder); + std::min(Decimal64ColumnReader::MAX_PRECISION_64, currentScale - scale); + value = 
value.divide(Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust], remainder); currentScale -= scaleAdjust; } } } void Decimal64ColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { + std::unordered_map<uint64_t, PositionProvider>& positions) { ColumnReader::seekToRowGroup(positions); valueStream->seek(positions.at(columnId)); scaleDecoder->seek(positions.at(columnId)); @@ -1629,16 +1487,14 @@ namespace orc { bufferEnd = nullptr; } - class Decimal128ColumnReader: public Decimal64ColumnReader { - public: + class Decimal128ColumnReader : public Decimal64ColumnReader { + public: Decimal128ColumnReader(const Type& type, StripeStreams& stipe); ~Decimal128ColumnReader() override; - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; - private: + private: void readInt128(Int128& value, int32_t currentScale) { value = 0; Int128 work; @@ -1648,22 +1504,19 @@ namespace orc { unsigned char ch = static_cast<unsigned char>(*(buffer++)); work = ch & 0x7f; work <<= offset; - value |= work; + value |= work; offset += 7; if (!(ch & 0x80)) { break; } } unZigZagInt128(value); - scaleInt128(value, static_cast<uint32_t>(scale), - static_cast<uint32_t>(currentScale)); + scaleInt128(value, static_cast<uint32_t>(scale), static_cast<uint32_t>(currentScale)); } }; - Decimal128ColumnReader::Decimal128ColumnReader - (const Type& type, - StripeStreams& stripe - ): Decimal64ColumnReader(type, stripe) { + Decimal128ColumnReader::Decimal128ColumnReader(const Type& type, StripeStreams& stripe) + : Decimal64ColumnReader(type, stripe) { // PASS } @@ -1671,13 +1524,11 @@ namespace orc { // PASS } - void Decimal128ColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + void Decimal128ColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { ColumnReader::next(rowBatch, numValues, 
notNull); notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - Decimal128VectorBatch &batch = - dynamic_cast<Decimal128VectorBatch&>(rowBatch); + Decimal128VectorBatch& batch = dynamic_cast<Decimal128VectorBatch&>(rowBatch); Int128* values = batch.values.data(); // read the next group of scales int64_t* scaleBuffer = batch.readScales.data(); @@ -1685,38 +1536,35 @@ namespace orc { batch.precision = precision; batch.scale = scale; if (notNull) { - for(size_t i=0; i < numValues; ++i) { + for (size_t i = 0; i < numValues; ++i) { if (notNull[i]) { readInt128(values[i], static_cast<int32_t>(scaleBuffer[i])); } } } else { - for(size_t i=0; i < numValues; ++i) { + for (size_t i = 0; i < numValues; ++i) { readInt128(values[i], static_cast<int32_t>(scaleBuffer[i])); } } } - class Decimal64ColumnReaderV2: public ColumnReader { - protected: + class Decimal64ColumnReaderV2 : public ColumnReader { + protected: std::unique_ptr<RleDecoder> valueDecoder; int32_t precision; int32_t scale; - public: + public: Decimal64ColumnReaderV2(const Type& type, StripeStreams& stripe); ~Decimal64ColumnReaderV2() override; uint64_t skip(uint64_t numValues) override; - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; }; - Decimal64ColumnReaderV2::Decimal64ColumnReaderV2(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { + Decimal64ColumnReaderV2::Decimal64ColumnReaderV2(const Type& type, StripeStreams& stripe) + : ColumnReader(type, stripe) { scale = static_cast<int32_t>(type.getScale()); precision = static_cast<int32_t>(type.getPrecision()); std::unique_ptr<SeekableInputStream> stream = @@ -1726,7 +1574,7 @@ namespace orc { ss << "DATA stream not found in Decimal64V2 column. 
ColumnId=" << columnId; throw ParseError(ss.str()); } - valueDecoder = createRleDecoder(std::move(stream), true, RleVersion_2, memoryPool); + valueDecoder = createRleDecoder(std::move(stream), true, RleVersion_2, memoryPool, metrics); } Decimal64ColumnReaderV2::~Decimal64ColumnReaderV2() { @@ -1739,20 +1587,18 @@ namespace orc { return numValues; } - void Decimal64ColumnReaderV2::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + void Decimal64ColumnReaderV2::next(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { ColumnReader::next(rowBatch, numValues, notNull); notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - Decimal64VectorBatch &batch = - dynamic_cast<Decimal64VectorBatch&>(rowBatch); + Decimal64VectorBatch& batch = dynamic_cast<Decimal64VectorBatch&>(rowBatch); valueDecoder->next(batch.values.data(), numValues, notNull); batch.precision = precision; batch.scale = scale; } - class DecimalHive11ColumnReader: public Decimal64ColumnReader { - private: + class DecimalHive11ColumnReader : public Decimal64ColumnReader { + private: bool throwOnOverflow; std::ostream* errorStream; @@ -1762,7 +1608,7 @@ namespace orc { bool readInt128(Int128& value, int32_t currentScale) { // -/+ 99999999999999999999999999999999999999 static const Int128 MIN_VALUE(-0x4b3b4ca85a86c47b, 0xf675ddc000000001); - static const Int128 MAX_VALUE( 0x4b3b4ca85a86c47a, 0x098a223fffffffff); + static const Int128 MAX_VALUE(0x4b3b4ca85a86c47a, 0x098a223fffffffff); value = 0; Int128 work; @@ -1778,7 +1624,7 @@ namespace orc { result = false; } work <<= offset; - value |= work; + value |= work; offset += 7; if (!(ch & 0x80)) { break; @@ -1789,24 +1635,19 @@ namespace orc { return result; } unZigZagInt128(value); - scaleInt128(value, static_cast<uint32_t>(scale), - static_cast<uint32_t>(currentScale)); + scaleInt128(value, static_cast<uint32_t>(scale), static_cast<uint32_t>(currentScale)); return value >= MIN_VALUE && value <= MAX_VALUE; } - 
public: + public: DecimalHive11ColumnReader(const Type& type, StripeStreams& stipe); ~DecimalHive11ColumnReader() override; - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; }; - DecimalHive11ColumnReader::DecimalHive11ColumnReader - (const Type& type, - StripeStreams& stripe - ): Decimal64ColumnReader(type, stripe) { + DecimalHive11ColumnReader::DecimalHive11ColumnReader(const Type& type, StripeStreams& stripe) + : Decimal64ColumnReader(type, stripe) { scale = stripe.getForcedScaleOnHive11Decimal(); throwOnOverflow = stripe.getThrowOnHive11DecimalOverflow(); errorStream = stripe.getErrorStream(); @@ -1816,13 +1657,11 @@ namespace orc { // PASS } - void DecimalHive11ColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { + void DecimalHive11ColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { ColumnReader::next(rowBatch, numValues, notNull); notNull = rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr; - Decimal128VectorBatch &batch = - dynamic_cast<Decimal128VectorBatch&>(rowBatch); + Decimal128VectorBatch& batch = dynamic_cast<Decimal128VectorBatch&>(rowBatch); Int128* values = batch.values.data(); // read the next group of scales int64_t* scaleBuffer = batch.readScales.data(); @@ -1832,10 +1671,9 @@ namespace orc { batch.precision = precision; batch.scale = scale; if (notNull) { - for(size_t i=0; i < numValues; ++i) { + for (size_t i = 0; i < numValues; ++i) { if (notNull[i]) { - if (!readInt128(values[i], - static_cast<int32_t>(scaleBuffer[i]))) { + if (!readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]))) { if (throwOnOverflow) { throw ParseError("Hive 0.11 decimal was more than 38 digits."); } else { @@ -1848,9 +1686,8 @@ namespace orc { } } } else { - for(size_t i=0; i < numValues; ++i) { - if (!readInt128(values[i], - static_cast<int32_t>(scaleBuffer[i]))) { + for (size_t i = 0; i < numValues; ++i) { + if (!readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]))) { if (throwOnOverflow) { throw ParseError("Hive 0.11 decimal was more than 38 digits."); } else { @@ -1866,109 +1703,133 @@ namespace orc { } static bool isLittleEndian() { - static union { uint32_t i; char c[4]; } num = { 0x01020304 }; + static union { + uint32_t i; + char c[4]; + } num = {0x01020304}; return num.c[0] == 4; } /** * Create a reader for the given stripe. 
*/ - std::unique_ptr<ColumnReader> buildReader(const Type& type, - StripeStreams& stripe) { - switch (static_cast<int64_t>(type.getKind())) { - case DATE: - case INT: - case LONG: - case SHORT: - return std::unique_ptr<ColumnReader>( - new IntegerColumnReader(type, stripe)); - case BINARY: - case CHAR: - case STRING: - case VARCHAR: - switch (static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())){ - case proto::ColumnEncoding_Kind_DICTIONARY: - case proto::ColumnEncoding_Kind_DICTIONARY_V2: - return std::unique_ptr<ColumnReader>( - new StringDictionaryColumnReader(type, stripe)); - case proto::ColumnEncoding_Kind_DIRECT: - case proto::ColumnEncoding_Kind_DIRECT_V2: - return std::unique_ptr<ColumnReader>( - new StringDirectColumnReader(type, stripe)); - default: - throw NotImplementedYet("buildReader unhandled string encoding"); - } + std::unique_ptr<ColumnReader> buildReader(const Type& type, StripeStreams& stripe, + bool useTightNumericVector, + bool throwOnSchemaEvolutionOverflow, + bool convertToReadType) { + if (convertToReadType && stripe.getSchemaEvolution() && + stripe.getSchemaEvolution()->needConvert(type)) { + return buildConvertReader(type, stripe, useTightNumericVector, + throwOnSchemaEvolutionOverflow); + } - case BOOLEAN: - return std::unique_ptr<ColumnReader>( - new BooleanColumnReader(type, stripe)); + switch (static_cast<int64_t>(type.getKind())) { + case SHORT: + if (useTightNumericVector) { + return std::make_unique<IntegerColumnReader<ShortVectorBatch>>(type, stripe); + } + return std::make_unique<IntegerColumnReader<LongVectorBatch>>(type, stripe); + case INT: + if (useTightNumericVector) { + return std::make_unique<IntegerColumnReader<IntVectorBatch>>(type, stripe); + } + return std::make_unique<IntegerColumnReader<LongVectorBatch>>(type, stripe); + case LONG: + case DATE: + return std::make_unique<IntegerColumnReader<LongVectorBatch>>(type, stripe); + case BINARY: + case CHAR: + case STRING: + case VARCHAR: + switch 
(static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())) { + case proto::ColumnEncoding_Kind_DICTIONARY: + case proto::ColumnEncoding_Kind_DICTIONARY_V2: + return std::make_unique<StringDictionaryColumnReader>(type, stripe); + case proto::ColumnEncoding_Kind_DIRECT: + case proto::ColumnEncoding_Kind_DIRECT_V2: + return std::make_unique<StringDirectColumnReader>(type, stripe); + default: + throw NotImplementedYet("buildReader unhandled string encoding"); + } - case BYTE: - return std::unique_ptr<ColumnReader>( - new ByteColumnReader(type, stripe)); + case BOOLEAN: { + if (useTightNumericVector) { + return std::make_unique<BooleanColumnReader<ByteVectorBatch>>(type, stripe); + } else { + return std::make_unique<BooleanColumnReader<LongVectorBatch>>(type, stripe); + } + } - case LIST: - return std::unique_ptr<ColumnReader>( - new ListColumnReader(type, stripe)); + case BYTE: + if (useTightNumericVector) { + return std::make_unique<ByteColumnReader<ByteVectorBatch>>(type, stripe); + } + return std::make_unique<ByteColumnReader<LongVectorBatch>>(type, stripe); - case MAP: - return std::unique_ptr<ColumnReader>( - new MapColumnReader(type, stripe)); + case LIST: + return std::make_unique<ListColumnReader>(type, stripe, useTightNumericVector, + throwOnSchemaEvolutionOverflow); - case UNION: - return std::unique_ptr<ColumnReader>( - new UnionColumnReader(type, stripe)); + case MAP: + return std::make_unique<MapColumnReader>(type, stripe, useTightNumericVector, + throwOnSchemaEvolutionOverflow); - case STRUCT: - return std::unique_ptr<ColumnReader>( - new StructColumnReader(type, stripe)); + case UNION: + return std::make_unique<UnionColumnReader>(type, stripe, useTightNumericVector, + throwOnSchemaEvolutionOverflow); - case FLOAT: - if (isLittleEndian()) { - return std::unique_ptr<ColumnReader>( - new DoubleColumnReader<FLOAT, true>(type, stripe)); - } - return std::unique_ptr<ColumnReader>( - new DoubleColumnReader<FLOAT, false>(type, stripe)); + case STRUCT: 
+ return std::make_unique<StructColumnReader>(type, stripe, useTightNumericVector, + throwOnSchemaEvolutionOverflow); - case DOUBLE: - if (isLittleEndian()) { - return std::unique_ptr<ColumnReader>( - new DoubleColumnReader<DOUBLE, true>(type, stripe)); - } - return std::unique_ptr<ColumnReader>( - new DoubleColumnReader<DOUBLE, false>(type, stripe)); - - case TIMESTAMP: - return std::unique_ptr<ColumnReader> - (new TimestampColumnReader(type, stripe, false)); - - case TIMESTAMP_INSTANT: - return std::unique_ptr<ColumnReader> - (new TimestampColumnReader(type, stripe, true)); - - case DECIMAL: - // is this a Hive 0.11 or 0.12 file? - if (type.getPrecision() == 0) { - return std::unique_ptr<ColumnReader> - (new DecimalHive11ColumnReader(type, stripe)); + case FLOAT: { + if (useTightNumericVector) { + if (isLittleEndian()) { + return std::make_unique<DoubleColumnReader<FLOAT, true, float, FloatVectorBatch>>( + type, stripe); + } + return std::make_unique<DoubleColumnReader<FLOAT, false, float, FloatVectorBatch>>( + type, stripe); + } + if (isLittleEndian()) { + return std::make_unique<DoubleColumnReader<FLOAT, true, double, DoubleVectorBatch>>( + type, stripe); + } + return std::make_unique<DoubleColumnReader<FLOAT, false, double, DoubleVectorBatch>>( + type, stripe); } - // can we represent the values using int64_t? 
- if (type.getPrecision() <= Decimal64ColumnReader::MAX_PRECISION_64) { - if (stripe.isDecimalAsLong()) { - return std::unique_ptr<ColumnReader> - (new Decimal64ColumnReaderV2(type, stripe)); + case DOUBLE: { + if (isLittleEndian()) { + return std::make_unique<DoubleColumnReader<DOUBLE, true, double, DoubleVectorBatch>>( + type, stripe); } - return std::unique_ptr<ColumnReader> - (new Decimal64ColumnReader(type, stripe)); + return std::make_unique<DoubleColumnReader<DOUBLE, false, double, DoubleVectorBatch>>( + type, stripe); } - // otherwise we use the Int128 implementation - return std::unique_ptr<ColumnReader> - (new Decimal128ColumnReader(type, stripe)); + case TIMESTAMP: + return std::make_unique<TimestampColumnReader>(type, stripe, false); + + case TIMESTAMP_INSTANT: + return std::make_unique<TimestampColumnReader>(type, stripe, true); - default: - throw NotImplementedYet("buildReader unhandled type"); + case DECIMAL: + // is this a Hive 0.11 or 0.12 file? + if (type.getPrecision() == 0) { + return std::make_unique<DecimalHive11ColumnReader>(type, stripe); + } + // can we represent the values using int64_t? 
+ if (type.getPrecision() <= Decimal64ColumnReader::MAX_PRECISION_64) { + if (stripe.isDecimalAsLong()) { + return std::make_unique<Decimal64ColumnReaderV2>(type, stripe); + } + return std::make_unique<Decimal64ColumnReader>(type, stripe); + } + // otherwise we use the Int128 implementation + return std::make_unique<Decimal128ColumnReader>(type, stripe); + + default: + throw NotImplementedYet("buildReader unhandled type"); } } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/ColumnReader.hh b/contrib/libs/apache/orc/c++/src/ColumnReader.hh index 80b59de2c1..f0f3fe1b52 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnReader.hh +++ b/contrib/libs/apache/orc/c++/src/ColumnReader.hh @@ -30,8 +30,10 @@ namespace orc { + class SchemaEvolution; + class StripeStreams { - public: + public: virtual ~StripeStreams(); /** @@ -53,10 +55,9 @@ namespace orc { * @param shouldStream should the reading page the stream in * @return the new stream */ - virtual std::unique_ptr<SeekableInputStream> - getStream(uint64_t columnId, - proto::Stream_Kind kind, - bool shouldStream) const = 0; + virtual std::unique_ptr<SeekableInputStream> getStream(uint64_t columnId, + proto::Stream_Kind kind, + bool shouldStream) const = 0; /** * Get the memory pool for this reader. @@ -64,6 +65,11 @@ namespace orc { virtual MemoryPool& getMemoryPool() const = 0; /** + * Get the reader metrics for this reader. + */ + virtual ReaderMetrics* getReaderMetrics() const = 0; + + /** * Get the writer's timezone, so that we can convert their dates correctly. */ virtual const Timezone& getWriterTimezone() const = 0; @@ -97,18 +103,24 @@ namespace orc { * encoded in RLE. */ virtual bool isDecimalAsLong() const = 0; + + /** + * @return get schema evolution utility object + */ + virtual const SchemaEvolution* getSchemaEvolution() const = 0; }; /** * The interface for reading ORC data types. 
*/ class ColumnReader { - protected: + protected: std::unique_ptr<ByteRleDecoder> notNullDecoder; uint64_t columnId; MemoryPool& memoryPool; + ReaderMetrics* metrics; - public: + public: ColumnReader(const Type& type, StripeStreams& stipe); virtual ~ColumnReader(); @@ -128,9 +140,7 @@ namespace orc { * a mask (with at least numValues bytes) for which values to * set. */ - virtual void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull); + virtual void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull); /** * Read the next group of values without decoding @@ -140,10 +150,7 @@ namespace orc { * a mask (with at least numValues bytes) for which values to * set. */ - virtual void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) - { + virtual void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) { rowBatch.isEncoded = false; next(rowBatch, numValues, notNull); } @@ -152,16 +159,16 @@ namespace orc { * Seek to beginning of a row group in the current stripe * @param positions a list of PositionProviders storing the positions */ - virtual void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions); - + virtual void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions); }; /** * Create a reader for the given stripe. 
*/ - std::unique_ptr<ColumnReader> buildReader(const Type& type, - StripeStreams& stripe); -} + std::unique_ptr<ColumnReader> buildReader(const Type& type, StripeStreams& stripe, + bool useTightNumericVector = false, + bool throwOnSchemaEvolutionOverflow = false, + bool convertToReadType = true); +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc index 32b68af349..f24be1f0b2 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc +++ b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc @@ -27,55 +27,43 @@ namespace orc { StreamsFactory::~StreamsFactory() { - //PASS + // PASS } class StreamsFactoryImpl : public StreamsFactory { - public: - StreamsFactoryImpl( - const WriterOptions& writerOptions, - OutputStream* outputStream) : - options(writerOptions), - outStream(outputStream) { - } - - virtual std::unique_ptr<BufferedOutputStream> - createStream(proto::Stream_Kind kind) const override; - private: + public: + StreamsFactoryImpl(const WriterOptions& writerOptions, OutputStream* outputStream) + : options(writerOptions), outStream(outputStream) {} + + virtual std::unique_ptr<BufferedOutputStream> createStream( + proto::Stream_Kind kind) const override; + + private: const WriterOptions& options; OutputStream* outStream; }; - std::unique_ptr<BufferedOutputStream> StreamsFactoryImpl::createStream( - proto::Stream_Kind) const { + std::unique_ptr<BufferedOutputStream> StreamsFactoryImpl::createStream(proto::Stream_Kind) const { // In the future, we can decide compression strategy and modifier // based on stream kind. 
But for now we just use the setting from // WriterOption - return createCompressor( - options.getCompression(), - outStream, - options.getCompressionStrategy(), + return createCompressor(options.getCompression(), outStream, options.getCompressionStrategy(), // BufferedOutputStream initial capacity - 1 * 1024 * 1024, - options.getCompressionBlockSize(), - *options.getMemoryPool()); + options.getOutputBufferCapacity(), options.getCompressionBlockSize(), + *options.getMemoryPool(), options.getWriterMetrics()); } - std::unique_ptr<StreamsFactory> createStreamsFactory( - const WriterOptions& options, - OutputStream* outStream) { - return std::unique_ptr<StreamsFactory>( - new StreamsFactoryImpl(options, outStream)); + std::unique_ptr<StreamsFactory> createStreamsFactory(const WriterOptions& options, + OutputStream* outStream) { + return std::make_unique<StreamsFactoryImpl>(options, outStream); } RowIndexPositionRecorder::~RowIndexPositionRecorder() { // PASS } - proto::ColumnEncoding_Kind RleVersionMapper(RleVersion rleVersion) - { - switch (rleVersion) - { + proto::ColumnEncoding_Kind RleVersionMapper(RleVersion rleVersion) { + switch (rleVersion) { case RleVersion_1: return proto::ColumnEncoding_Kind_DIRECT; case RleVersion_2: @@ -85,24 +73,21 @@ namespace orc { } } - ColumnWriter::ColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - columnId(type.getColumnId()), - colIndexStatistics(), - colStripeStatistics(), - colFileStatistics(), - enableIndex(options.getEnableIndex()), - rowIndex(), - rowIndexEntry(), - rowIndexPosition(), - enableBloomFilter(false), - memPool(*options.getMemoryPool()), - indexStream(), - bloomFilterStream(), - hasNullValue(false) { - + ColumnWriter::ColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options) + : columnId(type.getColumnId()), + colIndexStatistics(), + colStripeStatistics(), + colFileStatistics(), + enableIndex(options.getEnableIndex()), + 
rowIndex(), + rowIndexEntry(), + rowIndexPosition(), + enableBloomFilter(false), + memPool(*options.getMemoryPool()), + indexStream(), + bloomFilterStream(), + hasNullValue(false) { std::unique_ptr<BufferedOutputStream> presentStream = factory.createStream(proto::Stream_Kind_PRESENT); notNullEncoder = createBooleanRleEncoder(std::move(presentStream)); @@ -112,20 +97,17 @@ namespace orc { colFileStatistics = createColumnStatistics(type); if (enableIndex) { - rowIndex = std::unique_ptr<proto::RowIndex>(new proto::RowIndex()); - rowIndexEntry = - std::unique_ptr<proto::RowIndexEntry>(new proto::RowIndexEntry()); - rowIndexPosition = std::unique_ptr<RowIndexPositionRecorder>( - new RowIndexPositionRecorder(*rowIndexEntry)); - indexStream = - factory.createStream(proto::Stream_Kind_ROW_INDEX); + rowIndex = std::make_unique<proto::RowIndex>(); + rowIndexEntry = std::make_unique<proto::RowIndexEntry>(); + rowIndexPosition = std::make_unique<RowIndexPositionRecorder>(*rowIndexEntry); + indexStream = factory.createStream(proto::Stream_Kind_ROW_INDEX); // BloomFilters for non-UTF8 strings and non-UTC timestamps are not supported - if (options.isColumnUseBloomFilter(columnId) - && options.getBloomFilterVersion() == BloomFilterVersion::UTF8) { + if (options.isColumnUseBloomFilter(columnId) && + options.getBloomFilterVersion() == BloomFilterVersion::UTF8) { enableBloomFilter = true; - bloomFilter.reset(new BloomFilterImpl( - options.getRowIndexStride(), options.getBloomFilterFPP())); + bloomFilter.reset( + new BloomFilterImpl(options.getRowIndexStride(), options.getBloomFilterFPP())); bloomFilterIndex.reset(new proto::BloomFilterIndex()); bloomFilterStream = factory.createStream(proto::Stream_Kind_BLOOM_FILTER_UTF8); } @@ -136,9 +118,7 @@ namespace orc { // PASS } - void ColumnWriter::add(ColumnVectorBatch& batch, - uint64_t offset, - uint64_t numValues, + void ColumnWriter::add(ColumnVectorBatch& batch, uint64_t offset, uint64_t numValues, const char* incomingMask) { const 
char* notNull = batch.notNull.data() + offset; notNullEncoder->add(notNull, numValues, incomingMask); @@ -167,8 +147,7 @@ namespace orc { return notNullEncoder->getBufferSize(); } - void ColumnWriter::getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const { + void ColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const { getProtoBufStatistics(stats, colStripeStatistics.get()); } @@ -182,13 +161,12 @@ namespace orc { colIndexStatistics->reset(); } - void ColumnWriter::getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const { + void ColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const { getProtoBufStatistics(stats, colFileStatistics.get()); } void ColumnWriter::createRowIndexEntry() { - proto::ColumnStatistics *indexStats = rowIndexEntry->mutable_statistics(); + proto::ColumnStatistics* indexStats = rowIndexEntry->mutable_statistics(); colIndexStatistics->toProtoBuf(*indexStats); *rowIndex->add_entry() = *rowIndexEntry; @@ -206,12 +184,12 @@ namespace orc { void ColumnWriter::addBloomFilterEntry() { if (enableBloomFilter) { - BloomFilterUTF8Utils::serialize(*bloomFilter, *bloomFilterIndex->add_bloomfilter()); + BloomFilterUTF8Utils::serialize(*bloomFilter, *bloomFilterIndex->add_bloom_filter()); bloomFilter->reset(); } } - void ColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const { + void ColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const { if (!hasNullValue) { // remove positions of present stream int presentCount = indexStream->isCompressed() ? 
4 : 3; @@ -266,7 +244,7 @@ namespace orc { if (enableBloomFilter) { bloomFilter->reset(); - bloomFilterIndex->clear_bloomfilter(); + bloomFilterIndex->clear_bloom_filter(); } } @@ -275,28 +253,21 @@ namespace orc { } class StructColumnWriter : public ColumnWriter { - public: - StructColumnWriter( - const Type& type, - const StreamsFactory& factory, + public: + StructColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) override; virtual void flush(std::vector<proto::Stream>& streams) override; virtual uint64_t getEstimatedSize() const override; - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; + virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override; - virtual void getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; + virtual void getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const override; - virtual void getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; + virtual void getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const override; virtual void mergeStripeStatsIntoFileStats() override; @@ -304,23 +275,20 @@ namespace orc { virtual void createRowIndexEntry() override; - virtual void writeIndex( - std::vector<proto::Stream> &streams) const override; + virtual void writeIndex(std::vector<proto::Stream>& streams) const override; virtual void writeDictionary() override; virtual void reset() override; - private: + private: std::vector<std::unique_ptr<ColumnWriter>> children; }; - StructColumnWriter::StructColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options) { - 
for(unsigned int i = 0; i < type.getSubtypeCount(); ++i) { + StructColumnWriter::StructColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options) + : ColumnWriter(type, factory, options) { + for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) { const Type& child = *type.getSubtype(i); children.push_back(buildWriter(child, factory, options)); } @@ -330,20 +298,15 @@ namespace orc { } } - void StructColumnWriter::add( - ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + void StructColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) { - const StructVectorBatch* structBatch = - dynamic_cast<const StructVectorBatch *>(&rowBatch); + const StructVectorBatch* structBatch = dynamic_cast<const StructVectorBatch*>(&rowBatch); if (structBatch == nullptr) { throw InvalidArgument("Failed to cast to StructVectorBatch"); } ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - const char* notNull = structBatch->hasNulls ? - structBatch->notNull.data() + offset : nullptr; + const char* notNull = structBatch->hasNulls ? 
structBatch->notNull.data() + offset : nullptr; for (uint32_t i = 0; i < children.size(); ++i) { children[i]->add(*structBatch->fields[i], offset, numValues, notNull); } @@ -372,8 +335,7 @@ namespace orc { } } - void StructColumnWriter::writeIndex( - std::vector<proto::Stream> &streams) const { + void StructColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const { ColumnWriter::writeIndex(streams); for (uint32_t i = 0; i < children.size(); ++i) { children[i]->writeIndex(streams); @@ -388,19 +350,17 @@ namespace orc { return size; } - void StructColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { + void StructColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); - encoding.set_dictionarysize(0); + encoding.set_dictionary_size(0); encodings.push_back(encoding); for (uint32_t i = 0; i < children.size(); ++i) { children[i]->getColumnEncoding(encodings); } } - void StructColumnWriter::getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const { + void StructColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getStripeStatistics(stats); for (uint32_t i = 0; i < children.size(); ++i) { @@ -416,8 +376,7 @@ namespace orc { } } - void StructColumnWriter::getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const { + void StructColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getFileStatistics(stats); for (uint32_t i = 0; i < children.size(); ++i) { @@ -425,7 +384,7 @@ namespace orc { } } - void StructColumnWriter::mergeRowGroupStatsIntoStripeStats() { + void StructColumnWriter::mergeRowGroupStatsIntoStripeStats() { ColumnWriter::mergeRowGroupStatsIntoStripeStats(); for (uint32_t i = 0; i < children.size(); ++i) { @@ -455,47 +414,38 @@ namespace orc { } } + template <typename 
BatchType> class IntegerColumnWriter : public ColumnWriter { - public: - IntegerColumnWriter( - const Type& type, - const StreamsFactory& factory, + public: + IntegerColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) override; virtual void flush(std::vector<proto::Stream>& streams) override; virtual uint64_t getEstimatedSize() const override; - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; + virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override; virtual void recordPosition() const override; - protected: + protected: std::unique_ptr<RleEncoder> rleEncoder; - private: + private: RleVersion rleVersion; }; - IntegerColumnWriter::IntegerColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()) { + template <typename BatchType> + IntegerColumnWriter<BatchType>::IntegerColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) + : ColumnWriter(type, factory, options), rleVersion(options.getRleVersion()) { std::unique_ptr<BufferedOutputStream> dataStream = - factory.createStream(proto::Stream_Kind_DATA); - rleEncoder = createRleEncoder( - std::move(dataStream), - true, - rleVersion, - memPool, + factory.createStream(proto::Stream_Kind_DATA); + rleEncoder = createRleEncoder(std::move(dataStream), true, rleVersion, memPool, options.getAlignedBitpacking()); if (enableIndex) { @@ -503,15 +453,12 @@ namespace orc { } } - void IntegerColumnWriter::add( - ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const LongVectorBatch* longBatch = - 
dynamic_cast<const LongVectorBatch*>(&rowBatch); - if (longBatch == nullptr) { - throw InvalidArgument("Failed to cast to LongVectorBatch"); + template <typename BatchType> + void IntegerColumnWriter<BatchType>::add(ColumnVectorBatch& rowBatch, uint64_t offset, + uint64_t numValues, const char* incomingMask) { + const BatchType* intBatch = dynamic_cast<const BatchType*>(&rowBatch); + if (intBatch == nullptr) { + throw InvalidArgument("Failed to cast to IntegerVectorBatch"); } IntegerColumnStatisticsImpl* intStats = dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get()); @@ -521,9 +468,8 @@ namespace orc { ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - const int64_t* data = longBatch->data.data() + offset; - const char* notNull = longBatch->hasNulls ? - longBatch->notNull.data() + offset : nullptr; + const auto* data = intBatch->data.data() + offset; + const char* notNull = intBatch->hasNulls ? intBatch->notNull.data() + offset : nullptr; rleEncoder->add(data, numValues, notNull); @@ -533,9 +479,9 @@ namespace orc { if (notNull == nullptr || notNull[i]) { ++count; if (enableBloomFilter) { - bloomFilter->addLong(data[i]); + bloomFilter->addLong(static_cast<int64_t>(data[i])); } - intStats->update(data[i], 1); + intStats->update(static_cast<int64_t>(data[i]), 1); } } intStats->increase(count); @@ -544,7 +490,8 @@ namespace orc { } } - void IntegerColumnWriter::flush(std::vector<proto::Stream>& streams) { + template <typename BatchType> + void IntegerColumnWriter<BatchType>::flush(std::vector<proto::Stream>& streams) { ColumnWriter::flush(streams); proto::Stream stream; @@ -554,59 +501,57 @@ namespace orc { streams.push_back(stream); } - uint64_t IntegerColumnWriter::getEstimatedSize() const { + template <typename BatchType> + uint64_t IntegerColumnWriter<BatchType>::getEstimatedSize() const { uint64_t size = ColumnWriter::getEstimatedSize(); size += rleEncoder->getBufferSize(); return size; } - void 
IntegerColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { + template <typename BatchType> + void IntegerColumnWriter<BatchType>::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; encoding.set_kind(RleVersionMapper(rleVersion)); - encoding.set_dictionarysize(0); + encoding.set_dictionary_size(0); if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); + encoding.set_bloom_encoding(BloomFilterVersion::UTF8); } encodings.push_back(encoding); } - void IntegerColumnWriter::recordPosition() const { + template <typename BatchType> + void IntegerColumnWriter<BatchType>::recordPosition() const { ColumnWriter::recordPosition(); rleEncoder->recordPosition(rowIndexPosition.get()); } + template <typename BatchType> class ByteColumnWriter : public ColumnWriter { - public: - ByteColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + public: + ByteColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) override; virtual void flush(std::vector<proto::Stream>& streams) override; virtual uint64_t getEstimatedSize() const override; - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; + virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override; virtual void recordPosition() const override; - private: + private: std::unique_ptr<ByteRleEncoder> byteRleEncoder; }; - ByteColumnWriter::ByteColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options) { + template <typename BatchType> + 
ByteColumnWriter<BatchType>::ByteColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options) + : ColumnWriter(type, factory, options) { std::unique_ptr<BufferedOutputStream> dataStream = - factory.createStream(proto::Stream_Kind_DATA); + factory.createStream(proto::Stream_Kind_DATA); byteRleEncoder = createByteRleEncoder(std::move(dataStream)); if (enableIndex) { @@ -614,13 +559,12 @@ namespace orc { } } - void ByteColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch); + template <typename BatchType> + void ByteColumnWriter<BatchType>::add(ColumnVectorBatch& rowBatch, uint64_t offset, + uint64_t numValues, const char* incomingMask) { + BatchType* byteBatch = dynamic_cast<BatchType*>(&rowBatch); if (byteBatch == nullptr) { - throw InvalidArgument("Failed to cast to LongVectorBatch"); + throw InvalidArgument("Failed to cast to IntegerVectorBatch"); } IntegerColumnStatisticsImpl* intStats = dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get()); @@ -630,9 +574,8 @@ namespace orc { ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - int64_t* data = byteBatch->data.data() + offset; - const char* notNull = byteBatch->hasNulls ? - byteBatch->notNull.data() + offset : nullptr; + auto* data = byteBatch->data.data() + offset; + const char* notNull = byteBatch->hasNulls ? 
byteBatch->notNull.data() + offset : nullptr; char* byteData = reinterpret_cast<char*>(data); for (uint64_t i = 0; i < numValues; ++i) { @@ -656,7 +599,8 @@ namespace orc { } } - void ByteColumnWriter::flush(std::vector<proto::Stream>& streams) { + template <typename BatchType> + void ByteColumnWriter<BatchType>::flush(std::vector<proto::Stream>& streams) { ColumnWriter::flush(streams); proto::Stream stream; @@ -666,59 +610,59 @@ namespace orc { streams.push_back(stream); } - uint64_t ByteColumnWriter::getEstimatedSize() const { + template <typename BatchType> + uint64_t ByteColumnWriter<BatchType>::getEstimatedSize() const { uint64_t size = ColumnWriter::getEstimatedSize(); size += byteRleEncoder->getBufferSize(); return size; } - void ByteColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { + template <typename BatchType> + void ByteColumnWriter<BatchType>::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); - encoding.set_dictionarysize(0); + encoding.set_dictionary_size(0); if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); + encoding.set_bloom_encoding(BloomFilterVersion::UTF8); } encodings.push_back(encoding); } - void ByteColumnWriter::recordPosition() const { + template <typename BatchType> + void ByteColumnWriter<BatchType>::recordPosition() const { ColumnWriter::recordPosition(); byteRleEncoder->recordPosition(rowIndexPosition.get()); } + template <typename BatchType> class BooleanColumnWriter : public ColumnWriter { - public: - BooleanColumnWriter(const Type& type, - const StreamsFactory& factory, + public: + BooleanColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, 
const char* incomingMask) override; virtual void flush(std::vector<proto::Stream>& streams) override; virtual uint64_t getEstimatedSize() const override; - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; + virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override; virtual void recordPosition() const override; - private: + private: std::unique_ptr<ByteRleEncoder> rleEncoder; }; - BooleanColumnWriter::BooleanColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options) { + template <typename BatchType> + BooleanColumnWriter<BatchType>::BooleanColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) + : ColumnWriter(type, factory, options) { std::unique_ptr<BufferedOutputStream> dataStream = - factory.createStream(proto::Stream_Kind_DATA); + factory.createStream(proto::Stream_Kind_DATA); rleEncoder = createBooleanRleEncoder(std::move(dataStream)); if (enableIndex) { @@ -726,13 +670,14 @@ namespace orc { } } - void BooleanColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch); + template <typename BatchType> + void BooleanColumnWriter<BatchType>::add(ColumnVectorBatch& rowBatch, uint64_t offset, + uint64_t numValues, const char* incomingMask) { + BatchType* byteBatch = dynamic_cast<BatchType*>(&rowBatch); if (byteBatch == nullptr) { - throw InvalidArgument("Failed to cast to LongVectorBatch"); + std::stringstream ss; + ss << "Failed to cast to " << typeid(BatchType).name(); + throw InvalidArgument(ss.str()); } BooleanColumnStatisticsImpl* boolStats = dynamic_cast<BooleanColumnStatisticsImpl*>(colIndexStatistics.get()); @@ -742,9 +687,8 @@ namespace orc { ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - int64_t* data = 
byteBatch->data.data() + offset; - const char* notNull = byteBatch->hasNulls ? - byteBatch->notNull.data() + offset : nullptr; + auto* data = byteBatch->data.data() + offset; + const char* notNull = byteBatch->hasNulls ? byteBatch->notNull.data() + offset : nullptr; char* byteData = reinterpret_cast<char*>(data); for (uint64_t i = 0; i < numValues; ++i) { @@ -768,7 +712,8 @@ namespace orc { } } - void BooleanColumnWriter::flush(std::vector<proto::Stream>& streams) { + template <typename BatchType> + void BooleanColumnWriter<BatchType>::flush(std::vector<proto::Stream>& streams) { ColumnWriter::flush(streams); proto::Stream stream; @@ -778,65 +723,63 @@ namespace orc { streams.push_back(stream); } - uint64_t BooleanColumnWriter::getEstimatedSize() const { + template <typename BatchType> + uint64_t BooleanColumnWriter<BatchType>::getEstimatedSize() const { uint64_t size = ColumnWriter::getEstimatedSize(); size += rleEncoder->getBufferSize(); return size; } - void BooleanColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { + template <typename BatchType> + void BooleanColumnWriter<BatchType>::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); - encoding.set_dictionarysize(0); + encoding.set_dictionary_size(0); if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); + encoding.set_bloom_encoding(BloomFilterVersion::UTF8); } encodings.push_back(encoding); } - void BooleanColumnWriter::recordPosition() const { + template <typename BatchType> + void BooleanColumnWriter<BatchType>::recordPosition() const { ColumnWriter::recordPosition(); rleEncoder->recordPosition(rowIndexPosition.get()); } - class DoubleColumnWriter : public ColumnWriter { - public: - DoubleColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options, - bool isFloat); + template <typename ValueType, 
typename BatchType> + class FloatingColumnWriter : public ColumnWriter { + public: + FloatingColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options, bool isFloat); - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) override; virtual void flush(std::vector<proto::Stream>& streams) override; virtual uint64_t getEstimatedSize() const override; - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; + virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override; virtual void recordPosition() const override; - private: + private: bool isFloat; std::unique_ptr<AppendOnlyBufferedStream> dataStream; DataBuffer<char> buffer; }; - DoubleColumnWriter::DoubleColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options, - bool isFloatType) : - ColumnWriter(type, factory, options), - isFloat(isFloatType), - buffer(*options.getMemoryPool()) { - dataStream.reset(new AppendOnlyBufferedStream( - factory.createStream(proto::Stream_Kind_DATA))); + template <typename ValueType, typename BatchType> + FloatingColumnWriter<ValueType, BatchType>::FloatingColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options, + bool isFloatType) + : ColumnWriter(type, factory, options), + isFloat(isFloatType), + buffer(*options.getMemoryPool()) { + dataStream.reset(new AppendOnlyBufferedStream(factory.createStream(proto::Stream_Kind_DATA))); buffer.resize(isFloat ? 
4 : 8); if (enableIndex) { @@ -854,26 +797,24 @@ namespace orc { } } - void DoubleColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const DoubleVectorBatch* dblBatch = - dynamic_cast<const DoubleVectorBatch*>(&rowBatch); + template <typename ValueType, typename BatchType> + void FloatingColumnWriter<ValueType, BatchType>::add(ColumnVectorBatch& rowBatch, uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + const BatchType* dblBatch = dynamic_cast<const BatchType*>(&rowBatch); if (dblBatch == nullptr) { - throw InvalidArgument("Failed to cast to DoubleVectorBatch"); + throw InvalidArgument("Failed to cast to FloatingVectorBatch"); } DoubleColumnStatisticsImpl* doubleStats = - dynamic_cast<DoubleColumnStatisticsImpl*>(colIndexStatistics.get()); + dynamic_cast<DoubleColumnStatisticsImpl*>(colIndexStatistics.get()); if (doubleStats == nullptr) { throw InvalidArgument("Failed to cast to DoubleColumnStatisticsImpl"); } ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - const double* doubleData = dblBatch->data.data() + offset; - const char* notNull = dblBatch->hasNulls ? - dblBatch->notNull.data() + offset : nullptr; + const ValueType* doubleData = dblBatch->data.data() + offset; + const char* notNull = dblBatch->hasNulls ? dblBatch->notNull.data() + offset : nullptr; size_t bytes = isFloat ? 
4 : 8; char* data = buffer.data(); @@ -883,14 +824,14 @@ namespace orc { if (isFloat) { encodeFloatNum<float, int32_t>(static_cast<float>(doubleData[i]), data); } else { - encodeFloatNum<double, int64_t>(doubleData[i], data); + encodeFloatNum<double, int64_t>(static_cast<double>(doubleData[i]), data); } dataStream->write(data, bytes); ++count; if (enableBloomFilter) { - bloomFilter->addDouble(doubleData[i]); + bloomFilter->addDouble(static_cast<double>(doubleData[i])); } - doubleStats->update(doubleData[i]); + doubleStats->update(static_cast<double>(doubleData[i])); } } doubleStats->increase(count); @@ -899,7 +840,8 @@ namespace orc { } } - void DoubleColumnWriter::flush(std::vector<proto::Stream>& streams) { + template <typename ValueType, typename BatchType> + void FloatingColumnWriter<ValueType, BatchType>::flush(std::vector<proto::Stream>& streams) { ColumnWriter::flush(streams); proto::Stream stream; @@ -909,24 +851,27 @@ namespace orc { streams.push_back(stream); } - uint64_t DoubleColumnWriter::getEstimatedSize() const { + template <typename ValueType, typename BatchType> + uint64_t FloatingColumnWriter<ValueType, BatchType>::getEstimatedSize() const { uint64_t size = ColumnWriter::getEstimatedSize(); size += dataStream->getSize(); return size; } - void DoubleColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { + template <typename ValueType, typename BatchType> + void FloatingColumnWriter<ValueType, BatchType>::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); - encoding.set_dictionarysize(0); + encoding.set_dictionary_size(0); if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); + encoding.set_bloom_encoding(BloomFilterVersion::UTF8); } encodings.push_back(encoding); } - void DoubleColumnWriter::recordPosition() const { + template <typename ValueType, typename BatchType> + void 
FloatingColumnWriter<ValueType, BatchType>::recordPosition() const { ColumnWriter::recordPosition(); dataStream->recordPosition(rowIndexPosition.get()); } @@ -935,27 +880,26 @@ namespace orc { * Implementation of increasing sorted string dictionary */ class SortedStringDictionary { - public: + public: struct DictEntry { - DictEntry(const char * str, size_t len):data(str),length(len) {} - const char * data; + DictEntry(const char* str, size_t len) : data(str), length(len) {} + const char* data; size_t length; }; - SortedStringDictionary():totalLength(0) {} + SortedStringDictionary() : totalLength(0) {} // insert a new string into dictionary, return its insertion order - size_t insert(const char * data, size_t len); + size_t insert(const char* data, size_t len); // write dictionary data & length to output buffer - void flush(AppendOnlyBufferedStream * dataStream, - RleEncoder * lengthEncoder) const; + void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const; // reorder input index buffer from insertion order to dictionary order void reorder(std::vector<int64_t>& idxBuffer) const; // get dict entries in insertion order - void getEntriesInInsertionOrder(std::vector<const DictEntry *>&) const; + void getEntriesInInsertionOrder(std::vector<const DictEntry*>&) const; // return count of entries size_t size() const; @@ -965,7 +909,7 @@ namespace orc { void clear(); - private: + private: struct LessThan { bool operator()(const DictEntry& left, const DictEntry& right) const { int ret = memcmp(left.data, right.data, std::min(left.length, right.length)); @@ -989,14 +933,14 @@ namespace orc { }; // insert a new string into dictionary, return its insertion order - size_t SortedStringDictionary::insert(const char * str, size_t len) { + size_t SortedStringDictionary::insert(const char* str, size_t len) { auto ret = dict.insert({DictEntry(str, len), dict.size()}); if (ret.second) { // make a copy to internal storage data.push_back(std::vector<char>(len)); 
memcpy(data.back().data(), str, len); // update dictionary entry to link pointer to internal storage - DictEntry * entry = const_cast<DictEntry *>(&(ret.first->first)); + DictEntry* entry = const_cast<DictEntry*>(&(ret.first->first)); entry->data = data.back().data(); totalLength += len; } @@ -1004,8 +948,8 @@ namespace orc { } // write dictionary data & length to output buffer - void SortedStringDictionary::flush(AppendOnlyBufferedStream * dataStream, - RleEncoder * lengthEncoder) const { + void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream, + RleEncoder* lengthEncoder) const { for (auto it = dict.cbegin(); it != dict.cend(); ++it) { dataStream->write(it->first.data, it->first.length); lengthEncoder->write(static_cast<int64_t>(it->first.length)); @@ -1032,14 +976,13 @@ namespace orc { // do the transformation for (size_t i = 0; i != idxBuffer.size(); ++i) { - idxBuffer[i] = static_cast<int64_t>( - mapping[static_cast<size_t>(idxBuffer[i])]); + idxBuffer[i] = static_cast<int64_t>(mapping[static_cast<size_t>(idxBuffer[i])]); } } // get dict entries in insertion order void SortedStringDictionary::getEntriesInInsertionOrder( - std::vector<const DictEntry *>& entries) const { + std::vector<const DictEntry*>& entries) const { entries.resize(dict.size()); for (auto it = dict.cbegin(); it != dict.cend(); ++it) { entries[it->second] = &(it->first); @@ -1056,29 +999,25 @@ namespace orc { return totalLength; } - void SortedStringDictionary::clear() { + void SortedStringDictionary::clear() { totalLength = 0; data.clear(); dict.clear(); } class StringColumnWriter : public ColumnWriter { - public: - StringColumnWriter(const Type& type, - const StreamsFactory& factory, + public: + StringColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const 
char* incomingMask) override; virtual void flush(std::vector<proto::Stream>& streams) override; virtual uint64_t getEstimatedSize() const override; - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; + virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override; virtual void recordPosition() const override; @@ -1088,7 +1027,7 @@ namespace orc { virtual void reset() override; - private: + private: /** * dictionary related functions */ @@ -1098,7 +1037,7 @@ namespace orc { void deleteDictStreams(); void fallbackToDirectEncoding(); - protected: + protected: RleVersion rleVersion; bool useCompression; const StreamsFactory& streamsFactory; @@ -1128,18 +1067,16 @@ namespace orc { mutable std::vector<size_t> startOfRowGroups; }; - StringColumnWriter::StringColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()), - useCompression(options.getCompression() != CompressionKind_NONE), - streamsFactory(factory), - alignedBitPacking(options.getAlignedBitpacking()), - doneDictionaryCheck(false), - useDictionary(options.getEnableDictionary()), - dictSizeThreshold(options.getDictionaryKeySizeThreshold()){ + StringColumnWriter::StringColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options) + : ColumnWriter(type, factory, options), + rleVersion(options.getRleVersion()), + useCompression(options.getCompression() != CompressionKind_NONE), + streamsFactory(factory), + alignedBitPacking(options.getAlignedBitpacking()), + doneDictionaryCheck(false), + useDictionary(options.getEnableDictionary()), + dictSizeThreshold(options.getDictionaryKeySizeThreshold()) { if (type.getKind() == TypeKind::BINARY) { useDictionary = false; doneDictionaryCheck = true; @@ -1157,12 +1094,9 @@ namespace orc { } } - void StringColumnWriter::add(ColumnVectorBatch& rowBatch, - 
uint64_t offset, - uint64_t numValues, + void StringColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) { - const StringVectorBatch* stringBatch = - dynamic_cast<const StringVectorBatch*>(&rowBatch); + const StringVectorBatch* stringBatch = dynamic_cast<const StringVectorBatch*>(&rowBatch); if (stringBatch == nullptr) { throw InvalidArgument("Failed to cast to StringVectorBatch"); } @@ -1175,12 +1109,11 @@ namespace orc { ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - char *const * data = stringBatch->data.data() + offset; + char* const* data = stringBatch->data.data() + offset; const int64_t* length = stringBatch->length.data() + offset; - const char* notNull = stringBatch->hasNulls ? - stringBatch->notNull.data() + offset : nullptr; + const char* notNull = stringBatch->hasNulls ? stringBatch->notNull.data() + offset : nullptr; - if (!useDictionary){ + if (!useDictionary) { directLengthEncoder->add(length, numValues, notNull); } @@ -1259,21 +1192,18 @@ namespace orc { return size; } - void StringColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { + void StringColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; if (!useDictionary) { - encoding.set_kind(rleVersion == RleVersion_1 ? - proto::ColumnEncoding_Kind_DIRECT : - proto::ColumnEncoding_Kind_DIRECT_V2); + encoding.set_kind(rleVersion == RleVersion_1 ? proto::ColumnEncoding_Kind_DIRECT + : proto::ColumnEncoding_Kind_DIRECT_V2); } else { - encoding.set_kind(rleVersion == RleVersion_1 ? - proto::ColumnEncoding_Kind_DICTIONARY : - proto::ColumnEncoding_Kind_DICTIONARY_V2); + encoding.set_kind(rleVersion == RleVersion_1 ? 
proto::ColumnEncoding_Kind_DICTIONARY + : proto::ColumnEncoding_Kind_DICTIONARY_V2); } - encoding.set_dictionarysize(static_cast<uint32_t>(dictionary.size())); + encoding.set_dictionary_size(static_cast<uint32_t>(dictionary.size())); if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); + encoding.set_bloom_encoding(BloomFilterVersion::UTF8); } encodings.push_back(encoding); } @@ -1292,8 +1222,9 @@ namespace orc { bool StringColumnWriter::checkDictionaryKeyRatio() { if (!doneDictionaryCheck) { - useDictionary = dictionary.size() <= static_cast<size_t>( - static_cast<double>(dictionary.idxInDictBuffer.size()) * dictSizeThreshold); + useDictionary = dictionary.size() <= + static_cast<size_t>(static_cast<double>(dictionary.idxInDictBuffer.size()) * + dictSizeThreshold); doneDictionaryCheck = true; } @@ -1320,33 +1251,24 @@ namespace orc { void StringColumnWriter::createDirectStreams() { std::unique_ptr<BufferedOutputStream> directLengthStream = - streamsFactory.createStream(proto::Stream_Kind_LENGTH); - directLengthEncoder = createRleEncoder(std::move(directLengthStream), - false, - rleVersion, - memPool, - alignedBitPacking); - directDataStream.reset(new AppendOnlyBufferedStream( - streamsFactory.createStream(proto::Stream_Kind_DATA))); + streamsFactory.createStream(proto::Stream_Kind_LENGTH); + directLengthEncoder = createRleEncoder(std::move(directLengthStream), false, rleVersion, + memPool, alignedBitPacking); + directDataStream.reset( + new AppendOnlyBufferedStream(streamsFactory.createStream(proto::Stream_Kind_DATA))); } void StringColumnWriter::createDictStreams() { std::unique_ptr<BufferedOutputStream> dictDataStream = - streamsFactory.createStream(proto::Stream_Kind_DATA); - dictDataEncoder = createRleEncoder(std::move(dictDataStream), - false, - rleVersion, - memPool, - alignedBitPacking); + streamsFactory.createStream(proto::Stream_Kind_DATA); + dictDataEncoder = + createRleEncoder(std::move(dictDataStream), false, rleVersion, 
memPool, alignedBitPacking); std::unique_ptr<BufferedOutputStream> dictLengthStream = - streamsFactory.createStream(proto::Stream_Kind_LENGTH); - dictLengthEncoder = createRleEncoder(std::move(dictLengthStream), - false, - rleVersion, - memPool, + streamsFactory.createStream(proto::Stream_Kind_LENGTH); + dictLengthEncoder = createRleEncoder(std::move(dictLengthStream), false, rleVersion, memPool, alignedBitPacking); dictStream.reset(new AppendOnlyBufferedStream( - streamsFactory.createStream(proto::Stream_Kind_DICTIONARY_DATA))); + streamsFactory.createStream(proto::Stream_Kind_DICTIONARY_DATA))); } void StringColumnWriter::deleteDictStreams() { @@ -1360,7 +1282,7 @@ namespace orc { } void StringColumnWriter::writeDictionary() { - if (useDictionary && !doneDictionaryCheck) { + if (useDictionary && !doneDictionaryCheck) { // when index is disabled, dictionary check happens while writing 1st stripe if (!checkDictionaryKeyRatio()) { fallbackToDirectEncoding(); @@ -1376,7 +1298,7 @@ namespace orc { dictionary.reorder(dictionary.idxInDictBuffer); // write data sequences - int64_t * data = dictionary.idxInDictBuffer.data(); + int64_t* data = dictionary.idxInDictBuffer.data(); if (enableIndex) { size_t prevOffset = 0; for (size_t i = 0; i < startOfRowGroups.size(); ++i) { @@ -1386,9 +1308,9 @@ namespace orc { // update index positions int rowGroupId = static_cast<int>(i); - proto::RowIndexEntry* indexEntry = - (rowGroupId < rowIndex->entry_size()) ? - rowIndex->mutable_entry(rowGroupId) : rowIndexEntry.get(); + proto::RowIndexEntry* indexEntry = (rowGroupId < rowIndex->entry_size()) + ? 
rowIndex->mutable_entry(rowGroupId) + : rowIndexEntry.get(); // add positions for direct streams RowIndexPositionRecorder recorder(*indexEntry); @@ -1397,8 +1319,7 @@ namespace orc { prevOffset = offset; } - dictDataEncoder->add(data + prevOffset, - dictionary.idxInDictBuffer.size() - prevOffset, + dictDataEncoder->add(data + prevOffset, dictionary.idxInDictBuffer.size() - prevOffset, nullptr); } else { dictDataEncoder->add(data, dictionary.idxInDictBuffer.size(), nullptr); @@ -1412,18 +1333,18 @@ namespace orc { if (enableIndex) { // fallback happens at the 1st row group; // simply complete positions for direct streams - proto::RowIndexEntry * indexEntry = rowIndexEntry.get(); + proto::RowIndexEntry* indexEntry = rowIndexEntry.get(); RowIndexPositionRecorder recorder(*indexEntry); directDataStream->recordPosition(&recorder); directLengthEncoder->recordPosition(&recorder); } // get dictionary entries in insertion order - std::vector<const SortedStringDictionary::DictEntry *> entries; + std::vector<const SortedStringDictionary::DictEntry*> entries; dictionary.getEntriesInInsertionOrder(entries); // store each length of the data into a vector - const SortedStringDictionary::DictEntry * dictEntry = nullptr; + const SortedStringDictionary::DictEntry* dictEntry = nullptr; for (uint64_t i = 0; i != dictionary.idxInDictBuffer.size(); ++i) { // write one row data in direct encoding dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer[i])]; @@ -1438,7 +1359,7 @@ namespace orc { /** * Counts how many utf-8 chars of the input data */ - static uint64_t charLength(const char * data, uint64_t length) { + static uint64_t charLength(const char* data, uint64_t length) { uint64_t chars = 0; for (uint64_t i = 0; i < length; i++) { if (isUtfStartByte(data[i])) { @@ -1458,9 +1379,7 @@ namespace orc { * @param data the bytes of UTF-8 * @param length the length of data to truncate */ - static uint64_t truncateBytesTo(uint64_t maxCharLength, - const char * data, - uint64_t 
length) { + static uint64_t truncateBytesTo(uint64_t maxCharLength, const char* data, uint64_t length) { uint64_t chars = 0; if (length <= maxCharLength) { return length; @@ -1490,8 +1409,8 @@ namespace orc { * @param from the first byte location * @param until the last byte location * @return the index of the last character - */ - static uint64_t findLastCharacter(const char * text, uint64_t from, uint64_t until) { + */ + static uint64_t findLastCharacter(const char* text, uint64_t from, uint64_t until) { uint64_t posn = until; /* we don't expect characters more than 5 bytes */ while (posn >= from) { @@ -1501,36 +1420,29 @@ namespace orc { posn -= 1; } /* beginning of a valid char not found */ - throw std::logic_error( - "Could not truncate string, beginning of a valid char not found"); + throw std::logic_error("Could not truncate string, beginning of a valid char not found"); } }; class CharColumnWriter : public StringColumnWriter { - public: - CharColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - StringColumnWriter(type, factory, options), - maxLength(type.getMaximumLength()), - padBuffer(*options.getMemoryPool()) { + public: + CharColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options) + : StringColumnWriter(type, factory, options), + maxLength(type.getMaximumLength()), + padBuffer(*options.getMemoryPool()) { // utf-8 is currently 4 bytes long, but it could be up to 6 padBuffer.resize(maxLength * 6); } - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) override; - private: + private: uint64_t maxLength; DataBuffer<char> padBuffer; }; - void CharColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + void CharColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const 
char* incomingMask) { StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch); if (charsBatch == nullptr) { @@ -1547,26 +1459,24 @@ namespace orc { char** data = charsBatch->data.data() + offset; int64_t* length = charsBatch->length.data() + offset; - const char* notNull = charsBatch->hasNulls ? - charsBatch->notNull.data() + offset : nullptr; + const char* notNull = charsBatch->hasNulls ? charsBatch->notNull.data() + offset : nullptr; uint64_t count = 0; for (uint64_t i = 0; i < numValues; ++i) { if (!notNull || notNull[i]) { - const char * charData = nullptr; + const char* charData = nullptr; uint64_t originLength = static_cast<uint64_t>(length[i]); uint64_t charLength = Utf8Utils::charLength(data[i], originLength); if (charLength >= maxLength) { charData = data[i]; - length[i] = static_cast<int64_t>( - Utf8Utils::truncateBytesTo(maxLength, data[i], originLength)); + length[i] = + static_cast<int64_t>(Utf8Utils::truncateBytesTo(maxLength, data[i], originLength)); } else { charData = padBuffer.data(); // the padding is exactly 1 byte per char length[i] = length[i] + static_cast<int64_t>(maxLength - charLength); memcpy(padBuffer.data(), data[i], originLength); - memset(padBuffer.data() + originLength, - ' ', + memset(padBuffer.data() + originLength, ' ', static_cast<size_t>(length[i]) - originLength); } @@ -1596,27 +1506,21 @@ namespace orc { } class VarCharColumnWriter : public StringColumnWriter { - public: - VarCharColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - StringColumnWriter(type, factory, options), - maxLength(type.getMaximumLength()) { + public: + VarCharColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options) + : StringColumnWriter(type, factory, options), maxLength(type.getMaximumLength()) { // PASS } - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + virtual void add(ColumnVectorBatch& rowBatch, uint64_t 
offset, uint64_t numValues, const char* incomingMask) override; - private: + private: uint64_t maxLength; }; - void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) { StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch); if (charsBatch == nullptr) { @@ -1633,14 +1537,13 @@ namespace orc { char* const* data = charsBatch->data.data() + offset; int64_t* length = charsBatch->length.data() + offset; - const char* notNull = charsBatch->hasNulls ? - charsBatch->notNull.data() + offset : nullptr; + const char* notNull = charsBatch->hasNulls ? charsBatch->notNull.data() + offset : nullptr; uint64_t count = 0; for (uint64_t i = 0; i < numValues; ++i) { if (!notNull || notNull[i]) { - uint64_t itemLength = Utf8Utils::truncateBytesTo( - maxLength, data[i], static_cast<uint64_t>(length[i])); + uint64_t itemLength = + Utf8Utils::truncateBytesTo(maxLength, data[i], static_cast<uint64_t>(length[i])); length[i] = static_cast<int64_t>(itemLength); if (useDictionary) { @@ -1669,23 +1572,18 @@ namespace orc { } class BinaryColumnWriter : public StringColumnWriter { - public: - BinaryColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - StringColumnWriter(type, factory, options) { + public: + BinaryColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options) + : StringColumnWriter(type, factory, options) { // PASS } - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) override; }; - void BinaryColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + void BinaryColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, 
const char* incomingMask) { StringVectorBatch* binBatch = dynamic_cast<StringVectorBatch*>(&rowBatch); if (binBatch == nullptr) { @@ -1702,8 +1600,7 @@ namespace orc { char** data = binBatch->data.data() + offset; int64_t* length = binBatch->length.data() + offset; - const char* notNull = binBatch->hasNulls ? - binBatch->notNull.data() + offset : nullptr; + const char* notNull = binBatch->hasNulls ? binBatch->notNull.data() + offset : nullptr; uint64_t count = 0; for (uint64_t i = 0; i < numValues; ++i) { @@ -1726,60 +1623,43 @@ namespace orc { } class TimestampColumnWriter : public ColumnWriter { - public: - TimestampColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options, - bool isInstantType); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + public: + TimestampColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options, bool isInstantType); + + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) override; virtual void flush(std::vector<proto::Stream>& streams) override; virtual uint64_t getEstimatedSize() const override; - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; + virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override; virtual void recordPosition() const override; - protected: + protected: std::unique_ptr<RleEncoder> secRleEncoder, nanoRleEncoder; - private: + private: RleVersion rleVersion; - const Timezone& timezone; + const Timezone* timezone; const bool isUTC; }; - TimestampColumnWriter::TimestampColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options, - bool isInstantType) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()), - timezone(isInstantType ? 
- getTimezoneByName("GMT") : - options.getTimezone()), - isUTC(isInstantType || - options.getTimezoneName() == "GMT") { + TimestampColumnWriter::TimestampColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options, bool isInstantType) + : ColumnWriter(type, factory, options), + rleVersion(options.getRleVersion()), + timezone(isInstantType ? &getTimezoneByName("GMT") : &options.getTimezone()), + isUTC(isInstantType || options.getTimezoneName() == "GMT") { std::unique_ptr<BufferedOutputStream> dataStream = factory.createStream(proto::Stream_Kind_DATA); std::unique_ptr<BufferedOutputStream> secondaryStream = factory.createStream(proto::Stream_Kind_SECONDARY); - secRleEncoder = createRleEncoder(std::move(dataStream), - true, - rleVersion, - memPool, + secRleEncoder = createRleEncoder(std::move(dataStream), true, rleVersion, memPool, options.getAlignedBitpacking()); - nanoRleEncoder = createRleEncoder(std::move(secondaryStream), - false, - rleVersion, - memPool, + nanoRleEncoder = createRleEncoder(std::move(secondaryStream), false, rleVersion, memPool, options.getAlignedBitpacking()); if (enableIndex) { @@ -1808,12 +1688,9 @@ namespace orc { } } - void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) { - TimestampVectorBatch* tsBatch = - dynamic_cast<TimestampVectorBatch*>(&rowBatch); + TimestampVectorBatch* tsBatch = dynamic_cast<TimestampVectorBatch*>(&rowBatch); if (tsBatch == nullptr) { throw InvalidArgument("Failed to cast to TimestampVectorBatch"); } @@ -1826,10 +1703,9 @@ namespace orc { ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - const char* notNull = tsBatch->hasNulls ? 
- tsBatch->notNull.data() + offset : nullptr; - int64_t *secs = tsBatch->data.data() + offset; - int64_t *nanos = tsBatch->nanoseconds.data() + offset; + const char* notNull = tsBatch->hasNulls ? tsBatch->notNull.data() + offset : nullptr; + int64_t* secs = tsBatch->data.data() + offset; + int64_t* nanos = tsBatch->nanoseconds.data() + offset; uint64_t count = 0; for (uint64_t i = 0; i < numValues; ++i) { @@ -1837,7 +1713,7 @@ namespace orc { // TimestampVectorBatch already stores data in UTC int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000; if (!isUTC) { - millsUTC = timezone.convertToUTC(secs[i]) * 1000 + nanos[i] / 1000000; + millsUTC = timezone->convertToUTC(secs[i]) * 1000 + nanos[i] / 1000000; } ++count; if (enableBloomFilter) { @@ -1849,7 +1725,7 @@ namespace orc { secs[i] += 1; } - secs[i] -= timezone.getEpoch(); + secs[i] -= timezone->getEpoch(); nanos[i] = formatNano(nanos[i]); } } @@ -1886,12 +1762,12 @@ namespace orc { } void TimestampColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { + std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; encoding.set_kind(RleVersionMapper(rleVersion)); - encoding.set_dictionarysize(0); + encoding.set_dictionary_size(0); if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); + encoding.set_bloom_encoding(BloomFilterVersion::UTF8); } encodings.push_back(encoding); } @@ -1902,32 +1778,23 @@ namespace orc { nanoRleEncoder->recordPosition(rowIndexPosition.get()); } - class DateColumnWriter : public IntegerColumnWriter { - public: - DateColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); + class DateColumnWriter : public IntegerColumnWriter<LongVectorBatch> { + public: + DateColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + virtual void 
add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) override; }; - DateColumnWriter::DateColumnWriter( - const Type &type, - const StreamsFactory &factory, - const WriterOptions &options) : - IntegerColumnWriter(type, factory, options) { + DateColumnWriter::DateColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options) + : IntegerColumnWriter<LongVectorBatch>(type, factory, options) { // PASS } - void DateColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + void DateColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) { - const LongVectorBatch* longBatch = - dynamic_cast<const LongVectorBatch*>(&rowBatch); + const LongVectorBatch* longBatch = dynamic_cast<const LongVectorBatch*>(&rowBatch); if (longBatch == nullptr) { throw InvalidArgument("Failed to cast to LongVectorBatch"); } @@ -1941,8 +1808,7 @@ namespace orc { ColumnWriter::add(rowBatch, offset, numValues, incomingMask); const int64_t* data = longBatch->data.data() + offset; - const char* notNull = longBatch->hasNulls ? - longBatch->notNull.data() + offset : nullptr; + const char* notNull = longBatch->hasNulls ? 
longBatch->notNull.data() + offset : nullptr; rleEncoder->add(data, numValues, notNull); @@ -1963,55 +1829,45 @@ namespace orc { } class Decimal64ColumnWriter : public ColumnWriter { - public: + public: static const uint32_t MAX_PRECISION_64 = 18; static const uint32_t MAX_PRECISION_128 = 38; - Decimal64ColumnWriter(const Type& type, - const StreamsFactory& factory, + Decimal64ColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) override; virtual void flush(std::vector<proto::Stream>& streams) override; virtual uint64_t getEstimatedSize() const override; - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; + virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override; virtual void recordPosition() const override; - protected: + protected: RleVersion rleVersion; uint64_t precision; uint64_t scale; std::unique_ptr<AppendOnlyBufferedStream> valueStream; std::unique_ptr<RleEncoder> scaleEncoder; - private: + private: char buffer[10]; }; - Decimal64ColumnWriter::Decimal64ColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()), - precision(type.getPrecision()), - scale(type.getScale()) { - valueStream.reset(new AppendOnlyBufferedStream( - factory.createStream(proto::Stream_Kind_DATA))); + Decimal64ColumnWriter::Decimal64ColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options) + : ColumnWriter(type, factory, options), + rleVersion(options.getRleVersion()), + precision(type.getPrecision()), + scale(type.getScale()) { + valueStream.reset(new 
AppendOnlyBufferedStream(factory.createStream(proto::Stream_Kind_DATA))); std::unique_ptr<BufferedOutputStream> scaleStream = factory.createStream(proto::Stream_Kind_SECONDARY); - scaleEncoder = createRleEncoder(std::move(scaleStream), - true, - rleVersion, - memPool, + scaleEncoder = createRleEncoder(std::move(scaleStream), true, rleVersion, memPool, options.getAlignedBitpacking()); if (enableIndex) { @@ -2019,26 +1875,22 @@ namespace orc { } } - void Decimal64ColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + void Decimal64ColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) { - const Decimal64VectorBatch* decBatch = - dynamic_cast<const Decimal64VectorBatch*>(&rowBatch); + const Decimal64VectorBatch* decBatch = dynamic_cast<const Decimal64VectorBatch*>(&rowBatch); if (decBatch == nullptr) { throw InvalidArgument("Failed to cast to Decimal64VectorBatch"); } DecimalColumnStatisticsImpl* decStats = - dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get()); + dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get()); if (decStats == nullptr) { throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl"); } ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - const char* notNull = decBatch->hasNulls ? - decBatch->notNull.data() + offset : nullptr; + const char* notNull = decBatch->hasNulls ? 
decBatch->notNull.data() + offset : nullptr; const int64_t* values = decBatch->values.data() + offset; uint64_t count = 0; @@ -2059,10 +1911,8 @@ namespace orc { valueStream->write(buffer, static_cast<size_t>(data - buffer)); ++count; if (enableBloomFilter) { - std::string decimal = Decimal( - values[i], static_cast<int32_t>(scale)).toString(true); - bloomFilter->addBytes( - decimal.c_str(), static_cast<int64_t>(decimal.size())); + std::string decimal = Decimal(values[i], static_cast<int32_t>(scale)).toString(true); + bloomFilter->addBytes(decimal.c_str(), static_cast<int64_t>(decimal.size())); } decStats->update(Decimal(values[i], static_cast<int32_t>(scale))); } @@ -2099,12 +1949,12 @@ namespace orc { } void Decimal64ColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { + std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; encoding.set_kind(RleVersionMapper(rleVersion)); - encoding.set_dictionarysize(0); + encoding.set_dictionary_size(0); if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); + encoding.set_bloom_encoding(BloomFilterVersion::UTF8); } encodings.push_back(encoding); } @@ -2116,44 +1966,35 @@ namespace orc { } class Decimal64ColumnWriterV2 : public ColumnWriter { - public: - Decimal64ColumnWriterV2(const Type& type, - const StreamsFactory& factory, + public: + Decimal64ColumnWriterV2(const Type& type, const StreamsFactory& factory, const WriterOptions& options); - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) override; virtual void flush(std::vector<proto::Stream>& streams) override; virtual uint64_t getEstimatedSize() const override; - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; + virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& 
encodings) const override; virtual void recordPosition() const override; - protected: + protected: uint64_t precision; uint64_t scale; std::unique_ptr<RleEncoder> valueEncoder; }; - Decimal64ColumnWriterV2::Decimal64ColumnWriterV2( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - precision(type.getPrecision()), - scale(type.getScale()) { + Decimal64ColumnWriterV2::Decimal64ColumnWriterV2(const Type& type, const StreamsFactory& factory, + const WriterOptions& options) + : ColumnWriter(type, factory, options), + precision(type.getPrecision()), + scale(type.getScale()) { std::unique_ptr<BufferedOutputStream> dataStream = factory.createStream(proto::Stream_Kind_DATA); - valueEncoder = createRleEncoder(std::move(dataStream), - true, - RleVersion_2, - memPool, + valueEncoder = createRleEncoder(std::move(dataStream), true, RleVersion_2, memPool, options.getAlignedBitpacking()); if (enableIndex) { @@ -2161,18 +2002,15 @@ namespace orc { } } - void Decimal64ColumnWriterV2::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const Decimal64VectorBatch* decBatch = - dynamic_cast<const Decimal64VectorBatch*>(&rowBatch); + void Decimal64ColumnWriterV2::add(ColumnVectorBatch& rowBatch, uint64_t offset, + uint64_t numValues, const char* incomingMask) { + const Decimal64VectorBatch* decBatch = dynamic_cast<const Decimal64VectorBatch*>(&rowBatch); if (decBatch == nullptr) { throw InvalidArgument("Failed to cast to Decimal64VectorBatch"); } DecimalColumnStatisticsImpl* decStats = - dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get()); + dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get()); if (decStats == nullptr) { throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl"); } @@ -2180,8 +2018,7 @@ namespace orc { ColumnWriter::add(rowBatch, offset, numValues, incomingMask); const int64_t* data = 
decBatch->values.data() + offset; - const char* notNull = decBatch->hasNulls ? - decBatch->notNull.data() + offset : nullptr; + const char* notNull = decBatch->hasNulls ? decBatch->notNull.data() + offset : nullptr; valueEncoder->add(data, numValues, notNull); @@ -2190,10 +2027,8 @@ namespace orc { if (!notNull || notNull[i]) { ++count; if (enableBloomFilter) { - std::string decimal = Decimal( - data[i], static_cast<int32_t>(scale)).toString(true); - bloomFilter->addBytes( - decimal.c_str(), static_cast<int64_t>(decimal.size())); + std::string decimal = Decimal(data[i], static_cast<int32_t>(scale)).toString(true); + bloomFilter->addBytes(decimal.c_str(), static_cast<int64_t>(decimal.size())); } decStats->update(Decimal(data[i], static_cast<int32_t>(scale))); } @@ -2221,12 +2056,12 @@ namespace orc { } void Decimal64ColumnWriterV2::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { + std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; encoding.set_kind(RleVersionMapper(RleVersion_2)); - encoding.set_dictionarysize(0); + encoding.set_dictionary_size(0); if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); + encoding.set_bloom_encoding(BloomFilterVersion::UTF8); } encodings.push_back(encoding); } @@ -2237,25 +2072,20 @@ namespace orc { } class Decimal128ColumnWriter : public Decimal64ColumnWriter { - public: - Decimal128ColumnWriter(const Type& type, - const StreamsFactory& factory, + public: + Decimal128ColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) override; - private: + private: char buffer[20]; }; - Decimal128ColumnWriter::Decimal128ColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - 
Decimal64ColumnWriter(type, factory, options) { + Decimal128ColumnWriter::Decimal128ColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options) + : Decimal64ColumnWriter(type, factory, options) { // PASS } @@ -2272,26 +2102,22 @@ namespace orc { return val; } - void Decimal128ColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + void Decimal128ColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) { - const Decimal128VectorBatch* decBatch = - dynamic_cast<const Decimal128VectorBatch*>(&rowBatch); + const Decimal128VectorBatch* decBatch = dynamic_cast<const Decimal128VectorBatch*>(&rowBatch); if (decBatch == nullptr) { throw InvalidArgument("Failed to cast to Decimal128VectorBatch"); } DecimalColumnStatisticsImpl* decStats = - dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get()); + dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get()); if (decStats == nullptr) { throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl"); } ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - const char* notNull = decBatch->hasNulls ? - decBatch->notNull.data() + offset : nullptr; + const char* notNull = decBatch->hasNulls ? 
decBatch->notNull.data() + offset : nullptr; const Int128* values = decBatch->values.data() + offset; // The current encoding of decimal columns stores the integer representation @@ -2314,10 +2140,8 @@ namespace orc { ++count; if (enableBloomFilter) { - std::string decimal = Decimal( - values[i], static_cast<int32_t>(scale)).toString(true); - bloomFilter->addBytes( - decimal.c_str(), static_cast<int64_t>(decimal.size())); + std::string decimal = Decimal(values[i], static_cast<int32_t>(scale)).toString(true); + bloomFilter->addBytes(decimal.c_str(), static_cast<int64_t>(decimal.size())); } decStats->update(Decimal(values[i], static_cast<int32_t>(scale))); } @@ -2331,29 +2155,22 @@ namespace orc { } class ListColumnWriter : public ColumnWriter { - public: - ListColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); + public: + ListColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); ~ListColumnWriter() override; - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) override; virtual void flush(std::vector<proto::Stream>& streams) override; virtual uint64_t getEstimatedSize() const override; - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; + virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override; - virtual void getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; + virtual void getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const override; - virtual void getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; + virtual void getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const override; virtual void mergeStripeStatsIntoFileStats() override; @@ -2361,8 +2178,7 @@ 
namespace orc { virtual void createRowIndexEntry() override; - virtual void writeIndex( - std::vector<proto::Stream> &streams) const override; + virtual void writeIndex(std::vector<proto::Stream>& streams) const override; virtual void recordPosition() const override; @@ -2370,24 +2186,18 @@ namespace orc { virtual void reset() override; - private: + private: std::unique_ptr<RleEncoder> lengthEncoder; RleVersion rleVersion; std::unique_ptr<ColumnWriter> child; }; - ListColumnWriter::ListColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()){ - + ListColumnWriter::ListColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options) + : ColumnWriter(type, factory, options), rleVersion(options.getRleVersion()) { std::unique_ptr<BufferedOutputStream> lengthStream = - factory.createStream(proto::Stream_Kind_LENGTH); - lengthEncoder = createRleEncoder(std::move(lengthStream), - false, - rleVersion, - memPool, + factory.createStream(proto::Stream_Kind_LENGTH); + lengthEncoder = createRleEncoder(std::move(lengthStream), false, rleVersion, memPool, options.getAlignedBitpacking()); if (type.getSubtypeCount() == 1) { @@ -2403,9 +2213,7 @@ namespace orc { // PASS } - void ListColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + void ListColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) { ListVectorBatch* listBatch = dynamic_cast<ListVectorBatch*>(&rowBatch); if (listBatch == nullptr) { @@ -2420,8 +2228,7 @@ namespace orc { ColumnWriter::add(rowBatch, offset, numValues, incomingMask); int64_t* offsets = listBatch->offsets.data() + offset; - const char* notNull = listBatch->hasNulls ? - listBatch->notNull.data() + offset : nullptr; + const char* notNull = listBatch->hasNulls ? 
listBatch->notNull.data() + offset : nullptr; uint64_t elemOffset = static_cast<uint64_t>(offsets[0]); uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]); @@ -2473,7 +2280,7 @@ namespace orc { } } - void ListColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const { + void ListColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const { ColumnWriter::writeIndex(streams); if (child.get()) { child->writeIndex(streams); @@ -2489,13 +2296,12 @@ namespace orc { return size; } - void ListColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { + void ListColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; encoding.set_kind(RleVersionMapper(rleVersion)); - encoding.set_dictionarysize(0); + encoding.set_dictionary_size(0); if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); + encoding.set_bloom_encoding(BloomFilterVersion::UTF8); } encodings.push_back(encoding); if (child.get()) { @@ -2503,8 +2309,7 @@ namespace orc { } } - void ListColumnWriter::getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const { + void ListColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getStripeStatistics(stats); if (child.get()) { child->getStripeStatistics(stats); @@ -2518,15 +2323,14 @@ namespace orc { } } - void ListColumnWriter::getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const { + void ListColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getFileStatistics(stats); if (child.get()) { child->getFileStatistics(stats); } } - void ListColumnWriter::mergeRowGroupStatsIntoStripeStats() { + void ListColumnWriter::mergeRowGroupStatsIntoStripeStats() { ColumnWriter::mergeRowGroupStatsIntoStripeStats(); if (child.get()) { child->mergeRowGroupStatsIntoStripeStats(); @@ -2559,29 +2363,22 
@@ namespace orc { } class MapColumnWriter : public ColumnWriter { - public: - MapColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); + public: + MapColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); ~MapColumnWriter() override; - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) override; virtual void flush(std::vector<proto::Stream>& streams) override; virtual uint64_t getEstimatedSize() const override; - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; + virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override; - virtual void getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; + virtual void getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const override; - virtual void getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; + virtual void getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const override; virtual void mergeStripeStatsIntoFileStats() override; @@ -2589,8 +2386,7 @@ namespace orc { virtual void createRowIndexEntry() override; - virtual void writeIndex( - std::vector<proto::Stream> &streams) const override; + virtual void writeIndex(std::vector<proto::Stream>& streams) const override; virtual void recordPosition() const override; @@ -2598,24 +2394,19 @@ namespace orc { virtual void reset() override; - private: + private: std::unique_ptr<ColumnWriter> keyWriter; std::unique_ptr<ColumnWriter> elemWriter; std::unique_ptr<RleEncoder> lengthEncoder; RleVersion rleVersion; }; - MapColumnWriter::MapColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - 
rleVersion(options.getRleVersion()){ + MapColumnWriter::MapColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options) + : ColumnWriter(type, factory, options), rleVersion(options.getRleVersion()) { std::unique_ptr<BufferedOutputStream> lengthStream = - factory.createStream(proto::Stream_Kind_LENGTH); - lengthEncoder = createRleEncoder(std::move(lengthStream), - false, - rleVersion, - memPool, + factory.createStream(proto::Stream_Kind_LENGTH); + lengthEncoder = createRleEncoder(std::move(lengthStream), false, rleVersion, memPool, options.getAlignedBitpacking()); if (type.getSubtypeCount() > 0) { @@ -2635,9 +2426,7 @@ namespace orc { // PASS } - void MapColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + void MapColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) { MapVectorBatch* mapBatch = dynamic_cast<MapVectorBatch*>(&rowBatch); if (mapBatch == nullptr) { @@ -2652,8 +2441,7 @@ namespace orc { ColumnWriter::add(rowBatch, offset, numValues, incomingMask); int64_t* offsets = mapBatch->offsets.data() + offset; - const char* notNull = mapBatch->hasNulls ? - mapBatch->notNull.data() + offset : nullptr; + const char* notNull = mapBatch->hasNulls ? 
mapBatch->notNull.data() + offset : nullptr; uint64_t elemOffset = static_cast<uint64_t>(offsets[0]); uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]); @@ -2712,8 +2500,7 @@ namespace orc { } } - void MapColumnWriter::writeIndex( - std::vector<proto::Stream> &streams) const { + void MapColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const { ColumnWriter::writeIndex(streams); if (keyWriter.get()) { keyWriter->writeIndex(streams); @@ -2735,13 +2522,12 @@ namespace orc { return size; } - void MapColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { + void MapColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; encoding.set_kind(RleVersionMapper(rleVersion)); - encoding.set_dictionarysize(0); + encoding.set_dictionary_size(0); if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); + encoding.set_bloom_encoding(BloomFilterVersion::UTF8); } encodings.push_back(encoding); if (keyWriter.get()) { @@ -2752,8 +2538,7 @@ namespace orc { } } - void MapColumnWriter::getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const { + void MapColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getStripeStatistics(stats); if (keyWriter.get()) { keyWriter->getStripeStatistics(stats); @@ -2773,8 +2558,7 @@ namespace orc { } } - void MapColumnWriter::getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const { + void MapColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getFileStatistics(stats); if (keyWriter.get()) { keyWriter->getFileStatistics(stats); @@ -2784,7 +2568,7 @@ namespace orc { } } - void MapColumnWriter::mergeRowGroupStatsIntoStripeStats() { + void MapColumnWriter::mergeRowGroupStatsIntoStripeStats() { ColumnWriter::mergeRowGroupStatsIntoStripeStats(); if (keyWriter.get()) { 
keyWriter->mergeRowGroupStatsIntoStripeStats(); @@ -2829,28 +2613,22 @@ namespace orc { } class UnionColumnWriter : public ColumnWriter { - public: - UnionColumnWriter(const Type& type, - const StreamsFactory& factory, + public: + UnionColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) override; virtual void flush(std::vector<proto::Stream>& streams) override; virtual uint64_t getEstimatedSize() const override; - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; + virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override; - virtual void getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; + virtual void getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const override; - virtual void getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; + virtual void getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const override; virtual void mergeStripeStatsIntoFileStats() override; @@ -2858,8 +2636,7 @@ namespace orc { virtual void createRowIndexEntry() override; - virtual void writeIndex( - std::vector<proto::Stream> &streams) const override; + virtual void writeIndex(std::vector<proto::Stream>& streams) const override; virtual void recordPosition() const override; @@ -2867,24 +2644,20 @@ namespace orc { virtual void reset() override; - private: + private: std::unique_ptr<ByteRleEncoder> rleEncoder; std::vector<std::unique_ptr<ColumnWriter>> children; }; - UnionColumnWriter::UnionColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options) { - + UnionColumnWriter::UnionColumnWriter(const Type& type, 
const StreamsFactory& factory, + const WriterOptions& options) + : ColumnWriter(type, factory, options) { std::unique_ptr<BufferedOutputStream> dataStream = - factory.createStream(proto::Stream_Kind_DATA); + factory.createStream(proto::Stream_Kind_DATA); rleEncoder = createByteRleEncoder(std::move(dataStream)); for (uint64_t i = 0; i != type.getSubtypeCount(); ++i) { - children.push_back(buildWriter(*type.getSubtype(i), - factory, - options)); + children.push_back(buildWriter(*type.getSubtype(i), factory, options)); } if (enableIndex) { @@ -2892,9 +2665,7 @@ namespace orc { } } - void UnionColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, + void UnionColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, const char* incomingMask) { UnionVectorBatch* unionBatch = dynamic_cast<UnionVectorBatch*>(&rowBatch); if (unionBatch == nullptr) { @@ -2903,10 +2674,9 @@ namespace orc { ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - const char* notNull = unionBatch->hasNulls ? - unionBatch->notNull.data() + offset : nullptr; - unsigned char * tags = unionBatch->tags.data() + offset; - uint64_t * offsets = unionBatch->offsets.data() + offset; + const char* notNull = unionBatch->hasNulls ? 
unionBatch->notNull.data() + offset : nullptr; + unsigned char* tags = unionBatch->tags.data() + offset; + uint64_t* offsets = unionBatch->offsets.data() + offset; std::vector<int64_t> childOffset(children.size(), -1); std::vector<uint64_t> childLength(children.size(), 0); @@ -2922,8 +2692,7 @@ namespace orc { for (uint32_t i = 0; i < children.size(); ++i) { if (childLength[i] > 0) { - children[i]->add(*unionBatch->children[i], - static_cast<uint64_t>(childOffset[i]), + children[i]->add(*unionBatch->children[i], static_cast<uint64_t>(childOffset[i]), childLength[i], nullptr); } } @@ -2964,7 +2733,7 @@ namespace orc { } } - void UnionColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const { + void UnionColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const { ColumnWriter::writeIndex(streams); for (uint32_t i = 0; i < children.size(); ++i) { children[i]->writeIndex(streams); @@ -2980,13 +2749,12 @@ namespace orc { return size; } - void UnionColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { + void UnionColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const { proto::ColumnEncoding encoding; encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); - encoding.set_dictionarysize(0); + encoding.set_dictionary_size(0); if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); + encoding.set_bloom_encoding(BloomFilterVersion::UTF8); } encodings.push_back(encoding); for (uint32_t i = 0; i < children.size(); ++i) { @@ -2994,8 +2762,7 @@ namespace orc { } } - void UnionColumnWriter::getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const { + void UnionColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getStripeStatistics(stats); for (uint32_t i = 0; i < children.size(); ++i) { children[i]->getStripeStatistics(stats); @@ -3009,15 +2776,14 @@ namespace orc { } } - void 
UnionColumnWriter::getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const { + void UnionColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const { ColumnWriter::getFileStatistics(stats); for (uint32_t i = 0; i < children.size(); ++i) { children[i]->getFileStatistics(stats); } } - void UnionColumnWriter::mergeRowGroupStatsIntoStripeStats() { + void UnionColumnWriter::mergeRowGroupStatsIntoStripeStats() { ColumnWriter::mergeRowGroupStatsIntoStripeStats(); for (uint32_t i = 0; i < children.size(); ++i) { children[i]->mergeRowGroupStatsIntoStripeStats(); @@ -3049,140 +2815,80 @@ namespace orc { } } - std::unique_ptr<ColumnWriter> buildWriter( - const Type& type, - const StreamsFactory& factory, + std::unique_ptr<ColumnWriter> buildWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options) { switch (static_cast<int64_t>(type.getKind())) { case STRUCT: - return std::unique_ptr<ColumnWriter>( - new StructColumnWriter( - type, - factory, - options)); + return std::make_unique<StructColumnWriter>(type, factory, options); + case SHORT: + if (options.getUseTightNumericVector()) { + return std::make_unique<IntegerColumnWriter<ShortVectorBatch>>(type, factory, options); + } + return std::make_unique<IntegerColumnWriter<LongVectorBatch>>(type, factory, options); case INT: + if (options.getUseTightNumericVector()) { + return std::make_unique<IntegerColumnWriter<IntVectorBatch>>(type, factory, options); + } + return std::make_unique<IntegerColumnWriter<LongVectorBatch>>(type, factory, options); case LONG: - case SHORT: - return std::unique_ptr<ColumnWriter>( - new IntegerColumnWriter( - type, - factory, - options)); + return std::make_unique<IntegerColumnWriter<LongVectorBatch>>(type, factory, options); case BYTE: - return std::unique_ptr<ColumnWriter>( - new ByteColumnWriter( - type, - factory, - options)); + if (options.getUseTightNumericVector()) { + return 
std::make_unique<ByteColumnWriter<ByteVectorBatch>>(type, factory, options); + } + return std::make_unique<ByteColumnWriter<LongVectorBatch>>(type, factory, options); case BOOLEAN: - return std::unique_ptr<ColumnWriter>( - new BooleanColumnWriter( - type, - factory, - options)); + if (options.getUseTightNumericVector()) { + return std::make_unique<BooleanColumnWriter<ByteVectorBatch>>(type, factory, options); + } + return std::make_unique<BooleanColumnWriter<LongVectorBatch>>(type, factory, options); case DOUBLE: - return std::unique_ptr<ColumnWriter>( - new DoubleColumnWriter( - type, - factory, - options, - false)); + return std::make_unique<FloatingColumnWriter<double, DoubleVectorBatch>>(type, factory, + options, false); case FLOAT: - return std::unique_ptr<ColumnWriter>( - new DoubleColumnWriter( - type, - factory, - options, - true)); + if (options.getUseTightNumericVector()) { + return std::make_unique<FloatingColumnWriter<float, FloatVectorBatch>>(type, factory, + options, true); + } + return std::make_unique<FloatingColumnWriter<double, DoubleVectorBatch>>(type, factory, + options, true); case BINARY: - return std::unique_ptr<ColumnWriter>( - new BinaryColumnWriter( - type, - factory, - options)); + return std::make_unique<BinaryColumnWriter>(type, factory, options); case STRING: - return std::unique_ptr<ColumnWriter>( - new StringColumnWriter( - type, - factory, - options)); + return std::make_unique<StringColumnWriter>(type, factory, options); case CHAR: - return std::unique_ptr<ColumnWriter>( - new CharColumnWriter( - type, - factory, - options)); + return std::make_unique<CharColumnWriter>(type, factory, options); case VARCHAR: - return std::unique_ptr<ColumnWriter>( - new VarCharColumnWriter( - type, - factory, - options)); + return std::make_unique<VarCharColumnWriter>(type, factory, options); case DATE: - return std::unique_ptr<ColumnWriter>( - new DateColumnWriter( - type, - factory, - options)); + return std::make_unique<DateColumnWriter>(type, 
factory, options); case TIMESTAMP: - return std::unique_ptr<ColumnWriter>( - new TimestampColumnWriter( - type, - factory, - options, - false)); + return std::make_unique<TimestampColumnWriter>(type, factory, options, false); case TIMESTAMP_INSTANT: - return std::unique_ptr<ColumnWriter>( - new TimestampColumnWriter( - type, - factory, - options, - true)); + return std::make_unique<TimestampColumnWriter>(type, factory, options, true); case DECIMAL: if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_64) { if (options.getFileVersion() == FileVersion::UNSTABLE_PRE_2_0()) { - return std::unique_ptr<ColumnWriter>( - new Decimal64ColumnWriterV2( - type, - factory, - options)); + return std::make_unique<Decimal64ColumnWriterV2>(type, factory, options); } - return std::unique_ptr<ColumnWriter>( - new Decimal64ColumnWriter( - type, - factory, - options)); + return std::make_unique<Decimal64ColumnWriter>(type, factory, options); } else if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_128) { - return std::unique_ptr<ColumnWriter>( - new Decimal128ColumnWriter( - type, - factory, - options)); + return std::make_unique<Decimal128ColumnWriter>(type, factory, options); } else { - throw NotImplementedYet("Decimal precision more than 38 is not " - "supported"); + throw NotImplementedYet( + "Decimal precision more than 38 is not " + "supported"); } case LIST: - return std::unique_ptr<ColumnWriter>( - new ListColumnWriter( - type, - factory, - options)); + return std::make_unique<ListColumnWriter>(type, factory, options); case MAP: - return std::unique_ptr<ColumnWriter>( - new MapColumnWriter( - type, - factory, - options)); + return std::make_unique<MapColumnWriter>(type, factory, options); case UNION: - return std::unique_ptr<ColumnWriter>( - new UnionColumnWriter( - type, - factory, - options)); + return std::make_unique<UnionColumnWriter>(type, factory, options); default: - throw NotImplementedYet("Type is not supported yet for creating " - 
"ColumnWriter."); + throw NotImplementedYet( + "Type is not supported yet for creating " + "ColumnWriter."); } } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/ColumnWriter.hh b/contrib/libs/apache/orc/c++/src/ColumnWriter.hh index 20983774c4..f21ffd6f83 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnWriter.hh +++ b/contrib/libs/apache/orc/c++/src/ColumnWriter.hh @@ -24,15 +24,15 @@ #include "BloomFilter.hh" #include "ByteRLE.hh" #include "Compression.hh" -#include "orc/Exceptions.hh" #include "Statistics.hh" +#include "orc/Exceptions.hh" #include "wrap/orc-proto-wrapper.hh" namespace orc { class StreamsFactory { - public: + public: virtual ~StreamsFactory(); /** @@ -40,29 +40,26 @@ namespace orc { * @param kind the kind of the stream * @return the buffered output stream */ - virtual std::unique_ptr<BufferedOutputStream> - createStream(proto::Stream_Kind kind) const = 0; + virtual std::unique_ptr<BufferedOutputStream> createStream(proto::Stream_Kind kind) const = 0; }; - std::unique_ptr<StreamsFactory> createStreamsFactory( - const WriterOptions& options, - OutputStream * outStream); + std::unique_ptr<StreamsFactory> createStreamsFactory(const WriterOptions& options, + OutputStream* outStream); /** * record stream positions for row index */ class RowIndexPositionRecorder : public PositionRecorder { - public: + public: virtual ~RowIndexPositionRecorder() override; - RowIndexPositionRecorder(proto::RowIndexEntry& entry): - rowIndexEntry(entry) {} + RowIndexPositionRecorder(proto::RowIndexEntry& entry) : rowIndexEntry(entry) {} virtual void add(uint64_t pos) override { rowIndexEntry.add_positions(pos); } - private: + private: proto::RowIndexEntry& rowIndexEntry; }; @@ -70,7 +67,7 @@ namespace orc { * The interface for writing ORC data types. 
*/ class ColumnWriter { - protected: + protected: std::unique_ptr<ByteRleEncoder> notNullEncoder; uint64_t columnId; std::unique_ptr<MutableColumnStatistics> colIndexStatistics; @@ -88,9 +85,8 @@ namespace orc { std::unique_ptr<BloomFilterImpl> bloomFilter; std::unique_ptr<proto::BloomFilterIndex> bloomFilterIndex; - public: - ColumnWriter(const Type& type, const StreamsFactory& factory, - const WriterOptions& options); + public: + ColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); virtual ~ColumnWriter(); @@ -103,10 +99,8 @@ namespace orc { * a mask (with at least numValues bytes) for which * values to write. */ - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char * incomingMask); + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, + const char* incomingMask); /** * Flush column writer output streams. * @param streams vector to store streams generated by flush() @@ -123,22 +117,19 @@ namespace orc { * Get the encoding used by the writer for this column. * @param encodings vector to store the returned ColumnEncoding info */ - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const = 0; + virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const = 0; /** * Get the stripe statistics for this column. * @param stats vector to store the returned stripe statistics */ - virtual void getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const; + virtual void getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const; /** * Get the file statistics for this column. * @param stats vector to store the returned file statistics */ - virtual void getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const; + virtual void getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const; /** * Merge index stats into stripe stats and reset index stats. 
@@ -167,7 +158,7 @@ namespace orc { * Write row index streams for this column. * @param streams output list of ROW_INDEX streams */ - virtual void writeIndex(std::vector<proto::Stream> &streams) const; + virtual void writeIndex(std::vector<proto::Stream>& streams) const; /** * Record positions for index. @@ -188,22 +179,21 @@ namespace orc { */ virtual void writeDictionary(); - protected: + protected: /** * Utility function to translate ColumnStatistics into protobuf form and * add it to output list. * @param statsList output list for protobuf stats * @param stats ColumnStatistics to be transformed and added */ - void getProtoBufStatistics( - std::vector<proto::ColumnStatistics>& statsList, - const MutableColumnStatistics* stats) const { - proto::ColumnStatistics pbStats; - stats->toProtoBuf(pbStats); - statsList.push_back(pbStats); - } + void getProtoBufStatistics(std::vector<proto::ColumnStatistics>& statsList, + const MutableColumnStatistics* stats) const { + proto::ColumnStatistics pbStats; + stats->toProtoBuf(pbStats); + statsList.push_back(pbStats); + } - protected: + protected: MemoryPool& memPool; std::unique_ptr<BufferedOutputStream> indexStream; std::unique_ptr<BufferedOutputStream> bloomFilterStream; @@ -213,10 +203,8 @@ namespace orc { /** * Create a writer for the given type. 
*/ - std::unique_ptr<ColumnWriter> buildWriter( - const Type& type, - const StreamsFactory& factory, + std::unique_ptr<ColumnWriter> buildWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); -} +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/src/Common.cc b/contrib/libs/apache/orc/c++/src/Common.cc index 477bfd3b4c..cf2ff27ef1 100644 --- a/contrib/libs/apache/orc/c++/src/Common.cc +++ b/contrib/libs/apache/orc/c++/src/Common.cc @@ -82,6 +82,8 @@ namespace orc { return "Scritchley Go"; case TRINO_WRITER: return "Trino"; + case CUDF_WRITER: + return "CUDF"; default: { std::ostringstream buffer; buffer << "Unknown(" << id << ")"; @@ -138,14 +140,14 @@ namespace orc { ss << majorVersion << '.' << minorVersion; return ss.str(); } - - const FileVersion& FileVersion::v_0_11(){ - static FileVersion version(0,11); + + const FileVersion& FileVersion::v_0_11() { + static FileVersion version(0, 11); return version; } - - const FileVersion& FileVersion::v_0_12(){ - static FileVersion version(0,12); + + const FileVersion& FileVersion::v_0_12() { + static FileVersion version(0, 12); return version; } @@ -156,9 +158,9 @@ namespace orc { * without providing any forward or backward compatibility. * * When 2.0 is released, this version identifier will be completely removed. - */ + */ const FileVersion& FileVersion::UNSTABLE_PRE_2_0() { static FileVersion version(1, 9999); return version; } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/Compression.cc b/contrib/libs/apache/orc/c++/src/Compression.cc index ea10171507..94be774ab4 100644 --- a/contrib/libs/apache/orc/c++/src/Compression.cc +++ b/contrib/libs/apache/orc/c++/src/Compression.cc @@ -16,13 +16,15 @@ * limitations under the License. 
*/ -#include "Adaptor.hh" #include "Compression.hh" -#include "orc/Exceptions.hh" +#include "Adaptor.hh" #include "LzoDecompressor.hh" +#include "Utils.hh" #include "lz4.h" +#include "orc/Exceptions.hh" #include <algorithm> +#include <array> #include <iomanip> #include <iostream> #include <sstream> @@ -47,28 +49,30 @@ namespace orc { - class CompressionStreamBase: public BufferedOutputStream { - public: - CompressionStreamBase(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool); + class CompressionStreamBase : public BufferedOutputStream { + public: + CompressionStreamBase(OutputStream* outStream, int compressionLevel, uint64_t capacity, + uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics); - virtual bool Next(void** data, int*size) override = 0; + virtual bool Next(void** data, int* size) override = 0; virtual void BackUp(int count) override; virtual std::string getName() const override = 0; virtual uint64_t flush() override; + virtual void suppress() override; - virtual bool isCompressed() const override { return true; } + virtual bool isCompressed() const override { + return true; + } virtual uint64_t getSize() const override; - protected: - void writeHeader(char * buffer, size_t compressedSize, bool original) { - buffer[0] = static_cast<char>((compressedSize << 1) + (original ? 1 : 0)); - buffer[1] = static_cast<char>(compressedSize >> 7); - buffer[2] = static_cast<char>(compressedSize >> 15); + protected: + void writeData(const unsigned char* data, int size); + + void writeHeader(size_t compressedSize, bool original) { + *header[0] = static_cast<char>((compressedSize << 1) + (original ? 
1 : 0)); + *header[1] = static_cast<char>(compressedSize >> 7); + *header[2] = static_cast<char>(compressedSize >> 15); } // ensure enough room for compression block header @@ -81,7 +85,7 @@ namespace orc { int level; // Compressed data output buffer - char * outputBuffer; + char* outputBuffer; // Size for compressionBuffer int bufferSize; @@ -91,24 +95,24 @@ namespace orc { // Compress output buffer size int outputSize; + + // Compression block header pointer array + static const uint32_t HEADER_SIZE = 3; + std::array<char*, HEADER_SIZE> header; }; - CompressionStreamBase::CompressionStreamBase(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) : - BufferedOutputStream(pool, - outStream, - capacity, - blockSize), - rawInputBuffer(pool, blockSize), - level(compressionLevel), - outputBuffer(nullptr), - bufferSize(0), - outputPosition(0), - outputSize(0) { - // PASS + CompressionStreamBase::CompressionStreamBase(OutputStream* outStream, int compressionLevel, + uint64_t capacity, uint64_t blockSize, + MemoryPool& pool, WriterMetrics* metrics) + : BufferedOutputStream(pool, outStream, capacity, blockSize, metrics), + rawInputBuffer(pool, blockSize), + level(compressionLevel), + outputBuffer(nullptr), + bufferSize(0), + outputPosition(0), + outputSize(0) { + // init header pointer array + header.fill(nullptr); } void CompressionStreamBase::BackUp(int count) { @@ -119,7 +123,7 @@ namespace orc { } uint64_t CompressionStreamBase::flush() { - void * data; + void* data; int size; if (!Next(&data, &size)) { throw std::runtime_error("Failed to flush compression buffer."); @@ -129,79 +133,91 @@ namespace orc { return BufferedOutputStream::flush(); } + void CompressionStreamBase::suppress() { + outputBuffer = nullptr; + bufferSize = outputPosition = outputSize = 0; + BufferedOutputStream::suppress(); + } + uint64_t CompressionStreamBase::getSize() const { - return BufferedOutputStream::getSize() - - 
static_cast<uint64_t>(outputSize - outputPosition); + return BufferedOutputStream::getSize() - static_cast<uint64_t>(outputSize - outputPosition); + } + + // write the data content into outputBuffer + void CompressionStreamBase::writeData(const unsigned char* data, int size) { + int offset = 0; + while (offset < size) { + if (outputPosition == outputSize) { + if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) { + throw std::runtime_error("Failed to get next output buffer from output stream."); + } + outputPosition = 0; + } else if (outputPosition > outputSize) { + // for safety this will unlikely happen + throw std::logic_error("Write to an out-of-bound place during compression!"); + } + int currentSize = std::min(outputSize - outputPosition, size - offset); + memcpy(outputBuffer + outputPosition, data + offset, static_cast<size_t>(currentSize)); + offset += currentSize; + outputPosition += currentSize; + } } void CompressionStreamBase::ensureHeader() { // adjust 3 bytes for the compression header - if (outputPosition + 3 >= outputSize) { - int newPosition = outputPosition + 3 - outputSize; - if (!BufferedOutputStream::Next( - reinterpret_cast<void **>(&outputBuffer), - &outputSize)) { - throw std::runtime_error( - "Failed to get next output buffer from output stream."); + for (uint32_t i = 0; i < HEADER_SIZE; ++i) { + if (outputPosition >= outputSize) { + if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) { + throw std::runtime_error("Failed to get next output buffer from output stream."); + } + outputPosition = 0; } - outputPosition = newPosition; - } else { - outputPosition += 3; + header[i] = outputBuffer + outputPosition; + ++outputPosition; } } /** * Streaming compression base class */ - class CompressionStream: public CompressionStreamBase { - public: - CompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool); - - 
virtual bool Next(void** data, int*size) override; + class CompressionStream : public CompressionStreamBase { + public: + CompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity, + uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics); + + virtual bool Next(void** data, int* size) override; virtual std::string getName() const override = 0; - protected: + protected: // return total compressed size virtual uint64_t doStreamingCompression() = 0; }; - CompressionStream::CompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) : - CompressionStreamBase(outStream, - compressionLevel, - capacity, - blockSize, - pool) { + CompressionStream::CompressionStream(OutputStream* outStream, int compressionLevel, + uint64_t capacity, uint64_t blockSize, MemoryPool& pool, + WriterMetrics* metrics) + : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics) { // PASS } - bool CompressionStream::Next(void** data, int*size) { + bool CompressionStream::Next(void** data, int* size) { if (bufferSize != 0) { ensureHeader(); + uint64_t preSize = getSize(); uint64_t totalCompressedSize = doStreamingCompression(); - - char * header = outputBuffer + outputPosition - totalCompressedSize - 3; if (totalCompressedSize >= static_cast<unsigned long>(bufferSize)) { - writeHeader(header, static_cast<size_t>(bufferSize), true); - memcpy( - header + 3, - rawInputBuffer.data(), - static_cast<size_t>(bufferSize)); - - int backup = static_cast<int>(totalCompressedSize) - bufferSize; - BufferedOutputStream::BackUp(backup); - outputPosition -= backup; - outputSize -= backup; + writeHeader(static_cast<size_t>(bufferSize), true); + // reset output buffer + outputBuffer = nullptr; + outputPosition = outputSize = 0; + uint64_t backup = getSize() - preSize; + BufferedOutputStream::BackUp(static_cast<int>(backup)); + + // copy raw input buffer into block buffer + 
writeData(rawInputBuffer.data(), bufferSize); } else { - writeHeader(header, totalCompressedSize, false); + writeHeader(totalCompressedSize, false); } } @@ -212,13 +228,10 @@ namespace orc { return true; } - class ZlibCompressionStream: public CompressionStream { - public: - ZlibCompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool); + class ZlibCompressionStream : public CompressionStream { + public: + ZlibCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity, + uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics); virtual ~ZlibCompressionStream() override { end(); @@ -226,26 +239,19 @@ namespace orc { virtual std::string getName() const override; - protected: + protected: virtual uint64_t doStreamingCompression() override; - private: + private: void init(); void end(); z_stream strm; }; - ZlibCompressionStream::ZlibCompressionStream( - OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) - : CompressionStream(outStream, - compressionLevel, - capacity, - blockSize, - pool) { + ZlibCompressionStream::ZlibCompressionStream(OutputStream* outStream, int compressionLevel, + uint64_t capacity, uint64_t blockSize, + MemoryPool& pool, WriterMetrics* metrics) + : CompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) { init(); } @@ -259,18 +265,13 @@ namespace orc { do { if (outputPosition >= outputSize) { - if (!BufferedOutputStream::Next( - reinterpret_cast<void **>(&outputBuffer), - &outputSize)) { - throw std::runtime_error( - "Failed to get next output buffer from output stream."); + if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) { + throw std::runtime_error("Failed to get next output buffer from output stream."); } outputPosition = 0; } - strm.next_out = reinterpret_cast<unsigned char *> - (outputBuffer + outputPosition); - 
strm.avail_out = static_cast<unsigned int> - (outputSize - outputPosition); + strm.next_out = reinterpret_cast<unsigned char*>(outputBuffer + outputPosition); + strm.avail_out = static_cast<unsigned int>(outputSize - outputPosition); int ret = deflate(&strm, Z_FINISH); outputPosition = outputSize - static_cast<int>(strm.avail_out); @@ -291,7 +292,7 @@ namespace orc { return "ZlibCompressionStream"; } -DIAGNOSTIC_PUSH + DIAGNOSTIC_PUSH #if defined(__GNUC__) || defined(__clang__) DIAGNOSTIC_IGNORE("-Wold-style-cast") @@ -303,8 +304,7 @@ DIAGNOSTIC_PUSH strm.opaque = nullptr; strm.next_in = nullptr; - if (deflateInit2(&strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) - != Z_OK) { + if (deflateInit2(&strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) { throw std::runtime_error("Error while calling deflateInit2() for zlib."); } } @@ -313,42 +313,46 @@ DIAGNOSTIC_PUSH (void)deflateEnd(&strm); } -DIAGNOSTIC_PUSH + DIAGNOSTIC_PUSH - enum DecompressState { DECOMPRESS_HEADER, - DECOMPRESS_START, - DECOMPRESS_CONTINUE, - DECOMPRESS_ORIGINAL, - DECOMPRESS_EOF}; + enum DecompressState { + DECOMPRESS_HEADER, + DECOMPRESS_START, + DECOMPRESS_CONTINUE, + DECOMPRESS_ORIGINAL, + DECOMPRESS_EOF + }; std::string decompressStateToString(DecompressState state) { switch (state) { - case DECOMPRESS_HEADER: return "DECOMPRESS_HEADER"; - case DECOMPRESS_START: return "DECOMPRESS_START"; - case DECOMPRESS_CONTINUE: return "DECOMPRESS_CONTINUE"; - case DECOMPRESS_ORIGINAL: return "DECOMPRESS_ORIGINAL"; - case DECOMPRESS_EOF: return "DECOMPRESS_EOF"; + case DECOMPRESS_HEADER: + return "DECOMPRESS_HEADER"; + case DECOMPRESS_START: + return "DECOMPRESS_START"; + case DECOMPRESS_CONTINUE: + return "DECOMPRESS_CONTINUE"; + case DECOMPRESS_ORIGINAL: + return "DECOMPRESS_ORIGINAL"; + case DECOMPRESS_EOF: + return "DECOMPRESS_EOF"; } return "unknown"; } class DecompressionStream : public SeekableInputStream { - public: - DecompressionStream(std::unique_ptr<SeekableInputStream> 
inStream, - size_t bufferSize, - MemoryPool& pool); + public: + DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t bufferSize, + MemoryPool& pool, ReaderMetrics* metrics); virtual ~DecompressionStream() override {} - virtual bool Next(const void** data, int*size) override; + virtual bool Next(const void** data, int* size) override; virtual void BackUp(int count) override; virtual bool Skip(int count) override; virtual int64_t ByteCount() const override; virtual void seek(PositionProvider& position) override; virtual std::string getName() const override = 0; - protected: - virtual void NextDecompress(const void** data, - int*size, - size_t availableSize) = 0; + protected: + virtual void NextDecompress(const void** data, int* size, size_t availableSize) = 0; std::string getStreamName() const; void readBuffer(bool failOnEof); @@ -366,8 +370,8 @@ DIAGNOSTIC_PUSH // The starting and current position of the buffer for the uncompressed // data. It either points to the data buffer or the underlying input stream. - const char *outputBufferStart; - const char *outputBuffer; + const char* outputBufferStart; + const char* outputBuffer; size_t outputBufferLength; // The uncompressed buffer length. For compressed chunk, it's the original // (ie. the overall) and the actual length of the decompressed data. @@ -379,9 +383,9 @@ DIAGNOSTIC_PUSH size_t remainingLength; // the last buffer returned from the input - const char *inputBufferStart; - const char *inputBuffer; - const char *inputBufferEnd; + const char* inputBufferStart; + const char* inputBuffer; + const char* inputBufferEnd; // Variables for saving the position of the header and the start of the // buffer. Used when we have to seek a position. 
@@ -390,37 +394,38 @@ DIAGNOSTIC_PUSH // roughly the number of bytes returned off_t bytesReturned; + + ReaderMetrics* metrics; }; - DecompressionStream::DecompressionStream( - std::unique_ptr<SeekableInputStream> inStream, - size_t bufferSize, - MemoryPool& _pool - ) : pool(_pool), - input(std::move(inStream)), - outputDataBuffer(pool, bufferSize), - state(DECOMPRESS_HEADER), - outputBufferStart(nullptr), - outputBuffer(nullptr), - outputBufferLength(0), - uncompressedBufferLength(0), - remainingLength(0), - inputBufferStart(nullptr), - inputBuffer(nullptr), - inputBufferEnd(nullptr), - headerPosition(0), - inputBufferStartPosition(0), - bytesReturned(0) { - } + DecompressionStream::DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, + size_t bufferSize, MemoryPool& _pool, + ReaderMetrics* _metrics) + : pool(_pool), + input(std::move(inStream)), + outputDataBuffer(pool, bufferSize), + state(DECOMPRESS_HEADER), + outputBufferStart(nullptr), + outputBuffer(nullptr), + outputBufferLength(0), + uncompressedBufferLength(0), + remainingLength(0), + inputBufferStart(nullptr), + inputBuffer(nullptr), + inputBufferEnd(nullptr), + headerPosition(0), + inputBufferStartPosition(0), + bytesReturned(0), + metrics(_metrics) {} std::string DecompressionStream::getStreamName() const { return input->getName(); } void DecompressionStream::readBuffer(bool failOnEof) { + SCOPED_MINUS_STOPWATCH(metrics, DecompressionLatencyUs); int length; - if (!input->Next(reinterpret_cast<const void**>(&inputBuffer), - &length)) { + if (!input->Next(reinterpret_cast<const void**>(&inputBuffer), &length)) { if (failOnEof) { throw ParseError("Read past EOF in DecompressionStream::readBuffer"); } @@ -430,8 +435,7 @@ DIAGNOSTIC_PUSH inputBufferStart = nullptr; } else { inputBufferEnd = inputBuffer + length; - inputBufferStartPosition - = static_cast<size_t>(input->ByteCount() - length); + inputBufferStartPosition = static_cast<size_t>(input->ByteCount() - length); inputBufferStart = 
inputBuffer; } } @@ -462,7 +466,8 @@ DIAGNOSTIC_PUSH } } - bool DecompressionStream::Next(const void** data, int*size) { + bool DecompressionStream::Next(const void** data, int* size) { + SCOPED_STOPWATCH(metrics, DecompressionLatencyUs, DecompressionCall); // If we are starting a new header, we will have to store its positions // after decompressing. bool saveBufferPositions = false; @@ -478,8 +483,8 @@ DIAGNOSTIC_PUSH if (state == DECOMPRESS_HEADER || remainingLength == 0) { readHeader(); // Here we already read the three bytes of the header. - headerPosition = inputBufferStartPosition - + static_cast<size_t>(inputBuffer - inputBufferStart) - 3; + headerPosition = + inputBufferStartPosition + static_cast<size_t>(inputBuffer - inputBufferStart) - 3; saveBufferPositions = true; } if (state == DECOMPRESS_EOF) { @@ -489,8 +494,7 @@ DIAGNOSTIC_PUSH readBuffer(true); } size_t availableSize = - std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), - remainingLength); + std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), remainingLength); if (state == DECOMPRESS_ORIGINAL) { *data = inputBuffer; *size = static_cast<int>(availableSize); @@ -501,8 +505,9 @@ DIAGNOSTIC_PUSH } else if (state == DECOMPRESS_START) { NextDecompress(data, size, availableSize); } else { - throw std::logic_error("Unknown compression state in " - "DecompressionStream::Next"); + throw std::logic_error( + "Unknown compression state in " + "DecompressionStream::Next"); } bytesReturned += static_cast<off_t>(*size); if (saveBufferPositions) { @@ -530,7 +535,7 @@ DIAGNOSTIC_PUSH // this is a stupid implementation for now. // should skip entire blocks without decompressing while (count > 0) { - const void *ptr; + const void* ptr; int len; if (!Next(&ptr, &len)) { return false; @@ -560,10 +565,10 @@ DIAGNOSTIC_PUSH // Case 1: the seeked position is in the current chunk and it's buffered and // decompressed/uncompressed. Note that after the headerPosition comes the 3 bytes of // the header. 
- if (headerPosition == seekedHeaderPosition - && inputBufferStartPosition <= headerPosition + 3 && inputBufferStart) { - position.next(); // Skip the input level position, i.e. seekedHeaderPosition. - size_t posInChunk = position.next(); // Chunk level position. + if (headerPosition == seekedHeaderPosition && inputBufferStartPosition <= headerPosition + 3 && + inputBufferStart) { + position.next(); // Skip the input level position, i.e. seekedHeaderPosition. + size_t posInChunk = position.next(); // Chunk level position. // Case 1.a: The position is in the decompressed/uncompressed buffer. Here we only // need to set the output buffer's pointer to the seeked position. if (uncompressedBufferLength >= posInChunk) { @@ -575,9 +580,8 @@ DIAGNOSTIC_PUSH // Skip bytes to seek. if (!Skip(static_cast<int>(posInChunk - uncompressedBufferLength))) { std::ostringstream ss; - ss << "Bad seek to (chunkHeader=" << seekedHeaderPosition << ", posInChunk=" - << posInChunk << ") in " << getName() << ". DecompressionState: " - << decompressStateToString(state); + ss << "Bad seek to (chunkHeader=" << seekedHeaderPosition << ", posInChunk=" << posInChunk + << ") in " << getName() << ". DecompressionState: " << decompressStateToString(state); throw ParseError(ss.str()); } return; @@ -592,15 +596,14 @@ DIAGNOSTIC_PUSH // Case 2: The input is buffered, but not yet decompressed. No need to // force re-reading the inputBuffer, we just have to move it to the // seeked position. - position.next(); // Skip the input level position. - inputBuffer - = inputBufferStart + (seekedHeaderPosition - inputBufferStartPosition); + position.next(); // Skip the input level position. + inputBuffer = inputBufferStart + (seekedHeaderPosition - inputBufferStartPosition); } else { // Case 3: The seeked position is not in the input buffer, here we are // forcing to read it. inputBuffer = nullptr; inputBufferEnd = nullptr; - input->seek(position); // Actually use the input level position. 
+ input->seek(position); // Actually use the input level position. } bytesReturned = static_cast<off_t>(input->ByteCount()); if (!Skip(static_cast<int>(position.next()))) { @@ -609,33 +612,29 @@ DIAGNOSTIC_PUSH } class ZlibDecompressionStream : public DecompressionStream { - public: - ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& pool); + public: + ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize, + MemoryPool& pool, ReaderMetrics* metrics); virtual ~ZlibDecompressionStream() override; virtual std::string getName() const override; - protected: - virtual void NextDecompress(const void** data, - int* size, - size_t availableSize) override; - private: + protected: + virtual void NextDecompress(const void** data, int* size, size_t availableSize) override; + + private: z_stream zstream; }; -DIAGNOSTIC_PUSH + DIAGNOSTIC_PUSH #if defined(__GNUC__) || defined(__clang__) DIAGNOSTIC_IGNORE("-Wold-style-cast") #endif - ZlibDecompressionStream::ZlibDecompressionStream - (std::unique_ptr<SeekableInputStream> inStream, - size_t bufferSize, - MemoryPool& _pool - ): DecompressionStream - (std::move(inStream), bufferSize, _pool) { + ZlibDecompressionStream::ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, + size_t bufferSize, MemoryPool& _pool, + ReaderMetrics* _metrics) + : DecompressionStream(std::move(inStream), bufferSize, _pool, _metrics) { zstream.next_in = nullptr; zstream.avail_in = 0; zstream.zalloc = nullptr; @@ -645,20 +644,20 @@ DIAGNOSTIC_PUSH zstream.avail_out = static_cast<uInt>(outputDataBuffer.capacity()); int64_t result = inflateInit2(&zstream, -15); switch (result) { - case Z_OK: - break; - case Z_MEM_ERROR: - throw std::logic_error("Memory error from inflateInit2"); - case Z_VERSION_ERROR: - throw std::logic_error("Version error from inflateInit2"); - case Z_STREAM_ERROR: - throw std::logic_error("Stream error from inflateInit2"); - default: - 
throw std::logic_error("Unknown error from inflateInit2"); + case Z_OK: + break; + case Z_MEM_ERROR: + throw std::logic_error("Memory error from inflateInit2"); + case Z_VERSION_ERROR: + throw std::logic_error("Version error from inflateInit2"); + case Z_STREAM_ERROR: + throw std::logic_error("Stream error from inflateInit2"); + default: + throw std::logic_error("Unknown error from inflateInit2"); } } -DIAGNOSTIC_POP + DIAGNOSTIC_POP ZlibDecompressionStream::~ZlibDecompressionStream() { int64_t result = inflateEnd(&zstream); @@ -668,49 +667,48 @@ DIAGNOSTIC_POP } } - void ZlibDecompressionStream::NextDecompress(const void** data, int* size, - size_t availableSize) { - zstream.next_in = - reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); + void ZlibDecompressionStream::NextDecompress(const void** data, int* size, size_t availableSize) { + zstream.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); zstream.avail_in = static_cast<uInt>(availableSize); outputBuffer = outputDataBuffer.data(); - zstream.next_out = - reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer)); + zstream.next_out = reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer)); zstream.avail_out = static_cast<uInt>(outputDataBuffer.capacity()); if (inflateReset(&zstream) != Z_OK) { - throw std::logic_error("Bad inflateReset in " - "ZlibDecompressionStream::NextDecompress"); + throw std::logic_error( + "Bad inflateReset in " + "ZlibDecompressionStream::NextDecompress"); } int64_t result; do { - result = inflate(&zstream, availableSize == remainingLength ? Z_FINISH : - Z_SYNC_FLUSH); + result = inflate(&zstream, availableSize == remainingLength ? 
Z_FINISH : Z_SYNC_FLUSH); switch (result) { - case Z_OK: - remainingLength -= availableSize; - inputBuffer += availableSize; - readBuffer(true); - availableSize = - std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), - remainingLength); - zstream.next_in = - reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); - zstream.avail_in = static_cast<uInt>(availableSize); - break; - case Z_STREAM_END: - break; - case Z_BUF_ERROR: - throw std::logic_error("Buffer error in " - "ZlibDecompressionStream::NextDecompress"); - case Z_DATA_ERROR: - throw std::logic_error("Data error in " - "ZlibDecompressionStream::NextDecompress"); - case Z_STREAM_ERROR: - throw std::logic_error("Stream error in " - "ZlibDecompressionStream::NextDecompress"); - default: - throw std::logic_error("Unknown error in " - "ZlibDecompressionStream::NextDecompress"); + case Z_OK: + remainingLength -= availableSize; + inputBuffer += availableSize; + readBuffer(true); + availableSize = + std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), remainingLength); + zstream.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); + zstream.avail_in = static_cast<uInt>(availableSize); + break; + case Z_STREAM_END: + break; + case Z_BUF_ERROR: + throw std::logic_error( + "Buffer error in " + "ZlibDecompressionStream::NextDecompress"); + case Z_DATA_ERROR: + throw std::logic_error( + "Data error in " + "ZlibDecompressionStream::NextDecompress"); + case Z_STREAM_ERROR: + throw std::logic_error( + "Stream error in " + "ZlibDecompressionStream::NextDecompress"); + default: + throw std::logic_error( + "Unknown error in " + "ZlibDecompressionStream::NextDecompress"); } } while (result != Z_STREAM_END); *size = static_cast<int>(outputDataBuffer.capacity() - zstream.avail_out); @@ -727,44 +725,38 @@ DIAGNOSTIC_POP return result.str(); } - class BlockDecompressionStream: public DecompressionStream { - public: - BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t 
blockSize, - MemoryPool& pool); + class BlockDecompressionStream : public DecompressionStream { + public: + BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize, + MemoryPool& pool, ReaderMetrics* metrics); virtual ~BlockDecompressionStream() override {} virtual std::string getName() const override = 0; - protected: - virtual void NextDecompress(const void** data, - int* size, - size_t availableSize) override; + protected: + virtual void NextDecompress(const void** data, int* size, size_t availableSize) override; + + virtual uint64_t decompress(const char* input, uint64_t length, char* output, + size_t maxOutputLength) = 0; - virtual uint64_t decompress(const char *input, uint64_t length, - char *output, size_t maxOutputLength) = 0; - private: + private: // may need to stitch together multiple input buffers; // to give snappy a contiguous block DataBuffer<char> inputDataBuffer; }; - BlockDecompressionStream::BlockDecompressionStream - (std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& _pool - ) : DecompressionStream - (std::move(inStream), blockSize, _pool), - inputDataBuffer(pool, blockSize) { - } - + BlockDecompressionStream::BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, + size_t blockSize, MemoryPool& _pool, + ReaderMetrics* _metrics) + : DecompressionStream(std::move(inStream), blockSize, _pool, _metrics), + inputDataBuffer(pool, blockSize) {} void BlockDecompressionStream::NextDecompress(const void** data, int* size, - size_t availableSize) { + size_t availableSize) { // Get contiguous bytes of compressed block. - const char *compressed = inputBuffer; + const char* compressed = inputBuffer; if (remainingLength == availableSize) { - inputBuffer += availableSize; + inputBuffer += availableSize; } else { // Did not read enough from input. 
if (inputDataBuffer.capacity() < remainingLength) { @@ -774,19 +766,16 @@ DIAGNOSTIC_POP inputBuffer += availableSize; compressed = inputDataBuffer.data(); - for (size_t pos = availableSize; pos < remainingLength; ) { + for (size_t pos = availableSize; pos < remainingLength;) { readBuffer(true); size_t avail = - std::min(static_cast<size_t>(inputBufferEnd - - inputBuffer), - remainingLength - pos); + std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), remainingLength - pos); ::memcpy(inputDataBuffer.data() + pos, inputBuffer, avail); pos += avail; inputBuffer += avail; } } - outputBufferLength = decompress(compressed, remainingLength, - outputDataBuffer.data(), + outputBufferLength = decompress(compressed, remainingLength, outputDataBuffer.data(), outputDataBuffer.capacity()); remainingLength = 0; state = DECOMPRESS_HEADER; @@ -796,15 +785,11 @@ DIAGNOSTIC_POP outputBufferLength = 0; } - class SnappyDecompressionStream: public BlockDecompressionStream { - public: - SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& _pool - ): BlockDecompressionStream - (std::move(inStream), - blockSize, - _pool) { + class SnappyDecompressionStream : public BlockDecompressionStream { + public: + SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize, + MemoryPool& _pool, ReaderMetrics* _metrics) + : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) { // PASS } @@ -814,15 +799,12 @@ DIAGNOSTIC_POP return result.str(); } - protected: - virtual uint64_t decompress(const char *input, uint64_t length, - char *output, size_t maxOutputLength - ) override; + protected: + virtual uint64_t decompress(const char* input, uint64_t length, char* output, + size_t maxOutputLength) override; }; - uint64_t SnappyDecompressionStream::decompress(const char *_input, - uint64_t length, - char *output, + uint64_t SnappyDecompressionStream::decompress(const char* _input, uint64_t 
length, char* output, size_t maxOutputLength) { size_t outLength; if (!snappy::GetUncompressedLength(_input, length, &outLength)) { @@ -839,15 +821,11 @@ DIAGNOSTIC_POP return outLength; } - class LzoDecompressionStream: public BlockDecompressionStream { - public: - LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& _pool - ): BlockDecompressionStream - (std::move(inStream), - blockSize, - _pool) { + class LzoDecompressionStream : public BlockDecompressionStream { + public: + LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize, + MemoryPool& _pool, ReaderMetrics* _metrics) + : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) { // PASS } @@ -857,29 +835,21 @@ DIAGNOSTIC_POP return result.str(); } - protected: - virtual uint64_t decompress(const char *input, uint64_t length, - char *output, size_t maxOutputLength - ) override; + protected: + virtual uint64_t decompress(const char* input, uint64_t length, char* output, + size_t maxOutputLength) override; }; - uint64_t LzoDecompressionStream::decompress(const char *inputPtr, - uint64_t length, - char *output, + uint64_t LzoDecompressionStream::decompress(const char* inputPtr, uint64_t length, char* output, size_t maxOutputLength) { - return lzoDecompress(inputPtr, inputPtr + length, output, - output + maxOutputLength); + return lzoDecompress(inputPtr, inputPtr + length, output, output + maxOutputLength); } - class Lz4DecompressionStream: public BlockDecompressionStream { - public: - Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& _pool - ): BlockDecompressionStream - (std::move(inStream), - blockSize, - _pool) { + class Lz4DecompressionStream : public BlockDecompressionStream { + public: + Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize, + MemoryPool& _pool, ReaderMetrics* _metrics) + : 
BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) { // PASS } @@ -889,15 +859,12 @@ DIAGNOSTIC_POP return result.str(); } - protected: - virtual uint64_t decompress(const char *input, uint64_t length, - char *output, size_t maxOutputLength - ) override; + protected: + virtual uint64_t decompress(const char* input, uint64_t length, char* output, + size_t maxOutputLength) override; }; - uint64_t Lz4DecompressionStream::decompress(const char *inputPtr, - uint64_t length, - char *output, + uint64_t Lz4DecompressionStream::decompress(const char* inputPtr, uint64_t length, char* output, size_t maxOutputLength) { int result = LZ4_decompress_safe(inputPtr, output, static_cast<int>(length), static_cast<int>(maxOutputLength)); @@ -910,26 +877,20 @@ DIAGNOSTIC_POP /** * Block compression base class */ - class BlockCompressionStream: public CompressionStreamBase { - public: - BlockCompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) - : CompressionStreamBase(outStream, - compressionLevel, - capacity, - blockSize, - pool) - , compressorBuffer(pool) { + class BlockCompressionStream : public CompressionStreamBase { + public: + BlockCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity, + uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics) + : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics), + compressorBuffer(pool) { // PASS } - virtual bool Next(void** data, int*size) override; + virtual bool Next(void** data, int* size) override; + virtual void suppress() override; virtual std::string getName() const override = 0; - protected: + protected: // compresses a block and returns the compressed size virtual uint64_t doBlockCompression() = 0; @@ -941,50 +902,27 @@ DIAGNOSTIC_POP DataBuffer<unsigned char> compressorBuffer; }; - bool BlockCompressionStream::Next(void** data, int*size) { + bool 
BlockCompressionStream::Next(void** data, int* size) { if (bufferSize != 0) { ensureHeader(); // perform compression size_t totalCompressedSize = doBlockCompression(); - const unsigned char * dataToWrite = nullptr; + const unsigned char* dataToWrite = nullptr; int totalSizeToWrite = 0; - char * header = outputBuffer + outputPosition - 3; if (totalCompressedSize >= static_cast<size_t>(bufferSize)) { - writeHeader(header, static_cast<size_t>(bufferSize), true); + writeHeader(static_cast<size_t>(bufferSize), true); dataToWrite = rawInputBuffer.data(); totalSizeToWrite = bufferSize; } else { - writeHeader(header, totalCompressedSize, false); + writeHeader(totalCompressedSize, false); dataToWrite = compressorBuffer.data(); totalSizeToWrite = static_cast<int>(totalCompressedSize); } - char * dst = header + 3; - while (totalSizeToWrite > 0) { - if (outputPosition == outputSize) { - if (!BufferedOutputStream::Next(reinterpret_cast<void **>(&outputBuffer), - &outputSize)) { - throw std::logic_error( - "Failed to get next output buffer from output stream."); - } - outputPosition = 0; - dst = outputBuffer; - } else if (outputPosition > outputSize) { - // this will unlikely happen, but we have seen a few on zstd v1.1.0 - throw std::logic_error("Write to an out-of-bound place!"); - } - - int sizeToWrite = std::min(totalSizeToWrite, outputSize - outputPosition); - std::memcpy(dst, dataToWrite, static_cast<size_t>(sizeToWrite)); - - outputPosition += sizeToWrite; - dataToWrite += sizeToWrite; - totalSizeToWrite -= sizeToWrite; - dst += sizeToWrite; - } + writeData(dataToWrite, totalSizeToWrite); } *data = rawInputBuffer.data(); @@ -995,52 +933,48 @@ DIAGNOSTIC_POP return true; } + void BlockCompressionStream::suppress() { + compressorBuffer.resize(0); + CompressionStreamBase::suppress(); + } + /** * LZ4 block compression */ - class Lz4CompressionSteam: public BlockCompressionStream { - public: - Lz4CompressionSteam(OutputStream * outStream, - int compressionLevel, - uint64_t 
capacity, - uint64_t blockSize, - MemoryPool& pool) - : BlockCompressionStream(outStream, - compressionLevel, - capacity, - blockSize, - pool) { + class Lz4CompressionSteam : public BlockCompressionStream { + public: + Lz4CompressionSteam(OutputStream* outStream, int compressionLevel, uint64_t capacity, + uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics) + : BlockCompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) { this->init(); } virtual std::string getName() const override { return "Lz4CompressionStream"; } - + virtual ~Lz4CompressionSteam() override { this->end(); } - protected: + protected: virtual uint64_t doBlockCompression() override; virtual uint64_t estimateMaxCompressionSize() override { return static_cast<uint64_t>(LZ4_compressBound(bufferSize)); } - private: + private: void init(); void end(); - LZ4_stream_t *state; + LZ4_stream_t* state; }; uint64_t Lz4CompressionSteam::doBlockCompression() { - int result = LZ4_compress_fast_extState(static_cast<void*>(state), - reinterpret_cast<const char*>(rawInputBuffer.data()), - reinterpret_cast<char*>(compressorBuffer.data()), - bufferSize, - static_cast<int>(compressorBuffer.size()), - level); + int result = LZ4_compress_fast_extState( + static_cast<void*>(state), reinterpret_cast<const char*>(rawInputBuffer.data()), + reinterpret_cast<char*>(compressorBuffer.data()), bufferSize, + static_cast<int>(compressorBuffer.size()), level); if (result == 0) { throw std::runtime_error("Error during block compression using lz4."); } @@ -1062,34 +996,25 @@ DIAGNOSTIC_POP /** * Snappy block compression */ - class SnappyCompressionStream: public BlockCompressionStream { - public: - SnappyCompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) - : BlockCompressionStream(outStream, - compressionLevel, - capacity, - blockSize, - pool) { - } + class SnappyCompressionStream : public BlockCompressionStream { + 
public: + SnappyCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity, + uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics) + : BlockCompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) {} virtual std::string getName() const override { return "SnappyCompressionStream"; } - + virtual ~SnappyCompressionStream() override { // PASS } - protected: + protected: virtual uint64_t doBlockCompression() override; virtual uint64_t estimateMaxCompressionSize() override { - return static_cast<uint64_t> - (snappy::MaxCompressedLength(static_cast<size_t>(bufferSize))); + return static_cast<uint64_t>(snappy::MaxCompressedLength(static_cast<size_t>(bufferSize))); } }; @@ -1097,92 +1022,75 @@ DIAGNOSTIC_POP size_t compressedLength; snappy::RawCompress(reinterpret_cast<const char*>(rawInputBuffer.data()), static_cast<size_t>(bufferSize), - reinterpret_cast<char*>(compressorBuffer.data()), - &compressedLength); + reinterpret_cast<char*>(compressorBuffer.data()), &compressedLength); return static_cast<uint64_t>(compressedLength); } /** * ZSTD block compression */ - class ZSTDCompressionStream: public BlockCompressionStream { - public: - ZSTDCompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) - : BlockCompressionStream(outStream, - compressionLevel, - capacity, - blockSize, - pool) { + class ZSTDCompressionStream : public BlockCompressionStream { + public: + ZSTDCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity, + uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics) + : BlockCompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) { this->init(); } virtual std::string getName() const override { return "ZstdCompressionStream"; } - + virtual ~ZSTDCompressionStream() override { this->end(); } - protected: + protected: virtual uint64_t doBlockCompression() override; virtual 
uint64_t estimateMaxCompressionSize() override { return ZSTD_compressBound(static_cast<size_t>(bufferSize)); } - - private: + + private: void init(); void end(); - ZSTD_CCtx *cctx; + ZSTD_CCtx* cctx; }; uint64_t ZSTDCompressionStream::doBlockCompression() { - return ZSTD_compressCCtx(cctx, - compressorBuffer.data(), - compressorBuffer.size(), - rawInputBuffer.data(), - static_cast<size_t>(bufferSize), - level); + return ZSTD_compressCCtx(cctx, compressorBuffer.data(), compressorBuffer.size(), + rawInputBuffer.data(), static_cast<size_t>(bufferSize), level); } - -DIAGNOSTIC_PUSH + + DIAGNOSTIC_PUSH #if defined(__GNUC__) || defined(__clang__) DIAGNOSTIC_IGNORE("-Wold-style-cast") #endif void ZSTDCompressionStream::init() { - cctx = ZSTD_createCCtx(); if (!cctx) { throw std::runtime_error("Error while calling ZSTD_createCCtx() for zstd."); } } - void ZSTDCompressionStream::end() { (void)ZSTD_freeCCtx(cctx); cctx = nullptr; } -DIAGNOSTIC_PUSH + DIAGNOSTIC_PUSH /** * ZSTD block decompression */ - class ZSTDDecompressionStream: public BlockDecompressionStream { - public: - ZSTDDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& _pool) - : BlockDecompressionStream(std::move(inStream), - blockSize, - _pool) { + class ZSTDDecompressionStream : public BlockDecompressionStream { + public: + ZSTDDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize, + MemoryPool& _pool, ReaderMetrics* _metrics) + : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) { this->init(); } @@ -1196,127 +1104,106 @@ DIAGNOSTIC_PUSH return result.str(); } - protected: - virtual uint64_t decompress(const char *input, - uint64_t length, - char *output, + protected: + virtual uint64_t decompress(const char* input, uint64_t length, char* output, size_t maxOutputLength) override; - private: + private: void init(); void end(); - ZSTD_DCtx *dctx; + ZSTD_DCtx* dctx; }; - uint64_t 
ZSTDDecompressionStream::decompress(const char *inputPtr, - uint64_t length, - char *output, + uint64_t ZSTDDecompressionStream::decompress(const char* inputPtr, uint64_t length, char* output, size_t maxOutputLength) { - return static_cast<uint64_t>(ZSTD_decompressDCtx(dctx, - output, - maxOutputLength, - inputPtr, - length)); + return static_cast<uint64_t>( + ZSTD_decompressDCtx(dctx, output, maxOutputLength, inputPtr, length)); } -DIAGNOSTIC_PUSH + DIAGNOSTIC_PUSH #if defined(__GNUC__) || defined(__clang__) DIAGNOSTIC_IGNORE("-Wold-style-cast") #endif void ZSTDDecompressionStream::init() { - dctx = ZSTD_createDCtx(); if (!dctx) { throw std::runtime_error("Error while calling ZSTD_createDCtx() for zstd."); } } - void ZSTDDecompressionStream::end() { (void)ZSTD_freeDCtx(dctx); dctx = nullptr; } -DIAGNOSTIC_PUSH + DIAGNOSTIC_PUSH - std::unique_ptr<BufferedOutputStream> - createCompressor( - CompressionKind kind, - OutputStream * outStream, - CompressionStrategy strategy, - uint64_t bufferCapacity, - uint64_t compressionBlockSize, - MemoryPool& pool) { + std::unique_ptr<BufferedOutputStream> createCompressor(CompressionKind kind, + OutputStream* outStream, + CompressionStrategy strategy, + uint64_t bufferCapacity, + uint64_t compressionBlockSize, + MemoryPool& pool, WriterMetrics* metrics) { switch (static_cast<int64_t>(kind)) { - case CompressionKind_NONE: { - return std::unique_ptr<BufferedOutputStream> - (new BufferedOutputStream( - pool, outStream, bufferCapacity, compressionBlockSize)); - } - case CompressionKind_ZLIB: { - int level = (strategy == CompressionStrategy_SPEED) ? - Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION; - return std::unique_ptr<BufferedOutputStream> - (new ZlibCompressionStream( - outStream, level, bufferCapacity, compressionBlockSize, pool)); - } - case CompressionKind_ZSTD: { - int level = (strategy == CompressionStrategy_SPEED) ? 
- 1 : ZSTD_CLEVEL_DEFAULT; - return std::unique_ptr<BufferedOutputStream> - (new ZSTDCompressionStream( - outStream, level, bufferCapacity, compressionBlockSize, pool)); - } - case CompressionKind_LZ4: { - int level = (strategy == CompressionStrategy_SPEED) ? - LZ4_ACCELERATION_MAX : LZ4_ACCELERATION_DEFAULT; - return std::unique_ptr<BufferedOutputStream> - (new Lz4CompressionSteam( - outStream, level, bufferCapacity, compressionBlockSize, pool)); - } - case CompressionKind_SNAPPY: { - int level = 0; - return std::unique_ptr<BufferedOutputStream> - (new SnappyCompressionStream( - outStream, level, bufferCapacity, compressionBlockSize, pool)); - } - case CompressionKind_LZO: - default: - throw NotImplementedYet("compression codec"); + case CompressionKind_NONE: { + return std::make_unique<BufferedOutputStream>(pool, outStream, bufferCapacity, + compressionBlockSize, metrics); + } + case CompressionKind_ZLIB: { + int level = + (strategy == CompressionStrategy_SPEED) ? Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION; + return std::make_unique<ZlibCompressionStream>(outStream, level, bufferCapacity, + compressionBlockSize, pool, metrics); + } + case CompressionKind_ZSTD: { + int level = (strategy == CompressionStrategy_SPEED) ? 1 : ZSTD_CLEVEL_DEFAULT; + return std::make_unique<ZSTDCompressionStream>(outStream, level, bufferCapacity, + compressionBlockSize, pool, metrics); + } + case CompressionKind_LZ4: { + int level = (strategy == CompressionStrategy_SPEED) ? 
LZ4_ACCELERATION_MAX + : LZ4_ACCELERATION_DEFAULT; + return std::make_unique<Lz4CompressionSteam>(outStream, level, bufferCapacity, + compressionBlockSize, pool, metrics); + } + case CompressionKind_SNAPPY: { + int level = 0; + return std::make_unique<SnappyCompressionStream>(outStream, level, bufferCapacity, + compressionBlockSize, pool, metrics); + } + case CompressionKind_LZO: + default: + throw NotImplementedYet("compression codec"); } } - std::unique_ptr<SeekableInputStream> - createDecompressor(CompressionKind kind, - std::unique_ptr<SeekableInputStream> input, - uint64_t blockSize, - MemoryPool& pool) { + std::unique_ptr<SeekableInputStream> createDecompressor( + CompressionKind kind, std::unique_ptr<SeekableInputStream> input, uint64_t blockSize, + MemoryPool& pool, ReaderMetrics* metrics) { switch (static_cast<int64_t>(kind)) { - case CompressionKind_NONE: - return REDUNDANT_MOVE(input); - case CompressionKind_ZLIB: - return std::unique_ptr<SeekableInputStream> - (new ZlibDecompressionStream(std::move(input), blockSize, pool)); - case CompressionKind_SNAPPY: - return std::unique_ptr<SeekableInputStream> - (new SnappyDecompressionStream(std::move(input), blockSize, pool)); - case CompressionKind_LZO: - return std::unique_ptr<SeekableInputStream> - (new LzoDecompressionStream(std::move(input), blockSize, pool)); - case CompressionKind_LZ4: - return std::unique_ptr<SeekableInputStream> - (new Lz4DecompressionStream(std::move(input), blockSize, pool)); - case CompressionKind_ZSTD: - return std::unique_ptr<SeekableInputStream> - (new ZSTDDecompressionStream(std::move(input), blockSize, pool)); - default: { - std::ostringstream buffer; - buffer << "Unknown compression codec " << kind; - throw NotImplementedYet(buffer.str()); - } + case CompressionKind_NONE: + return input; + case CompressionKind_ZLIB: + return std::make_unique<ZlibDecompressionStream>(std::move(input), blockSize, pool, + metrics); + case CompressionKind_SNAPPY: + return 
std::make_unique<SnappyDecompressionStream>(std::move(input), blockSize, pool, + metrics); + case CompressionKind_LZO: + return std::make_unique<LzoDecompressionStream>(std::move(input), blockSize, pool, metrics); + case CompressionKind_LZ4: + return std::make_unique<Lz4DecompressionStream>(std::move(input), blockSize, pool, metrics); + case CompressionKind_ZSTD: + return std::make_unique<ZSTDDecompressionStream>(std::move(input), blockSize, pool, + metrics); + default: { + std::ostringstream buffer; + buffer << "Unknown compression codec " << kind; + throw NotImplementedYet(buffer.str()); + } } } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/Compression.hh b/contrib/libs/apache/orc/c++/src/Compression.hh index ff79377d83..55b152dd63 100644 --- a/contrib/libs/apache/orc/c++/src/Compression.hh +++ b/contrib/libs/apache/orc/c++/src/Compression.hh @@ -30,12 +30,11 @@ namespace orc { * @param input the input stream that is the underlying source * @param bufferSize the maximum size of the buffer * @param pool the memory pool + * @param metrics the reader metrics */ - std::unique_ptr<SeekableInputStream> - createDecompressor(CompressionKind kind, - std::unique_ptr<SeekableInputStream> input, - uint64_t bufferSize, - MemoryPool& pool); + std::unique_ptr<SeekableInputStream> createDecompressor( + CompressionKind kind, std::unique_ptr<SeekableInputStream> input, uint64_t bufferSize, + MemoryPool& pool, ReaderMetrics* metrics); /** * Create a compressor for the given compression kind. 
@@ -46,13 +45,12 @@ namespace orc { * @param compressionBlockSize compression buffer block size * @param pool the memory pool */ - std::unique_ptr<BufferedOutputStream> - createCompressor(CompressionKind kind, - OutputStream * outStream, - CompressionStrategy strategy, - uint64_t bufferCapacity, - uint64_t compressionBlockSize, - MemoryPool& pool); -} + std::unique_ptr<BufferedOutputStream> createCompressor(CompressionKind kind, + OutputStream* outStream, + CompressionStrategy strategy, + uint64_t bufferCapacity, + uint64_t compressionBlockSize, + MemoryPool& pool, WriterMetrics* metrics); +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc b/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc new file mode 100644 index 0000000000..459cafa1a0 --- /dev/null +++ b/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc @@ -0,0 +1,1001 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ConvertColumnReader.hh" + +namespace orc { + + // Assume that we are using tight numeric vector batch + using BooleanVectorBatch = ByteVectorBatch; + + ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType, + StripeStreams& stripe, bool _throwOnOverflow) + : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) { + reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true, + /*throwOnOverflow=*/false, /*convertToReadType*/ false); + data = + fileType.createRowBatch(0, memoryPool, /*encoded=*/false, /*useTightNumericVector=*/true); + } + + void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) { + reader->next(*data, numValues, notNull); + rowBatch.resize(data->capacity); + rowBatch.numElements = data->numElements; + rowBatch.hasNulls = data->hasNulls; + if (!rowBatch.hasNulls) { + memset(rowBatch.notNull.data(), 1, data->notNull.size()); + } else { + memcpy(rowBatch.notNull.data(), data->notNull.data(), data->notNull.size()); + } + } + + uint64_t ConvertColumnReader::skip(uint64_t numValues) { + return reader->skip(numValues); + } + + void ConvertColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + reader->seekToRowGroup(positions); + } + + static inline bool canFitInLong(double value) { + constexpr double MIN_LONG_AS_DOUBLE = -0x1p63; + constexpr double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63; + return ((MIN_LONG_AS_DOUBLE - value < 1.0) && (value < MAX_LONG_AS_DOUBLE_PLUS_ONE)); + } + + template <typename FileType, typename ReadType> + static inline void handleOverflow(ColumnVectorBatch& dstBatch, uint64_t idx, bool shouldThrow) { + if (!shouldThrow) { + dstBatch.notNull.data()[idx] = 0; + dstBatch.hasNulls = true; + } else { + std::ostringstream ss; + ss << "Overflow when convert from " << typeid(FileType).name() << " to " + << typeid(ReadType).name(); + throw SchemaEvolutionError(ss.str()); 
+ } + } + + // return false if overflow + template <typename ReadType> + static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) { + dstValue = static_cast<ReadType>(inputLong); + if constexpr (std::is_same<ReadType, int64_t>::value) { + return true; + } + if (static_cast<int64_t>(dstValue) != inputLong) { + return false; + } + return true; + } + + template <typename DestBatchPtrType> + static inline DestBatchPtrType SafeCastBatchTo(ColumnVectorBatch* batch) { + auto result = dynamic_cast<DestBatchPtrType>(batch); + if (result == nullptr) { + std::ostringstream ss; + ss << "Bad cast when convert from ColumnVectorBatch to " + << typeid(typename std::remove_const< + typename std::remove_pointer<DestBatchPtrType>::type>::type) + .name(); + throw InvalidArgument(ss.str()); + } + return result; + } + + // set null or throw exception if overflow + template <typename ReadType, typename FileType> + static inline void convertNumericElement(const FileType& srcValue, ReadType& destValue, + ColumnVectorBatch& destBatch, uint64_t idx, + bool shouldThrow) { + constexpr bool isFileTypeFloatingPoint(std::is_floating_point<FileType>::value); + constexpr bool isReadTypeFloatingPoint(std::is_floating_point<ReadType>::value); + int64_t longValue = static_cast<int64_t>(srcValue); + if (isFileTypeFloatingPoint) { + if (isReadTypeFloatingPoint) { + destValue = static_cast<ReadType>(srcValue); + } else { + if (!canFitInLong(static_cast<double>(srcValue)) || + !downCastToInteger(destValue, longValue)) { + handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow); + } + } + } else { + if (isReadTypeFloatingPoint) { + destValue = static_cast<ReadType>(srcValue); + if (destValue != destValue) { // check is NaN + handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow); + } + } else { + if (!downCastToInteger(destValue, static_cast<int64_t>(srcValue))) { + handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow); + } + } + } + } + + // { boolean, byte, short, 
int, long, float, double } -> + // { byte, short, int, long, float, double } + template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType> + class NumericConvertColumnReader : public ConvertColumnReader { + public: + NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe, + bool _throwOnOverflow) + : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertColumnReader::next(rowBatch, numValues, notNull); + const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get()); + auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch); + if (rowBatch.hasNulls) { + for (uint64_t i = 0; i < rowBatch.numElements; ++i) { + if (rowBatch.notNull[i]) { + convertNumericElement<ReadType>(srcBatch.data[i], dstBatch.data[i], rowBatch, i, + throwOnOverflow); + } + } + } else { + for (uint64_t i = 0; i < rowBatch.numElements; ++i) { + convertNumericElement<ReadType>(srcBatch.data[i], dstBatch.data[i], rowBatch, i, + throwOnOverflow); + } + } + } + }; + + // { boolean, byte, short, int, long, float, double } -> { boolean } + template <typename FileTypeBatch> + class NumericConvertColumnReader<FileTypeBatch, BooleanVectorBatch, bool> + : public ConvertColumnReader { + public: + NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe, + bool _throwOnOverflow) + : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertColumnReader::next(rowBatch, numValues, notNull); + const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get()); + auto& dstBatch = *SafeCastBatchTo<BooleanVectorBatch*>(&rowBatch); + if (rowBatch.hasNulls) { + for (uint64_t i = 0; i < rowBatch.numElements; ++i) { + if (rowBatch.notNull[i]) { + dstBatch.data[i] = 
(static_cast<int64_t>(srcBatch.data[i]) == 0 ? 0 : 1); + } + } + } else { + for (uint64_t i = 0; i < rowBatch.numElements; ++i) { + dstBatch.data[i] = (static_cast<int64_t>(srcBatch.data[i]) == 0 ? 0 : 1); + } + } + } + }; + + class ConvertToStringVariantColumnReader : public ConvertColumnReader { + public: + ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType, + StripeStreams& stripe, bool _throwOnOverflow) + : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; + + virtual uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0; + + protected: + std::vector<std::string> strBuffer; + }; + + void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { + ConvertColumnReader::next(rowBatch, numValues, notNull); + + // cache converted string in the buffer + auto totalLength = convertToStrBuffer(rowBatch, numValues); + + // contact string values to blob buffer of vector batch + auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch); + dstBatch.blob.resize(totalLength); + char* blob = dstBatch.blob.data(); + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + const auto size = strBuffer[i].size(); + ::memcpy(blob, strBuffer[i].c_str(), size); + dstBatch.data[i] = blob; + dstBatch.length[i] = static_cast<int32_t>(size); + blob += size; + } + } + strBuffer.clear(); + } + + class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader { + public: + BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType, + StripeStreams& stripe, bool _throwOnOverflow) + : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) { + trueValue = "TRUE"; + falseValue = "FALSE"; + if (readType.getKind() == CHAR || readType.getKind() == VARCHAR) { + if 
(readType.getMaximumLength() < 5) { + throw SchemaEvolutionError("Invalid maximum length for boolean type: " + + std::to_string(readType.getMaximumLength())); + } + if (readType.getKind() == CHAR) { + trueValue.resize(readType.getMaximumLength(), ' '); + falseValue.resize(readType.getMaximumLength(), ' '); + } + } + } + + uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override; + + private: + std::string trueValue; + std::string falseValue; + }; + + uint64_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch, + uint64_t numValues) { + uint64_t size = 0; + strBuffer.resize(numValues); + const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get()); + // cast the bool value to string + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue); + size += strBuffer[i].size(); + } + } + return size; + } + + template <typename FileTypeBatch> + class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader { + public: + NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType, + StripeStreams& stripe, bool _throwOnOverflow) + : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {} + uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override; + }; + + template <typename FileTypeBatch> + uint64_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer( + ColumnVectorBatch& rowBatch, uint64_t numValues) { + uint64_t size = 0; + strBuffer.resize(numValues); + const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get()); + if (readType.getKind() == STRING) { + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + strBuffer[i] = std::to_string(srcBatch.data[i]); + size += strBuffer[i].size(); + } + } + } else if (readType.getKind() == 
VARCHAR) { + const auto maxLength = readType.getMaximumLength(); + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + strBuffer[i] = std::to_string(srcBatch.data[i]); + if (strBuffer[i].size() > maxLength) { + handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, i, throwOnOverflow); + } else { + size += strBuffer[i].size(); + } + } + } + } else if (readType.getKind() == CHAR) { + const auto maxLength = readType.getMaximumLength(); + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + strBuffer[i] = std::to_string(srcBatch.data[i]); + if (strBuffer[i].size() > maxLength) { + handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, i, throwOnOverflow); + } else { + strBuffer[i].resize(maxLength, ' '); + size += strBuffer[i].size(); + } + } + } + } else { + throw SchemaEvolutionError("Invalid type for numeric to string conversion: " + + readType.toString()); + } + return size; + } + + template <typename FileTypeBatch, typename ReadTypeBatch, bool isFloatingFileType> + class NumericToDecimalColumnReader : public ConvertColumnReader { + public: + NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe, + bool _throwOnOverflow) + : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) { + precision = static_cast<int32_t>(readType.getPrecision()); + scale = static_cast<int32_t>(readType.getScale()); + bool overflow = false; + upperBound = scaleUpInt128ByPowerOfTen(1, precision, overflow); + } + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertColumnReader::next(rowBatch, numValues, notNull); + + const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get()); + auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch); + dstBatch.precision = precision; + dstBatch.scale = scale; + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || 
rowBatch.notNull[i]) { + if constexpr (isFloatingFileType) { + convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]); + } else { + convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]); + } + } + } + } + + private: + template <typename SrcType> + void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, SrcType value) { + const auto result = convertDecimal(value, precision, scale); + Int128 i128 = result.second; + if (result.first) { + handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow); + return; + } + + if constexpr (std::is_same<ReadTypeBatch, Decimal64VectorBatch>::value) { + if (!i128.fitsInLong()) { + handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow); + } else { + dstBatch.values[idx] = i128.toLong(); + } + } else { + dstBatch.values[idx] = i128; + } + } + + template <typename SrcType> + void convertIntegerToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, SrcType value) { + int fromScale = 0; + auto result = convertDecimal(value, fromScale, precision, scale); + if (result.first) { + handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow); + } else { + if constexpr (std::is_same<ReadTypeBatch, Decimal64VectorBatch>::value) { + if (!result.second.fitsInLong()) { + handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow); + } else { + dstBatch.values[idx] = result.second.toLong(); + } + } else { + dstBatch.values[idx] = result.second; + } + } + } + + int32_t precision; + int32_t scale; + int64_t scaleMultiplier; + Int128 upperBound; + }; + + class ConvertToTimestampColumnReader : public ConvertColumnReader { + public: + ConvertToTimestampColumnReader(const Type& _readType, const Type& fileType, + StripeStreams& stripe, bool _throwOnOverflow) + : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow), + readerTimezone(readType.getKind() == TIMESTAMP_INSTANT ? 
&getTimezoneByName("GMT") + : &stripe.getReaderTimezone()), + needConvertTimezone(readerTimezone != &getTimezoneByName("GMT")) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; + + protected: + const orc::Timezone* readerTimezone; + const bool needConvertTimezone; + }; + + // avoid emitting vtable in every translation unit + void ConvertToTimestampColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, + char* notNull) { + ConvertColumnReader::next(rowBatch, numValues, notNull); + } + + template <typename FileTypeBatch> + class NumericToTimestampColumnReader : public ConvertToTimestampColumnReader { + public: + NumericToTimestampColumnReader(const Type& _readType, const Type& fileType, + StripeStreams& stripe, bool _throwOnOverflow) + : ConvertToTimestampColumnReader(_readType, fileType, stripe, _throwOnOverflow) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull); + + const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get()); + auto& dstBatch = *SafeCastBatchTo<TimestampVectorBatch*>(&rowBatch); + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + convertToTimestamp(dstBatch, i, srcBatch.data[i]); + } + } + } + + private: + template <typename FileType> + void convertToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx, FileType value); + }; + + template <typename FileTypeBatch> + template <typename FileType> + void NumericToTimestampColumnReader<FileTypeBatch>::convertToTimestamp( + TimestampVectorBatch& dstBatch, uint64_t idx, FileType value) { + if constexpr (std::is_floating_point<FileType>::value) { + if (value > static_cast<FileType>(std::numeric_limits<int64_t>::max()) || + value < static_cast<FileType>(std::numeric_limits<int64_t>::min())) { + handleOverflow<FileType, int64_t>(dstBatch, idx, throwOnOverflow); + return; + } + 
dstBatch.data[idx] = static_cast<int64_t>(value); + dstBatch.nanoseconds[idx] = static_cast<int32_t>( + static_cast<double>(value - static_cast<FileType>(dstBatch.data[idx])) * 1e9); + if (dstBatch.nanoseconds[idx] < 0) { + dstBatch.data[idx] -= 1; + dstBatch.nanoseconds[idx] += static_cast<int32_t>(1e9); + } + } else { + dstBatch.data[idx] = value; + dstBatch.nanoseconds[idx] = 0; + } + if (needConvertTimezone) { + dstBatch.data[idx] = readerTimezone->convertFromUTC(dstBatch.data[idx]); + } + } + + template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType> + class DecimalToNumericColumnReader : public ConvertColumnReader { + public: + DecimalToNumericColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe, + bool _throwOnOverflow) + : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) { + precision = fileType.getPrecision(); + scale = fileType.getScale(); + factor = 1; + for (int i = 0; i < scale; i++) { + factor *= 10; + } + } + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertColumnReader::next(rowBatch, numValues, notNull); + + const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get()); + auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch); + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + if constexpr (std::is_floating_point_v<ReadType>) { + convertDecimalToDouble(dstBatch, i, srcBatch); + } else { + convertDecimalToInteger(dstBatch, i, srcBatch); + } + } + } + } + + private: + void convertDecimalToInteger(ReadTypeBatch& dstBatch, uint64_t idx, + const FileTypeBatch& srcBatch) { + using FileType = decltype(srcBatch.values[idx]); + Int128 result = scaleDownInt128ByPowerOfTen(srcBatch.values[idx], scale); + if (!result.fitsInLong()) { + handleOverflow<FileType, ReadType>(dstBatch, idx, throwOnOverflow); + return; + } + convertNumericElement<ReadType, int64_t>(result.toLong(), 
dstBatch.data[idx], dstBatch, idx, + throwOnOverflow); + } + + void convertDecimalToDouble(ReadTypeBatch& dstBatch, uint64_t idx, + const FileTypeBatch& srcBatch) { + double doubleValue = Int128(srcBatch.values[idx]).toDouble(); + dstBatch.data[idx] = static_cast<ReadType>(doubleValue) / static_cast<ReadType>(factor); + } + + int32_t precision; + int32_t scale; + int64_t factor; + }; + + template <typename FileTypeBatch> + class DecimalToNumericColumnReader<FileTypeBatch, BooleanVectorBatch, bool> + : public ConvertColumnReader { + public: + DecimalToNumericColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe, + bool _throwOnOverflow) + : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertColumnReader::next(rowBatch, numValues, notNull); + + const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get()); + auto& dstBatch = *SafeCastBatchTo<BooleanVectorBatch*>(&rowBatch); + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + dstBatch.data[i] = srcBatch.values[i] == 0 ? 
0 : 1; + } + } + } + }; + + template <typename FileTypeBatch, typename ReadTypeBatch> + class DecimalConvertColumnReader : public ConvertColumnReader { + public: + DecimalConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe, + bool _throwOnOverflow) + : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) { + fromPrecision = fileType.getPrecision(); + fromScale = fileType.getScale(); + toPrecision = _readType.getPrecision(); + toScale = _readType.getScale(); + } + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertColumnReader::next(rowBatch, numValues, notNull); + + const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get()); + auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch); + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + convertDecimalToDecimal(dstBatch, i, srcBatch); + } + } + } + + private: + void convertDecimalToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, + const FileTypeBatch& srcBatch) { + using FileType = decltype(srcBatch.values[idx]); + using ReadType = decltype(dstBatch.values[idx]); + + auto [overflows, resultI128] = + convertDecimal(srcBatch.values[idx], fromScale, toPrecision, toScale); + if (overflows) { + handleOverflow<FileType, ReadType>(dstBatch, idx, throwOnOverflow); + } + if constexpr (std::is_same_v<ReadTypeBatch, Decimal64VectorBatch>) { + if (!resultI128.fitsInLong()) { + handleOverflow<FileType, ReadType>(dstBatch, idx, throwOnOverflow); + } else { + dstBatch.values[idx] = resultI128.toLong(); + } + } else { + dstBatch.values[idx] = resultI128; + } + } + + int32_t fromPrecision; + int32_t fromScale; + int32_t toPrecision; + int32_t toScale; + }; + +#define DEFINE_NUMERIC_CONVERT_READER(FROM, TO, TYPE) \ + using FROM##To##TO##ColumnReader = \ + NumericConvertColumnReader<FROM##VectorBatch, TO##VectorBatch, TYPE>; + +#define 
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(FROM, TO) \ + using FROM##To##TO##ColumnReader = NumericToStringVariantColumnReader<FROM##VectorBatch>; + +#define DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(FROM, IS_FROM_FLOATING) \ + using FROM##To##Decimal64##ColumnReader = \ + NumericToDecimalColumnReader<FROM##VectorBatch, Decimal64VectorBatch, IS_FROM_FLOATING>; \ + using FROM##To##Decimal128##ColumnReader = \ + NumericToDecimalColumnReader<FROM##VectorBatch, Decimal128VectorBatch, IS_FROM_FLOATING>; + +#define DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(FROM) \ + using FROM##ToTimestampColumnReader = NumericToTimestampColumnReader<FROM##VectorBatch>; + +#define DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(TO, TYPE) \ + using Decimal64##To##TO##ColumnReader = \ + DecimalToNumericColumnReader<Decimal64VectorBatch, TO##VectorBatch, TYPE>; \ + using Decimal128##To##TO##ColumnReader = \ + DecimalToNumericColumnReader<Decimal128VectorBatch, TO##VectorBatch, TYPE>; + +#define DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER(TO) \ + using Decimal64##To##TO##ColumnReader = \ + DecimalConvertColumnReader<Decimal64VectorBatch, TO##VectorBatch>; \ + using Decimal128##To##TO##ColumnReader = \ + DecimalConvertColumnReader<Decimal128VectorBatch, TO##VectorBatch>; + + DEFINE_NUMERIC_CONVERT_READER(Boolean, Byte, int8_t) + DEFINE_NUMERIC_CONVERT_READER(Boolean, Short, int16_t) + DEFINE_NUMERIC_CONVERT_READER(Boolean, Int, int32_t) + DEFINE_NUMERIC_CONVERT_READER(Boolean, Long, int64_t) + DEFINE_NUMERIC_CONVERT_READER(Byte, Short, int16_t) + DEFINE_NUMERIC_CONVERT_READER(Byte, Int, int32_t) + DEFINE_NUMERIC_CONVERT_READER(Byte, Long, int64_t) + DEFINE_NUMERIC_CONVERT_READER(Short, Int, int32_t) + DEFINE_NUMERIC_CONVERT_READER(Short, Long, int64_t) + DEFINE_NUMERIC_CONVERT_READER(Int, Long, int64_t) + DEFINE_NUMERIC_CONVERT_READER(Float, Double, double) + DEFINE_NUMERIC_CONVERT_READER(Byte, Boolean, bool) + DEFINE_NUMERIC_CONVERT_READER(Short, Boolean, bool) + 
DEFINE_NUMERIC_CONVERT_READER(Short, Byte, int8_t) + DEFINE_NUMERIC_CONVERT_READER(Int, Boolean, bool) + DEFINE_NUMERIC_CONVERT_READER(Int, Byte, int8_t) + DEFINE_NUMERIC_CONVERT_READER(Int, Short, int16_t) + DEFINE_NUMERIC_CONVERT_READER(Long, Boolean, bool) + DEFINE_NUMERIC_CONVERT_READER(Long, Byte, int8_t) + DEFINE_NUMERIC_CONVERT_READER(Long, Short, int16_t) + DEFINE_NUMERIC_CONVERT_READER(Long, Int, int32_t) + DEFINE_NUMERIC_CONVERT_READER(Double, Float, float) + // Floating to integer + DEFINE_NUMERIC_CONVERT_READER(Float, Boolean, bool) + DEFINE_NUMERIC_CONVERT_READER(Float, Byte, int8_t) + DEFINE_NUMERIC_CONVERT_READER(Float, Short, int16_t) + DEFINE_NUMERIC_CONVERT_READER(Float, Int, int32_t) + DEFINE_NUMERIC_CONVERT_READER(Float, Long, int64_t) + DEFINE_NUMERIC_CONVERT_READER(Double, Boolean, bool) + DEFINE_NUMERIC_CONVERT_READER(Double, Byte, int8_t) + DEFINE_NUMERIC_CONVERT_READER(Double, Short, int16_t) + DEFINE_NUMERIC_CONVERT_READER(Double, Int, int32_t) + DEFINE_NUMERIC_CONVERT_READER(Double, Long, int64_t) + // Integer to Floating + DEFINE_NUMERIC_CONVERT_READER(Boolean, Float, float) + DEFINE_NUMERIC_CONVERT_READER(Byte, Float, float) + DEFINE_NUMERIC_CONVERT_READER(Short, Float, float) + DEFINE_NUMERIC_CONVERT_READER(Int, Float, float) + DEFINE_NUMERIC_CONVERT_READER(Long, Float, float) + DEFINE_NUMERIC_CONVERT_READER(Boolean, Double, double) + DEFINE_NUMERIC_CONVERT_READER(Byte, Double, double) + DEFINE_NUMERIC_CONVERT_READER(Short, Double, double) + DEFINE_NUMERIC_CONVERT_READER(Int, Double, double) + DEFINE_NUMERIC_CONVERT_READER(Long, Double, double) + + // Numeric to String/Char + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, String) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, String) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, String) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, String) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, String) + 
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, String) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Char) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Char) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Char) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Char) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Char) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Char) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Varchar) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Varchar) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Varchar) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Varchar) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Varchar) + DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Varchar) + using BooleanToStringColumnReader = BooleanToStringVariantColumnReader; + using BooleanToCharColumnReader = BooleanToStringVariantColumnReader; + using BooleanToVarcharColumnReader = BooleanToStringVariantColumnReader; + + // Numeric to Decimal + DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Boolean, false) + DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Byte, false) + DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Short, false) + DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Int, false) + DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Long, false) + DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Float, true) + DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Double, true) + + // Numeric to Timestamp + DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Boolean) + DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Byte) + DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Short) + DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Int) + DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Long) + DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Float) + DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Double) + + // Decimal to Numeric + DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Boolean, bool) + DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Byte, int8_t) + 
DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Short, int16_t) + DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Int, int32_t) + DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Long, int64_t) + DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Float, float) + DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Double, double) + + // Decimal to Decimal + DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER(Decimal64) + DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER(Decimal128) + +#define CREATE_READER(NAME) \ + return std::make_unique<NAME>(_readType, fileType, stripe, throwOnOverflow); + +#define CASE_CREATE_READER(TYPE, CONVERT) \ + case TYPE: \ + CREATE_READER(CONVERT##ColumnReader) + + const static int32_t MAX_PRECISION_64 = 18; + + static inline bool isDecimal64(const Type& type) { + return type.getPrecision() > 0 && type.getPrecision() <= MAX_PRECISION_64; + } + +#define CASE_CREATE_FROM_DECIMAL_READER(TYPE, TO) \ + case TYPE: { \ + if (isDecimal64(fileType)) { \ + CREATE_READER(Decimal64To##TO##ColumnReader) \ + } else { \ + CREATE_READER(Decimal128To##TO##ColumnReader) \ + } \ + } + +#define CASE_CREATE_DECIMAL_READER(FROM) \ + case DECIMAL: { \ + if (isDecimal64(_readType)) { \ + CREATE_READER(FROM##ToDecimal64ColumnReader) \ + } else { \ + CREATE_READER(FROM##ToDecimal128ColumnReader) \ + } \ + } + +#define CASE_EXCEPTION \ + default: \ + throw SchemaEvolutionError("Cannot convert from " + fileType.toString() + " to " + \ + _readType.toString()); + + std::unique_ptr<ColumnReader> buildConvertReader(const Type& fileType, StripeStreams& stripe, + bool useTightNumericVector, + bool throwOnOverflow) { + if (!useTightNumericVector) { + throw SchemaEvolutionError( + "SchemaEvolution only support tight vector, please create ColumnVectorBatch with " + "option useTightNumericVector"); + } + const auto& _readType = *stripe.getSchemaEvolution()->getReadType(fileType); + + switch (fileType.getKind()) { + case BOOLEAN: { + switch (_readType.getKind()) { + CASE_CREATE_READER(BYTE, BooleanToByte) + CASE_CREATE_READER(SHORT, 
BooleanToShort) + CASE_CREATE_READER(INT, BooleanToInt) + CASE_CREATE_READER(LONG, BooleanToLong) + CASE_CREATE_READER(FLOAT, BooleanToFloat) + CASE_CREATE_READER(DOUBLE, BooleanToDouble) + CASE_CREATE_READER(STRING, BooleanToString) + CASE_CREATE_READER(CHAR, BooleanToChar) + CASE_CREATE_READER(VARCHAR, BooleanToVarchar) + CASE_CREATE_DECIMAL_READER(Boolean) + CASE_CREATE_READER(TIMESTAMP, BooleanToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, BooleanToTimestamp) + case BOOLEAN: + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } + case BYTE: { + switch (_readType.getKind()) { + CASE_CREATE_READER(BOOLEAN, ByteToBoolean) + CASE_CREATE_READER(SHORT, ByteToShort) + CASE_CREATE_READER(INT, ByteToInt) + CASE_CREATE_READER(LONG, ByteToLong) + CASE_CREATE_READER(FLOAT, ByteToFloat) + CASE_CREATE_READER(DOUBLE, ByteToDouble) + CASE_CREATE_READER(STRING, ByteToString) + CASE_CREATE_READER(CHAR, ByteToChar) + CASE_CREATE_READER(VARCHAR, ByteToVarchar) + CASE_CREATE_DECIMAL_READER(Byte) + CASE_CREATE_READER(TIMESTAMP, ByteToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, ByteToTimestamp) + case BYTE: + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } + case SHORT: { + switch (_readType.getKind()) { + CASE_CREATE_READER(BOOLEAN, ShortToBoolean) + CASE_CREATE_READER(BYTE, ShortToByte) + CASE_CREATE_READER(INT, ShortToInt) + CASE_CREATE_READER(LONG, ShortToLong) + CASE_CREATE_READER(FLOAT, ShortToFloat) + CASE_CREATE_READER(DOUBLE, ShortToDouble) + CASE_CREATE_READER(STRING, ShortToString) + CASE_CREATE_READER(CHAR, ShortToChar) + CASE_CREATE_READER(VARCHAR, ShortToVarchar) + CASE_CREATE_DECIMAL_READER(Short) + CASE_CREATE_READER(TIMESTAMP, ShortToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, ShortToTimestamp) + case SHORT: + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } + case INT: { + switch 
(_readType.getKind()) { + CASE_CREATE_READER(BOOLEAN, IntToBoolean) + CASE_CREATE_READER(BYTE, IntToByte) + CASE_CREATE_READER(SHORT, IntToShort) + CASE_CREATE_READER(LONG, IntToLong) + CASE_CREATE_READER(FLOAT, IntToFloat) + CASE_CREATE_READER(DOUBLE, IntToDouble) + CASE_CREATE_READER(STRING, IntToString) + CASE_CREATE_READER(CHAR, IntToChar) + CASE_CREATE_READER(VARCHAR, IntToVarchar) + CASE_CREATE_DECIMAL_READER(Int) + CASE_CREATE_READER(TIMESTAMP, IntToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, IntToTimestamp) + case INT: + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } + case LONG: { + switch (_readType.getKind()) { + CASE_CREATE_READER(BOOLEAN, LongToBoolean) + CASE_CREATE_READER(BYTE, LongToByte) + CASE_CREATE_READER(SHORT, LongToShort) + CASE_CREATE_READER(INT, LongToInt) + CASE_CREATE_READER(FLOAT, LongToFloat) + CASE_CREATE_READER(DOUBLE, LongToDouble) + CASE_CREATE_READER(STRING, LongToString) + CASE_CREATE_READER(CHAR, LongToChar) + CASE_CREATE_READER(VARCHAR, LongToVarchar) + CASE_CREATE_DECIMAL_READER(Long) + CASE_CREATE_READER(TIMESTAMP, LongToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, LongToTimestamp) + case LONG: + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } + case FLOAT: { + switch (_readType.getKind()) { + CASE_CREATE_READER(BOOLEAN, FloatToBoolean) + CASE_CREATE_READER(BYTE, FloatToByte) + CASE_CREATE_READER(SHORT, FloatToShort) + CASE_CREATE_READER(INT, FloatToInt) + CASE_CREATE_READER(LONG, FloatToLong) + CASE_CREATE_READER(DOUBLE, FloatToDouble) + CASE_CREATE_READER(STRING, FloatToString) + CASE_CREATE_READER(CHAR, FloatToChar) + CASE_CREATE_READER(VARCHAR, FloatToVarchar) + CASE_CREATE_DECIMAL_READER(Float) + CASE_CREATE_READER(TIMESTAMP, FloatToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, FloatToTimestamp) + case FLOAT: + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + 
CASE_EXCEPTION + } + } + case DOUBLE: { + switch (_readType.getKind()) { + CASE_CREATE_READER(BOOLEAN, DoubleToBoolean) + CASE_CREATE_READER(BYTE, DoubleToByte) + CASE_CREATE_READER(SHORT, DoubleToShort) + CASE_CREATE_READER(INT, DoubleToInt) + CASE_CREATE_READER(LONG, DoubleToLong) + CASE_CREATE_READER(FLOAT, DoubleToFloat) + CASE_CREATE_READER(STRING, DoubleToString) + CASE_CREATE_READER(CHAR, DoubleToChar) + CASE_CREATE_READER(VARCHAR, DoubleToVarchar) + CASE_CREATE_DECIMAL_READER(Double) + CASE_CREATE_READER(TIMESTAMP, DoubleToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, DoubleToTimestamp) + case DOUBLE: + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } + case STRING: + case BINARY: + case TIMESTAMP: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DECIMAL: { + switch (_readType.getKind()) { + CASE_CREATE_FROM_DECIMAL_READER(BOOLEAN, Boolean) + CASE_CREATE_FROM_DECIMAL_READER(BYTE, Byte) + CASE_CREATE_FROM_DECIMAL_READER(SHORT, Short) + CASE_CREATE_FROM_DECIMAL_READER(INT, Int) + CASE_CREATE_FROM_DECIMAL_READER(LONG, Long) + CASE_CREATE_FROM_DECIMAL_READER(FLOAT, Float) + CASE_CREATE_FROM_DECIMAL_READER(DOUBLE, Double) + case DECIMAL: { + if (isDecimal64(fileType)) { + if (isDecimal64(_readType)) { + CREATE_READER(Decimal64ToDecimal64ColumnReader) + } else { + CREATE_READER(Decimal64ToDecimal128ColumnReader) + } + } else { + if (isDecimal64(_readType)) { + CREATE_READER(Decimal128ToDecimal64ColumnReader) + } else { + CREATE_READER(Decimal128ToDecimal128ColumnReader) + } + } + } + case STRING: + case CHAR: + case VARCHAR: + case TIMESTAMP: + case TIMESTAMP_INSTANT: + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } + case DATE: + case VARCHAR: + case CHAR: + case TIMESTAMP_INSTANT: + CASE_EXCEPTION + } + } + +#undef DEFINE_NUMERIC_CONVERT_READER +#undef DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER +#undef 
DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER +#undef DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER +#undef DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER +#undef DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER +#undef CASE_CREATE_FROM_DECIMAL_READER +#undef CASE_CREATE_READER +#undef CASE_EXCEPTION + +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/ConvertColumnReader.hh b/contrib/libs/apache/orc/c++/src/ConvertColumnReader.hh new file mode 100644 index 0000000000..6ed4d0170d --- /dev/null +++ b/contrib/libs/apache/orc/c++/src/ConvertColumnReader.hh @@ -0,0 +1,53 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Common base declaration for column readers that convert values while reading.
 *
 * The constructor takes both the type requested by the caller (`readType`) and the
 * type stored in the file (`fileType`). The member definitions live in
 * ConvertColumnReader.cc.
 */
class ConvertColumnReader : public ColumnReader {
 public:
  // `throwOnOverflow` is kept in the const member of the same name.
  // NOTE(review): presumably it selects between throwing and another overflow policy
  // in the derived readers - confirm in the .cc file.
  ConvertColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe,
                      bool throwOnOverflow);

  // override next() to implement convert logic
  void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;

  uint64_t skip(uint64_t numValues) override;

  void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;

 protected:
  bool useTightNumericVector;
  // Held by reference: the Type object passed to the constructor must outlive this reader.
  const Type& readType;
  // NOTE(review): presumably the reader for the column as stored in the file - confirm.
  std::unique_ptr<ColumnReader> reader;
  // NOTE(review): presumably the intermediate batch that `reader` fills before
  // conversion - confirm.
  std::unique_ptr<ColumnVectorBatch> data;
  const bool throwOnOverflow;
};
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CpuInfoUtil.cc is from Apache Arrow as of 2023-03-21 + */ + +#include "CpuInfoUtil.hh" + +#ifdef __APPLE__ +#include <sys/sysctl.h> +#endif + +#ifndef _MSC_VER +#include <unistd.h> +#endif + +#ifdef _WIN32 +#define NOMINMAX +#include <Windows.h> +#include <intrin.h> +#endif + +#include <algorithm> +#include <array> +#include <bitset> +#include <cstdint> +#include <fstream> +#include <optional> +#include <sstream> +#include <string> +#include <thread> +#include <vector> + +#include "orc/Exceptions.hh" + +#undef CPUINFO_ARCH_X86 +#undef CPUINFO_ARCH_ARM +#undef CPUINFO_ARCH_PPC + +#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +#define CPUINFO_ARCH_X86 +#ifndef ORC_HAVE_RUNTIME_AVX512 +#define UNUSED(x) (void)(x) +#endif +#elif defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) +#define CPUINFO_ARCH_ARM +#elif defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) +#define CPUINFO_ARCH_PPC +#endif + +namespace orc { + + namespace { + + constexpr int kCacheLevels = static_cast<int>(CpuInfo::CacheLevel::Last) + 1; + + //============================== OS Dependent ==============================// + +#if defined(_WIN32) + //------------------------------ WINDOWS ------------------------------// + void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cache_sizes) { + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = nullptr; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer_position = nullptr; + DWORD buffer_size = 0; + size_t offset = 0; + typedef BOOL(WINAPI * GetLogicalProcessorInformationFuncPointer)(void*, void*); + GetLogicalProcessorInformationFuncPointer func_pointer = + (GetLogicalProcessorInformationFuncPointer)GetProcAddress( + GetModuleHandle("kernel32"), "GetLogicalProcessorInformation"); + + if (!func_pointer) { + throw ParseError("Failed to find 
procedure GetLogicalProcessorInformation"); + } + + // Get buffer size + if (func_pointer(buffer, &buffer_size) && GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + throw ParseError("Failed to get size of processor information buffer"); + } + + buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(buffer_size); + if (!buffer) { + return; + } + + if (!func_pointer(buffer, &buffer_size)) { + free(buffer); + throw ParseError("Failed to get processor information"); + } + + buffer_position = buffer; + while (offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= buffer_size) { + if (RelationCache == buffer_position->Relationship) { + PCACHE_DESCRIPTOR cache = &buffer_position->Cache; + if (cache->Level >= 1 && cache->Level <= kCacheLevels) { + const int64_t current = (*cache_sizes)[cache->Level - 1]; + (*cache_sizes)[cache->Level - 1] = std::max<int64_t>(current, cache->Size); + } + } + offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + buffer_position++; + } + + free(buffer); + } + +#if defined(CPUINFO_ARCH_X86) + // On x86, get CPU features by cpuid, https://en.wikipedia.org/wiki/CPUID + +#if defined(__MINGW64_VERSION_MAJOR) && __MINGW64_VERSION_MAJOR < 5 + void __cpuidex(int CPUInfo[4], int function_id, int subfunction_id) { + __asm__ __volatile__("cpuid" + : "=a"(CPUInfo[0]), "=b"(CPUInfo[1]), "=c"(CPUInfo[2]), "=d"(CPUInfo[3]) + : "a"(function_id), "c"(subfunction_id)); + } + + int64_t _xgetbv(int xcr) { + int out = 0; + __asm__ __volatile__("xgetbv" : "=a"(out) : "c"(xcr) : "%edx"); + return out; + } +#endif // MINGW + + void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, + std::string* model_name) { + int register_EAX_id = 1; + int highest_valid_id = 0; + int highest_extended_valid_id = 0; + std::bitset<32> features_ECX; + std::array<int, 4> cpu_info; + + // Get highest valid id + __cpuid(cpu_info.data(), 0); + highest_valid_id = cpu_info[0]; + // HEX of "GenuineIntel": 47656E75 696E6549 6E74656C + // HEX of "AuthenticAMD": 41757468 
656E7469 63414D44 + if (cpu_info[1] == 0x756e6547 && cpu_info[3] == 0x49656e69 && cpu_info[2] == 0x6c65746e) { + *vendor = CpuInfo::Vendor::Intel; + } else if (cpu_info[1] == 0x68747541 && cpu_info[3] == 0x69746e65 && + cpu_info[2] == 0x444d4163) { + *vendor = CpuInfo::Vendor::AMD; + } + + if (highest_valid_id <= register_EAX_id) { + return; + } + + // EAX=1: Processor Info and Feature Bits + __cpuidex(cpu_info.data(), register_EAX_id, 0); + features_ECX = cpu_info[2]; + + // Get highest extended id + __cpuid(cpu_info.data(), 0x80000000); + highest_extended_valid_id = cpu_info[0]; + + // Retrieve CPU model name + if (highest_extended_valid_id >= static_cast<int>(0x80000004)) { + model_name->clear(); + for (int i = 0x80000002; i <= static_cast<int>(0x80000004); ++i) { + __cpuidex(cpu_info.data(), i, 0); + *model_name += std::string(reinterpret_cast<char*>(cpu_info.data()), sizeof(cpu_info)); + } + } + + bool zmm_enabled = false; + if (features_ECX[27]) { // OSXSAVE + // Query if the OS supports saving ZMM registers when switching contexts + int64_t xcr0 = _xgetbv(0); + zmm_enabled = (xcr0 & 0xE0) == 0xE0; + } + + if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3; + if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1; + if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2; + if (features_ECX[23]) *hardware_flags |= CpuInfo::POPCNT; + if (features_ECX[28]) *hardware_flags |= CpuInfo::AVX; + + // cpuid with EAX=7, ECX=0: Extended Features + register_EAX_id = 7; + if (highest_valid_id > register_EAX_id) { + __cpuidex(cpu_info.data(), register_EAX_id, 0); + std::bitset<32> features_EBX = cpu_info[1]; + + if (features_EBX[3]) *hardware_flags |= CpuInfo::BMI1; + if (features_EBX[5]) *hardware_flags |= CpuInfo::AVX2; + if (features_EBX[8]) *hardware_flags |= CpuInfo::BMI2; + if (zmm_enabled) { + if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F; + if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ; + if (features_EBX[28]) *hardware_flags |= 
CpuInfo::AVX512CD; + if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW; + if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL; + } + } + } + +#elif defined(CPUINFO_ARCH_ARM) + // Windows on Arm + void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, + std::string* model_name) { + *hardware_flags |= CpuInfo::ASIMD; + // TODO: vendor, model_name + } +#endif + +#elif defined(__APPLE__) + //------------------------------ MACOS ------------------------------// + std::optional<int64_t> IntegerSysCtlByName(const char* name) { + size_t len = sizeof(int64_t); + int64_t data = 0; + if (sysctlbyname(name, &data, &len, nullptr, 0) == 0) { + return data; + } + // ENOENT is the official errno value for non-existing sysctl's, + // but EINVAL and ENOTSUP have been seen in the wild. + if (errno != ENOENT && errno != EINVAL && errno != ENOTSUP) { + std::ostringstream ss; + ss << "sysctlbyname failed for '" << name << "'"; + throw ParseError(ss.str()); + } + return std::nullopt; + } + + void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cache_sizes) { + static_assert(kCacheLevels >= 3, ""); + auto c = IntegerSysCtlByName("hw.l1dcachesize"); + if (c.has_value()) { + (*cache_sizes)[0] = *c; + } + c = IntegerSysCtlByName("hw.l2cachesize"); + if (c.has_value()) { + (*cache_sizes)[1] = *c; + } + c = IntegerSysCtlByName("hw.l3cachesize"); + if (c.has_value()) { + (*cache_sizes)[2] = *c; + } + } + + void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, + std::string* model_name) { + // hardware_flags + struct SysCtlCpuFeature { + const char* name; + int64_t flag; + }; + std::vector<SysCtlCpuFeature> features = { +#if defined(CPUINFO_ARCH_X86) + {"hw.optional.sse4_2", + CpuInfo::SSSE3 | CpuInfo::SSE4_1 | CpuInfo::SSE4_2 | CpuInfo::POPCNT}, + {"hw.optional.avx1_0", CpuInfo::AVX}, + {"hw.optional.avx2_0", CpuInfo::AVX2}, + {"hw.optional.bmi1", CpuInfo::BMI1}, + {"hw.optional.bmi2", CpuInfo::BMI2}, + {"hw.optional.avx512f", 
CpuInfo::AVX512F}, + {"hw.optional.avx512cd", CpuInfo::AVX512CD}, + {"hw.optional.avx512dq", CpuInfo::AVX512DQ}, + {"hw.optional.avx512bw", CpuInfo::AVX512BW}, + {"hw.optional.avx512vl", CpuInfo::AVX512VL}, +#elif defined(CPUINFO_ARCH_ARM) + // ARM64 (note that this is exposed under Rosetta as well) + {"hw.optional.neon", CpuInfo::ASIMD}, +#endif + }; + for (const auto& feature : features) { + auto v = IntegerSysCtlByName(feature.name); + if (v.value_or(0)) { + *hardware_flags |= feature.flag; + } + } + + // TODO: vendor, model_name + *vendor = CpuInfo::Vendor::Unknown; + *model_name = "Unknown"; + } + +#else + //------------------------------ LINUX ------------------------------// + // Get cache size, return 0 on error + int64_t LinuxGetCacheSize(int level) { + // get cache size by sysconf() +#ifdef _SC_LEVEL1_DCACHE_SIZE + const int kCacheSizeConf[] = { + _SC_LEVEL1_DCACHE_SIZE, + _SC_LEVEL2_CACHE_SIZE, + _SC_LEVEL3_CACHE_SIZE, + }; + static_assert(sizeof(kCacheSizeConf) / sizeof(kCacheSizeConf[0]) == kCacheLevels, ""); + + errno = 0; + const int64_t cache_size = sysconf(kCacheSizeConf[level]); + if (errno == 0 && cache_size > 0) { + return cache_size; + } +#endif + + // get cache size from sysfs if sysconf() fails or not supported + const char* kCacheSizeSysfs[] = { + "/sys/devices/system/cpu/cpu0/cache/index0/size", // l1d (index1 is l1i) + "/sys/devices/system/cpu/cpu0/cache/index2/size", // l2 + "/sys/devices/system/cpu/cpu0/cache/index3/size", // l3 + }; + static_assert(sizeof(kCacheSizeSysfs) / sizeof(kCacheSizeSysfs[0]) == kCacheLevels, ""); + + std::ifstream cacheinfo(kCacheSizeSysfs[level], std::ios::in); + if (!cacheinfo) { + return 0; + } + // cacheinfo is one line like: 65536, 64K, 1M, etc. 
+ uint64_t size = 0; + char unit = '\0'; + cacheinfo >> size >> unit; + if (unit == 'K') { + size <<= 10; + } else if (unit == 'M') { + size <<= 20; + } else if (unit == 'G') { + size <<= 30; + } else if (unit != '\0') { + return 0; + } + return static_cast<int64_t>(size); + } + + // Helper function to parse for hardware flags from /proc/cpuinfo + // values contains a list of space-separated flags. check to see if the flags we + // care about are present. + // Returns a bitmap of flags. + int64_t LinuxParseCpuFlags(const std::string& values) { + const struct { + std::string name; + int64_t flag; + } flag_mappings[] = { +#if defined(CPUINFO_ARCH_X86) + {"ssse3", CpuInfo::SSSE3}, + {"sse4_1", CpuInfo::SSE4_1}, + {"sse4_2", CpuInfo::SSE4_2}, + {"popcnt", CpuInfo::POPCNT}, + {"avx", CpuInfo::AVX}, + {"avx2", CpuInfo::AVX2}, + {"avx512f", CpuInfo::AVX512F}, + {"avx512cd", CpuInfo::AVX512CD}, + {"avx512vl", CpuInfo::AVX512VL}, + {"avx512dq", CpuInfo::AVX512DQ}, + {"avx512bw", CpuInfo::AVX512BW}, + {"bmi1", CpuInfo::BMI1}, + {"bmi2", CpuInfo::BMI2}, +#elif defined(CPUINFO_ARCH_ARM) + {"asimd", CpuInfo::ASIMD}, +#endif + }; + const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]); + + int64_t flags = 0; + for (int i = 0; i < num_flags; ++i) { + if (values.find(flag_mappings[i].name) != std::string::npos) { + flags |= flag_mappings[i].flag; + } + } + return flags; + } + + void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cache_sizes) { + for (int i = 0; i < kCacheLevels; ++i) { + const int64_t cache_size = LinuxGetCacheSize(i); + if (cache_size > 0) { + (*cache_sizes)[i] = cache_size; + } + } + } + + static constexpr bool IsWhitespace(char c) { + return c == ' ' || c == '\t'; + } + + std::string TrimString(std::string value) { + size_t ltrim_chars = 0; + while (ltrim_chars < value.size() && IsWhitespace(value[ltrim_chars])) { + ++ltrim_chars; + } + value.erase(0, ltrim_chars); + size_t rtrim_chars = 0; + while (rtrim_chars < value.size() && 
IsWhitespace(value[value.size() - 1 - rtrim_chars])) { + ++rtrim_chars; + } + value.erase(value.size() - rtrim_chars, rtrim_chars); + return value; + } + + // Read from /proc/cpuinfo + void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, + std::string* model_name) { + std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in); + while (cpuinfo) { + std::string line; + std::getline(cpuinfo, line); + const size_t colon = line.find(':'); + if (colon != std::string::npos) { + const std::string name = TrimString(line.substr(0, colon - 1)); + const std::string value = TrimString(line.substr(colon + 1, std::string::npos)); + if (name.compare("flags") == 0 || name.compare("Features") == 0) { + *hardware_flags |= LinuxParseCpuFlags(value); + } else if (name.compare("model name") == 0) { + *model_name = value; + } else if (name.compare("vendor_id") == 0) { + if (value.compare("GenuineIntel") == 0) { + *vendor = CpuInfo::Vendor::Intel; + } else if (value.compare("AuthenticAMD") == 0) { + *vendor = CpuInfo::Vendor::AMD; + } + } + } + } + } +#endif // WINDOWS, MACOS, LINUX + + //============================== Arch Dependent ==============================// + +#if defined(CPUINFO_ARCH_X86) + //------------------------------ X86_64 ------------------------------// + bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { + enum { + USER_SIMD_NONE, + USER_SIMD_AVX512, + USER_SIMD_MAX, + }; + + int level = USER_SIMD_MAX; + // Parse the level + if (simd_level == "AVX512") { + level = USER_SIMD_AVX512; + } else if (simd_level == "NONE") { + level = USER_SIMD_NONE; + } else { + return false; + } + + // Disable feature as the level + if (level < USER_SIMD_AVX512) { + *hardware_flags &= ~CpuInfo::AVX512; + } + return true; + } + + void ArchVerifyCpuRequirements(const CpuInfo* ci) { +#if defined(ORC_HAVE_RUNTIME_AVX512) + if (!ci->isDetected(CpuInfo::AVX512)) { + throw ParseError("CPU does not support the Supplemental AVX512 instruction set"); 
+ } +#else + UNUSED(ci); +#endif + } + +#elif defined(CPUINFO_ARCH_ARM) + //------------------------------ AARCH64 ------------------------------// + bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { + if (simd_level == "NONE") { + *hardware_flags &= ~CpuInfo::ASIMD; + return true; + } + return false; + } + + void ArchVerifyCpuRequirements(const CpuInfo* ci) { + if (!ci->isDetected(CpuInfo::ASIMD)) { + throw ParseError("CPU does not support the Armv8 Neon instruction set"); + } + } + +#else + //------------------------------ PPC, ... ------------------------------// + bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { + return true; + } + + void ArchVerifyCpuRequirements(const CpuInfo* ci) {} + +#endif // X86, ARM, PPC + + } // namespace + + struct CpuInfo::Impl { + int64_t hardware_flags = 0; + int numCores = 0; + int64_t original_hardware_flags = 0; + Vendor vendor = Vendor::Unknown; + std::string model_name = "Unknown"; + std::array<int64_t, kCacheLevels> cache_sizes{}; + + Impl() { + OsRetrieveCacheSize(&cache_sizes); + OsRetrieveCpuInfo(&hardware_flags, &vendor, &model_name); + original_hardware_flags = hardware_flags; + numCores = std::max(static_cast<int>(std::thread::hardware_concurrency()), 1); + + // parse user simd level + const auto maybe_env_var = std::getenv("ORC_USER_SIMD_LEVEL"); + std::string userSimdLevel = maybe_env_var == nullptr ? 
"NONE" : std::string(maybe_env_var); + std::transform(userSimdLevel.begin(), userSimdLevel.end(), userSimdLevel.begin(), + [](unsigned char c) { return std::toupper(c); }); + if (!ArchParseUserSimdLevel(userSimdLevel, &hardware_flags)) { + throw ParseError("Invalid value for ORC_USER_SIMD_LEVEL: " + userSimdLevel); + } + } + }; + + CpuInfo::~CpuInfo() = default; + + CpuInfo::CpuInfo() : impl_(new Impl) {} + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wexit-time-destructors" +#endif + + const CpuInfo* CpuInfo::getInstance() { + static CpuInfo cpu_info; + return &cpu_info; + } + +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + + int64_t CpuInfo::hardwareFlags() const { + return impl_->hardware_flags; + } + + int CpuInfo::numCores() const { + return impl_->numCores <= 0 ? 1 : impl_->numCores; + } + + CpuInfo::Vendor CpuInfo::vendor() const { + return impl_->vendor; + } + + const std::string& CpuInfo::modelName() const { + return impl_->model_name; + } + + int64_t CpuInfo::cacheSize(CacheLevel level) const { + constexpr int64_t kDefaultCacheSizes[] = { + 32 * 1024, // Level 1: 32K + 256 * 1024, // Level 2: 256K + 3072 * 1024, // Level 3: 3M + }; + static_assert(sizeof(kDefaultCacheSizes) / sizeof(kDefaultCacheSizes[0]) == kCacheLevels, ""); + + static_assert(static_cast<int>(CacheLevel::L1) == 0, ""); + const int i = static_cast<int>(level); + if (impl_->cache_sizes[i] > 0) return impl_->cache_sizes[i]; + if (i == 0) return kDefaultCacheSizes[0]; + // l3 may be not available, return maximum of l2 or default size + return std::max(kDefaultCacheSizes[i], impl_->cache_sizes[i - 1]); + } + + bool CpuInfo::isSupported(int64_t flags) const { + return (impl_->hardware_flags & flags) == flags; + } + + bool CpuInfo::isDetected(int64_t flags) const { + return (impl_->original_hardware_flags & flags) == flags; + } + + void CpuInfo::verifyCpuRequirements() const { + return ArchVerifyCpuRequirements(this); + } + +} // namespace orc 
+ +#undef CPUINFO_ARCH_X86 +#undef CPUINFO_ARCH_ARM +#undef CPUINFO_ARCH_PPC diff --git a/contrib/libs/apache/orc/c++/src/CpuInfoUtil.hh b/contrib/libs/apache/orc/c++/src/CpuInfoUtil.hh new file mode 100644 index 0000000000..5637053e6d --- /dev/null +++ b/contrib/libs/apache/orc/c++/src/CpuInfoUtil.hh @@ -0,0 +1,113 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CpuInfoUtil.hh is from Apache Arrow as of 2023-03-21 + */ + +#ifndef ORC_CPUINFOUTIL_HH +#define ORC_CPUINFOUTIL_HH + +#include <cstdint> +#include <memory> +#include <string> + +namespace orc { + + /** + * CpuInfo is an interface to query for cpu information at runtime. The caller can + * ask for the sizes of the caches and what hardware features are supported. 
/**
 * CpuInfo is an interface to query for cpu information at runtime. The caller can
 * ask for the sizes of the caches and what hardware features are supported.
 * On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and
 * /sys/devices)
 *
 * Instances cannot be created directly (the constructor is private); use getInstance().
 */
class CpuInfo {
 public:
  ~CpuInfo();

  // x86 features (one bit each, combinable with |)
  static constexpr int64_t SSSE3 = (1LL << 0);
  static constexpr int64_t SSE4_1 = (1LL << 1);
  static constexpr int64_t SSE4_2 = (1LL << 2);
  static constexpr int64_t POPCNT = (1LL << 3);
  static constexpr int64_t AVX = (1LL << 4);
  static constexpr int64_t AVX2 = (1LL << 5);
  static constexpr int64_t AVX512F = (1LL << 6);
  static constexpr int64_t AVX512CD = (1LL << 7);
  static constexpr int64_t AVX512VL = (1LL << 8);
  static constexpr int64_t AVX512DQ = (1LL << 9);
  static constexpr int64_t AVX512BW = (1LL << 10);
  // Union of the five AVX512 subsets above; isSupported(AVX512) requires all of them.
  static constexpr int64_t AVX512 = AVX512F | AVX512CD | AVX512VL | AVX512DQ | AVX512BW;
  static constexpr int64_t BMI1 = (1LL << 11);
  static constexpr int64_t BMI2 = (1LL << 12);

  /// Arm features
  static constexpr int64_t ASIMD = (1LL << 32);

  // Cache enums for L1 (data), L2 and L3
  enum class CacheLevel { L1 = 0, L2, L3, Last = L3 };

  // CPU vendors
  enum class Vendor { Unknown, Intel, AMD };

  // Returns the shared instance (a function-local static in CpuInfoUtil.cc).
  static const CpuInfo* getInstance();

  // Returns the enabled flags for this cpu, i.e. the detected flags after the
  // ORC_USER_SIMD_LEVEL setting has been applied.
  int64_t hardwareFlags() const;

  // Returns the number of cores (including hyper-threaded) on this machine; at least 1.
  int numCores() const;

  // Returns the vendor of the cpu.
  Vendor vendor() const;

  // Returns the model name of the cpu (e.g. Intel i7-2600)
  const std::string& modelName() const;

  // Returns the size of the cache in bytes at this cache level. When the OS reported
  // no size for the level, a built-in default is used instead (32K / 256K / 3M in
  // CpuInfoUtil.cc); for L2 and L3 the result is the larger of that default and the
  // size recorded for the next lower level.
  int64_t cacheSize(CacheLevel level) const;

  /**
   * Returns whether or not the given feature is enabled.
   * isSupported() is true if isDetected() is also true and the feature
   * wasn't disabled by the user (for example by setting the ORC_USER_SIMD_LEVEL
   * environment variable).
   * When `flags` combines several bits, all of them must be enabled.
   */
  bool isSupported(int64_t flags) const;

  // Returns whether or not the given feature is available on the CPU, regardless of
  // the user setting. When `flags` combines several bits, all of them must be present.
  bool isDetected(int64_t flags) const;

  // Determine if the CPU meets the minimum CPU requirements. The implementations in
  // CpuInfoUtil.cc report a failure by throwing ParseError; they do not terminate the
  // process themselves.
  void verifyCpuRequirements() const;

  bool hasEfficientBmi2() const {
    // BMI2 (pext, pdep) is only efficient on Intel X86 processors.
    return vendor() == Vendor::Intel && isSupported(BMI2);
  }

 private:
  CpuInfo();

  // Private implementation; the struct is defined in CpuInfoUtil.cc.
  struct Impl;
  std::unique_ptr<Impl> impl_;
};
+ * + * Typical use: + * + * static void my_function_default(...); + * static void my_function_avx512(...); + * + * struct MyDynamicFunction { + * using FunctionType = decltype(&my_function_default); + * + * static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() { + * return { + * { DispatchLevel::NONE, my_function_default } + * #if defined(ORC_HAVE_RUNTIME_AVX512) + * , { DispatchLevel::AVX512, my_function_avx512 } + * #endif + * }; + * } + * }; + * + * void my_function(...) { + * static DynamicDispatch<MyDynamicFunction> dispatch; + * return dispatch.func(...); + * } + */ + template <typename DynamicFunction> + class DynamicDispatch { + protected: + using FunctionType = typename DynamicFunction::FunctionType; + using Implementation = std::pair<DispatchLevel, FunctionType>; + + public: + DynamicDispatch() { + Resolve(DynamicFunction::implementations()); + } + + FunctionType func = {}; + + protected: + // Use the Implementation with the highest DispatchLevel + void Resolve(const std::vector<Implementation>& implementations) { + Implementation cur{DispatchLevel::NONE, {}}; + + for (const auto& impl : implementations) { + if (impl.first >= cur.first && levelSupported(impl.first)) { + // Higher (or same) level than current + cur = impl; + } + } + + if (!cur.second) { + throw InvalidArgument("No appropriate implementation found"); + } + func = cur.second; + } + + private: + bool levelSupported(DispatchLevel level) const { + static const auto cpu_info = CpuInfo::getInstance(); + + switch (level) { + case DispatchLevel::NONE: + return true; + case DispatchLevel::AVX512: + case DispatchLevel::MAX: + return cpu_info->isSupported(CpuInfo::AVX512); + default: + return false; + } + } + }; +} // namespace orc + +#endif diff --git a/contrib/libs/apache/orc/c++/src/Exceptions.cc b/contrib/libs/apache/orc/c++/src/Exceptions.cc index 2077b27df4..23703ff324 100644 --- a/contrib/libs/apache/orc/c++/src/Exceptions.cc +++ 
b/contrib/libs/apache/orc/c++/src/Exceptions.cc @@ -20,59 +20,68 @@ namespace orc { - NotImplementedYet::NotImplementedYet(const std::string& what_arg - ) : logic_error(what_arg) { + NotImplementedYet::NotImplementedYet(const std::string& what_arg) : logic_error(what_arg) { // PASS } - NotImplementedYet::NotImplementedYet(const char* what_arg - ) :logic_error(what_arg) { + NotImplementedYet::NotImplementedYet(const char* what_arg) : logic_error(what_arg) { // PASS } - NotImplementedYet::NotImplementedYet(const NotImplementedYet& error - ): logic_error(error) { + NotImplementedYet::NotImplementedYet(const NotImplementedYet& error) : logic_error(error) { // PASS } - NotImplementedYet::~NotImplementedYet() ORC_NOEXCEPT { + NotImplementedYet::~NotImplementedYet() noexcept { // PASS } - ParseError::ParseError(const std::string& what_arg - ): runtime_error(what_arg) { + ParseError::ParseError(const std::string& what_arg) : runtime_error(what_arg) { // PASS } - ParseError::ParseError(const char* what_arg - ): runtime_error(what_arg) { + ParseError::ParseError(const char* what_arg) : runtime_error(what_arg) { // PASS } - ParseError::ParseError(const ParseError& error): runtime_error(error) { + ParseError::ParseError(const ParseError& error) : runtime_error(error) { // PASS } - ParseError::~ParseError() ORC_NOEXCEPT { + ParseError::~ParseError() noexcept { // PASS } - InvalidArgument::InvalidArgument(const std::string& what_arg - ): runtime_error(what_arg) { + InvalidArgument::InvalidArgument(const std::string& what_arg) : runtime_error(what_arg) { // PASS } - InvalidArgument::InvalidArgument(const char* what_arg - ): runtime_error(what_arg) { + InvalidArgument::InvalidArgument(const char* what_arg) : runtime_error(what_arg) { // PASS } - InvalidArgument::InvalidArgument(const InvalidArgument& error - ): runtime_error(error) { + InvalidArgument::InvalidArgument(const InvalidArgument& error) : runtime_error(error) { // PASS } - InvalidArgument::~InvalidArgument() 
ORC_NOEXCEPT { + InvalidArgument::~InvalidArgument() noexcept { // PASS } -} + + SchemaEvolutionError::SchemaEvolutionError(const std::string& what_arg) : logic_error(what_arg) { + // PASS + } + + SchemaEvolutionError::SchemaEvolutionError(const char* what_arg) : logic_error(what_arg) { + // PASS + } + + SchemaEvolutionError::SchemaEvolutionError(const SchemaEvolutionError& error) + : logic_error(error) { + // PASS + } + + SchemaEvolutionError::~SchemaEvolutionError() noexcept { + // PASS + } +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/Int128.cc b/contrib/libs/apache/orc/c++/src/Int128.cc index 4ff500fbac..3c159f3775 100644 --- a/contrib/libs/apache/orc/c++/src/Int128.cc +++ b/contrib/libs/apache/orc/c++/src/Int128.cc @@ -45,7 +45,7 @@ namespace orc { size_t group = std::min(static_cast<size_t>(18), length - posn); int64_t chunk = std::stoll(str.substr(posn, group)); int64_t multiple = 1; - for(size_t i=0; i < group; ++i) { + for (size_t i = 0; i < group; ++i) { multiple *= 10; } *this *= multiple; @@ -58,7 +58,7 @@ namespace orc { } } - Int128& Int128::operator*=(const Int128 &right) { + Int128& Int128::operator*=(const Int128& right) { const uint64_t INT_MASK = 0xffffffff; const uint64_t CARRY_BIT = INT_MASK + 1; @@ -100,7 +100,7 @@ namespace orc { * @param wasNegative a flag for whether the value was original negative * @result the output length of the array */ - int64_t Int128::fillInArray(uint32_t* array, bool &wasNegative) const { + int64_t Int128::fillInArray(uint32_t* array, bool& wasNegative) const { uint64_t high; uint64_t low; if (highbits < 0) { @@ -140,7 +140,6 @@ namespace orc { } } - /** * Find last set bit in a 32 bit integer. Bit 1 is the LSB and bit 32 is * the MSB. We can replace this with bsrq asm instruction on x64. 
@@ -162,10 +161,10 @@ namespace orc { */ void shiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) { if (length > 0 && bits != 0) { - for(int64_t i=0; i < length-1; ++i) { - array[i] = (array[i] << bits) | (array[i+1] >> (32 - bits)); + for (int64_t i = 0; i < length - 1; ++i) { + array[i] = (array[i] << bits) | (array[i + 1] >> (32 - bits)); } - array[length-1] <<= bits; + array[length - 1] <<= bits; } } @@ -177,8 +176,8 @@ namespace orc { */ void shiftArrayRight(uint32_t* array, int64_t length, int64_t bits) { if (length > 0 && bits != 0) { - for(int64_t i=length-1; i > 0; --i) { - array[i] = (array[i] >> bits) | (array[i-1] << (32 - bits)); + for (int64_t i = length - 1; i > 0; --i) { + array[i] = (array[i] >> bits) | (array[i - 1] << (32 - bits)); } array[0] >>= bits; } @@ -188,8 +187,8 @@ namespace orc { * Fix the signs of the result and remainder at the end of the division * based on the signs of the dividend and divisor. */ - void fixDivisionSigns(Int128 &result, Int128 &remainder, - bool dividendWasNegative, bool divisorWasNegative) { + void fixDivisionSigns(Int128& result, Int128& remainder, bool dividendWasNegative, + bool divisorWasNegative) { if (dividendWasNegative != divisorWasNegative) { result.negate(); } @@ -203,44 +202,42 @@ namespace orc { */ void buildFromArray(Int128& value, uint32_t* array, int64_t length) { switch (length) { - case 0: - value = 0; - break; - case 1: - value = array[0]; - break; - case 2: - value = Int128(0, (static_cast<uint64_t>(array[0]) << 32) + array[1]); - break; - case 3: - value = Int128(array[0], - (static_cast<uint64_t>(array[1]) << 32) + array[2]); - break; - case 4: - value = Int128((static_cast<int64_t>(array[0]) << 32) + array[1], - (static_cast<uint64_t>(array[2]) << 32) + array[3]); - break; - case 5: - if (array[0] != 0) { - throw std::logic_error("Can't build Int128 with 5 ints."); - } - value = Int128((static_cast<int64_t>(array[1]) << 32) + array[2], - (static_cast<uint64_t>(array[3]) << 32) + 
array[4]); - break; - default: - throw std::logic_error("Unsupported length for building Int128"); + case 0: + value = 0; + break; + case 1: + value = array[0]; + break; + case 2: + value = Int128(0, (static_cast<uint64_t>(array[0]) << 32) + array[1]); + break; + case 3: + value = Int128(array[0], (static_cast<uint64_t>(array[1]) << 32) + array[2]); + break; + case 4: + value = Int128((static_cast<int64_t>(array[0]) << 32) + array[1], + (static_cast<uint64_t>(array[2]) << 32) + array[3]); + break; + case 5: + if (array[0] != 0) { + throw std::logic_error("Can't build Int128 with 5 ints."); + } + value = Int128((static_cast<int64_t>(array[1]) << 32) + array[2], + (static_cast<uint64_t>(array[3]) << 32) + array[4]); + break; + default: + throw std::logic_error("Unsupported length for building Int128"); } } /** * Do a division where the divisor fits into a single 32 bit value. */ - Int128 singleDivide(uint32_t* dividend, int64_t dividendLength, - uint32_t divisor, Int128& remainder, - bool dividendWasNegative, bool divisorWasNegative) { + Int128 singleDivide(uint32_t* dividend, int64_t dividendLength, uint32_t divisor, + Int128& remainder, bool dividendWasNegative, bool divisorWasNegative) { uint64_t r = 0; uint32_t resultArray[5]; - for(int64_t j=0; j < dividendLength; j++) { + for (int64_t j = 0; j < dividendLength; j++) { r <<= 32; r += dividend[j]; resultArray[j] = static_cast<uint32_t>(r / divisor); @@ -249,12 +246,11 @@ namespace orc { Int128 result; buildFromArray(result, resultArray, dividendLength); remainder = static_cast<int64_t>(r); - fixDivisionSigns(result, remainder, dividendWasNegative, - divisorWasNegative); + fixDivisionSigns(result, remainder, dividendWasNegative, divisorWasNegative); return result; } - Int128 Int128::divide(const Int128 &divisor, Int128 &remainder) const { + Int128 Int128::divide(const Int128& divisor, Int128& remainder) const { // Split the dividend and divisor into integer pieces so that we can // work on them. 
uint32_t dividendArray[5]; @@ -263,7 +259,7 @@ namespace orc { bool divisorWasNegative; // leave an extra zero before the dividend dividendArray[0] = 0; - int64_t dividendLength = fillInArray(dividendArray + 1, dividendWasNegative)+1; + int64_t dividendLength = fillInArray(dividendArray + 1, dividendWasNegative) + 1; int64_t divisorLength = divisor.fillInArray(divisorArray, divisorWasNegative); // Handle some of the easy cases. @@ -273,8 +269,8 @@ namespace orc { } else if (divisorLength == 0) { throw std::range_error("Division by 0 in Int128"); } else if (divisorLength == 1) { - return singleDivide(dividendArray, dividendLength, divisorArray[0], - remainder, dividendWasNegative, divisorWasNegative); + return singleDivide(dividendArray, dividendLength, divisorArray[0], remainder, + dividendWasNegative, divisorWasNegative); } int64_t resultLength = dividendLength - divisorLength; @@ -288,11 +284,10 @@ namespace orc { shiftArrayLeft(dividendArray, dividendLength, normalizeBits); // compute each digit in the result - for(int64_t j=0; j < resultLength; ++j) { + for (int64_t j = 0; j < resultLength; ++j) { // Guess the next digit. 
At worst it is two too large uint32_t guess = UINT32_MAX; - uint64_t highDividend = static_cast<uint64_t>(dividendArray[j]) << 32 | - dividendArray[j+1]; + uint64_t highDividend = static_cast<uint64_t>(dividendArray[j]) << 32 | dividendArray[j + 1]; if (dividendArray[j] != divisorArray[0]) { guess = static_cast<uint32_t>(highDividend / divisorArray[0]); } @@ -300,10 +295,9 @@ namespace orc { // catch all of the cases where guess is two too large and most of the // cases where it is one too large uint32_t rhat = - static_cast<uint32_t>(highDividend - guess * - static_cast<uint64_t>(divisorArray[0])); + static_cast<uint32_t>(highDividend - guess * static_cast<uint64_t>(divisorArray[0])); while (static_cast<uint64_t>(divisorArray[1]) * guess > - (static_cast<uint64_t>(rhat) << 32) + dividendArray[j+2]) { + (static_cast<uint64_t>(rhat) << 32) + dividendArray[j + 2]) { guess -= 1; rhat += divisorArray[0]; if (static_cast<uint64_t>(rhat) < divisorArray[0]) { @@ -313,12 +307,12 @@ namespace orc { // subtract off the guess * divisor from the dividend uint64_t mult = 0; - for(int64_t i=divisorLength-1; i >= 0; --i) { + for (int64_t i = divisorLength - 1; i >= 0; --i) { mult += static_cast<uint64_t>(guess) * divisorArray[i]; - uint32_t prev = dividendArray[j+i+1]; - dividendArray[j+i+1] -= static_cast<uint32_t>(mult); + uint32_t prev = dividendArray[j + i + 1]; + dividendArray[j + i + 1] -= static_cast<uint32_t>(mult); mult >>= 32; - if (dividendArray[j+i+1] > prev) { + if (dividendArray[j + i + 1] > prev) { mult += 1; } } @@ -329,10 +323,9 @@ namespace orc { if (dividendArray[j] > prev) { guess -= 1; uint32_t carry = 0; - for(int64_t i=divisorLength-1; i >= 0; --i) { - uint64_t sum = static_cast<uint64_t>(divisorArray[i]) + - dividendArray[j+i+1] + carry; - dividendArray[j+i+1] = static_cast<uint32_t>(sum); + for (int64_t i = divisorLength - 1; i >= 0; --i) { + uint64_t sum = static_cast<uint64_t>(divisorArray[i]) + dividendArray[j + i + 1] + carry; + dividendArray[j + i + 
1] = static_cast<uint32_t>(sum); carry = static_cast<uint32_t>(sum >> 32); } dividendArray[j] += carry; @@ -348,8 +341,7 @@ namespace orc { Int128 result; buildFromArray(result, resultArray, resultLength); buildFromArray(remainder, dividendArray, dividendLength); - fixDivisionSigns(result, remainder, - dividendWasNegative, divisorWasNegative); + fixDivisionSigns(result, remainder, dividendWasNegative, divisorWasNegative); return result; } @@ -400,8 +392,7 @@ namespace orc { int32_t len = static_cast<int32_t>(str.length()); if (len - 1 > scale) { result = str.substr(0, static_cast<size_t>(len - scale)) + "." + - str.substr(static_cast<size_t>(len - scale), - static_cast<size_t>(len)); + str.substr(static_cast<size_t>(len - scale), static_cast<size_t>(len)); } else if (len - 1 == scale) { result = "-0." + str.substr(1, std::string::npos); } else { @@ -415,8 +406,7 @@ namespace orc { int32_t len = static_cast<int32_t>(str.length()); if (len > scale) { result = str.substr(0, static_cast<size_t>(len - scale)) + "." + - str.substr(static_cast<size_t>(len - scale), - static_cast<size_t>(len)); + str.substr(static_cast<size_t>(len - scale), static_cast<size_t>(len)); } else if (len == scale) { result = "0." 
+ str; } else { @@ -440,37 +430,41 @@ namespace orc { std::string Int128::toHexString() const { std::stringstream buf; - buf << std::hex << "0x" - << std::setw(16) << std::setfill('0') << highbits - << std::setw(16) << std::setfill('0') << lowbits; + buf << std::hex << "0x" << std::setw(16) << std::setfill('0') << highbits << std::setw(16) + << std::setfill('0') << lowbits; return buf.str(); } + double Int128::toDouble() const { + if (fitsInLong()) { + return static_cast<double>(toLong()); + } + return static_cast<double>(lowbits) + std::ldexp(static_cast<double>(highbits), 64); + } + const static int32_t MAX_PRECISION_64 = 18; - const static int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1] = - {1, - 10, - 100, - 1000, - 10000, - 100000, - 1000000, - 10000000, - 100000000, - 1000000000, - 10000000000, - 100000000000, - 1000000000000, - 10000000000000, - 100000000000000, - 1000000000000000, - 10000000000000000, - 100000000000000000, - 1000000000000000000}; - - Int128 scaleUpInt128ByPowerOfTen(Int128 value, - int32_t power, - bool &overflow) { + const static int32_t MAX_PRECISION_128 = 38; + const static int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1] = {1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000, + 100000000000, + 1000000000000, + 10000000000000, + 100000000000000, + 1000000000000000, + 10000000000000000, + 100000000000000000, + 1000000000000000000}; + + Int128 scaleUpInt128ByPowerOfTen(Int128 value, int32_t power, bool& overflow) { overflow = false; Int128 remainder; @@ -479,7 +473,8 @@ namespace orc { if (value > 0 && Int128::maximumValue().divide(POWERS_OF_TEN[step], remainder) < value) { overflow = true; return Int128::maximumValue(); - } else if (value < 0 && Int128::minimumValue().divide(POWERS_OF_TEN[step], remainder) > value) { + } else if (value < 0 && + Int128::minimumValue().divide(POWERS_OF_TEN[step], remainder) > value) { overflow = true; return Int128::minimumValue(); } @@ -501,4 +496,100 @@ 
namespace orc { return value; } -} + std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale, int32_t toPrecision, + int32_t toScale, bool round) { + if (toPrecision > MAX_PRECISION_128 || toPrecision < 1 || toScale < 0 || + toScale > toPrecision || fromScale < 0 || + std::abs(fromScale - toScale) > MAX_PRECISION_128) { + std::stringstream buf; + buf << "Invalid argument: fromScale=" << fromScale << ", toPrecision=" << toPrecision + << ", toScale=" << toScale; + throw std::invalid_argument(buf.str()); + } + std::pair<bool, Int128> result; + bool negative = value < 0; + result.second = value.abs(); + result.first = false; + + Int128 upperBound = scaleUpInt128ByPowerOfTen(1, toPrecision, result.first); + int8_t roundOffset = 0; + int32_t deltaScale = fromScale - toScale; + + if (deltaScale > 0) { + Int128 scale = scaleUpInt128ByPowerOfTen(1, deltaScale, result.first), remainder; + result.second = result.second.divide(scale, remainder); + remainder *= 2; + if (round && remainder >= scale) { + upperBound -= 1; + roundOffset = 1; + } + } else if (deltaScale < 0) { + if (result.second > upperBound) { + result.first = true; + return result; + } + result.second = scaleUpInt128ByPowerOfTen(result.second, -deltaScale, result.first); + } + + if (result.second > upperBound) { + result.first = true; + return result; + } + + result.second += roundOffset; + if (negative) { + result.second *= -1; + } + return result; + } + + template <typename T> + std::enable_if_t<std::is_floating_point_v<T>, std::pair<bool, Int128>> convertDecimal( + T value, int32_t precision, int32_t scale) { + const static T upperbound = std::ldexp(static_cast<T>(1), 127); + const static T lowerbound = -upperbound; + + std::pair<bool, Int128> result = {false, 0}; + if (precision > MAX_PRECISION_128 || precision < 1 || scale > precision || scale < 0) { + result.first = true; + return result; + } + + if (std::isnan(value) || value <= lowerbound || value >= upperbound) { + result.first = true; + 
return result; + } + + bool isNegative = (value < 0); + Int128 i128, remainder; + value = std::fabs(value); + if (value >= std::ldexp(static_cast<T>(1.0), 64)) { + int64_t hi = static_cast<int64_t>(std::ldexp(value, -64)); + uint64_t lo = static_cast<uint64_t>(value - std::ldexp(static_cast<T>(hi), 64)); + i128 = Int128(hi, lo); + } else { + i128 = Int128(0, static_cast<uint64_t>(value)); + } + value = value - std::floor(value); + + bool overflow = false; + i128 = scaleUpInt128ByPowerOfTen(i128, scale, overflow); + if (overflow || i128 >= scaleUpInt128ByPowerOfTen(1, precision, overflow)) { + result.first = true; + return result; + } + + value = value * static_cast<T>(pow(10, scale)); + i128 += static_cast<int64_t>(std::round(value)); + if (isNegative) { + i128 = i128.negate(); + } + result.second = i128; + return result; + } + + template std::pair<bool, Int128> convertDecimal(float value, int32_t precision, int32_t scale); + + template std::pair<bool, Int128> convertDecimal(double value, int32_t precision, int32_t scale); + +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc b/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc index 21bf194fed..f494f4b651 100644 --- a/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc +++ b/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc @@ -1,15 +1,20 @@ /* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ #include "Adaptor.hh" @@ -24,8 +29,8 @@ namespace orc { static const int32_t DEC_64_TABLE[] = {0, 0, 0, -1, 0, 1, 2, 3}; static const int32_t SIZE_OF_SHORT = 2; - static const int32_t SIZE_OF_INT = 4; - static const int32_t SIZE_OF_LONG = 8; + static const int32_t SIZE_OF_INT = 4; + static const int32_t SIZE_OF_LONG = 8; static std::string toHex(uint64_t val) { std::ostringstream out; @@ -39,45 +44,37 @@ namespace orc { return out.str(); } - class MalformedInputException: public ParseError { - public: - MalformedInputException(int64_t off - ) :ParseError("MalformedInputException at " + - toString(off)) { - } + class MalformedInputException : public ParseError { + public: + MalformedInputException(int64_t off) + : ParseError("MalformedInputException at " + toString(off)) {} - MalformedInputException(int64_t off, const std::string& msg - ): ParseError("MalformedInputException " + msg + - " at " + toString(off)) { - } + MalformedInputException(int64_t off, const std::string& msg) + : ParseError("MalformedInputException " + msg + " at " + toString(off)) {} - MalformedInputException(const MalformedInputException& other - ): ParseError(other.what()) { - } + MalformedInputException(const 
MalformedInputException& other) : ParseError(other.what()) {} - virtual ~MalformedInputException() noexcept; + ~MalformedInputException() noexcept override; }; MalformedInputException::~MalformedInputException() noexcept { // PASS } - uint64_t lzoDecompress(const char *inputAddress, - const char *inputLimit, - char *outputAddress, - char *outputLimit) { + uint64_t lzoDecompress(const char* inputAddress, const char* inputLimit, char* outputAddress, + char* outputLimit) { // nothing compresses to nothing if (inputAddress == inputLimit) { return 0; } // maximum offset in buffers to which it's safe to write long-at-a-time - char * const fastOutputLimit = outputLimit - SIZE_OF_LONG; + char* const fastOutputLimit = outputLimit - SIZE_OF_LONG; // LZO can concat two blocks together so, decode until the input data is // consumed - const char *input = inputAddress; - char *output = outputAddress; + const char* input = inputAddress; + char* output = outputAddress; while (input < inputLimit) { // // Note: For safety some of the code below may stop decoding early or @@ -127,8 +124,7 @@ namespace orc { literalLength = 0xf; uint32_t nextByte = 0; - while (input < inputLimit && - (nextByte = *(input++) & 0xFF) == 0) { + while (input < inputLimit && (nextByte = *(input++) & 0xFF) == 0) { literalLength += 0xff; } literalLength += nextByte; @@ -191,8 +187,7 @@ namespace orc { matchLength = 0x7; int32_t nextByte = 0; - while (input < inputLimit && - (nextByte = *(input++) & 0xFF) == 0) { + while (input < inputLimit && (nextByte = *(input++) & 0xFF) == 0) { matchLength += 0xff; } matchLength += nextByte; @@ -231,8 +226,7 @@ namespace orc { matchLength = 0x1f; int nextByte = 0; - while (input < inputLimit && - (nextByte = *(input++) & 0xFF) == 0) { + while (input < inputLimit && (nextByte = *(input++) & 0xFF) == 0) { matchLength += 0xff; } matchLength += nextByte; @@ -276,8 +270,7 @@ namespace orc { literalLength = (command & 0x3); } else { throw MalformedInputException(input - 
inputAddress - 1, - "Invalid LZO command " + - toHex(command)); + "Invalid LZO command " + toHex(command)); } firstCommand = false; @@ -286,12 +279,11 @@ namespace orc { // lzo encodes match offset minus one matchOffset++; - char *matchAddress = output - matchOffset; - if (matchAddress < outputAddress || - output + matchLength > outputLimit) { + char* matchAddress = output - matchOffset; + if (matchAddress < outputAddress || output + matchLength > outputLimit) { throw MalformedInputException(input - inputAddress); } - char *matchOutputLimit = output + matchLength; + char* matchOutputLimit = output + matchLength; if (output > fastOutputLimit) { // slow match copy @@ -343,11 +335,11 @@ namespace orc { } } } - output = matchOutputLimit; // correction in case we over-copied + output = matchOutputLimit; // correction in case we over-copied } // copy literal - char *literalOutputLimit = output + literalLength; + char* literalOutputLimit = output + literalLength; if (literalOutputLimit > fastOutputLimit || input + literalLength > inputLimit - SIZE_OF_LONG) { if (literalOutputLimit > outputLimit) { @@ -373,8 +365,7 @@ namespace orc { lastLiteralLength = literalLength; } - if (input + SIZE_OF_SHORT > inputLimit && - *reinterpret_cast<const int16_t*>(input) != 0) { + if (input + SIZE_OF_SHORT > inputLimit && *reinterpret_cast<const int16_t*>(input) != 0) { throw MalformedInputException(input - inputAddress); } input += SIZE_OF_SHORT; @@ -383,4 +374,4 @@ namespace orc { return static_cast<uint64_t>(output - outputAddress); } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh b/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh index 9de8537dd8..a37ce8e582 100644 --- a/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh +++ b/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh @@ -33,10 +33,8 @@ namespace orc { * @param outputLimit one past the last byte of the output buffer * @result the number of bytes decompressed */ - uint64_t 
lzoDecompress(const char *inputAddress, - const char *inputLimit, - char *outputAddress, - char *outputLimit); -} + uint64_t lzoDecompress(const char* inputAddress, const char* inputLimit, char* outputAddress, + char* outputLimit); +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/src/MemoryPool.cc b/contrib/libs/apache/orc/c++/src/MemoryPool.cc index ecfb295bae..8c8837aa64 100644 --- a/contrib/libs/apache/orc/c++/src/MemoryPool.cc +++ b/contrib/libs/apache/orc/c++/src/MemoryPool.cc @@ -16,14 +16,14 @@ * limitations under the License. */ -#include "orc/Int128.hh" #include "orc/MemoryPool.hh" +#include "orc/Int128.hh" #include "Adaptor.hh" +#include <string.h> #include <cstdlib> #include <iostream> -#include <string.h> namespace orc { @@ -31,8 +31,8 @@ namespace orc { // PASS } - class MemoryPoolImpl: public MemoryPool { - public: + class MemoryPoolImpl : public MemoryPool { + public: virtual ~MemoryPoolImpl() override; char* malloc(uint64_t size) override; @@ -52,30 +52,26 @@ namespace orc { } template <class T> - DataBuffer<T>::DataBuffer(MemoryPool& pool, - uint64_t newSize - ): memoryPool(pool), - buf(nullptr), - currentSize(0), - currentCapacity(0) { - resize(newSize); + DataBuffer<T>::DataBuffer(MemoryPool& pool, uint64_t newSize) + : memoryPool(pool), buf(nullptr), currentSize(0), currentCapacity(0) { + reserve(newSize); + currentSize = newSize; } template <class T> - DataBuffer<T>::DataBuffer(DataBuffer<T>&& buffer - ) noexcept: - memoryPool(buffer.memoryPool), - buf(buffer.buf), - currentSize(buffer.currentSize), - currentCapacity(buffer.currentCapacity) { + DataBuffer<T>::DataBuffer(DataBuffer<T>&& buffer) noexcept + : memoryPool(buffer.memoryPool), + buf(buffer.buf), + currentSize(buffer.currentSize), + currentCapacity(buffer.currentCapacity) { buffer.buf = nullptr; buffer.currentSize = 0; buffer.currentCapacity = 0; } template <class T> - DataBuffer<T>::~DataBuffer(){ - for(uint64_t i=currentSize; i > 0; --i) { + 
DataBuffer<T>::~DataBuffer() { + for (uint64_t i = currentSize; i > 0; --i) { (buf + i - 1)->~T(); } if (buf) { @@ -87,11 +83,11 @@ namespace orc { void DataBuffer<T>::resize(uint64_t newSize) { reserve(newSize); if (currentSize > newSize) { - for(uint64_t i=currentSize; i > newSize; --i) { + for (uint64_t i = currentSize; i > newSize; --i) { (buf + i - 1)->~T(); } } else if (newSize > currentSize) { - for(uint64_t i=currentSize; i < newSize; ++i) { + for (uint64_t i = currentSize; i < newSize; ++i) { new (buf + i) T(); } } @@ -99,7 +95,7 @@ namespace orc { } template <class T> - void DataBuffer<T>::reserve(uint64_t newCapacity){ + void DataBuffer<T>::reserve(uint64_t newCapacity) { if (newCapacity > currentCapacity || !buf) { if (buf) { T* buf_old = buf; @@ -113,10 +109,23 @@ namespace orc { } } + template <class T> + void DataBuffer<T>::zeroOut() { + memset(buf, 0, sizeof(T) * currentCapacity); + } + + // Specializations for Int128 + template <> + void DataBuffer<Int128>::zeroOut() { + for (uint64_t i = 0; i < currentCapacity; ++i) { + new (buf + i) Int128(); + } + } + // Specializations for char template <> - DataBuffer<char>::~DataBuffer(){ + DataBuffer<char>::~DataBuffer() { if (buf) { memoryPool.free(reinterpret_cast<char*>(buf)); } @@ -134,7 +143,7 @@ namespace orc { // Specializations for char* template <> - DataBuffer<char*>::~DataBuffer(){ + DataBuffer<char*>::~DataBuffer() { if (buf) { memoryPool.free(reinterpret_cast<char*>(buf)); } @@ -152,7 +161,7 @@ namespace orc { // Specializations for double template <> - DataBuffer<double>::~DataBuffer(){ + DataBuffer<double>::~DataBuffer() { if (buf) { memoryPool.free(reinterpret_cast<char*>(buf)); } @@ -167,10 +176,28 @@ namespace orc { currentSize = newSize; } + // Specializations for float + + template <> + DataBuffer<float>::~DataBuffer() { + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<float>::resize(uint64_t newSize) { + reserve(newSize); + if 
(newSize > currentSize) { + memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(float)); + } + currentSize = newSize; + } + // Specializations for int64_t template <> - DataBuffer<int64_t>::~DataBuffer(){ + DataBuffer<int64_t>::~DataBuffer() { if (buf) { memoryPool.free(reinterpret_cast<char*>(buf)); } @@ -185,10 +212,64 @@ namespace orc { currentSize = newSize; } + // Specializations for int32_t + + template <> + DataBuffer<int32_t>::~DataBuffer() { + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<int32_t>::resize(uint64_t newSize) { + reserve(newSize); + if (newSize > currentSize) { + memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int32_t)); + } + currentSize = newSize; + } + + // Specializations for int16_t + + template <> + DataBuffer<int16_t>::~DataBuffer() { + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<int16_t>::resize(uint64_t newSize) { + reserve(newSize); + if (newSize > currentSize) { + memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int16_t)); + } + currentSize = newSize; + } + + // Specializations for int8_t + + template <> + DataBuffer<int8_t>::~DataBuffer() { + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<int8_t>::resize(uint64_t newSize) { + reserve(newSize); + if (newSize > currentSize) { + memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int8_t)); + } + currentSize = newSize; + } + // Specializations for uint64_t template <> - DataBuffer<uint64_t>::~DataBuffer(){ + DataBuffer<uint64_t>::~DataBuffer() { if (buf) { memoryPool.free(reinterpret_cast<char*>(buf)); } @@ -206,7 +287,7 @@ namespace orc { // Specializations for unsigned char template <> - DataBuffer<unsigned char>::~DataBuffer(){ + DataBuffer<unsigned char>::~DataBuffer() { if (buf) { memoryPool.free(reinterpret_cast<char*>(buf)); } @@ -221,24 +302,28 @@ namespace orc { 
currentSize = newSize; } - #ifdef __clang__ - #pragma clang diagnostic ignored "-Wweak-template-vtables" - #endif +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wweak-template-vtables" +#endif template class DataBuffer<char>; template class DataBuffer<char*>; template class DataBuffer<double>; + template class DataBuffer<float>; template class DataBuffer<Int128>; template class DataBuffer<int64_t>; + template class DataBuffer<int32_t>; + template class DataBuffer<int16_t>; + template class DataBuffer<int8_t>; template class DataBuffer<uint64_t>; template class DataBuffer<unsigned char>; - #ifdef __clang__ - #pragma clang diagnostic ignored "-Wexit-time-destructors" - #endif +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wexit-time-destructors" +#endif MemoryPool* getDefaultPool() { static MemoryPoolImpl internal; return &internal; } -} // namespace orc +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/Murmur3.cc b/contrib/libs/apache/orc/c++/src/Murmur3.cc index b45bd6d492..518e5e6de5 100644 --- a/contrib/libs/apache/orc/c++/src/Murmur3.cc +++ b/contrib/libs/apache/orc/c++/src/Murmur3.cc @@ -16,14 +16,14 @@ * limitations under the License. 
*/ -#include "Adaptor.hh" #include "Murmur3.hh" +#include "Adaptor.hh" #define ROTL64(x, r) ((x << r) | (x >> (64 - r))) namespace orc { - inline uint64_t rotl64 ( uint64_t x, int8_t r ) { + inline uint64_t rotl64(uint64_t x, int8_t r) { return (x << r) | (x >> (64 - r)); } @@ -36,17 +36,17 @@ namespace orc { return value; } - uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len) { + uint64_t Murmur3::hash64(const uint8_t* data, uint32_t len) { return hash64(data, len, DEFAULT_SEED); } DIAGNOSTIC_PUSH #if defined(__clang__) - DIAGNOSTIC_IGNORE("-Wimplicit-fallthrough") + DIAGNOSTIC_IGNORE("-Wimplicit-fallthrough") #endif - uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len, uint32_t seed) { + uint64_t Murmur3::hash64(const uint8_t* data, uint32_t len, uint32_t seed) { uint64_t h = seed; uint32_t blocks = len >> 3; @@ -69,16 +69,22 @@ namespace orc { switch (len - idx) { case 7: k ^= static_cast<uint64_t>(data[idx + 6]) << 48; + [[fallthrough]]; case 6: k ^= static_cast<uint64_t>(data[idx + 5]) << 40; + [[fallthrough]]; case 5: k ^= static_cast<uint64_t>(data[idx + 4]) << 32; + [[fallthrough]]; case 4: k ^= static_cast<uint64_t>(data[idx + 3]) << 24; + [[fallthrough]]; case 3: k ^= static_cast<uint64_t>(data[idx + 2]) << 16; + [[fallthrough]]; case 2: k ^= static_cast<uint64_t>(data[idx + 1]) << 8; + [[fallthrough]]; case 1: k ^= static_cast<uint64_t>(data[idx + 0]); @@ -95,4 +101,4 @@ namespace orc { DIAGNOSTIC_POP -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/Murmur3.hh b/contrib/libs/apache/orc/c++/src/Murmur3.hh index 02391811b0..e3db8654bf 100644 --- a/contrib/libs/apache/orc/c++/src/Murmur3.hh +++ b/contrib/libs/apache/orc/c++/src/Murmur3.hh @@ -24,17 +24,17 @@ namespace orc { class Murmur3 { - public: + public: static const uint32_t DEFAULT_SEED = 104729; static const uint64_t NULL_HASHCODE = 2862933555777941757LL; - static uint64_t hash64(const uint8_t *data, uint32_t len); + static uint64_t hash64(const uint8_t* data, 
uint32_t len); - private: + private: static uint64_t fmix64(uint64_t value); static uint64_t hash64(const uint8_t* data, uint32_t len, uint32_t seed); }; -} +} // namespace orc -#endif //ORC_MURMUR3_HH +#endif // ORC_MURMUR3_HH diff --git a/contrib/libs/apache/orc/c++/src/Options.hh b/contrib/libs/apache/orc/c++/src/Options.hh index d8331b3c0a..51cd8efd64 100644 --- a/contrib/libs/apache/orc/c++/src/Options.hh +++ b/contrib/libs/apache/orc/c++/src/Options.hh @@ -34,31 +34,30 @@ namespace orc { ColumnSelection_TYPE_IDS = 3, }; -/** - * ReaderOptions Implementation - */ + /** + * ReaderOptions Implementation + */ struct ReaderOptionsPrivate { uint64_t tailLocation; std::ostream* errorStream; MemoryPool* memoryPool; std::string serializedTail; + ReaderMetrics* metrics; ReaderOptionsPrivate() { tailLocation = std::numeric_limits<uint64_t>::max(); errorStream = &std::cerr; memoryPool = getDefaultPool(); + metrics = nullptr; } }; - ReaderOptions::ReaderOptions(): - privateBits(std::unique_ptr<ReaderOptionsPrivate> - (new ReaderOptionsPrivate())) { + ReaderOptions::ReaderOptions() : privateBits(std::make_unique<ReaderOptionsPrivate>()) { // PASS } - ReaderOptions::ReaderOptions(const ReaderOptions& rhs): - privateBits(std::unique_ptr<ReaderOptionsPrivate> - (new ReaderOptionsPrivate(*(rhs.privateBits.get())))) { + ReaderOptions::ReaderOptions(const ReaderOptions& rhs) + : privateBits(std::make_unique<ReaderOptionsPrivate>(*(rhs.privateBits.get()))) { // PASS } @@ -83,10 +82,19 @@ namespace orc { return *this; } - MemoryPool* ReaderOptions::getMemoryPool() const{ + MemoryPool* ReaderOptions::getMemoryPool() const { return privateBits->memoryPool; } + ReaderOptions& ReaderOptions::setReaderMetrics(ReaderMetrics* metrics) { + privateBits->metrics = metrics; + return *this; + } + + ReaderMetrics* ReaderOptions::getReaderMetrics() const { + return privateBits->metrics; + } + ReaderOptions& ReaderOptions::setTailLocation(uint64_t offset) { privateBits->tailLocation = offset; 
return *this; @@ -96,8 +104,7 @@ namespace orc { return privateBits->tailLocation; } - ReaderOptions& ReaderOptions::setSerializedFileTail(const std::string& value - ) { + ReaderOptions& ReaderOptions::setSerializedFileTail(const std::string& value) { privateBits->serializedTail = value; return *this; } @@ -115,9 +122,9 @@ namespace orc { return privateBits->errorStream; } -/** - * RowReaderOptions Implementation - */ + /** + * RowReaderOptions Implementation + */ struct RowReaderOptionsPrivate { ColumnSelection selection; @@ -131,6 +138,9 @@ namespace orc { std::shared_ptr<SearchArgument> sargs; std::string readerTimezone; RowReaderOptions::IdReadIntentMap idReadIntentMap; + bool useTightNumericVector; + std::shared_ptr<Type> readType; + bool throwOnSchemaEvolutionOverflow; RowReaderOptionsPrivate() { selection = ColumnSelection_NONE; @@ -140,18 +150,17 @@ namespace orc { forcedScaleOnHive11Decimal = 6; enableLazyDecoding = false; readerTimezone = "GMT"; + useTightNumericVector = false; + throwOnSchemaEvolutionOverflow = false; } }; - RowReaderOptions::RowReaderOptions(): - privateBits(std::unique_ptr<RowReaderOptionsPrivate> - (new RowReaderOptionsPrivate())) { + RowReaderOptions::RowReaderOptions() : privateBits(std::make_unique<RowReaderOptionsPrivate>()) { // PASS } - RowReaderOptions::RowReaderOptions(const RowReaderOptions& rhs): - privateBits(std::unique_ptr<RowReaderOptionsPrivate> - (new RowReaderOptionsPrivate(*(rhs.privateBits.get())))) { + RowReaderOptions::RowReaderOptions(const RowReaderOptions& rhs) + : privateBits(std::make_unique<RowReaderOptionsPrivate>(*(rhs.privateBits.get()))) { // PASS } @@ -195,8 +204,8 @@ namespace orc { return *this; } - RowReaderOptions& - RowReaderOptions::includeTypesWithIntents(const IdReadIntentMap& idReadIntentMap) { + RowReaderOptions& RowReaderOptions::includeTypesWithIntents( + const IdReadIntentMap& idReadIntentMap) { privateBits->selection = ColumnSelection_TYPE_IDS; privateBits->includedColumnIndexes.clear(); 
privateBits->idReadIntentMap.clear(); @@ -242,7 +251,7 @@ namespace orc { return privateBits->dataLength; } - RowReaderOptions& RowReaderOptions::throwOnHive11DecimalOverflow(bool shouldThrow){ + RowReaderOptions& RowReaderOptions::throwOnHive11DecimalOverflow(bool shouldThrow) { privateBits->throwOnHive11DecimalOverflow = shouldThrow; return *this; } @@ -251,8 +260,16 @@ namespace orc { return privateBits->throwOnHive11DecimalOverflow; } - RowReaderOptions& RowReaderOptions::forcedScaleOnHive11Decimal(int32_t forcedScale - ) { + RowReaderOptions& RowReaderOptions::throwOnSchemaEvolutionOverflow(bool shouldThrow) { + privateBits->throwOnSchemaEvolutionOverflow = shouldThrow; + return *this; + } + + bool RowReaderOptions::getThrowOnSchemaEvolutionOverflow() const { + return privateBits->throwOnSchemaEvolutionOverflow; + } + + RowReaderOptions& RowReaderOptions::forcedScaleOnHive11Decimal(int32_t forcedScale) { privateBits->forcedScaleOnHive11Decimal = forcedScale; return *this; } @@ -288,10 +305,27 @@ namespace orc { return privateBits->readerTimezone; } - const RowReaderOptions::IdReadIntentMap - RowReaderOptions::getIdReadIntentMap() const { + const RowReaderOptions::IdReadIntentMap RowReaderOptions::getIdReadIntentMap() const { return privateBits->idReadIntentMap; } -} + + RowReaderOptions& RowReaderOptions::setUseTightNumericVector(bool useTightNumericVector) { + privateBits->useTightNumericVector = useTightNumericVector; + return *this; + } + + bool RowReaderOptions::getUseTightNumericVector() const { + return privateBits->useTightNumericVector; + } + + RowReaderOptions& RowReaderOptions::setReadType(std::shared_ptr<Type> type) { + privateBits->readType = std::move(type); + return *this; + } + + std::shared_ptr<Type>& RowReaderOptions::getReadType() const { + return privateBits->readType; + } +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/src/OrcFile.cc b/contrib/libs/apache/orc/c++/src/OrcFile.cc index a0158bbadf..d4b6a86e2f 100644 --- 
a/contrib/libs/apache/orc/c++/src/OrcFile.cc +++ b/contrib/libs/apache/orc/c++/src/OrcFile.cc @@ -16,15 +16,16 @@ * limitations under the License. */ -#include "Adaptor.hh" #include "orc/OrcFile.hh" +#include "Adaptor.hh" +#include "Utils.hh" #include "orc/Exceptions.hh" #include <errno.h> #include <fcntl.h> #include <stdio.h> -#include <sys/stat.h> #include <string.h> +#include <sys/stat.h> #ifdef _MSC_VER #include <io.h> @@ -32,6 +33,7 @@ #define S_IWUSR _S_IWRITE #define stat _stat64 #define fstat _fstat64 +#define fsync _commit #else #include <unistd.h> #define O_BINARY 0 @@ -39,15 +41,22 @@ namespace orc { + DIAGNOSTIC_PUSH + +#ifdef __clang__ + DIAGNOSTIC_IGNORE("-Wunused-private-field") +#endif + class FileInputStream : public InputStream { - private: + private: std::string filename; int file; uint64_t totalLength; + ReaderMetrics* metrics; - public: - FileInputStream(std::string _filename) { - filename = _filename; + public: + FileInputStream(std::string _filename, ReaderMetrics* _metrics) + : filename(_filename), metrics(_metrics) { file = open(filename.c_str(), O_BINARY | O_RDONLY); if (file == -1) { throw ParseError("Can't open " + filename); @@ -69,9 +78,8 @@ namespace orc { return 128 * 1024; } - void read(void* buf, - uint64_t length, - uint64_t offset) override { + void read(void* buf, uint64_t length, uint64_t offset) override { + SCOPED_STOPWATCH(metrics, IOBlockingLatencyUs, IOCount); if (!buf) { throw ParseError("Buffer is null"); } @@ -94,42 +102,41 @@ namespace orc { close(file); } - std::unique_ptr<InputStream> readFile(const std::string& path) { + std::unique_ptr<InputStream> readFile(const std::string& path, ReaderMetrics* metrics) { #ifdef BUILD_LIBHDFSPP - if(strncmp (path.c_str(), "hdfs://", 7) == 0){ - return orc::readHdfsFile(std::string(path)); + if (strncmp(path.c_str(), "hdfs://", 7) == 0) { + return orc::readHdfsFile(std::string(path), metrics); } else { #endif - return orc::readLocalFile(std::string(path)); + return 
orc::readLocalFile(std::string(path), metrics); #ifdef BUILD_LIBHDFSPP - } + } #endif } - std::unique_ptr<InputStream> readLocalFile(const std::string& path) { - return std::unique_ptr<InputStream>(new FileInputStream(path)); + DIAGNOSTIC_POP + + std::unique_ptr<InputStream> readLocalFile(const std::string& path, ReaderMetrics* metrics) { + return std::make_unique<FileInputStream>(path, metrics); } - OutputStream::~OutputStream() { + OutputStream::~OutputStream(){ // PASS }; class FileOutputStream : public OutputStream { - private: + private: std::string filename; int file; uint64_t bytesWritten; bool closed; - public: + public: FileOutputStream(std::string _filename) { bytesWritten = 0; filename = _filename; closed = false; - file = open( - filename.c_str(), - O_BINARY | O_CREAT | O_WRONLY | O_TRUNC, - S_IRUSR | S_IWUSR); + file = open(filename.c_str(), O_BINARY | O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR); if (file == -1) { throw ParseError("Can't open " + filename); } @@ -169,6 +176,12 @@ namespace orc { closed = true; } } + + void flush() override { + if (!closed) { + ::fsync(file); + } + } }; FileOutputStream::~FileOutputStream() { @@ -179,6 +192,6 @@ namespace orc { } std::unique_ptr<OutputStream> writeLocalFile(const std::string& path) { - return std::unique_ptr<OutputStream>(new FileOutputStream(path)); + return std::make_unique<FileOutputStream>(path); } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/RLE.cc b/contrib/libs/apache/orc/c++/src/RLE.cc index 21f9082216..89aca6a10e 100644 --- a/contrib/libs/apache/orc/c++/src/RLE.cc +++ b/contrib/libs/apache/orc/c++/src/RLE.cc @@ -1,20 +1,20 @@ /** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. 
The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include "RLEv1.hh" #include "RLEv2.hh" @@ -30,52 +30,53 @@ namespace orc { // PASS } - std::unique_ptr<RleEncoder> createRleEncoder - (std::unique_ptr<BufferedOutputStream> output, - bool isSigned, - RleVersion version, - MemoryPool&, - bool alignedBitpacking) { + std::unique_ptr<RleEncoder> createRleEncoder(std::unique_ptr<BufferedOutputStream> output, + bool isSigned, RleVersion version, MemoryPool&, + bool alignedBitpacking) { switch (static_cast<int64_t>(version)) { - case RleVersion_1: - // We don't have std::make_unique() yet. 
- return std::unique_ptr<RleEncoder>(new RleEncoderV1(std::move(output), - isSigned)); - case RleVersion_2: - return std::unique_ptr<RleEncoder>(new RleEncoderV2(std::move(output), - isSigned, alignedBitpacking)); - default: - throw NotImplementedYet("Not implemented yet"); + case RleVersion_1: + return std::make_unique<RleEncoderV1>(std::move(output), isSigned); + case RleVersion_2: + return std::make_unique<RleEncoderV2>(std::move(output), isSigned, alignedBitpacking); + default: + throw NotImplementedYet("Not implemented yet"); } } - std::unique_ptr<RleDecoder> createRleDecoder - (std::unique_ptr<SeekableInputStream> input, - bool isSigned, - RleVersion version, - MemoryPool& pool) { + std::unique_ptr<RleDecoder> createRleDecoder(std::unique_ptr<SeekableInputStream> input, + bool isSigned, RleVersion version, MemoryPool& pool, + ReaderMetrics* metrics) { switch (static_cast<int64_t>(version)) { - case RleVersion_1: - // We don't have std::make_unique() yet. - return std::unique_ptr<RleDecoder>(new RleDecoderV1(std::move(input), - isSigned)); - case RleVersion_2: - return std::unique_ptr<RleDecoder>(new RleDecoderV2(std::move(input), - isSigned, pool)); - default: - throw NotImplementedYet("Not implemented yet"); + case RleVersion_1: + return std::make_unique<RleDecoderV1>(std::move(input), isSigned, metrics); + case RleVersion_2: + return std::make_unique<RleDecoderV2>(std::move(input), isSigned, pool, metrics); + default: + throw NotImplementedYet("Not implemented yet"); } } - void RleEncoder::add(const int64_t* data, uint64_t numValues, - const char* notNull) { + template <typename T> + void RleEncoder::add(const T* data, uint64_t numValues, const char* notNull) { for (uint64_t i = 0; i < numValues; ++i) { if (!notNull || notNull[i]) { - write(data[i]); + write(static_cast<int64_t>(data[i])); } } } + void RleEncoder::add(const int64_t* data, uint64_t numValues, const char* notNull) { + add<int64_t>(data, numValues, notNull); + } + + void RleEncoder::add(const 
int32_t* data, uint64_t numValues, const char* notNull) { + add<int32_t>(data, numValues, notNull); + } + + void RleEncoder::add(const int16_t* data, uint64_t numValues, const char* notNull) { + add<int16_t>(data, numValues, notNull); + } + void RleEncoder::writeVslong(int64_t val) { writeVulong((val << 1) ^ (val >> 63)); } @@ -96,7 +97,7 @@ namespace orc { void RleEncoder::writeByte(char c) { if (bufferPosition == bufferLength) { int addedSize = 0; - if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) { + if (!outputStream->Next(reinterpret_cast<void**>(&buffer), &addedSize)) { throw std::bad_alloc(); } bufferPosition = 0; diff --git a/contrib/libs/apache/orc/c++/src/RLE.hh b/contrib/libs/apache/orc/c++/src/RLE.hh index 6822bd812e..51f9b6f58a 100644 --- a/contrib/libs/apache/orc/c++/src/RLE.hh +++ b/contrib/libs/apache/orc/c++/src/RLE.hh @@ -35,20 +35,18 @@ namespace orc { } class RleEncoder { - public: + public: // must be non-inline! virtual ~RleEncoder(); - RleEncoder( - std::unique_ptr<BufferedOutputStream> outStream, - bool hasSigned): - outputStream(std::move(outStream)), - bufferPosition(0), - bufferLength(0), - numLiterals(0), - isSigned(hasSigned), - buffer(nullptr){ - //pass + RleEncoder(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned) + : outputStream(std::move(outStream)), + bufferPosition(0), + bufferLength(0), + numLiterals(0), + isSigned(hasSigned), + buffer(nullptr) { + // pass } /** @@ -58,14 +56,19 @@ namespace orc { * @param notNull If the pointer is null, all values are read. If the * pointer is not null, positions that are false are skipped. 
*/ - virtual void add(const int64_t* data, uint64_t numValues, - const char* notNull); + template <typename T> + void add(const T* data, uint64_t numValues, const char* notNull); + virtual void add(const int64_t* data, uint64_t numValues, const char* notNull); + + virtual void add(const int32_t* data, uint64_t numValues, const char* notNull); + + virtual void add(const int16_t* data, uint64_t numValues, const char* notNull); /** * Get size of buffer used so far. */ uint64_t getBufferSize() const { - return outputStream->getSize(); + return outputStream->getSize(); } /** @@ -81,7 +84,7 @@ namespace orc { virtual void write(int64_t val) = 0; - protected: + protected: std::unique_ptr<BufferedOutputStream> outputStream; size_t bufferPosition; size_t bufferLength; @@ -98,10 +101,14 @@ namespace orc { }; class RleDecoder { - public: + public: // must be non-inline! virtual ~RleDecoder(); + RleDecoder(ReaderMetrics* _metrics) : metrics(_metrics) { + // pass + } + /** * Seek to a particular spot. */ @@ -119,8 +126,14 @@ namespace orc { * @param notNull If the pointer is null, all values are read. If the * pointer is not null, positions that are false are skipped. 
*/ - virtual void next(int64_t* data, uint64_t numValues, - const char* notNull) = 0; + virtual void next(int64_t* data, uint64_t numValues, const char* notNull) = 0; + + virtual void next(int32_t* data, uint64_t numValues, const char* notNull) = 0; + + virtual void next(int16_t* data, uint64_t numValues, const char* notNull) = 0; + + protected: + ReaderMetrics* metrics; }; /** @@ -130,12 +143,9 @@ namespace orc { * @param version version of RLE decoding to do * @param pool memory pool to use for allocation */ - std::unique_ptr<RleEncoder> createRleEncoder - (std::unique_ptr<BufferedOutputStream> output, - bool isSigned, - RleVersion version, - MemoryPool& pool, - bool alignedBitpacking); + std::unique_ptr<RleEncoder> createRleEncoder(std::unique_ptr<BufferedOutputStream> output, + bool isSigned, RleVersion version, MemoryPool& pool, + bool alignedBitpacking); /** * Create an RLE decoder. @@ -144,11 +154,9 @@ namespace orc { * @param version version of RLE decoding to do * @param pool memory pool to use for allocation */ - std::unique_ptr<RleDecoder> createRleDecoder - (std::unique_ptr<SeekableInputStream> input, - bool isSigned, - RleVersion version, - MemoryPool& pool); + std::unique_ptr<RleDecoder> createRleDecoder(std::unique_ptr<SeekableInputStream> input, + bool isSigned, RleVersion version, MemoryPool& pool, + ReaderMetrics* metrics); } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/RLEV2Util.cc b/contrib/libs/apache/orc/c++/src/RLEV2Util.cc index 12e2d057cd..be2c6e2875 100644 --- a/contrib/libs/apache/orc/c++/src/RLEV2Util.cc +++ b/contrib/libs/apache/orc/c++/src/RLEV2Util.cc @@ -1,19 +1,20 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file - * distributed with option work for additional information - * regarding copyright ownership. The ASF licenses option file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file * to you under the Apache License, Version 2.0 (the - * "License"); you may not use option file except in compliance + * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ #include "RLEV2Util.hh" @@ -21,50 +22,44 @@ namespace orc { // Map FBS enum to bit width value. - const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 26, 28, 30, 32, 40, 48, 56, 64 - }; + const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 26, 28, 30, 32, 40, 48, 56, 64}; // Map bit length i to closest fixed bit width that can contain i bits. 
const uint8_t ClosestFixedBitsMap[65] = { - 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 26, 26, 28, 28, 30, 30, 32, 32, - 40, 40, 40, 40, 40, 40, 40, 40, - 48, 48, 48, 48, 48, 48, 48, 48, - 56, 56, 56, 56, 56, 56, 56, 56, - 64, 64, 64, 64, 64, 64, 64, 64 - }; + 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 26, 26, 28, 28, 30, 30, 32, 32, 40, 40, 40, 40, 40, 40, 40, 40, 48, 48, 48, + 48, 48, 48, 48, 48, 56, 56, 56, 56, 56, 56, 56, 56, 64, 64, 64, 64, 64, 64, 64, 64}; // Map bit length i to closest aligned fixed bit width that can contain i bits. const uint8_t ClosestAlignedFixedBitsMap[65] = { - 1, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16, 24, 24, 24, 24, 24, 24, 24, 24, - 32, 32, 32, 32, 32, 32, 32, 32, - 40, 40, 40, 40, 40, 40, 40, 40, - 48, 48, 48, 48, 48, 48, 48, 48, - 56, 56, 56, 56, 56, 56, 56, 56, - 64, 64, 64, 64, 64, 64, 64, 64 - }; + 1, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16, 24, 24, 24, 24, 24, + 24, 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 40, 40, 40, 48, 48, 48, + 48, 48, 48, 48, 48, 56, 56, 56, 56, 56, 56, 56, 56, 64, 64, 64, 64, 64, 64, 64, 64}; // Map bit width to FBS enum. 
const uint8_t BitWidthToFBSMap[65] = { - FixedBitSizes::ONE, FixedBitSizes::ONE, FixedBitSizes::TWO, FixedBitSizes::THREE, FixedBitSizes::FOUR, - FixedBitSizes::FIVE, FixedBitSizes::SIX, FixedBitSizes::SEVEN, FixedBitSizes::EIGHT, - FixedBitSizes::NINE, FixedBitSizes::TEN, FixedBitSizes::ELEVEN, FixedBitSizes::TWELVE, - FixedBitSizes::THIRTEEN, FixedBitSizes::FOURTEEN, FixedBitSizes::FIFTEEN, FixedBitSizes::SIXTEEN, - FixedBitSizes::SEVENTEEN, FixedBitSizes::EIGHTEEN, FixedBitSizes::NINETEEN, FixedBitSizes::TWENTY, - FixedBitSizes::TWENTYONE, FixedBitSizes::TWENTYTWO, FixedBitSizes::TWENTYTHREE, FixedBitSizes::TWENTYFOUR, - FixedBitSizes::TWENTYSIX, FixedBitSizes::TWENTYSIX, - FixedBitSizes::TWENTYEIGHT, FixedBitSizes::TWENTYEIGHT, - FixedBitSizes::THIRTY, FixedBitSizes::THIRTY, - FixedBitSizes::THIRTYTWO, FixedBitSizes::THIRTYTWO, - FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, - FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, - FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, - FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, - FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, - FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, - FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, - FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR - }; -} + FixedBitSizes::ONE, FixedBitSizes::ONE, FixedBitSizes::TWO, + FixedBitSizes::THREE, FixedBitSizes::FOUR, FixedBitSizes::FIVE, + FixedBitSizes::SIX, FixedBitSizes::SEVEN, FixedBitSizes::EIGHT, + FixedBitSizes::NINE, FixedBitSizes::TEN, FixedBitSizes::ELEVEN, + FixedBitSizes::TWELVE, FixedBitSizes::THIRTEEN, 
FixedBitSizes::FOURTEEN, + FixedBitSizes::FIFTEEN, FixedBitSizes::SIXTEEN, FixedBitSizes::SEVENTEEN, + FixedBitSizes::EIGHTEEN, FixedBitSizes::NINETEEN, FixedBitSizes::TWENTY, + FixedBitSizes::TWENTYONE, FixedBitSizes::TWENTYTWO, FixedBitSizes::TWENTYTHREE, + FixedBitSizes::TWENTYFOUR, FixedBitSizes::TWENTYSIX, FixedBitSizes::TWENTYSIX, + FixedBitSizes::TWENTYEIGHT, FixedBitSizes::TWENTYEIGHT, FixedBitSizes::THIRTY, + FixedBitSizes::THIRTY, FixedBitSizes::THIRTYTWO, FixedBitSizes::THIRTYTWO, + FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, + FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, + FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTYEIGHT, + FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, + FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, + FixedBitSizes::FORTYEIGHT, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, + FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, + FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, + FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, + FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, + FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR}; +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/RLEV2Util.hh b/contrib/libs/apache/orc/c++/src/RLEV2Util.hh index 95a6826eaa..89c6913400 100644 --- a/contrib/libs/apache/orc/c++/src/RLEV2Util.hh +++ b/contrib/libs/apache/orc/c++/src/RLEV2Util.hh @@ -1,20 +1,20 @@ /** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. 
You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef ORC_RLEV2UTIL_HH #define ORC_RLEV2UTIL_HH @@ -74,8 +74,8 @@ namespace orc { } inline uint32_t RleEncoderV2::getOpCode(EncodingType encoding) { - return static_cast<uint32_t >(encoding << 6); + return static_cast<uint32_t>(encoding << 6); } -} +} // namespace orc -#endif //ORC_RLEV2UTIL_HH +#endif // ORC_RLEV2UTIL_HH diff --git a/contrib/libs/apache/orc/c++/src/RLEv1.cc b/contrib/libs/apache/orc/c++/src/RLEv1.cc index fe333978db..b221e8b8aa 100644 --- a/contrib/libs/apache/orc/c++/src/RLEv1.cc +++ b/contrib/libs/apache/orc/c++/src/RLEv1.cc @@ -16,287 +16,295 @@ * limitations under the License. 
*/ +#include "RLEv1.hh" #include "Adaptor.hh" #include "Compression.hh" +#include "Utils.hh" #include "orc/Exceptions.hh" -#include "RLEv1.hh" #include <algorithm> namespace orc { -const uint64_t MINIMUM_REPEAT = 3; -const uint64_t MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT; + const uint64_t MINIMUM_REPEAT = 3; + const uint64_t MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT; -const int64_t BASE_128_MASK = 0x7f; + const int64_t BASE_128_MASK = 0x7f; -const int64_t MAX_DELTA = 127; -const int64_t MIN_DELTA = -128; -const uint64_t MAX_LITERAL_SIZE = 128; + const int64_t MAX_DELTA = 127; + const int64_t MIN_DELTA = -128; + const uint64_t MAX_LITERAL_SIZE = 128; -RleEncoderV1::RleEncoderV1( - std::unique_ptr<BufferedOutputStream> outStream, - bool hasSigned): - RleEncoder(std::move(outStream), hasSigned) { - literals = new int64_t[MAX_LITERAL_SIZE]; - delta = 0; - repeat = false; - tailRunLength = 0; -} + RleEncoderV1::RleEncoderV1(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned) + : RleEncoder(std::move(outStream), hasSigned) { + literals = new int64_t[MAX_LITERAL_SIZE]; + delta = 0; + repeat = false; + tailRunLength = 0; + } -RleEncoderV1::~RleEncoderV1() { - delete [] literals; -} + RleEncoderV1::~RleEncoderV1() { + delete[] literals; + } -void RleEncoderV1::writeValues() { - if (numLiterals != 0) { - if (repeat) { - writeByte(static_cast<char> - (static_cast<uint64_t>(numLiterals) - MINIMUM_REPEAT)); - writeByte(static_cast<char>(delta)); - if (isSigned) { - writeVslong(literals[0]); - } else { - writeVulong(literals[0]); - } - } else { - writeByte(static_cast<char>(-numLiterals)); - for(size_t i=0; i < numLiterals; ++i) { + void RleEncoderV1::writeValues() { + if (numLiterals != 0) { + if (repeat) { + writeByte(static_cast<char>(static_cast<uint64_t>(numLiterals) - MINIMUM_REPEAT)); + writeByte(static_cast<char>(delta)); if (isSigned) { - writeVslong(literals[i]); + writeVslong(literals[0]); } else { - writeVulong(literals[i]); + writeVulong(literals[0]); + } + } 
else { + writeByte(static_cast<char>(-numLiterals)); + for (size_t i = 0; i < numLiterals; ++i) { + if (isSigned) { + writeVslong(literals[i]); + } else { + writeVulong(literals[i]); + } } } + repeat = false; + numLiterals = 0; + tailRunLength = 0; } - repeat = false; - numLiterals = 0; - tailRunLength = 0; } -} -uint64_t RleEncoderV1::flush() { - writeValues(); - outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition)); - uint64_t dataSize = outputStream->flush(); - bufferLength = bufferPosition = 0; - return dataSize; -} + uint64_t RleEncoderV1::flush() { + writeValues(); + outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition)); + uint64_t dataSize = outputStream->flush(); + bufferLength = bufferPosition = 0; + return dataSize; + } -void RleEncoderV1::write(int64_t value) { - if (numLiterals == 0) { - literals[numLiterals++] = value; - tailRunLength = 1; - } else if (repeat) { - if (value == literals[0] + delta * static_cast<int64_t>(numLiterals)) { - numLiterals += 1; - if (numLiterals == MAXIMUM_REPEAT) { - writeValues(); - } - } else { - writeValues(); + void RleEncoderV1::write(int64_t value) { + if (numLiterals == 0) { literals[numLiterals++] = value; tailRunLength = 1; - } - } else { - if (tailRunLength == 1) { - delta = value - literals[numLiterals - 1]; - if (delta < MIN_DELTA || delta > MAX_DELTA) { - tailRunLength = 1; + } else if (repeat) { + if (value == literals[0] + delta * static_cast<int64_t>(numLiterals)) { + numLiterals += 1; + if (numLiterals == MAXIMUM_REPEAT) { + writeValues(); + } } else { - tailRunLength = 2; + writeValues(); + literals[numLiterals++] = value; + tailRunLength = 1; } - } else if (value == literals[numLiterals - 1] + delta) { - tailRunLength += 1; } else { - delta = value - literals[numLiterals - 1]; - if (delta < MIN_DELTA || delta > MAX_DELTA) { - tailRunLength = 1; + if (tailRunLength == 1) { + delta = value - literals[numLiterals - 1]; + if (delta < MIN_DELTA || delta > MAX_DELTA) { + 
tailRunLength = 1; + } else { + tailRunLength = 2; + } + } else if (value == literals[numLiterals - 1] + delta) { + tailRunLength += 1; } else { - tailRunLength = 2; + delta = value - literals[numLiterals - 1]; + if (delta < MIN_DELTA || delta > MAX_DELTA) { + tailRunLength = 1; + } else { + tailRunLength = 2; + } } - } - if (tailRunLength == MINIMUM_REPEAT) { - if (numLiterals + 1 == MINIMUM_REPEAT) { - repeat = true; - numLiterals += 1; + if (tailRunLength == MINIMUM_REPEAT) { + if (numLiterals + 1 == MINIMUM_REPEAT) { + repeat = true; + numLiterals += 1; + } else { + numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1); + int64_t base = literals[numLiterals]; + writeValues(); + literals[0] = base; + repeat = true; + numLiterals = MINIMUM_REPEAT; + } } else { - numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1); - int64_t base = literals[numLiterals]; - writeValues(); - literals[0] = base; - repeat = true; - numLiterals = MINIMUM_REPEAT; + literals[numLiterals++] = value; + if (numLiterals == MAX_LITERAL_SIZE) { + writeValues(); + } } - } else { - literals[numLiterals++] = value; - if (numLiterals == MAX_LITERAL_SIZE) { - writeValues(); + } + } + + signed char RleDecoderV1::readByte() { + SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs); + if (bufferStart == bufferEnd) { + int bufferLength; + const void* bufferPointer; + if (!inputStream->Next(&bufferPointer, &bufferLength)) { + throw ParseError("bad read in readByte"); } + bufferStart = static_cast<const char*>(bufferPointer); + bufferEnd = bufferStart + bufferLength; } + return static_cast<signed char>(*(bufferStart++)); } -} -signed char RleDecoderV1::readByte() { - if (bufferStart == bufferEnd) { - int bufferLength; - const void* bufferPointer; - if (!inputStream->Next(&bufferPointer, &bufferLength)) { - throw ParseError("bad read in readByte"); + uint64_t RleDecoderV1::readLong() { + uint64_t result = 0; + int64_t offset = 0; + signed char ch = readByte(); + if (ch >= 0) { + result = 
static_cast<uint64_t>(ch); + } else { + result = static_cast<uint64_t>(ch) & BASE_128_MASK; + while ((ch = readByte()) < 0) { + offset += 7; + result |= (static_cast<uint64_t>(ch) & BASE_128_MASK) << offset; + } + result |= static_cast<uint64_t>(ch) << (offset + 7); } - bufferStart = static_cast<const char*>(bufferPointer); - bufferEnd = bufferStart + bufferLength; + return result; } - return *(bufferStart++); -} -uint64_t RleDecoderV1::readLong() { - uint64_t result = 0; - int64_t offset = 0; - signed char ch = readByte(); - if (ch >= 0) { - result = static_cast<uint64_t>(ch); - } else { - result = static_cast<uint64_t>(ch) & BASE_128_MASK; - while ((ch = readByte()) < 0) { - offset += 7; - result |= (static_cast<uint64_t>(ch) & BASE_128_MASK) << offset; + void RleDecoderV1::skipLongs(uint64_t numValues) { + while (numValues > 0) { + if (readByte() >= 0) { + --numValues; + } } - result |= static_cast<uint64_t>(ch) << (offset + 7); } - return result; -} -void RleDecoderV1::skipLongs(uint64_t numValues) { - while (numValues > 0) { - if (readByte() >= 0) { - --numValues; + void RleDecoderV1::readHeader() { + signed char ch = readByte(); + if (ch < 0) { + remainingValues = static_cast<uint64_t>(-ch); + repeating = false; + } else { + remainingValues = static_cast<uint64_t>(ch) + MINIMUM_REPEAT; + repeating = true; + delta = readByte(); + value = isSigned ? unZigZag(readLong()) : static_cast<int64_t>(readLong()); } } -} -void RleDecoderV1::readHeader() { - signed char ch = readByte(); - if (ch < 0) { - remainingValues = static_cast<uint64_t>(-ch); + void RleDecoderV1::reset() { + remainingValues = 0; + value = 0; + bufferStart = nullptr; + bufferEnd = nullptr; + delta = 0; repeating = false; - } else { - remainingValues = static_cast<uint64_t>(ch) + MINIMUM_REPEAT; - repeating = true; - delta = readByte(); - value = isSigned - ? 
unZigZag(readLong()) - : static_cast<int64_t>(readLong()); } -} - -RleDecoderV1::RleDecoderV1(std::unique_ptr<SeekableInputStream> input, - bool hasSigned) - : inputStream(std::move(input)), - isSigned(hasSigned), - remainingValues(0), - value(0), - bufferStart(nullptr), - bufferEnd(bufferStart), - delta(0), - repeating(false) { -} -void RleDecoderV1::seek(PositionProvider& location) { - // move the input stream - inputStream->seek(location); - // force a re-read from the stream - bufferEnd = bufferStart; - // read a new header - readHeader(); - // skip ahead the given number of records - skip(location.next()); -} + RleDecoderV1::RleDecoderV1(std::unique_ptr<SeekableInputStream> input, bool hasSigned, + ReaderMetrics* _metrics) + : RleDecoder(_metrics), inputStream(std::move(input)), isSigned(hasSigned) { + reset(); + } -void RleDecoderV1::skip(uint64_t numValues) { - while (numValues > 0) { - if (remainingValues == 0) { - readHeader(); - } - uint64_t count = std::min(numValues, remainingValues); - remainingValues -= count; - numValues -= count; - if (repeating) { - value += delta * static_cast<int64_t>(count); - } else { - skipLongs(count); - } + void RleDecoderV1::seek(PositionProvider& location) { + // move the input stream + inputStream->seek(location); + // reset the decoder status and lazily call readHeader() + reset(); + // skip ahead the given number of records + skip(location.next()); } -} -void RleDecoderV1::next(int64_t* const data, - const uint64_t numValues, - const char* const notNull) { - uint64_t position = 0; - // skipNulls() - if (notNull) { - // Skip over null values. 
- while (position < numValues && !notNull[position]) { - ++position; + void RleDecoderV1::skip(uint64_t numValues) { + while (numValues > 0) { + if (remainingValues == 0) { + readHeader(); + } + uint64_t count = std::min(numValues, remainingValues); + remainingValues -= count; + numValues -= count; + if (repeating) { + value += delta * static_cast<int64_t>(count); + } else { + skipLongs(count); + } } } - while (position < numValues) { - // If we are out of values, read more. - if (remainingValues == 0) { - readHeader(); + + template <typename T> + void RleDecoderV1::next(T* const data, const uint64_t numValues, const char* const notNull) { + SCOPED_STOPWATCH(metrics, DecodingLatencyUs, DecodingCall); + uint64_t position = 0; + // skipNulls() + if (notNull) { + // Skip over null values. + while (position < numValues && !notNull[position]) { + ++position; + } } - // How many do we read out of this block? - uint64_t count = std::min(numValues - position, remainingValues); - uint64_t consumed = 0; - if (repeating) { - if (notNull) { - for (uint64_t i = 0; i < count; ++i) { - if (notNull[position + i]) { - data[position + i] = value + static_cast<int64_t>(consumed) * delta; - consumed += 1; - } - } - } else { - for (uint64_t i = 0; i < count; ++i) { - data[position + i] = value + static_cast<int64_t>(i) * delta; - } - consumed = count; + while (position < numValues) { + // If we are out of values, read more. + if (remainingValues == 0) { + readHeader(); } - value += static_cast<int64_t>(consumed) * delta; - } else { - if (notNull) { - for (uint64_t i = 0 ; i < count; ++i) { - if (notNull[position + i]) { - data[position + i] = isSigned - ? unZigZag(readLong()) - : static_cast<int64_t>(readLong()); - ++consumed; + // How many do we read out of this block? 
+ uint64_t count = std::min(numValues - position, remainingValues); + uint64_t consumed = 0; + if (repeating) { + if (notNull) { + for (uint64_t i = 0; i < count; ++i) { + if (notNull[position + i]) { + data[position + i] = static_cast<T>(value + static_cast<int64_t>(consumed) * delta); + consumed += 1; + } + } + } else { + for (uint64_t i = 0; i < count; ++i) { + data[position + i] = static_cast<T>(value + static_cast<int64_t>(i) * delta); } + consumed = count; } + value += static_cast<int64_t>(consumed) * delta; } else { - if (isSigned) { + if (notNull) { for (uint64_t i = 0; i < count; ++i) { - data[position + i] = unZigZag(readLong()); + if (notNull[position + i]) { + data[position + i] = + isSigned ? static_cast<T>(unZigZag(readLong())) : static_cast<T>(readLong()); + ++consumed; + } } } else { - for (uint64_t i = 0; i < count; ++i) { - data[position + i] = static_cast<int64_t>(readLong()); + if (isSigned) { + for (uint64_t i = 0; i < count; ++i) { + data[position + i] = static_cast<T>(unZigZag(readLong())); + } + } else { + for (uint64_t i = 0; i < count; ++i) { + data[position + i] = static_cast<T>(readLong()); + } } + consumed = count; } - consumed = count; } - } - remainingValues -= consumed; - position += count; + remainingValues -= consumed; + position += count; - // skipNulls() - if (notNull) { - // Skip over null values. - while (position < numValues && !notNull[position]) { - ++position; + // skipNulls() + if (notNull) { + // Skip over null values. 
+ while (position < numValues && !notNull[position]) { + ++position; + } } } } -} + void RleDecoderV1::next(int64_t* data, uint64_t numValues, const char* notNull) { + next<int64_t>(data, numValues, notNull); + } + + void RleDecoderV1::next(int32_t* data, uint64_t numValues, const char* notNull) { + next<int32_t>(data, numValues, notNull); + } + + void RleDecoderV1::next(int16_t* data, uint64_t numValues, const char* notNull) { + next<int16_t>(data, numValues, notNull); + } } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/RLEv1.hh b/contrib/libs/apache/orc/c++/src/RLEv1.hh index 8e31d70873..fbe6b0f9c6 100644 --- a/contrib/libs/apache/orc/c++/src/RLEv1.hh +++ b/contrib/libs/apache/orc/c++/src/RLEv1.hh @@ -1,20 +1,20 @@ /** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef ORC_RLEV1_HH #define ORC_RLEV1_HH @@ -26,11 +26,10 @@ namespace orc { -class RleEncoderV1 : public RleEncoder { -public: - RleEncoderV1(std::unique_ptr<BufferedOutputStream> outStream, - bool hasSigned); - ~RleEncoderV1() override ; + class RleEncoderV1 : public RleEncoder { + public: + RleEncoderV1(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned); + ~RleEncoderV1() override; /** * Flushing underlying BufferedOutputStream @@ -39,36 +38,41 @@ public: void write(int64_t val) override; -private: + private: int64_t delta; bool repeat; uint64_t tailRunLength; void writeValues(); -}; + }; -class RleDecoderV1 : public RleDecoder { -public: - RleDecoderV1(std::unique_ptr<SeekableInputStream> input, - bool isSigned); + class RleDecoderV1 : public RleDecoder { + public: + RleDecoderV1(std::unique_ptr<SeekableInputStream> input, bool isSigned, ReaderMetrics* metrics); /** - * Seek to a particular spot. - */ + * Seek to a particular spot. + */ void seek(PositionProvider&) override; /** - * Seek over a given number of values. - */ + * Seek over a given number of values. + */ void skip(uint64_t numValues) override; /** - * Read a number of values into the batch. - */ - void next(int64_t* data, uint64_t numValues, - const char* notNull) override; + * Read a number of values into the batch. 
+ */ + template <typename T> + void next(T* data, uint64_t numValues, const char* notNull); + + void next(int64_t* data, uint64_t numValues, const char* notNull) override; -private: + void next(int32_t* data, uint64_t numValues, const char* notNull) override; + + void next(int16_t* data, uint64_t numValues, const char* notNull) override; + + private: inline signed char readByte(); inline void readHeader(); @@ -77,15 +81,17 @@ private: inline void skipLongs(uint64_t numValues); + inline void reset(); + const std::unique_ptr<SeekableInputStream> inputStream; const bool isSigned; uint64_t remainingValues; int64_t value; - const char *bufferStart; - const char *bufferEnd; + const char* bufferStart; + const char* bufferEnd; int64_t delta; bool repeating; -}; + }; } // namespace orc #endif // ORC_RLEV1_HH diff --git a/contrib/libs/apache/orc/c++/src/RLEv2.hh b/contrib/libs/apache/orc/c++/src/RLEv2.hh index b1e68fb125..1cee59d0a6 100644 --- a/contrib/libs/apache/orc/c++/src/RLEv2.hh +++ b/contrib/libs/apache/orc/c++/src/RLEv2.hh @@ -1,27 +1,27 @@ /** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef ORC_RLEV2_HH #define ORC_RLEV2_HH #include "Adaptor.hh" -#include "orc/Exceptions.hh" #include "RLE.hh" +#include "orc/Exceptions.hh" #include <vector> @@ -30,46 +30,76 @@ #define HIST_LEN 32 namespace orc { -struct FixedBitSizes { + struct FixedBitSizes { enum FBS { - ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE, - THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN, - TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX, - TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR, SIZE + ONE = 0, + TWO, + THREE, + FOUR, + FIVE, + SIX, + SEVEN, + EIGHT, + NINE, + TEN, + ELEVEN, + TWELVE, + THIRTEEN, + FOURTEEN, + FIFTEEN, + SIXTEEN, + SEVENTEEN, + EIGHTEEN, + NINETEEN, + TWENTY, + TWENTYONE, + TWENTYTWO, + TWENTYTHREE, + TWENTYFOUR, + TWENTYSIX, + TWENTYEIGHT, + THIRTY, + THIRTYTWO, + FORTY, + FORTYEIGHT, + FIFTYSIX, + SIXTYFOUR, + SIZE }; -}; - -enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 }; - -struct EncodingOption { - EncodingType encoding; - int64_t fixedDelta; - int64_t gapVsPatchListCount; - int64_t zigzagLiteralsCount; - int64_t baseRedLiteralsCount; - int64_t adjDeltasCount; - uint32_t zzBits90p; - uint32_t zzBits100p; - uint32_t brBits95p; - uint32_t 
brBits100p; - uint32_t bitsDeltaMax; - uint32_t patchWidth; - uint32_t patchGapWidth; - uint32_t patchLength; - int64_t min; - bool isFixedDelta; -}; - -class RleEncoderV2 : public RleEncoder { -public: - RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, bool alignBitPacking = true); + }; + + enum EncodingType { SHORT_REPEAT = 0, DIRECT = 1, PATCHED_BASE = 2, DELTA = 3 }; + + struct EncodingOption { + EncodingType encoding; + int64_t fixedDelta; + int64_t gapVsPatchListCount; + int64_t zigzagLiteralsCount; + int64_t baseRedLiteralsCount; + int64_t adjDeltasCount; + uint32_t zzBits90p; + uint32_t zzBits100p; + uint32_t brBits95p; + uint32_t brBits100p; + uint32_t bitsDeltaMax; + uint32_t patchWidth; + uint32_t patchGapWidth; + uint32_t patchLength; + int64_t min; + bool isFixedDelta; + }; + + class RleEncoderV2 : public RleEncoder { + public: + RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, + bool alignBitPacking = true); ~RleEncoderV2() override { - delete [] literals; - delete [] gapVsPatchList; - delete [] zigzagLiterals; - delete [] baseRedLiterals; - delete [] adjDeltas; + delete[] literals; + delete[] gapVsPatchList; + delete[] zigzagLiterals; + delete[] baseRedLiterals; + delete[] adjDeltas; } /** * Flushing underlying BufferedOutputStream @@ -78,20 +108,19 @@ public: void write(int64_t val) override; -private: - + private: const bool alignedBitPacking; uint32_t fixedRunLength; uint32_t variableRunLength; int64_t prevDelta; int32_t histgram[HIST_LEN]; - // The four list below should actually belong to EncodingOption since it only holds temporal values in write(int64_t val), - // it is move here for performance consideration. + // The four list below should actually belong to EncodingOption since it only holds temporal + // values in write(int64_t val), it is move here for performance consideration. 
int64_t* gapVsPatchList; - int64_t* zigzagLiterals; - int64_t* baseRedLiterals; - int64_t* adjDeltas; + int64_t* zigzagLiterals; + int64_t* baseRedLiterals; + int64_t* adjDeltas; uint32_t getOpCode(EncodingType encoding); int64_t* prepareForDirectOrPatchedBase(EncodingOption& option); @@ -106,97 +135,155 @@ private: void writeDirectValues(EncodingOption& option); void writePatchedBasedValues(EncodingOption& option); void writeDeltaValues(EncodingOption& option); - uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist = false); -}; - -class RleDecoderV2 : public RleDecoder { -public: - RleDecoderV2(std::unique_ptr<SeekableInputStream> input, - bool isSigned, MemoryPool& pool); - - /** - * Seek to a particular spot. - */ - void seek(PositionProvider&) override; - - /** - * Seek over a given number of values. - */ - void skip(uint64_t numValues) override; - - /** - * Read a number of values into the batch. - */ - void next(int64_t* data, uint64_t numValues, - const char* notNull) override; - -private: - - /** - * Decode the next gap and patch from 'unpackedPatch' and update the index on it. - * Used by PATCHED_BASE. - * - * @param patchBitSize bit size of the patch value - * @param patchMask mask for the patch value - * @param resGap result of gap - * @param resPatch result of patch - * @param patchIdx current index in the 'unpackedPatch' buffer - */ - void adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, - int64_t* resGap, int64_t* resPatch, uint64_t* patchIdx); - - void resetReadLongs() { - bitsLeft = 0; - curByte = 0; - } + uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, + bool reuseHist = false); + }; - void resetRun() { - resetReadLongs(); - } + class RleDecoderV2 : public RleDecoder { + public: + RleDecoderV2(std::unique_ptr<SeekableInputStream> input, bool isSigned, MemoryPool& pool, + ReaderMetrics* metrics); + + /** + * Seek to a particular spot. 
+ */ + void seek(PositionProvider&) override; + + /** + * Seek over a given number of values. + */ + void skip(uint64_t numValues) override; + + /** + * Read a number of values into the batch. + */ + template <typename T> + void next(T* data, uint64_t numValues, const char* notNull); + + void next(int64_t* data, uint64_t numValues, const char* notNull) override; + + void next(int32_t* data, uint64_t numValues, const char* notNull) override; + + void next(int16_t* data, uint64_t numValues, const char* notNull) override; + + unsigned char readByte(); + + void setBufStart(const char* start) { + bufferStart = const_cast<char*>(start); + } - unsigned char readByte(); - - int64_t readLongBE(uint64_t bsz); - int64_t readVslong(); - uint64_t readVulong(); - void readLongs(int64_t *data, uint64_t offset, uint64_t len, uint64_t fbs); - void plainUnpackLongs(int64_t *data, uint64_t offset, uint64_t len, uint64_t fbs); - - void unrolledUnpack4(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack8(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack16(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack24(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack32(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack40(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack48(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack56(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack64(int64_t *data, uint64_t offset, uint64_t len); - - uint64_t nextShortRepeats(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - uint64_t nextDirect(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - uint64_t nextPatched(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - uint64_t nextDelta(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - - uint64_t 
copyDataFromBuffer(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - - const std::unique_ptr<SeekableInputStream> inputStream; - const bool isSigned; - - unsigned char firstByte; - uint64_t runLength; // Length of the current run - uint64_t runRead; // Number of returned values of the current run - const char *bufferStart; - const char *bufferEnd; - uint32_t bitsLeft; // Used by readLongs when bitSize < 8 - uint32_t curByte; // Used by anything that uses readLongs - DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE - DataBuffer<int64_t> literals; // Values of the current run -}; + char* getBufStart() { + return bufferStart; + } + + void setBufEnd(const char* end) { + bufferEnd = const_cast<char*>(end); + } + + char* getBufEnd() { + return bufferEnd; + } + + uint64_t bufLength() { + return bufferEnd - bufferStart; + } + + void setBitsLeft(const uint32_t bits) { + bitsLeft = bits; + } + + void setCurByte(const uint32_t byte) { + curByte = byte; + } + + uint32_t getBitsLeft() { + return bitsLeft; + } + + uint32_t getCurByte() { + return curByte; + } + + /** + * Most hotspot of this function locates in saving stack, so inline this function to have + * performance gain. + */ + inline void resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupLen); + + private: + /** + * Decode the next gap and patch from 'unpackedPatch' and update the index on it. + * Used by PATCHED_BASE. 
+ * + * @param patchBitSize bit size of the patch value + * @param patchMask mask for the patch value + * @param resGap result of gap + * @param resPatch result of patch + * @param patchIdx current index in the 'unpackedPatch' buffer + */ + void adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, int64_t* resGap, + int64_t* resPatch, uint64_t* patchIdx); + + void resetReadLongs() { + bitsLeft = 0; + curByte = 0; + } + + void resetRun() { + resetReadLongs(); + } + + int64_t readLongBE(uint64_t bsz); + int64_t readVslong(); + uint64_t readVulong(); + void readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); + + template <typename T> + uint64_t nextShortRepeats(T* data, uint64_t offset, uint64_t numValues, const char* notNull); + template <typename T> + uint64_t nextDirect(T* data, uint64_t offset, uint64_t numValues, const char* notNull); + template <typename T> + uint64_t nextPatched(T* data, uint64_t offset, uint64_t numValues, const char* notNull); + template <typename T> + uint64_t nextDelta(T* data, uint64_t offset, uint64_t numValues, const char* notNull); + template <typename T> + uint64_t copyDataFromBuffer(T* data, uint64_t offset, uint64_t numValues, const char* notNull); + + const std::unique_ptr<SeekableInputStream> inputStream; + const bool isSigned; + unsigned char firstByte; + char* bufferStart; + char* bufferEnd; + uint64_t runLength; // Length of the current run + uint64_t runRead; // Number of returned values of the current run + uint32_t bitsLeft; // Used by readLongs when bitSize < 8 + uint32_t curByte; // Used by anything that uses readLongs + DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE + DataBuffer<int64_t> literals; // Values of the current run + }; + + inline void RleDecoderV2::resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupByteLen) { + uint64_t remainingLen = bufLength(); + int bufferLength = 0; + const void* bufferPointer = nullptr; + + if (backupByteLen != 0) { + 
inputStream->BackUp(backupByteLen); + } + + if (len >= remainingLen && resetBuf) { + if (!inputStream->Next(&bufferPointer, &bufferLength)) { + throw ParseError("bad read in RleDecoderV2::resetBufferStart"); + } + } + + if (bufferPointer == nullptr) { + bufferStart += len; + } else { + bufferStart = const_cast<char*>(static_cast<const char*>(bufferPointer)); + bufferEnd = bufferStart + bufferLength; + } + } } // namespace orc #endif // ORC_RLEV2_HH diff --git a/contrib/libs/apache/orc/c++/src/Reader.cc b/contrib/libs/apache/orc/c++/src/Reader.cc index 6a9068f202..2cc88fbb80 100644 --- a/contrib/libs/apache/orc/c++/src/Reader.cc +++ b/contrib/libs/apache/orc/c++/src/Reader.cc @@ -16,43 +16,49 @@ * limitations under the License. */ +#include "Reader.hh" #include "Adaptor.hh" #include "BloomFilter.hh" #include "Options.hh" -#include "Reader.hh" #include "Statistics.hh" #include "StripeStream.hh" +#include "Utils.hh" #include "wrap/coded-stream-wrapper.h" #include <algorithm> #include <iostream> +#include <iterator> #include <memory> +#include <set> #include <sstream> #include <string> #include <vector> -#include <iterator> -#include <set> namespace orc { // ORC files writen by these versions of cpp writers have inconsistent bloom filter // hashing. Bloom filters of them should not be used. 
static const char* BAD_CPP_BLOOM_FILTER_VERSIONS[] = { - "1.6.0", "1.6.1", "1.6.2", "1.6.3", "1.6.4", "1.6.5", "1.6.6", "1.6.7", "1.6.8", - "1.6.9", "1.6.10", "1.6.11", "1.7.0"}; + "1.6.0", "1.6.1", "1.6.2", "1.6.3", "1.6.4", "1.6.5", "1.6.6", + "1.6.7", "1.6.8", "1.6.9", "1.6.10", "1.6.11", "1.7.0"}; + + ReaderMetrics* getDefaultReaderMetrics() { + static ReaderMetrics internal; + return &internal; + } const RowReaderOptions::IdReadIntentMap EMPTY_IDREADINTENTMAP() { return {}; } - const WriterVersionImpl &WriterVersionImpl::VERSION_HIVE_8732() { + const WriterVersionImpl& WriterVersionImpl::VERSION_HIVE_8732() { static const WriterVersionImpl version(WriterVersion_HIVE_8732); return version; } uint64_t getCompressionBlockSize(const proto::PostScript& ps) { - if (ps.has_compressionblocksize()) { - return ps.compressionblocksize(); + if (ps.has_compression_block_size()) { + return ps.compression_block_size(); } else { return 256 * 1024; } @@ -67,31 +73,29 @@ namespace orc { } std::string ColumnSelector::toDotColumnPath() { - if (columns.empty()) { - return std::string(); - } - std::ostringstream columnStream; - std::copy(columns.begin(), columns.end(), + if (columns.empty()) { + return std::string(); + } + std::ostringstream columnStream; + std::copy(columns.begin(), columns.end(), std::ostream_iterator<std::string>(columnStream, ".")); - std::string columnPath = columnStream.str(); - return columnPath.substr(0, columnPath.length() - 1); + std::string columnPath = columnStream.str(); + return columnPath.substr(0, columnPath.length() - 1); } - WriterVersion getWriterVersionImpl(const FileContents * contents) { - if (!contents->postscript->has_writerversion()) { + WriterVersion getWriterVersionImpl(const FileContents* contents) { + if (!contents->postscript->has_writer_version()) { return WriterVersion_ORIGINAL; } - return static_cast<WriterVersion>(contents->postscript->writerversion()); + return static_cast<WriterVersion>(contents->postscript->writer_version()); } 
void ColumnSelector::selectChildren(std::vector<bool>& selectedColumns, const Type& type) { return selectChildren(selectedColumns, type, EMPTY_IDREADINTENTMAP()); } - void ColumnSelector::selectChildren( - std::vector<bool> &selectedColumns, - const Type &type, - const RowReaderOptions::IdReadIntentMap& idReadIntentMap) { + void ColumnSelector::selectChildren(std::vector<bool>& selectedColumns, const Type& type, + const RowReaderOptions::IdReadIntentMap& idReadIntentMap) { size_t id = static_cast<size_t>(type.getColumnId()); TypeKind kind = type.getKind(); if (!selectedColumns[id]) { @@ -99,8 +103,7 @@ namespace orc { bool selectChild = true; if (kind == TypeKind::LIST || kind == TypeKind::MAP || kind == TypeKind::UNION) { auto elem = idReadIntentMap.find(id); - if (elem != idReadIntentMap.end() && - elem->second == ReadIntent_OFFSETS) { + if (elem != idReadIntentMap.end() && elem->second == ReadIntent_OFFSETS) { selectChild = false; } } @@ -121,7 +124,7 @@ namespace orc { size_t id = static_cast<size_t>(type.getColumnId()); bool result = selectedColumns[id]; uint64_t numSubtypeSelected = 0; - for(uint64_t c=0; c < type.getSubtypeCount(); ++c) { + for (uint64_t c = 0; c < type.getSubtypeCount(); ++c) { if (selectParents(selectedColumns, *type.getSubtype(c))) { result = true; numSubtypeSelected++; @@ -169,20 +172,19 @@ namespace orc { const RowReaderOptions& options) { selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); if (contents->schema->getKind() == STRUCT && options.getIndexesSet()) { - for(std::list<uint64_t>::const_iterator field = options.getInclude().begin(); - field != options.getInclude().end(); ++field) { + for (std::list<uint64_t>::const_iterator field = options.getInclude().begin(); + field != options.getInclude().end(); ++field) { updateSelectedByFieldId(selectedColumns, *field); } } else if (contents->schema->getKind() == STRUCT && options.getNamesSet()) { - for(std::list<std::string>::const_iterator field = 
options.getIncludeNames().begin(); - field != options.getIncludeNames().end(); ++field) { + for (std::list<std::string>::const_iterator field = options.getIncludeNames().begin(); + field != options.getIncludeNames().end(); ++field) { updateSelectedByName(selectedColumns, *field); } } else if (options.getTypeIdsSet()) { - const RowReaderOptions::IdReadIntentMap idReadIntentMap = - options.getIdReadIntentMap(); - for(std::list<uint64_t>::const_iterator typeId = options.getInclude().begin(); - typeId != options.getInclude().end(); ++typeId) { + const RowReaderOptions::IdReadIntentMap idReadIntentMap = options.getIdReadIntentMap(); + for (std::list<uint64_t>::const_iterator typeId = options.getInclude().begin(); + typeId != options.getInclude().end(); ++typeId) { updateSelectedByTypeId(selectedColumns, *typeId, idReadIntentMap); } } else { @@ -190,7 +192,7 @@ namespace orc { std::fill(selectedColumns.begin(), selectedColumns.end(), true); } selectParents(selectedColumns, *contents->schema.get()); - selectedColumns[0] = true; // column 0 is selected by default + selectedColumns[0] = true; // column 0 is selected by default } void ColumnSelector::updateSelectedByFieldId(std::vector<bool>& selectedColumns, @@ -210,16 +212,14 @@ namespace orc { } void ColumnSelector::updateSelectedByTypeId( - std::vector<bool> &selectedColumns, - uint64_t typeId, + std::vector<bool>& selectedColumns, uint64_t typeId, const RowReaderOptions::IdReadIntentMap& idReadIntentMap) { if (typeId < selectedColumns.size()) { const Type& type = *idTypeMap[typeId]; selectChildren(selectedColumns, type, idReadIntentMap); } else { std::stringstream buffer; - buffer << "Invalid type id selected " << typeId << " out of " - << selectedColumns.size(); + buffer << "Invalid type id selected " << typeId << " out of " << selectedColumns.size(); throw ParseError(buffer.str()); } } @@ -242,36 +242,39 @@ namespace orc { } } - ColumnSelector::ColumnSelector(const FileContents* _contents): contents(_contents) { + 
ColumnSelector::ColumnSelector(const FileContents* _contents) : contents(_contents) { buildTypeNameIdMap(contents->schema.get()); } RowReaderImpl::RowReaderImpl(std::shared_ptr<FileContents> _contents, - const RowReaderOptions& opts - ): localTimezone(getLocalTimezone()), - contents(_contents), - throwOnHive11DecimalOverflow(opts.getThrowOnHive11DecimalOverflow()), - forcedScaleOnHive11Decimal(opts.getForcedScaleOnHive11Decimal()), - footer(contents->footer.get()), - firstRowOfStripe(*contents->pool, 0), - enableEncodedBlock(opts.getEnableLazyDecoding()), - readerTimezone(getTimezoneByName(opts.getTimezoneName())) { + const RowReaderOptions& opts) + : localTimezone(getLocalTimezone()), + contents(_contents), + throwOnHive11DecimalOverflow(opts.getThrowOnHive11DecimalOverflow()), + forcedScaleOnHive11Decimal(opts.getForcedScaleOnHive11Decimal()), + footer(contents->footer.get()), + firstRowOfStripe(*contents->pool, 0), + enableEncodedBlock(opts.getEnableLazyDecoding()), + readerTimezone(getTimezoneByName(opts.getTimezoneName())), + schemaEvolution(opts.getReadType(), contents->schema.get()) { uint64_t numberOfStripes; numberOfStripes = static_cast<uint64_t>(footer->stripes_size()); currentStripe = numberOfStripes; lastStripe = 0; currentRowInStripe = 0; rowsInCurrentStripe = 0; + numRowGroupsInStripeRange = 0; + useTightNumericVector = opts.getUseTightNumericVector(); + throwOnSchemaEvolutionOverflow = opts.getThrowOnSchemaEvolutionOverflow(); uint64_t rowTotal = 0; firstRowOfStripe.resize(numberOfStripes); - for(size_t i=0; i < numberOfStripes; ++i) { + for (size_t i = 0; i < numberOfStripes; ++i) { firstRowOfStripe[i] = rowTotal; - proto::StripeInformation stripeInfo = - footer->stripes(static_cast<int>(i)); - rowTotal += stripeInfo.numberofrows(); + proto::StripeInformation stripeInfo = footer->stripes(static_cast<int>(i)); + rowTotal += stripeInfo.number_of_rows(); bool isStripeInRange = stripeInfo.offset() >= opts.getOffset() && - stripeInfo.offset() < 
opts.getOffset() + opts.getLength(); + stripeInfo.offset() < opts.getOffset() + opts.getLength(); if (isStripeInRange) { if (i < currentStripe) { currentStripe = i; @@ -279,28 +282,33 @@ namespace orc { if (i >= lastStripe) { lastStripe = i + 1; } + if (footer->row_index_stride() > 0) { + numRowGroupsInStripeRange += + (stripeInfo.number_of_rows() + footer->row_index_stride() - 1) / + footer->row_index_stride(); + } } } firstStripe = currentStripe; + processingStripe = lastStripe; if (currentStripe == 0) { previousRow = (std::numeric_limits<uint64_t>::max)(); } else if (currentStripe == numberOfStripes) { - previousRow = footer->numberofrows(); + previousRow = footer->number_of_rows(); } else { - previousRow = firstRowOfStripe[firstStripe]-1; + previousRow = firstRowOfStripe[firstStripe] - 1; } ColumnSelector column_selector(contents.get()); column_selector.updateSelected(selectedColumns, opts); // prepare SargsApplier if SearchArgument is available - if (opts.getSearchArgument() && footer->rowindexstride() > 0) { + if (opts.getSearchArgument() && footer->row_index_stride() > 0) { sargs = opts.getSearchArgument(); - sargsApplier.reset(new SargsApplier(*contents->schema, - sargs.get(), - footer->rowindexstride(), - getWriterVersionImpl(_contents.get()))); + sargsApplier.reset( + new SargsApplier(*contents->schema, sargs.get(), footer->row_index_stride(), + getWriterVersionImpl(_contents.get()), contents->readerMetrics)); } skipBloomFilters = hasBadBloomFilters(); @@ -314,9 +322,9 @@ namespace orc { // 1.6.x releases before 1.6.11 won't have it. On the other side, the C++ writer // supports writing bloom filters since 1.6.0. So files written by the C++ writer // and with 'softwareVersion' unset would have bad bloom filters. 
- if (!footer->has_softwareversion()) return true; + if (!footer->has_software_version()) return true; - const std::string &fullVersion = footer->softwareversion(); + const std::string& fullVersion = footer->software_version(); std::string version; // Deal with snapshot versions, e.g. 1.6.12-SNAPSHOT. if (fullVersion.find('-') != std::string::npos) { @@ -324,7 +332,7 @@ namespace orc { } else { version = fullVersion; } - for (const char *v : BAD_CPP_BLOOM_FILTER_VERSIONS) { + for (const char* v : BAD_CPP_BLOOM_FILTER_VERSIONS) { if (version == v) { return true; } @@ -346,8 +354,7 @@ namespace orc { const Type& RowReaderImpl::getSelectedType() const { if (selectedSchema.get() == nullptr) { - selectedSchema = buildSelectedType(contents->schema.get(), - selectedColumns); + selectedSchema = buildSelectedType(contents->schema.get(), selectedColumns); } return *(selectedSchema.get()); } @@ -369,49 +376,56 @@ namespace orc { // seeking past lastStripe uint64_t num_stripes = static_cast<uint64_t>(footer->stripes_size()); - if ( (lastStripe == num_stripes - && rowNumber >= footer->numberofrows()) || - (lastStripe < num_stripes - && rowNumber >= firstRowOfStripe[lastStripe]) ) { + if ((lastStripe == num_stripes && rowNumber >= footer->number_of_rows()) || + (lastStripe < num_stripes && rowNumber >= firstRowOfStripe[lastStripe])) { currentStripe = num_stripes; - previousRow = footer->numberofrows(); + previousRow = footer->number_of_rows(); return; } uint64_t seekToStripe = 0; - while (seekToStripe+1 < lastStripe && - firstRowOfStripe[seekToStripe+1] <= rowNumber) { + while (seekToStripe + 1 < lastStripe && firstRowOfStripe[seekToStripe + 1] <= rowNumber) { seekToStripe++; } // seeking before the first stripe if (seekToStripe < firstStripe) { currentStripe = num_stripes; - previousRow = footer->numberofrows(); + previousRow = footer->number_of_rows(); return; } - currentStripe = seekToStripe; - currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe]; previousRow = 
rowNumber; - startNextStripe(); + auto rowIndexStride = footer->row_index_stride(); + if (!isCurrentStripeInited() || currentStripe != seekToStripe || rowIndexStride == 0 || + currentStripeInfo.index_length() == 0) { + // current stripe is not initialized or + // target stripe is not current stripe or + // current stripe doesn't have row indexes + currentStripe = seekToStripe; + currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe]; + startNextStripe(); + if (currentStripe >= lastStripe) { + return; + } + } else { + currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe]; + if (sargsApplier) { + // advance to selected row group if predicate pushdown is enabled + currentRowInStripe = + advanceToNextRowGroup(currentRowInStripe, rowsInCurrentStripe, + footer->row_index_stride(), sargsApplier->getNextSkippedRows()); + } + } uint64_t rowsToSkip = currentRowInStripe; - auto rowIndexStride = footer->rowindexstride(); // seek to the target row group if row indexes exists - if (rowIndexStride > 0 && currentStripeInfo.indexlength() > 0) { - // when predicate push down is enabled, above call to startNextStripe() - // will move current row to 1st matching row group; here we only need - // to deal with the case when PPD is not enabled. 
- if (!sargsApplier) { - if (rowIndexes.empty()) { - loadStripeIndex(); - } - auto rowGroupId = static_cast<uint32_t>(rowsToSkip / rowIndexStride); - if (rowGroupId != 0) { - seekToRowGroup(rowGroupId); - } + if (rowIndexStride > 0 && currentStripeInfo.index_length() > 0) { + if (rowIndexes.empty()) { + loadStripeIndex(); } + // TODO(ORC-1175): process the failures of loadStripeIndex() call + seekToRowGroup(static_cast<uint32_t>(rowsToSkip / rowIndexStride)); // skip leading rows in the target row group rowsToSkip %= rowIndexStride; } @@ -432,19 +446,14 @@ namespace orc { for (int i = 0; i < currentStripeFooter.streams_size(); ++i) { const proto::Stream& pbStream = currentStripeFooter.streams(i); uint64_t colId = pbStream.column(); - if (selectedColumns[colId] && pbStream.has_kind() - && (pbStream.kind() == proto::Stream_Kind_ROW_INDEX || - pbStream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8)) { - std::unique_ptr<SeekableInputStream> inStream = - createDecompressor(getCompression(), - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream - (contents->stream.get(), - offset, - pbStream.length(), - *contents->pool)), - getCompressionSize(), - *contents->pool); + if (selectedColumns[colId] && pbStream.has_kind() && + (pbStream.kind() == proto::Stream_Kind_ROW_INDEX || + pbStream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8)) { + std::unique_ptr<SeekableInputStream> inStream = createDecompressor( + getCompression(), + std::unique_ptr<SeekableInputStream>(new SeekableFileInputStream( + contents->stream.get(), offset, pbStream.length(), *contents->pool)), + getCompressionSize(), *contents->pool, contents->readerMetrics); if (pbStream.kind() == proto::Stream_Kind_ROW_INDEX) { proto::RowIndex rowIndex; @@ -452,17 +461,16 @@ namespace orc { throw ParseError("Failed to parse the row index"); } rowIndexes[colId] = rowIndex; - } else if (!skipBloomFilters) { // Stream_Kind_BLOOM_FILTER_UTF8 + } else if (!skipBloomFilters) { // Stream_Kind_BLOOM_FILTER_UTF8 
proto::BloomFilterIndex pbBFIndex; if (!pbBFIndex.ParseFromZeroCopyStream(inStream.get())) { throw ParseError("Failed to parse bloom filter index"); } BloomFilterIndex bfIndex; - for (int j = 0; j < pbBFIndex.bloomfilter_size(); j++) { + for (int j = 0; j < pbBFIndex.bloom_filter_size(); j++) { bfIndex.entries.push_back(BloomFilterUTF8Utils::deserialize( - pbStream.kind(), - currentStripeFooter.columns(static_cast<int>(pbStream.column())), - pbBFIndex.bloomfilter(j))); + pbStream.kind(), currentStripeFooter.columns(static_cast<int>(pbStream.column())), + pbBFIndex.bloom_filter(j))); } // add bloom filters to result for one column bloomFilterIndex[pbStream.column()] = bfIndex; @@ -478,11 +486,10 @@ namespace orc { // store position providers for selected colimns std::unordered_map<uint64_t, PositionProvider> positionProviders; - for (auto rowIndex = rowIndexes.cbegin(); - rowIndex != rowIndexes.cend(); ++rowIndex) { + for (auto rowIndex = rowIndexes.cbegin(); rowIndex != rowIndexes.cend(); ++rowIndex) { uint64_t colId = rowIndex->first; const proto::RowIndexEntry& entry = - rowIndex->second.entry(static_cast<int32_t>(rowGroupEntryId)); + rowIndex->second.entry(static_cast<int32_t>(rowGroupEntryId)); // copy index positions for a specific column positions.push_back({}); @@ -514,22 +521,16 @@ namespace orc { proto::StripeFooter getStripeFooter(const proto::StripeInformation& info, const FileContents& contents) { - uint64_t stripeFooterStart = info.offset() + info.indexlength() + - info.datalength(); - uint64_t stripeFooterLength = info.footerlength(); - std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(contents.compression, - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream(contents.stream.get(), - stripeFooterStart, - stripeFooterLength, - *contents.pool)), - contents.blockSize, - *contents.pool); + uint64_t stripeFooterStart = info.offset() + info.index_length() + info.data_length(); + uint64_t stripeFooterLength = 
info.footer_length(); + std::unique_ptr<SeekableInputStream> pbStream = createDecompressor( + contents.compression, + std::make_unique<SeekableFileInputStream>(contents.stream.get(), stripeFooterStart, + stripeFooterLength, *contents.pool), + contents.blockSize, *contents.pool, contents.readerMetrics); proto::StripeFooter result; if (!result.ParseFromZeroCopyStream(pbStream.get())) { - throw ParseError(std::string("bad StripeFooter from ") + - pbStream->getName()); + throw ParseError(std::string("bad StripeFooter from ") + pbStream->getName()); } // Verify StripeFooter in case it's corrupt if (result.columns_size() != contents.footer->types_size()) { @@ -541,31 +542,29 @@ namespace orc { return result; } - ReaderImpl::ReaderImpl(std::shared_ptr<FileContents> _contents, - const ReaderOptions& opts, - uint64_t _fileLength, - uint64_t _postscriptLength - ): contents(std::move(_contents)), - options(opts), - fileLength(_fileLength), - postscriptLength(_postscriptLength), - footer(contents->footer.get()) { + ReaderImpl::ReaderImpl(std::shared_ptr<FileContents> _contents, const ReaderOptions& opts, + uint64_t _fileLength, uint64_t _postscriptLength) + : contents(std::move(_contents)), + options(opts), + fileLength(_fileLength), + postscriptLength(_postscriptLength), + footer(contents->footer.get()) { isMetadataLoaded = false; checkOrcVersion(); numberOfStripes = static_cast<uint64_t>(footer->stripes_size()); - contents->schema = REDUNDANT_MOVE(convertType(footer->types(0), *footer)); + contents->schema = convertType(footer->types(0), *footer); contents->blockSize = getCompressionBlockSize(*contents->postscript); - contents->compression= convertCompressionKind(*contents->postscript); + contents->compression = convertCompressionKind(*contents->postscript); } std::string ReaderImpl::getSerializedFileTail() const { proto::FileTail tail; - proto::PostScript *mutable_ps = tail.mutable_postscript(); + proto::PostScript* mutable_ps = tail.mutable_postscript(); 
mutable_ps->CopyFrom(*contents->postscript); - proto::Footer *mutableFooter = tail.mutable_footer(); + proto::Footer* mutableFooter = tail.mutable_footer(); mutableFooter->CopyFrom(*footer); - tail.set_filelength(fileLength); - tail.set_postscriptlength(postscriptLength); + tail.set_file_length(fileLength); + tail.set_postscript_length(postscriptLength); TString result; if (!tail.SerializeToString(&result)) { throw ParseError("Failed to serialize file tail"); @@ -593,29 +592,21 @@ namespace orc { if (!isMetadataLoaded) { readMetadata(); } - return contents->metadata == nullptr ? 0 : - static_cast<uint64_t>(contents->metadata->stripestats_size()); + return contents->metadata == nullptr + ? 0 + : static_cast<uint64_t>(contents->metadata->stripe_stats_size()); } - std::unique_ptr<StripeInformation> - ReaderImpl::getStripe(uint64_t stripeIndex) const { + std::unique_ptr<StripeInformation> ReaderImpl::getStripe(uint64_t stripeIndex) const { if (stripeIndex > getNumberOfStripes()) { throw std::logic_error("stripe index out of range"); } - proto::StripeInformation stripeInfo = - footer->stripes(static_cast<int>(stripeIndex)); + proto::StripeInformation stripeInfo = footer->stripes(static_cast<int>(stripeIndex)); - return std::unique_ptr<StripeInformation> - (new StripeInformationImpl - (stripeInfo.offset(), - stripeInfo.indexlength(), - stripeInfo.datalength(), - stripeInfo.footerlength(), - stripeInfo.numberofrows(), - contents->stream.get(), - *contents->pool, - contents->compression, - contents->blockSize)); + return std::unique_ptr<StripeInformation>(new StripeInformationImpl( + stripeInfo.offset(), stripeInfo.index_length(), stripeInfo.data_length(), + stripeInfo.footer_length(), stripeInfo.number_of_rows(), contents->stream.get(), + *contents->pool, contents->compression, contents->blockSize, contents->readerMetrics)); } FileVersion ReaderImpl::getFormatVersion() const { @@ -626,16 +617,16 @@ namespace orc { } uint64_t ReaderImpl::getNumberOfRows() const { - return 
footer->numberofrows(); + return footer->number_of_rows(); } WriterId ReaderImpl::getWriterId() const { if (footer->has_writer()) { uint32_t id = footer->writer(); - if (id > WriterId::TRINO_WRITER) { + if (id > WriterId::CUDF_WRITER) { return WriterId::UNKNOWN_WRITER; } else { - return static_cast<WriterId>(id); + return static_cast<WriterId>(id); } } return WriterId::ORC_JAVA_WRITER; @@ -652,8 +643,8 @@ namespace orc { std::string ReaderImpl::getSoftwareVersion() const { std::ostringstream buffer; buffer << writerIdToString(getWriterIdValue()); - if (footer->has_softwareversion()) { - buffer << " " << footer->softwareversion(); + if (footer->has_software_version()) { + buffer << " " << footer->software_version(); } return buffer.str(); } @@ -663,15 +654,15 @@ namespace orc { } uint64_t ReaderImpl::getContentLength() const { - return footer->contentlength(); + return footer->content_length(); } uint64_t ReaderImpl::getStripeStatisticsLength() const { - return contents->postscript->metadatalength(); + return contents->postscript->metadata_length(); } uint64_t ReaderImpl::getFileFooterLength() const { - return contents->postscript->footerlength(); + return contents->postscript->footer_length(); } uint64_t ReaderImpl::getFilePostscriptLength() const { @@ -683,7 +674,7 @@ namespace orc { } uint64_t ReaderImpl::getRowIndexStride() const { - return footer->rowindexstride(); + return footer->row_index_stride(); } const std::string& ReaderImpl::getStreamName() const { @@ -692,14 +683,14 @@ namespace orc { std::list<std::string> ReaderImpl::getMetadataKeys() const { std::list<std::string> result; - for(int i=0; i < footer->metadata_size(); ++i) { + for (int i = 0; i < footer->metadata_size(); ++i) { result.push_back(footer->metadata(i).name()); } return result; } std::string ReaderImpl::getMetadataValue(const std::string& key) const { - for(int i=0; i < footer->metadata_size(); ++i) { + for (int i = 0; i < footer->metadata_size(); ++i) { if (footer->metadata(i).name() == 
TString(key)) { return footer->metadata(i).value(); } @@ -707,12 +698,13 @@ namespace orc { throw std::range_error("key not found"); } - void ReaderImpl::getRowIndexStatistics(const proto::StripeInformation& stripeInfo, - uint64_t stripeIndex, const proto::StripeFooter& currentStripeFooter, - std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const { + void ReaderImpl::getRowIndexStatistics( + const proto::StripeInformation& stripeInfo, uint64_t stripeIndex, + const proto::StripeFooter& currentStripeFooter, + std::vector<std::vector<proto::ColumnStatistics>>* indexStats) const { int num_streams = currentStripeFooter.streams_size(); uint64_t offset = stripeInfo.offset(); - uint64_t indexEnd = stripeInfo.offset() + stripeInfo.indexlength(); + uint64_t indexEnd = stripeInfo.offset() + stripeInfo.index_length(); for (int i = 0; i < num_streams; i++) { const proto::Stream& stream = currentStripeFooter.streams(i); StreamKind streamKind = static_cast<StreamKind>(stream.kind()); @@ -722,19 +714,15 @@ namespace orc { std::stringstream msg; msg << "Malformed RowIndex stream meta in stripe " << stripeIndex << ": streamOffset=" << offset << ", streamLength=" << length - << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength=" - << stripeInfo.indexlength(); + << ", stripeOffset=" << stripeInfo.offset() + << ", stripeIndexLength=" << stripeInfo.index_length(); throw ParseError(msg.str()); } std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(contents->compression, - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream(contents->stream.get(), - offset, - length, - *contents->pool)), - contents->blockSize, - *(contents->pool)); + createDecompressor(contents->compression, + std::unique_ptr<SeekableInputStream>(new SeekableFileInputStream( + contents->stream.get(), offset, length, *contents->pool)), + contents->blockSize, *(contents->pool), contents->readerMetrics); proto::RowIndex rowIndex; if 
(!rowIndex.ParseFromZeroCopyStream(pbStream.get())) { @@ -752,7 +740,7 @@ namespace orc { } bool ReaderImpl::hasMetadataValue(const std::string& key) const { - for(int i=0; i < footer->metadata_size(); ++i) { + for (int i = 0; i < footer->metadata_size(); ++i) { if (footer->metadata(i).name() == TString(key)) { return true; } @@ -764,8 +752,7 @@ namespace orc { return *(contents->schema.get()); } - std::unique_ptr<StripeStatistics> - ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const { + std::unique_ptr<StripeStatistics> ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const { if (!isMetadataLoaded) { readMetadata(); } @@ -773,48 +760,40 @@ namespace orc { throw std::logic_error("No stripe statistics in file"); } size_t num_cols = static_cast<size_t>( - contents->metadata->stripestats( - static_cast<int>(stripeIndex)).colstats_size()); - std::vector<std::vector<proto::ColumnStatistics> > indexStats(num_cols); + contents->metadata->stripe_stats(static_cast<int>(stripeIndex)).col_stats_size()); + std::vector<std::vector<proto::ColumnStatistics>> indexStats(num_cols); - proto::StripeInformation currentStripeInfo = - footer->stripes(static_cast<int>(stripeIndex)); - proto::StripeFooter currentStripeFooter = - getStripeFooter(currentStripeInfo, *contents.get()); + proto::StripeInformation currentStripeInfo = footer->stripes(static_cast<int>(stripeIndex)); + proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get()); getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats); - const Timezone& writerTZ = - currentStripeFooter.has_writertimezone() ? - getTimezoneByName(currentStripeFooter.writertimezone()) : - getLocalTimezone(); + const Timezone& writerTZ = currentStripeFooter.has_writer_timezone() + ? 
getTimezoneByName(currentStripeFooter.writer_timezone()) + : getLocalTimezone(); StatContext statContext(hasCorrectStatistics(), &writerTZ); - return std::unique_ptr<StripeStatistics> - (new StripeStatisticsImpl(contents->metadata->stripestats(static_cast<int>(stripeIndex)), - indexStats, statContext)); + return std::make_unique<StripeStatisticsImpl>( + contents->metadata->stripe_stats(static_cast<int>(stripeIndex)), indexStats, statContext); } std::unique_ptr<Statistics> ReaderImpl::getStatistics() const { StatContext statContext(hasCorrectStatistics()); - return std::unique_ptr<Statistics> - (new StatisticsImpl(*footer, statContext)); + return std::make_unique<StatisticsImpl>(*footer, statContext); } - std::unique_ptr<ColumnStatistics> - ReaderImpl::getColumnStatistics(uint32_t index) const { + std::unique_ptr<ColumnStatistics> ReaderImpl::getColumnStatistics(uint32_t index) const { if (index >= static_cast<uint64_t>(footer->statistics_size())) { throw std::logic_error("column index out of range"); } - proto::ColumnStatistics col = - footer->statistics(static_cast<int32_t>(index)); + proto::ColumnStatistics col = footer->statistics(static_cast<int32_t>(index)); StatContext statContext(hasCorrectStatistics()); - return std::unique_ptr<ColumnStatistics> (convertColumnStatistics(col, statContext)); + return std::unique_ptr<ColumnStatistics>(convertColumnStatistics(col, statContext)); } void ReaderImpl::readMetadata() const { - uint64_t metadataSize = contents->postscript->metadatalength(); - uint64_t footerLength = contents->postscript->footerlength(); + uint64_t metadataSize = contents->postscript->metadata_length(); + uint64_t footerLength = contents->postscript->footer_length(); if (fileLength < metadataSize + footerLength + postscriptLength + 1) { std::stringstream msg; msg << "Invalid Metadata length: fileLength=" << fileLength @@ -824,15 +803,11 @@ namespace orc { } uint64_t metadataStart = fileLength - metadataSize - footerLength - postscriptLength - 1; if 
(metadataSize != 0) { - std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(contents->compression, - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream(contents->stream.get(), - metadataStart, - metadataSize, - *contents->pool)), - contents->blockSize, - *contents->pool); + std::unique_ptr<SeekableInputStream> pbStream = createDecompressor( + contents->compression, + std::make_unique<SeekableFileInputStream>(contents->stream.get(), metadataStart, + metadataSize, *contents->pool), + contents->blockSize, *contents->pool, contents->readerMetrics); contents->metadata.reset(new proto::Metadata()); if (!contents->metadata->ParseFromZeroCopyStream(pbStream.get())) { throw ParseError("Failed to parse the metadata"); @@ -848,10 +823,9 @@ namespace orc { void ReaderImpl::checkOrcVersion() { FileVersion version = getFormatVersion(); if (version != FileVersion(0, 11) && version != FileVersion(0, 12)) { - *(options.getErrorStream()) - << "Warning: ORC file " << contents->stream->getName() - << " was written in an unknown format version " - << version.toString() << "\n"; + *(options.getErrorStream()) << "Warning: ORC file " << contents->stream->getName() + << " was written in an unknown format version " + << version.toString() << "\n"; } } @@ -860,13 +834,12 @@ namespace orc { return createRowReader(defaultOpts); } - std::unique_ptr<RowReader> ReaderImpl::createRowReader( - const RowReaderOptions& opts) const { + std::unique_ptr<RowReader> ReaderImpl::createRowReader(const RowReaderOptions& opts) const { if (opts.getSearchArgument() && !isMetadataLoaded) { // load stripe statistics for PPD readMetadata(); } - return std::unique_ptr<RowReader>(new RowReaderImpl(contents, opts)); + return std::make_unique<RowReaderImpl>(contents, opts); } uint64_t maxStreamsForType(const proto::Type& type) { @@ -895,8 +868,8 @@ namespace orc { case proto::Type_Kind_VARCHAR: return 4; default: - return 0; - } + return 0; + } } uint64_t ReaderImpl::getMemoryUse(int 
stripeIx) { @@ -910,8 +883,8 @@ namespace orc { selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); ColumnSelector column_selector(contents.get()); if (contents->schema->getKind() == STRUCT && include.begin() != include.end()) { - for(std::list<uint64_t>::const_iterator field = include.begin(); - field != include.end(); ++field) { + for (std::list<uint64_t>::const_iterator field = include.begin(); field != include.end(); + ++field) { column_selector.updateSelectedByFieldId(selectedColumns, *field); } } else { @@ -919,7 +892,7 @@ namespace orc { std::fill(selectedColumns.begin(), selectedColumns.end(), true); } column_selector.selectParents(selectedColumns, *contents->schema.get()); - selectedColumns[0] = true; // column 0 is selected by default + selectedColumns[0] = true; // column 0 is selected by default return getMemoryUse(stripeIx, selectedColumns); } @@ -928,8 +901,8 @@ namespace orc { selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); ColumnSelector column_selector(contents.get()); if (contents->schema->getKind() == STRUCT && names.begin() != names.end()) { - for(std::list<std::string>::const_iterator field = names.begin(); - field != names.end(); ++field) { + for (std::list<std::string>::const_iterator field = names.begin(); field != names.end(); + ++field) { column_selector.updateSelectedByName(selectedColumns, *field); } } else { @@ -937,7 +910,7 @@ namespace orc { std::fill(selectedColumns.begin(), selectedColumns.end(), true); } column_selector.selectParents(selectedColumns, *contents->schema.get()); - selectedColumns[0] = true; // column 0 is selected by default + selectedColumns[0] = true; // column 0 is selected by default return getMemoryUse(stripeIx, selectedColumns); } @@ -946,8 +919,8 @@ namespace orc { selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); ColumnSelector column_selector(contents.get()); if (include.begin() != include.end()) { - 
for(std::list<uint64_t>::const_iterator field = include.begin(); - field != include.end(); ++field) { + for (std::list<uint64_t>::const_iterator field = include.begin(); field != include.end(); + ++field) { column_selector.updateSelectedByTypeId(selectedColumns, *field); } } else { @@ -955,7 +928,7 @@ namespace orc { std::fill(selectedColumns.begin(), selectedColumns.end(), true); } column_selector.selectParents(selectedColumns, *contents->schema.get()); - selectedColumns[0] = true; // column 0 is selected by default + selectedColumns[0] = true; // column 0 is selected by default return getMemoryUse(stripeIx, selectedColumns); } @@ -963,13 +936,13 @@ namespace orc { uint64_t maxDataLength = 0; if (stripeIx >= 0 && stripeIx < footer->stripes_size()) { - uint64_t stripe = footer->stripes(stripeIx).datalength(); + uint64_t stripe = footer->stripes(stripeIx).data_length(); if (maxDataLength < stripe) { maxDataLength = stripe; } } else { - for (int i=0; i < footer->stripes_size(); i++) { - uint64_t stripe = footer->stripes(i).datalength(); + for (int i = 0; i < footer->stripes_size(); i++) { + uint64_t stripe = footer->stripes(i).data_length(); if (maxDataLength < stripe) { maxDataLength = stripe; } @@ -978,10 +951,10 @@ namespace orc { bool hasStringColumn = false; uint64_t nSelectedStreams = 0; - for (int i=0; !hasStringColumn && i < footer->types_size(); i++) { + for (int i = 0; !hasStringColumn && i < footer->types_size(); i++) { if (selectedColumns[static_cast<size_t>(i)]) { const proto::Type& type = footer->types(i); - nSelectedStreams += maxStreamsForType(type) ; + nSelectedStreams += maxStreamsForType(type); switch (static_cast<int64_t>(type.kind())) { case proto::Type_Kind_CHAR: case proto::Type_Kind_STRING: @@ -997,22 +970,23 @@ namespace orc { } } - /* If a string column is read, use stripe datalength as a memory estimate + /* If a string column is read, use stripe data_length as a memory estimate * because we don't know the dictionary size. 
Multiply by 2 because * a string column requires two buffers: * in the input stream and in the seekable input stream. * If no string column is read, estimate from the number of streams. */ - uint64_t memory = hasStringColumn ? 2 * maxDataLength : - std::min(uint64_t(maxDataLength), - nSelectedStreams * contents->stream->getNaturalReadSize()); + uint64_t memory = hasStringColumn + ? 2 * maxDataLength + : std::min(uint64_t(maxDataLength), + nSelectedStreams * contents->stream->getNaturalReadSize()); // Do we need even more memory to read the footer or the metadata? - if (memory < contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS) { - memory = contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS; + if (memory < contents->postscript->footer_length() + DIRECTORY_SIZE_GUESS) { + memory = contents->postscript->footer_length() + DIRECTORY_SIZE_GUESS; } - if (memory < contents->postscript->metadatalength()) { - memory = contents->postscript->metadatalength(); + if (memory < contents->postscript->metadata_length()) { + memory = contents->postscript->metadata_length(); } // Account for firstRowOfStripe. 
@@ -1021,7 +995,7 @@ namespace orc { // Decompressors need buffers for each stream uint64_t decompressorMemory = 0; if (contents->compression != CompressionKind_NONE) { - for (int i=0; i < footer->types_size(); i++) { + for (int i = 0; i < footer->types_size(); i++) { if (selectedColumns[static_cast<size_t>(i)]) { const proto::Type& type = footer->types(i); decompressorMemory += maxStreamsForType(type) * contents->blockSize; @@ -1032,7 +1006,7 @@ namespace orc { } } - return memory + decompressorMemory ; + return memory + decompressorMemory; } // Update fields to indicate we've reached the end of file @@ -1045,17 +1019,17 @@ namespace orc { previousRow = 0; } else { previousRow = firstRowOfStripe[lastStripe - 1] + - footer->stripes(static_cast<int>(lastStripe - 1)).numberofrows(); + footer->stripes(static_cast<int>(lastStripe - 1)).number_of_rows(); } } void RowReaderImpl::startNextStripe() { - reader.reset(); // ColumnReaders use lots of memory; free old memory first + reader.reset(); // ColumnReaders use lots of memory; free old memory first rowIndexes.clear(); bloomFilterIndex.clear(); // evaluate file statistics if it exists - if (sargsApplier && !sargsApplier->evaluateFileStatistics(*footer)) { + if (sargsApplier && !sargsApplier->evaluateFileStatistics(*footer, numRowGroupsInStripeRange)) { // skip the entire file markEndOfFile(); return; @@ -1064,25 +1038,32 @@ namespace orc { do { currentStripeInfo = footer->stripes(static_cast<int>(currentStripe)); uint64_t fileLength = contents->stream->getLength(); - if (currentStripeInfo.offset() + currentStripeInfo.indexlength() + - currentStripeInfo.datalength() + currentStripeInfo.footerlength() >= fileLength) { + if (currentStripeInfo.offset() + currentStripeInfo.index_length() + + currentStripeInfo.data_length() + currentStripeInfo.footer_length() >= + fileLength) { std::stringstream msg; - msg << "Malformed StripeInformation at stripe index " << currentStripe << ": fileLength=" - << fileLength << ", 
StripeInfo=(offset=" << currentStripeInfo.offset() << ", indexLength=" - << currentStripeInfo.indexlength() << ", dataLength=" << currentStripeInfo.datalength() - << ", footerLength=" << currentStripeInfo.footerlength() << ")"; + msg << "Malformed StripeInformation at stripe index " << currentStripe + << ": fileLength=" << fileLength + << ", StripeInfo=(offset=" << currentStripeInfo.offset() + << ", indexLength=" << currentStripeInfo.index_length() + << ", dataLength=" << currentStripeInfo.data_length() + << ", footerLength=" << currentStripeInfo.footer_length() << ")"; throw ParseError(msg.str()); } currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get()); - rowsInCurrentStripe = currentStripeInfo.numberofrows(); + rowsInCurrentStripe = currentStripeInfo.number_of_rows(); + processingStripe = currentStripe; if (sargsApplier) { bool isStripeNeeded = true; if (contents->metadata) { const auto& currentStripeStats = - contents->metadata->stripestats(static_cast<int>(currentStripe)); + contents->metadata->stripe_stats(static_cast<int>(currentStripe)); // skip this stripe after stats fail to satisfy sargs - isStripeNeeded = sargsApplier->evaluateStripeStatistics(currentStripeStats); + uint64_t stripeRowGroupCount = + (rowsInCurrentStripe + footer->row_index_stride() - 1) / footer->row_index_stride(); + isStripeNeeded = + sargsApplier->evaluateStripeStatistics(currentStripeStats, stripeRowGroupCount); } if (isStripeNeeded) { @@ -1090,9 +1071,7 @@ namespace orc { loadStripeIndex(); // select row groups to read in the current stripe - sargsApplier->pickRowGroups(rowsInCurrentStripe, - rowIndexes, - bloomFilterIndex); + sargsApplier->pickRowGroups(rowsInCurrentStripe, rowIndexes, bloomFilterIndex); if (sargsApplier->hasSelectedFrom(currentRowInStripe)) { // current stripe has at least one row group matching the predicate break; @@ -1110,26 +1089,23 @@ namespace orc { if (currentStripe < lastStripe) { // get writer timezone info from stripe footer to help 
understand timestamp values. const Timezone& writerTimezone = - currentStripeFooter.has_writertimezone() ? - getTimezoneByName(currentStripeFooter.writertimezone()) : - localTimezone; - StripeStreamsImpl stripeStreams(*this, currentStripe, currentStripeInfo, - currentStripeFooter, - currentStripeInfo.offset(), - *contents->stream, - writerTimezone, + currentStripeFooter.has_writer_timezone() + ? getTimezoneByName(currentStripeFooter.writer_timezone()) + : localTimezone; + StripeStreamsImpl stripeStreams(*this, currentStripe, currentStripeInfo, currentStripeFooter, + currentStripeInfo.offset(), *contents->stream, writerTimezone, readerTimezone); - reader = buildReader(*contents->schema, stripeStreams); + reader = buildReader(*contents->schema, stripeStreams, useTightNumericVector, + throwOnSchemaEvolutionOverflow, /*convertToReadType=*/true); if (sargsApplier) { // move to the 1st selected row group when PPD is enabled. - currentRowInStripe = advanceToNextRowGroup(currentRowInStripe, - rowsInCurrentStripe, - footer->rowindexstride(), - sargsApplier->getNextSkippedRows()); + currentRowInStripe = + advanceToNextRowGroup(currentRowInStripe, rowsInCurrentStripe, + footer->row_index_stride(), sargsApplier->getNextSkippedRows()); previousRow = firstRowOfStripe[currentStripe] + currentRowInStripe - 1; if (currentRowInStripe > 0) { - seekToRowGroup(static_cast<uint32_t>(currentRowInStripe / footer->rowindexstride())); + seekToRowGroup(static_cast<uint32_t>(currentRowInStripe / footer->row_index_stride())); } } } else { @@ -1139,6 +1115,7 @@ namespace orc { } bool RowReaderImpl::next(ColumnVectorBatch& data) { + SCOPED_STOPWATCH(contents->readerMetrics, ReaderInclusiveLatencyUs, ReaderCall); if (currentStripe >= lastStripe) { data.numElements = 0; markEndOfFile(); @@ -1148,14 +1125,10 @@ namespace orc { startNextStripe(); } uint64_t rowsToRead = - std::min(static_cast<uint64_t>(data.capacity), - rowsInCurrentStripe - currentRowInStripe); + 
std::min(static_cast<uint64_t>(data.capacity), rowsInCurrentStripe - currentRowInStripe); if (sargsApplier && rowsToRead > 0) { - rowsToRead = computeBatchSize(rowsToRead, - currentRowInStripe, - rowsInCurrentStripe, - footer->rowindexstride(), - sargsApplier->getNextSkippedRows()); + rowsToRead = computeBatchSize(rowsToRead, currentRowInStripe, rowsInCurrentStripe, + footer->row_index_stride(), sargsApplier->getNextSkippedRows()); } data.numElements = rowsToRead; if (rowsToRead == 0) { @@ -1164,8 +1137,7 @@ namespace orc { } if (enableEncodedBlock) { reader->nextEncoded(data, rowsToRead, nullptr); - } - else { + } else { reader->next(data, rowsToRead, nullptr); } // update row number @@ -1174,15 +1146,14 @@ namespace orc { // check if we need to advance to next selected row group if (sargsApplier) { - uint64_t nextRowToRead = advanceToNextRowGroup(currentRowInStripe, - rowsInCurrentStripe, - footer->rowindexstride(), - sargsApplier->getNextSkippedRows()); + uint64_t nextRowToRead = + advanceToNextRowGroup(currentRowInStripe, rowsInCurrentStripe, footer->row_index_stride(), + sargsApplier->getNextSkippedRows()); if (currentRowInStripe != nextRowToRead) { // it is guaranteed to be at start of a row group currentRowInStripe = nextRowToRead; if (currentRowInStripe < rowsInCurrentStripe) { - seekToRowGroup(static_cast<uint32_t>(currentRowInStripe / footer->rowindexstride())); + seekToRowGroup(static_cast<uint32_t>(currentRowInStripe / footer->row_index_stride())); } } } @@ -1194,10 +1165,8 @@ namespace orc { return rowsToRead != 0; } - uint64_t RowReaderImpl::computeBatchSize(uint64_t requestedSize, - uint64_t currentRowInStripe, - uint64_t rowsInCurrentStripe, - uint64_t rowIndexStride, + uint64_t RowReaderImpl::computeBatchSize(uint64_t requestedSize, uint64_t currentRowInStripe, + uint64_t rowsInCurrentStripe, uint64_t rowIndexStride, const std::vector<uint64_t>& nextSkippedRows) { // In case of PPD, batch size should be aware of row group boundaries. 
If only a subset of row // groups are selected then marker position is set to the end of range (subset of row groups @@ -1240,18 +1209,39 @@ namespace orc { return rowsInCurrentStripe; } - std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch - (uint64_t capacity) const { - return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock); + static void getColumnIds(const Type* type, std::set<uint64_t>& columnIds) { + columnIds.insert(type->getColumnId()); + for (uint64_t i = 0; i < type->getSubtypeCount(); ++i) { + getColumnIds(type->getSubtype(i), columnIds); + } } - void ensureOrcFooter(InputStream* stream, - DataBuffer<char> *buffer, - uint64_t postscriptLength) { + std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch(uint64_t capacity) const { + // If the read type is specified, then check that the selected schema matches the read type + // on the first call to createRowBatch. + if (schemaEvolution.getReadType() && selectedSchema.get() == nullptr) { + auto fileSchema = &getSelectedType(); + auto readType = schemaEvolution.getReadType(); + std::set<uint64_t> readColumns, fileColumns; + getColumnIds(readType, readColumns); + getColumnIds(fileSchema, fileColumns); + if (readColumns != fileColumns) { + std::ostringstream ss; + ss << "The selected schema " << fileSchema->toString() << " doesn't match read type " + << readType->toString(); + throw SchemaEvolutionError(ss.str()); + } + } + const Type& readType = + schemaEvolution.getReadType() ? 
*schemaEvolution.getReadType() : getSelectedType(); + return readType.createRowBatch(capacity, *contents->pool, enableEncodedBlock, + useTightNumericVector); + } + void ensureOrcFooter(InputStream* stream, DataBuffer<char>* buffer, uint64_t postscriptLength) { const std::string MAGIC("ORC"); const uint64_t magicLength = MAGIC.length(); - const char * const bufferStart = buffer->data(); + const char* const bufferStart = buffer->data(); const uint64_t bufferLength = buffer->size(); if (postscriptLength < magicLength || bufferLength < magicLength) { @@ -1263,7 +1253,7 @@ namespace orc { if (memcmp(magicStart, MAGIC.c_str(), magicLength) != 0) { // If there is no magic string at the end, check the beginning. // Only files written by Hive 0.11.0 don't have the tail ORC string. - std::unique_ptr<char[]> frontBuffer( new char[magicLength] ); + std::unique_ptr<char[]> frontBuffer(new char[magicLength]); stream->read(frontBuffer.get(), magicLength, 0); bool foundMatch = memcmp(frontBuffer.get(), MAGIC.c_str(), magicLength) == 0; @@ -1279,28 +1269,25 @@ namespace orc { * @param buffer the buffer with the tail of the file. 
* @param postscriptSize the length of postscript in bytes */ - std::unique_ptr<proto::PostScript> readPostscript(InputStream *stream, - DataBuffer<char> *buffer, + std::unique_ptr<proto::PostScript> readPostscript(InputStream* stream, DataBuffer<char>* buffer, uint64_t postscriptSize) { - char *ptr = buffer->data(); + char* ptr = buffer->data(); uint64_t readSize = buffer->size(); ensureOrcFooter(stream, buffer, postscriptSize); - std::unique_ptr<proto::PostScript> postscript = - std::unique_ptr<proto::PostScript>(new proto::PostScript()); + auto postscript = std::make_unique<proto::PostScript>(); if (readSize < 1 + postscriptSize) { std::stringstream msg; - msg << "Invalid ORC postscript length: " << postscriptSize << ", file length = " - << stream->getLength(); + msg << "Invalid ORC postscript length: " << postscriptSize + << ", file length = " << stream->getLength(); throw ParseError(msg.str()); } if (!postscript->ParseFromArray(ptr + readSize - 1 - postscriptSize, - static_cast<int>(postscriptSize))) { - throw ParseError("Failed to parse the postscript from " + - stream->getName()); + static_cast<int>(postscriptSize))) { + throw ParseError("Failed to parse the postscript from " + stream->getName()); } - return REDUNDANT_MOVE(postscript); + return postscript; } /** @@ -1308,7 +1295,7 @@ namespace orc { * so we won't crash when we convert the proto::Types to TypeImpls (ORC-317). * For STRUCT types, fieldName size should match subTypes size (ORC-581). 
*/ - void checkProtoTypes(const proto::Footer &footer) { + void checkProtoTypes(const proto::Footer& footer) { std::stringstream msg; int maxId = footer.types_size(); if (maxId <= 0) { @@ -1316,17 +1303,16 @@ namespace orc { } for (int i = 0; i < maxId; ++i) { const proto::Type& type = footer.types(i); - if (type.kind() == proto::Type_Kind_STRUCT - && type.subtypes_size() != type.fieldnames_size()) { + if (type.kind() == proto::Type_Kind_STRUCT && + type.subtypes_size() != type.field_names_size()) { msg << "Footer is corrupt: STRUCT type " << i << " has " << type.subtypes_size() - << " subTypes, but has " << type.fieldnames_size() << " fieldNames"; + << " subTypes, but has " << type.field_names_size() << " fieldNames"; throw ParseError(msg.str()); } for (int j = 0; j < type.subtypes_size(); ++j) { int subTypeId = static_cast<int>(type.subtypes(j)); if (subTypeId <= i) { - msg << "Footer is corrupt: malformed link from type " << i << " to " - << subTypeId; + msg << "Footer is corrupt: malformed link from type " << i << " to " << subTypeId; throw ParseError(msg.str()); } if (subTypeId >= maxId) { @@ -1334,9 +1320,8 @@ namespace orc { throw ParseError(msg.str()); } if (j > 0 && static_cast<int>(type.subtypes(j - 1)) >= subTypeId) { - msg << "Footer is corrupt: subType(" << (j-1) << ") >= subType(" << j - << ") in types(" << i << "). (" << type.subtypes(j - 1) << " >= " - << subTypeId << ")"; + msg << "Footer is corrupt: subType(" << (j - 1) << ") >= subType(" << j << ") in types(" + << i << "). 
(" << type.subtypes(j - 1) << " >= " << subTypeId << ")"; throw ParseError(msg.str()); } } @@ -1351,37 +1336,31 @@ namespace orc { * @param ps the file's postscript * @param memoryPool the memory pool to use */ - std::unique_ptr<proto::Footer> readFooter(InputStream* stream, - const DataBuffer<char> *buffer, - uint64_t footerOffset, - const proto::PostScript& ps, - MemoryPool& memoryPool) { - const char *footerPtr = buffer->data() + footerOffset; - - std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(convertCompressionKind(ps), - std::unique_ptr<SeekableInputStream> - (new SeekableArrayInputStream(footerPtr, - ps.footerlength())), - getCompressionBlockSize(ps), - memoryPool); - - std::unique_ptr<proto::Footer> footer = - std::unique_ptr<proto::Footer>(new proto::Footer()); + std::unique_ptr<proto::Footer> readFooter(InputStream* stream, const DataBuffer<char>* buffer, + uint64_t footerOffset, const proto::PostScript& ps, + MemoryPool& memoryPool, ReaderMetrics* readerMetrics) { + const char* footerPtr = buffer->data() + footerOffset; + + std::unique_ptr<SeekableInputStream> pbStream = createDecompressor( + convertCompressionKind(ps), + std::make_unique<SeekableArrayInputStream>(footerPtr, ps.footer_length()), + getCompressionBlockSize(ps), memoryPool, readerMetrics); + + auto footer = std::make_unique<proto::Footer>(); if (!footer->ParseFromZeroCopyStream(pbStream.get())) { - throw ParseError("Failed to parse the footer from " + - stream->getName()); + throw ParseError("Failed to parse the footer from " + stream->getName()); } checkProtoTypes(*footer); - return REDUNDANT_MOVE(footer); + return footer; } std::unique_ptr<Reader> createReader(std::unique_ptr<InputStream> stream, const ReaderOptions& options) { - std::shared_ptr<FileContents> contents = std::shared_ptr<FileContents>(new FileContents()); + auto contents = std::make_shared<FileContents>(); contents->pool = options.getMemoryPool(); contents->errorStream = options.getErrorStream(); + 
contents->readerMetrics = options.getReaderMetrics(); std::string serializedFooter = options.getSerializedFileTail(); uint64_t fileLength; uint64_t postscriptLength; @@ -1391,27 +1370,25 @@ namespace orc { if (!tail.ParseFromString(TString(serializedFooter))) { throw ParseError("Failed to parse the file tail from string"); } - contents->postscript.reset(new proto::PostScript(tail.postscript())); - contents->footer.reset(new proto::Footer(tail.footer())); - fileLength = tail.filelength(); - postscriptLength = tail.postscriptlength(); + contents->postscript = std::make_unique<proto::PostScript>(tail.postscript()); + contents->footer = std::make_unique<proto::Footer>(tail.footer()); + fileLength = tail.file_length(); + postscriptLength = tail.postscript_length(); } else { // figure out the size of the file using the option or filesystem - fileLength = std::min(options.getTailLocation(), - static_cast<uint64_t>(stream->getLength())); + fileLength = std::min(options.getTailLocation(), static_cast<uint64_t>(stream->getLength())); - //read last bytes into buffer to get PostScript + // read last bytes into buffer to get PostScript uint64_t readSize = std::min(fileLength, DIRECTORY_SIZE_GUESS); if (readSize < 4) { throw ParseError("File size too small"); } - std::unique_ptr<DataBuffer<char>> buffer( new DataBuffer<char>(*contents->pool, readSize) ); + auto buffer = std::make_unique<DataBuffer<char>>(*contents->pool, readSize); stream->read(buffer->data(), readSize, fileLength - readSize); postscriptLength = buffer->data()[readSize - 1] & 0xff; - contents->postscript = REDUNDANT_MOVE(readPostscript(stream.get(), - buffer.get(), postscriptLength)); - uint64_t footerSize = contents->postscript->footerlength(); + contents->postscript = readPostscript(stream.get(), buffer.get(), postscriptLength); + uint64_t footerSize = contents->postscript->footer_length(); uint64_t tailSize = 1 + postscriptLength + footerSize; if (tailSize >= fileLength) { std::stringstream msg; @@ -1428,8 
+1405,8 @@ namespace orc { footerOffset = readSize - tailSize; } - contents->footer = REDUNDANT_MOVE(readFooter(stream.get(), buffer.get(), - footerOffset, *contents->postscript, *contents->pool)); + contents->footer = readFooter(stream.get(), buffer.get(), footerOffset, *contents->postscript, + *contents->pool, contents->readerMetrics); } contents->isDecimalAsLong = false; if (contents->postscript->version_size() == 2) { @@ -1439,27 +1416,23 @@ namespace orc { } } contents->stream = std::move(stream); - return std::unique_ptr<Reader>(new ReaderImpl(std::move(contents), - options, - fileLength, - postscriptLength)); + return std::make_unique<ReaderImpl>(std::move(contents), options, fileLength, postscriptLength); } - std::map<uint32_t, BloomFilterIndex> - ReaderImpl::getBloomFilters(uint32_t stripeIndex, - const std::set<uint32_t>& included) const { + std::map<uint32_t, BloomFilterIndex> ReaderImpl::getBloomFilters( + uint32_t stripeIndex, const std::set<uint32_t>& included) const { std::map<uint32_t, BloomFilterIndex> ret; // find stripe info if (stripeIndex >= static_cast<uint32_t>(footer->stripes_size())) { - throw std::logic_error("Illegal stripe index: " + to_string(static_cast<int64_t>(stripeIndex))); + throw std::logic_error("Illegal stripe index: " + + to_string(static_cast<int64_t>(stripeIndex))); } const proto::StripeInformation currentStripeInfo = - footer->stripes(static_cast<int>(stripeIndex)); - const proto::StripeFooter currentStripeFooter = - getStripeFooter(currentStripeInfo, *contents); + footer->stripes(static_cast<int>(stripeIndex)); + const proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents); - // iterate stripe footer to get stream of bloomfilter + // iterate stripe footer to get stream of bloom_filter uint64_t offset = static_cast<uint64_t>(currentStripeInfo.offset()); for (int i = 0; i < currentStripeFooter.streams_size(); i++) { const proto::Stream& stream = currentStripeFooter.streams(i); @@ -1469,16 
+1442,11 @@ namespace orc { // a bloom filter stream from a selected column is found if (stream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8 && (included.empty() || included.find(column) != included.end())) { - std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(contents->compression, - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream(contents->stream.get(), - offset, - length, - *contents->pool)), - contents->blockSize, - *(contents->pool)); + createDecompressor(contents->compression, + std::make_unique<SeekableFileInputStream>( + contents->stream.get(), offset, length, *contents->pool), + contents->blockSize, *(contents->pool), contents->readerMetrics); proto::BloomFilterIndex pbBFIndex; if (!pbBFIndex.ParseFromZeroCopyStream(pbStream.get())) { @@ -1486,11 +1454,10 @@ namespace orc { } BloomFilterIndex bfIndex; - for (int j = 0; j < pbBFIndex.bloomfilter_size(); j++) { - std::unique_ptr<BloomFilter> entry = BloomFilterUTF8Utils::deserialize( - stream.kind(), - currentStripeFooter.columns(static_cast<int>(stream.column())), - pbBFIndex.bloomfilter(j)); + for (int j = 0; j < pbBFIndex.bloom_filter_size(); j++) { + std::unique_ptr<BloomFilter> entry = BloomFilterUTF8Utils::deserialize( + stream.kind(), currentStripeFooter.columns(static_cast<int>(stream.column())), + pbBFIndex.bloom_filter(j)); bfIndex.entries.push_back(std::shared_ptr<BloomFilter>(std::move(entry))); } @@ -1512,10 +1479,8 @@ namespace orc { // PASS } - InputStream::~InputStream() { - // PASS + InputStream::~InputStream(){ + // PASS }; - - -}// namespace +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/Reader.hh b/contrib/libs/apache/orc/c++/src/Reader.hh index ffaff4176e..a1367e4bd3 100644 --- a/contrib/libs/apache/orc/c++/src/Reader.hh +++ b/contrib/libs/apache/orc/c++/src/Reader.hh @@ -26,20 +26,22 @@ #include "ColumnReader.hh" #include "RLE.hh" -#include "sargs/SargsApplier.hh" +#include "SchemaEvolution.hh" #include "TypeImpl.hh" +#include 
"sargs/SargsApplier.hh" namespace orc { static const uint64_t DIRECTORY_SIZE_GUESS = 16 * 1024; /** - * WriterVersion Implementation - */ + * WriterVersion Implementation + */ class WriterVersionImpl { - private: + private: WriterVersion version; - public: + + public: // Known Versions with issues resolved // The static method below is to fix global constructors Clang warning static const WriterVersionImpl& VERSION_HIVE_8732(); @@ -52,8 +54,8 @@ namespace orc { }; /** - * State shared between Reader and Row Reader - */ + * State shared between Reader and Row Reader + */ struct FileContents { std::unique_ptr<InputStream> stream; std::unique_ptr<proto::PostScript> postscript; @@ -61,12 +63,13 @@ namespace orc { std::unique_ptr<Type> schema; uint64_t blockSize; CompressionKind compression; - MemoryPool *pool; - std::ostream *errorStream; + MemoryPool* pool; + std::ostream* errorStream; /// Decimal64 in ORCv2 uses RLE to store values. This flag indicates whether /// this new encoding is used. bool isDecimalAsLong; std::unique_ptr<proto::Metadata> metadata; + ReaderMetrics* readerMetrics; }; proto::StripeFooter getStripeFooter(const proto::StripeInformation& info, @@ -109,10 +112,10 @@ namespace orc { // is selected. bool selectParents(std::vector<bool>& selectedColumns, const Type& type); - /** - * Constructor that selects columns. - * @param contents of the file - */ + /** + * Constructor that selects columns. 
+ * @param contents of the file + */ ColumnSelector(const FileContents* contents); // Select the columns from the RowReaderoptions object @@ -122,9 +125,8 @@ namespace orc { void updateSelected(std::vector<bool>& selectedColumns, const ReaderOptions& options); }; - class RowReaderImpl : public RowReader { - private: + private: const Timezone& localTimezone; // contents @@ -145,14 +147,19 @@ namespace orc { uint64_t previousRow; uint64_t firstStripe; uint64_t currentStripe; - uint64_t lastStripe; // the stripe AFTER the last one + uint64_t lastStripe; // the stripe AFTER the last one + uint64_t processingStripe; uint64_t currentRowInStripe; uint64_t rowsInCurrentStripe; + // number of row groups between first stripe and last stripe + uint64_t numRowGroupsInStripeRange; proto::StripeInformation currentStripeInfo; proto::StripeFooter currentStripeFooter; std::unique_ptr<ColumnReader> reader; bool enableEncodedBlock; + bool useTightNumericVector; + bool throwOnSchemaEvolutionOverflow; // internal methods void startNextStripe(); inline void markEndOfFile(); @@ -166,27 +173,32 @@ namespace orc { // desired timezone to return data of timestamp types. const Timezone& readerTimezone; + // match read and file types + SchemaEvolution schemaEvolution; + // load stripe index if not done so void loadStripeIndex(); // In case of PPD, batch size should be aware of row group boundaries. // If only a subset of row groups are selected then the next read should // stop at the end of selected range. 
- static uint64_t computeBatchSize(uint64_t requestedSize, - uint64_t currentRowInStripe, - uint64_t rowsInCurrentStripe, - uint64_t rowIndexStride, + static uint64_t computeBatchSize(uint64_t requestedSize, uint64_t currentRowInStripe, + uint64_t rowsInCurrentStripe, uint64_t rowIndexStride, const std::vector<uint64_t>& nextSkippedRows); // Skip non-selected rows - static uint64_t advanceToNextRowGroup(uint64_t currentRowInStripe, - uint64_t rowsInCurrentStripe, + static uint64_t advanceToNextRowGroup(uint64_t currentRowInStripe, uint64_t rowsInCurrentStripe, uint64_t rowIndexStride, const std::vector<uint64_t>& nextSkippedRows); friend class TestRowReader_advanceToNextRowGroup_Test; friend class TestRowReader_computeBatchSize_Test; + // whether the current stripe is initialized + inline bool isCurrentStripeInited() const { + return currentStripe == processingStripe; + } + /** * Seek to the start of a row group in the current stripe * @param rowGroupEntryId the row group id to seek to @@ -200,22 +212,20 @@ namespace orc { */ bool hasBadBloomFilters(); - public: - /** - * Constructor that lets the user specify additional options. - * @param contents of the file - * @param options options for reading - */ - RowReaderImpl(std::shared_ptr<FileContents> contents, - const RowReaderOptions& options); + public: + /** + * Constructor that lets the user specify additional options. 
+ * @param contents of the file + * @param options options for reading + */ + RowReaderImpl(std::shared_ptr<FileContents> contents, const RowReaderOptions& options); // Select the columns from the options object const std::vector<bool> getSelectedColumns() const override; const Type& getSelectedType() const override; - std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size - ) const override; + std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size) const override; bool next(ColumnVectorBatch& data) override; @@ -231,6 +241,10 @@ namespace orc { bool getThrowOnHive11DecimalOverflow() const; bool getIsDecimalAsLong() const; int32_t getForcedScaleOnHive11Decimal() const; + + const SchemaEvolution* getSchemaEvolution() const { + return &schemaEvolution; + } }; class ReaderImpl : public Reader { @@ -251,12 +265,14 @@ namespace orc { // internal methods void readMetadata() const; void checkOrcVersion(); - void getRowIndexStatistics(const proto::StripeInformation& stripeInfo, uint64_t stripeIndex, - const proto::StripeFooter& currentStripeFooter, - std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const; + void getRowIndexStatistics( + const proto::StripeInformation& stripeInfo, uint64_t stripeIndex, + const proto::StripeFooter& currentStripeFooter, + std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const; // metadata mutable bool isMetadataLoaded; + public: /** * Constructor that lets the user specify additional options. 
@@ -265,10 +281,8 @@ namespace orc { * @param fileLength the length of the file in bytes * @param postscriptLength the length of the postscript in bytes */ - ReaderImpl(std::shared_ptr<FileContents> contents, - const ReaderOptions& options, - uint64_t fileLength, - uint64_t postscriptLength); + ReaderImpl(std::shared_ptr<FileContents> contents, const ReaderOptions& options, + uint64_t fileLength, uint64_t postscriptLength); const ReaderOptions& getReaderOptions() const; @@ -298,20 +312,17 @@ namespace orc { uint64_t getNumberOfStripes() const override; - std::unique_ptr<StripeInformation> getStripe(uint64_t - ) const override; + std::unique_ptr<StripeInformation> getStripe(uint64_t) const override; uint64_t getNumberOfStripeStatistics() const override; const std::string& getStreamName() const override; - std::unique_ptr<StripeStatistics> - getStripeStatistics(uint64_t stripeIndex) const override; + std::unique_ptr<StripeStatistics> getStripeStatistics(uint64_t stripeIndex) const override; std::unique_ptr<RowReader> createRowReader() const override; - std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options - ) const override; + std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options) const override; uint64_t getContentLength() const override; uint64_t getStripeStatisticsLength() const override; @@ -321,8 +332,7 @@ namespace orc { std::unique_ptr<Statistics> getStatistics() const override; - std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId - ) const override; + std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId) const override; std::string getSerializedFileTail() const override; @@ -330,28 +340,41 @@ namespace orc { bool hasCorrectStatistics() const override; - const proto::PostScript* getPostscript() const {return contents->postscript.get();} + const ReaderMetrics* getReaderMetrics() const override { + return contents->readerMetrics; + } - uint64_t getBlockSize() const {return 
contents->blockSize;} + const proto::PostScript* getPostscript() const { + return contents->postscript.get(); + } - const proto::Footer* getFooter() const {return contents->footer.get();} + uint64_t getBlockSize() const { + return contents->blockSize; + } - const Type* getSchema() const {return contents->schema.get();} + const proto::Footer* getFooter() const { + return contents->footer.get(); + } - InputStream* getStream() const {return contents->stream.get();} + const Type* getSchema() const { + return contents->schema.get(); + } + + InputStream* getStream() const { + return contents->stream.get(); + } uint64_t getMemoryUse(int stripeIx = -1) override; - uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) override; + uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx = -1) override; - uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) override; + uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx = -1) override; - uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) override; + uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx = -1) override; - std::map<uint32_t, BloomFilterIndex> - getBloomFilters(uint32_t stripeIndex, const std::set<uint32_t>& included) const override; + std::map<uint32_t, BloomFilterIndex> getBloomFilters( + uint32_t stripeIndex, const std::set<uint32_t>& included) const override; }; - -}// namespace +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc b/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc index 8ab57b1f6e..ae05a70a36 100644 --- a/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc +++ b/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc @@ -17,731 +17,439 @@ */ #include "Adaptor.hh" +#include "BpackingDefault.hh" +#if defined(ORC_HAVE_RUNTIME_AVX512) +#error #include "BpackingAvx512.hh" +#endif #include "Compression.hh" 
-#include "RLEv2.hh" +#include "Dispatch.hh" #include "RLEV2Util.hh" +#include "RLEv2.hh" +#include "Utils.hh" namespace orc { -unsigned char RleDecoderV2::readByte() { - if (bufferStart == bufferEnd) { - int bufferLength; - const void* bufferPointer; - if (!inputStream->Next(&bufferPointer, &bufferLength)) { - throw ParseError("bad read in RleDecoderV2::readByte"); + unsigned char RleDecoderV2::readByte() { + SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs); + if (bufferStart == bufferEnd) { + int bufferLength; + const void* bufferPointer; + if (!inputStream->Next(&bufferPointer, &bufferLength)) { + throw ParseError("bad read in RleDecoderV2::readByte"); + } + bufferStart = const_cast<char*>(static_cast<const char*>(bufferPointer)); + bufferEnd = bufferStart + bufferLength; } - bufferStart = static_cast<const char*>(bufferPointer); - bufferEnd = bufferStart + bufferLength; - } - unsigned char result = static_cast<unsigned char>(*bufferStart++); - return result; -} - -int64_t RleDecoderV2::readLongBE(uint64_t bsz) { - int64_t ret = 0, val; - uint64_t n = bsz; - while (n > 0) { - n--; - val = readByte(); - ret |= (val << (n * 8)); - } - return ret; -} - -inline int64_t RleDecoderV2::readVslong() { - return unZigZag(readVulong()); -} - -uint64_t RleDecoderV2::readVulong() { - uint64_t ret = 0, b; - uint64_t offset = 0; - do { - b = readByte(); - ret |= (0x7f & b) << offset; - offset += 7; - } while (b >= 0x80); - return ret; -} - -void RleDecoderV2::readLongs(int64_t *data, uint64_t offset, uint64_t len, uint64_t fbs) { - switch (fbs) { - case 4: - unrolledUnpack4(data, offset, len); - return; - case 8: - unrolledUnpack8(data, offset, len); - return; - case 16: - unrolledUnpack16(data, offset, len); - return; - case 24: - unrolledUnpack24(data, offset, len); - return; - case 32: - unrolledUnpack32(data, offset, len); - return; - case 40: - unrolledUnpack40(data, offset, len); - return; - case 48: - unrolledUnpack48(data, offset, len); - return; - case 56: - 
unrolledUnpack56(data, offset, len); - return; - case 64: - unrolledUnpack64(data, offset, len); - return; - default: - // Fallback to the default implementation for deprecated bit size. - plainUnpackLongs(data, offset, len, fbs); - return; + unsigned char result = static_cast<unsigned char>(*bufferStart++); + return result; } -} - -void RleDecoderV2::unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8. - while (bitsLeft > 0 && curIdx < offset + len) { - bitsLeft -= 4; - data[curIdx++] = (curByte >> bitsLeft) & 15; - } - if (curIdx == offset + len) return; - - // Exhaust the buffer - uint64_t numGroups = (offset + len - curIdx) / 2; - numGroups = std::min(numGroups, static_cast<uint64_t>(bufferEnd - bufferStart)); - // Avoid updating 'bufferStart' inside the loop. - const auto *buffer = reinterpret_cast<const unsigned char*>(bufferStart); - uint32_t localByte; - for (uint64_t i = 0; i < numGroups; ++i) { - localByte = *buffer++; - data[curIdx] = (localByte >> 4) & 15; - data[curIdx + 1] = localByte & 15; - curIdx += 2; - } - bufferStart = reinterpret_cast<const char*>(buffer); - if (curIdx == offset + len) return; - // readByte() will update 'bufferStart' and 'bufferEnd' - curByte = readByte(); - bitsLeft = 8; - } -} - -void RleDecoderV2::unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = bufferEnd - bufferStart; - bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); - // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - data[curIdx++] = *buffer++; + int64_t RleDecoderV2::readLongBE(uint64_t bsz) { + int64_t ret = 0, val; + uint64_t n = bsz; + while (n > 0) { + n--; + val = readByte(); + ret |= (val << (n * 8)); } - bufferStart = reinterpret_cast<const char*>(buffer); - if (curIdx == offset + len) return; + return ret; + } - // readByte() will update 'bufferStart' and 'bufferEnd'. - data[curIdx++] = readByte(); + inline int64_t RleDecoderV2::readVslong() { + return unZigZag(readVulong()); } -} - -void RleDecoderV2::unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 2; - bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); - uint16_t b0, b1; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast<uint16_t>(*buffer); - b1 = static_cast<uint16_t>(*(buffer + 1)); - buffer += 2; - data[curIdx++] = (b0 << 8) | b1; - } - bufferStart = reinterpret_cast<const char*>(buffer); - if (curIdx == offset + len) return; - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = readByte(); - b1 = readByte(); - data[curIdx++] = (b0 << 8) | b1; + uint64_t RleDecoderV2::readVulong() { + uint64_t ret = 0, b; + uint64_t offset = 0; + do { + b = readByte(); + ret |= (0x7f & b) << offset; + offset += 7; + } while (b >= 0x80); + return ret; } -} - -void RleDecoderV2::unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 3; - bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); - uint32_t b0, b1, b2; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast<uint32_t>(*buffer); - b1 = static_cast<uint32_t>(*(buffer + 1)); - b2 = static_cast<uint32_t>(*(buffer + 2)); - buffer += 3; - data[curIdx++] = static_cast<int64_t>((b0 << 16) | (b1 << 8) | b2); + + struct UnpackDynamicFunction { + using FunctionType = decltype(&BitUnpack::readLongs); + + static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() { +#if defined(ORC_HAVE_RUNTIME_AVX512) + return {{DispatchLevel::NONE, BitUnpackDefault::readLongs}, + {DispatchLevel::AVX512, BitUnpackAVX512::readLongs}}; +#else + return {{DispatchLevel::NONE, BitUnpackDefault::readLongs}}; +#endif } - bufferStart += bufferNum * 3; - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - data[curIdx++] = static_cast<int64_t>((b0 << 16) | (b1 << 8) | b2); + }; + + void RleDecoderV2::readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { + static DynamicDispatch<UnpackDynamicFunction> dispatch; + return dispatch.func(this, data, offset, len, fbs); } -} - -void RleDecoderV2::unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 4; - bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); - uint32_t b0, b1, b2, b3; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast<uint32_t>(*buffer); - b1 = static_cast<uint32_t>(*(buffer + 1)); - b2 = static_cast<uint32_t>(*(buffer + 2)); - b3 = static_cast<uint32_t>(*(buffer + 3)); - buffer += 4; - data[curIdx++] = static_cast<int64_t>((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); - } - bufferStart = reinterpret_cast<const char*>(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - data[curIdx++] = static_cast<int64_t>((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); + + RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input, bool _isSigned, + MemoryPool& pool, ReaderMetrics* _metrics) + : RleDecoder(_metrics), + inputStream(std::move(input)), + isSigned(_isSigned), + firstByte(0), + bufferStart(nullptr), + bufferEnd(bufferStart), + runLength(0), + runRead(0), + bitsLeft(0), + curByte(0), + unpackedPatch(pool, 0), + literals(pool, MAX_LITERAL_SIZE) { + // PASS } -} - -void RleDecoderV2::unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 5; - bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast<uint32_t>(*buffer); - b1 = static_cast<uint32_t>(*(buffer + 1)); - b2 = static_cast<uint32_t>(*(buffer + 2)); - b3 = static_cast<uint32_t>(*(buffer + 3)); - b4 = static_cast<uint32_t>(*(buffer + 4)); - buffer += 5; - data[curIdx++] = static_cast<int64_t>((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); - } - bufferStart = reinterpret_cast<const char*>(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - data[curIdx++] = static_cast<int64_t>((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); + + void RleDecoderV2::seek(PositionProvider& location) { + // move the input stream + inputStream->seek(location); + // clear state + bufferEnd = bufferStart = nullptr; + runRead = runLength = 0; + // skip ahead the given number of records + skip(location.next()); } -} - -void RleDecoderV2::unrolledUnpack48(int64_t *data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 6; - bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4, b5; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast<uint32_t>(*buffer); - b1 = static_cast<uint32_t>(*(buffer + 1)); - b2 = static_cast<uint32_t>(*(buffer + 2)); - b3 = static_cast<uint32_t>(*(buffer + 3)); - b4 = static_cast<uint32_t>(*(buffer + 4)); - b5 = static_cast<uint32_t>(*(buffer + 5)); - buffer += 6; - data[curIdx++] = static_cast<int64_t>((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); + + void RleDecoderV2::skip(uint64_t numValues) { + // simple for now, until perf tests indicate something encoding specific is + // needed + const uint64_t N = 64; + int64_t dummy[N]; + + while (numValues) { + uint64_t nRead = std::min(N, numValues); + next(dummy, nRead, nullptr); + numValues -= nRead; } - bufferStart = reinterpret_cast<const char*>(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - b5 = readByte(); - data[curIdx++] = static_cast<int64_t>((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); } -} - -void RleDecoderV2::unrolledUnpack56(int64_t *data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 7; - bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4, b5, b6; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast<uint32_t>(*buffer); - b1 = static_cast<uint32_t>(*(buffer + 1)); - b2 = static_cast<uint32_t>(*(buffer + 2)); - b3 = static_cast<uint32_t>(*(buffer + 3)); - b4 = static_cast<uint32_t>(*(buffer + 4)); - b5 = static_cast<uint32_t>(*(buffer + 5)); - b6 = static_cast<uint32_t>(*(buffer + 6)); - buffer += 7; - data[curIdx++] = static_cast<int64_t>((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | (b4 << 16) | (b5 << 8) | b6); + + template <typename T> + void RleDecoderV2::next(T* const data, const uint64_t numValues, const char* const notNull) { + SCOPED_STOPWATCH(metrics, DecodingLatencyUs, DecodingCall); + uint64_t nRead = 0; + + while (nRead < numValues) { + // Skip any nulls before attempting to read first byte. 
+ while (notNull && !notNull[nRead]) { + if (++nRead == numValues) { + return; // ended with null values + } + } + + if (runRead == runLength) { + resetRun(); + firstByte = readByte(); + } + + uint64_t offset = nRead, length = numValues - nRead; + + EncodingType enc = static_cast<EncodingType>((firstByte >> 6) & 0x03); + switch (static_cast<int64_t>(enc)) { + case SHORT_REPEAT: + nRead += nextShortRepeats(data, offset, length, notNull); + break; + case DIRECT: + nRead += nextDirect(data, offset, length, notNull); + break; + case PATCHED_BASE: + nRead += nextPatched(data, offset, length, notNull); + break; + case DELTA: + nRead += nextDelta(data, offset, length, notNull); + break; + default: + throw ParseError("unknown encoding"); + } } - bufferStart = reinterpret_cast<const char*>(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - b5 = readByte(); - b6 = readByte(); - data[curIdx++] = static_cast<int64_t>((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | (b4 << 16) | (b5 << 8) | b6); } -} - -void RleDecoderV2::unrolledUnpack64(int64_t *data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 8; - bufferNum = std::min(bufferNum, static_cast<int64_t>(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4, b5, b6, b7; - // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast<const unsigned char*>(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast<uint32_t>(*buffer); - b1 = static_cast<uint32_t>(*(buffer + 1)); - b2 = static_cast<uint32_t>(*(buffer + 2)); - b3 = static_cast<uint32_t>(*(buffer + 3)); - b4 = static_cast<uint32_t>(*(buffer + 4)); - b5 = static_cast<uint32_t>(*(buffer + 5)); - b6 = static_cast<uint32_t>(*(buffer + 6)); - b7 = static_cast<uint32_t>(*(buffer + 7)); - buffer += 8; - data[curIdx++] = static_cast<int64_t>((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); - } - bufferStart = reinterpret_cast<const char*>(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - b5 = readByte(); - b6 = readByte(); - b7 = readByte(); - data[curIdx++] = static_cast<int64_t>((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); + + void RleDecoderV2::next(int64_t* data, uint64_t numValues, const char* notNull) { + next<int64_t>(data, numValues, notNull); } -} - -void RleDecoderV2::plainUnpackLongs(int64_t *data, uint64_t offset, uint64_t len, - uint64_t fbs) { - for (uint64_t i = offset; i < (offset + len); i++) { - uint64_t result = 0; - uint64_t bitsLeftToRead = fbs; - while (bitsLeftToRead > bitsLeft) { - result <<= bitsLeft; - result |= curByte & ((1 << bitsLeft) - 1); - bitsLeftToRead -= bitsLeft; - curByte = readByte(); - bitsLeft = 8; - } - // handle the left over bits - if (bitsLeftToRead > 0) { - result <<= bitsLeftToRead; - bitsLeft -= static_cast<uint32_t>(bitsLeftToRead); - result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); - } - data[i] = static_cast<int64_t>(result); + void RleDecoderV2::next(int32_t* data, uint64_t numValues, const char* notNull) { + next<int32_t>(data, numValues, notNull); } -} - 
-RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input, - bool _isSigned, MemoryPool& pool - ): inputStream(std::move(input)), - isSigned(_isSigned), - firstByte(0), - runLength(0), - runRead(0), - bufferStart(nullptr), - bufferEnd(bufferStart), - bitsLeft(0), - curByte(0), - unpackedPatch(pool, 0), - literals(pool, MAX_LITERAL_SIZE) { - // PASS -} - -void RleDecoderV2::seek(PositionProvider& location) { - // move the input stream - inputStream->seek(location); - // clear state - bufferEnd = bufferStart = nullptr; - runRead = runLength = 0; - // skip ahead the given number of records - skip(location.next()); -} - -void RleDecoderV2::skip(uint64_t numValues) { - // simple for now, until perf tests indicate something encoding specific is - // needed - const uint64_t N = 64; - int64_t dummy[N]; - - while (numValues) { - uint64_t nRead = std::min(N, numValues); - next(dummy, nRead, nullptr); - numValues -= nRead; + + void RleDecoderV2::next(int16_t* data, uint64_t numValues, const char* notNull) { + next<int16_t>(data, numValues, notNull); } -} - -void RleDecoderV2::next(int64_t* const data, - const uint64_t numValues, - const char* const notNull) { - uint64_t nRead = 0; - - while (nRead < numValues) { - // Skip any nulls before attempting to read first byte. 
- while (notNull && !notNull[nRead]) { - if (++nRead == numValues) { - return; // ended with null values - } - } + template <typename T> + uint64_t RleDecoderV2::nextShortRepeats(T* const data, uint64_t offset, uint64_t numValues, + const char* const notNull) { if (runRead == runLength) { - resetRun(); - firstByte = readByte(); - } + // extract the number of fixed bytes + uint64_t byteSize = (firstByte >> 3) & 0x07; + byteSize += 1; - uint64_t offset = nRead, length = numValues - nRead; - - EncodingType enc = static_cast<EncodingType> - ((firstByte >> 6) & 0x03); - switch(static_cast<int64_t>(enc)) { - case SHORT_REPEAT: - nRead += nextShortRepeats(data, offset, length, notNull); - break; - case DIRECT: - nRead += nextDirect(data, offset, length, notNull); - break; - case PATCHED_BASE: - nRead += nextPatched(data, offset, length, notNull); - break; - case DELTA: - nRead += nextDelta(data, offset, length, notNull); - break; - default: - throw ParseError("unknown encoding"); - } - } -} - -uint64_t RleDecoderV2::nextShortRepeats(int64_t* const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bytes - uint64_t byteSize = (firstByte >> 3) & 0x07; - byteSize += 1; - - runLength = firstByte & 0x07; - // run lengths values are stored only after MIN_REPEAT value is met - runLength += MIN_REPEAT; - runRead = 0; - - // read the repeated value which is store using fixed bytes - literals[0] = readLongBE(byteSize); - - if (isSigned) { - literals[0] = unZigZag(static_cast<uint64_t>(literals[0])); + runLength = firstByte & 0x07; + // run lengths values are stored only after MIN_REPEAT value is met + runLength += MIN_REPEAT; + runRead = 0; + + // read the repeated value which is store using fixed bytes + literals[0] = readLongBE(byteSize); + + if (isSigned) { + literals[0] = unZigZag(static_cast<uint64_t>(literals[0])); + } } - } - uint64_t nRead = std::min(runLength - runRead, numValues); + 
uint64_t nRead = std::min(runLength - runRead, numValues); - if (notNull) { - for(uint64_t pos = offset; pos < offset + nRead; ++pos) { - if (notNull[pos]) { - data[pos] = literals[0]; + if (notNull) { + for (uint64_t pos = offset; pos < offset + nRead; ++pos) { + if (notNull[pos]) { + data[pos] = static_cast<T>(literals[0]); + ++runRead; + } + } + } else { + for (uint64_t pos = offset; pos < offset + nRead; ++pos) { + data[pos] = static_cast<T>(literals[0]); ++runRead; } } - } else { - for(uint64_t pos = offset; pos < offset + nRead; ++pos) { - data[pos] = literals[0]; - ++runRead; - } + + return nRead; } - return nRead; -} - -uint64_t RleDecoderV2::nextDirect(int64_t* const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - uint32_t bitSize = decodeBitWidth(fbo); - - // extract the run length - runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; - runLength |= readByte(); - // runs are one off - runLength += 1; - runRead = 0; - - readLongs(literals.data(), 0, runLength, bitSize); - if (isSigned) { - for (uint64_t i = 0; i < runLength; ++i) { - literals[i] = unZigZag(static_cast<uint64_t>(literals[i])); + template <typename T> + uint64_t RleDecoderV2::nextDirect(T* const data, uint64_t offset, uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bits + unsigned char fbo = (firstByte >> 1) & 0x1f; + uint32_t bitSize = decodeBitWidth(fbo); + + // extract the run length + runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; + runLength |= readByte(); + // runs are one off + runLength += 1; + runRead = 0; + + readLongs(literals.data(), 0, runLength, bitSize); + if (isSigned) { + for (uint64_t i = 0; i < runLength; ++i) { + literals[i] = unZigZag(static_cast<uint64_t>(literals[i])); + } } } + + return copyDataFromBuffer(data, offset, numValues, notNull); } 
- return copyDataFromBuffer(data, offset, numValues, notNull); -} - -void RleDecoderV2::adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, - int64_t* resGap, int64_t* resPatch, - uint64_t* patchIdx) { - uint64_t idx = *patchIdx; - uint64_t gap = static_cast<uint64_t>(unpackedPatch[idx]) >> patchBitSize; - int64_t patch = unpackedPatch[idx] & patchMask; - int64_t actualGap = 0; - - // special case: gap is >255 then patch value will be 0. - // if gap is <=255 then patch value cannot be 0 - while (gap == 255 && patch == 0) { - actualGap += 255; - ++idx; - gap = static_cast<uint64_t>(unpackedPatch[idx]) >> patchBitSize; - patch = unpackedPatch[idx] & patchMask; + void RleDecoderV2::adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, int64_t* resGap, + int64_t* resPatch, uint64_t* patchIdx) { + uint64_t idx = *patchIdx; + uint64_t gap = static_cast<uint64_t>(unpackedPatch[idx]) >> patchBitSize; + int64_t patch = unpackedPatch[idx] & patchMask; + int64_t actualGap = 0; + + // special case: gap is >255 then patch value will be 0. 
+ // if gap is <=255 then patch value cannot be 0 + while (gap == 255 && patch == 0) { + actualGap += 255; + ++idx; + gap = static_cast<uint64_t>(unpackedPatch[idx]) >> patchBitSize; + patch = unpackedPatch[idx] & patchMask; + } + // add the left over gap + actualGap += gap; + + *resGap = actualGap; + *resPatch = patch; + *patchIdx = idx; } - // add the left over gap - actualGap += gap; - *resGap = actualGap; - *resPatch = patch; - *patchIdx = idx; -} + template <typename T> + uint64_t RleDecoderV2::nextPatched(T* const data, uint64_t offset, uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bits + unsigned char fbo = (firstByte >> 1) & 0x1f; + uint32_t bitSize = decodeBitWidth(fbo); + + // extract the run length + runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; + runLength |= readByte(); + // runs are one off + runLength += 1; + runRead = 0; + + // extract the number of bytes occupied by base + uint64_t thirdByte = readByte(); + uint64_t byteSize = (thirdByte >> 5) & 0x07; + // base width is one off + byteSize += 1; + + // extract patch width + uint32_t pwo = thirdByte & 0x1f; + uint32_t patchBitSize = decodeBitWidth(pwo); + + // read fourth byte and extract patch gap width + uint64_t fourthByte = readByte(); + uint32_t pgw = (fourthByte >> 5) & 0x07; + // patch gap width is one off + pgw += 1; + + // extract the length of the patch list + size_t pl = fourthByte & 0x1f; + if (pl == 0) { + throw ParseError("Corrupt PATCHED_BASE encoded data (pl==0)!"); + } -uint64_t RleDecoderV2::nextPatched(int64_t* const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - uint32_t bitSize = decodeBitWidth(fbo); - - // extract the run length - runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; - runLength |= readByte(); - // runs are one off - runLength += 1; - 
runRead = 0; - - // extract the number of bytes occupied by base - uint64_t thirdByte = readByte(); - uint64_t byteSize = (thirdByte >> 5) & 0x07; - // base width is one off - byteSize += 1; - - // extract patch width - uint32_t pwo = thirdByte & 0x1f; - uint32_t patchBitSize = decodeBitWidth(pwo); - - // read fourth byte and extract patch gap width - uint64_t fourthByte = readByte(); - uint32_t pgw = (fourthByte >> 5) & 0x07; - // patch gap width is one off - pgw += 1; - - // extract the length of the patch list - size_t pl = fourthByte & 0x1f; - if (pl == 0) { - throw ParseError("Corrupt PATCHED_BASE encoded data (pl==0)!"); - } + // read the next base width number of bytes to extract base value + int64_t base = readLongBE(byteSize); + int64_t mask = (static_cast<int64_t>(1) << ((byteSize * 8) - 1)); + // if mask of base value is 1 then base is negative value else positive + if ((base & mask) != 0) { + base = base & ~mask; + base = -base; + } - // read the next base width number of bytes to extract base value - int64_t base = readLongBE(byteSize); - int64_t mask = (static_cast<int64_t>(1) << ((byteSize * 8) - 1)); - // if mask of base value is 1 then base is negative value else positive - if ((base & mask) != 0) { - base = base & ~mask; - base = -base; - } + readLongs(literals.data(), 0, runLength, bitSize); + // any remaining bits are thrown out + resetReadLongs(); + + // TODO: something more efficient than resize + unpackedPatch.resize(pl); + // TODO: Skip corrupt? 
+ // if ((patchBitSize + pgw) > 64 && !skipCorrupt) { + if ((patchBitSize + pgw) > 64) { + throw ParseError( + "Corrupt PATCHED_BASE encoded data " + "(patchBitSize + pgw > 64)!"); + } + uint32_t cfb = getClosestFixedBits(patchBitSize + pgw); + readLongs(unpackedPatch.data(), 0, pl, cfb); + // any remaining bits are thrown out + resetReadLongs(); - readLongs(literals.data(), 0, runLength, bitSize); - // any remaining bits are thrown out - resetReadLongs(); - - // TODO: something more efficient than resize - unpackedPatch.resize(pl); - // TODO: Skip corrupt? - // if ((patchBitSize + pgw) > 64 && !skipCorrupt) { - if ((patchBitSize + pgw) > 64) { - throw ParseError("Corrupt PATCHED_BASE encoded data " - "(patchBitSize + pgw > 64)!"); - } - uint32_t cfb = getClosestFixedBits(patchBitSize + pgw); - readLongs(unpackedPatch.data(), 0, pl, cfb); - // any remaining bits are thrown out - resetReadLongs(); - - // apply the patch directly when decoding the packed data - int64_t patchMask = ((static_cast<int64_t>(1) << patchBitSize) - 1); - - int64_t gap = 0; - int64_t patch = 0; - uint64_t patchIdx = 0; - adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx); - - for (uint64_t i = 0; i < runLength; ++i) { - if (static_cast<int64_t>(i) != gap) { - // no patching required. add base to unpacked value to get final value - literals[i] += base; - } else { - // extract the patch value - int64_t patchedVal = literals[i] | (patch << bitSize); + // apply the patch directly when decoding the packed data + int64_t patchMask = ((static_cast<int64_t>(1) << patchBitSize) - 1); + + int64_t gap = 0; + int64_t patch = 0; + uint64_t patchIdx = 0; + adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx); + + for (uint64_t i = 0; i < runLength; ++i) { + if (static_cast<int64_t>(i) != gap) { + // no patching required. 
add base to unpacked value to get final value + literals[i] += base; + } else { + // extract the patch value + int64_t patchedVal = literals[i] | (patch << bitSize); - // add base to patched value - literals[i] = base + patchedVal; + // add base to patched value + literals[i] = base + patchedVal; - // increment the patch to point to next entry in patch list - ++patchIdx; + // increment the patch to point to next entry in patch list + ++patchIdx; - if (patchIdx < unpackedPatch.size()) { - adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, - &patchIdx); + if (patchIdx < unpackedPatch.size()) { + adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx); - // next gap is relative to the current gap - gap += i; + // next gap is relative to the current gap + gap += i; + } } } } + + return copyDataFromBuffer(data, offset, numValues, notNull); } - return copyDataFromBuffer(data, offset, numValues, notNull); -} - -uint64_t RleDecoderV2::nextDelta(int64_t* const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - uint32_t bitSize; - if (fbo != 0) { - bitSize = decodeBitWidth(fbo); - } else { - bitSize = 0; - } + template <typename T> + uint64_t RleDecoderV2::nextDelta(T* const data, uint64_t offset, uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bits + unsigned char fbo = (firstByte >> 1) & 0x1f; + uint32_t bitSize; + if (fbo != 0) { + bitSize = decodeBitWidth(fbo); + } else { + bitSize = 0; + } - // extract the run length - runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; - runLength |= readByte(); - ++runLength; // account for first value - runRead = 0; + // extract the run length + runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; + runLength |= readByte(); + ++runLength; // account for first value + runRead = 0; - int64_t 
prevValue; - // read the first value stored as vint - if (isSigned) { - prevValue = readVslong(); - } else { - prevValue = static_cast<int64_t>(readVulong()); - } + int64_t prevValue; + // read the first value stored as vint + if (isSigned) { + prevValue = readVslong(); + } else { + prevValue = static_cast<int64_t>(readVulong()); + } - literals[0] = prevValue; + literals[0] = prevValue; - // read the fixed delta value stored as vint (deltas can be negative even - // if all number are positive) - int64_t deltaBase = readVslong(); + // read the fixed delta value stored as vint (deltas can be negative even + // if all number are positive) + int64_t deltaBase = readVslong(); - if (bitSize == 0) { - // add fixed deltas to adjacent values - for (uint64_t i = 1; i < runLength; ++i) { - literals[i] = literals[i - 1] + deltaBase; - } - } else { - prevValue = literals[1] = prevValue + deltaBase; - if (runLength < 2) { - std::stringstream ss; - ss << "Illegal run length for delta encoding: " << runLength; - throw ParseError(ss.str()); - } - // write the unpacked values, add it to previous value and store final - // value to result buffer. if the delta base value is negative then it - // is a decreasing sequence else an increasing sequence. - // read deltas using the literals buffer. 
- readLongs(literals.data(), 2, runLength - 2, bitSize); - if (deltaBase < 0) { - for (uint64_t i = 2; i < runLength; ++i) { - prevValue = literals[i] = prevValue - literals[i]; + if (bitSize == 0) { + // add fixed deltas to adjacent values + for (uint64_t i = 1; i < runLength; ++i) { + literals[i] = literals[i - 1] + deltaBase; } } else { - for (uint64_t i = 2; i < runLength; ++i) { - prevValue = literals[i] = prevValue + literals[i]; + prevValue = literals[1] = prevValue + deltaBase; + if (runLength < 2) { + std::stringstream ss; + ss << "Illegal run length for delta encoding: " << runLength; + throw ParseError(ss.str()); + } + // write the unpacked values, add it to previous value and store final + // value to result buffer. if the delta base value is negative then it + // is a decreasing sequence else an increasing sequence. + // read deltas using the literals buffer. + readLongs(literals.data(), 2, runLength - 2, bitSize); + if (deltaBase < 0) { + for (uint64_t i = 2; i < runLength; ++i) { + prevValue = literals[i] = prevValue - literals[i]; + } + } else { + for (uint64_t i = 2; i < runLength; ++i) { + prevValue = literals[i] = prevValue + literals[i]; + } } } } - } - return copyDataFromBuffer(data, offset, numValues, notNull); -} + return copyDataFromBuffer(data, offset, numValues, notNull); + } -uint64_t RleDecoderV2::copyDataFromBuffer(int64_t* data, uint64_t offset, - uint64_t numValues, const char* notNull) { - uint64_t nRead = std::min(runLength - runRead, numValues); - if (notNull) { - for (uint64_t i = offset; i < (offset + nRead); ++i) { - if (notNull[i]) { - data[i] = literals[runRead++]; + template <typename T> + uint64_t RleDecoderV2::copyDataFromBuffer(T* data, uint64_t offset, uint64_t numValues, + const char* notNull) { + uint64_t nRead = std::min(runLength - runRead, numValues); + if (notNull) { + for (uint64_t i = offset; i < (offset + nRead); ++i) { + if (notNull[i]) { + data[i] = static_cast<T>(literals[runRead++]); + } + } + } else { + for 
(uint64_t i = offset; i < (offset + nRead); ++i) { + data[i] = static_cast<T>(literals[runRead++]); } } - } else { - memcpy(data + offset, literals.data() + runRead, nRead * sizeof(int64_t)); - runRead += nRead; + return nRead; } - return nRead; -} } // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc index 4e7a145a5a..a75aeac2eb 100644 --- a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc +++ b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc @@ -1,133 +1,135 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file - * distributed with option work for additional information - * regarding copyright ownership. The ASF licenses option file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the - * "License"); you may not use option file except in compliance + * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
*/ #include "Adaptor.hh" #include "Compression.hh" -#include "RLEv2.hh" #include "RLEV2Util.hh" +#include "RLEv2.hh" #define MAX_SHORT_REPEAT_LENGTH 10 namespace orc { -/** - * Compute the bits required to represent pth percentile value - * @param data - array - * @param p - percentile value (>=0.0 to <=1.0) - * @return pth percentile bits - */ -uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) { + /** + * Compute the bits required to represent pth percentile value + * @param data - array + * @param p - percentile value (>=0.0 to <=1.0) + * @return pth percentile bits + */ + uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, + bool reuseHist) { if ((p > 1.0) || (p <= 0.0)) { - throw InvalidArgument("Invalid p value: " + to_string(p)); + throw InvalidArgument("Invalid p value: " + to_string(p)); } if (!reuseHist) { - // histogram that store the encoded bit requirement for each values. - // maximum number of bits that can encoded is 32 (refer FixedBitSizes) - memset(histgram, 0, FixedBitSizes::SIZE * sizeof(int32_t)); - // compute the histogram - for(size_t i = offset; i < (offset + length); i++) { - uint32_t idx = encodeBitWidth(findClosestNumBits(data[i])); - histgram[idx] += 1; - } + // histogram that store the encoded bit requirement for each values. 
+ // maximum number of bits that can encoded is 32 (refer FixedBitSizes) + memset(histgram, 0, FixedBitSizes::SIZE * sizeof(int32_t)); + // compute the histogram + for (size_t i = offset; i < (offset + length); i++) { + uint32_t idx = encodeBitWidth(findClosestNumBits(data[i])); + histgram[idx] += 1; + } } int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * (1.0 - p)); // return the bits required by pth percentile length - for(int32_t i = HIST_LEN - 1; i >= 0; i--) { - perLen -= histgram[i]; - if (perLen < 0) { - return decodeBitWidth(static_cast<uint32_t>(i)); - } + for (int32_t i = HIST_LEN - 1; i >= 0; i--) { + perLen -= histgram[i]; + if (perLen < 0) { + return decodeBitWidth(static_cast<uint32_t>(i)); + } } return 0; -} + } -RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, - bool hasSigned, bool alignBitPacking) : - RleEncoder(std::move(outStream), hasSigned), + RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, + bool alignBitPacking) + : RleEncoder(std::move(outStream), hasSigned), alignedBitPacking(alignBitPacking), - prevDelta(0){ + prevDelta(0) { literals = new int64_t[MAX_LITERAL_SIZE]; gapVsPatchList = new int64_t[MAX_LITERAL_SIZE]; zigzagLiterals = hasSigned ? 
new int64_t[MAX_LITERAL_SIZE] : nullptr; baseRedLiterals = new int64_t[MAX_LITERAL_SIZE]; adjDeltas = new int64_t[MAX_LITERAL_SIZE]; -} + } -void RleEncoderV2::write(int64_t val) { - if(numLiterals == 0) { - initializeLiterals(val); - return; + void RleEncoderV2::write(int64_t val) { + if (numLiterals == 0) { + initializeLiterals(val); + return; } - if(numLiterals == 1) { - prevDelta = val - literals[0]; - literals[numLiterals++] = val; + if (numLiterals == 1) { + prevDelta = val - literals[0]; + literals[numLiterals++] = val; - if(val == literals[0]) { - fixedRunLength = 2; - variableRunLength = 0; - } else { - fixedRunLength = 0; - variableRunLength = 2; - } - return; + if (val == literals[0]) { + fixedRunLength = 2; + variableRunLength = 0; + } else { + fixedRunLength = 0; + variableRunLength = 2; + } + return; } int64_t currentDelta = val - literals[numLiterals - 1]; EncodingOption option = {}; if (prevDelta == 0 && currentDelta == 0) { - // case 1: fixed delta run - literals[numLiterals++] = val; - - if (variableRunLength > 0) { - // if variable run is non-zero then we are seeing repeating - // values at the end of variable run in which case fixed Run - // length is 2 - fixedRunLength = 2; - } - fixedRunLength++; - - // if fixed run met the minimum condition and if variable - // run is non-zero then flush the variable run and shift the - // tail fixed runs to start of the buffer - if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) { - numLiterals -= MIN_REPEAT; - variableRunLength -= (MIN_REPEAT - 1); - - determineEncoding(option); - writeValues(option); - - // shift tail fixed runs to beginning of the buffer - for (size_t i = 0; i < MIN_REPEAT; ++i) { - literals[i] = val; - } - numLiterals = MIN_REPEAT; - } + // case 1: fixed delta run + literals[numLiterals++] = val; + + if (variableRunLength > 0) { + // if variable run is non-zero then we are seeing repeating + // values at the end of variable run in which case fixed Run + // length is 2 + 
fixedRunLength = 2; + } + fixedRunLength++; + + // if fixed run met the minimum condition and if variable + // run is non-zero then flush the variable run and shift the + // tail fixed runs to start of the buffer + if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) { + numLiterals -= MIN_REPEAT; + variableRunLength -= (MIN_REPEAT - 1); + + determineEncoding(option); + writeValues(option); - if (fixedRunLength == MAX_LITERAL_SIZE) { - option.encoding = DELTA; - option.isFixedDelta = true; - writeValues(option); + // shift tail fixed runs to beginning of the buffer + for (size_t i = 0; i < MIN_REPEAT; ++i) { + literals[i] = val; } - return; + numLiterals = MIN_REPEAT; + } + + if (fixedRunLength == MAX_LITERAL_SIZE) { + option.encoding = DELTA; + option.isFixedDelta = true; + writeValues(option); + } + return; } // case 2: variable delta run @@ -136,45 +138,45 @@ void RleEncoderV2::write(int64_t val) { // short repeat conditions then write the values as short repeats // else use delta encoding if (fixedRunLength >= MIN_REPEAT) { - if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { - option.encoding = SHORT_REPEAT; - } else { - option.encoding = DELTA; - option.isFixedDelta = true; - } - writeValues(option); + if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { + option.encoding = SHORT_REPEAT; + } else { + option.encoding = DELTA; + option.isFixedDelta = true; + } + writeValues(option); } // if fixed run length is <MIN_REPEAT and current value is // different from previous then treat it as variable run if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) { - variableRunLength = fixedRunLength; - fixedRunLength = 0; + variableRunLength = fixedRunLength; + fixedRunLength = 0; } // after writing values re-initialize the variables if (numLiterals == 0) { - initializeLiterals(val); + initializeLiterals(val); } else { - prevDelta = val - literals[numLiterals - 1]; - literals[numLiterals++] = val; - variableRunLength++; + prevDelta = 
val - literals[numLiterals - 1]; + literals[numLiterals++] = val; + variableRunLength++; - if (variableRunLength == MAX_LITERAL_SIZE) { - determineEncoding(option); - writeValues(option); - } + if (variableRunLength == MAX_LITERAL_SIZE) { + determineEncoding(option); + writeValues(option); + } } -} + } -void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) { - assert (isSigned); + void RleEncoderV2::computeZigZagLiterals(EncodingOption& option) { + assert(isSigned); for (size_t i = 0; i < numLiterals; i++) { - zigzagLiterals[option.zigzagLiteralsCount++] = zigZag(literals[i]); + zigzagLiterals[option.zigzagLiteralsCount++] = zigZag(literals[i]); } -} + } -void RleEncoderV2::preparePatchedBlob(EncodingOption& option) { + void RleEncoderV2::preparePatchedBlob(EncodingOption& option) { // mask will be max value beyond which patch will be generated int64_t mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1; @@ -190,9 +192,9 @@ void RleEncoderV2::preparePatchedBlob(EncodingOption& option) { // gap and patch together in a long. 
To make sure gap and patch can be // packed together adjust the patch width if (option.patchWidth == 64) { - option.patchWidth = 56; - option.brBits95p = 8; - mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1; + option.patchWidth = 56; + option.brBits95p = 8; + mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1; } uint32_t gapIdx = 0; @@ -203,27 +205,27 @@ void RleEncoderV2::preparePatchedBlob(EncodingOption& option) { std::vector<int64_t> gapList; std::vector<int64_t> patchList; - for(size_t i = 0; i < numLiterals; i++) { - // if value is above mask then create the patch and record the gap - if (baseRedLiterals[i] > mask) { - size_t gap = i - prev; - if (gap > maxGap) { - maxGap = gap; - } - - // gaps are relative, so store the previous patched value index - prev = i; - gapList.push_back(static_cast<int64_t>(gap)); - gapIdx++; - - // extract the most significant bits that are over mask bits - int64_t patch = baseRedLiterals[i] >> option.brBits95p; - patchList.push_back(patch); - patchIdx++; - - // strip off the MSB to enable safe bit packing - baseRedLiterals[i] &= mask; + for (size_t i = 0; i < numLiterals; i++) { + // if value is above mask then create the patch and record the gap + if (baseRedLiterals[i] > mask) { + size_t gap = i - prev; + if (gap > maxGap) { + maxGap = gap; } + + // gaps are relative, so store the previous patched value index + prev = i; + gapList.push_back(static_cast<int64_t>(gap)); + gapIdx++; + + // extract the most significant bits that are over mask bits + int64_t patch = baseRedLiterals[i] >> option.brBits95p; + patchList.push_back(patch); + patchIdx++; + + // strip off the MSB to enable safe bit packing + baseRedLiterals[i] &= mask; + } } // adjust the patch length to number of entries in gap list @@ -232,9 +234,9 @@ void RleEncoderV2::preparePatchedBlob(EncodingOption& option) { // if the element to be patched is the first and only element then // max gap will be 0, but to store 
the gap as 0 we need atleast 1 bit if (maxGap == 0 && option.patchLength != 0) { - option.patchGapWidth = 1; + option.patchGapWidth = 1; } else { - option.patchGapWidth = findClosestNumBits(static_cast<int64_t>(maxGap)); + option.patchGapWidth = findClosestNumBits(static_cast<int64_t>(maxGap)); } // special case: if the patch gap width is greater than 256, then @@ -250,58 +252,58 @@ void RleEncoderV2::preparePatchedBlob(EncodingOption& option) { // 255 gap width => 0 for patch value // 1 gap width => actual patch value if (option.patchGapWidth > 8) { - option.patchGapWidth = 8; - // for gap = 511, we need two additional entries in patch list - if (maxGap == 511) { - option.patchLength += 2; - } else { - option.patchLength += 1; - } + option.patchGapWidth = 8; + // for gap = 511, we need two additional entries in patch list + if (maxGap == 511) { + option.patchLength += 2; + } else { + option.patchLength += 1; + } } // create gap vs patch list gapIdx = 0; patchIdx = 0; - for(size_t i = 0; i < option.patchLength; i++) { - int64_t g = gapList[gapIdx++]; - int64_t p = patchList[patchIdx++]; - while (g > 255) { - gapVsPatchList[option.gapVsPatchListCount++] = (255L << option.patchWidth); - i++; - g -= 255; - } + for (size_t i = 0; i < option.patchLength; i++) { + int64_t g = gapList[gapIdx++]; + int64_t p = patchList[patchIdx++]; + while (g > 255) { + gapVsPatchList[option.gapVsPatchListCount++] = (255L << option.patchWidth); + i++; + g -= 255; + } - // store patch value in LSBs and gap in MSBs - gapVsPatchList[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p); + // store patch value in LSBs and gap in MSBs + gapVsPatchList[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p); } -} + } -/** - * Prepare for Direct or PatchedBase encoding - * compute zigZagLiterals and zzBits100p (Max number of encoding bits required) - * @return zigzagLiterals - */ -int64_t* RleEncoderV2::prepareForDirectOrPatchedBase(EncodingOption& option) { + /** + * Prepare for 
Direct or PatchedBase encoding + * compute zigZagLiterals and zzBits100p (Max number of encoding bits required) + * @return zigzagLiterals + */ + int64_t* RleEncoderV2::prepareForDirectOrPatchedBase(EncodingOption& option) { if (isSigned) { - computeZigZagLiterals(option); + computeZigZagLiterals(option); } int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals; option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0); return currentZigzagLiterals; -} + } -void RleEncoderV2::determineEncoding(EncodingOption& option) { + void RleEncoderV2::determineEncoding(EncodingOption& option) { // We need to compute zigzag values for DIRECT and PATCHED_BASE encodings, // but not for SHORT_REPEAT or DELTA. So we only perform the zigzag // computation when it's determined to be necessary. // not a big win for shorter runs to determine encoding if (numLiterals <= MIN_REPEAT) { - // we need to compute zigzag values for DIRECT encoding if we decide to - // break early for delta overflows or for shorter runs - prepareForDirectOrPatchedBase(option); - option.encoding = DIRECT; - return; + // we need to compute zigzag values for DIRECT encoding if we decide to + // break early for delta overflows or for shorter runs + prepareForDirectOrPatchedBase(option); + option.encoding = DIRECT; + return; } // DELTA encoding check @@ -319,29 +321,29 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) { adjDeltas[option.adjDeltasCount++] = initialDelta; for (size_t i = 1; i < numLiterals; i++) { - const int64_t l1 = literals[i]; - const int64_t l0 = literals[i - 1]; - currDelta = l1 - l0; - option.min = std::min(option.min, l1); - max = std::max(max, l1); - - isIncreasing &= (l0 <= l1); - isDecreasing &= (l0 >= l1); - - option.isFixedDelta &= (currDelta == initialDelta); - if (i > 1) { - adjDeltas[option.adjDeltasCount++] = std::abs(currDelta); - deltaMax = std::max(deltaMax, adjDeltas[i - 1]); - } + const int64_t l1 = literals[i]; + const int64_t l0 
= literals[i - 1]; + currDelta = l1 - l0; + option.min = std::min(option.min, l1); + max = std::max(max, l1); + + isIncreasing &= (l0 <= l1); + isDecreasing &= (l0 >= l1); + + option.isFixedDelta &= (currDelta == initialDelta); + if (i > 1) { + adjDeltas[option.adjDeltasCount++] = std::abs(currDelta); + deltaMax = std::max(deltaMax, adjDeltas[i - 1]); + } } // it's faster to exit under delta overflow condition without checking for // PATCHED_BASE condition as encoding using DIRECT is faster and has less // overhead than PATCHED_BASE if (!isSafeSubtract(max, option.min)) { - prepareForDirectOrPatchedBase(option); - option.encoding = DIRECT; - return; + prepareForDirectOrPatchedBase(option); + option.encoding = DIRECT; + return; } // invariant - subtracting any number from any other in the literals after @@ -350,42 +352,42 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) { // if min is equal to max then the delta is 0, option condition happens for // fixed values run >10 which cannot be encoded with SHORT_REPEAT if (option.min == max) { - if (!option.isFixedDelta) { - throw InvalidArgument(to_string(option.min) + "==" + - to_string(max) + ", isFixedDelta cannot be false"); - } + if (!option.isFixedDelta) { + throw InvalidArgument(to_string(option.min) + "==" + to_string(max) + + ", isFixedDelta cannot be false"); + } - if(currDelta != 0) { - throw InvalidArgument(to_string(option.min) + "==" + - to_string(max) + ", currDelta should be zero"); - } - option.fixedDelta = 0; - option.encoding = DELTA; - return; + if (currDelta != 0) { + throw InvalidArgument(to_string(option.min) + "==" + to_string(max) + + ", currDelta should be zero"); + } + option.fixedDelta = 0; + option.encoding = DELTA; + return; } if (option.isFixedDelta) { - if (currDelta != initialDelta) { - throw InvalidArgument("currDelta should be equal to initialDelta for fixed delta encoding"); - } + if (currDelta != initialDelta) { + throw InvalidArgument("currDelta should be equal to 
initialDelta for fixed delta encoding"); + } - option.encoding = DELTA; - option.fixedDelta = currDelta; - return; + option.encoding = DELTA; + option.fixedDelta = currDelta; + return; } // if initialDelta is 0 then we cannot delta encode as we cannot identify // the sign of deltas (increasing or decreasing) if (initialDelta != 0) { - // stores the number of bits required for packing delta blob in - // delta encoding - option.bitsDeltaMax = findClosestNumBits(deltaMax); - - // monotonic condition - if (isIncreasing || isDecreasing) { - option.encoding = DELTA; - return; - } + // stores the number of bits required for packing delta blob in + // delta encoding + option.bitsDeltaMax = findClosestNumBits(deltaMax); + + // monotonic condition + if (isIncreasing || isDecreasing) { + option.encoding = DELTA; + return; + } } // PATCHED_BASE encoding check @@ -402,106 +404,105 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) { // if the difference between 90th percentile and 100th percentile fixed // bits is > 1 then we need patch the values if (diffBitsLH > 1) { + // patching is done only on base reduced values. + // remove base from literals + for (size_t i = 0; i < numLiterals; i++) { + baseRedLiterals[option.baseRedLiteralsCount++] = (literals[i] - option.min); + } - // patching is done only on base reduced values. 
- // remove base from literals - for (size_t i = 0; i < numLiterals; i++) { - baseRedLiterals[option.baseRedLiteralsCount++] = (literals[i] - option.min); - } - - // 95th percentile width is used to determine max allowed value - // after which patching will be done - option.brBits95p = percentileBits(baseRedLiterals, 0, numLiterals, 0.95); - - // 100th percentile is used to compute the max patch width - option.brBits100p = percentileBits(baseRedLiterals, 0, numLiterals, 1.0, true); - - // after base reducing the values, if the difference in bits between - // 95th percentile and 100th percentile value is zero then there - // is no point in patching the values, in which case we will - // fallback to DIRECT encoding. - // The decision to use patched base was based on zigzag values, but the - // actual patching is done on base reduced literals. - if ((option.brBits100p - option.brBits95p) != 0) { - option.encoding = PATCHED_BASE; - preparePatchedBlob(option); - return; - } else { - option.encoding = DIRECT; - return; - } - } else { - // if difference in bits between 95th percentile and 100th percentile is - // 0, then patch length will become 0. Hence we will fallback to direct + // 95th percentile width is used to determine max allowed value + // after which patching will be done + option.brBits95p = percentileBits(baseRedLiterals, 0, numLiterals, 0.95); + + // 100th percentile is used to compute the max patch width + option.brBits100p = percentileBits(baseRedLiterals, 0, numLiterals, 1.0, true); + + // after base reducing the values, if the difference in bits between + // 95th percentile and 100th percentile value is zero then there + // is no point in patching the values, in which case we will + // fallback to DIRECT encoding. + // The decision to use patched base was based on zigzag values, but the + // actual patching is done on base reduced literals. 
+ if ((option.brBits100p - option.brBits95p) != 0) { + option.encoding = PATCHED_BASE; + preparePatchedBlob(option); + return; + } else { option.encoding = DIRECT; return; + } + } else { + // if difference in bits between 95th percentile and 100th percentile is + // 0, then patch length will become 0. Hence we will fallback to direct + option.encoding = DIRECT; + return; } -} + } -uint64_t RleEncoderV2::flush() { + uint64_t RleEncoderV2::flush() { if (numLiterals != 0) { - EncodingOption option = {}; - if (variableRunLength != 0) { - determineEncoding(option); - writeValues(option); - } else if (fixedRunLength != 0) { - if (fixedRunLength < MIN_REPEAT) { - variableRunLength = fixedRunLength; - fixedRunLength = 0; - determineEncoding(option); - writeValues(option); - } else if (fixedRunLength >= MIN_REPEAT - && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { - option.encoding = SHORT_REPEAT; - writeValues(option); - } else { - option.encoding = DELTA; - option.isFixedDelta = true; - writeValues(option); - } + EncodingOption option = {}; + if (variableRunLength != 0) { + determineEncoding(option); + writeValues(option); + } else if (fixedRunLength != 0) { + if (fixedRunLength < MIN_REPEAT) { + variableRunLength = fixedRunLength; + fixedRunLength = 0; + determineEncoding(option); + writeValues(option); + } else if (fixedRunLength >= MIN_REPEAT && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { + option.encoding = SHORT_REPEAT; + writeValues(option); + } else { + option.encoding = DELTA; + option.isFixedDelta = true; + writeValues(option); } + } } outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition)); uint64_t dataSize = outputStream->flush(); bufferLength = bufferPosition = 0; return dataSize; -} + } -void RleEncoderV2::writeValues(EncodingOption& option) { + void RleEncoderV2::writeValues(EncodingOption& option) { if (numLiterals != 0) { - switch (option.encoding) { - case SHORT_REPEAT: - writeShortRepeatValues(option); - break; - case DIRECT: - 
writeDirectValues(option); - break; - case PATCHED_BASE: - writePatchedBasedValues(option); - break; - case DELTA: - writeDeltaValues(option); - break; - default: - throw NotImplementedYet("Not implemented yet"); - } + switch (option.encoding) { + case SHORT_REPEAT: + writeShortRepeatValues(option); + break; + case DIRECT: + writeDirectValues(option); + break; + case PATCHED_BASE: + writePatchedBasedValues(option); + break; + case DELTA: + writeDeltaValues(option); + break; + default: + throw NotImplementedYet("Not implemented yet"); + } - numLiterals = 0; - prevDelta = 0; + numLiterals = 0; + prevDelta = 0; } -} + } -void RleEncoderV2::writeShortRepeatValues(EncodingOption&) { + void RleEncoderV2::writeShortRepeatValues(EncodingOption&) { int64_t repeatVal; if (isSigned) { - repeatVal = zigZag(literals[0]); + repeatVal = zigZag(literals[0]); } else { - repeatVal = literals[0]; + repeatVal = literals[0]; } const uint32_t numBitsRepeatVal = findClosestNumBits(repeatVal); - const uint32_t numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? (numBitsRepeatVal >> 3) : ((numBitsRepeatVal >> 3) + 1); + const uint32_t numBytesRepeatVal = + numBitsRepeatVal % 8 == 0 ? 
(numBitsRepeatVal >> 3) : ((numBitsRepeatVal >> 3) + 1); uint32_t header = getOpCode(SHORT_REPEAT); @@ -511,19 +512,19 @@ void RleEncoderV2::writeShortRepeatValues(EncodingOption&) { writeByte(static_cast<char>(header)); - for(int32_t i = static_cast<int32_t>(numBytesRepeatVal - 1); i >= 0; i--) { - int64_t b = ((repeatVal >> (i * 8)) & 0xff); - writeByte(static_cast<char>(b)); + for (int32_t i = static_cast<int32_t>(numBytesRepeatVal - 1); i >= 0; i--) { + int64_t b = ((repeatVal >> (i * 8)) & 0xff); + writeByte(static_cast<char>(b)); } fixedRunLength = 0; -} + } -void RleEncoderV2::writeDirectValues(EncodingOption& option) { + void RleEncoderV2::writeDirectValues(EncodingOption& option) { // write the number of fixed bits required in next 5 bits uint32_t fb = option.zzBits100p; if (alignedBitPacking) { - fb = getClosestAlignedFixedBits(fb); + fb = getClosestAlignedFixedBits(fb); } const uint32_t efb = encodeBitWidth(fb) << 1; @@ -550,9 +551,9 @@ void RleEncoderV2::writeDirectValues(EncodingOption& option) { // reset run length variableRunLength = 0; -} + } -void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) { + void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) { // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding // because patch is applied to MSB bits. 
For example: If fixed bit width of // base value is 7 bits and if patch is 3 bits, the actual value is @@ -578,7 +579,7 @@ void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) { // if the min value is negative toggle the sign const bool isNegative = (option.min < 0); if (isNegative) { - option.min = -option.min; + option.min = -option.min; } // find the number of bytes required for base and shift it by 5 bits @@ -590,7 +591,7 @@ void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) { // if the base value is negative then set MSB to 1 if (isNegative) { - option.min |= (1LL << ((baseBytes * 8) - 1)); + option.min |= (1LL << ((baseBytes * 8) - 1)); } // third byte contains 3 bits for number of bytes occupied by base @@ -599,7 +600,8 @@ void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) { // fourth byte contains 3 bits for page gap width and 5 bits for // patch length - const char headerFourthByte = static_cast<char>((option.patchGapWidth - 1) << 5 | option.patchLength); + const char headerFourthByte = + static_cast<char>((option.patchGapWidth - 1) << 5 | option.patchLength); // write header writeByte(headerFirstByte); @@ -608,9 +610,9 @@ void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) { writeByte(headerFourthByte); // write the base value using fixed bytes in big endian order - for(int32_t i = static_cast<int32_t>(baseBytes - 1); i >= 0; i--) { - char b = static_cast<char>(((option.min >> (i * 8)) & 0xff)); - writeByte(b); + for (int32_t i = static_cast<int32_t>(baseBytes - 1); i >= 0; i--) { + char b = static_cast<char>(((option.min >> (i * 8)) & 0xff)); + writeByte(b); } // base reduced literals are bit packed @@ -625,39 +627,39 @@ void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) { // reset run length variableRunLength = 0; -} + } -void RleEncoderV2::writeDeltaValues(EncodingOption& option) { + void RleEncoderV2::writeDeltaValues(EncodingOption& option) { uint32_t len = 0; uint32_t fb = 
option.bitsDeltaMax; uint32_t efb = 0; if (alignedBitPacking) { - fb = getClosestAlignedFixedBits(fb); + fb = getClosestAlignedFixedBits(fb); } if (option.isFixedDelta) { - // if fixed run length is greater than threshold then it will be fixed - // delta sequence with delta value 0 else fixed delta sequence with - // non-zero delta value - if (fixedRunLength > MIN_REPEAT) { - // ex. sequence: 2 2 2 2 2 2 2 2 - len = fixedRunLength - 1; - fixedRunLength = 0; - } else { - // ex. sequence: 4 6 8 10 12 14 16 - len = variableRunLength - 1; - variableRunLength = 0; - } - } else { - // fixed width 0 is used for long repeating values. - // sequences that require only 1 bit to encode will have an additional bit - if (fb == 1) { - fb = 2; - } - efb = encodeBitWidth(fb) << 1; + // if fixed run length is greater than threshold then it will be fixed + // delta sequence with delta value 0 else fixed delta sequence with + // non-zero delta value + if (fixedRunLength > MIN_REPEAT) { + // ex. sequence: 2 2 2 2 2 2 2 2 + len = fixedRunLength - 1; + fixedRunLength = 0; + } else { + // ex. sequence: 4 6 8 10 12 14 16 len = variableRunLength - 1; variableRunLength = 0; + } + } else { + // fixed width 0 is used for long repeating values. 
+ // sequences that require only 1 bit to encode will have an additional bit + if (fb == 1) { + fb = 2; + } + efb = encodeBitWidth(fb) << 1; + len = variableRunLength - 1; + variableRunLength = 0; } // extract the 9th bit of run length @@ -675,106 +677,106 @@ void RleEncoderV2::writeDeltaValues(EncodingOption& option) { // store the first value from zigzag literal array if (isSigned) { - writeVslong(literals[0]); + writeVslong(literals[0]); } else { - writeVulong(literals[0]); + writeVulong(literals[0]); } if (option.isFixedDelta) { - // if delta is fixed then we don't need to store delta blob - writeVslong(option.fixedDelta); + // if delta is fixed then we don't need to store delta blob + writeVslong(option.fixedDelta); } else { - // store the first value as delta value using zigzag encoding - writeVslong(adjDeltas[0]); + // store the first value as delta value using zigzag encoding + writeVslong(adjDeltas[0]); - // adjacent delta values are bit packed. The length of adjDeltas array is - // always one less than the number of literals (delta difference for n - // elements is n-1). We have already written one element, write the - // remaining numLiterals - 2 elements here - writeInts(adjDeltas, 1, numLiterals - 2, fb); + // adjacent delta values are bit packed. The length of adjDeltas array is + // always one less than the number of literals (delta difference for n + // elements is n-1). 
We have already written one element, write the + // remaining numLiterals - 2 elements here + writeInts(adjDeltas, 1, numLiterals - 2, fb); } -} + } -void RleEncoderV2::writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize) { - if(input == nullptr || len < 1 || bitSize < 1) { + void RleEncoderV2::writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize) { + if (input == nullptr || len < 1 || bitSize < 1) { return; - } + } - if (getClosestAlignedFixedBits(bitSize) == bitSize) { - uint32_t numBytes; - uint32_t endOffSet = static_cast<uint32_t>(offset + len); - if (bitSize < 8 ) { - char bitMask = static_cast<char>((1 << bitSize) - 1); - uint32_t numHops = 8 / bitSize; - uint32_t remainder = static_cast<uint32_t>(len % numHops); - uint32_t endUnroll = endOffSet - remainder; - for (uint32_t i = offset; i < endUnroll; i+=numHops) { - char toWrite = 0; - for (uint32_t j = 0; j < numHops; ++j) { - toWrite |= static_cast<char>((input[i+j] & bitMask) << (8 - (j + 1) * bitSize)); + if (getClosestAlignedFixedBits(bitSize) == bitSize) { + uint32_t numBytes; + uint32_t endOffSet = static_cast<uint32_t>(offset + len); + if (bitSize < 8) { + char bitMask = static_cast<char>((1 << bitSize) - 1); + uint32_t numHops = 8 / bitSize; + uint32_t remainder = static_cast<uint32_t>(len % numHops); + uint32_t endUnroll = endOffSet - remainder; + for (uint32_t i = offset; i < endUnroll; i += numHops) { + char toWrite = 0; + for (uint32_t j = 0; j < numHops; ++j) { + toWrite |= static_cast<char>((input[i + j] & bitMask) << (8 - (j + 1) * bitSize)); + } + writeByte(toWrite); } - writeByte(toWrite); - } - if (remainder > 0) { - uint32_t startShift = 8 - bitSize; - char toWrite = 0; - for (uint32_t i = endUnroll; i < endOffSet; ++i) { - toWrite |= static_cast<char>((input[i] & bitMask) << startShift); - startShift -= bitSize; + if (remainder > 0) { + uint32_t startShift = 8 - bitSize; + char toWrite = 0; + for (uint32_t i = endUnroll; i < endOffSet; ++i) { + 
toWrite |= static_cast<char>((input[i] & bitMask) << startShift); + startShift -= bitSize; + } + writeByte(toWrite); } - writeByte(toWrite); - } - } else { - numBytes = bitSize / 8; + } else { + numBytes = bitSize / 8; - for (uint32_t i = offset; i < endOffSet; ++i) { - for (uint32_t j = 0; j < numBytes; ++j) { - char toWrite = static_cast<char>((input[i] >> (8 * (numBytes - j - 1))) & 255); - writeByte(toWrite); + for (uint32_t i = offset; i < endOffSet; ++i) { + for (uint32_t j = 0; j < numBytes; ++j) { + char toWrite = static_cast<char>((input[i] >> (8 * (numBytes - j - 1))) & 255); + writeByte(toWrite); + } } } - } - return; - } + return; + } - // write for unaligned bit size - uint32_t bitsLeft = 8; - char current = 0; - for(uint32_t i = offset; i < (offset + len); i++) { - int64_t value = input[i]; - uint32_t bitsToWrite = bitSize; - while (bitsToWrite > bitsLeft) { - // add the bits to the bottom of the current word - current |= static_cast<char>(value >> (bitsToWrite - bitsLeft)); - // subtract out the bits we just added - bitsToWrite -= bitsLeft; - // zero out the bits above bitsToWrite - value &= (static_cast<uint64_t>(1) << bitsToWrite) - 1; - writeByte(current); - current = 0; - bitsLeft = 8; + // write for unaligned bit size + uint32_t bitsLeft = 8; + char current = 0; + for (uint32_t i = offset; i < (offset + len); i++) { + int64_t value = input[i]; + uint32_t bitsToWrite = bitSize; + while (bitsToWrite > bitsLeft) { + // add the bits to the bottom of the current word + current |= static_cast<char>(value >> (bitsToWrite - bitsLeft)); + // subtract out the bits we just added + bitsToWrite -= bitsLeft; + // zero out the bits above bitsToWrite + value &= (static_cast<uint64_t>(1) << bitsToWrite) - 1; + writeByte(current); + current = 0; + bitsLeft = 8; + } + bitsLeft -= bitsToWrite; + current |= static_cast<char>(value << bitsLeft); + if (bitsLeft == 0) { + writeByte(current); + current = 0; + bitsLeft = 8; + } } - bitsLeft -= bitsToWrite; - current |= 
static_cast<char>(value << bitsLeft); - if (bitsLeft == 0) { + + // flush + if (bitsLeft != 8) { writeByte(current); - current = 0; - bitsLeft = 8; } } - // flush - if (bitsLeft != 8) { - writeByte(current); - } -} - -void RleEncoderV2::initializeLiterals(int64_t val) { + void RleEncoderV2::initializeLiterals(int64_t val) { literals[numLiterals++] = val; fixedRunLength = 1; variableRunLength = 1; -} -} + } +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc b/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc new file mode 100644 index 0000000000..b8c4fd4048 --- /dev/null +++ b/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc @@ -0,0 +1,255 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "SchemaEvolution.hh" +#include "orc/Exceptions.hh" + +namespace orc { + + SchemaEvolution::SchemaEvolution(const std::shared_ptr<Type>& _readType, const Type* fileType) + : readType(_readType) { + if (readType) { + buildConversion(readType.get(), fileType); + } else { + for (uint64_t i = 0; i <= fileType->getMaximumColumnId(); ++i) { + safePPDConversionMap.insert(i); + } + } + } + + const Type* SchemaEvolution::getReadType(const Type& fileType) const { + auto ret = readTypeMap.find(fileType.getColumnId()); + return ret == readTypeMap.cend() ? &fileType : ret->second; + } + + inline void invalidConversion(const Type* readType, const Type* fileType) { + throw SchemaEvolutionError("Cannot convert from " + fileType->toString() + " to " + + readType->toString()); + } + + struct EnumClassHash { + template <typename T> + std::size_t operator()(T t) const { + return static_cast<std::size_t>(t); + } + }; + + bool isNumeric(const Type& type) { + auto kind = type.getKind(); + return kind == BOOLEAN || kind == BYTE || kind == SHORT || kind == INT || kind == LONG || + kind == FLOAT || kind == DOUBLE; + } + + bool isStringVariant(const Type& type) { + auto kind = type.getKind(); + return kind == STRING || kind == CHAR || kind == VARCHAR; + } + + bool isDecimal(const Type& type) { + auto kind = type.getKind(); + return kind == DECIMAL; + } + + bool isTimestamp(const Type& type) { + auto kind = type.getKind(); + return kind == TIMESTAMP || kind == TIMESTAMP_INSTANT; + } + + struct ConversionCheckResult { + bool isValid; + bool needConvert; + }; + + ConversionCheckResult checkConversion(const Type& readType, const Type& fileType) { + ConversionCheckResult ret = {false, false}; + if (readType.getKind() == fileType.getKind()) { + ret.isValid = true; + if (fileType.getKind() == CHAR || fileType.getKind() == VARCHAR) { + ret.isValid = readType.getMaximumLength() == fileType.getMaximumLength(); + } else if (fileType.getKind() == DECIMAL) { + ret.needConvert = 
readType.getPrecision() != fileType.getPrecision() || + readType.getScale() != fileType.getScale(); + } + } else { + switch (fileType.getKind()) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + case FLOAT: + case DOUBLE: { + ret.isValid = ret.needConvert = isNumeric(readType) || isStringVariant(readType) || + isDecimal(readType) || isTimestamp(readType); + break; + } + case DECIMAL: { + ret.isValid = ret.needConvert = isNumeric(readType); + break; + } + case STRING: + case CHAR: + case VARCHAR: + case TIMESTAMP: + case TIMESTAMP_INSTANT: + case DATE: + case BINARY: { + // Not support + break; + } + case STRUCT: + case LIST: + case MAP: + case UNION: { + ret.isValid = ret.needConvert = false; + break; + } + default: + break; + } + } + return ret; + } + + void SchemaEvolution::buildConversion(const Type* _readType, const Type* fileType) { + if (fileType == nullptr) { + throw SchemaEvolutionError("File does not have " + _readType->toString()); + } + + auto [valid, convert] = checkConversion(*_readType, *fileType); + if (!valid) { + invalidConversion(_readType, fileType); + } + readTypeMap.emplace(_readType->getColumnId(), convert ? _readType : fileType); + + // check whether PPD conversion is safe + buildSafePPDConversionMap(_readType, fileType); + + for (uint64_t i = 0; i < _readType->getSubtypeCount(); ++i) { + auto subType = _readType->getSubtype(i); + if (subType) { + // null subType means that this is a sub column of map/list type + // and it does not exist in the file. simply skip it. 
+ buildConversion(subType, fileType->getTypeByColumnId(subType->getColumnId())); + } + } + } + + bool SchemaEvolution::needConvert(const Type& fileType) const { + auto _readType = getReadType(fileType); + if (_readType == &fileType) { + return false; + } + // it does not check valid here as verified by buildConversion() + return checkConversion(*_readType, fileType).needConvert; + } + + inline bool isPrimitive(const Type* type) { + auto kind = type->getKind(); + return kind != STRUCT && kind != MAP && kind != LIST && kind != UNION; + } + + void SchemaEvolution::buildSafePPDConversionMap(const Type* _readType, const Type* fileType) { + if (_readType == nullptr || !isPrimitive(_readType) || fileType == nullptr || + !isPrimitive(fileType)) { + return; + } + + bool isSafe = false; + if (_readType == fileType) { + // short cut for same type + isSafe = true; + } else if (_readType->getKind() == DECIMAL && fileType->getKind() == DECIMAL) { + // for decimals alone do equality check to not mess up with precision change + if (fileType->getPrecision() == readType->getPrecision() && + fileType->getScale() == readType->getScale()) { + isSafe = true; + } + } else { + // only integer and string evolutions are safe + // byte -> short -> int -> long + // string <-> char <-> varchar + // NOTE: Float to double evolution is not safe as floats are stored as + // doubles in ORC's internal index, but when doing predicate evaluation + // for queries like "select * from orc_float where f = 74.72" the constant + // on the filter is converted from string -> double so the precisions will + // be different and the comparison will fail. + // Soon, we should convert all sargs that compare equality between floats + // or doubles to range predicates. + // Similarly string -> char and varchar -> char and vice versa is impossible + // as ORC stores char with padded spaces in its internal index. 
+ switch (fileType->getKind()) { + case BYTE: { + if (readType->getKind() == SHORT || readType->getKind() == INT || + readType->getKind() == LONG) { + isSafe = true; + } + break; + } + case SHORT: { + if (readType->getKind() == INT || readType->getKind() == LONG) { + isSafe = true; + } + break; + } + case INT: { + if (readType->getKind() == LONG) { + isSafe = true; + } + break; + } + case STRING: { + if (readType->getKind() == VARCHAR) { + isSafe = true; + } + break; + } + case VARCHAR: { + if (readType->getKind() == STRING) { + isSafe = true; + } + break; + } + case BOOLEAN: + case LONG: + case FLOAT: + case DOUBLE: + case BINARY: + case TIMESTAMP: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DECIMAL: + case DATE: + case CHAR: + case TIMESTAMP_INSTANT: + break; + } + } + + if (isSafe) { + safePPDConversionMap.insert(fileType->getColumnId()); + } + } + + bool SchemaEvolution::isSafePPDConversion(uint64_t columnId) const { + return safePPDConversionMap.find(columnId) != safePPDConversionMap.cend(); + } + +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/SchemaEvolution.hh b/contrib/libs/apache/orc/c++/src/SchemaEvolution.hh new file mode 100644 index 0000000000..ef9020eba4 --- /dev/null +++ b/contrib/libs/apache/orc/c++/src/SchemaEvolution.hh @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_SCHEMA_EVOLUTION_HH +#define ORC_SCHEMA_EVOLUTION_HH + +#include "orc/Type.hh" + +#include <unordered_map> +#include <unordered_set> + +namespace orc { + + /** + * Utility class to compare read type and file type to match their columns + * and check type conversion. + */ + class SchemaEvolution { + public: + SchemaEvolution(const std::shared_ptr<Type>& readType, const Type* fileType); + + // get read type by column id from file type. or return the file type if + // read type is not provided (i.e. no schema evolution requested). + const Type* getReadType(const Type& fileType) const; + + // check if we need to convert file type to read type for primitive type. 
+ bool needConvert(const Type& fileType) const; + + // check if the PPD conversion is safe + bool isSafePPDConversion(uint64_t columnId) const; + + // return selected read type + const Type* getReadType() const { + return readType.get(); + } + + private: + void buildConversion(const Type* readType, const Type* fileType); + void buildSafePPDConversionMap(const Type* readType, const Type* fileType); + + private: + const std::shared_ptr<Type> readType; + std::unordered_map<uint64_t, const Type*> readTypeMap; + std::unordered_set<uint64_t> safePPDConversionMap; + }; + +} // namespace orc + +#endif // ORC_SCHEMA_EVOLUTION_HH diff --git a/contrib/libs/apache/orc/c++/src/Statistics.cc b/contrib/libs/apache/orc/c++/src/Statistics.cc index ccc54c291c..8ed29d0e7c 100644 --- a/contrib/libs/apache/orc/c++/src/Statistics.cc +++ b/contrib/libs/apache/orc/c++/src/Statistics.cc @@ -1,4 +1,4 @@ - /** +/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -#include "orc/Exceptions.hh" -#include "RLE.hh" #include "Statistics.hh" +#include "RLE.hh" +#include "orc/Exceptions.hh" #include "wrap/coded-stream-wrapper.h" @@ -26,23 +26,23 @@ namespace orc { ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, const StatContext& statContext) { - if (s.has_intstatistics()) { + if (s.has_int_statistics()) { return new IntegerColumnStatisticsImpl(s); - } else if (s.has_doublestatistics()) { + } else if (s.has_double_statistics()) { return new DoubleColumnStatisticsImpl(s); - } else if (s.has_collectionstatistics()) { + } else if (s.has_collection_statistics()) { return new CollectionColumnStatisticsImpl(s); - } else if (s.has_stringstatistics()) { + } else if (s.has_string_statistics()) { return new StringColumnStatisticsImpl(s, statContext); - } else if (s.has_bucketstatistics()) { + } else if (s.has_bucket_statistics()) { return new BooleanColumnStatisticsImpl(s, statContext); - } else if (s.has_decimalstatistics()) { + } else if (s.has_decimal_statistics()) { return new DecimalColumnStatisticsImpl(s, statContext); - } else if (s.has_timestampstatistics()) { + } else if (s.has_timestamp_statistics()) { return new TimestampColumnStatisticsImpl(s, statContext); - } else if (s.has_datestatistics()) { + } else if (s.has_date_statistics()) { return new DateColumnStatisticsImpl(s, statContext); - } else if (s.has_binarystatistics()) { + } else if (s.has_binary_statistics()) { return new BinaryColumnStatisticsImpl(s, statContext); } else { return new ColumnStatisticsImpl(s); @@ -51,24 +51,20 @@ namespace orc { StatisticsImpl::StatisticsImpl(const proto::StripeStatistics& stripeStats, const StatContext& statContext) { - for(int i = 0; i < stripeStats.colstats_size(); i++) { - colStats.push_back( - convertColumnStatistics(stripeStats.colstats(i), statContext)); + for (int i = 0; i < stripeStats.col_stats_size(); i++) { + colStats.push_back(convertColumnStatistics(stripeStats.col_stats(i), statContext)); } } - 
StatisticsImpl::StatisticsImpl(const proto::Footer& footer, - const StatContext& statContext) { - for(int i = 0; i < footer.statistics_size(); i++) { - colStats.push_back( - convertColumnStatistics(footer.statistics(i), statContext)); + StatisticsImpl::StatisticsImpl(const proto::Footer& footer, const StatContext& statContext) { + for (int i = 0; i < footer.statistics_size(); i++) { + colStats.push_back(convertColumnStatistics(footer.statistics(i), statContext)); } } StatisticsImpl::~StatisticsImpl() { - for(std::vector<ColumnStatistics*>::iterator ptr = colStats.begin(); - ptr != colStats.end(); - ++ptr) { + for (std::vector<ColumnStatistics*>::iterator ptr = colStats.begin(); ptr != colStats.end(); + ++ptr) { delete *ptr; } } @@ -86,21 +82,19 @@ namespace orc { } StripeStatisticsImpl::StripeStatisticsImpl( - const proto::StripeStatistics& stripeStats, - std::vector<std::vector<proto::ColumnStatistics> >& indexStats, - const StatContext& statContext) { - columnStats.reset(new StatisticsImpl(stripeStats, statContext)); + const proto::StripeStatistics& stripeStats, + std::vector<std::vector<proto::ColumnStatistics> >& indexStats, + const StatContext& statContext) { + columnStats = std::make_unique<StatisticsImpl>(stripeStats, statContext); rowIndexStats.resize(indexStats.size()); - for(size_t i = 0; i < rowIndexStats.size(); i++) { - for(size_t j = 0; j < indexStats[i].size(); j++) { - rowIndexStats[i].push_back( - std::shared_ptr<const ColumnStatistics>( - convertColumnStatistics(indexStats[i][j], statContext))); + for (size_t i = 0; i < rowIndexStats.size(); i++) { + for (size_t j = 0; j < indexStats[i].size(); j++) { + rowIndexStats[i].push_back(std::shared_ptr<const ColumnStatistics>( + convertColumnStatistics(indexStats[i][j], statContext))); } } } - ColumnStatistics::~ColumnStatistics() { // PASS } @@ -185,59 +179,57 @@ namespace orc { // PASS } - ColumnStatisticsImpl::ColumnStatisticsImpl - (const proto::ColumnStatistics& pb) { - 
_stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); + ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& pb) { + _stats.setNumberOfValues(pb.number_of_values()); + _stats.setHasNull(pb.has_null()); } - BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (pb.has_binarystatistics() && statContext.correctStats) { - _stats.setHasTotalLength(pb.binarystatistics().has_sum()); - _stats.setTotalLength( - static_cast<uint64_t>(pb.binarystatistics().sum())); + BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl(const proto::ColumnStatistics& pb, + const StatContext& statContext) { + _stats.setNumberOfValues(pb.number_of_values()); + _stats.setHasNull(pb.has_null()); + if (pb.has_binary_statistics() && statContext.correctStats) { + _stats.setHasTotalLength(pb.binary_statistics().has_sum()); + _stats.setTotalLength(static_cast<uint64_t>(pb.binary_statistics().sum())); } } - BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (pb.has_bucketstatistics() && statContext.correctStats) { + BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl(const proto::ColumnStatistics& pb, + const StatContext& statContext) { + _stats.setNumberOfValues(pb.number_of_values()); + _stats.setHasNull(pb.has_null()); + if (pb.has_bucket_statistics() && statContext.correctStats) { _hasCount = true; - _trueCount = pb.bucketstatistics().count(0); + _trueCount = pb.bucket_statistics().count(0); } else { _hasCount = false; _trueCount = 0; } } - DateColumnStatisticsImpl::DateColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext){ - _stats.setNumberOfValues(pb.numberofvalues()); - 
_stats.setHasNull(pb.hasnull()); - if (!pb.has_datestatistics() || !statContext.correctStats) { + DateColumnStatisticsImpl::DateColumnStatisticsImpl(const proto::ColumnStatistics& pb, + const StatContext& statContext) { + _stats.setNumberOfValues(pb.number_of_values()); + _stats.setHasNull(pb.has_null()); + if (!pb.has_date_statistics() || !statContext.correctStats) { // hasMinimum_ is false by default; // hasMaximum_ is false by default; _stats.setMinimum(0); _stats.setMaximum(0); } else { - _stats.setHasMinimum(pb.datestatistics().has_minimum()); - _stats.setHasMaximum(pb.datestatistics().has_maximum()); - _stats.setMinimum(pb.datestatistics().minimum()); - _stats.setMaximum(pb.datestatistics().maximum()); + _stats.setHasMinimum(pb.date_statistics().has_minimum()); + _stats.setHasMaximum(pb.date_statistics().has_maximum()); + _stats.setMinimum(pb.date_statistics().minimum()); + _stats.setMaximum(pb.date_statistics().maximum()); } } - DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (pb.has_decimalstatistics() && statContext.correctStats) { - const proto::DecimalStatistics& stats = pb.decimalstatistics(); + DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl(const proto::ColumnStatistics& pb, + const StatContext& statContext) { + _stats.setNumberOfValues(pb.number_of_values()); + _stats.setHasNull(pb.has_null()); + if (pb.has_decimal_statistics() && statContext.correctStats) { + const proto::DecimalStatistics& stats = pb.decimal_statistics(); _stats.setHasMinimum(stats.has_minimum()); _stats.setHasMaximum(stats.has_maximum()); _stats.setHasSum(stats.has_sum()); @@ -248,16 +240,15 @@ namespace orc { } } - DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl - (const proto::ColumnStatistics& pb){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if 
(!pb.has_doublestatistics()) { + DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl(const proto::ColumnStatistics& pb) { + _stats.setNumberOfValues(pb.number_of_values()); + _stats.setHasNull(pb.has_null()); + if (!pb.has_double_statistics()) { _stats.setMinimum(0); _stats.setMaximum(0); _stats.setSum(0); - }else{ - const proto::DoubleStatistics& stats = pb.doublestatistics(); + } else { + const proto::DoubleStatistics& stats = pb.double_statistics(); _stats.setHasMinimum(stats.has_minimum()); _stats.setHasMaximum(stats.has_maximum()); _stats.setHasSum(stats.has_sum()); @@ -268,16 +259,15 @@ namespace orc { } } - IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl - (const proto::ColumnStatistics& pb){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (!pb.has_intstatistics()) { + IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl(const proto::ColumnStatistics& pb) { + _stats.setNumberOfValues(pb.number_of_values()); + _stats.setHasNull(pb.has_null()); + if (!pb.has_int_statistics()) { _stats.setMinimum(0); _stats.setMaximum(0); _stats.setSum(0); - }else{ - const proto::IntegerStatistics& stats = pb.intstatistics(); + } else { + const proto::IntegerStatistics& stats = pb.int_statistics(); _stats.setHasMinimum(stats.has_minimum()); _stats.setHasMaximum(stats.has_maximum()); _stats.setHasSum(stats.has_sum()); @@ -288,14 +278,14 @@ namespace orc { } } - StringColumnStatisticsImpl::StringColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (!pb.has_stringstatistics() || !statContext.correctStats) { + StringColumnStatisticsImpl::StringColumnStatisticsImpl(const proto::ColumnStatistics& pb, + const StatContext& statContext) { + _stats.setNumberOfValues(pb.number_of_values()); + _stats.setHasNull(pb.has_null()); + if (!pb.has_string_statistics() || !statContext.correctStats) { 
_stats.setTotalLength(0); - }else{ - const proto::StringStatistics& stats = pb.stringstatistics(); + } else { + const proto::StringStatistics& stats = pb.string_statistics(); _stats.setHasMinimum(stats.has_minimum()); _stats.setHasMaximum(stats.has_maximum()); _stats.setHasTotalLength(stats.has_sum()); @@ -306,46 +296,40 @@ namespace orc { } } - TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext) { - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (!pb.has_timestampstatistics() || !statContext.correctStats) { + TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl(const proto::ColumnStatistics& pb, + const StatContext& statContext) { + _stats.setNumberOfValues(pb.number_of_values()); + _stats.setHasNull(pb.has_null()); + if (!pb.has_timestamp_statistics() || !statContext.correctStats) { _stats.setMinimum(0); _stats.setMaximum(0); _lowerBound = 0; _upperBound = 0; _minimumNanos = DEFAULT_MIN_NANOS; _maximumNanos = DEFAULT_MAX_NANOS; - }else{ - const proto::TimestampStatistics& stats = pb.timestampstatistics(); - _stats.setHasMinimum( - stats.has_minimumutc() || - (stats.has_minimum() && (statContext.writerTimezone != nullptr))); - _stats.setHasMaximum( - stats.has_maximumutc() || - (stats.has_maximum() && (statContext.writerTimezone != nullptr))); - _hasLowerBound = stats.has_minimumutc() || stats.has_minimum(); - _hasUpperBound = stats.has_maximumutc() || stats.has_maximum(); - // to be consistent with java side, non-default minimumnanos and maximumnanos + } else { + const proto::TimestampStatistics& stats = pb.timestamp_statistics(); + _stats.setHasMinimum(stats.has_minimum_utc() || + (stats.has_minimum() && (statContext.writerTimezone != nullptr))); + _stats.setHasMaximum(stats.has_maximum_utc() || + (stats.has_maximum() && (statContext.writerTimezone != nullptr))); + _hasLowerBound = stats.has_minimum_utc() || stats.has_minimum(); + 
_hasUpperBound = stats.has_maximum_utc() || stats.has_maximum(); + // to be consistent with java side, non-default minimum_nanos and maximum_nanos // are added by one in their serialized form. - _minimumNanos = stats.has_minimumnanos() ? - stats.minimumnanos() - 1 : DEFAULT_MIN_NANOS; - _maximumNanos = stats.has_maximumnanos() ? - stats.maximumnanos() - 1 : DEFAULT_MAX_NANOS; + _minimumNanos = stats.has_minimum_nanos() ? stats.minimum_nanos() - 1 : DEFAULT_MIN_NANOS; + _maximumNanos = stats.has_maximum_nanos() ? stats.maximum_nanos() - 1 : DEFAULT_MAX_NANOS; // Timestamp stats are stored in milliseconds - if (stats.has_minimumutc()) { - int64_t minimum = stats.minimumutc(); + if (stats.has_minimum_utc()) { + int64_t minimum = stats.minimum_utc(); _stats.setMinimum(minimum); _lowerBound = minimum; } else if (statContext.writerTimezone) { int64_t writerTimeSec = stats.minimum() / 1000; // multiply the offset by 1000 to convert to millisecond - int64_t minimum = - stats.minimum() + - (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) - * 1000; + int64_t minimum = stats.minimum() + + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000; _stats.setMinimum(minimum); _lowerBound = minimum; } else { @@ -356,94 +340,82 @@ namespace orc { } // Timestamp stats are stored in milliseconds - if (stats.has_maximumutc()) { - int64_t maximum = stats.maximumutc(); + if (stats.has_maximum_utc()) { + int64_t maximum = stats.maximum_utc(); _stats.setMaximum(maximum); _upperBound = maximum; } else if (statContext.writerTimezone) { int64_t writerTimeSec = stats.maximum() / 1000; // multiply the offset by 1000 to convert to millisecond int64_t maximum = stats.maximum() + - (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) - * 1000; + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) * 1000; _stats.setMaximum(maximum); _upperBound = maximum; } else { _stats.setMaximum(0); // add 1 day 1 hour (25 hours) in milliseconds to 
handle unknown // TZ and daylight savings - _upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000); + _upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000); } // Add 1 millisecond to account for microsecond precision of values _upperBound += 1; } } - CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl - (const proto::ColumnStatistics& pb) { - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (!pb.has_collectionstatistics()) { + CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl( + const proto::ColumnStatistics& pb) { + _stats.setNumberOfValues(pb.number_of_values()); + _stats.setHasNull(pb.has_null()); + if (!pb.has_collection_statistics()) { _stats.setMinimum(0); _stats.setMaximum(0); _stats.setSum(0); } else { - const proto::CollectionStatistics& stats = pb.collectionstatistics(); - _stats.setHasMinimum(stats.has_minchildren()); - _stats.setHasMaximum(stats.has_maxchildren()); - _stats.setHasSum(stats.has_totalchildren()); - - _stats.setMinimum(stats.minchildren()); - _stats.setMaximum(stats.maxchildren()); - _stats.setSum(stats.totalchildren()); + const proto::CollectionStatistics& stats = pb.collection_statistics(); + _stats.setHasMinimum(stats.has_min_children()); + _stats.setHasMaximum(stats.has_max_children()); + _stats.setHasSum(stats.has_total_children()); + + _stats.setMinimum(stats.min_children()); + _stats.setMaximum(stats.max_children()); + _stats.setSum(stats.total_children()); } } - std::unique_ptr<MutableColumnStatistics> createColumnStatistics( - const Type& type) { + std::unique_ptr<MutableColumnStatistics> createColumnStatistics(const Type& type) { switch (static_cast<int64_t>(type.getKind())) { case BOOLEAN: - return std::unique_ptr<MutableColumnStatistics>( - new BooleanColumnStatisticsImpl()); + return std::make_unique<BooleanColumnStatisticsImpl>(); case BYTE: case INT: case LONG: case SHORT: - return std::unique_ptr<MutableColumnStatistics>( - new 
IntegerColumnStatisticsImpl()); + return std::make_unique<IntegerColumnStatisticsImpl>(); case MAP: case LIST: - return std::unique_ptr<MutableColumnStatistics>( - new CollectionColumnStatisticsImpl()); + return std::make_unique<CollectionColumnStatisticsImpl>(); case STRUCT: case UNION: - return std::unique_ptr<MutableColumnStatistics>( - new ColumnStatisticsImpl()); + return std::make_unique<ColumnStatisticsImpl>(); case FLOAT: case DOUBLE: - return std::unique_ptr<MutableColumnStatistics>( - new DoubleColumnStatisticsImpl()); + return std::make_unique<DoubleColumnStatisticsImpl>(); case BINARY: - return std::unique_ptr<MutableColumnStatistics>( - new BinaryColumnStatisticsImpl()); + return std::make_unique<BinaryColumnStatisticsImpl>(); case STRING: case CHAR: case VARCHAR: - return std::unique_ptr<MutableColumnStatistics>( - new StringColumnStatisticsImpl()); + return std::make_unique<StringColumnStatisticsImpl>(); case DATE: - return std::unique_ptr<MutableColumnStatistics>( - new DateColumnStatisticsImpl()); + return std::make_unique<DateColumnStatisticsImpl>(); case TIMESTAMP: case TIMESTAMP_INSTANT: - return std::unique_ptr<MutableColumnStatistics>( - new TimestampColumnStatisticsImpl()); + return std::make_unique<TimestampColumnStatisticsImpl>(); case DECIMAL: - return std::unique_ptr<MutableColumnStatistics>( - new DecimalColumnStatisticsImpl()); + return std::make_unique<DecimalColumnStatisticsImpl>(); default: throw NotImplementedYet("Not supported type: " + type.toString()); } } -}// namespace +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/Statistics.hh b/contrib/libs/apache/orc/c++/src/Statistics.hh index 8cb2283f13..b36e431a7f 100644 --- a/contrib/libs/apache/orc/c++/src/Statistics.hh +++ b/contrib/libs/apache/orc/c++/src/Statistics.hh @@ -29,25 +29,25 @@ namespace orc { -/** - * StatContext contains fields required to compute statistics - */ + /** + * StatContext contains fields required to compute statistics + */ struct 
StatContext { const bool correctStats; const Timezone* const writerTimezone; StatContext() : correctStats(false), writerTimezone(nullptr) {} - StatContext(bool cStat, const Timezone* const timezone = nullptr) : - correctStats(cStat), writerTimezone(timezone) {} + StatContext(bool cStat, const Timezone* const timezone = nullptr) + : correctStats(cStat), writerTimezone(timezone) {} }; -/** - * Internal Statistics Implementation - */ + /** + * Internal Statistics Implementation + */ template <typename T> class InternalStatisticsImpl { - private: + private: bool _hasNull; bool _hasMinimum; bool _hasMaximum; @@ -58,7 +58,8 @@ namespace orc { T _minimum; T _maximum; T _sum; - public: + + public: InternalStatisticsImpl() { _hasNull = false; _hasMinimum = false; @@ -72,52 +73,90 @@ namespace orc { ~InternalStatisticsImpl() {} // GET / SET _totalLength - bool hasTotalLength() const { return _hasTotalLength; } + bool hasTotalLength() const { + return _hasTotalLength; + } void setHasTotalLength(bool hasTotalLength) { _hasTotalLength = hasTotalLength; } - uint64_t getTotalLength() const { return _totalLength; } + uint64_t getTotalLength() const { + return _totalLength; + } - void setTotalLength(uint64_t totalLength) { _totalLength = totalLength; } + void setTotalLength(uint64_t totalLength) { + _totalLength = totalLength; + } // GET / SET _sum - bool hasSum() const { return _hasSum; } + bool hasSum() const { + return _hasSum; + } - void setHasSum(bool hasSum) { _hasSum = hasSum; } + void setHasSum(bool hasSum) { + _hasSum = hasSum; + } - T getSum() const { return _sum; } + T getSum() const { + return _sum; + } - void setSum(T sum) { _sum = sum; } + void setSum(T sum) { + _sum = sum; + } // GET / SET _maximum - bool hasMaximum() const { return _hasMaximum; } + bool hasMaximum() const { + return _hasMaximum; + } - const T & getMaximum() const { return _maximum; } + const T& getMaximum() const { + return _maximum; + } - void setHasMaximum(bool hasMax) { _hasMaximum = hasMax; } + 
void setHasMaximum(bool hasMax) { + _hasMaximum = hasMax; + } - void setMaximum(T max) { _maximum = max; } + void setMaximum(T max) { + _maximum = max; + } // GET / SET _minimum - bool hasMinimum() const { return _hasMinimum; } + bool hasMinimum() const { + return _hasMinimum; + } - void setHasMinimum(bool hasMin) { _hasMinimum = hasMin; } + void setHasMinimum(bool hasMin) { + _hasMinimum = hasMin; + } - const T & getMinimum() const { return _minimum; } + const T& getMinimum() const { + return _minimum; + } - void setMinimum(T min) { _minimum = min; } + void setMinimum(T min) { + _minimum = min; + } // GET / SET _valueCount - uint64_t getNumberOfValues() const { return _valueCount; } + uint64_t getNumberOfValues() const { + return _valueCount; + } - void setNumberOfValues(uint64_t numValues) { _valueCount = numValues; } + void setNumberOfValues(uint64_t numValues) { + _valueCount = numValues; + } // GET / SET _hasNullValue - bool hasNull() const { return _hasNull; } + bool hasNull() const { + return _hasNull; + } - void setHasNull(bool hasNull) { _hasNull = hasNull; } + void setHasNull(bool hasNull) { + _hasNull = hasNull; + } void reset() { _hasNull = false; @@ -164,7 +203,7 @@ namespace orc { _hasTotalLength = _hasTotalLength && other._hasTotalLength; _totalLength += other._totalLength; } - }; + }; typedef InternalStatisticsImpl<char> InternalCharStatistics; typedef InternalStatisticsImpl<char> InternalBooleanStatistics; @@ -179,7 +218,7 @@ namespace orc { * Mutable column statistics for use by the writer. 
*/ class MutableColumnStatistics { - public: + public: virtual ~MutableColumnStatistics(); virtual void increase(uint64_t count) = 0; @@ -195,16 +234,18 @@ namespace orc { virtual void toProtoBuf(proto::ColumnStatistics& pbStats) const = 0; }; -/** - * ColumnStatistics Implementation - */ + /** + * ColumnStatistics Implementation + */ - class ColumnStatisticsImpl: public ColumnStatistics, - public MutableColumnStatistics { - private: + class ColumnStatisticsImpl : public ColumnStatistics, public MutableColumnStatistics { + private: InternalCharStatistics _stats; - public: - ColumnStatisticsImpl() { reset(); } + + public: + ColumnStatisticsImpl() { + reset(); + } ColumnStatisticsImpl(const proto::ColumnStatistics& stats); virtual ~ColumnStatisticsImpl() override; @@ -237,25 +278,26 @@ namespace orc { } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); + pbStats.set_has_null(_stats.hasNull()); + pbStats.set_number_of_values(_stats.getNumberOfValues()); } std::string toString() const override { std::ostringstream buffer; buffer << "Column has " << getNumberOfValues() << " values" - << " and has null value: " << (hasNull() ? "yes" : "no") - << std::endl; + << " and has null value: " << (hasNull() ? 
"yes" : "no") << std::endl; return buffer.str(); } }; - class BinaryColumnStatisticsImpl: public BinaryColumnStatistics, - public MutableColumnStatistics { - private: + class BinaryColumnStatisticsImpl : public BinaryColumnStatistics, public MutableColumnStatistics { + private: InternalCharStatistics _stats; - public: - BinaryColumnStatisticsImpl() { reset(); } + + public: + BinaryColumnStatisticsImpl() { + reset(); + } BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext); virtual ~BinaryColumnStatisticsImpl() override; @@ -285,9 +327,9 @@ namespace orc { } uint64_t getTotalLength() const override { - if(hasTotalLength()){ + if (hasTotalLength()) { return _stats.getTotalLength(); - }else{ + } else { throw ParseError("Total length is not defined."); } } @@ -303,7 +345,7 @@ namespace orc { void merge(const MutableColumnStatistics& other) override { const BinaryColumnStatisticsImpl& binStats = - dynamic_cast<const BinaryColumnStatisticsImpl&>(other); + dynamic_cast<const BinaryColumnStatisticsImpl&>(other); _stats.merge(binStats._stats); } @@ -313,10 +355,10 @@ namespace orc { } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); + pbStats.set_has_null(_stats.hasNull()); + pbStats.set_number_of_values(_stats.getNumberOfValues()); - proto::BinaryStatistics* binStats = pbStats.mutable_binarystatistics(); + proto::BinaryStatistics* binStats = pbStats.mutable_binary_statistics(); binStats->set_sum(static_cast<int64_t>(_stats.getTotalLength())); } @@ -325,24 +367,26 @@ namespace orc { buffer << "Data type: Binary" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; - if(hasTotalLength()){ + if (hasTotalLength()) { buffer << "Total length: " << getTotalLength() << std::endl; - }else{ + } else { buffer << "Total length: not defined" << std::endl; } return buffer.str(); } }; - class BooleanColumnStatisticsImpl: public BooleanColumnStatistics, - public MutableColumnStatistics { - private: + class BooleanColumnStatisticsImpl : public BooleanColumnStatistics, + public MutableColumnStatistics { + private: InternalBooleanStatistics _stats; bool _hasCount; uint64_t _trueCount; - public: - BooleanColumnStatisticsImpl() { reset(); } + public: + BooleanColumnStatisticsImpl() { + reset(); + } BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext); virtual ~BooleanColumnStatisticsImpl() override; @@ -373,17 +417,17 @@ namespace orc { } uint64_t getFalseCount() const override { - if(hasCount()){ + if (hasCount()) { return getNumberOfValues() - _trueCount; - }else{ + } else { throw ParseError("False count is not defined."); } } uint64_t getTrueCount() const override { - if(hasCount()){ + if (hasCount()) { return _trueCount; - }else{ + } else { throw ParseError("True count is not defined."); } } @@ -401,7 +445,7 @@ namespace orc { void merge(const MutableColumnStatistics& other) override { const BooleanColumnStatisticsImpl& boolStats = - dynamic_cast<const BooleanColumnStatisticsImpl&>(other); + dynamic_cast<const BooleanColumnStatisticsImpl&>(other); _stats.merge(boolStats._stats); _hasCount = _hasCount && boolStats._hasCount; _trueCount += boolStats._trueCount; @@ -413,10 +457,10 @@ namespace orc { } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); + pbStats.set_has_null(_stats.hasNull()); + pbStats.set_number_of_values(_stats.getNumberOfValues()); - proto::BucketStatistics* bucketStats = pbStats.mutable_bucketstatistics(); + proto::BucketStatistics* 
bucketStats = pbStats.mutable_bucket_statistics(); if (_hasCount) { bucketStats->add_count(_trueCount); } else { @@ -429,9 +473,8 @@ namespace orc { buffer << "Data type: Boolean" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; - if(hasCount()){ - buffer << "(true: " << getTrueCount() << "; false: " - << getFalseCount() << ")" << std::endl; + if (hasCount()) { + buffer << "(true: " << getTrueCount() << "; false: " << getFalseCount() << ")" << std::endl; } else { buffer << "(true: not defined; false: not defined)" << std::endl; buffer << "True and false counts are not defined" << std::endl; @@ -440,14 +483,15 @@ namespace orc { } }; - class DateColumnStatisticsImpl: public DateColumnStatistics, - public MutableColumnStatistics{ - private: + class DateColumnStatisticsImpl : public DateColumnStatistics, public MutableColumnStatistics { + private: InternalDateStatistics _stats; - public: - DateColumnStatisticsImpl() { reset(); } - DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, - const StatContext& statContext); + + public: + DateColumnStatisticsImpl() { + reset(); + } + DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext); virtual ~DateColumnStatisticsImpl() override; bool hasMinimum() const override { @@ -479,17 +523,17 @@ namespace orc { } int32_t getMinimum() const override { - if(hasMinimum()){ + if (hasMinimum()) { return _stats.getMinimum(); - }else{ + } else { throw ParseError("Minimum is not defined."); } } int32_t getMaximum() const override { - if(hasMaximum()){ + if (hasMaximum()) { return _stats.getMaximum(); - }else{ + } else { throw ParseError("Maximum is not defined."); } } @@ -510,7 +554,7 @@ namespace orc { void merge(const MutableColumnStatistics& other) override { const DateColumnStatisticsImpl& dateStats = - dynamic_cast<const DateColumnStatisticsImpl&>(other); + dynamic_cast<const DateColumnStatisticsImpl&>(other); 
_stats.merge(dateStats._stats); } @@ -519,11 +563,10 @@ namespace orc { } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); + pbStats.set_has_null(_stats.hasNull()); + pbStats.set_number_of_values(_stats.getNumberOfValues()); - proto::DateStatistics* dateStatistics = - pbStats.mutable_datestatistics(); + proto::DateStatistics* dateStatistics = pbStats.mutable_date_statistics(); if (_stats.hasMinimum()) { dateStatistics->set_maximum(_stats.getMaximum()); dateStatistics->set_minimum(_stats.getMinimum()); @@ -538,28 +581,30 @@ namespace orc { buffer << "Data type: Date" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; - if(hasMinimum()){ + if (hasMinimum()) { buffer << "Minimum: " << getMinimum() << std::endl; - }else{ + } else { buffer << "Minimum: not defined" << std::endl; } - if(hasMaximum()){ + if (hasMaximum()) { buffer << "Maximum: " << getMaximum() << std::endl; - }else{ + } else { buffer << "Maximum: not defined" << std::endl; } return buffer.str(); } }; - class DecimalColumnStatisticsImpl: public DecimalColumnStatistics, - public MutableColumnStatistics { - private: + class DecimalColumnStatisticsImpl : public DecimalColumnStatistics, + public MutableColumnStatistics { + private: InternalDecimalStatistics _stats; - public: - DecimalColumnStatisticsImpl() { reset(); } + public: + DecimalColumnStatisticsImpl() { + reset(); + } DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext); virtual ~DecimalColumnStatisticsImpl() override; @@ -597,17 +642,17 @@ namespace orc { } Decimal getMinimum() const override { - if(hasMinimum()){ + if (hasMinimum()) { return _stats.getMinimum(); - }else{ + } else { throw ParseError("Minimum is not defined."); } } Decimal getMaximum() const override { - if(hasMaximum()){ + if (hasMaximum()) { return 
_stats.getMaximum(); - }else{ + } else { throw ParseError("Maximum is not defined."); } } @@ -623,9 +668,9 @@ namespace orc { } Decimal getSum() const override { - if(hasSum()){ + if (hasSum()) { return _stats.getSum(); - }else{ + } else { throw ParseError("Sum is not defined."); } } @@ -645,7 +690,7 @@ namespace orc { void merge(const MutableColumnStatistics& other) override { const DecimalColumnStatisticsImpl& decStats = - dynamic_cast<const DecimalColumnStatisticsImpl&>(other); + dynamic_cast<const DecimalColumnStatisticsImpl&>(other); _stats.merge(decStats._stats); @@ -661,10 +706,10 @@ namespace orc { } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); + pbStats.set_has_null(_stats.hasNull()); + pbStats.set_number_of_values(_stats.getNumberOfValues()); - proto::DecimalStatistics* decStats = pbStats.mutable_decimalstatistics(); + proto::DecimalStatistics* decStats = pbStats.mutable_decimal_statistics(); if (_stats.hasMinimum()) { decStats->set_minimum(TString(_stats.getMinimum().toString(true))); decStats->set_maximum(TString(_stats.getMaximum().toString(true))); @@ -684,40 +729,36 @@ namespace orc { buffer << "Data type: Decimal" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; - if(hasMinimum()){ + if (hasMinimum()) { buffer << "Minimum: " << getMinimum().toString() << std::endl; - }else{ + } else { buffer << "Minimum: not defined" << std::endl; } - if(hasMaximum()){ + if (hasMaximum()) { buffer << "Maximum: " << getMaximum().toString() << std::endl; - }else{ + } else { buffer << "Maximum: not defined" << std::endl; } - if(hasSum()){ + if (hasSum()) { buffer << "Sum: " << getSum().toString() << std::endl; - }else{ + } else { buffer << "Sum: not defined" << std::endl; } return buffer.str(); } - private: + private: void updateSum(Decimal value) { if (_stats.hasSum()) { bool overflow = false; Decimal sum = _stats.getSum(); if (sum.scale > value.scale) { - value.value = scaleUpInt128ByPowerOfTen(value.value, - sum.scale - value.scale, - overflow); + value.value = scaleUpInt128ByPowerOfTen(value.value, sum.scale - value.scale, overflow); } else if (sum.scale < value.scale) { - sum.value = scaleUpInt128ByPowerOfTen(sum.value, - value.scale - sum.scale, - overflow); + sum.value = scaleUpInt128ByPowerOfTen(sum.value, value.scale - sum.scale, overflow); sum.scale = value.scale; } @@ -738,12 +779,14 @@ namespace orc { } }; - class DoubleColumnStatisticsImpl: public DoubleColumnStatistics, - public MutableColumnStatistics { - private: + class DoubleColumnStatisticsImpl : public DoubleColumnStatistics, public MutableColumnStatistics { + private: InternalDoubleStatistics _stats; - public: - DoubleColumnStatisticsImpl() { reset(); } + + public: + DoubleColumnStatisticsImpl() { + reset(); + } DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats); virtual ~DoubleColumnStatisticsImpl() override; @@ -780,17 +823,17 @@ namespace orc { } double getMinimum() const override { - if(hasMinimum()){ + if (hasMinimum()) { return _stats.getMinimum(); - }else{ + } else { throw ParseError("Minimum is not defined."); } } double getMaximum() const override { - if(hasMaximum()){ + if (hasMaximum()) { return _stats.getMaximum(); - 
}else{ + } else { throw ParseError("Maximum is not defined."); } } @@ -806,9 +849,9 @@ namespace orc { } double getSum() const override { - if(hasSum()){ + if (hasSum()) { return _stats.getSum(); - }else{ + } else { throw ParseError("Sum is not defined."); } } @@ -825,7 +868,7 @@ namespace orc { void merge(const MutableColumnStatistics& other) override { const DoubleColumnStatisticsImpl& doubleStats = - dynamic_cast<const DoubleColumnStatisticsImpl&>(other); + dynamic_cast<const DoubleColumnStatisticsImpl&>(other); _stats.merge(doubleStats._stats); _stats.setHasSum(_stats.hasSum() && doubleStats.hasSum()); @@ -840,10 +883,10 @@ namespace orc { } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); + pbStats.set_has_null(_stats.hasNull()); + pbStats.set_number_of_values(_stats.getNumberOfValues()); - proto::DoubleStatistics* doubleStats = pbStats.mutable_doublestatistics(); + proto::DoubleStatistics* doubleStats = pbStats.mutable_double_statistics(); if (_stats.hasMinimum()) { doubleStats->set_minimum(_stats.getMinimum()); doubleStats->set_maximum(_stats.getMaximum()); @@ -863,33 +906,36 @@ namespace orc { buffer << "Data type: Double" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; - if(hasMinimum()){ + if (hasMinimum()) { buffer << "Minimum: " << getMinimum() << std::endl; - }else{ + } else { buffer << "Minimum: not defined" << std::endl; } - if(hasMaximum()){ + if (hasMaximum()) { buffer << "Maximum: " << getMaximum() << std::endl; - }else{ + } else { buffer << "Maximum: not defined" << std::endl; } - if(hasSum()){ + if (hasSum()) { buffer << "Sum: " << getSum() << std::endl; - }else{ + } else { buffer << "Sum: not defined" << std::endl; } return buffer.str(); } }; - class IntegerColumnStatisticsImpl: public IntegerColumnStatistics, - public MutableColumnStatistics { - private: + class IntegerColumnStatisticsImpl : public IntegerColumnStatistics, + public MutableColumnStatistics { + private: InternalIntegerStatistics _stats; - public: - IntegerColumnStatisticsImpl() { reset(); } + + public: + IntegerColumnStatisticsImpl() { + reset(); + } IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats); virtual ~IntegerColumnStatisticsImpl() override; @@ -926,17 +972,17 @@ namespace orc { } int64_t getMinimum() const override { - if(hasMinimum()){ + if (hasMinimum()) { return _stats.getMinimum(); - }else{ + } else { throw ParseError("Minimum is not defined."); } } int64_t getMaximum() const override { - if(hasMaximum()){ + if (hasMaximum()) { return _stats.getMaximum(); - }else{ + } else { throw ParseError("Maximum is not defined."); } } @@ -952,9 +998,9 @@ namespace orc { } int64_t getSum() const override { - if(hasSum()){ + if (hasSum()) { return _stats.getSum(); - }else{ + } else { throw ParseError("Sum is not defined."); } } @@ -984,7 +1030,7 @@ namespace orc { void merge(const MutableColumnStatistics& other) override { const IntegerColumnStatisticsImpl& intStats = - dynamic_cast<const IntegerColumnStatisticsImpl&>(other); + dynamic_cast<const IntegerColumnStatisticsImpl&>(other); _stats.merge(intStats._stats); @@ -1005,10 +1051,10 @@ namespace orc { } void toProtoBuf(proto::ColumnStatistics& pbStats) const 
override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); + pbStats.set_has_null(_stats.hasNull()); + pbStats.set_number_of_values(_stats.getNumberOfValues()); - proto::IntegerStatistics* intStats = pbStats.mutable_intstatistics(); + proto::IntegerStatistics* intStats = pbStats.mutable_int_statistics(); if (_stats.hasMinimum()) { intStats->set_minimum(_stats.getMinimum()); intStats->set_maximum(_stats.getMaximum()); @@ -1028,33 +1074,32 @@ namespace orc { buffer << "Data type: Integer" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; - if(hasMinimum()){ + if (hasMinimum()) { buffer << "Minimum: " << getMinimum() << std::endl; - }else{ + } else { buffer << "Minimum: not defined" << std::endl; } - if(hasMaximum()){ + if (hasMaximum()) { buffer << "Maximum: " << getMaximum() << std::endl; - }else{ + } else { buffer << "Maximum: not defined" << std::endl; } - if(hasSum()){ + if (hasSum()) { buffer << "Sum: " << getSum() << std::endl; - }else{ + } else { buffer << "Sum: not defined" << std::endl; } return buffer.str(); } }; - class StringColumnStatisticsImpl: public StringColumnStatistics, - public MutableColumnStatistics{ - private: + class StringColumnStatisticsImpl : public StringColumnStatistics, public MutableColumnStatistics { + private: InternalStringStatistics _stats; - public: + public: StringColumnStatisticsImpl() { reset(); } @@ -1094,18 +1139,18 @@ namespace orc { _stats.setHasNull(hasNull); } - const std::string & getMinimum() const override { - if(hasMinimum()){ + const std::string& getMinimum() const override { + if (hasMinimum()) { return _stats.getMinimum(); - }else{ + } else { throw ParseError("Minimum is not defined."); } } - const std::string & getMaximum() const override { - if(hasMaximum()){ + const std::string& getMaximum() const override { + if (hasMaximum()) { return _stats.getMaximum(); - }else{ + } else { throw 
ParseError("Maximum is not defined."); } } @@ -1121,9 +1166,9 @@ namespace orc { } uint64_t getTotalLength() const override { - if(hasTotalLength()){ + if (hasTotalLength()) { return _stats.getTotalLength(); - }else{ + } else { throw ParseError("Total length is not defined."); } } @@ -1141,20 +1186,16 @@ namespace orc { setMaximum(tempStr); } else { // update min - int minCmp = strncmp(_stats.getMinimum().c_str(), - value, + int minCmp = strncmp(_stats.getMinimum().c_str(), value, std::min(_stats.getMinimum().length(), length)); - if (minCmp > 0 || - (minCmp == 0 && length < _stats.getMinimum().length())) { + if (minCmp > 0 || (minCmp == 0 && length < _stats.getMinimum().length())) { setMinimum(std::string(value, value + length)); } // update max - int maxCmp = strncmp(_stats.getMaximum().c_str(), - value, + int maxCmp = strncmp(_stats.getMaximum().c_str(), value, std::min(_stats.getMaximum().length(), length)); - if (maxCmp < 0 || - (maxCmp == 0 && length > _stats.getMaximum().length())) { + if (maxCmp < 0 || (maxCmp == 0 && length > _stats.getMaximum().length())) { setMaximum(std::string(value, value + length)); } } @@ -1169,7 +1210,7 @@ namespace orc { void merge(const MutableColumnStatistics& other) override { const StringColumnStatisticsImpl& strStats = - dynamic_cast<const StringColumnStatisticsImpl&>(other); + dynamic_cast<const StringColumnStatisticsImpl&>(other); _stats.merge(strStats._stats); } @@ -1179,10 +1220,10 @@ namespace orc { } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); + pbStats.set_has_null(_stats.hasNull()); + pbStats.set_number_of_values(_stats.getNumberOfValues()); - proto::StringStatistics* strStats = pbStats.mutable_stringstatistics(); + proto::StringStatistics* strStats = pbStats.mutable_string_statistics(); if (_stats.hasMinimum()) { strStats->set_minimum(TString(_stats.getMinimum())); 
strStats->set_maximum(TString(_stats.getMaximum())); @@ -1202,42 +1243,44 @@ namespace orc { buffer << "Data type: String" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; - if(hasMinimum()){ + if (hasMinimum()) { buffer << "Minimum: " << getMinimum() << std::endl; - }else{ + } else { buffer << "Minimum is not defined" << std::endl; } - if(hasMaximum()){ + if (hasMaximum()) { buffer << "Maximum: " << getMaximum() << std::endl; - }else{ + } else { buffer << "Maximum is not defined" << std::endl; } - if(hasTotalLength()){ + if (hasTotalLength()) { buffer << "Total length: " << getTotalLength() << std::endl; - }else{ + } else { buffer << "Total length is not defined" << std::endl; } return buffer.str(); } }; - class TimestampColumnStatisticsImpl: public TimestampColumnStatistics, - public MutableColumnStatistics { - private: + class TimestampColumnStatisticsImpl : public TimestampColumnStatistics, + public MutableColumnStatistics { + private: InternalIntegerStatistics _stats; bool _hasLowerBound; bool _hasUpperBound; int64_t _lowerBound; int64_t _upperBound; - int32_t _minimumNanos; // last 6 digits of nanosecond of minimum timestamp - int32_t _maximumNanos; // last 6 digits of nanosecond of maximum timestamp + int32_t _minimumNanos; // last 6 digits of nanosecond of minimum timestamp + int32_t _maximumNanos; // last 6 digits of nanosecond of maximum timestamp static constexpr int32_t DEFAULT_MIN_NANOS = 0; static constexpr int32_t DEFAULT_MAX_NANOS = 999999; - public: - TimestampColumnStatisticsImpl() { reset(); } + public: + TimestampColumnStatisticsImpl() { + reset(); + } TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext); virtual ~TimestampColumnStatisticsImpl() override; @@ -1271,17 +1314,17 @@ namespace orc { } int64_t getMinimum() const override { - if(hasMinimum()){ + if (hasMinimum()) { return _stats.getMinimum(); - }else{ + } else { 
throw ParseError("Minimum is not defined."); } } int64_t getMaximum() const override { - if(hasMaximum()){ + if (hasMaximum()) { return _stats.getMaximum(); - }else{ + } else { throw ParseError("Maximum is not defined."); } } @@ -1326,7 +1369,7 @@ namespace orc { void merge(const MutableColumnStatistics& other) override { const TimestampColumnStatisticsImpl& tsStats = - dynamic_cast<const TimestampColumnStatisticsImpl&>(other); + dynamic_cast<const TimestampColumnStatisticsImpl&>(other); _stats.setHasNull(_stats.hasNull() || tsStats.hasNull()); _stats.setNumberOfValues(_stats.getNumberOfValues() + tsStats.getNumberOfValues()); @@ -1365,25 +1408,24 @@ namespace orc { } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); + pbStats.set_has_null(_stats.hasNull()); + pbStats.set_number_of_values(_stats.getNumberOfValues()); - proto::TimestampStatistics* tsStats = - pbStats.mutable_timestampstatistics(); + proto::TimestampStatistics* tsStats = pbStats.mutable_timestamp_statistics(); if (_stats.hasMinimum()) { - tsStats->set_minimumutc(_stats.getMinimum()); - tsStats->set_maximumutc(_stats.getMaximum()); + tsStats->set_minimum_utc(_stats.getMinimum()); + tsStats->set_maximum_utc(_stats.getMaximum()); if (_minimumNanos != DEFAULT_MIN_NANOS) { - tsStats->set_minimumnanos(_minimumNanos + 1); + tsStats->set_minimum_nanos(_minimumNanos + 1); } if (_maximumNanos != DEFAULT_MAX_NANOS) { - tsStats->set_maximumnanos(_maximumNanos + 1); + tsStats->set_maximum_nanos(_maximumNanos + 1); } } else { - tsStats->clear_minimumutc(); - tsStats->clear_maximumutc(); - tsStats->clear_minimumnanos(); - tsStats->clear_maximumnanos(); + tsStats->clear_minimum_utc(); + tsStats->clear_maximum_utc(); + tsStats->clear_minimum_nanos(); + tsStats->clear_maximum_nanos(); } } @@ -1396,43 +1438,39 @@ namespace orc { buffer << "Data type: Timestamp" << std::endl << "Values: " << 
getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; - if(hasMinimum()){ + if (hasMinimum()) { secs = static_cast<time_t>(getMinimum() / 1000); gmtime_r(&secs, &tmValue); strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); - buffer << "Minimum: " << timeBuffer << "." - << (getMinimum() % 1000) << std::endl; - }else{ + buffer << "Minimum: " << timeBuffer << "." << (getMinimum() % 1000) << std::endl; + } else { buffer << "Minimum is not defined" << std::endl; } - if(hasLowerBound()){ + if (hasLowerBound()) { secs = static_cast<time_t>(getLowerBound() / 1000); gmtime_r(&secs, &tmValue); strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); - buffer << "LowerBound: " << timeBuffer << "." - << (getLowerBound() % 1000) << std::endl; - }else{ + buffer << "LowerBound: " << timeBuffer << "." << (getLowerBound() % 1000) << std::endl; + } else { buffer << "LowerBound is not defined" << std::endl; } - if(hasMaximum()){ - secs = static_cast<time_t>(getMaximum()/1000); + if (hasMaximum()) { + secs = static_cast<time_t>(getMaximum() / 1000); gmtime_r(&secs, &tmValue); strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); - buffer << "Maximum: " << timeBuffer << "." - << (getMaximum() % 1000) << std::endl; - }else{ + buffer << "Maximum: " << timeBuffer << "." << (getMaximum() % 1000) << std::endl; + } else { buffer << "Maximum is not defined" << std::endl; } - if(hasUpperBound()){ + if (hasUpperBound()) { secs = static_cast<time_t>(getUpperBound() / 1000); gmtime_r(&secs, &tmValue); strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); - buffer << "UpperBound: " << timeBuffer << "." - << (getUpperBound() % 1000) << std::endl; - }else{ + buffer << "UpperBound: " << timeBuffer << "." 
<< (getUpperBound() % 1000) << std::endl; + } else { buffer << "UpperBound is not defined" << std::endl; } @@ -1448,17 +1486,17 @@ namespace orc { } int64_t getLowerBound() const override { - if(hasLowerBound()){ + if (hasLowerBound()) { return _lowerBound; - }else{ + } else { throw ParseError("LowerBound is not defined."); } } int64_t getUpperBound() const override { - if(hasUpperBound()){ + if (hasUpperBound()) { return _upperBound; - }else{ + } else { throw ParseError("UpperBound is not defined."); } } @@ -1482,12 +1520,14 @@ namespace orc { class CollectionColumnStatisticsImpl : public CollectionColumnStatistics, public MutableColumnStatistics { - private: + private: InternalCollectionStatistics _stats; - public: - CollectionColumnStatisticsImpl() { reset(); } - CollectionColumnStatisticsImpl(const proto::ColumnStatistics &stats); + public: + CollectionColumnStatisticsImpl() { + reset(); + } + CollectionColumnStatisticsImpl(const proto::ColumnStatistics& stats); virtual ~CollectionColumnStatisticsImpl() override; bool hasMinimumChildren() const override { @@ -1523,7 +1563,7 @@ namespace orc { } uint64_t getMinimumChildren() const override { - if(hasMinimumChildren()) { + if (hasMinimumChildren()) { return _stats.getMinimum(); } else { throw ParseError("MinimumChildren is not defined."); @@ -1531,7 +1571,7 @@ namespace orc { } uint64_t getMaximumChildren() const override { - if(hasMaximumChildren()) { + if (hasMaximumChildren()) { return _stats.getMaximum(); } else { throw ParseError("MaximumChildren is not defined."); @@ -1539,7 +1579,7 @@ namespace orc { } uint64_t getTotalChildren() const override { - if(hasTotalChildren()) { + if (hasTotalChildren()) { return _stats.getSum(); } else { throw ParseError("TotalChildren is not defined."); @@ -1598,31 +1638,30 @@ namespace orc { } } - void toProtoBuf(proto::ColumnStatistics &pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); + void 
toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_has_null(_stats.hasNull()); + pbStats.set_number_of_values(_stats.getNumberOfValues()); - proto::CollectionStatistics* collectionStats = - pbStats.mutable_collectionstatistics(); + proto::CollectionStatistics* collectionStats = pbStats.mutable_collection_statistics(); if (_stats.hasMinimum()) { - collectionStats->set_minchildren(_stats.getMinimum()); - collectionStats->set_maxchildren(_stats.getMaximum()); + collectionStats->set_min_children(_stats.getMinimum()); + collectionStats->set_max_children(_stats.getMaximum()); } else { - collectionStats->clear_minchildren(); - collectionStats->clear_maxchildren(); + collectionStats->clear_min_children(); + collectionStats->clear_max_children(); } if (_stats.hasSum()) { - collectionStats->set_totalchildren(_stats.getSum()); + collectionStats->set_total_children(_stats.getSum()); } else { - collectionStats->clear_totalchildren(); + collectionStats->clear_total_children(); } } std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Collection(LIST|MAP)" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; + << "Values: " << getNumberOfValues() << std::endl + << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; if (hasMinimumChildren()) { buffer << "MinChildren: " << getMinimumChildren() << std::endl; } else { @@ -1647,22 +1686,20 @@ namespace orc { ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, const StatContext& statContext); - class StatisticsImpl: public Statistics { - private: + class StatisticsImpl : public Statistics { + private: std::vector<ColumnStatistics*> colStats; // DELIBERATELY NOT IMPLEMENTED StatisticsImpl(const StatisticsImpl&); StatisticsImpl& operator=(const StatisticsImpl&); - public: - StatisticsImpl(const proto::StripeStatistics& stripeStats, - const StatContext& statContext); + public: + StatisticsImpl(const proto::StripeStatistics& stripeStats, const StatContext& statContext); StatisticsImpl(const proto::Footer& footer, const StatContext& statContext); - virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId - ) const override { + virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override { return colStats[columnId]; } @@ -1673,24 +1710,21 @@ namespace orc { } }; - class StripeStatisticsImpl: public StripeStatistics { - private: + class StripeStatisticsImpl : public StripeStatistics { + private: std::unique_ptr<StatisticsImpl> columnStats; - std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > - rowIndexStats; + std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > rowIndexStats; // DELIBERATELY NOT IMPLEMENTED StripeStatisticsImpl(const StripeStatisticsImpl&); StripeStatisticsImpl& operator=(const StripeStatisticsImpl&); - public: - StripeStatisticsImpl( - const proto::StripeStatistics& stripeStats, - std::vector<std::vector<proto::ColumnStatistics> >& indexStats, - const StatContext& statContext); + public: + StripeStatisticsImpl(const proto::StripeStatistics& stripeStats, + std::vector<std::vector<proto::ColumnStatistics> >& indexStats, + const StatContext& statContext); - virtual const ColumnStatistics* 
getColumnStatistics(uint32_t columnId - ) const override { + virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override { return columnStats->getColumnStatistics(columnId); } @@ -1699,8 +1733,7 @@ namespace orc { } virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId, - uint32_t rowIndex - ) const override { + uint32_t rowIndex) const override { // check id indices are valid return rowIndexStats[columnId][rowIndex].get(); } @@ -1717,9 +1750,8 @@ namespace orc { * @param type of column * @return MutableColumnStatistics instances */ - std::unique_ptr<MutableColumnStatistics> createColumnStatistics( - const Type& type); + std::unique_ptr<MutableColumnStatistics> createColumnStatistics(const Type& type); -}// namespace +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/src/StripeStream.cc b/contrib/libs/apache/orc/c++/src/StripeStream.cc index 6d6dda8328..8507e95767 100644 --- a/contrib/libs/apache/orc/c++/src/StripeStream.cc +++ b/contrib/libs/apache/orc/c++/src/StripeStream.cc @@ -16,10 +16,10 @@ * limitations under the License. 
*/ -#include "orc/Exceptions.hh" +#include "StripeStream.hh" #include "RLE.hh" #include "Reader.hh" -#include "StripeStream.hh" +#include "orc/Exceptions.hh" #include "wrap/coded-stream-wrapper.h" @@ -27,19 +27,17 @@ namespace orc { StripeStreamsImpl::StripeStreamsImpl(const RowReaderImpl& _reader, uint64_t _index, const proto::StripeInformation& _stripeInfo, - const proto::StripeFooter& _footer, - uint64_t _stripeStart, - InputStream& _input, - const Timezone& _writerTimezone, - const Timezone& _readerTimezone - ): reader(_reader), - stripeInfo(_stripeInfo), - footer(_footer), - stripeIndex(_index), - stripeStart(_stripeStart), - input(_input), - writerTimezone(_writerTimezone), - readerTimezone(_readerTimezone) { + const proto::StripeFooter& _footer, uint64_t _stripeStart, + InputStream& _input, const Timezone& _writerTimezone, + const Timezone& _readerTimezone) + : reader(_reader), + stripeInfo(_stripeInfo), + footer(_footer), + stripeIndex(_index), + stripeStart(_stripeStart), + input(_input), + writerTimezone(_writerTimezone), + readerTimezone(_readerTimezone) { // PASS } @@ -55,7 +53,6 @@ namespace orc { // PASS } - StreamInformationImpl::~StreamInformationImpl() { // PASS } @@ -64,8 +61,7 @@ namespace orc { return reader.getSelectedColumns(); } - proto::ColumnEncoding StripeStreamsImpl::getEncoding(uint64_t columnId - ) const { + proto::ColumnEncoding StripeStreamsImpl::getEncoding(uint64_t columnId) const { return footer.columns(static_cast<int>(columnId)); } @@ -81,48 +77,46 @@ namespace orc { return reader.getFileContents().errorStream; } - std::unique_ptr<SeekableInputStream> - StripeStreamsImpl::getStream(uint64_t columnId, - proto::Stream_Kind kind, - bool shouldStream) const { + std::unique_ptr<SeekableInputStream> StripeStreamsImpl::getStream(uint64_t columnId, + proto::Stream_Kind kind, + bool shouldStream) const { uint64_t offset = stripeStart; - uint64_t dataEnd = stripeInfo.offset() + stripeInfo.indexlength() + stripeInfo.datalength(); - 
MemoryPool *pool = reader.getFileContents().pool; - for(int i = 0; i < footer.streams_size(); ++i) { + uint64_t dataEnd = stripeInfo.offset() + stripeInfo.index_length() + stripeInfo.data_length(); + MemoryPool* pool = reader.getFileContents().pool; + for (int i = 0; i < footer.streams_size(); ++i) { const proto::Stream& stream = footer.streams(i); - if (stream.has_kind() && - stream.kind() == kind && + if (stream.has_kind() && stream.kind() == kind && stream.column() == static_cast<uint64_t>(columnId)) { uint64_t streamLength = stream.length(); - uint64_t myBlock = shouldStream ? input.getNaturalReadSize(): streamLength; + uint64_t myBlock = shouldStream ? input.getNaturalReadSize() : streamLength; if (offset + streamLength > dataEnd) { std::stringstream msg; msg << "Malformed stream meta at stream index " << i << " in stripe " << stripeIndex << ": streamOffset=" << offset << ", streamLength=" << streamLength - << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength=" - << stripeInfo.indexlength() << ", stripeDataLength=" << stripeInfo.datalength(); + << ", stripeOffset=" << stripeInfo.offset() + << ", stripeIndexLength=" << stripeInfo.index_length() + << ", stripeDataLength=" << stripeInfo.data_length(); throw ParseError(msg.str()); } return createDecompressor(reader.getCompression(), - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream - (&input, - offset, - stream.length(), - *pool, - myBlock)), - reader.getCompressionSize(), - *pool); + std::make_unique<SeekableFileInputStream>( + &input, offset, stream.length(), *pool, myBlock), + reader.getCompressionSize(), *pool, + reader.getFileContents().readerMetrics); } offset += stream.length(); } - return std::unique_ptr<SeekableInputStream>(); + return nullptr; } MemoryPool& StripeStreamsImpl::getMemoryPool() const { return *reader.getFileContents().pool; } + ReaderMetrics* StripeStreamsImpl::getReaderMetrics() const { + return reader.getFileContents().readerMetrics; + } + bool 
StripeStreamsImpl::getThrowOnHive11DecimalOverflow() const { return reader.getThrowOnHive11DecimalOverflow(); } @@ -135,37 +129,33 @@ namespace orc { return reader.getForcedScaleOnHive11Decimal(); } + const SchemaEvolution* StripeStreamsImpl::getSchemaEvolution() const { + return reader.getSchemaEvolution(); + } + void StripeInformationImpl::ensureStripeFooterLoaded() const { if (stripeFooter.get() == nullptr) { std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(compression, - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream(stream, - offset + - indexLength + - dataLength, - footerLength, - memory)), - blockSize, - memory); - stripeFooter.reset(new proto::StripeFooter()); + createDecompressor(compression, + std::make_unique<SeekableFileInputStream>( + stream, offset + indexLength + dataLength, footerLength, memory), + blockSize, memory, metrics); + stripeFooter = std::make_unique<proto::StripeFooter>(); if (!stripeFooter->ParseFromZeroCopyStream(pbStream.get())) { throw ParseError("Failed to parse the stripe footer"); } } } - std::unique_ptr<StreamInformation> - StripeInformationImpl::getStreamInformation(uint64_t streamId) const { + std::unique_ptr<StreamInformation> StripeInformationImpl::getStreamInformation( + uint64_t streamId) const { ensureStripeFooterLoaded(); uint64_t streamOffset = offset; - for(uint64_t s=0; s < streamId; ++s) { + for (uint64_t s = 0; s < streamId; ++s) { streamOffset += stripeFooter->streams(static_cast<int>(s)).length(); } - return ORC_UNIQUE_PTR<StreamInformation> - (new StreamInformationImpl(streamOffset, - stripeFooter-> - streams(static_cast<int>(streamId)))); + return std::make_unique<StreamInformationImpl>( + streamOffset, stripeFooter->streams(static_cast<int>(streamId))); } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/StripeStream.hh b/contrib/libs/apache/orc/c++/src/StripeStream.hh index 8d9fb06527..eae6ce0c31 100644 --- 
a/contrib/libs/apache/orc/c++/src/StripeStream.hh +++ b/contrib/libs/apache/orc/c++/src/StripeStream.hh @@ -23,6 +23,7 @@ #include "orc/OrcFile.hh" #include "orc/Reader.hh" +#include "ColumnReader.hh" #include "Timezone.hh" #include "TypeImpl.hh" @@ -31,11 +32,11 @@ namespace orc { class RowReaderImpl; /** - * StripeStream Implementation - */ + * StripeStream Implementation + */ - class StripeStreamsImpl: public StripeStreams { - private: + class StripeStreamsImpl : public StripeStreams { + private: const RowReaderImpl& reader; const proto::StripeInformation& stripeInfo; const proto::StripeFooter& footer; @@ -45,29 +46,26 @@ namespace orc { const Timezone& writerTimezone; const Timezone& readerTimezone; - public: + public: StripeStreamsImpl(const RowReaderImpl& reader, uint64_t index, - const proto::StripeInformation& stripeInfo, - const proto::StripeFooter& footer, - uint64_t stripeStart, - InputStream& input, - const Timezone& writerTimezone, + const proto::StripeInformation& stripeInfo, const proto::StripeFooter& footer, + uint64_t stripeStart, InputStream& input, const Timezone& writerTimezone, const Timezone& readerTimezone); virtual ~StripeStreamsImpl() override; virtual const std::vector<bool> getSelectedColumns() const override; - virtual proto::ColumnEncoding getEncoding(uint64_t columnId - ) const override; + virtual proto::ColumnEncoding getEncoding(uint64_t columnId) const override; - virtual std::unique_ptr<SeekableInputStream> - getStream(uint64_t columnId, - proto::Stream_Kind kind, - bool shouldStream) const override; + virtual std::unique_ptr<SeekableInputStream> getStream(uint64_t columnId, + proto::Stream_Kind kind, + bool shouldStream) const override; MemoryPool& getMemoryPool() const override; + ReaderMetrics* getReaderMetrics() const override; + const Timezone& getWriterTimezone() const override; const Timezone& getReaderTimezone() const override; @@ -79,25 +77,27 @@ namespace orc { bool isDecimalAsLong() const override; int32_t 
getForcedScaleOnHive11Decimal() const override; + + const SchemaEvolution* getSchemaEvolution() const override; }; - /** - * StreamInformation Implementation - */ + /** + * StreamInformation Implementation + */ - class StreamInformationImpl: public StreamInformation { - private: + class StreamInformationImpl : public StreamInformation { + private: StreamKind kind; uint64_t column; uint64_t offset; uint64_t length; - public: - StreamInformationImpl(uint64_t _offset, - const proto::Stream& stream - ): kind(static_cast<StreamKind>(stream.kind())), - column(stream.column()), - offset(_offset), - length(stream.length()) { + + public: + StreamInformationImpl(uint64_t _offset, const proto::Stream& stream) + : kind(static_cast<StreamKind>(stream.kind())), + column(stream.column()), + offset(_offset), + length(stream.length()) { // PASS } @@ -120,9 +120,9 @@ namespace orc { } }; - /** - * StripeInformation Implementation - */ + /** + * StripeInformation Implementation + */ class StripeInformationImpl : public StripeInformation { uint64_t offset; @@ -135,27 +135,24 @@ namespace orc { CompressionKind compression; uint64_t blockSize; mutable std::unique_ptr<proto::StripeFooter> stripeFooter; + ReaderMetrics* metrics; void ensureStripeFooterLoaded() const; - public: - - StripeInformationImpl(uint64_t _offset, - uint64_t _indexLength, - uint64_t _dataLength, - uint64_t _footerLength, - uint64_t _numRows, - InputStream* _stream, - MemoryPool& _memory, - CompressionKind _compression, - uint64_t _blockSize - ) : offset(_offset), - indexLength(_indexLength), - dataLength(_dataLength), - footerLength(_footerLength), - numRows(_numRows), - stream(_stream), - memory(_memory), - compression(_compression), - blockSize(_blockSize) { + + public: + StripeInformationImpl(uint64_t _offset, uint64_t _indexLength, uint64_t _dataLength, + uint64_t _footerLength, uint64_t _numRows, InputStream* _stream, + MemoryPool& _memory, CompressionKind _compression, uint64_t _blockSize, + ReaderMetrics* 
_metrics) + : offset(_offset), + indexLength(_indexLength), + dataLength(_dataLength), + footerLength(_footerLength), + numRows(_numRows), + stream(_stream), + memory(_memory), + compression(_compression), + blockSize(_blockSize), + metrics(_metrics) { // PASS } @@ -174,7 +171,7 @@ namespace orc { return indexLength; } - uint64_t getDataLength()const override { + uint64_t getDataLength() const override { return dataLength; } @@ -191,29 +188,25 @@ namespace orc { return static_cast<uint64_t>(stripeFooter->streams_size()); } - std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId - ) const override; + std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId) const override; ColumnEncodingKind getColumnEncoding(uint64_t colId) const override { ensureStripeFooterLoaded(); - return static_cast<ColumnEncodingKind>(stripeFooter-> - columns(static_cast<int>(colId)) - .kind()); + return static_cast<ColumnEncodingKind>(stripeFooter->columns(static_cast<int>(colId)).kind()); } uint64_t getDictionarySize(uint64_t colId) const override { ensureStripeFooterLoaded(); - return static_cast<ColumnEncodingKind>(stripeFooter-> - columns(static_cast<int>(colId)) - .dictionarysize()); + return static_cast<ColumnEncodingKind>( + stripeFooter->columns(static_cast<int>(colId)).dictionary_size()); } const std::string& getWriterTimezone() const override { ensureStripeFooterLoaded(); - return stripeFooter->writertimezone(); + return stripeFooter->writer_timezone(); } }; -} +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/src/Timezone.cc b/contrib/libs/apache/orc/c++/src/Timezone.cc index 318e5bcc12..27e14480d5 100644 --- a/contrib/libs/apache/orc/c++/src/Timezone.cc +++ b/contrib/libs/apache/orc/c++/src/Timezone.cc @@ -16,16 +16,17 @@ * limitations under the License. 
*/ -#include "orc/OrcFile.hh" #include "Timezone.hh" +#include "orc/OrcFile.hh" #include <errno.h> -#include <map> -#include <sstream> #include <stdint.h> #include <stdlib.h> #include <string.h> #include <time.h> +#include <filesystem> +#include <map> +#include <sstream> namespace orc { @@ -35,25 +36,21 @@ namespace orc { // location of a symlink to the local timezone static const char LOCAL_TIMEZONE[] = "/etc/localtime"; - enum TransitionKind { - TRANSITION_JULIAN, - TRANSITION_DAY, - TRANSITION_MONTH - }; + enum TransitionKind { TRANSITION_JULIAN, TRANSITION_DAY, TRANSITION_MONTH }; static const int64_t MONTHS_PER_YEAR = 12; /** * The number of days in each month in non-leap and leap years. */ - static const int64_t DAYS_PER_MONTH[2][MONTHS_PER_YEAR] = - {{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, + static const int64_t DAYS_PER_MONTH[2][MONTHS_PER_YEAR] = { + {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; static const int64_t DAYS_PER_WEEK = 7; // Leap years and day of the week repeat every 400 years, which makes it // a good cycle length. static const int64_t SECONDS_PER_400_YEARS = - SECONDS_PER_DAY * (365 * (300 + 3) + 366 * (100 - 3)); + SECONDS_PER_DAY * (365 * (300 + 3) + 366 * (100 - 3)); /** * Is the given year a leap year? 
@@ -68,7 +65,7 @@ namespace orc { * @return -1 if the target < array[0] or array is empty or * i if array[i] <= target and (i == n or array[i] < array[i+1]) */ - int64_t binarySearch(const std::vector<int64_t> &array, int64_t target) { + int64_t binarySearch(const std::vector<int64_t>& array, int64_t target) { uint64_t size = array.size(); if (size == 0) { return -1; @@ -103,18 +100,17 @@ namespace orc { std::string toString() const { std::stringstream buffer; switch (kind) { - case TRANSITION_JULIAN: - buffer << "julian " << day; - break; - case TRANSITION_DAY: - buffer << "day " << day; - break; - case TRANSITION_MONTH: - buffer << "month " << month << " week " << week << " day " << day; - break; + case TRANSITION_JULIAN: + buffer << "julian " << day; + break; + case TRANSITION_DAY: + buffer << "day " << day; + break; + case TRANSITION_MONTH: + buffer << "month " << month << " week " << week << " day " << day; + break; } - buffer << " at " << (time / (60 * 60)) << ":" << ((time / 60) % 60) - << ":" << (time % 60); + buffer << " at " << (time / (60 * 60)) << ":" << ((time / 60) % 60) << ":" << (time % 60); return buffer.str(); } @@ -127,48 +123,48 @@ namespace orc { int64_t getTime(int64_t year) const { int64_t result = time; switch (kind) { - case TRANSITION_JULIAN: - result += SECONDS_PER_DAY * day; - if (day > 60 && isLeap(year)) { - result += SECONDS_PER_DAY; - } - break; - case TRANSITION_DAY: - result += SECONDS_PER_DAY * day; - break; - case TRANSITION_MONTH: { - bool inLeap = isLeap(year); - int64_t adjustedMonth = (month + 9) % 12 + 1; - int64_t adjustedYear = (month <= 2) ? 
(year - 1) : year; - int64_t adjustedCentury = adjustedYear / 100; - int64_t adjustedRemainder = adjustedYear % 100; - - // day of the week of the first day of month - int64_t dayOfWeek = ((26 * adjustedMonth - 2) / 10 + - 1 + adjustedRemainder + adjustedRemainder / 4 + - adjustedCentury / 4 - 2 * adjustedCentury) % 7; - if (dayOfWeek < 0) { - dayOfWeek += DAYS_PER_WEEK; - } + case TRANSITION_JULIAN: + result += SECONDS_PER_DAY * day; + if (day > 60 && isLeap(year)) { + result += SECONDS_PER_DAY; + } + break; + case TRANSITION_DAY: + result += SECONDS_PER_DAY * day; + break; + case TRANSITION_MONTH: { + bool inLeap = isLeap(year); + int64_t adjustedMonth = (month + 9) % 12 + 1; + int64_t adjustedYear = (month <= 2) ? (year - 1) : year; + int64_t adjustedCentury = adjustedYear / 100; + int64_t adjustedRemainder = adjustedYear % 100; + + // day of the week of the first day of month + int64_t dayOfWeek = ((26 * adjustedMonth - 2) / 10 + 1 + adjustedRemainder + + adjustedRemainder / 4 + adjustedCentury / 4 - 2 * adjustedCentury) % + 7; + if (dayOfWeek < 0) { + dayOfWeek += DAYS_PER_WEEK; + } - int64_t d = day - dayOfWeek; - if (d < 0) { - d += DAYS_PER_WEEK; - } - for (int w = 1; w < week; ++w) { - if (d + DAYS_PER_WEEK >= DAYS_PER_MONTH[inLeap][month - 1]) { - break; + int64_t d = day - dayOfWeek; + if (d < 0) { + d += DAYS_PER_WEEK; } - d += DAYS_PER_WEEK; - } - result += d * SECONDS_PER_DAY; + for (int w = 1; w < week; ++w) { + if (d + DAYS_PER_WEEK >= DAYS_PER_MONTH[inLeap][month - 1]) { + break; + } + d += DAYS_PER_WEEK; + } + result += d * SECONDS_PER_DAY; - // Add in the time for the month - for(int m=0; m < month - 1; ++m) { - result += DAYS_PER_MONTH[inLeap][m] * SECONDS_PER_DAY; + // Add in the time for the month + for (int m = 0; m < month - 1; ++m) { + result += DAYS_PER_MONTH[inLeap][m] * SECONDS_PER_DAY; + } + break; } - break; - } } return result; } @@ -187,7 +183,7 @@ namespace orc { * daylight = <name><offset>,<start day>(/<offset>)?,<end 
day>(/<offset>)? * day = J<day without 2/29>|<day with 2/29>|M<month>.<week>.<day of week> */ - class FutureRuleImpl: public FutureRule { + class FutureRuleImpl : public FutureRule { std::string ruleString; TimezoneVariant standard; bool hasDst; @@ -215,17 +211,17 @@ namespace orc { offsets.resize(400 * 2 + 1); startInStd = start.getTime(1970) < end.getTime(1970); int64_t base = 0; - for(int64_t year = 1970; year < 1970 + 400; ++year) { + for (int64_t year = 1970; year < 1970 + 400; ++year) { if (startInStd) { offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] = - base + start.getTime(year) - standard.gmtOffset; + base + start.getTime(year) - standard.gmtOffset; offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] = - base + end.getTime(year) - dst.gmtOffset; + base + end.getTime(year) - dst.gmtOffset; } else { offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] = - base + end.getTime(year) - dst.gmtOffset; + base + end.getTime(year) - dst.gmtOffset; offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] = - base + start.getTime(year) - standard.gmtOffset; + base + start.getTime(year) - standard.gmtOffset; } base += (isLeap(year) ? 366 : 365) * SECONDS_PER_DAY; } @@ -233,7 +229,7 @@ namespace orc { offsets[0] = 0; } - public: + public: virtual ~FutureRuleImpl() override; bool isDefined() const override; const TimezoneVariant& getVariant(int64_t clk) const override; @@ -287,13 +283,9 @@ namespace orc { * A parser for the future rule strings. 
*/ class FutureRuleParser { - public: - FutureRuleParser(const std::string& str, - FutureRuleImpl* rule - ): ruleString(str), - length(str.size()), - position(0), - output(*rule) { + public: + FutureRuleParser(const std::string& str, FutureRuleImpl* rule) + : ruleString(str), length(str.size()), position(0), output(*rule) { output.ruleString = str; if (position != length) { parseName(output.standard.name); @@ -318,14 +310,13 @@ namespace orc { } } - private: - + private: const std::string& ruleString; size_t length; size_t position; - FutureRuleImpl &output; + FutureRuleImpl& output; - void throwError(const char *msg) { + void throwError(const char* msg) { std::stringstream buffer; buffer << msg << " at " << position << " in '" << ruleString << "'"; throw TimezoneError(buffer.str()); @@ -348,7 +339,7 @@ namespace orc { if (position == length) { throwError("missing close '>'"); } - position +=1; + position += 1; } else { while (position < length) { char ch = ruleString[position]; @@ -456,9 +447,8 @@ namespace orc { * Parse the POSIX TZ string. */ std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString) { - std::shared_ptr<FutureRule> result(new FutureRuleImpl()); - FutureRuleParser parser(ruleString, - dynamic_cast<FutureRuleImpl*>(result.get())); + auto result = std::make_shared<FutureRuleImpl>(); + FutureRuleParser parser(ruleString, dynamic_cast<FutureRuleImpl*>(result.get())); return result; } @@ -475,7 +465,7 @@ namespace orc { * An abstraction of the differences between versions. 
*/ class VersionParser { - public: + public: virtual ~VersionParser(); /** @@ -496,8 +486,7 @@ namespace orc { /** * Parse the future string */ - virtual std::string parseFutureString(const unsigned char *ptr, - uint64_t offset, + virtual std::string parseFutureString(const unsigned char* ptr, uint64_t offset, uint64_t length) const = 0; }; @@ -506,14 +495,12 @@ namespace orc { } static uint32_t decode32(const unsigned char* ptr) { - return static_cast<uint32_t>(ptr[0] << 24) | - static_cast<uint32_t>(ptr[1] << 16) | - static_cast<uint32_t>(ptr[2] << 8) | - static_cast<uint32_t>(ptr[3]); + return static_cast<uint32_t>(ptr[0] << 24) | static_cast<uint32_t>(ptr[1] << 16) | + static_cast<uint32_t>(ptr[2] << 8) | static_cast<uint32_t>(ptr[3]); } - class Version1Parser: public VersionParser { - public: + class Version1Parser : public VersionParser { + public: virtual ~Version1Parser() override; virtual uint64_t getVersion() const override { @@ -535,9 +522,7 @@ namespace orc { return static_cast<int32_t>(decode32(ptr)); } - virtual std::string parseFutureString(const unsigned char *, - uint64_t, - uint64_t) const override { + virtual std::string parseFutureString(const unsigned char*, uint64_t, uint64_t) const override { return ""; } }; @@ -546,8 +531,8 @@ namespace orc { // PASS } - class Version2Parser: public VersionParser { - public: + class Version2Parser : public VersionParser { + public: virtual ~Version2Parser() override; virtual uint64_t getVersion() const override { @@ -568,11 +553,9 @@ namespace orc { return static_cast<int64_t>(decode32(ptr)) << 32 | decode32(ptr + 4); } - virtual std::string parseFutureString(const unsigned char *ptr, - uint64_t offset, + virtual std::string parseFutureString(const unsigned char* ptr, uint64_t offset, uint64_t length) const override { - return std::string(reinterpret_cast<const char*>(ptr) + offset + 1, - length - 2); + return std::string(reinterpret_cast<const char*>(ptr) + offset + 1, length - 2); } }; @@ -580,10 +563,9 @@ 
namespace orc { // PASS } - class TimezoneImpl: public Timezone { - public: - TimezoneImpl(const std::string& name, - const std::vector<unsigned char> bytes); + class TimezoneImpl : public Timezone { + public: + TimezoneImpl(const std::string& _filename, const std::vector<unsigned char>& buffer); virtual ~TimezoneImpl() override; /** @@ -605,15 +587,16 @@ namespace orc { return clk + getVariant(clk).gmtOffset; } - private: - void parseTimeVariants(const unsigned char* ptr, - uint64_t variantOffset, - uint64_t variantCount, - uint64_t nameOffset, - uint64_t nameCount); - void parseZoneFile(const unsigned char* ptr, - uint64_t sectionOffset, - uint64_t fileLength, + int64_t convertFromUTC(int64_t clk) const override { + int64_t adjustedTime = clk - getVariant(clk).gmtOffset; + const auto& adjustedReader = getVariant(adjustedTime); + return clk - adjustedReader.gmtOffset; + } + + private: + void parseTimeVariants(const unsigned char* ptr, uint64_t variantOffset, uint64_t variantCount, + uint64_t nameOffset, uint64_t nameCount); + void parseZoneFile(const unsigned char* ptr, uint64_t sectionOffset, uint64_t fileLength, const VersionParser& version); // filename std::string filename; @@ -644,10 +627,10 @@ namespace orc { }; DIAGNOSTIC_PUSH - #ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wglobal-constructors") - DIAGNOSTIC_IGNORE("-Wexit-time-destructors") - #endif +#ifdef __clang__ + DIAGNOSTIC_IGNORE("-Wglobal-constructors") + DIAGNOSTIC_IGNORE("-Wexit-time-destructors") +#endif static std::mutex timezone_mutex; static std::map<std::string, std::shared_ptr<Timezone> > timezoneCache; DIAGNOSTIC_POP @@ -656,9 +639,8 @@ namespace orc { // PASS } - TimezoneImpl::TimezoneImpl(const std::string& _filename, - const std::vector<unsigned char> buffer - ): filename(_filename) { + TimezoneImpl::TimezoneImpl(const std::string& _filename, const std::vector<unsigned char>& buffer) + : filename(_filename) { parseZoneFile(&buffer[0], 0, buffer.size(), Version1Parser()); // Build the literal 
for the ORC epoch // 2015 Jan 1 00:00:00 @@ -675,7 +657,7 @@ namespace orc { } const char* getTimezoneDirectory() { - const char *dir = getenv("TZDIR"); + const char* dir = getenv("TZDIR"); if (!dir) { dir = DEFAULT_TZDIR; } @@ -689,18 +671,23 @@ namespace orc { const Timezone& getTimezoneByFilename(const std::string& filename) { // ORC-110 std::lock_guard<std::mutex> timezone_lock(timezone_mutex); - std::map<std::string, std::shared_ptr<Timezone> >::iterator itr = - timezoneCache.find(filename); + std::map<std::string, std::shared_ptr<Timezone> >::iterator itr = timezoneCache.find(filename); if (itr != timezoneCache.end()) { return *(itr->second).get(); } + if (!std::filesystem::exists(std::filesystem::path(filename))) { + std::stringstream ss; + ss << "Time zone file " << filename << " does not exist." + << " Please install IANA time zone database and set TZDIR env."; + throw TimezoneError(ss.str()); + } try { - ORC_UNIQUE_PTR<InputStream> file = readFile(filename); + std::unique_ptr<InputStream> file = readFile(filename); size_t size = static_cast<size_t>(file->getLength()); std::vector<unsigned char> buffer(size); file->read(&buffer[0], size, 0); - timezoneCache[filename] = std::shared_ptr<Timezone>(new TimezoneImpl(filename, buffer)); - } catch(ParseError& err) { + timezoneCache[filename] = std::make_shared<TimezoneImpl>(filename, buffer); + } catch (ParseError& err) { throw TimezoneError(err.what()); } return *timezoneCache[filename].get(); @@ -732,32 +719,30 @@ namespace orc { * Parse a set of bytes as a timezone file as if they came from filename. 
*/ std::unique_ptr<Timezone> getTimezone(const std::string& filename, - const std::vector<unsigned char>& b){ - return std::unique_ptr<Timezone>(new TimezoneImpl(filename, b)); + const std::vector<unsigned char>& b) { + return std::make_unique<TimezoneImpl>(filename, b); } TimezoneImpl::~TimezoneImpl() { // PASS } - void TimezoneImpl::parseTimeVariants(const unsigned char* ptr, - uint64_t variantOffset, - uint64_t variantCount, - uint64_t nameOffset, + void TimezoneImpl::parseTimeVariants(const unsigned char* ptr, uint64_t variantOffset, + uint64_t variantCount, uint64_t nameOffset, uint64_t nameCount) { - for(uint64_t variant=0; variant < variantCount; ++variant) { + for (uint64_t variant = 0; variant < variantCount; ++variant) { variants[variant].gmtOffset = - static_cast<int32_t>(decode32(ptr + variantOffset + 6 * variant)); + static_cast<int32_t>(decode32(ptr + variantOffset + 6 * variant)); variants[variant].isDst = ptr[variantOffset + 6 * variant + 4] != 0; uint64_t nameStart = ptr[variantOffset + 6 * variant + 5]; if (nameStart >= nameCount) { std::stringstream buffer; - buffer << "name out of range in variant " << variant - << " - " << nameStart << " >= " << nameCount; + buffer << "name out of range in variant " << variant << " - " << nameStart + << " >= " << nameCount; throw TimezoneError(buffer.str()); } - variants[variant].name = std::string(reinterpret_cast<const char*>(ptr) - + nameOffset + nameStart); + variants[variant].name = + std::string(reinterpret_cast<const char*>(ptr) + nameOffset + nameStart); } } @@ -787,17 +772,14 @@ namespace orc { * IsGmt * FutureString */ - void TimezoneImpl::parseZoneFile(const unsigned char *ptr, - uint64_t sectionOffset, - uint64_t fileLength, - const VersionParser& versionParser) { - const uint64_t magicOffset = sectionOffset + 0; + void TimezoneImpl::parseZoneFile(const unsigned char* ptr, uint64_t sectionOffset, + uint64_t fileLength, const VersionParser& versionParser) { + const uint64_t magicOffset = 
sectionOffset + 0; const uint64_t headerOffset = magicOffset + 20; // check for validity before we start parsing if (fileLength < headerOffset + 6 * 4 || - strncmp(reinterpret_cast<const char*>(ptr) + magicOffset, "TZif", 4) - != 0) { + strncmp(reinterpret_cast<const char*>(ptr) + magicOffset, "TZif", 4) != 0) { std::stringstream buffer; buffer << "non-tzfile " << filename; throw TimezoneError(buffer.str()); @@ -805,24 +787,23 @@ namespace orc { const uint64_t isGmtCount = decode32(ptr + headerOffset + 0); const uint64_t isStdCount = decode32(ptr + headerOffset + 4); - const uint64_t leapCount = decode32(ptr + headerOffset + 8); - const uint64_t timeCount = decode32(ptr + headerOffset + 12); - const uint64_t variantCount = decode32(ptr + headerOffset + 16); - const uint64_t nameCount = decode32(ptr + headerOffset + 20); + const uint64_t leapCount = decode32(ptr + headerOffset + 8); + const uint64_t timeCount = decode32(ptr + headerOffset + 12); + const uint64_t variantCount = decode32(ptr + headerOffset + 16); + const uint64_t nameCount = decode32(ptr + headerOffset + 20); const uint64_t timeOffset = headerOffset + 24; - const uint64_t timeVariantOffset = - timeOffset + versionParser.getTimeSize() * timeCount; + const uint64_t timeVariantOffset = timeOffset + versionParser.getTimeSize() * timeCount; const uint64_t variantOffset = timeVariantOffset + timeCount; const uint64_t nameOffset = variantOffset + variantCount * 6; - const uint64_t sectionLength = nameOffset + nameCount - + (versionParser.getTimeSize() + 4) * leapCount - + isGmtCount + isStdCount; + const uint64_t sectionLength = nameOffset + nameCount + + (versionParser.getTimeSize() + 4) * leapCount + isGmtCount + + isStdCount; if (sectionLength > fileLength) { std::stringstream buffer; - buffer << "tzfile too short " << filename - << " needs " << sectionLength << " and has " << fileLength; + buffer << "tzfile too short " << filename << " needs " << sectionLength << " and has " + << fileLength; throw 
TimezoneError(buffer.str()); } @@ -835,24 +816,19 @@ namespace orc { variants.resize(variantCount); transitions.resize(timeCount); currentVariant.resize(timeCount); - parseTimeVariants(ptr, variantOffset, variantCount, nameOffset, - nameCount); + parseTimeVariants(ptr, variantOffset, variantCount, nameOffset, nameCount); bool foundAncient = false; - for(uint64_t t=0; t < timeCount; ++t) { - transitions[t] = - versionParser.parseTime(ptr + timeOffset + - t * versionParser.getTimeSize()); + for (uint64_t t = 0; t < timeCount; ++t) { + transitions[t] = versionParser.parseTime(ptr + timeOffset + t * versionParser.getTimeSize()); currentVariant[t] = ptr[timeVariantOffset + t]; if (currentVariant[t] >= variantCount) { std::stringstream buffer; - buffer << "tzfile rule out of range " << filename - << " references rule " << currentVariant[t] - << " of " << variantCount; + buffer << "tzfile rule out of range " << filename << " references rule " + << currentVariant[t] << " of " << variantCount; throw TimezoneError(buffer.str()); } // find the oldest standard time and use that as the ancient value - if (!foundAncient && - !variants[currentVariant[t]].isDst) { + if (!foundAncient && !variants[currentVariant[t]].isDst) { foundAncient = true; ancientVariant = currentVariant[t]; } @@ -860,9 +836,8 @@ namespace orc { if (!foundAncient) { ancientVariant = 0; } - futureRule = parseFutureRule(versionParser.parseFutureString - (ptr, sectionLength, - fileLength - sectionLength)); + futureRule = parseFutureRule( + versionParser.parseFutureString(ptr, sectionLength, fileLength - sectionLength)); // find the lower bound for applying the future rule if (futureRule->isDefined()) { @@ -897,11 +872,10 @@ namespace orc { out << "Timezone file: " << filename << "\n"; out << " Version: " << version << "\n"; futureRule->print(out); - for(uint64_t r=0; r < variants.size(); ++r) { - out << " Variant " << r << ": " - << variants[r].toString() << "\n"; + for (uint64_t r = 0; r < variants.size(); ++r) 
{ + out << " Variant " << r << ": " << variants[r].toString() << "\n"; } - for(uint64_t t=0; t < transitions.size(); ++t) { + for (uint64_t t = 0; t < transitions.size(); ++t) { tm timeStruct; tm* result = nullptr; char buffer[25]; @@ -912,25 +886,21 @@ namespace orc { strftime(buffer, sizeof(buffer), "%F %H:%M:%S", &timeStruct); } } - std::cout << " Transition: " << (result == nullptr ? "null" : buffer) - << " (" << transitions[t] << ") -> " - << variants[currentVariant[t]].name - << "\n"; + out << " Transition: " << (result == nullptr ? "null" : buffer) << " (" << transitions[t] + << ") -> " << variants[currentVariant[t]].name << "\n"; } } - TimezoneError::TimezoneError(const std::string& what - ): std::runtime_error(what) { + TimezoneError::TimezoneError(const std::string& what) : std::runtime_error(what) { // PASS } - TimezoneError::TimezoneError(const TimezoneError& other - ): std::runtime_error(other) { + TimezoneError::TimezoneError(const TimezoneError& other) : std::runtime_error(other) { // PASS } - TimezoneError::~TimezoneError() ORC_NOEXCEPT { + TimezoneError::~TimezoneError() noexcept { // PASS } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/Timezone.hh b/contrib/libs/apache/orc/c++/src/Timezone.hh index 6c8b861259..0716c5a3f2 100644 --- a/contrib/libs/apache/orc/c++/src/Timezone.hh +++ b/contrib/libs/apache/orc/c++/src/Timezone.hh @@ -23,9 +23,9 @@ #include "Adaptor.hh" +#include <stdint.h> #include <memory> #include <stdexcept> -#include <stdint.h> #include <string> #include <vector> @@ -55,7 +55,7 @@ namespace orc { * city in the region (eg. America/Los_Angeles or America/Mexico_City). */ class Timezone { - public: + public: virtual ~Timezone(); /** @@ -79,12 +79,17 @@ namespace orc { /** * Get the version of the zone file. 
*/ - virtual uint64_t getVersion() const =0; + virtual uint64_t getVersion() const = 0; /** * Convert wall clock time of current timezone to UTC timezone */ virtual int64_t convertToUTC(int64_t clk) const = 0; + + /** + * Convert UTC timezone to wall clock time of current timezone + */ + virtual int64_t convertFromUTC(int64_t clk) const = 0; }; /** @@ -105,11 +110,11 @@ namespace orc { std::unique_ptr<Timezone> getTimezone(const std::string& filename, const std::vector<unsigned char>& b); - class TimezoneError: public std::runtime_error { - public: - TimezoneError(const std::string& what); - TimezoneError(const TimezoneError&); - virtual ~TimezoneError() ORC_NOEXCEPT; + class TimezoneError : public std::runtime_error { + public: + explicit TimezoneError(const std::string& what); + explicit TimezoneError(const TimezoneError&); + ~TimezoneError() noexcept override; }; /** @@ -118,7 +123,7 @@ namespace orc { * the future. */ class FutureRule { - public: + public: virtual ~FutureRule(); virtual bool isDefined() const = 0; virtual const TimezoneVariant& getVariant(int64_t clk) const = 0; @@ -129,6 +134,6 @@ namespace orc { * Parse the POSIX TZ string. */ std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString); -} +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.cc b/contrib/libs/apache/orc/c++/src/TypeImpl.cc index 14517ce164..c427a962b5 100644 --- a/contrib/libs/apache/orc/c++/src/TypeImpl.cc +++ b/contrib/libs/apache/orc/c++/src/TypeImpl.cc @@ -16,9 +16,9 @@ * limitations under the License. 
*/ +#include "TypeImpl.hh" #include "Adaptor.hh" #include "orc/Exceptions.hh" -#include "TypeImpl.hh" #include <iostream> #include <sstream> @@ -51,8 +51,7 @@ namespace orc { subtypeCount = 0; } - TypeImpl::TypeImpl(TypeKind _kind, uint64_t _precision, - uint64_t _scale) { + TypeImpl::TypeImpl(TypeKind _kind, uint64_t _precision, uint64_t _scale) { parent = nullptr; columnId = -1; maximumColumnId = -1; @@ -66,7 +65,7 @@ namespace orc { uint64_t TypeImpl::assignIds(uint64_t root) const { columnId = static_cast<int64_t>(root); uint64_t current = root + 1; - for(uint64_t i=0; i < subtypeCount; ++i) { + for (uint64_t i = 0; i < subtypeCount; ++i) { current = dynamic_cast<TypeImpl*>(subTypes[i].get())->assignIds(current); } maximumColumnId = static_cast<int64_t>(current) - 1; @@ -121,8 +120,7 @@ namespace orc { return scale; } - Type& TypeImpl::setAttribute(const std::string& key, - const std::string& value) { + Type& TypeImpl::setAttribute(const std::string& key, const std::string& value) { attributes[key] = value; return *this; } @@ -171,8 +169,7 @@ namespace orc { subtypeCount += 1; } - Type* TypeImpl::addStructField(const std::string& fieldName, - std::unique_ptr<Type> fieldType) { + Type* TypeImpl::addStructField(const std::string& fieldName, std::unique_ptr<Type> fieldType) { addChildType(std::move(fieldType)); fieldNames.push_back(fieldName); return this; @@ -184,299 +181,301 @@ namespace orc { } bool isUnquotedFieldName(std::string fieldName) { - for (auto &ch : fieldName) { - if (!isalnum(ch) && ch != '_') { - return false; - } + for (auto& ch : fieldName) { + if (!isalnum(ch) && ch != '_') { + return false; + } } return true; } std::string TypeImpl::toString() const { switch (static_cast<int64_t>(kind)) { - case BOOLEAN: - return "boolean"; - case BYTE: - return "tinyint"; - case SHORT: - return "smallint"; - case INT: - return "int"; - case LONG: - return "bigint"; - case FLOAT: - return "float"; - case DOUBLE: - return "double"; - case STRING: - return 
"string"; - case BINARY: - return "binary"; - case TIMESTAMP: - return "timestamp"; - case TIMESTAMP_INSTANT: - return "timestamp with local time zone"; - case LIST: - return "array<" + (subTypes[0] ? subTypes[0]->toString() : "void") + ">"; - case MAP: - return "map<" + (subTypes[0] ? subTypes[0]->toString() : "void") + "," + - (subTypes[1] ? subTypes[1]->toString() : "void") + ">"; - case STRUCT: { - std::string result = "struct<"; - for(size_t i=0; i < subTypes.size(); ++i) { - if (i != 0) { - result += ","; - } - if (isUnquotedFieldName(fieldNames[i])) { - result += fieldNames[i]; - } else { - std::string name(fieldNames[i]); - size_t pos = 0; - while ((pos = name.find("`", pos)) != std::string::npos) { - name.replace(pos, 1, "``"); - pos += 2; + case BOOLEAN: + return "boolean"; + case BYTE: + return "tinyint"; + case SHORT: + return "smallint"; + case INT: + return "int"; + case LONG: + return "bigint"; + case FLOAT: + return "float"; + case DOUBLE: + return "double"; + case STRING: + return "string"; + case BINARY: + return "binary"; + case TIMESTAMP: + return "timestamp"; + case TIMESTAMP_INSTANT: + return "timestamp with local time zone"; + case LIST: + return "array<" + (subTypes[0] ? subTypes[0]->toString() : "void") + ">"; + case MAP: + return "map<" + (subTypes[0] ? subTypes[0]->toString() : "void") + "," + + (subTypes[1] ? 
subTypes[1]->toString() : "void") + ">"; + case STRUCT: { + std::string result = "struct<"; + for (size_t i = 0; i < subTypes.size(); ++i) { + if (i != 0) { + result += ","; } - result += "`"; - result += name; - result += "`"; + if (isUnquotedFieldName(fieldNames[i])) { + result += fieldNames[i]; + } else { + std::string name(fieldNames[i]); + size_t pos = 0; + while ((pos = name.find("`", pos)) != std::string::npos) { + name.replace(pos, 1, "``"); + pos += 2; + } + result += "`"; + result += name; + result += "`"; + } + result += ":"; + result += subTypes[i]->toString(); } - result += ":"; - result += subTypes[i]->toString(); + result += ">"; + return result; } - result += ">"; - return result; - } - case UNION: { - std::string result = "uniontype<"; - for(size_t i=0; i < subTypes.size(); ++i) { - if (i != 0) { - result += ","; + case UNION: { + std::string result = "uniontype<"; + for (size_t i = 0; i < subTypes.size(); ++i) { + if (i != 0) { + result += ","; + } + result += subTypes[i]->toString(); } - result += subTypes[i]->toString(); + result += ">"; + return result; } - result += ">"; - return result; - } - case DECIMAL: { - std::stringstream result; - result << "decimal(" << precision << "," << scale << ")"; - return result.str(); - } - case DATE: - return "date"; - case VARCHAR: { - std::stringstream result; - result << "varchar(" << maxLength << ")"; - return result.str(); - } - case CHAR: { - std::stringstream result; - result << "char(" << maxLength << ")"; - return result.str(); - } - default: - throw NotImplementedYet("Unknown type"); + case DECIMAL: { + std::stringstream result; + result << "decimal(" << precision << "," << scale << ")"; + return result.str(); + } + case DATE: + return "date"; + case VARCHAR: { + std::stringstream result; + result << "varchar(" << maxLength << ")"; + return result.str(); + } + case CHAR: { + std::stringstream result; + result << "char(" << maxLength << ")"; + return result.str(); + } + default: + throw 
NotImplementedYet("Unknown type"); } } - std::unique_ptr<ColumnVectorBatch> - TypeImpl::createRowBatch(uint64_t capacity, - MemoryPool& memoryPool, - bool encoded) const { + std::unique_ptr<ColumnVectorBatch> TypeImpl::createRowBatch(uint64_t capacity, + MemoryPool& memoryPool, + bool encoded) const { + return createRowBatch(capacity, memoryPool, encoded, /*useTightNumericVector=*/false); + } + + std::unique_ptr<ColumnVectorBatch> TypeImpl::createRowBatch(uint64_t capacity, + MemoryPool& memoryPool, bool encoded, + bool useTightNumericVector) const { switch (static_cast<int64_t>(kind)) { - case BOOLEAN: - case BYTE: - case SHORT: - case INT: - case LONG: - case DATE: - return std::unique_ptr<ColumnVectorBatch> - (new LongVectorBatch(capacity, memoryPool)); - - case FLOAT: - case DOUBLE: - return std::unique_ptr<ColumnVectorBatch> - (new DoubleVectorBatch(capacity, memoryPool)); - - case STRING: - case BINARY: - case CHAR: - case VARCHAR: - return encoded ? - std::unique_ptr<ColumnVectorBatch> - (new EncodedStringVectorBatch(capacity, memoryPool)) - : std::unique_ptr<ColumnVectorBatch> - (new StringVectorBatch(capacity, memoryPool)); - - case TIMESTAMP: - case TIMESTAMP_INSTANT: - return std::unique_ptr<ColumnVectorBatch> - (new TimestampVectorBatch(capacity, memoryPool)); - - case STRUCT: { - StructVectorBatch *result = new StructVectorBatch(capacity, memoryPool); - std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); - for(uint64_t i=0; i < getSubtypeCount(); ++i) { - result->fields.push_back(getSubtype(i)-> - createRowBatch(capacity, - memoryPool, encoded).release()); + case BOOLEAN: + if (useTightNumericVector) { + return std::make_unique<ByteVectorBatch>(capacity, memoryPool); + } + return std::make_unique<LongVectorBatch>(capacity, memoryPool); + case BYTE: + if (useTightNumericVector) { + return std::make_unique<ByteVectorBatch>(capacity, memoryPool); + } + return std::make_unique<LongVectorBatch>(capacity, 
memoryPool); + case SHORT: + if (useTightNumericVector) { + return std::make_unique<ShortVectorBatch>(capacity, memoryPool); + } + return std::make_unique<LongVectorBatch>(capacity, memoryPool); + case INT: + if (useTightNumericVector) { + return std::make_unique<IntVectorBatch>(capacity, memoryPool); + } + return std::make_unique<LongVectorBatch>(capacity, memoryPool); + case LONG: + case DATE: + return std::make_unique<LongVectorBatch>(capacity, memoryPool); + + case FLOAT: + if (useTightNumericVector) { + return std::make_unique<FloatVectorBatch>(capacity, memoryPool); + } + return std::make_unique<DoubleVectorBatch>(capacity, memoryPool); + case DOUBLE: + return std::make_unique<DoubleVectorBatch>(capacity, memoryPool); + + case STRING: + case BINARY: + case CHAR: + case VARCHAR: + return encoded ? std::make_unique<EncodedStringVectorBatch>(capacity, memoryPool) + : std::make_unique<StringVectorBatch>(capacity, memoryPool); + + case TIMESTAMP: + case TIMESTAMP_INSTANT: + return std::make_unique<TimestampVectorBatch>(capacity, memoryPool); + + case STRUCT: { + auto result = std::make_unique<StructVectorBatch>(capacity, memoryPool); + for (uint64_t i = 0; i < getSubtypeCount(); ++i) { + result->fields.push_back( + getSubtype(i) + ->createRowBatch(capacity, memoryPool, encoded, useTightNumericVector) + .release()); + } + return result; } - return return_value; - } - case LIST: { - ListVectorBatch* result = new ListVectorBatch(capacity, memoryPool); - std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); - if (getSubtype(0) != nullptr) { - result->elements = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded); + case LIST: { + auto result = std::make_unique<ListVectorBatch>(capacity, memoryPool); + if (getSubtype(0) != nullptr) { + result->elements = + getSubtype(0)->createRowBatch(capacity, memoryPool, encoded, useTightNumericVector); + } + return result; } - return return_value; - } - case MAP: { - MapVectorBatch* 
result = new MapVectorBatch(capacity, memoryPool); - std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); - if (getSubtype(0) != nullptr) { - result->keys = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded); - } - if (getSubtype(1) != nullptr) { - result->elements = getSubtype(1)->createRowBatch(capacity, memoryPool, encoded); + case MAP: { + auto result = std::make_unique<MapVectorBatch>(capacity, memoryPool); + if (getSubtype(0) != nullptr) { + result->keys = + getSubtype(0)->createRowBatch(capacity, memoryPool, encoded, useTightNumericVector); + } + if (getSubtype(1) != nullptr) { + result->elements = + getSubtype(1)->createRowBatch(capacity, memoryPool, encoded, useTightNumericVector); + } + return result; } - return return_value; - } - case DECIMAL: { - if (getPrecision() == 0 || getPrecision() > 18) { - return std::unique_ptr<ColumnVectorBatch> - (new Decimal128VectorBatch(capacity, memoryPool)); - } else { - return std::unique_ptr<ColumnVectorBatch> - (new Decimal64VectorBatch(capacity, memoryPool)); + case DECIMAL: { + if (getPrecision() == 0 || getPrecision() > 18) { + return std::make_unique<Decimal128VectorBatch>(capacity, memoryPool); + } else { + return std::make_unique<Decimal64VectorBatch>(capacity, memoryPool); + } } - } - case UNION: { - UnionVectorBatch *result = new UnionVectorBatch(capacity, memoryPool); - std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); - for(uint64_t i=0; i < getSubtypeCount(); ++i) { - result->children.push_back(getSubtype(i)->createRowBatch(capacity, - memoryPool, encoded) - .release()); + case UNION: { + auto result = std::make_unique<UnionVectorBatch>(capacity, memoryPool); + for (uint64_t i = 0; i < getSubtypeCount(); ++i) { + result->children.push_back( + getSubtype(i) + ->createRowBatch(capacity, memoryPool, encoded, useTightNumericVector) + .release()); + } + return result; } - return return_value; - } - default: - throw 
NotImplementedYet("not supported yet"); + default: + throw NotImplementedYet("not supported yet"); } } std::unique_ptr<Type> createPrimitiveType(TypeKind kind) { - return std::unique_ptr<Type>(new TypeImpl(kind)); + return std::make_unique<TypeImpl>(kind); } - std::unique_ptr<Type> createCharType(TypeKind kind, - uint64_t maxLength) { - return std::unique_ptr<Type>(new TypeImpl(kind, maxLength)); + std::unique_ptr<Type> createCharType(TypeKind kind, uint64_t maxLength) { + return std::make_unique<TypeImpl>(kind, maxLength); } - std::unique_ptr<Type> createDecimalType(uint64_t precision, - uint64_t scale) { - return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale)); + std::unique_ptr<Type> createDecimalType(uint64_t precision, uint64_t scale) { + return std::make_unique<TypeImpl>(DECIMAL, precision, scale); } std::unique_ptr<Type> createStructType() { - return std::unique_ptr<Type>(new TypeImpl(STRUCT)); + return std::make_unique<TypeImpl>(STRUCT); } std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements) { - TypeImpl* result = new TypeImpl(LIST); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); + auto result = std::make_unique<TypeImpl>(LIST); result->addChildType(std::move(elements)); - return return_value; + return result; } - std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key, - std::unique_ptr<Type> value) { - TypeImpl* result = new TypeImpl(MAP); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); + std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key, std::unique_ptr<Type> value) { + auto result = std::make_unique<TypeImpl>(MAP); result->addChildType(std::move(key)); result->addChildType(std::move(value)); - return return_value; + return result; } std::unique_ptr<Type> createUnionType() { - return std::unique_ptr<Type>(new TypeImpl(UNION)); + return std::make_unique<TypeImpl>(UNION); } std::string printProtobufMessage(const google::protobuf::Message& message); - 
std::unique_ptr<Type> convertType(const proto::Type& type, - const proto::Footer& footer) { + std::unique_ptr<Type> convertType(const proto::Type& type, const proto::Footer& footer) { std::unique_ptr<Type> ret; switch (static_cast<int64_t>(type.kind())) { - - case proto::Type_Kind_BOOLEAN: - case proto::Type_Kind_BYTE: - case proto::Type_Kind_SHORT: - case proto::Type_Kind_INT: - case proto::Type_Kind_LONG: - case proto::Type_Kind_FLOAT: - case proto::Type_Kind_DOUBLE: - case proto::Type_Kind_STRING: - case proto::Type_Kind_BINARY: - case proto::Type_Kind_TIMESTAMP: - case proto::Type_Kind_TIMESTAMP_INSTANT: - case proto::Type_Kind_DATE: - ret = std::unique_ptr<Type> - (new TypeImpl(static_cast<TypeKind>(type.kind()))); - break; - - case proto::Type_Kind_CHAR: - case proto::Type_Kind_VARCHAR: - ret = std::unique_ptr<Type> - (new TypeImpl(static_cast<TypeKind>(type.kind()), - type.maximumlength())); - break; - - case proto::Type_Kind_DECIMAL: - ret = std::unique_ptr<Type> - (new TypeImpl(DECIMAL, type.precision(), type.scale())); - break; - - case proto::Type_Kind_LIST: - case proto::Type_Kind_MAP: - case proto::Type_Kind_UNION: { - TypeImpl* result = new TypeImpl(static_cast<TypeKind>(type.kind())); - ret = std::unique_ptr<Type>(result); - if (type.kind() == proto::Type_Kind_LIST && type.subtypes_size() != 1) - throw ParseError("Illegal LIST type that doesn't contain one subtype"); - if (type.kind() == proto::Type_Kind_MAP && type.subtypes_size() != 2) - throw ParseError("Illegal MAP type that doesn't contain two subtypes"); - if (type.kind() == proto::Type_Kind_UNION && type.subtypes_size() == 0) - throw ParseError("Illegal UNION type that doesn't contain any subtypes"); - for(int i=0; i < type.subtypes_size(); ++i) { - result->addUnionChild(convertType(footer.types(static_cast<int> - (type.subtypes(i))), - footer)); + case proto::Type_Kind_BOOLEAN: + case proto::Type_Kind_BYTE: + case proto::Type_Kind_SHORT: + case proto::Type_Kind_INT: + case 
proto::Type_Kind_LONG: + case proto::Type_Kind_FLOAT: + case proto::Type_Kind_DOUBLE: + case proto::Type_Kind_STRING: + case proto::Type_Kind_BINARY: + case proto::Type_Kind_TIMESTAMP: + case proto::Type_Kind_TIMESTAMP_INSTANT: + case proto::Type_Kind_DATE: + ret = std::make_unique<TypeImpl>(static_cast<TypeKind>(type.kind())); + break; + + case proto::Type_Kind_CHAR: + case proto::Type_Kind_VARCHAR: + ret = std::make_unique<TypeImpl>(static_cast<TypeKind>(type.kind()), type.maximum_length()); + break; + + case proto::Type_Kind_DECIMAL: + ret = std::make_unique<TypeImpl>(DECIMAL, type.precision(), type.scale()); + break; + + case proto::Type_Kind_LIST: + case proto::Type_Kind_MAP: + case proto::Type_Kind_UNION: { + ret = std::make_unique<TypeImpl>(static_cast<TypeKind>(type.kind())); + if (type.kind() == proto::Type_Kind_LIST && type.subtypes_size() != 1) + throw ParseError("Illegal LIST type that doesn't contain one subtype"); + if (type.kind() == proto::Type_Kind_MAP && type.subtypes_size() != 2) + throw ParseError("Illegal MAP type that doesn't contain two subtypes"); + if (type.kind() == proto::Type_Kind_UNION && type.subtypes_size() == 0) + throw ParseError("Illegal UNION type that doesn't contain any subtypes"); + for (int i = 0; i < type.subtypes_size(); ++i) { + ret->addUnionChild(convertType(footer.types(static_cast<int>(type.subtypes(i))), footer)); + } + break; } - break; - } - - case proto::Type_Kind_STRUCT: { - TypeImpl* result = new TypeImpl(STRUCT); - ret = std::unique_ptr<Type>(result); - if (type.subtypes_size() > type.fieldnames_size()) - throw ParseError("Illegal STRUCT type that contains less fieldnames than subtypes"); - for(int i=0; i < type.subtypes_size(); ++i) { - result->addStructField(type.fieldnames(i), - convertType(footer.types(static_cast<int> - (type.subtypes(i))), - footer)); + + case proto::Type_Kind_STRUCT: { + ret = std::make_unique<TypeImpl>(STRUCT); + if (type.subtypes_size() > type.field_names_size()) + throw 
ParseError("Illegal STRUCT type that contains less field_names than subtypes"); + for (int i = 0; i < type.subtypes_size(); ++i) { + ret->addStructField( + type.field_names(i), + convertType(footer.types(static_cast<int>(type.subtypes(i))), footer)); + } + break; } - break; - } - default: - throw NotImplementedYet("Unknown type kind"); + default: + throw NotImplementedYet("Unknown type kind"); } for (int i = 0; i < type.attributes_size(); ++i) { const auto& attribute = type.attributes(i); @@ -493,143 +492,126 @@ namespace orc { * @param selected is each column by id selected * @return a clone of the fileType filtered by the selection array */ - std::unique_ptr<Type> buildSelectedType(const Type *fileType, - const std::vector<bool>& selected) { + std::unique_ptr<Type> buildSelectedType(const Type* fileType, const std::vector<bool>& selected) { if (fileType == nullptr || !selected[fileType->getColumnId()]) { - return std::unique_ptr<Type>(); + return nullptr; } - TypeImpl* result; + std::unique_ptr<TypeImpl> result; switch (static_cast<int>(fileType->getKind())) { - case BOOLEAN: - case BYTE: - case SHORT: - case INT: - case LONG: - case FLOAT: - case DOUBLE: - case STRING: - case BINARY: - case TIMESTAMP: - case TIMESTAMP_INSTANT: - case DATE: - result = new TypeImpl(fileType->getKind()); - break; - - case DECIMAL: - result= new TypeImpl(fileType->getKind(), - fileType->getPrecision(), fileType->getScale()); - break; - - case VARCHAR: - case CHAR: - result = new TypeImpl(fileType->getKind(), fileType->getMaximumLength()); - break; - - case LIST: - result = new TypeImpl(fileType->getKind()); - result->addChildType(buildSelectedType(fileType->getSubtype(0), - selected)); - break; - - case MAP: - result = new TypeImpl(fileType->getKind()); - result->addChildType(buildSelectedType(fileType->getSubtype(0), - selected)); - result->addChildType(buildSelectedType(fileType->getSubtype(1), - selected)); - break; - - case STRUCT: { - result = new TypeImpl(fileType->getKind()); 
- for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) { - std::unique_ptr<Type> childType = - buildSelectedType(fileType->getSubtype(child), selected); - if (childType.get() != nullptr) { - result->addStructField(fileType->getFieldName(child), - std::move(childType)); + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + case FLOAT: + case DOUBLE: + case STRING: + case BINARY: + case TIMESTAMP: + case TIMESTAMP_INSTANT: + case DATE: + result = std::make_unique<TypeImpl>(fileType->getKind()); + break; + + case DECIMAL: + result = std::make_unique<TypeImpl>(fileType->getKind(), fileType->getPrecision(), + fileType->getScale()); + break; + + case VARCHAR: + case CHAR: + result = std::make_unique<TypeImpl>(fileType->getKind(), fileType->getMaximumLength()); + break; + + case LIST: + result = std::make_unique<TypeImpl>(fileType->getKind()); + result->addChildType(buildSelectedType(fileType->getSubtype(0), selected)); + break; + + case MAP: + result = std::make_unique<TypeImpl>(fileType->getKind()); + result->addChildType(buildSelectedType(fileType->getSubtype(0), selected)); + result->addChildType(buildSelectedType(fileType->getSubtype(1), selected)); + break; + + case STRUCT: { + result = std::make_unique<TypeImpl>(fileType->getKind()); + for (uint64_t child = 0; child < fileType->getSubtypeCount(); ++child) { + std::unique_ptr<Type> childType = + buildSelectedType(fileType->getSubtype(child), selected); + if (childType.get() != nullptr) { + result->addStructField(fileType->getFieldName(child), std::move(childType)); + } } + break; } - break; - } - case UNION: { - result = new TypeImpl(fileType->getKind()); - for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) { - std::unique_ptr<Type> childType = - buildSelectedType(fileType->getSubtype(child), selected); - if (childType.get() != nullptr) { - result->addUnionChild(std::move(childType)); + case UNION: { + result = std::make_unique<TypeImpl>(fileType->getKind()); + for 
(uint64_t child = 0; child < fileType->getSubtypeCount(); ++child) { + std::unique_ptr<Type> childType = + buildSelectedType(fileType->getSubtype(child), selected); + if (childType.get() != nullptr) { + result->addUnionChild(std::move(childType)); + } } + break; } - break; - } - default: - throw NotImplementedYet("Unknown type kind"); + default: + throw NotImplementedYet("Unknown type kind"); } result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId()); for (auto& key : fileType->getAttributeKeys()) { const auto& value = fileType->getAttributeValue(key); result->setAttribute(key, value); } - return std::unique_ptr<Type>(result); + return result; } - ORC_UNIQUE_PTR<Type> Type::buildTypeFromString(const std::string& input) { + std::unique_ptr<Type> Type::buildTypeFromString(const std::string& input) { size_t size = input.size(); - std::pair<ORC_UNIQUE_PTR<Type>, size_t> res = - TypeImpl::parseType(input, 0, size); + std::pair<std::unique_ptr<Type>, size_t> res = TypeImpl::parseType(input, 0, size); if (res.second != size) { throw std::logic_error("Invalid type string."); } return std::move(res.first); } - std::unique_ptr<Type> TypeImpl::parseArrayType(const std::string &input, - size_t start, + std::unique_ptr<Type> TypeImpl::parseArrayType(const std::string& input, size_t start, size_t end) { - TypeImpl* arrayType = new TypeImpl(LIST); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(arrayType); + auto result = std::make_unique<TypeImpl>(LIST); if (input[start] != '<') { throw std::logic_error("Missing < after array."); } - std::pair<ORC_UNIQUE_PTR<Type>, size_t> res = - TypeImpl::parseType(input, start + 1, end); + std::pair<std::unique_ptr<Type>, size_t> res = TypeImpl::parseType(input, start + 1, end); if (res.second != end) { - throw std::logic_error( - "Array type must contain exactly one sub type."); + throw std::logic_error("Array type must contain exactly one sub type."); } - arrayType->addChildType(std::move(res.first)); - return 
return_value; + result->addChildType(std::move(res.first)); + return result; } - std::unique_ptr<Type> TypeImpl::parseMapType(const std::string &input, - size_t start, - size_t end) { - TypeImpl* mapType = new TypeImpl(MAP); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(mapType); + std::unique_ptr<Type> TypeImpl::parseMapType(const std::string& input, size_t start, size_t end) { + auto result = std::make_unique<TypeImpl>(MAP); if (input[start] != '<') { throw std::logic_error("Missing < after map."); } - std::pair<ORC_UNIQUE_PTR<Type>, size_t> key = - TypeImpl::parseType(input, start + 1, end); + std::pair<std::unique_ptr<Type>, size_t> key = TypeImpl::parseType(input, start + 1, end); if (input[key.second] != ',') { throw std::logic_error("Missing comma after key."); } - std::pair<ORC_UNIQUE_PTR<Type>, size_t> val = - TypeImpl::parseType(input, key.second + 1, end); + std::pair<std::unique_ptr<Type>, size_t> val = TypeImpl::parseType(input, key.second + 1, end); if (val.second != end) { - throw std::logic_error( - "Map type must contain exactly two sub types."); + throw std::logic_error("Map type must contain exactly two sub types."); } - mapType->addChildType(std::move(key.first)); - mapType->addChildType(std::move(val.first)); - return return_value; + result->addChildType(std::move(key.first)); + result->addChildType(std::move(val.first)); + return result; } - std::pair<std::string, size_t> TypeImpl::parseName(const std::string &input, - const size_t start, + std::pair<std::string, size_t> TypeImpl::parseName(const std::string& input, const size_t start, const size_t end) { size_t pos = start; if (input[pos] == '`') { @@ -638,7 +620,7 @@ namespace orc { while (pos < end) { char ch = input[++pos]; if (ch == '`') { - if (pos < end && input[pos+1] == '`') { + if (pos < end && input[pos + 1] == '`') { ++pos; oss.put('`'); } else { @@ -667,11 +649,9 @@ namespace orc { } } - std::unique_ptr<Type> TypeImpl::parseStructType(const std::string &input, - 
size_t start, + std::unique_ptr<Type> TypeImpl::parseStructType(const std::string& input, size_t start, size_t end) { - TypeImpl* structType = new TypeImpl(STRUCT); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(structType); + auto result = std::make_unique<TypeImpl>(STRUCT); size_t pos = start + 1; if (input[start] != '<') { throw std::logic_error("Missing < after struct."); @@ -682,9 +662,8 @@ namespace orc { if (input[pos] != ':') { throw std::logic_error("Invalid struct type. No field name set."); } - std::pair<ORC_UNIQUE_PTR<Type>, size_t> typeRes = - TypeImpl::parseType(input, ++pos, end); - structType->addStructField(nameRes.first, std::move(typeRes.first)); + std::pair<std::unique_ptr<Type>, size_t> typeRes = TypeImpl::parseType(input, ++pos, end); + result->addStructField(nameRes.first, std::move(typeRes.first)); pos = typeRes.second; if (pos != end && input[pos] != ',') { throw std::logic_error("Missing comma after field."); @@ -692,22 +671,19 @@ namespace orc { ++pos; } - return return_value; + return result; } - std::unique_ptr<Type> TypeImpl::parseUnionType(const std::string &input, - size_t start, + std::unique_ptr<Type> TypeImpl::parseUnionType(const std::string& input, size_t start, size_t end) { - TypeImpl* unionType = new TypeImpl(UNION); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(unionType); + auto result = std::make_unique<TypeImpl>(UNION); size_t pos = start + 1; if (input[start] != '<') { throw std::logic_error("Missing < after uniontype."); } while (pos < end) { - std::pair<ORC_UNIQUE_PTR<Type>, size_t> res = - TypeImpl::parseType(input, pos, end); - unionType->addChildType(std::move(res.first)); + std::pair<std::unique_ptr<Type>, size_t> res = TypeImpl::parseType(input, pos, end); + result->addChildType(std::move(res.first)); pos = res.second; if (pos != end && input[pos] != ',') { throw std::logic_error("Missing comma after union sub type."); @@ -715,11 +691,10 @@ namespace orc { ++pos; } - return 
return_value; + return result; } - std::unique_ptr<Type> TypeImpl::parseDecimalType(const std::string &input, - size_t start, + std::unique_ptr<Type> TypeImpl::parseDecimalType(const std::string& input, size_t start, size_t end) { if (input[start] != '(') { throw std::logic_error("Missing ( after decimal."); @@ -729,61 +704,54 @@ namespace orc { if (sep + 1 >= end || sep == std::string::npos) { throw std::logic_error("Decimal type must specify precision and scale."); } - uint64_t precision = - static_cast<uint64_t>(atoi(input.substr(pos, sep - pos).c_str())); - uint64_t scale = - static_cast<uint64_t>(atoi(input.substr(sep + 1, end - sep - 1).c_str())); - return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale)); + uint64_t precision = static_cast<uint64_t>(atoi(input.substr(pos, sep - pos).c_str())); + uint64_t scale = static_cast<uint64_t>(atoi(input.substr(sep + 1, end - sep - 1).c_str())); + return std::make_unique<TypeImpl>(DECIMAL, precision, scale); } - void validatePrimitiveType(std::string category, - const std::string &input, - const size_t pos) { + void validatePrimitiveType(std::string category, const std::string& input, const size_t pos) { if (input[pos] == '<' || input[pos] == '(') { std::ostringstream oss; - oss << "Invalid " << input[pos] << " after " - << category << " type."; + oss << "Invalid " << input[pos] << " after " << category << " type."; throw std::logic_error(oss.str()); } } - std::unique_ptr<Type> TypeImpl::parseCategory(std::string category, - const std::string &input, - size_t start, - size_t end) { + std::unique_ptr<Type> TypeImpl::parseCategory(std::string category, const std::string& input, + size_t start, size_t end) { if (category == "boolean") { validatePrimitiveType(category, input, start); - return std::unique_ptr<Type>(new TypeImpl(BOOLEAN)); + return std::make_unique<TypeImpl>(BOOLEAN); } else if (category == "tinyint") { validatePrimitiveType(category, input, start); - return std::unique_ptr<Type>(new 
TypeImpl(BYTE)); + return std::make_unique<TypeImpl>(BYTE); } else if (category == "smallint") { validatePrimitiveType(category, input, start); - return std::unique_ptr<Type>(new TypeImpl(SHORT)); + return std::make_unique<TypeImpl>(SHORT); } else if (category == "int") { validatePrimitiveType(category, input, start); - return std::unique_ptr<Type>(new TypeImpl(INT)); + return std::make_unique<TypeImpl>(INT); } else if (category == "bigint") { validatePrimitiveType(category, input, start); - return std::unique_ptr<Type>(new TypeImpl(LONG)); + return std::make_unique<TypeImpl>(LONG); } else if (category == "float") { validatePrimitiveType(category, input, start); - return std::unique_ptr<Type>(new TypeImpl(FLOAT)); + return std::make_unique<TypeImpl>(FLOAT); } else if (category == "double") { validatePrimitiveType(category, input, start); - return std::unique_ptr<Type>(new TypeImpl(DOUBLE)); + return std::make_unique<TypeImpl>(DOUBLE); } else if (category == "string") { validatePrimitiveType(category, input, start); - return std::unique_ptr<Type>(new TypeImpl(STRING)); + return std::make_unique<TypeImpl>(STRING); } else if (category == "binary") { validatePrimitiveType(category, input, start); - return std::unique_ptr<Type>(new TypeImpl(BINARY)); + return std::make_unique<TypeImpl>(BINARY); } else if (category == "timestamp") { validatePrimitiveType(category, input, start); - return std::unique_ptr<Type>(new TypeImpl(TIMESTAMP)); + return std::make_unique<TypeImpl>(TIMESTAMP); } else if (category == "timestamp with local time zone") { validatePrimitiveType(category, input, start); - return std::unique_ptr<Type>(new TypeImpl(TIMESTAMP_INSTANT)); + return std::make_unique<TypeImpl>(TIMESTAMP_INSTANT); } else if (category == "array") { return parseArrayType(input, start, end); } else if (category == "map") { @@ -796,27 +764,28 @@ namespace orc { return parseDecimalType(input, start, end); } else if (category == "date") { validatePrimitiveType(category, input, start); - 
return std::unique_ptr<Type>(new TypeImpl(DATE)); + return std::make_unique<TypeImpl>(DATE); } else if (category == "varchar") { if (input[start] != '(') { throw std::logic_error("Missing ( after varchar."); } - uint64_t maxLength = static_cast<uint64_t>( - atoi(input.substr(start + 1, end - start + 1).c_str())); - return std::unique_ptr<Type>(new TypeImpl(VARCHAR, maxLength)); + uint64_t maxLength = + static_cast<uint64_t>(atoi(input.substr(start + 1, end - start + 1).c_str())); + return std::make_unique<TypeImpl>(VARCHAR, maxLength); } else if (category == "char") { if (input[start] != '(') { throw std::logic_error("Missing ( after char."); } - uint64_t maxLength = static_cast<uint64_t>( - atoi(input.substr(start + 1, end - start + 1).c_str())); - return std::unique_ptr<Type>(new TypeImpl(CHAR, maxLength)); + uint64_t maxLength = + static_cast<uint64_t>(atoi(input.substr(start + 1, end - start + 1).c_str())); + return std::make_unique<TypeImpl>(CHAR, maxLength); } else { throw std::logic_error("Unknown type " + category); } } - std::pair<ORC_UNIQUE_PTR<Type>, size_t> TypeImpl::parseType(const std::string &input, size_t start, size_t end) { + std::pair<std::unique_ptr<Type>, size_t> TypeImpl::parseType(const std::string& input, + size_t start, size_t end) { size_t pos = start; while (pos < end && (isalpha(input[pos]) || input[pos] == ' ')) { ++pos; @@ -854,4 +823,18 @@ namespace orc { return std::make_pair(parseCategory(category, input, pos, nextPos), endPos); } -} + const Type* TypeImpl::getTypeByColumnId(uint64_t colIdx) const { + if (getColumnId() == colIdx) { + return this; + } + + for (uint64_t i = 0; i != getSubtypeCount(); ++i) { + const Type* ret = getSubtype(i)->getTypeByColumnId(colIdx); + if (ret != nullptr) { + return ret; + } + } + return nullptr; + } + +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.hh b/contrib/libs/apache/orc/c++/src/TypeImpl.hh index 88c4737d18..6d0743793a 100644 --- 
a/contrib/libs/apache/orc/c++/src/TypeImpl.hh +++ b/contrib/libs/apache/orc/c++/src/TypeImpl.hh @@ -28,8 +28,8 @@ namespace orc { - class TypeImpl: public Type { - private: + class TypeImpl : public Type { + private: TypeImpl* parent; mutable int64_t columnId; mutable int64_t maximumColumnId; @@ -42,7 +42,7 @@ namespace orc { uint64_t scale; std::map<std::string, std::string> attributes; - public: + public: /** * Create most of the primitive types. */ @@ -56,8 +56,7 @@ namespace orc { /** * Create decimal type. */ - TypeImpl(TypeKind kind, uint64_t precision, - uint64_t scale); + TypeImpl(TypeKind kind, uint64_t precision, uint64_t scale); uint64_t getColumnId() const override; @@ -77,8 +76,7 @@ namespace orc { uint64_t getScale() const override; - Type& setAttribute(const std::string& key, - const std::string& value) override; + Type& setAttribute(const std::string& key, const std::string& value) override; bool hasAttributeKey(const std::string& key) const override; @@ -90,14 +88,16 @@ namespace orc { std::string toString() const override; - Type* addStructField(const std::string& fieldName, - std::unique_ptr<Type> fieldType) override; + const Type* getTypeByColumnId(uint64_t colIdx) const override; + Type* addStructField(const std::string& fieldName, std::unique_ptr<Type> fieldType) override; Type* addUnionChild(std::unique_ptr<Type> fieldType) override; - std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size, - MemoryPool& memoryPool, - bool encoded = false - ) const override; + std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size, MemoryPool& memoryPool, + bool encoded = false) const override; + + std::unique_ptr<ColumnVectorBatch> createRowBatch( + uint64_t size, MemoryPool& memoryPool, bool encoded = false, + bool useTightNumericVector = false) const override; /** * Explicitly set the column ids. Only for internal usage. 
@@ -109,12 +109,10 @@ namespace orc { */ void addChildType(std::unique_ptr<Type> childType); - static std::pair<ORC_UNIQUE_PTR<Type>, size_t> parseType( - const std::string &input, - size_t start, - size_t end); + static std::pair<std::unique_ptr<Type>, size_t> parseType(const std::string& input, + size_t start, size_t end); - private: + private: /** * Assign ids to this node and its children giving this * node rootId. @@ -133,9 +131,7 @@ namespace orc { * @param start start position of the input string * @param end end position of the input string */ - static std::unique_ptr<Type> parseArrayType(const std::string &input, - size_t start, - size_t end); + static std::unique_ptr<Type> parseArrayType(const std::string& input, size_t start, size_t end); /** * Parse map type from string @@ -143,9 +139,7 @@ namespace orc { * @param start start position of the input string * @param end end position of the input string */ - static std::unique_ptr<Type> parseMapType(const std::string &input, - size_t start, - size_t end); + static std::unique_ptr<Type> parseMapType(const std::string& input, size_t start, size_t end); /** * Parse field name from string @@ -153,8 +147,7 @@ namespace orc { * @param start start position of the input string * @param end end position of the input string */ - static std::pair<std::string, size_t> parseName(const std::string &input, - const size_t start, + static std::pair<std::string, size_t> parseName(const std::string& input, const size_t start, const size_t end); /** @@ -163,8 +156,7 @@ namespace orc { * @param start start position of the input string * @param end end position of the input string */ - static std::unique_ptr<Type> parseStructType(const std::string &input, - size_t start, + static std::unique_ptr<Type> parseStructType(const std::string& input, size_t start, size_t end); /** @@ -173,9 +165,7 @@ namespace orc { * @param start start position of the input string * @param end end position of the input string */ - static 
std::unique_ptr<Type> parseUnionType(const std::string &input, - size_t start, - size_t end); + static std::unique_ptr<Type> parseUnionType(const std::string& input, size_t start, size_t end); /** * Parse decimal type from string @@ -183,8 +173,7 @@ namespace orc { * @param start start position of the input string * @param end end position of the input string */ - static std::unique_ptr<Type> parseDecimalType(const std::string &input, - size_t start, + static std::unique_ptr<Type> parseDecimalType(const std::string& input, size_t start, size_t end); /** @@ -194,14 +183,11 @@ namespace orc { * @param start start position of the input string * @param end end position of the input string */ - static std::unique_ptr<Type> parseCategory(std::string category, - const std::string &input, - size_t start, - size_t end); + static std::unique_ptr<Type> parseCategory(std::string category, const std::string& input, + size_t start, size_t end); }; - std::unique_ptr<Type> convertType(const proto::Type& type, - const proto::Footer& footer); + std::unique_ptr<Type> convertType(const proto::Type& type, const proto::Footer& footer); /** * Build a clone of the file type, projecting columns from the selected @@ -211,8 +197,7 @@ namespace orc { * @param selected is each column by id selected * @return a clone of the fileType filtered by the selection array */ - std::unique_ptr<Type> buildSelectedType(const Type *fileType, - const std::vector<bool>& selected); -} + std::unique_ptr<Type> buildSelectedType(const Type* fileType, const std::vector<bool>& selected); +} // namespace orc #endif diff --git a/contrib/libs/apache/orc/c++/src/Utils.hh b/contrib/libs/apache/orc/c++/src/Utils.hh new file mode 100644 index 0000000000..751c09b205 --- /dev/null +++ b/contrib/libs/apache/orc/c++/src/Utils.hh @@ -0,0 +1,75 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_UTILS_HH +#define ORC_UTILS_HH + +#include <atomic> +#include <chrono> + +namespace orc { + + class AutoStopwatch { + std::chrono::high_resolution_clock::time_point start; + std::atomic<uint64_t>* latencyUs; + std::atomic<uint64_t>* count; + bool minus; + + public: + AutoStopwatch(std::atomic<uint64_t>* _latencyUs, std::atomic<uint64_t>* _count, + bool _minus = false) + : latencyUs(_latencyUs), count(_count), minus(_minus) { + if (latencyUs) { + start = std::chrono::high_resolution_clock::now(); + } + } + + ~AutoStopwatch() { + if (latencyUs) { + std::chrono::microseconds elapsedTime = + std::chrono::duration_cast<std::chrono::microseconds>( + std::chrono::high_resolution_clock::now() - start); + if (!minus) { + latencyUs->fetch_add(static_cast<uint64_t>(elapsedTime.count())); + } else { + latencyUs->fetch_sub(static_cast<uint64_t>(elapsedTime.count())); + } + } + + if (count) { + count->fetch_add(1); + } + } + }; + +#if ENABLE_METRICS +#define SCOPED_STOPWATCH(METRICS_PTR, LATENCY_VAR, COUNT_VAR) \ + AutoStopwatch measure((METRICS_PTR == nullptr ? nullptr : &METRICS_PTR->LATENCY_VAR), \ + (METRICS_PTR == nullptr ? 
nullptr : &METRICS_PTR->COUNT_VAR)) + +#define SCOPED_MINUS_STOPWATCH(METRICS_PTR, LATENCY_VAR) \ + AutoStopwatch measure((METRICS_PTR == nullptr ? nullptr : &METRICS_PTR->LATENCY_VAR), nullptr, \ + true) +#else +#define SCOPED_STOPWATCH(METRICS_PTR, LATENCY_VAR, COUNT_VAR) +#define SCOPED_MINUS_STOPWATCH(METRICS_PTR, LATENCY_VAR) +#endif + +} // namespace orc + +#endif diff --git a/contrib/libs/apache/orc/c++/src/Vector.cc b/contrib/libs/apache/orc/c++/src/Vector.cc index fefaaad4b1..b9e2854586 100644 --- a/contrib/libs/apache/orc/c++/src/Vector.cc +++ b/contrib/libs/apache/orc/c++/src/Vector.cc @@ -20,21 +20,21 @@ #include "Adaptor.hh" #include "orc/Exceptions.hh" +#include "orc/MemoryPool.hh" +#include <cstdlib> #include <iostream> #include <sstream> -#include <cstdlib> namespace orc { - ColumnVectorBatch::ColumnVectorBatch(uint64_t cap, - MemoryPool& pool - ): capacity(cap), - numElements(0), - notNull(pool, cap), - hasNulls(false), - isEncoded(false), - memoryPool(pool) { + ColumnVectorBatch::ColumnVectorBatch(uint64_t cap, MemoryPool& pool) + : capacity(cap), + numElements(0), + notNull(pool, cap), + hasNulls(false), + isEncoded(false), + memoryPool(pool) { std::memset(notNull.data(), 1, capacity); } @@ -61,81 +61,13 @@ namespace orc { return false; } - LongVectorBatch::LongVectorBatch(uint64_t _capacity, MemoryPool& pool - ): ColumnVectorBatch(_capacity, pool), - data(pool, _capacity) { - // PASS - } - - LongVectorBatch::~LongVectorBatch() { - // PASS - } - - std::string LongVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Long vector <" << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void LongVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - data.resize(cap); - } - } - - void LongVectorBatch::clear() { - numElements = 0; - } - - uint64_t LongVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() + - static_cast<uint64_t>(data.capacity() * 
sizeof(int64_t)); - } - - DoubleVectorBatch::DoubleVectorBatch(uint64_t _capacity, MemoryPool& pool - ): ColumnVectorBatch(_capacity, pool), - data(pool, _capacity) { - // PASS - } - - DoubleVectorBatch::~DoubleVectorBatch() { - // PASS - } - - std::string DoubleVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Double vector <" << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void DoubleVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - data.resize(cap); - } - } - - void DoubleVectorBatch::clear() { - numElements = 0; - } - - uint64_t DoubleVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(data.capacity() * sizeof(double)); - } - StringDictionary::StringDictionary(MemoryPool& pool) - : dictionaryBlob(pool), - dictionaryOffset(pool) { + : dictionaryBlob(pool), dictionaryOffset(pool) { // PASS } - EncodedStringVectorBatch::EncodedStringVectorBatch(uint64_t _capacity, - MemoryPool& pool) - : StringVectorBatch(_capacity, pool), - dictionary(), - index(pool, _capacity) { + EncodedStringVectorBatch::EncodedStringVectorBatch(uint64_t _capacity, MemoryPool& pool) + : StringVectorBatch(_capacity, pool), dictionary(), index(pool, _capacity) { // PASS } @@ -156,11 +88,11 @@ namespace orc { } } - StringVectorBatch::StringVectorBatch(uint64_t _capacity, MemoryPool& pool - ): ColumnVectorBatch(_capacity, pool), - data(pool, _capacity), - length(pool, _capacity), - blob(pool) { + StringVectorBatch::StringVectorBatch(uint64_t _capacity, MemoryPool& pool) + : ColumnVectorBatch(_capacity, pool), + data(pool, _capacity), + length(pool, _capacity), + blob(pool) { // PASS } @@ -187,28 +119,27 @@ namespace orc { } uint64_t StringVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(data.capacity() * sizeof(char*) - + length.capacity() * sizeof(int64_t)); + return 
ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(data.capacity() * sizeof(char*) + + length.capacity() * sizeof(int64_t)); } - StructVectorBatch::StructVectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool) { + StructVectorBatch::StructVectorBatch(uint64_t cap, MemoryPool& pool) + : ColumnVectorBatch(cap, pool) { // PASS } StructVectorBatch::~StructVectorBatch() { - for (uint64_t i=0; i<this->fields.size(); i++) { + for (uint64_t i = 0; i < this->fields.size(); i++) { delete this->fields[i]; } } std::string StructVectorBatch::toString() const { std::ostringstream buffer; - buffer << "Struct vector <" << numElements << " of " << capacity - << "; "; - for(std::vector<ColumnVectorBatch*>::const_iterator ptr=fields.begin(); - ptr != fields.end(); ++ptr) { + buffer << "Struct vector <" << numElements << " of " << capacity << "; "; + for (std::vector<ColumnVectorBatch*>::const_iterator ptr = fields.begin(); ptr != fields.end(); + ++ptr) { buffer << (*ptr)->toString() << "; "; } buffer << ">"; @@ -220,7 +151,7 @@ namespace orc { } void StructVectorBatch::clear() { - for(size_t i=0; i < fields.size(); i++) { + for (size_t i = 0; i < fields.size(); i++) { fields[i]->clear(); } numElements = 0; @@ -228,14 +159,14 @@ namespace orc { uint64_t StructVectorBatch::getMemoryUsage() { uint64_t memory = ColumnVectorBatch::getMemoryUsage(); - for (unsigned int i=0; i < fields.size(); i++) { + for (unsigned int i = 0; i < fields.size(); i++) { memory += fields[i]->getMemoryUsage(); } return memory; } bool StructVectorBatch::hasVariableLength() { - for (unsigned int i=0; i < fields.size(); i++) { + for (unsigned int i = 0; i < fields.size(); i++) { if (fields[i]->hasVariableLength()) { return true; } @@ -243,10 +174,9 @@ namespace orc { return false; } - ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool), - offsets(pool, cap+1) { - // PASS + ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool) 
+ : ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) { + offsets.zeroOut(); } ListVectorBatch::~ListVectorBatch() { @@ -255,8 +185,8 @@ namespace orc { std::string ListVectorBatch::toString() const { std::ostringstream buffer; - buffer << "List vector <" << elements->toString() << " with " - << numElements << " of " << capacity << ">"; + buffer << "List vector <" << elements->toString() << " with " << numElements << " of " + << capacity << ">"; return buffer.str(); } @@ -273,19 +203,17 @@ namespace orc { } uint64_t ListVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) - + elements->getMemoryUsage(); + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) + elements->getMemoryUsage(); } bool ListVectorBatch::hasVariableLength() { return true; } - MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool), - offsets(pool, cap+1) { - // PASS + MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool) + : ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) { + offsets.zeroOut(); } MapVectorBatch::~MapVectorBatch() { @@ -294,9 +222,9 @@ namespace orc { std::string MapVectorBatch::toString() const { std::ostringstream buffer; - buffer << "Map vector <" << (keys ? keys->toString(): "key not selected") << ", " - << (elements ? elements->toString(): "value not selected") << " with " - << numElements << " of " << capacity << ">"; + buffer << "Map vector <" << (keys ? keys->toString() : "key not selected") << ", " + << (elements ? elements->toString() : "value not selected") << " with " << numElements + << " of " << capacity << ">"; return buffer.str(); } @@ -314,25 +242,23 @@ namespace orc { } uint64_t MapVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) - + (keys ? 
keys->getMemoryUsage() : 0) - + (elements ? elements->getMemoryUsage() : 0); + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) + + (keys ? keys->getMemoryUsage() : 0) + (elements ? elements->getMemoryUsage() : 0); } bool MapVectorBatch::hasVariableLength() { return true; } - UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool), - tags(pool, cap), - offsets(pool, cap) { - // PASS + UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool) + : ColumnVectorBatch(cap, pool), tags(pool, cap), offsets(pool, cap) { + tags.zeroOut(); + offsets.zeroOut(); } UnionVectorBatch::~UnionVectorBatch() { - for (uint64_t i=0; i < children.size(); i++) { + for (uint64_t i = 0; i < children.size(); i++) { delete children[i]; } } @@ -340,7 +266,7 @@ namespace orc { std::string UnionVectorBatch::toString() const { std::ostringstream buffer; buffer << "Union vector <"; - for(size_t i=0; i < children.size(); ++i) { + for (size_t i = 0; i < children.size(); ++i) { if (i != 0) { buffer << ", "; } @@ -359,24 +285,24 @@ namespace orc { } void UnionVectorBatch::clear() { - for(size_t i=0; i < children.size(); i++) { + for (size_t i = 0; i < children.size(); i++) { children[i]->clear(); } numElements = 0; } uint64_t UnionVectorBatch::getMemoryUsage() { - uint64_t memory = ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(tags.capacity() * sizeof(unsigned char) - + offsets.capacity() * sizeof(uint64_t)); - for(size_t i=0; i < children.size(); ++i) { + uint64_t memory = ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(tags.capacity() * sizeof(unsigned char) + + offsets.capacity() * sizeof(uint64_t)); + for (size_t i = 0; i < children.size(); ++i) { memory += children[i]->getMemoryUsage(); } return memory; } bool UnionVectorBatch::hasVariableLength() { - for(size_t i=0; i < children.size(); ++i) { + for (size_t i = 0; i < children.size(); ++i) { if 
(children[i]->hasVariableLength()) { return true; } @@ -384,12 +310,12 @@ namespace orc { return false; } - Decimal64VectorBatch::Decimal64VectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool), - precision(0), - scale(0), - values(pool, cap), - readScales(pool, cap) { + Decimal64VectorBatch::Decimal64VectorBatch(uint64_t cap, MemoryPool& pool) + : ColumnVectorBatch(cap, pool), + precision(0), + scale(0), + values(pool, cap), + readScales(pool, cap) { // PASS } @@ -399,8 +325,7 @@ namespace orc { std::string Decimal64VectorBatch::toString() const { std::ostringstream buffer; - buffer << "Decimal64 vector with " - << numElements << " of " << capacity << ">"; + buffer << "Decimal64 vector with " << numElements << " of " << capacity << ">"; return buffer.str(); } @@ -417,17 +342,16 @@ namespace orc { } uint64_t Decimal64VectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>( - (values.capacity() + readScales.capacity()) * sizeof(int64_t)); + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>((values.capacity() + readScales.capacity()) * sizeof(int64_t)); } - Decimal128VectorBatch::Decimal128VectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool), - precision(0), - scale(0), - values(pool, cap), - readScales(pool, cap) { + Decimal128VectorBatch::Decimal128VectorBatch(uint64_t cap, MemoryPool& pool) + : ColumnVectorBatch(cap, pool), + precision(0), + scale(0), + values(pool, cap), + readScales(pool, cap) { // PASS } @@ -437,8 +361,7 @@ namespace orc { std::string Decimal128VectorBatch::toString() const { std::ostringstream buffer; - buffer << "Decimal128 vector with " - << numElements << " of " << capacity << ">"; + buffer << "Decimal128 vector with " << numElements << " of " << capacity << ">"; return buffer.str(); } @@ -455,23 +378,22 @@ namespace orc { } uint64_t Decimal128VectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + 
static_cast<uint64_t>(values.capacity() * sizeof(Int128) - + readScales.capacity() * sizeof(int64_t)); + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(values.capacity() * sizeof(Int128) + + readScales.capacity() * sizeof(int64_t)); } - Decimal::Decimal(const Int128& _value, - int32_t _scale): value(_value), scale(_scale) { + Decimal::Decimal(const Int128& _value, int32_t _scale) : value(_value), scale(_scale) { // PASS } Decimal::Decimal(const std::string& str) { std::size_t foundPoint = str.find("."); // no decimal point, it is int - if(foundPoint == std::string::npos){ + if (foundPoint == std::string::npos) { value = Int128(str); scale = 0; - }else{ + } else { std::string copy(str); scale = static_cast<int32_t>(str.length() - foundPoint - 1); value = Int128(copy.replace(foundPoint, 1, "")); @@ -486,12 +408,8 @@ namespace orc { return value.toDecimalString(scale, trimTrailingZeros); } - TimestampVectorBatch::TimestampVectorBatch(uint64_t _capacity, - MemoryPool& pool - ): ColumnVectorBatch(_capacity, - pool), - data(pool, _capacity), - nanoseconds(pool, _capacity) { + TimestampVectorBatch::TimestampVectorBatch(uint64_t _capacity, MemoryPool& pool) + : ColumnVectorBatch(_capacity, pool), data(pool, _capacity), nanoseconds(pool, _capacity) { // PASS } @@ -518,8 +436,7 @@ namespace orc { } uint64_t TimestampVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>( - (data.capacity() + nanoseconds.capacity()) * sizeof(int64_t)); + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>((data.capacity() + nanoseconds.capacity()) * sizeof(int64_t)); } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/Writer.cc b/contrib/libs/apache/orc/c++/src/Writer.cc index 8a7d10ba81..19b71190a3 100644 --- a/contrib/libs/apache/orc/c++/src/Writer.cc +++ b/contrib/libs/apache/orc/c++/src/Writer.cc @@ -21,6 +21,7 @@ #include "ColumnWriter.hh" #include "Timezone.hh" +#include "Utils.hh" 
#include <memory> @@ -42,37 +43,41 @@ namespace orc { double bloomFilterFalsePositiveProb; BloomFilterVersion bloomFilterVersion; std::string timezone; + WriterMetrics* metrics; + bool useTightNumericVector; + uint64_t outputBufferCapacity; - WriterOptionsPrivate() : - fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12 - stripeSize = 64 * 1024 * 1024; // 64M - compressionBlockSize = 64 * 1024; // 64K + WriterOptionsPrivate() : fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12 + stripeSize = 64 * 1024 * 1024; // 64M + compressionBlockSize = 64 * 1024; // 64K rowIndexStride = 10000; - compression = CompressionKind_ZLIB; + compression = CompressionKind_ZSTD; compressionStrategy = CompressionStrategy_SPEED; memoryPool = getDefaultPool(); paddingTolerance = 0.0; errorStream = &std::cerr; dictionaryKeySizeThreshold = 0.0; enableIndex = true; - bloomFilterFalsePositiveProb = 0.05; + bloomFilterFalsePositiveProb = 0.01; bloomFilterVersion = UTF8; - //Writer timezone uses "GMT" by default to get rid of potential issues - //introduced by moving timestamps between different timezones. - //Explictly set the writer timezone if the use case depends on it. + // Writer timezone uses "GMT" by default to get rid of potential issues + // introduced by moving timestamps between different timezones. + // Explictly set the writer timezone if the use case depends on it. 
timezone = "GMT"; + metrics = nullptr; + useTightNumericVector = false; + outputBufferCapacity = 1024 * 1024; } }; - WriterOptions::WriterOptions(): - privateBits(std::unique_ptr<WriterOptionsPrivate> - (new WriterOptionsPrivate())) { + WriterOptions::WriterOptions() + : privateBits(std::unique_ptr<WriterOptionsPrivate>(new WriterOptionsPrivate())) { // PASS } - WriterOptions::WriterOptions(const WriterOptions& rhs): - privateBits(std::unique_ptr<WriterOptionsPrivate> - (new WriterOptionsPrivate(*(rhs.privateBits.get())))) { + WriterOptions::WriterOptions(const WriterOptions& rhs) + : privateBits(std::unique_ptr<WriterOptionsPrivate>( + new WriterOptionsPrivate(*(rhs.privateBits.get())))) { // PASS } @@ -92,8 +97,7 @@ namespace orc { // PASS } RleVersion WriterOptions::getRleVersion() const { - if(privateBits->fileVersion == FileVersion::v_0_11()) - { + if (privateBits->fileVersion == FileVersion::v_0_11()) { return RleVersion_1; } @@ -110,6 +114,9 @@ namespace orc { } WriterOptions& WriterOptions::setCompressionBlockSize(uint64_t size) { + if (size >= (1 << 23)) { + throw std::invalid_argument("Compression block size cannot be greater or equal than 8M"); + } privateBits->compressionBlockSize = size; return *this; } @@ -167,8 +174,7 @@ namespace orc { return privateBits->compression; } - WriterOptions& WriterOptions::setCompressionStrategy( - CompressionStrategy strategy) { + WriterOptions& WriterOptions::setCompressionStrategy(CompressionStrategy strategy) { privateBits->compressionStrategy = strategy; return *this; } @@ -216,8 +222,7 @@ namespace orc { return privateBits->dictionaryKeySizeThreshold > 0.0; } - WriterOptions& WriterOptions::setColumnsUseBloomFilter( - const std::set<uint64_t>& columns) { + WriterOptions& WriterOptions::setColumnsUseBloomFilter(const std::set<uint64_t>& columns) { privateBits->columnsUseBloomFilter = columns; return *this; } @@ -255,12 +260,39 @@ namespace orc { return *this; } + WriterMetrics* WriterOptions::getWriterMetrics() 
const { + return privateBits->metrics; + } + + WriterOptions& WriterOptions::setWriterMetrics(WriterMetrics* metrics) { + privateBits->metrics = metrics; + return *this; + } + + WriterOptions& WriterOptions::setUseTightNumericVector(bool useTightNumericVector) { + privateBits->useTightNumericVector = useTightNumericVector; + return *this; + } + + bool WriterOptions::getUseTightNumericVector() const { + return privateBits->useTightNumericVector; + } + + WriterOptions& WriterOptions::setOutputBufferCapacity(uint64_t capacity) { + privateBits->outputBufferCapacity = capacity; + return *this; + } + + uint64_t WriterOptions::getOutputBufferCapacity() const { + return privateBits->outputBufferCapacity; + } + Writer::~Writer() { // PASS } class WriterImpl : public Writer { - private: + private: std::unique_ptr<ColumnWriter> columnWriter; std::unique_ptr<BufferedOutputStream> compressionStream; std::unique_ptr<BufferedOutputStream> bufferedStream; @@ -277,23 +309,24 @@ namespace orc { static const char* magicId; static const WriterId writerId; + bool useTightNumericVector; + int32_t stripesAtLastFlush; + uint64_t lastFlushOffset; - public: - WriterImpl( - const Type& type, - OutputStream* stream, - const WriterOptions& options); + public: + WriterImpl(const Type& type, OutputStream* stream, const WriterOptions& options); - std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size) - const override; + std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size) const override; void add(ColumnVectorBatch& rowsToAdd) override; void close() override; - void addUserMetadata(const std::string name, const std::string value) override; + void addUserMetadata(const std::string& name, const std::string& value) override; - private: + uint64_t writeIntermediateFooter() override; + + private: void init(); void initStripe(); void writeStripe(); @@ -301,48 +334,41 @@ namespace orc { void writeFileFooter(); void writePostscript(); void buildFooterType(const Type& t, 
proto::Footer& footer, uint32_t& index); - static proto::CompressionKind convertCompressionKind( - const CompressionKind& kind); + static proto::CompressionKind convertCompressionKind(const CompressionKind& kind); }; - const char * WriterImpl::magicId = "ORC"; + const char* WriterImpl::magicId = "ORC"; const WriterId WriterImpl::writerId = WriterId::ORC_CPP_WRITER; - WriterImpl::WriterImpl( - const Type& t, - OutputStream* stream, - const WriterOptions& opts) : - outStream(stream), - options(opts), - type(t) { + WriterImpl::WriterImpl(const Type& t, OutputStream* stream, const WriterOptions& opts) + : outStream(stream), options(opts), type(t) { streamsFactory = createStreamsFactory(options, outStream); columnWriter = buildWriter(type, *streamsFactory, options); stripeRows = totalRows = indexRows = 0; currentOffset = 0; + stripesAtLastFlush = 0; + lastFlushOffset = 0; + + useTightNumericVector = opts.getUseTightNumericVector(); // compression stream for stripe footer, file footer and metadata - compressionStream = createCompressor( - options.getCompression(), - outStream, - options.getCompressionStrategy(), - 1 * 1024 * 1024, // buffer capacity: 1M - options.getCompressionBlockSize(), - *options.getMemoryPool()); + compressionStream = + createCompressor(options.getCompression(), outStream, options.getCompressionStrategy(), + options.getOutputBufferCapacity(), options.getCompressionBlockSize(), + *options.getMemoryPool(), options.getWriterMetrics()); // uncompressed stream for post script - bufferedStream.reset(new BufferedOutputStream( - *options.getMemoryPool(), - outStream, - 1024, // buffer capacity: 1024 bytes - options.getCompressionBlockSize())); + bufferedStream.reset(new BufferedOutputStream(*options.getMemoryPool(), outStream, + 1024, // buffer capacity: 1024 bytes + options.getCompressionBlockSize(), + options.getWriterMetrics())); init(); } - std::unique_ptr<ColumnVectorBatch> WriterImpl::createRowBatch(uint64_t size) - const { - return 
type.createRowBatch(size, *options.getMemoryPool()); + std::unique_ptr<ColumnVectorBatch> WriterImpl::createRowBatch(uint64_t size) const { + return type.createRowBatch(size, *options.getMemoryPool(), false, useTightNumericVector); } void WriterImpl::add(ColumnVectorBatch& rowsToAdd) { @@ -351,8 +377,7 @@ namespace orc { uint64_t chunkSize = 0; uint64_t rowIndexStride = options.getRowIndexStride(); while (pos < rowsToAdd.numElements) { - chunkSize = std::min(rowsToAdd.numElements - pos, - rowIndexStride - indexRows); + chunkSize = std::min(rowsToAdd.numElements - pos, rowIndexStride - indexRows); columnWriter->add(rowsToAdd, pos, chunkSize, nullptr); pos += chunkSize; @@ -384,7 +409,25 @@ namespace orc { outStream->close(); } - void WriterImpl::addUserMetadata(const std::string name, const std::string value){ + uint64_t WriterImpl::writeIntermediateFooter() { + if (stripeRows > 0) { + writeStripe(); + } + if (stripesAtLastFlush != fileFooter.stripes_size()) { + writeMetadata(); + writeFileFooter(); + writePostscript(); + stripesAtLastFlush = fileFooter.stripes_size(); + outStream->flush(); + lastFlushOffset = outStream->getLength(); + currentOffset = lastFlushOffset; + // init stripe now that we adjusted the currentOffset + initStripe(); + } + return lastFlushOffset; + } + + void WriterImpl::addUserMetadata(const std::string& name, const std::string& value) { proto::UserMetadataItem* userMetadataItem = fileFooter.add_metadata(); userMetadataItem->set_name(TString(name)); userMetadataItem->set_value(TString(value)); @@ -393,31 +436,32 @@ namespace orc { void WriterImpl::init() { // Write file header const static size_t magicIdLength = strlen(WriterImpl::magicId); - outStream->write(WriterImpl::magicId, magicIdLength); + { + SCOPED_STOPWATCH(options.getWriterMetrics(), IOBlockingLatencyUs, IOCount); + outStream->write(WriterImpl::magicId, magicIdLength); + } currentOffset += magicIdLength; // Initialize file footer - fileFooter.set_headerlength(currentOffset); - 
fileFooter.set_contentlength(0); - fileFooter.set_numberofrows(0); - fileFooter.set_rowindexstride( - static_cast<uint32_t>(options.getRowIndexStride())); + fileFooter.set_header_length(currentOffset); + fileFooter.set_content_length(0); + fileFooter.set_number_of_rows(0); + fileFooter.set_row_index_stride(static_cast<uint32_t>(options.getRowIndexStride())); fileFooter.set_writer(writerId); - fileFooter.set_softwareversion(ORC_VERSION); + fileFooter.set_software_version(ORC_VERSION); uint32_t index = 0; buildFooterType(type, fileFooter, index); // Initialize post script - postScript.set_footerlength(0); - postScript.set_compression( - WriterImpl::convertCompressionKind(options.getCompression())); - postScript.set_compressionblocksize(options.getCompressionBlockSize()); + postScript.set_footer_length(0); + postScript.set_compression(WriterImpl::convertCompressionKind(options.getCompression())); + postScript.set_compression_block_size(options.getCompressionBlockSize()); postScript.add_version(options.getFileVersion().getMajor()); postScript.add_version(options.getFileVersion().getMinor()); - postScript.set_writerversion(WriterVersion_ORC_135); + postScript.set_writer_version(WriterVersion_ORC_135); postScript.set_magic("ORC"); // Initialize first stripe @@ -426,10 +470,10 @@ namespace orc { void WriterImpl::initStripe() { stripeInfo.set_offset(currentOffset); - stripeInfo.set_indexlength(0); - stripeInfo.set_datalength(0); - stripeInfo.set_footerlength(0); - stripeInfo.set_numberofrows(0); + stripeInfo.set_index_length(0); + stripeInfo.set_data_length(0); + stripeInfo.set_footer_length(0); + stripeInfo.set_number_of_rows(0); stripeRows = indexRows = 0; } @@ -466,14 +510,14 @@ namespace orc { *stripeFooter.add_columns() = encodings[i]; } - stripeFooter.set_writertimezone(TString(options.getTimezoneName())); + stripeFooter.set_writer_timezone(TString(options.getTimezoneName())); // add stripe statistics to metadata - proto::StripeStatistics* stripeStats = 
metadata.add_stripestats(); + proto::StripeStatistics* stripeStats = metadata.add_stripe_stats(); std::vector<proto::ColumnStatistics> colStats; columnWriter->getStripeStatistics(colStats); for (uint32_t i = 0; i != colStats.size(); ++i) { - *stripeStats->add_colstats() = colStats[i]; + *stripeStats->add_col_stats() = colStats[i]; } // merge stripe stats into file stats and clear stripe stats columnWriter->mergeStripeStatsIntoFileStats(); @@ -496,10 +540,10 @@ namespace orc { } // update stripe info - stripeInfo.set_indexlength(indexLength); - stripeInfo.set_datalength(dataLength); - stripeInfo.set_footerlength(footerLength); - stripeInfo.set_numberofrows(stripeRows); + stripeInfo.set_index_length(indexLength); + stripeInfo.set_data_length(dataLength); + stripeInfo.set_footer_length(footerLength); + stripeInfo.set_number_of_rows(stripeRows); *fileFooter.add_stripes() = stripeInfo; @@ -515,16 +559,17 @@ namespace orc { if (!metadata.SerializeToZeroCopyStream(compressionStream.get())) { throw std::logic_error("Failed to write metadata."); } - postScript.set_metadatalength(compressionStream.get()->flush()); + postScript.set_metadata_length(compressionStream.get()->flush()); } void WriterImpl::writeFileFooter() { - fileFooter.set_contentlength(currentOffset - fileFooter.headerlength()); - fileFooter.set_numberofrows(totalRows); + fileFooter.set_content_length(currentOffset - fileFooter.header_length()); + fileFooter.set_number_of_rows(totalRows); // update file statistics std::vector<proto::ColumnStatistics> colStats; columnWriter->getFileStatistics(colStats); + fileFooter.clear_statistics(); for (uint32_t i = 0; i != colStats.size(); ++i) { *fileFooter.add_statistics() = colStats[i]; } @@ -532,106 +577,103 @@ namespace orc { if (!fileFooter.SerializeToZeroCopyStream(compressionStream.get())) { throw std::logic_error("Failed to write file footer."); } - postScript.set_footerlength(compressionStream->flush()); + postScript.set_footer_length(compressionStream->flush()); 
} void WriterImpl::writePostscript() { if (!postScript.SerializeToZeroCopyStream(bufferedStream.get())) { throw std::logic_error("Failed to write post script."); } - unsigned char psLength = - static_cast<unsigned char>(bufferedStream->flush()); + unsigned char psLength = static_cast<unsigned char>(bufferedStream->flush()); + SCOPED_STOPWATCH(options.getWriterMetrics(), IOBlockingLatencyUs, IOCount); outStream->write(&psLength, sizeof(unsigned char)); } - void WriterImpl::buildFooterType( - const Type& t, - proto::Footer& footer, - uint32_t & index) { + void WriterImpl::buildFooterType(const Type& t, proto::Footer& footer, uint32_t& index) { proto::Type protoType; - protoType.set_maximumlength(static_cast<uint32_t>(t.getMaximumLength())); + protoType.set_maximum_length(static_cast<uint32_t>(t.getMaximumLength())); protoType.set_precision(static_cast<uint32_t>(t.getPrecision())); protoType.set_scale(static_cast<uint32_t>(t.getScale())); switch (t.getKind()) { - case BOOLEAN: { - protoType.set_kind(proto::Type_Kind_BOOLEAN); - break; - } - case BYTE: { - protoType.set_kind(proto::Type_Kind_BYTE); - break; - } - case SHORT: { - protoType.set_kind(proto::Type_Kind_SHORT); - break; - } - case INT: { - protoType.set_kind(proto::Type_Kind_INT); - break; - } - case LONG: { - protoType.set_kind(proto::Type_Kind_LONG); - break; - } - case FLOAT: { - protoType.set_kind(proto::Type_Kind_FLOAT); - break; - } - case DOUBLE: { - protoType.set_kind(proto::Type_Kind_DOUBLE); - break; - } - case STRING: { - protoType.set_kind(proto::Type_Kind_STRING); - break; - } - case BINARY: { - protoType.set_kind(proto::Type_Kind_BINARY); - break; - } - case TIMESTAMP: { - protoType.set_kind(proto::Type_Kind_TIMESTAMP); - break; - } - case TIMESTAMP_INSTANT: { - protoType.set_kind(proto::Type_Kind_TIMESTAMP_INSTANT); - break; - } - case LIST: { - protoType.set_kind(proto::Type_Kind_LIST); - break; - } - case MAP: { - protoType.set_kind(proto::Type_Kind_MAP); - break; - } - case STRUCT: { - 
protoType.set_kind(proto::Type_Kind_STRUCT); - break; - } - case UNION: { - protoType.set_kind(proto::Type_Kind_UNION); - break; - } - case DECIMAL: { - protoType.set_kind(proto::Type_Kind_DECIMAL); - break; - } - case DATE: { - protoType.set_kind(proto::Type_Kind_DATE); - break; - } - case VARCHAR: { - protoType.set_kind(proto::Type_Kind_VARCHAR); - break; - } - case CHAR: { - protoType.set_kind(proto::Type_Kind_CHAR); - break; - } - default: - throw std::logic_error("Unknown type."); + case BOOLEAN: { + protoType.set_kind(proto::Type_Kind_BOOLEAN); + break; + } + case BYTE: { + protoType.set_kind(proto::Type_Kind_BYTE); + break; + } + case SHORT: { + protoType.set_kind(proto::Type_Kind_SHORT); + break; + } + case INT: { + protoType.set_kind(proto::Type_Kind_INT); + break; + } + case LONG: { + protoType.set_kind(proto::Type_Kind_LONG); + break; + } + case FLOAT: { + protoType.set_kind(proto::Type_Kind_FLOAT); + break; + } + case DOUBLE: { + protoType.set_kind(proto::Type_Kind_DOUBLE); + break; + } + case STRING: { + protoType.set_kind(proto::Type_Kind_STRING); + break; + } + case BINARY: { + protoType.set_kind(proto::Type_Kind_BINARY); + break; + } + case TIMESTAMP: { + protoType.set_kind(proto::Type_Kind_TIMESTAMP); + break; + } + case TIMESTAMP_INSTANT: { + protoType.set_kind(proto::Type_Kind_TIMESTAMP_INSTANT); + break; + } + case LIST: { + protoType.set_kind(proto::Type_Kind_LIST); + break; + } + case MAP: { + protoType.set_kind(proto::Type_Kind_MAP); + break; + } + case STRUCT: { + protoType.set_kind(proto::Type_Kind_STRUCT); + break; + } + case UNION: { + protoType.set_kind(proto::Type_Kind_UNION); + break; + } + case DECIMAL: { + protoType.set_kind(proto::Type_Kind_DECIMAL); + break; + } + case DATE: { + protoType.set_kind(proto::Type_Kind_DATE); + break; + } + case VARCHAR: { + protoType.set_kind(proto::Type_Kind_VARCHAR); + break; + } + case CHAR: { + protoType.set_kind(proto::Type_Kind_CHAR); + break; + } + default: + throw std::logic_error("Unknown 
type."); } for (auto& key : t.getAttributeKeys()) { @@ -647,28 +689,20 @@ namespace orc { for (uint64_t i = 0; i < t.getSubtypeCount(); ++i) { // only add subtypes' field names if this type is STRUCT if (t.getKind() == STRUCT) { - footer.mutable_types(pos)->add_fieldnames(TString(t.getFieldName(i))); + footer.mutable_types(pos)->add_field_names(TString(t.getFieldName(i))); } footer.mutable_types(pos)->add_subtypes(++index); buildFooterType(*t.getSubtype(i), footer, index); } } - proto::CompressionKind WriterImpl::convertCompressionKind( - const CompressionKind& kind) { + proto::CompressionKind WriterImpl::convertCompressionKind(const CompressionKind& kind) { return static_cast<proto::CompressionKind>(kind); } - std::unique_ptr<Writer> createWriter( - const Type& type, - OutputStream* stream, + std::unique_ptr<Writer> createWriter(const Type& type, OutputStream* stream, const WriterOptions& options) { - return std::unique_ptr<Writer>( - new WriterImpl( - type, - stream, - options)); + return std::unique_ptr<Writer>(new WriterImpl(type, stream, options)); } -} - +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.cc b/contrib/libs/apache/orc/c++/src/io/InputStream.cc index ec798d4ed7..3bf1781747 100644 --- a/contrib/libs/apache/orc/c++/src/io/InputStream.cc +++ b/contrib/libs/apache/orc/c++/src/io/InputStream.cc @@ -16,26 +16,22 @@ * limitations under the License. 
*/ -#include "orc/Exceptions.hh" #include "InputStream.hh" +#include "orc/Exceptions.hh" #include <algorithm> #include <iomanip> namespace orc { - void printBuffer(std::ostream& out, - const char *buffer, - uint64_t length) { + void printBuffer(std::ostream& out, const char* buffer, uint64_t length) { const uint64_t width = 24; out << std::hex; - for(uint64_t line = 0; line < (length + width - 1) / width; ++line) { + for (uint64_t line = 0; line < (length + width - 1) / width; ++line) { out << std::setfill('0') << std::setw(7) << (line * width); - for(uint64_t byte = 0; - byte < width && line * width + byte < length; ++byte) { + for (uint64_t byte = 0; byte < width && line * width + byte < length; ++byte) { out << " " << std::setfill('0') << std::setw(2) - << static_cast<uint64_t>(0xff & buffer[line * width + - byte]); + << static_cast<uint64_t>(0xff & buffer[line * width + byte]); } out << "\n"; } @@ -64,26 +60,23 @@ namespace orc { // PASS } - SeekableArrayInputStream::SeekableArrayInputStream - (const unsigned char* values, - uint64_t size, - uint64_t blkSize - ): data(reinterpret_cast<const char*>(values)) { + SeekableArrayInputStream::SeekableArrayInputStream(const unsigned char* values, uint64_t size, + uint64_t blkSize) + : data(reinterpret_cast<const char*>(values)) { length = size; position = 0; blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize); } - SeekableArrayInputStream::SeekableArrayInputStream(const char* values, - uint64_t size, - uint64_t blkSize - ): data(values) { + SeekableArrayInputStream::SeekableArrayInputStream(const char* values, uint64_t size, + uint64_t blkSize) + : data(values) { length = size; position = 0; blockSize = blkSize == 0 ? 
length : static_cast<uint64_t>(blkSize); } - bool SeekableArrayInputStream::Next(const void** buffer, int*size) { + bool SeekableArrayInputStream::Next(const void** buffer, int* size) { uint64_t currentSize = std::min(length - position, blockSize); if (currentSize > 0) { *buffer = data + position; @@ -137,19 +130,14 @@ namespace orc { return std::min(length, request == 0 ? 256 * 1024 : request); } - SeekableFileInputStream::SeekableFileInputStream(InputStream* stream, - uint64_t offset, - uint64_t byteCount, - MemoryPool& _pool, - uint64_t _blockSize - ):pool(_pool), - input(stream), - start(offset), - length(byteCount), - blockSize(computeBlock - (_blockSize, - length)) { - + SeekableFileInputStream::SeekableFileInputStream(InputStream* stream, uint64_t offset, + uint64_t byteCount, MemoryPool& _pool, + uint64_t _blockSize) + : pool(_pool), + input(stream), + start(offset), + length(byteCount), + blockSize(computeBlock(_blockSize, length)) { position = 0; buffer.reset(new DataBuffer<char>(pool)); pushBack = 0; @@ -159,7 +147,7 @@ namespace orc { // PASS } - bool SeekableFileInputStream::Next(const void** data, int*size) { + bool SeekableFileInputStream::Next(const void** data, int* size) { uint64_t bytesRead; if (pushBack != 0) { *data = buffer->data() + (buffer->size() - pushBack); @@ -168,7 +156,7 @@ namespace orc { bytesRead = std::min(length - position, blockSize); buffer->resize(bytesRead); if (bytesRead > 0) { - input->read(buffer->data(), bytesRead, start+position); + input->read(buffer->data(), bytesRead, start + position); *data = static_cast<void*>(buffer->data()); } } @@ -218,9 +206,8 @@ namespace orc { std::string SeekableFileInputStream::getName() const { std::ostringstream result; - result << input->getName() << " from " << start << " for " - << length; + result << input->getName() << " from " << start << " for " << length; return result.str(); } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.hh 
b/contrib/libs/apache/orc/c++/src/io/InputStream.hh index ab7ecedb44..33c64f8809 100644 --- a/contrib/libs/apache/orc/c++/src/io/InputStream.hh +++ b/contrib/libs/apache/orc/c++/src/io/InputStream.hh @@ -23,22 +23,21 @@ #include "orc/OrcFile.hh" #include "wrap/zero-copy-stream-wrapper.h" -#include <list> #include <fstream> #include <iostream> +#include <list> #include <sstream> #include <vector> namespace orc { - void printBuffer(std::ostream& out, - const char *buffer, - uint64_t length); + void printBuffer(std::ostream& out, const char* buffer, uint64_t length); class PositionProvider { - private: + private: std::list<uint64_t>::const_iterator position; - public: + + public: PositionProvider(const std::list<uint64_t>& positions); uint64_t next(); uint64_t current(); @@ -49,9 +48,9 @@ namespace orc { * By extending Google's class, we get the ability to pass it directly * to the protobuf readers. */ - class SeekableInputStream: public google::protobuf::io::ZeroCopyInputStream { - public: - virtual ~SeekableInputStream(); + class SeekableInputStream : public google::protobuf::io::ZeroCopyInputStream { + public: + ~SeekableInputStream() override; virtual void seek(PositionProvider& position) = 0; virtual std::string getName() const = 0; }; @@ -59,22 +58,18 @@ namespace orc { /** * Create a seekable input stream based on a memory range. 
*/ - class SeekableArrayInputStream: public SeekableInputStream { - private: + class SeekableArrayInputStream : public SeekableInputStream { + private: const char* data; uint64_t length; uint64_t position; uint64_t blockSize; - public: - SeekableArrayInputStream(const unsigned char* list, - uint64_t length, - uint64_t block_size = 0); - SeekableArrayInputStream(const char* list, - uint64_t length, - uint64_t block_size = 0); + public: + SeekableArrayInputStream(const unsigned char* list, uint64_t length, uint64_t block_size = 0); + SeekableArrayInputStream(const char* list, uint64_t length, uint64_t block_size = 0); virtual ~SeekableArrayInputStream() override; - virtual bool Next(const void** data, int*size) override; + virtual bool Next(const void** data, int* size) override; virtual void BackUp(int count) override; virtual bool Skip(int count) override; virtual int64_t ByteCount() const override; @@ -85,8 +80,8 @@ namespace orc { /** * Create a seekable input stream based on an input stream. 
*/ - class SeekableFileInputStream: public SeekableInputStream { - private: + class SeekableFileInputStream : public SeekableInputStream { + private: MemoryPool& pool; InputStream* const input; const uint64_t start; @@ -96,15 +91,12 @@ namespace orc { uint64_t position; uint64_t pushBack; - public: - SeekableFileInputStream(InputStream* input, - uint64_t offset, - uint64_t byteCount, - MemoryPool& pool, - uint64_t blockSize = 0); + public: + SeekableFileInputStream(InputStream* input, uint64_t offset, uint64_t byteCount, + MemoryPool& pool, uint64_t blockSize = 0); virtual ~SeekableFileInputStream() override; - virtual bool Next(const void** data, int*size) override; + virtual bool Next(const void** data, int* size) override; virtual void BackUp(int count) override; virtual bool Skip(int count) override; virtual int64_t ByteCount() const override; @@ -112,6 +104,6 @@ namespace orc { virtual std::string getName() const override; }; -} +} // namespace orc -#endif //ORC_INPUTSTREAM_HH +#endif // ORC_INPUTSTREAM_HH diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc index 14d5e5e7c4..7d9fb92206 100644 --- a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc +++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc @@ -16,8 +16,9 @@ * limitations under the License. 
*/ -#include "orc/Exceptions.hh" #include "OutputStream.hh" +#include "Utils.hh" +#include "orc/Exceptions.hh" #include <sstream> @@ -27,14 +28,11 @@ namespace orc { // PASS } - BufferedOutputStream::BufferedOutputStream( - MemoryPool& pool, - OutputStream * outStream, - uint64_t capacity_, - uint64_t blockSize_) - : outputStream(outStream), - blockSize(blockSize_) { - dataBuffer.reset(new DataBuffer<char>(pool)); + BufferedOutputStream::BufferedOutputStream(MemoryPool& pool, OutputStream* outStream, + uint64_t capacity_, uint64_t blockSize_, + WriterMetrics* metrics_) + : outputStream(outStream), blockSize(blockSize_), metrics(metrics_) { + dataBuffer.reset(new BlockBuffer(pool, blockSize)); dataBuffer->reserve(capacity_); } @@ -43,16 +41,12 @@ namespace orc { } bool BufferedOutputStream::Next(void** buffer, int* size) { - *size = static_cast<int>(blockSize); - uint64_t oldSize = dataBuffer->size(); - uint64_t newSize = oldSize + blockSize; - uint64_t newCapacity = dataBuffer->capacity(); - while (newCapacity < newSize) { - newCapacity += dataBuffer->capacity(); + auto block = dataBuffer->getNextBlock(); + if (block.data == nullptr) { + throw std::logic_error("Failed to get next buffer from block buffer."); } - dataBuffer->reserve(newCapacity); - dataBuffer->resize(newSize); - *buffer = dataBuffer->data() + oldSize; + *buffer = block.data; + *size = static_cast<int>(block.size); return true; } @@ -71,7 +65,7 @@ namespace orc { return static_cast<google::protobuf::int64>(dataBuffer->size()); } - bool BufferedOutputStream::WriteAliasedRaw(const void *, int) { + bool BufferedOutputStream::WriteAliasedRaw(const void*, int) { throw NotImplementedYet("WriteAliasedRaw is not supported."); } @@ -81,8 +75,7 @@ namespace orc { std::string BufferedOutputStream::getName() const { std::ostringstream result; - result << "BufferedOutputStream " << dataBuffer->size() << " of " - << dataBuffer->capacity(); + result << "BufferedOutputStream " << dataBuffer->size() << " of " << 
dataBuffer->capacity(); return result.str(); } @@ -92,7 +85,11 @@ namespace orc { uint64_t BufferedOutputStream::flush() { uint64_t dataSize = dataBuffer->size(); - outputStream->write(dataBuffer->data(), dataSize); + // flush data buffer into outputStream + if (dataSize > 0) { + SCOPED_STOPWATCH(metrics, IOBlockingLatencyUs, IOCount); + dataBuffer->writeTo(outputStream, metrics); + } dataBuffer->resize(0); return dataSize; } @@ -101,20 +98,16 @@ namespace orc { dataBuffer->resize(0); } - void AppendOnlyBufferedStream::write(const char * data, size_t size) { + void AppendOnlyBufferedStream::write(const char* data, size_t size) { size_t dataOffset = 0; while (size > 0) { if (bufferOffset == bufferLength) { - if (!outStream->Next( - reinterpret_cast<void **>(&buffer), - &bufferLength)) { + if (!outStream->Next(reinterpret_cast<void**>(&buffer), &bufferLength)) { throw std::logic_error("Failed to allocate buffer."); } bufferOffset = 0; } - size_t len = std::min( - static_cast<size_t>(bufferLength - bufferOffset), - size); + size_t len = std::min(static_cast<size_t>(bufferLength - bufferOffset), size); memcpy(buffer + bufferOffset, data + dataOffset, len); bufferOffset += static_cast<int>(len); dataOffset += len; @@ -148,4 +141,4 @@ namespace orc { } } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh index 0fb92465e9..d8bc21ce6d 100644 --- a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh +++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh @@ -20,6 +20,7 @@ #define ORC_OUTPUTSTREAM_HH #include "Adaptor.hh" +#include "BlockBuffer.hh" #include "orc/OrcFile.hh" #include "wrap/zero-copy-stream-wrapper.h" @@ -27,36 +28,41 @@ namespace orc { /** * Record write position for creating index stream - */ + */ class PositionRecorder { - public: + public: virtual ~PositionRecorder(); virtual void add(uint64_t pos) = 0; }; + DIAGNOSTIC_PUSH + +#ifdef __clang__ + 
DIAGNOSTIC_IGNORE("-Wunused-private-field") +#endif + struct WriterMetrics; /** * A subclass of Google's ZeroCopyOutputStream that supports output to memory * buffer, and flushing to OutputStream. * By extending Google's class, we get the ability to pass it directly * to the protobuf writers. */ - class BufferedOutputStream: public google::protobuf::io::ZeroCopyOutputStream { - private: - OutputStream * outputStream; - std::unique_ptr<DataBuffer<char> > dataBuffer; + class BufferedOutputStream : public google::protobuf::io::ZeroCopyOutputStream { + private: + OutputStream* outputStream; + std::unique_ptr<BlockBuffer> dataBuffer; uint64_t blockSize; + WriterMetrics* metrics; - public: - BufferedOutputStream(MemoryPool& pool, - OutputStream * outStream, - uint64_t capacity, - uint64_t block_size); + public: + BufferedOutputStream(MemoryPool& pool, OutputStream* outStream, uint64_t capacity, + uint64_t block_size, WriterMetrics* metrics); virtual ~BufferedOutputStream() override; - virtual bool Next(void** data, int*size) override; + virtual bool Next(void** data, int* size) override; virtual void BackUp(int count) override; virtual int64_t ByteCount() const override; - virtual bool WriteAliasedRaw(const void * data, int size) override; + virtual bool WriteAliasedRaw(const void* data, int size) override; virtual bool AllowsAliasing() const override; virtual std::string getName() const; @@ -64,8 +70,11 @@ namespace orc { virtual uint64_t flush(); virtual void suppress(); - virtual bool isCompressed() const { return false; } + virtual bool isCompressed() const { + return false; + } }; + DIAGNOSTIC_POP /** * An append only buffered stream that allows @@ -74,24 +83,24 @@ namespace orc { * to the protobuf writers. 
*/ class AppendOnlyBufferedStream { - private: + private: std::unique_ptr<BufferedOutputStream> outStream; - char * buffer; + char* buffer; int bufferOffset, bufferLength; - public: - AppendOnlyBufferedStream(std::unique_ptr<BufferedOutputStream> _outStream) : - outStream(std::move(_outStream)) { + public: + AppendOnlyBufferedStream(std::unique_ptr<BufferedOutputStream> _outStream) + : outStream(std::move(_outStream)) { buffer = nullptr; bufferOffset = bufferLength = 0; } - void write(const char * data, size_t size); + void write(const char* data, size_t size); uint64_t getSize() const; uint64_t flush(); void recordPosition(PositionRecorder* recorder) const; }; -} +} // namespace orc -#endif // ORC_OUTPUTSTREAM_HH +#endif // ORC_OUTPUTSTREAM_HH diff --git a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc index e7d87083d8..9176c1f6c3 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc +++ b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc @@ -24,41 +24,28 @@ namespace orc { ExpressionTree::ExpressionTree(Operator op) - : mOperator(op) - , mLeaf(UNUSED_LEAF) - , mConstant(TruthValue::YES_NO_NULL) { - } - + : mOperator(op), mLeaf(UNUSED_LEAF), mConstant(TruthValue::YES_NO_NULL) {} - ExpressionTree::ExpressionTree(Operator op, - std::initializer_list<TreeNode> children) - : mOperator(op) - , mChildren(children.begin(), children.end()) - , mLeaf(UNUSED_LEAF) - , mConstant(TruthValue::YES_NO_NULL) { + ExpressionTree::ExpressionTree(Operator op, std::initializer_list<TreeNode> children) + : mOperator(op), + mChildren(children.begin(), children.end()), + mLeaf(UNUSED_LEAF), + mConstant(TruthValue::YES_NO_NULL) { // PASS } ExpressionTree::ExpressionTree(size_t leaf) - : mOperator(Operator::LEAF) - , mChildren() - , mLeaf(leaf) - , mConstant(TruthValue::YES_NO_NULL) { + : mOperator(Operator::LEAF), mChildren(), mLeaf(leaf), mConstant(TruthValue::YES_NO_NULL) { // PASS } 
ExpressionTree::ExpressionTree(TruthValue constant) - : mOperator(Operator::CONSTANT) - , mChildren() - , mLeaf(UNUSED_LEAF) - , mConstant(constant) { + : mOperator(Operator::CONSTANT), mChildren(), mLeaf(UNUSED_LEAF), mConstant(constant) { // PASS } ExpressionTree::ExpressionTree(const ExpressionTree& other) - : mOperator(other.mOperator) - , mLeaf(other.mLeaf) - , mConstant(other.mConstant) { + : mOperator(other.mOperator), mLeaf(other.mLeaf), mConstant(other.mConstant) { for (TreeNode child : other.mChildren) { mChildren.emplace_back(std::make_shared<ExpressionTree>(*child)); } @@ -74,7 +61,7 @@ namespace orc { std::vector<TreeNode>& ExpressionTree::getChildren() { return const_cast<std::vector<TreeNode>&>( - const_cast<const ExpressionTree *>(this)->getChildren()); + const_cast<const ExpressionTree*>(this)->getChildren()); } const TreeNode ExpressionTree::getChild(size_t i) const { @@ -83,7 +70,7 @@ namespace orc { TreeNode ExpressionTree::getChild(size_t i) { return std::const_pointer_cast<ExpressionTree>( - const_cast<const ExpressionTree *>(this)->getChild(i)); + const_cast<const ExpressionTree*>(this)->getChild(i)); } TruthValue ExpressionTree::getConstant() const { @@ -105,20 +92,17 @@ namespace orc { mChildren.push_back(child); } - TruthValue ExpressionTree::evaluate( - const std::vector<TruthValue>& leaves) const { + TruthValue ExpressionTree::evaluate(const std::vector<TruthValue>& leaves) const { TruthValue result; switch (mOperator) { - case Operator::OR: - { + case Operator::OR: { result = mChildren.at(0)->evaluate(leaves); for (size_t i = 1; i < mChildren.size() && !isNeeded(result); ++i) { result = mChildren.at(i)->evaluate(leaves) || result; } return result; } - case Operator::AND: - { + case Operator::AND: { result = mChildren.at(0)->evaluate(leaves); for (size_t i = 1; i < mChildren.size() && isNeeded(result); ++i) { result = mChildren.at(i)->evaluate(leaves) && result; @@ -189,4 +173,4 @@ namespace orc { return sstream.str(); } -} // namespace 
orc +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh index bb3d16e924..3e0b331a2d 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh +++ b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.hh @@ -40,7 +40,7 @@ namespace orc { * the SearchArgument into an internal form. */ class ExpressionTree { - public: + public: enum class Operator { OR, AND, NOT, LEAF, CONSTANT }; ExpressionTree(Operator op); @@ -73,13 +73,13 @@ namespace orc { TruthValue evaluate(const std::vector<TruthValue>& leaves) const; - private: + private: Operator mOperator; std::vector<TreeNode> mChildren; size_t mLeaf; TruthValue mConstant; }; -} // namespace orc +} // namespace orc -#endif //ORC_EXPRESSIONTREE_HH +#endif // ORC_EXPRESSIONTREE_HH diff --git a/contrib/libs/apache/orc/c++/src/sargs/Literal.cc b/contrib/libs/apache/orc/c++/src/sargs/Literal.cc index da4cdd0d47..c0cdd62201 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/Literal.cc +++ b/contrib/libs/apache/orc/c++/src/sargs/Literal.cc @@ -78,7 +78,7 @@ namespace orc { mHashCode = hashCode(); } - Literal::Literal(const char * str, size_t size) { + Literal::Literal(const char* str, size_t size) { mType = PredicateDataType::STRING; mValue.Buffer = new char[size]; memcpy(mValue.Buffer, str, size); @@ -110,10 +110,8 @@ namespace orc { mHashCode = hashCode(); } - Literal::Literal(const Literal& r): mType(r.mType) - , mSize(r.mSize) - , mIsNull(r.mIsNull) - , mHashCode(r.mHashCode) { + Literal::Literal(const Literal& r) + : mType(r.mType), mSize(r.mSize), mIsNull(r.mIsNull), mHashCode(r.mHashCode) { if (mType == PredicateDataType::STRING) { mValue.Buffer = new char[r.mSize]; memcpy(mValue.Buffer, r.mValue.Buffer, r.mSize); @@ -134,7 +132,7 @@ namespace orc { Literal::~Literal() { if (mType == PredicateDataType::STRING && mValue.Buffer) { - delete [] mValue.Buffer; + delete[] mValue.Buffer; mValue.Buffer = nullptr; } } @@ 
-142,7 +140,7 @@ namespace orc { Literal& Literal::operator=(const Literal& r) { if (this != &r) { if (mType == PredicateDataType::STRING && mValue.Buffer) { - delete [] mValue.Buffer; + delete[] mValue.Buffer; mValue.Buffer = nullptr; } @@ -178,8 +176,7 @@ namespace orc { sstream << mValue.DateVal; break; case PredicateDataType::TIMESTAMP: - sstream << mValue.TimeStampVal.second << "." - << mValue.TimeStampVal.nanos; + sstream << mValue.TimeStampVal.second << "." << mValue.TimeStampVal.nanos; break; case PredicateDataType::FLOAT: sstream << mValue.DoubleVal; @@ -209,14 +206,13 @@ namespace orc { return std::hash<int64_t>{}(mValue.DateVal); case PredicateDataType::TIMESTAMP: return std::hash<int64_t>{}(mValue.TimeStampVal.second) * 17 + - std::hash<int32_t>{}(mValue.TimeStampVal.nanos); + std::hash<int32_t>{}(mValue.TimeStampVal.nanos); case PredicateDataType::FLOAT: return std::hash<double>{}(mValue.DoubleVal); case PredicateDataType::BOOLEAN: return std::hash<bool>{}(mValue.BooleanVal); case PredicateDataType::STRING: - return std::hash<std::string>{}( - std::string(mValue.Buffer, mSize)); + return std::hash<std::string>{}(std::string(mValue.Buffer, mSize)); case PredicateDataType::DECIMAL: // current glibc does not support hash<int128_t> return std::hash<int64_t>{}(mValue.IntVal); @@ -246,12 +242,11 @@ namespace orc { return mValue.TimeStampVal == r.mValue.TimeStampVal; case PredicateDataType::FLOAT: return std::fabs(mValue.DoubleVal - r.mValue.DoubleVal) < - std::numeric_limits<double>::epsilon(); + std::numeric_limits<double>::epsilon(); case PredicateDataType::BOOLEAN: return mValue.BooleanVal == r.mValue.BooleanVal; case PredicateDataType::STRING: - return mSize == r.mSize && memcmp( - mValue.Buffer, r.mValue.Buffer, mSize) == 0; + return mSize == r.mSize && memcmp(mValue.Buffer, r.mValue.Buffer, mSize) == 0; case PredicateDataType::DECIMAL: return mValue.DecimalVal == r.mValue.DecimalVal; default: @@ -263,8 +258,7 @@ namespace orc { return !(*this == r); } 
- inline void validate(const bool& isNull, - const PredicateDataType& type, + inline void validate(const bool& isNull, const PredicateDataType& type, const PredicateDataType& expected) { if (isNull) { throw std::logic_error("cannot get value when it is null!"); @@ -309,4 +303,4 @@ namespace orc { return Decimal(mValue.DecimalVal, mScale); } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc index 3b012cece4..5fceedd854 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc +++ b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc @@ -16,10 +16,10 @@ * limitations under the License. */ +#include "PredicateLeaf.hh" #include "orc/BloomFilter.hh" #include "orc/Common.hh" #include "orc/Type.hh" -#include "PredicateLeaf.hh" #include <algorithm> #include <functional> @@ -28,81 +28,62 @@ namespace orc { - PredicateLeaf::PredicateLeaf(Operator op, - PredicateDataType type, - const std::string& colName, + PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName, Literal literal) - : mOperator(op) - , mType(type) - , mColumnName(colName) - , mHasColumnName(true) - , mColumnId(0) { + : mOperator(op), mType(type), mColumnName(colName), mHasColumnName(true), mColumnId(0) { mLiterals.emplace_back(literal); mHashCode = hashCode(); validate(); } - PredicateLeaf::PredicateLeaf(Operator op, - PredicateDataType type, - uint64_t columnId, + PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId, Literal literal) - : mOperator(op) - , mType(type) - , mHasColumnName(false) - , mColumnId(columnId) { + : mOperator(op), mType(type), mHasColumnName(false), mColumnId(columnId) { mLiterals.emplace_back(literal); mHashCode = hashCode(); validate(); } - PredicateLeaf::PredicateLeaf(Operator op, - PredicateDataType type, - const std::string& colName, + PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, 
const std::string& colName, const std::initializer_list<Literal>& literals) - : mOperator(op) - , mType(type) - , mColumnName(colName) - , mHasColumnName(true) - , mLiterals(literals.begin(), literals.end()) { + : mOperator(op), + mType(type), + mColumnName(colName), + mHasColumnName(true), + mLiterals(literals.begin(), literals.end()) { mHashCode = hashCode(); validate(); } - PredicateLeaf::PredicateLeaf(Operator op, - PredicateDataType type, - uint64_t columnId, + PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId, const std::initializer_list<Literal>& literals) - : mOperator(op) - , mType(type) - , mHasColumnName(false) - , mColumnId(columnId) - , mLiterals(literals.begin(), literals.end()) { + : mOperator(op), + mType(type), + mHasColumnName(false), + mColumnId(columnId), + mLiterals(literals.begin(), literals.end()) { mHashCode = hashCode(); validate(); } - PredicateLeaf::PredicateLeaf(Operator op, - PredicateDataType type, - const std::string& colName, + PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName, const std::vector<Literal>& literals) - : mOperator(op) - , mType(type) - , mColumnName(colName) - , mHasColumnName(true) - , mLiterals(literals.begin(), literals.end()) { + : mOperator(op), + mType(type), + mColumnName(colName), + mHasColumnName(true), + mLiterals(literals.begin(), literals.end()) { mHashCode = hashCode(); validate(); } - PredicateLeaf::PredicateLeaf(Operator op, - PredicateDataType type, - uint64_t columnId, + PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId, const std::vector<Literal>& literals) - : mOperator(op) - , mType(type) - , mHasColumnName(false) - , mColumnId(columnId) - , mLiterals(literals.begin(), literals.end()) { + : mOperator(op), + mType(type), + mHasColumnName(false), + mColumnId(columnId), + mLiterals(literals.begin(), literals.end()) { mHashCode = hashCode(); validate(); } @@ -131,8 +112,7 @@ namespace orc { if 
(mLiterals.size() != 1) { throw std::invalid_argument("One literal is required!"); } - if (static_cast<int>(mLiterals.at(0).getType()) != - static_cast<int>(mType)) { + if (static_cast<int>(mLiterals.at(0).getType()) != static_cast<int>(mType)) { throw std::invalid_argument("leaf and literal types do not match!"); } break; @@ -232,8 +212,7 @@ namespace orc { sstream << columnDebugString() << " = " << getLiteralString(mLiterals); break; case Operator::NULL_SAFE_EQUALS: - sstream << columnDebugString() << " null_safe_= " - << getLiteralString(mLiterals); + sstream << columnDebugString() << " null_safe_= " << getLiteralString(mLiterals); break; case Operator::LESS_THAN: sstream << columnDebugString() << " < " << getLiteralString(mLiterals); @@ -248,9 +227,8 @@ namespace orc { sstream << columnDebugString() << " between " << getLiteralsString(mLiterals); break; default: - sstream << "unknown operator, column: " - << columnDebugString() << ", literals: " - << getLiteralsString(mLiterals); + sstream << "unknown operator, column: " << columnDebugString() + << ", literals: " << getLiteralsString(mLiterals); } sstream << ')'; return sstream.str(); @@ -259,16 +237,11 @@ namespace orc { size_t PredicateLeaf::hashCode() const { size_t value = 0; std::for_each(mLiterals.cbegin(), mLiterals.cend(), - [&](const Literal& literal) { - value = value * 17 + literal.getHashCode(); - }); - auto colHash = mHasColumnName ? - std::hash<std::string>{}(mColumnName) : - std::hash<uint64_t>{}(mColumnId); - return value * 103 * 101 * 3 * 17 + - std::hash<int>{}(static_cast<int>(mOperator)) + - std::hash<int>{}(static_cast<int>(mType)) * 17 + - colHash * 3 * 17; + [&](const Literal& literal) { value = value * 17 + literal.getHashCode(); }); + auto colHash = + mHasColumnName ? 
std::hash<std::string>{}(mColumnName) : std::hash<uint64_t>{}(mColumnId); + return value * 103 * 101 * 3 * 17 + std::hash<int>{}(static_cast<int>(mOperator)) + + std::hash<int>{}(static_cast<int>(mType)) * 17 + colHash * 3 * 17; } bool PredicateLeaf::operator==(const PredicateLeaf& r) const { @@ -289,9 +262,7 @@ namespace orc { } // enum to mark the position of predicate in the range - enum class Location { - BEFORE, MIN, MIDDLE, MAX, AFTER - }; + enum class Location { BEFORE, MIN, MIDDLE, MAX, AFTER }; DIAGNOSTIC_PUSH DIAGNOSTIC_IGNORE("-Wfloat-equal") @@ -331,11 +302,8 @@ namespace orc { * @return the TruthValue result of the test */ template <typename T> - TruthValue evaluatePredicateRange(const PredicateLeaf::Operator op, - const std::vector<T>& values, - const T& minValue, - const T& maxValue, - bool hasNull) { + TruthValue evaluatePredicateRange(const PredicateLeaf::Operator op, const std::vector<T>& values, + const T& minValue, const T& maxValue, bool hasNull) { Location loc; switch (op) { case PredicateLeaf::Operator::NULL_SAFE_EQUALS: @@ -387,8 +355,7 @@ namespace orc { // are all of the values outside of the range? for (auto& value : values) { loc = compareToRange(value, minValue, maxValue); - if (loc == Location::MIN || loc == Location::MIDDLE || - loc == Location::MAX) { + if (loc == Location::MIN || loc == Location::MIDDLE || loc == Location::MAX) { return hasNull ? 
TruthValue::YES_NO_NULL : TruthValue::YES_NO; } } @@ -423,19 +390,17 @@ namespace orc { DIAGNOSTIC_POP - static TruthValue evaluateBoolPredicate( - const PredicateLeaf::Operator op, - const std::vector<Literal>& literals, - const proto::ColumnStatistics& stats) { - bool hasNull = stats.hasnull(); - if (!stats.has_bucketstatistics() || - stats.bucketstatistics().count_size() == 0) { + static TruthValue evaluateBoolPredicate(const PredicateLeaf::Operator op, + const std::vector<Literal>& literals, + const proto::ColumnStatistics& stats) { + bool hasNull = stats.has_null(); + if (!stats.has_bucket_statistics() || stats.bucket_statistics().count_size() == 0) { // does not have bool stats return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; } - auto trueCount = stats.bucketstatistics().count(0); - auto falseCount = stats.numberofvalues() - trueCount; + auto trueCount = stats.bucket_statistics().count(0); + auto falseCount = stats.number_of_values() - trueCount; switch (op) { case PredicateLeaf::Operator::IS_NULL: return hasNull ? 
TruthValue::YES_NO : TruthValue::NO; @@ -500,8 +465,7 @@ namespace orc { return result; } - static std::vector<Literal::Timestamp> literal2Timestamp( - const std::vector<Literal>& values) { + static std::vector<Literal::Timestamp> literal2Timestamp(const std::vector<Literal>& values) { std::vector<Literal::Timestamp> result; std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) { if (!val.isNull()) { @@ -511,8 +475,7 @@ namespace orc { return result; } - static std::vector<Decimal> literal2Decimal( - const std::vector<Literal>& values) { + static std::vector<Decimal> literal2Decimal(const std::vector<Literal>& values) { std::vector<Decimal> result; std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) { if (!val.isNull()) { @@ -522,8 +485,7 @@ namespace orc { return result; } - static std::vector<double> literal2Double( - const std::vector<Literal>& values) { + static std::vector<double> literal2Double(const std::vector<Literal>& values) { std::vector<double> result; std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) { if (!val.isNull()) { @@ -533,8 +495,7 @@ namespace orc { return result; } - static std::vector<TString> literal2String( - const std::vector<Literal>& values) { + static std::vector<TString> literal2String(const std::vector<Literal>& values) { std::vector<TString> result; std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) { if (!val.isNull()) { @@ -544,114 +505,84 @@ namespace orc { return result; } - TruthValue PredicateLeaf::evaluatePredicateMinMax( - const proto::ColumnStatistics& colStats) const { + TruthValue PredicateLeaf::evaluatePredicateMinMax(const proto::ColumnStatistics& colStats) const { TruthValue result = TruthValue::YES_NO_NULL; switch (mType) { case PredicateDataType::LONG: { - if (colStats.has_intstatistics() && - colStats.intstatistics().has_minimum() && - colStats.intstatistics().has_maximum()) { - const auto& stats = colStats.intstatistics(); - result = 
evaluatePredicateRange( - mOperator, - literal2Long(mLiterals), - stats.minimum(), - stats.maximum(), - colStats.hasnull()); + if (colStats.has_int_statistics() && colStats.int_statistics().has_minimum() && + colStats.int_statistics().has_maximum()) { + const auto& stats = colStats.int_statistics(); + result = evaluatePredicateRange(mOperator, literal2Long(mLiterals), stats.minimum(), + stats.maximum(), colStats.has_null()); } break; } case PredicateDataType::FLOAT: { - if (colStats.has_doublestatistics() && - colStats.doublestatistics().has_minimum() && - colStats.doublestatistics().has_maximum()) { - const auto& stats = colStats.doublestatistics(); + if (colStats.has_double_statistics() && colStats.double_statistics().has_minimum() && + colStats.double_statistics().has_maximum()) { + const auto& stats = colStats.double_statistics(); if (!std::isfinite(stats.sum())) { - result = colStats.hasnull() ? - TruthValue::YES_NO_NULL : TruthValue::YES_NO; + result = colStats.has_null() ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; } else { - result = evaluatePredicateRange( - mOperator, - literal2Double(mLiterals), - stats.minimum(), - stats.maximum(), - colStats.hasnull()); + result = evaluatePredicateRange(mOperator, literal2Double(mLiterals), stats.minimum(), + stats.maximum(), colStats.has_null()); } } break; } case PredicateDataType::STRING: { - ///TODO: check lowerBound and upperBound as well - if (colStats.has_stringstatistics() && - colStats.stringstatistics().has_minimum() && - colStats.stringstatistics().has_maximum()) { - const auto& stats = colStats.stringstatistics(); - result = evaluatePredicateRange( - mOperator, - literal2String(mLiterals), - stats.minimum(), - stats.maximum(), - colStats.hasnull()); + /// TODO: check lowerBound and upperBound as well + if (colStats.has_string_statistics() && colStats.string_statistics().has_minimum() && + colStats.string_statistics().has_maximum()) { + const auto& stats = colStats.string_statistics(); + result = 
evaluatePredicateRange(mOperator, literal2String(mLiterals), stats.minimum(), + stats.maximum(), colStats.has_null()); } break; } case PredicateDataType::DATE: { - if (colStats.has_datestatistics() && - colStats.datestatistics().has_minimum() && - colStats.datestatistics().has_maximum()) { - const auto& stats = colStats.datestatistics(); - result = evaluatePredicateRange( - mOperator, - literal2Date(mLiterals), - stats.minimum(), - stats.maximum(), - colStats.hasnull()); + if (colStats.has_date_statistics() && colStats.date_statistics().has_minimum() && + colStats.date_statistics().has_maximum()) { + const auto& stats = colStats.date_statistics(); + result = evaluatePredicateRange(mOperator, literal2Date(mLiterals), stats.minimum(), + stats.maximum(), colStats.has_null()); } break; } case PredicateDataType::TIMESTAMP: { - if (colStats.has_timestampstatistics() && - colStats.timestampstatistics().has_minimumutc() && - colStats.timestampstatistics().has_maximumutc()) { - const auto& stats = colStats.timestampstatistics(); + if (colStats.has_timestamp_statistics() && + colStats.timestamp_statistics().has_minimum_utc() && + colStats.timestamp_statistics().has_maximum_utc()) { + const auto& stats = colStats.timestamp_statistics(); constexpr int32_t DEFAULT_MIN_NANOS = 0; constexpr int32_t DEFAULT_MAX_NANOS = 999999; - int32_t minNano = stats.has_minimumnanos() ? - stats.minimumnanos() - 1 : DEFAULT_MIN_NANOS; - int32_t maxNano = stats.has_maximumnanos() ? - stats.maximumnanos() - 1 : DEFAULT_MAX_NANOS; + int32_t minNano = + stats.has_minimum_nanos() ? stats.minimum_nanos() - 1 : DEFAULT_MIN_NANOS; + int32_t maxNano = + stats.has_maximum_nanos() ? 
stats.maximum_nanos() - 1 : DEFAULT_MAX_NANOS; Literal::Timestamp minTimestamp( - stats.minimumutc() / 1000, - static_cast<int32_t>((stats.minimumutc() % 1000) * 1000000) + minNano); + stats.minimum_utc() / 1000, + static_cast<int32_t>((stats.minimum_utc() % 1000) * 1000000) + minNano); Literal::Timestamp maxTimestamp( - stats.maximumutc() / 1000, - static_cast<int32_t>((stats.maximumutc() % 1000) * 1000000) + maxNano); - result = evaluatePredicateRange( - mOperator, - literal2Timestamp(mLiterals), - minTimestamp, - maxTimestamp, - colStats.hasnull()); + stats.maximum_utc() / 1000, + static_cast<int32_t>((stats.maximum_utc() % 1000) * 1000000) + maxNano); + result = evaluatePredicateRange(mOperator, literal2Timestamp(mLiterals), minTimestamp, + maxTimestamp, colStats.has_null()); } break; } case PredicateDataType::DECIMAL: { - if (colStats.has_decimalstatistics() && - colStats.decimalstatistics().has_minimum() && - colStats.decimalstatistics().has_maximum()) { - const auto& stats = colStats.decimalstatistics(); - result = evaluatePredicateRange( - mOperator, - literal2Decimal(mLiterals), - Decimal(stats.minimum()), - Decimal(stats.maximum()), - colStats.hasnull()); + if (colStats.has_decimal_statistics() && colStats.decimal_statistics().has_minimum() && + colStats.decimal_statistics().has_maximum()) { + const auto& stats = colStats.decimal_statistics(); + result = evaluatePredicateRange(mOperator, literal2Decimal(mLiterals), + Decimal(stats.minimum()), Decimal(stats.maximum()), + colStats.has_null()); } break; } - case PredicateDataType::BOOLEAN: { - if (colStats.has_bucketstatistics()) { + case PredicateDataType::BOOLEAN: { + if (colStats.has_bucket_statistics()) { result = evaluateBoolPredicate(mOperator, mLiterals, colStats); } break; @@ -661,7 +592,7 @@ namespace orc { } // make sure null literal is respected for IN operator - if (mOperator == Operator::IN && colStats.hasnull()) { + if (mOperator == Operator::IN && colStats.has_null()) { for (const auto& 
literal : mLiterals) { if (literal.isNull()) { result = TruthValue::YES_NO_NULL; @@ -673,29 +604,24 @@ namespace orc { return result; } - static bool shouldEvaluateBloomFilter(PredicateLeaf::Operator op, - TruthValue result, - const BloomFilter * bloomFilter) { + static bool shouldEvaluateBloomFilter(PredicateLeaf::Operator op, TruthValue result, + const BloomFilter* bloomFilter) { // evaluate bloom filter only when // 1) Bloom filter is available // 2) Min/Max evaluation yield YES or MAYBE // 3) Predicate is EQUALS or IN list // 4) Decimal type stores its string representation // but has inconsistency in trailing zeros - if (bloomFilter != nullptr - && result != TruthValue::NO_NULL && result != TruthValue::NO - && (op == PredicateLeaf::Operator::EQUALS - || op == PredicateLeaf::Operator::NULL_SAFE_EQUALS - || op == PredicateLeaf::Operator::IN)) { + if (bloomFilter != nullptr && result != TruthValue::NO_NULL && result != TruthValue::NO && + (op == PredicateLeaf::Operator::EQUALS || op == PredicateLeaf::Operator::NULL_SAFE_EQUALS || + op == PredicateLeaf::Operator::IN)) { return true; } return false; } - static TruthValue checkInBloomFilter(PredicateLeaf::Operator, - PredicateDataType type, - const Literal& literal, - const BloomFilter * bf, + static TruthValue checkInBloomFilter(PredicateLeaf::Operator, PredicateDataType type, + const Literal& literal, const BloomFilter* bf, bool hasNull) { TruthValue result = hasNull ? 
TruthValue::NO_NULL : TruthValue::NO; if (literal.isNull()) { @@ -715,7 +641,7 @@ namespace orc { } } else if (type == PredicateDataType::DECIMAL) { std::string decimal = literal.getDecimal().toString(true); - if (bf->testBytes(decimal.c_str(), static_cast<int64_t>(decimal.size()))) { + if (bf->testBytes(decimal.c_str(), static_cast<int64_t>(decimal.size()))) { result = TruthValue::YES_NO_NULL; } } else if (type == PredicateDataType::TIMESTAMP) { @@ -737,25 +663,20 @@ namespace orc { return result; } - TruthValue PredicateLeaf::evaluatePredicateBloomFiter(const BloomFilter * bf, - bool hasNull) const { + TruthValue PredicateLeaf::evaluatePredicateBloomFiter(const BloomFilter* bf, bool hasNull) const { switch (mOperator) { case Operator::NULL_SAFE_EQUALS: // null safe equals does not return *_NULL variant. // So set hasNull to false - return checkInBloomFilter( - mOperator, mType, mLiterals.front(), bf, false); + return checkInBloomFilter(mOperator, mType, mLiterals.front(), bf, false); case Operator::EQUALS: - return checkInBloomFilter( - mOperator, mType, mLiterals.front(), bf, hasNull); + return checkInBloomFilter(mOperator, mType, mLiterals.front(), bf, hasNull); case Operator::IN: - for (const auto &literal : mLiterals) { + for (const auto& literal : mLiterals) { // if at least one value in IN list exist in bloom filter, // qualify the row group/stripe - TruthValue result = checkInBloomFilter( - mOperator, mType, literal, bf, hasNull); - if (result == TruthValue::YES_NO_NULL || - result == TruthValue::YES_NO) { + TruthValue result = checkInBloomFilter(mOperator, mType, literal, bf, hasNull); + if (result == TruthValue::YES_NO_NULL || result == TruthValue::YES_NO) { return result; } } @@ -771,7 +692,7 @@ namespace orc { TruthValue PredicateLeaf::evaluate(const WriterVersion writerVersion, const proto::ColumnStatistics& colStats, - const BloomFilter * bloomFilter) const { + const BloomFilter* bloomFilter) const { // files written before ORC-135 stores timestamp 
wrt to local timezone // causing issues with PPD. disable PPD for timestamp for all old files if (mType == PredicateDataType::TIMESTAMP) { @@ -780,14 +701,13 @@ namespace orc { } } - bool allNull = colStats.hasnull() && colStats.numberofvalues() == 0; - if (mOperator == Operator::IS_NULL || (( - mOperator == Operator::EQUALS || - mOperator == Operator::NULL_SAFE_EQUALS) && - mLiterals.at(0).isNull())) { + bool allNull = colStats.has_null() && colStats.number_of_values() == 0; + if (mOperator == Operator::IS_NULL || + ((mOperator == Operator::EQUALS || mOperator == Operator::NULL_SAFE_EQUALS) && + mLiterals.at(0).isNull())) { // IS_NULL operator does not need to check min/max stats and bloom filter - return allNull ? TruthValue::YES : - (colStats.hasnull() ? TruthValue::YES_NO : TruthValue::NO); + return allNull ? TruthValue::YES + : (colStats.has_null() ? TruthValue::YES_NO : TruthValue::NO); } else if (allNull) { // if we don't have any value, everything must have been null return TruthValue::IS_NULL; @@ -795,10 +715,10 @@ namespace orc { TruthValue result = evaluatePredicateMinMax(colStats); if (shouldEvaluateBloomFilter(mOperator, result, bloomFilter)) { - return evaluatePredicateBloomFiter(bloomFilter, colStats.hasnull()); + return evaluatePredicateBloomFiter(bloomFilter, colStats.has_null()); } else { return result; } } -} // namespace orc +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh index 99791cf976..21ed456155 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh +++ b/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.hh @@ -19,18 +19,17 @@ #ifndef ORC_PREDICATELEAF_HH #define ORC_PREDICATELEAF_HH -#include "wrap/orc-proto-wrapper.hh" #include "orc/Common.hh" #include "orc/sargs/Literal.hh" #include "orc/sargs/TruthValue.hh" +#include "wrap/orc-proto-wrapper.hh" #include <string> #include <vector> namespace orc { - static constexpr uint64_t 
INVALID_COLUMN_ID = - std::numeric_limits<uint64_t>::max(); + static constexpr uint64_t INVALID_COLUMN_ID = std::numeric_limits<uint64_t>::max(); class BloomFilter; @@ -38,7 +37,7 @@ namespace orc { * The primitive predicates that form a SearchArgument. */ class PredicateLeaf { - public: + public: /** * The possible operators for predicates. To get the opposites, construct * an expression with a not operator. @@ -55,9 +54,9 @@ namespace orc { // The possible types for sargs. enum class Type { - LONG = 0, // all of the integer types - FLOAT, // float and double - STRING, // string, char, varchar + LONG = 0, // all of the integer types + FLOAT, // float and double + STRING, // string, char, varchar DATE, DECIMAL, TIMESTAMP, @@ -66,34 +65,20 @@ namespace orc { PredicateLeaf() = default; - PredicateLeaf(Operator op, - PredicateDataType type, - const std::string& colName, - Literal literal); + PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName, Literal literal); - PredicateLeaf(Operator op, - PredicateDataType type, - uint64_t columnId, - Literal literal); + PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId, Literal literal); - PredicateLeaf(Operator op, - PredicateDataType type, - const std::string& colName, + PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName, const std::initializer_list<Literal>& literalList); - PredicateLeaf(Operator op, - PredicateDataType type, - uint64_t columnId, + PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId, const std::initializer_list<Literal>& literalList); - PredicateLeaf(Operator op, - PredicateDataType type, - const std::string& colName, + PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName, const std::vector<Literal>& literalList); - PredicateLeaf(Operator op, - PredicateDataType type, - uint64_t columnId, + PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId, const std::vector<Literal>& 
literalList); /** @@ -134,17 +119,18 @@ namespace orc { /** * Evaluate current PredicateLeaf based on ColumnStatistics and BloomFilter */ - TruthValue evaluate(const WriterVersion writerVersion, - const proto::ColumnStatistics& colStats, - const BloomFilter * bloomFilter) const; + TruthValue evaluate(const WriterVersion writerVersion, const proto::ColumnStatistics& colStats, + const BloomFilter* bloomFilter) const; std::string toString() const; bool operator==(const PredicateLeaf& r) const; - size_t getHashCode() const { return mHashCode; } + size_t getHashCode() const { + return mHashCode; + } - private: + private: size_t hashCode() const; void validate() const; @@ -152,13 +138,11 @@ namespace orc { std::string columnDebugString() const; - TruthValue evaluatePredicateMinMax( - const proto::ColumnStatistics& colStats) const; + TruthValue evaluatePredicateMinMax(const proto::ColumnStatistics& colStats) const; - TruthValue evaluatePredicateBloomFiter(const BloomFilter * bloomFilter, - bool hasNull) const; + TruthValue evaluatePredicateBloomFiter(const BloomFilter* bloomFilter, bool hasNull) const; - private: + private: Operator mOperator; PredicateDataType mType; std::string mColumnName; @@ -180,6 +164,6 @@ namespace orc { } }; -} // namespace orc +} // namespace orc -#endif //ORC_PREDICATELEAF_HH +#endif // ORC_PREDICATELEAF_HH diff --git a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc index 42a554f5ca..7032a88126 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc +++ b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc @@ -22,8 +22,7 @@ namespace orc { // find column id from column name - uint64_t SargsApplier::findColumn(const Type& type, - const std::string& colName) { + uint64_t SargsApplier::findColumn(const Type& type, const std::string& colName) { for (uint64_t i = 0; i != type.getSubtypeCount(); ++i) { // Only STRUCT type has field names if (type.getKind() == STRUCT && 
type.getFieldName(i) == colName) { @@ -38,19 +37,18 @@ namespace orc { return INVALID_COLUMN_ID; } - SargsApplier::SargsApplier(const Type& type, - const SearchArgument * searchArgument, - uint64_t rowIndexStride, - WriterVersion writerVersion) - : mType(type) - , mSearchArgument(searchArgument) - , mRowIndexStride(rowIndexStride) - , mWriterVersion(writerVersion) - , mStats(0, 0) - , mHasEvaluatedFileStats(false) - , mFileStatsEvalResult(true) { - const SearchArgumentImpl * sargs = - dynamic_cast<const SearchArgumentImpl *>(mSearchArgument); + SargsApplier::SargsApplier(const Type& type, const SearchArgument* searchArgument, + uint64_t rowIndexStride, WriterVersion writerVersion, + ReaderMetrics* metrics, const SchemaEvolution* schemaEvolution) + : mType(type), + mSearchArgument(searchArgument), + mSchemaEvolution(schemaEvolution), + mRowIndexStride(rowIndexStride), + mWriterVersion(writerVersion), + mHasEvaluatedFileStats(false), + mFileStatsEvalResult(true), + mMetrics(metrics) { + const SearchArgumentImpl* sargs = dynamic_cast<const SearchArgumentImpl*>(mSearchArgument); // find the mapping from predicate leaves to columns const std::vector<PredicateLeaf>& leaves = sargs->getLeaves(); @@ -64,13 +62,11 @@ namespace orc { } } - bool SargsApplier::pickRowGroups( - uint64_t rowsInStripe, - const std::unordered_map<uint64_t, proto::RowIndex>& rowIndexes, - const std::map<uint32_t, BloomFilterIndex>& bloomFilters) { + bool SargsApplier::pickRowGroups(uint64_t rowsInStripe, + const std::unordered_map<uint64_t, proto::RowIndex>& rowIndexes, + const std::map<uint32_t, BloomFilterIndex>& bloomFilters) { // init state of each row group - uint64_t groupsInStripe = - (rowsInStripe + mRowIndexStride - 1) / mRowIndexStride; + uint64_t groupsInStripe = (rowsInStripe + mRowIndexStride - 1) / mRowIndexStride; mNextSkippedRows.resize(groupsInStripe); mTotalRowsInStripe = rowsInStripe; @@ -79,10 +75,8 @@ namespace orc { return true; } - const auto& leaves = - dynamic_cast<const 
SearchArgumentImpl *>(mSearchArgument)->getLeaves(); - std::vector<TruthValue> leafValues( - leaves.size(), TruthValue::YES_NO_NULL); + const auto& leaves = dynamic_cast<const SearchArgumentImpl*>(mSearchArgument)->getLeaves(); + std::vector<TruthValue> leafValues(leaves.size(), TruthValue::YES_NO_NULL); mHasSelected = false; mHasSkipped = false; uint64_t nextSkippedRowGroup = groupsInStripe; @@ -95,10 +89,13 @@ namespace orc { if (columnIdx == INVALID_COLUMN_ID || rowIndexIter == rowIndexes.cend()) { // this column does not exist in current file leafValues[pred] = TruthValue::YES_NO_NULL; + } else if (mSchemaEvolution && !mSchemaEvolution->isSafePPDConversion(columnIdx)) { + // cannot evaluate predicate when ppd is not safe + leafValues[pred] = TruthValue::YES_NO_NULL; } else { // get column statistics const proto::ColumnStatistics& statistics = - rowIndexIter->second.entry(static_cast<int>(rowGroup)).statistics(); + rowIndexIter->second.entry(static_cast<int>(rowGroup)).statistics(); // get bloom filter std::shared_ptr<BloomFilter> bloomFilter; @@ -107,9 +104,7 @@ namespace orc { bloomFilter = iter->second.entries.at(rowGroup); } - leafValues[pred] = leaves[pred].evaluate(mWriterVersion, - statistics, - bloomFilter.get()); + leafValues[pred] = leaves[pred].evaluate(mWriterVersion, statistics, bloomFilter.get()); } } @@ -118,69 +113,76 @@ namespace orc { mNextSkippedRows[rowGroup] = 0; nextSkippedRowGroup = rowGroup; } else { - mNextSkippedRows[rowGroup] = (nextSkippedRowGroup == groupsInStripe) ? - rowsInStripe : (nextSkippedRowGroup * mRowIndexStride); + mNextSkippedRows[rowGroup] = (nextSkippedRowGroup == groupsInStripe) + ? rowsInStripe + : (nextSkippedRowGroup * mRowIndexStride); } mHasSelected |= needed; mHasSkipped |= !needed; } while (rowGroup != 0); // update stats - mStats.first = std::accumulate( - mNextSkippedRows.cbegin(), mNextSkippedRows.cend(), mStats.first, - [](bool rg, uint64_t s) { return rg ? 
1 : 0 + s; }); - mStats.second += groupsInStripe; + uint64_t selectedRGs = std::accumulate( + mNextSkippedRows.cbegin(), mNextSkippedRows.cend(), 0UL, + [](uint64_t initVal, uint64_t rg) { return rg > 0 ? initVal + 1 : initVal; }); + if (mMetrics != nullptr) { + mMetrics->SelectedRowGroupCount.fetch_add(selectedRGs); + mMetrics->EvaluatedRowGroupCount.fetch_add(groupsInStripe); + } return mHasSelected; } - bool SargsApplier::evaluateColumnStatistics( - const PbColumnStatistics& colStats) const { - const SearchArgumentImpl * sargs = - dynamic_cast<const SearchArgumentImpl *>(mSearchArgument); + bool SargsApplier::evaluateColumnStatistics(const PbColumnStatistics& colStats) const { + const SearchArgumentImpl* sargs = dynamic_cast<const SearchArgumentImpl*>(mSearchArgument); if (sargs == nullptr) { throw InvalidArgument("Failed to cast to SearchArgumentImpl"); } const std::vector<PredicateLeaf>& leaves = sargs->getLeaves(); - std::vector<TruthValue> leafValues( - leaves.size(), TruthValue::YES_NO_NULL); + std::vector<TruthValue> leafValues(leaves.size(), TruthValue::YES_NO_NULL); for (size_t pred = 0; pred != leaves.size(); ++pred) { uint64_t columnId = mFilterColumns[pred]; - if (columnId != INVALID_COLUMN_ID && - colStats.size() > static_cast<int>(columnId)) { - leafValues[pred] = leaves[pred].evaluate( - mWriterVersion, colStats.Get(static_cast<int>(columnId)), nullptr); + if (columnId != INVALID_COLUMN_ID && colStats.size() > static_cast<int>(columnId)) { + leafValues[pred] = leaves[pred].evaluate(mWriterVersion, + colStats.Get(static_cast<int>(columnId)), nullptr); } } return isNeeded(mSearchArgument->evaluate(leafValues)); } - bool SargsApplier::evaluateStripeStatistics( - const proto::StripeStatistics& stripeStats) { - if (stripeStats.colstats_size() == 0) { + bool SargsApplier::evaluateStripeStatistics(const proto::StripeStatistics& stripeStats, + uint64_t stripeRowGroupCount) { + if (stripeStats.col_stats_size() == 0) { return true; } - bool ret = 
evaluateColumnStatistics(stripeStats.colstats()); + bool ret = evaluateColumnStatistics(stripeStats.col_stats()); if (!ret) { // reset mNextSkippedRows when the current stripe does not satisfy the PPD mNextSkippedRows.clear(); + if (mMetrics != nullptr) { + mMetrics->EvaluatedRowGroupCount.fetch_add(stripeRowGroupCount); + } } return ret; } - bool SargsApplier::evaluateFileStatistics(const proto::Footer& footer) { + bool SargsApplier::evaluateFileStatistics(const proto::Footer& footer, + uint64_t numRowGroupsInStripeRange) { if (!mHasEvaluatedFileStats) { if (footer.statistics_size() == 0) { mFileStatsEvalResult = true; } else { mFileStatsEvalResult = evaluateColumnStatistics(footer.statistics()); + if (!mFileStatsEvalResult && mMetrics != nullptr) { + mMetrics->EvaluatedRowGroupCount.fetch_add(numRowGroupsInStripeRange); + } } mHasEvaluatedFileStats = true; } return mFileStatsEvalResult; } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh index d8bdf852d0..73703dcf6b 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh +++ b/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.hh @@ -19,62 +19,78 @@ #ifndef ORC_SARGSAPPLIER_HH #define ORC_SARGSAPPLIER_HH -#include "wrap/orc-proto-wrapper.hh" #include <orc/Common.hh> #include "orc/BloomFilter.hh" +#include "orc/Reader.hh" #include "orc/Type.hh" +#include "wrap/orc-proto-wrapper.hh" #include "sargs/SearchArgument.hh" +#include "SchemaEvolution.hh" + #include <unordered_map> namespace orc { class SargsApplier { - public: - SargsApplier(const Type& type, - const SearchArgument * searchArgument, - uint64_t rowIndexStride, - WriterVersion writerVersion); + public: + SargsApplier(const Type& type, const SearchArgument* searchArgument, uint64_t rowIndexStride, + WriterVersion writerVersion, ReaderMetrics* metrics, + const SchemaEvolution* schemaEvolution = nullptr); /** * Evaluate search argument on file statistics 
+ * If file statistics don't satisfy the sargs, + * the EvaluatedRowGroupCount of Reader Metrics will be updated. + * Otherwise, Reader Metrics will not be updated and + * will require further evaluation. * @return true if file statistics satisfy the sargs */ - bool evaluateFileStatistics(const proto::Footer& footer); + bool evaluateFileStatistics(const proto::Footer& footer, uint64_t numRowGroupsInStripeRange); /** * Evaluate search argument on stripe statistics + * If stripe statistics don't satisfy the sargs, + * the EvaluatedRowGroupCount of Reader Metrics will be updated. + * Otherwise, Reader Metrics will not be updated and + * will require further evaluation. * @return true if stripe statistics satisfy the sargs */ - bool evaluateStripeStatistics(const proto::StripeStatistics& stripeStats); + bool evaluateStripeStatistics(const proto::StripeStatistics& stripeStats, + uint64_t stripeRowGroupCount); /** * TODO: use proto::RowIndex and proto::BloomFilter to do the evaluation * Pick the row groups that we need to load from the current stripe. * @return true if any row group is selected */ - bool pickRowGroups( - uint64_t rowsInStripe, - const std::unordered_map<uint64_t, proto::RowIndex>& rowIndexes, - const std::map<uint32_t, BloomFilterIndex>& bloomFilters); + bool pickRowGroups(uint64_t rowsInStripe, + const std::unordered_map<uint64_t, proto::RowIndex>& rowIndexes, + const std::map<uint32_t, BloomFilterIndex>& bloomFilters); /** * Return a vector of the next skipped row for each RowGroup. Each value is the row id * in stripe. 0 means the current RowGroup is entirely skipped. * Only valid after invoking pickRowGroups(). 
*/ - const std::vector<uint64_t>& getNextSkippedRows() const { return mNextSkippedRows; } + const std::vector<uint64_t>& getNextSkippedRows() const { + return mNextSkippedRows; + } /** * Indicate whether any row group is selected in the last evaluation */ - bool hasSelected() const { return mHasSelected; } + bool hasSelected() const { + return mHasSelected; + } /** * Indicate whether any row group is skipped in the last evaluation */ - bool hasSkipped() const { return mHasSkipped; } + bool hasSkipped() const { + return mHasSkipped; + } /** * Whether any row group from current row in the stripe matches PPD. @@ -90,13 +106,17 @@ namespace orc { } std::pair<uint64_t, uint64_t> getStats() const { - return mStats; + if (mMetrics != nullptr) { + return std::make_pair(mMetrics->SelectedRowGroupCount.load(), + mMetrics->EvaluatedRowGroupCount.load()); + } else { + return {0, 0}; + } } - private: + private: // evaluate column statistics in the form of protobuf::RepeatedPtrField - typedef ::google::protobuf::RepeatedPtrField<proto::ColumnStatistics> - PbColumnStatistics; + typedef ::google::protobuf::RepeatedPtrField<proto::ColumnStatistics> PbColumnStatistics; bool evaluateColumnStatistics(const PbColumnStatistics& colStats) const; friend class TestSargsApplier_findColumnTest_Test; @@ -104,9 +124,10 @@ namespace orc { friend class TestSargsApplier_findMapColumnTest_Test; static uint64_t findColumn(const Type& type, const std::string& colName); - private: + private: const Type& mType; - const SearchArgument * mSearchArgument; + const SearchArgument* mSearchArgument; + const SchemaEvolution* mSchemaEvolution; uint64_t mRowIndexStride; WriterVersion mWriterVersion; // column ids for each predicate leaf in the search argument @@ -119,13 +140,14 @@ namespace orc { uint64_t mTotalRowsInStripe; bool mHasSelected; bool mHasSkipped; - // keep stats of selected RGs and evaluated RGs - std::pair<uint64_t, uint64_t> mStats; // store result of file stats evaluation bool 
mHasEvaluatedFileStats; bool mFileStatsEvalResult; + // use the SelectedRowGroupCount and EvaluatedRowGroupCount to + // keep stats of selected RGs and evaluated RGs + ReaderMetrics* mMetrics; }; -} +} // namespace orc -#endif //ORC_SARGSAPPLIER_HH +#endif // ORC_SARGSAPPLIER_HH diff --git a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc index f6abb316b5..806727f0a0 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc +++ b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc @@ -33,14 +33,12 @@ namespace orc { return mLeaves; } - const ExpressionTree * SearchArgumentImpl::getExpression() const { + const ExpressionTree* SearchArgumentImpl::getExpression() const { return mExpressionTree.get(); } - TruthValue SearchArgumentImpl::evaluate( - const std::vector<TruthValue>& leaves) const { - return mExpressionTree == nullptr ? - TruthValue::YES : mExpressionTree->evaluate(leaves); + TruthValue SearchArgumentImpl::evaluate(const std::vector<TruthValue>& leaves) const { + return mExpressionTree == nullptr ? 
TruthValue::YES : mExpressionTree->evaluate(leaves); } std::string SearchArgumentImpl::toString() const { @@ -61,8 +59,7 @@ namespace orc { mCurrTree.push_back(mRoot); } - SearchArgumentBuilder& - SearchArgumentBuilderImpl::start(ExpressionTree::Operator op) { + SearchArgumentBuilder& SearchArgumentBuilderImpl::start(ExpressionTree::Operator op) { TreeNode node = std::make_shared<ExpressionTree>(op); mCurrTree.front()->addChild(node); mCurrTree.push_front(node); @@ -84,13 +81,13 @@ namespace orc { SearchArgumentBuilder& SearchArgumentBuilderImpl::end() { TreeNode& current = mCurrTree.front(); if (current->getChildren().empty()) { - throw std::invalid_argument("Cannot create expression " + - mRoot->toString() + " with no children."); + throw std::invalid_argument("Cannot create expression " + mRoot->toString() + + " with no children."); } if (current->getOperator() == ExpressionTree::Operator::NOT && current->getChildren().size() != 1) { - throw std::invalid_argument("Can't create NOT expression " + - current->toString() + " with more than 1 child."); + throw std::invalid_argument("Can't create NOT expression " + current->toString() + + " with more than 1 child."); } mCurrTree.pop_front(); return *this; @@ -110,16 +107,14 @@ namespace orc { return columnId == INVALID_COLUMN_ID; } - template<typename T> - SearchArgumentBuilder& - SearchArgumentBuilderImpl::compareOperator(PredicateLeaf::Operator op, - T column, - PredicateDataType type, - Literal literal) { + template <typename T> + SearchArgumentBuilder& SearchArgumentBuilderImpl::compareOperator(PredicateLeaf::Operator op, + T column, + PredicateDataType type, + Literal literal) { TreeNode parent = mCurrTree.front(); if (isInvalidColumn(column)) { - parent->addChild( - std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL)); + parent->addChild(std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL)); } else { PredicateLeaf leaf(op, type, column, literal); 
parent->addChild(std::make_shared<ExpressionTree>(addLeaf(leaf))); @@ -130,29 +125,25 @@ namespace orc { SearchArgumentBuilder& SearchArgumentBuilderImpl::lessThan(const std::string& column, PredicateDataType type, Literal literal) { - return compareOperator( - PredicateLeaf::Operator::LESS_THAN, column, type, literal); + return compareOperator(PredicateLeaf::Operator::LESS_THAN, column, type, literal); } SearchArgumentBuilder& SearchArgumentBuilderImpl::lessThan(uint64_t columnId, PredicateDataType type, Literal literal) { - return compareOperator( - PredicateLeaf::Operator::LESS_THAN, columnId, type, literal); + return compareOperator(PredicateLeaf::Operator::LESS_THAN, columnId, type, literal); } SearchArgumentBuilder& SearchArgumentBuilderImpl::lessThanEquals(const std::string& column, PredicateDataType type, Literal literal) { - return compareOperator( - PredicateLeaf::Operator::LESS_THAN_EQUALS, column, type, literal); + return compareOperator(PredicateLeaf::Operator::LESS_THAN_EQUALS, column, type, literal); } SearchArgumentBuilder& SearchArgumentBuilderImpl::lessThanEquals(uint64_t columnId, PredicateDataType type, Literal literal) { - return compareOperator( - PredicateLeaf::Operator::LESS_THAN_EQUALS, columnId, type, literal); + return compareOperator(PredicateLeaf::Operator::LESS_THAN_EQUALS, columnId, type, literal); } SearchArgumentBuilder& SearchArgumentBuilderImpl::equals(const std::string& column, @@ -161,8 +152,7 @@ namespace orc { if (literal.isNull()) { return isNull(column, type); } else { - return compareOperator( - PredicateLeaf::Operator::EQUALS, column, type, literal); + return compareOperator(PredicateLeaf::Operator::EQUALS, column, type, literal); } } @@ -172,54 +162,46 @@ namespace orc { if (literal.isNull()) { return isNull(columnId, type); } else { - return compareOperator( - PredicateLeaf::Operator::EQUALS, columnId, type, literal); + return compareOperator(PredicateLeaf::Operator::EQUALS, columnId, type, literal); } } 
SearchArgumentBuilder& SearchArgumentBuilderImpl::nullSafeEquals(const std::string& column, PredicateDataType type, Literal literal) { - return compareOperator( - PredicateLeaf::Operator::NULL_SAFE_EQUALS, column, type, literal); + return compareOperator(PredicateLeaf::Operator::NULL_SAFE_EQUALS, column, type, literal); } SearchArgumentBuilder& SearchArgumentBuilderImpl::nullSafeEquals(uint64_t columnId, PredicateDataType type, Literal literal) { - return compareOperator( - PredicateLeaf::Operator::NULL_SAFE_EQUALS, columnId, type, literal); + return compareOperator(PredicateLeaf::Operator::NULL_SAFE_EQUALS, columnId, type, literal); } - template<typename T, typename CONTAINER> - SearchArgumentBuilder& SearchArgumentBuilderImpl::addChildForIn(T column, - PredicateDataType type, - const CONTAINER& literals) { - TreeNode &parent = mCurrTree.front(); + template <typename T, typename CONTAINER> + SearchArgumentBuilder& SearchArgumentBuilderImpl::addChildForIn(T column, PredicateDataType type, + const CONTAINER& literals) { + TreeNode& parent = mCurrTree.front(); if (isInvalidColumn(column)) { - parent->addChild( - std::make_shared<ExpressionTree>((TruthValue::YES_NO_NULL))); + parent->addChild(std::make_shared<ExpressionTree>((TruthValue::YES_NO_NULL))); } else { if (literals.size() == 0) { - throw std::invalid_argument( - "Can't create in expression with no arguments"); + throw std::invalid_argument("Can't create in expression with no arguments"); } - PredicateLeaf leaf( - PredicateLeaf::Operator::IN, type, column, literals); + PredicateLeaf leaf(PredicateLeaf::Operator::IN, type, column, literals); parent->addChild(std::make_shared<ExpressionTree>(addLeaf(leaf))); } return *this; } - SearchArgumentBuilder& SearchArgumentBuilderImpl::in(const std::string& column, - PredicateDataType type, - const std::initializer_list<Literal>& literals) { + SearchArgumentBuilder& SearchArgumentBuilderImpl::in( + const std::string& column, PredicateDataType type, + const 
std::initializer_list<Literal>& literals) { return addChildForIn(column, type, literals); } - SearchArgumentBuilder& SearchArgumentBuilderImpl::in(uint64_t columnId, - PredicateDataType type, - const std::initializer_list<Literal>& literals) { + SearchArgumentBuilder& SearchArgumentBuilderImpl::in( + uint64_t columnId, PredicateDataType type, const std::initializer_list<Literal>& literals) { return addChildForIn(columnId, type, literals); } @@ -229,23 +211,19 @@ namespace orc { return addChildForIn(column, type, literals); } - SearchArgumentBuilder& SearchArgumentBuilderImpl::in(uint64_t columnId, - PredicateDataType type, + SearchArgumentBuilder& SearchArgumentBuilderImpl::in(uint64_t columnId, PredicateDataType type, const std::vector<Literal>& literals) { return addChildForIn(columnId, type, literals); } - template<typename T> - SearchArgumentBuilder& SearchArgumentBuilderImpl::addChildForIsNull(T column, PredicateDataType type) { + template <typename T> + SearchArgumentBuilder& SearchArgumentBuilderImpl::addChildForIsNull(T column, + PredicateDataType type) { TreeNode& parent = mCurrTree.front(); if (isInvalidColumn(column)) { - parent->addChild( - std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL)); + parent->addChild(std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL)); } else { - PredicateLeaf leaf(PredicateLeaf::Operator::IS_NULL, - type, - column, - {}); + PredicateLeaf leaf(PredicateLeaf::Operator::IS_NULL, type, column, {}); parent->addChild(std::make_shared<ExpressionTree>(addLeaf(leaf))); } return *this; @@ -261,34 +239,29 @@ namespace orc { return addChildForIsNull(columnId, type); } - template<typename T> + template <typename T> SearchArgumentBuilder& SearchArgumentBuilderImpl::addChildForBetween(T column, PredicateDataType type, - Literal lower, Literal upper) { + Literal lower, + Literal upper) { TreeNode& parent = mCurrTree.front(); if (isInvalidColumn(column)) { - parent->addChild( - 
std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL)); + parent->addChild(std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL)); } else { - PredicateLeaf leaf(PredicateLeaf::Operator::BETWEEN, - type, - column, - { lower, upper }); + PredicateLeaf leaf(PredicateLeaf::Operator::BETWEEN, type, column, {lower, upper}); parent->addChild(std::make_shared<ExpressionTree>(addLeaf(leaf))); } return *this; } SearchArgumentBuilder& SearchArgumentBuilderImpl::between(const std::string& column, - PredicateDataType type, - Literal lower, + PredicateDataType type, Literal lower, Literal upper) { return addChildForBetween(column, type, lower, upper); } SearchArgumentBuilder& SearchArgumentBuilderImpl::between(uint64_t columnId, - PredicateDataType type, - Literal lower, + PredicateDataType type, Literal lower, Literal upper) { return addChildForBetween(columnId, type, lower, upper); } @@ -307,9 +280,7 @@ namespace orc { * @param leafReorder buffer for leaf reorder * @return the next available leaf id */ - static size_t compactLeaves(const TreeNode& tree, - size_t next, - size_t leafReorder[]) { + static size_t compactLeaves(const TreeNode& tree, size_t next, size_t leafReorder[]) { if (tree->getOperator() == ExpressionTree::Operator::LEAF) { size_t oldLeaf = tree->getLeaf(); if (leafReorder[oldLeaf] == UNUSED_LEAF) { @@ -378,18 +349,16 @@ namespace orc { case ExpressionTree::Operator::AND: { TreeNode result(new ExpressionTree(ExpressionTree::Operator::OR)); for (auto& kid : child->getChildren()) { - result->addChild(pushDownNot(std::make_shared<ExpressionTree>( - ExpressionTree::Operator::NOT, NodeList{ kid }) - )); + result->addChild(pushDownNot( + std::make_shared<ExpressionTree>(ExpressionTree::Operator::NOT, NodeList{kid}))); } return result; } case ExpressionTree::Operator::OR: { TreeNode result(new ExpressionTree(ExpressionTree::Operator::AND)); for (auto& kid : child->getChildren()) { - result->addChild(pushDownNot(std::make_shared<ExpressionTree>( - 
ExpressionTree::Operator::NOT, NodeList{ kid }) - )); + result->addChild(pushDownNot( + std::make_shared<ExpressionTree>(ExpressionTree::Operator::NOT, NodeList{kid}))); } return result; } @@ -432,8 +401,7 @@ namespace orc { case ExpressionTree::Operator::LEAF: case ExpressionTree::Operator::CONSTANT: default: - throw std::invalid_argument( - "Got a maybe as child of " + expr->toString()); + throw std::invalid_argument("Got a maybe as child of " + expr->toString()); } } else { expr->getChildren()[i] = child; @@ -444,8 +412,9 @@ namespace orc { if (!children.empty()) { // eliminate removed maybe nodes from expr std::vector<TreeNode> nodes; - std::for_each(children.begin(), children.end(), - [&](const TreeNode& node){ if (node) nodes.emplace_back(node); }); + std::for_each(children.begin(), children.end(), [&](const TreeNode& node) { + if (node) nodes.emplace_back(node); + }); std::swap(children, nodes); if (children.empty()) { return std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL); @@ -462,7 +431,7 @@ namespace orc { * @return the flattened expression, which will always be root with * potentially modified children. 
*/ - TreeNode SearchArgumentBuilderImpl::flatten(TreeNode root) { + TreeNode SearchArgumentBuilderImpl::flatten(TreeNode root) { if (root) { std::vector<TreeNode> nodes; for (size_t i = 0; i != root->getChildren().size(); ++i) { @@ -524,10 +493,8 @@ namespace orc { } } if (andList.size() > 1) { - generateAllCombinations( - result, - std::vector<TreeNode>(andList.cbegin() + 1, andList.cend()), - nonAndList); + generateAllCombinations(result, std::vector<TreeNode>(andList.cbegin() + 1, andList.cend()), + nonAndList); } } @@ -576,8 +543,7 @@ namespace orc { } if (!andList.empty()) { if (checkCombinationsThreshold(andList)) { - root = std::make_shared<ExpressionTree>( - ExpressionTree::Operator::AND); + root = std::make_shared<ExpressionTree>(ExpressionTree::Operator::AND); generateAllCombinations(root->getChildren(), andList, nonAndList); } else { root = std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL); @@ -588,17 +554,15 @@ namespace orc { return root; } - SearchArgumentImpl::SearchArgumentImpl(TreeNode root, - const std::vector<PredicateLeaf>& leaves) - : mExpressionTree(root) - , mLeaves(leaves) { + SearchArgumentImpl::SearchArgumentImpl(TreeNode root, const std::vector<PredicateLeaf>& leaves) + : mExpressionTree(root), mLeaves(leaves) { // PASS } std::unique_ptr<SearchArgument> SearchArgumentBuilderImpl::build() { if (mCurrTree.size() != 1) { - throw std::invalid_argument("Failed to end " + - std::to_string(mCurrTree.size()) + " operations."); + throw std::invalid_argument("Failed to end " + std::to_string(mCurrTree.size()) + + " operations."); } mRoot = pushDownNot(mRoot); mRoot = foldMaybe(mRoot); @@ -612,18 +576,17 @@ namespace orc { std::vector<PredicateLeaf> leafList(newLeafCount, PredicateLeaf()); // build the new list - for (auto & leaf : mLeaves) { + for (auto& leaf : mLeaves) { size_t newLoc = leafReorder[leaf.second]; if (newLoc != UNUSED_LEAF) { leafList[newLoc] = leaf.first; } } - return std::unique_ptr<SearchArgument>( - new 
SearchArgumentImpl(mRoot, leafList)); + return std::make_unique<SearchArgumentImpl>(mRoot, leafList); } std::unique_ptr<SearchArgumentBuilder> SearchArgumentFactory::newBuilder() { - return std::unique_ptr<SearchArgumentBuilder>(new SearchArgumentBuilderImpl()); + return std::make_unique<SearchArgumentBuilderImpl>(); } -} // namespace orc +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh index 57d765e1df..4b74b28743 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh +++ b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh @@ -19,10 +19,10 @@ #ifndef ORC_SRC_SEARCHARGUMENT_HH #define ORC_SRC_SEARCHARGUMENT_HH -#include "wrap/orc-proto-wrapper.hh" #include "ExpressionTree.hh" #include "orc/sargs/SearchArgument.hh" #include "sargs/PredicateLeaf.hh" +#include "wrap/orc-proto-wrapper.hh" #include <deque> #include <stdexcept> @@ -40,7 +40,7 @@ namespace orc { * (<a href="http://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF</a>). */ class SearchArgumentImpl : public SearchArgument { - public: + public: SearchArgumentImpl(TreeNode root, const std::vector<PredicateLeaf>& leaves); /** @@ -54,7 +54,7 @@ namespace orc { * Get the expression tree. This should only needed for file formats that * need to translate the expression to an internal form. */ - const ExpressionTree * getExpression() const; + const ExpressionTree* getExpression() const; /** * Evaluate the entire predicate based on the values for the leaf predicates. @@ -65,7 +65,7 @@ namespace orc { std::string toString() const override; - private: + private: std::shared_ptr<ExpressionTree> mExpressionTree; std::vector<PredicateLeaf> mLeaves; }; @@ -75,7 +75,7 @@ namespace orc { * must call startOr, startAnd, or startNot before adding any leaves. 
*/ class SearchArgumentBuilderImpl : public SearchArgumentBuilder { - public: + public: SearchArgumentBuilderImpl(); /** @@ -110,8 +110,7 @@ namespace orc { * @param literal the literal * @return this */ - SearchArgumentBuilder& lessThan(const std::string& column, - PredicateDataType type, + SearchArgumentBuilder& lessThan(const std::string& column, PredicateDataType type, Literal literal) override; /** @@ -121,8 +120,7 @@ namespace orc { * @param literal the literal * @return this */ - SearchArgumentBuilder& lessThan(uint64_t columnId, - PredicateDataType type, + SearchArgumentBuilder& lessThan(uint64_t columnId, PredicateDataType type, Literal literal) override; /** @@ -132,8 +130,7 @@ namespace orc { * @param literal the literal * @return this */ - SearchArgumentBuilder& lessThanEquals(const std::string& column, - PredicateDataType type, + SearchArgumentBuilder& lessThanEquals(const std::string& column, PredicateDataType type, Literal literal) override; /** @@ -143,8 +140,7 @@ namespace orc { * @param literal the literal * @return this */ - SearchArgumentBuilder& lessThanEquals(uint64_t columnId, - PredicateDataType type, + SearchArgumentBuilder& lessThanEquals(uint64_t columnId, PredicateDataType type, Literal literal) override; /** @@ -154,8 +150,7 @@ namespace orc { * @param literal the literal * @return this */ - SearchArgumentBuilder& equals(const std::string& column, - PredicateDataType type, + SearchArgumentBuilder& equals(const std::string& column, PredicateDataType type, Literal literal) override; /** @@ -165,8 +160,7 @@ namespace orc { * @param literal the literal * @return this */ - SearchArgumentBuilder& equals(uint64_t columnId, - PredicateDataType type, + SearchArgumentBuilder& equals(uint64_t columnId, PredicateDataType type, Literal literal) override; /** @@ -176,8 +170,7 @@ namespace orc { * @param literal the literal * @return this */ - SearchArgumentBuilder& nullSafeEquals(const std::string& column, - PredicateDataType type, + 
SearchArgumentBuilder& nullSafeEquals(const std::string& column, PredicateDataType type, Literal literal) override; /** @@ -187,8 +180,7 @@ namespace orc { * @param literal the literal * @return this */ - SearchArgumentBuilder& nullSafeEquals(uint64_t columnId, - PredicateDataType type, + SearchArgumentBuilder& nullSafeEquals(uint64_t columnId, PredicateDataType type, Literal literal) override; /** @@ -198,8 +190,7 @@ namespace orc { * @param literals the literals * @return this */ - SearchArgumentBuilder& in(const std::string& column, - PredicateDataType type, + SearchArgumentBuilder& in(const std::string& column, PredicateDataType type, const std::initializer_list<Literal>& literals) override; /** @@ -209,8 +200,7 @@ namespace orc { * @param literals the literals * @return this */ - SearchArgumentBuilder& in(uint64_t columnId, - PredicateDataType type, + SearchArgumentBuilder& in(uint64_t columnId, PredicateDataType type, const std::initializer_list<Literal>& literals) override; /** @@ -220,8 +210,7 @@ namespace orc { * @param literals the literals * @return this */ - SearchArgumentBuilder& in(const std::string& column, - PredicateDataType type, + SearchArgumentBuilder& in(const std::string& column, PredicateDataType type, const std::vector<Literal>& literals) override; /** @@ -231,8 +220,7 @@ namespace orc { * @param literals the literals * @return this */ - SearchArgumentBuilder& in(uint64_t columnId, - PredicateDataType type, + SearchArgumentBuilder& in(uint64_t columnId, PredicateDataType type, const std::vector<Literal>& literals) override; /** @@ -241,8 +229,7 @@ namespace orc { * @param type the type of the expression * @return this */ - SearchArgumentBuilder& isNull(const std::string& column, - PredicateDataType type) override; + SearchArgumentBuilder& isNull(const std::string& column, PredicateDataType type) override; /** * Add an is null leaf to the current item on the stack. 
@@ -250,8 +237,7 @@ namespace orc { * @param type the type of the expression * @return this */ - SearchArgumentBuilder& isNull(uint64_t columnId, - PredicateDataType type) override; + SearchArgumentBuilder& isNull(uint64_t columnId, PredicateDataType type) override; /** * Add a between leaf to the current item on the stack. @@ -261,9 +247,7 @@ namespace orc { * @param upper the literal * @return this */ - SearchArgumentBuilder& between(const std::string& column, - PredicateDataType type, - Literal lower, + SearchArgumentBuilder& between(const std::string& column, PredicateDataType type, Literal lower, Literal upper) override; /** @@ -274,9 +258,7 @@ namespace orc { * @param upper the literal * @return this */ - SearchArgumentBuilder& between(uint64_t columnId, - PredicateDataType type, - Literal lower, + SearchArgumentBuilder& between(uint64_t columnId, PredicateDataType type, Literal lower, Literal upper) override; /** @@ -293,49 +275,40 @@ namespace orc { */ std::unique_ptr<SearchArgument> build() override; - private: + private: SearchArgumentBuilder& start(ExpressionTree::Operator op); size_t addLeaf(PredicateLeaf leaf); static bool isInvalidColumn(const std::string& column); static bool isInvalidColumn(uint64_t columnId); - template<typename T> - SearchArgumentBuilder& compareOperator(PredicateLeaf::Operator op, - T column, - PredicateDataType type, - Literal literal); + template <typename T> + SearchArgumentBuilder& compareOperator(PredicateLeaf::Operator op, T column, + PredicateDataType type, Literal literal); - template<typename T, typename CONTAINER> - SearchArgumentBuilder& addChildForIn(T column, - PredicateDataType type, + template <typename T, typename CONTAINER> + SearchArgumentBuilder& addChildForIn(T column, PredicateDataType type, const CONTAINER& literals); - template<typename T> - SearchArgumentBuilder& addChildForIsNull(T column, - PredicateDataType type); + template <typename T> + SearchArgumentBuilder& addChildForIsNull(T column, 
PredicateDataType type); - template<typename T> - SearchArgumentBuilder& addChildForBetween(T column, - PredicateDataType type, - Literal lower, + template <typename T> + SearchArgumentBuilder& addChildForBetween(T column, PredicateDataType type, Literal lower, Literal upper); - public: + public: static TreeNode pushDownNot(TreeNode root); static TreeNode foldMaybe(TreeNode expr); static TreeNode flatten(TreeNode root); static TreeNode convertToCNF(TreeNode root); - private: + private: std::deque<TreeNode> mCurrTree; - std::unordered_map<PredicateLeaf, - size_t, - PredicateLeafHash, - PredicateLeafComparator> mLeaves; + std::unordered_map<PredicateLeaf, size_t, PredicateLeafHash, PredicateLeafComparator> mLeaves; std::shared_ptr<ExpressionTree> mRoot; }; -} // namespace orc +} // namespace orc -#endif //ORC_SRC_SEARCHARGUMENT_HH +#endif // ORC_SRC_SEARCHARGUMENT_HH diff --git a/contrib/libs/apache/orc/c++/src/sargs/TruthValue.cc b/contrib/libs/apache/orc/c++/src/sargs/TruthValue.cc index fe00ed9472..4b3eda7e90 100644 --- a/contrib/libs/apache/orc/c++/src/sargs/TruthValue.cc +++ b/contrib/libs/apache/orc/c++/src/sargs/TruthValue.cc @@ -122,4 +122,4 @@ namespace orc { } } -} +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h index 605fbf826c..1373c18924 100644 --- a/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h +++ b/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h @@ -1,15 +1,20 @@ /* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ #ifndef CODED_STREAM_WRAPPER_HH @@ -20,12 +25,12 @@ DIAGNOSTIC_PUSH #ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") - DIAGNOSTIC_IGNORE("-Wreserved-id-macro") +DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") +DIAGNOSTIC_IGNORE("-Wreserved-id-macro") #endif #if defined(__GNUC__) || defined(__clang__) - DIAGNOSTIC_IGNORE("-Wconversion") +DIAGNOSTIC_IGNORE("-Wconversion") #endif #include <google/protobuf/io/coded_stream.h> diff --git a/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh b/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh index 5c161660cc..014c7d6570 100644 --- a/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh +++ b/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh @@ -1,15 +1,20 @@ /* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
*/ #ifndef ORC_PROTO_WRAPPER_HH @@ -20,27 +25,27 @@ DIAGNOSTIC_PUSH #if defined(__GNUC__) || defined(__clang__) - DIAGNOSTIC_IGNORE("-Wconversion") - DIAGNOSTIC_IGNORE("-Wdeprecated") - DIAGNOSTIC_IGNORE("-Wsign-conversion") - DIAGNOSTIC_IGNORE("-Wunused-parameter") +DIAGNOSTIC_IGNORE("-Wconversion") +DIAGNOSTIC_IGNORE("-Wdeprecated") +DIAGNOSTIC_IGNORE("-Wsign-conversion") +DIAGNOSTIC_IGNORE("-Wunused-parameter") #endif #ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wnested-anon-types") - DIAGNOSTIC_IGNORE("-Wreserved-id-macro") - DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") - DIAGNOSTIC_IGNORE("-Wunknown-warning-option") - DIAGNOSTIC_IGNORE("-Wweak-vtables") - DIAGNOSTIC_IGNORE("-Wzero-as-null-pointer-constant") +DIAGNOSTIC_IGNORE("-Wnested-anon-types") +DIAGNOSTIC_IGNORE("-Wreserved-id-macro") +DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") +DIAGNOSTIC_IGNORE("-Wunknown-warning-option") +DIAGNOSTIC_IGNORE("-Wweak-vtables") +DIAGNOSTIC_IGNORE("-Wzero-as-null-pointer-constant") #endif #if defined(_MSC_VER) - DIAGNOSTIC_IGNORE(4146) // unary minus operator applied to unsigned type, result still unsigned - DIAGNOSTIC_IGNORE(4800) // forcing value to bool 'true' or 'false' +DIAGNOSTIC_IGNORE(4146) // unary minus operator applied to unsigned type, result still unsigned +DIAGNOSTIC_IGNORE(4800) // forcing value to bool 'true' or 'false' #endif -#include "contrib/libs/apache/orc/proto/orc_proto.pb.h" +#include "orc_proto.pb.h" DIAGNOSTIC_POP diff --git a/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h index aeab0f0033..18166f7200 100644 --- a/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h +++ b/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h @@ -1,15 +1,20 @@ /* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ #ifndef SNAPPY_WRAPPER_HH @@ -20,7 +25,7 @@ DIAGNOSTIC_PUSH #ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wreserved-id-macro") +DIAGNOSTIC_IGNORE("-Wreserved-id-macro") #endif #include <snappy.h> diff --git a/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h index 1af0bd002d..0a42daaf84 100644 --- a/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h +++ b/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h @@ -1,15 +1,20 @@ /* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
*/ #ifndef ZERO_COPY_STREAM_WRAPPER_HH @@ -20,13 +25,13 @@ DIAGNOSTIC_PUSH #if defined(__GNUC__) || defined(__clang__) - DIAGNOSTIC_IGNORE("-Wdeprecated") - DIAGNOSTIC_IGNORE("-Wpadded") - DIAGNOSTIC_IGNORE("-Wunused-parameter") +DIAGNOSTIC_IGNORE("-Wdeprecated") +DIAGNOSTIC_IGNORE("-Wpadded") +DIAGNOSTIC_IGNORE("-Wunused-parameter") #endif #ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wreserved-id-macro") +DIAGNOSTIC_IGNORE("-Wreserved-id-macro") #endif #include <google/protobuf/io/zero_copy_stream.h> diff --git a/contrib/libs/apache/orc/proto/orc_proto.proto b/contrib/libs/apache/orc/proto/orc_proto.proto deleted file mode 100644 index ff05657a54..0000000000 --- a/contrib/libs/apache/orc/proto/orc_proto.proto +++ /dev/null @@ -1,451 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -syntax = "proto2"; - -package orc.proto; - -option java_package = "org.apache.orc"; - -message IntegerStatistics { - optional sint64 minimum = 1; - optional sint64 maximum = 2; - optional sint64 sum = 3; -} - -message DoubleStatistics { - optional double minimum = 1; - optional double maximum = 2; - optional double sum = 3; -} - -message StringStatistics { - optional string minimum = 1; - optional string maximum = 2; - // sum will store the total length of all strings in a stripe - optional sint64 sum = 3; - // If the minimum or maximum value was longer than 1024 bytes, store a lower or upper - // bound instead of the minimum or maximum values above. - optional string lowerBound = 4; - optional string upperBound = 5; -} - -message BucketStatistics { - repeated uint64 count = 1 [packed=true]; -} - -message DecimalStatistics { - optional string minimum = 1; - optional string maximum = 2; - optional string sum = 3; -} - -message DateStatistics { - // min,max values saved as days since epoch - optional sint32 minimum = 1; - optional sint32 maximum = 2; -} - -message TimestampStatistics { - // min,max values saved as milliseconds since epoch - optional sint64 minimum = 1; - optional sint64 maximum = 2; - optional sint64 minimumUtc = 3; - optional sint64 maximumUtc = 4; - // store the lower 6 TS digits for min/max to achieve nanosecond precision - optional int32 minimumNanos = 5; - optional int32 maximumNanos = 6; -} - -message BinaryStatistics { - // sum will store the total binary blob length in a stripe - optional sint64 sum = 1; -} - -// Statistics for list and map -message CollectionStatistics { - optional uint64 minChildren = 1; - optional uint64 maxChildren = 2; - optional uint64 totalChildren = 3; -} - -message ColumnStatistics { - optional uint64 numberOfValues = 1; - optional IntegerStatistics intStatistics = 2; - optional DoubleStatistics doubleStatistics = 3; - optional StringStatistics stringStatistics = 4; - optional BucketStatistics 
bucketStatistics = 5; - optional DecimalStatistics decimalStatistics = 6; - optional DateStatistics dateStatistics = 7; - optional BinaryStatistics binaryStatistics = 8; - optional TimestampStatistics timestampStatistics = 9; - optional bool hasNull = 10; - optional uint64 bytesOnDisk = 11; - optional CollectionStatistics collectionStatistics = 12; -} - -message RowIndexEntry { - repeated uint64 positions = 1 [packed=true]; - optional ColumnStatistics statistics = 2; -} - -message RowIndex { - repeated RowIndexEntry entry = 1; -} - -message BloomFilter { - optional uint32 numHashFunctions = 1; - repeated fixed64 bitset = 2; - optional bytes utf8bitset = 3; -} - -message BloomFilterIndex { - repeated BloomFilter bloomFilter = 1; -} - -message Stream { - // if you add new index stream kinds, you need to make sure to update - // StreamName to ensure it is added to the stripe in the right area - enum Kind { - PRESENT = 0; - DATA = 1; - LENGTH = 2; - DICTIONARY_DATA = 3; - DICTIONARY_COUNT = 4; - SECONDARY = 5; - ROW_INDEX = 6; - BLOOM_FILTER = 7; - BLOOM_FILTER_UTF8 = 8; - // Virtual stream kinds to allocate space for encrypted index and data. - ENCRYPTED_INDEX = 9; - ENCRYPTED_DATA = 10; - - // stripe statistics streams - STRIPE_STATISTICS = 100; - // A virtual stream kind that is used for setting the encryption IV. 
- FILE_STATISTICS = 101; - } - optional Kind kind = 1; - optional uint32 column = 2; - optional uint64 length = 3; -} - -message ColumnEncoding { - enum Kind { - DIRECT = 0; - DICTIONARY = 1; - DIRECT_V2 = 2; - DICTIONARY_V2 = 3; - } - optional Kind kind = 1; - optional uint32 dictionarySize = 2; - - // The encoding of the bloom filters for this column: - // 0 or missing = none or original - // 1 = ORC-135 (utc for timestamps) - optional uint32 bloomEncoding = 3; -} - -message StripeEncryptionVariant { - repeated Stream streams = 1; - repeated ColumnEncoding encoding = 2; -} - -// each stripe looks like: -// index streams -// unencrypted -// variant 1..N -// data streams -// unencrypted -// variant 1..N -// footer - -message StripeFooter { - repeated Stream streams = 1; - repeated ColumnEncoding columns = 2; - optional string writerTimezone = 3; - // one for each column encryption variant - repeated StripeEncryptionVariant encryption = 4; -} - -// the file tail looks like: -// encrypted stripe statistics: ColumnarStripeStatistics (order by variant) -// stripe statistics: Metadata -// footer: Footer -// postscript: PostScript -// psLen: byte - -message StringPair { - optional string key = 1; - optional string value = 2; -} - -message Type { - enum Kind { - BOOLEAN = 0; - BYTE = 1; - SHORT = 2; - INT = 3; - LONG = 4; - FLOAT = 5; - DOUBLE = 6; - STRING = 7; - BINARY = 8; - TIMESTAMP = 9; - LIST = 10; - MAP = 11; - STRUCT = 12; - UNION = 13; - DECIMAL = 14; - DATE = 15; - VARCHAR = 16; - CHAR = 17; - TIMESTAMP_INSTANT = 18; - } - optional Kind kind = 1; - repeated uint32 subtypes = 2 [packed=true]; - repeated string fieldNames = 3; - optional uint32 maximumLength = 4; - optional uint32 precision = 5; - optional uint32 scale = 6; - repeated StringPair attributes = 7; -} - -message StripeInformation { - // the global file offset of the start of the stripe - optional uint64 offset = 1; - // the number of bytes of index - optional uint64 indexLength = 2; - // the number 
of bytes of data - optional uint64 dataLength = 3; - // the number of bytes in the stripe footer - optional uint64 footerLength = 4; - // the number of rows in this stripe - optional uint64 numberOfRows = 5; - // If this is present, the reader should use this value for the encryption - // stripe id for setting the encryption IV. Otherwise, the reader should - // use one larger than the previous stripe's encryptStripeId. - // For unmerged ORC files, the first stripe will use 1 and the rest of the - // stripes won't have it set. For merged files, the stripe information - // will be copied from their original files and thus the first stripe of - // each of the input files will reset it to 1. - // Note that 1 was choosen, because protobuf v3 doesn't serialize - // primitive types that are the default (eg. 0). - optional uint64 encryptStripeId = 6; - // For each encryption variant, the new encrypted local key to use - // until we find a replacement. - repeated bytes encryptedLocalKeys = 7; -} - -message UserMetadataItem { - optional string name = 1; - optional bytes value = 2; -} - -// StripeStatistics (1 per a stripe), which each contain the -// ColumnStatistics for each column. -// This message type is only used in ORC v0 and v1. -message StripeStatistics { - repeated ColumnStatistics colStats = 1; -} - -// This message type is only used in ORC v0 and v1. -message Metadata { - repeated StripeStatistics stripeStats = 1; -} - -// In ORC v2 (and for encrypted columns in v1), each column has -// their column statistics written separately. -message ColumnarStripeStatistics { - // one value for each stripe in the file - repeated ColumnStatistics colStats = 1; -} - -enum EncryptionAlgorithm { - UNKNOWN_ENCRYPTION = 0; // used for detecting future algorithms - AES_CTR_128 = 1; - AES_CTR_256 = 2; -} - -message FileStatistics { - repeated ColumnStatistics column = 1; -} - -// How was the data masked? 
This isn't necessary for reading the file, but -// is documentation about how the file was written. -message DataMask { - // the kind of masking, which may include third party masks - optional string name = 1; - // parameters for the mask - repeated string maskParameters = 2; - // the unencrypted column roots this mask was applied to - repeated uint32 columns = 3 [packed = true]; -} - -// Information about the encryption keys. -message EncryptionKey { - optional string keyName = 1; - optional uint32 keyVersion = 2; - optional EncryptionAlgorithm algorithm = 3; -} - -// The description of an encryption variant. -// Each variant is a single subtype that is encrypted with a single key. -message EncryptionVariant { - // the column id of the root - optional uint32 root = 1; - // The master key that was used to encrypt the local key, referenced as - // an index into the Encryption.key list. - optional uint32 key = 2; - // the encrypted key for the file footer - optional bytes encryptedKey = 3; - // the stripe statistics for this variant - repeated Stream stripeStatistics = 4; - // encrypted file statistics as a FileStatistics - optional bytes fileStatistics = 5; -} - -// Which KeyProvider encrypted the local keys. -enum KeyProviderKind { - UNKNOWN = 0; - HADOOP = 1; - AWS = 2; - GCP = 3; - AZURE = 4; -} - -message Encryption { - // all of the masks used in this file - repeated DataMask mask = 1; - // all of the keys used in this file - repeated EncryptionKey key = 2; - // The encrypted variants. - // Readers should prefer the first variant that the user has access to - // the corresponding key. If they don't have access to any of the keys, - // they should get the unencrypted masked data. - repeated EncryptionVariant variants = 3; - // How are the local keys encrypted? - optional KeyProviderKind keyProvider = 4; -} - -enum CalendarKind { - UNKNOWN_CALENDAR = 0; - // A hybrid Julian/Gregorian calendar with a cutover point in October 1582. 
- JULIAN_GREGORIAN = 1; - // A calendar that extends the Gregorian calendar back forever. - PROLEPTIC_GREGORIAN = 2; -} - -message Footer { - optional uint64 headerLength = 1; - optional uint64 contentLength = 2; - repeated StripeInformation stripes = 3; - repeated Type types = 4; - repeated UserMetadataItem metadata = 5; - optional uint64 numberOfRows = 6; - repeated ColumnStatistics statistics = 7; - optional uint32 rowIndexStride = 8; - - // Each implementation that writes ORC files should register for a code - // 0 = ORC Java - // 1 = ORC C++ - // 2 = Presto - // 3 = Scritchley Go from https://github.com/scritchley/orc - // 4 = Trino - optional uint32 writer = 9; - - // information about the encryption in this file - optional Encryption encryption = 10; - optional CalendarKind calendar = 11; - - // informative description about the version of the software that wrote - // the file. It is assumed to be within a given writer, so for example - // ORC 1.7.2 = "1.7.2". It may include suffixes, such as "-SNAPSHOT". - optional string softwareVersion = 12; -} - -enum CompressionKind { - NONE = 0; - ZLIB = 1; - SNAPPY = 2; - LZO = 3; - LZ4 = 4; - ZSTD = 5; -} - -// Serialized length must be less that 255 bytes -message PostScript { - optional uint64 footerLength = 1; - optional CompressionKind compression = 2; - optional uint64 compressionBlockSize = 3; - // the version of the file format - // [0, 11] = Hive 0.11 - // [0, 12] = Hive 0.12 - repeated uint32 version = 4 [packed = true]; - optional uint64 metadataLength = 5; - - // The version of the writer that wrote the file. This number is - // updated when we make fixes or large changes to the writer so that - // readers can detect whether a given bug is present in the data. - // - // Only the Java ORC writer may use values under 6 (or missing) so that - // readers that predate ORC-202 treat the new writers correctly. Each - // writer should assign their own sequence of versions starting from 6. 
- // - // Version of the ORC Java writer: - // 0 = original - // 1 = HIVE-8732 fixed (fixed stripe/file maximum statistics & - // string statistics use utf8 for min/max) - // 2 = HIVE-4243 fixed (use real column names from Hive tables) - // 3 = HIVE-12055 added (vectorized writer implementation) - // 4 = HIVE-13083 fixed (decimals write present stream correctly) - // 5 = ORC-101 fixed (bloom filters use utf8 consistently) - // 6 = ORC-135 fixed (timestamp statistics use utc) - // 7 = ORC-517 fixed (decimal64 min/max incorrect) - // 8 = ORC-203 added (trim very long string statistics) - // 9 = ORC-14 added (column encryption) - // - // Version of the ORC C++ writer: - // 6 = original - // - // Version of the Presto writer: - // 6 = original - // - // Version of the Scritchley Go writer: - // 6 = original - // - // Version of the Trino writer: - // 6 = original - // - optional uint32 writerVersion = 6; - - // the number of bytes in the encrypted stripe statistics - optional uint64 stripeStatisticsLength = 7; - - // Leave this last in the record - optional string magic = 8000; -} - -// The contents of the file tail that must be serialized. -// This gets serialized as part of OrcSplit, also used by footer cache. 
-message FileTail { - optional PostScript postscript = 1; - optional Footer footer = 2; - optional uint64 fileLength = 3; - optional uint64 postscriptLength = 4; -} diff --git a/contrib/libs/apache/orc/ya.make b/contrib/libs/apache/orc/ya.make index be3b4d5a01..ec4d745340 100644 --- a/contrib/libs/apache/orc/ya.make +++ b/contrib/libs/apache/orc/ya.make @@ -6,11 +6,12 @@ LICENSE(Apache-2.0) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) -VERSION(1.8.0) +VERSION(2.0.0) -ORIGINAL_SOURCE(https://github.com/apache/orc/archive/rel/release-1.8.0.tar.gz) +ORIGINAL_SOURCE(https://github.com/apache/orc/archive/rel/release-2.0.0.tar.gz) PEERDIR( + contrib/libs/apache/orc-format contrib/libs/lz4 contrib/libs/snappy contrib/libs/zlib @@ -20,7 +21,6 @@ PEERDIR( ADDINCL( GLOBAL contrib/libs/apache/orc/c++/include contrib/libs/apache/orc/c++/src - contrib/libs/apache/orc/proto contrib/libs/lz4 contrib/libs/zstd/include ) @@ -29,15 +29,23 @@ NO_COMPILER_WARNINGS() NO_UTIL() +CFLAGS( + -DENABLE_METRICS=0 +) + SRCS( c++/src/Adaptor.cc + c++/src/BlockBuffer.cc c++/src/BloomFilter.cc + c++/src/BpackingDefault.cc c++/src/ByteRLE.cc c++/src/ColumnPrinter.cc c++/src/ColumnReader.cc c++/src/ColumnWriter.cc c++/src/Common.cc c++/src/Compression.cc + c++/src/ConvertColumnReader.cc + c++/src/CpuInfoUtil.cc c++/src/Exceptions.cc c++/src/Int128.cc c++/src/LzoDecompressor.cc @@ -50,6 +58,7 @@ SRCS( c++/src/Reader.cc c++/src/RleDecoderV2.cc c++/src/RleEncoderV2.cc + c++/src/SchemaEvolution.cc c++/src/Statistics.cc c++/src/StripeStream.cc c++/src/Timezone.cc @@ -64,7 +73,6 @@ SRCS( c++/src/sargs/SargsApplier.cc c++/src/sargs/SearchArgument.cc c++/src/sargs/TruthValue.cc - proto/orc_proto.proto ) END() |