path: root/contrib/libs/apache/orc/c++/src
author    iaz1607 <iaz1607@yandex-team.ru>  2022-02-10 16:45:37 +0300
committer Daniil Cherednik <dcherednik@yandex-team.ru>  2022-02-10 16:45:37 +0300
commit    e5437feb4ac2d2dc044e1090b9312dde5ef197e0 (patch)
tree      f5a238c69dd20a1fa2092127a31b8aff25020f7d /contrib/libs/apache/orc/c++/src
parent    f4945d0a44b8770f0801de3056aa41639b0b7bd2 (diff)
download  ydb-e5437feb4ac2d2dc044e1090b9312dde5ef197e0.tar.gz
Restoring authorship annotation for <iaz1607@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/apache/orc/c++/src')
-rw-r--r--  contrib/libs/apache/orc/c++/src/Adaptor.cc  176
-rw-r--r--  contrib/libs/apache/orc/c++/src/Adaptor.hh  344
-rw-r--r--  contrib/libs/apache/orc/c++/src/BloomFilter.cc  632
-rw-r--r--  contrib/libs/apache/orc/c++/src/BloomFilter.hh  388
-rw-r--r--  contrib/libs/apache/orc/c++/src/ByteRLE.cc  1252
-rw-r--r--  contrib/libs/apache/orc/c++/src/ByteRLE.hh  234
-rw-r--r--  contrib/libs/apache/orc/c++/src/ColumnPrinter.cc  1494
-rw-r--r--  contrib/libs/apache/orc/c++/src/ColumnReader.cc  3670
-rw-r--r--  contrib/libs/apache/orc/c++/src/ColumnReader.hh  312
-rw-r--r--  contrib/libs/apache/orc/c++/src/ColumnWriter.cc  6024
-rw-r--r--  contrib/libs/apache/orc/c++/src/ColumnWriter.hh  442
-rw-r--r--  contrib/libs/apache/orc/c++/src/Common.cc  244
-rw-r--r--  contrib/libs/apache/orc/c++/src/Compression.cc  2298
-rw-r--r--  contrib/libs/apache/orc/c++/src/Compression.hh  116
-rw-r--r--  contrib/libs/apache/orc/c++/src/Exceptions.cc  156
-rw-r--r--  contrib/libs/apache/orc/c++/src/Int128.cc  988
-rw-r--r--  contrib/libs/apache/orc/c++/src/LzoDecompressor.cc  782
-rw-r--r--  contrib/libs/apache/orc/c++/src/LzoDecompressor.hh  84
-rw-r--r--  contrib/libs/apache/orc/c++/src/MemoryPool.cc  488
-rw-r--r--  contrib/libs/apache/orc/c++/src/Murmur3.cc  196
-rw-r--r--  contrib/libs/apache/orc/c++/src/Murmur3.hh  80
-rw-r--r--  contrib/libs/apache/orc/c++/src/Options.hh  516
-rw-r--r--  contrib/libs/apache/orc/c++/src/OrcFile.cc  364
-rw-r--r--  contrib/libs/apache/orc/c++/src/RLE.cc  242
-rw-r--r--  contrib/libs/apache/orc/c++/src/RLE.hh  310
-rw-r--r--  contrib/libs/apache/orc/c++/src/RLEV2Util.cc  140
-rw-r--r--  contrib/libs/apache/orc/c++/src/RLEV2Util.hh  162
-rw-r--r--  contrib/libs/apache/orc/c++/src/RLEv1.cc  604
-rw-r--r--  contrib/libs/apache/orc/c++/src/RLEv1.hh  182
-rw-r--r--  contrib/libs/apache/orc/c++/src/RLEv2.hh  502
-rw-r--r--  contrib/libs/apache/orc/c++/src/Reader.cc  2420
-rw-r--r--  contrib/libs/apache/orc/c++/src/Reader.hh  610
-rw-r--r--  contrib/libs/apache/orc/c++/src/RleDecoderV2.cc  852
-rw-r--r--  contrib/libs/apache/orc/c++/src/RleEncoderV2.cc  1542
-rw-r--r--  contrib/libs/apache/orc/c++/src/Statistics.cc  816
-rw-r--r--  contrib/libs/apache/orc/c++/src/Statistics.hh  2906
-rw-r--r--  contrib/libs/apache/orc/c++/src/StripeStream.cc  322
-rw-r--r--  contrib/libs/apache/orc/c++/src/StripeStream.hh  426
-rw-r--r--  contrib/libs/apache/orc/c++/src/Timezone.cc  1872
-rw-r--r--  contrib/libs/apache/orc/c++/src/Timezone.hh  260
-rw-r--r--  contrib/libs/apache/orc/c++/src/TypeImpl.cc  1414
-rw-r--r--  contrib/libs/apache/orc/c++/src/TypeImpl.hh  396
-rw-r--r--  contrib/libs/apache/orc/c++/src/Vector.cc  1036
-rw-r--r--  contrib/libs/apache/orc/c++/src/Writer.cc  1280
-rw-r--r--  contrib/libs/apache/orc/c++/src/io/InputStream.cc  442
-rw-r--r--  contrib/libs/apache/orc/c++/src/io/InputStream.hh  230
-rw-r--r--  contrib/libs/apache/orc/c++/src/io/OutputStream.cc  292
-rw-r--r--  contrib/libs/apache/orc/c++/src/io/OutputStream.hh  190
-rw-r--r--  contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h  68
-rw-r--r--  contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh  92
-rw-r--r--  contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h  60
-rw-r--r--  contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h  70
52 files changed, 20509 insertions, 20509 deletions
diff --git a/contrib/libs/apache/orc/c++/src/Adaptor.cc b/contrib/libs/apache/orc/c++/src/Adaptor.cc
index bf3a3e181b..f402d65adf 100644
--- a/contrib/libs/apache/orc/c++/src/Adaptor.cc
+++ b/contrib/libs/apache/orc/c++/src/Adaptor.cc
@@ -1,88 +1,88 @@
-/**
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include "Adaptor.hh"
-#include <sstream>
-#include <iomanip>
-
-#ifndef HAS_STOLL
-namespace std {
- int64_t std::stoll(std::string str) {
- int64_t val = 0;
- stringstream ss;
- ss << str;
- ss >> val;
- return val;
- }
-}
-#endif
-
-#ifndef HAS_STRPTIME
-char* strptime(const char* s, const char* f, struct tm* tm) {
- std::istringstream input(s);
- input.imbue(std::locale(setlocale(LC_ALL, nullptr)));
- input >> std::get_time(tm, f);
- if (input.fail()) return nullptr;
- return (char*)(s + input.tellg());
-}
-#endif
-
-#ifndef HAS_PREAD
- #ifdef _WIN32
-#include <Windows.h>
-#include <io.h>
-ssize_t pread(int fd, void* buf, size_t size, off_t offset) {
- auto handle = reinterpret_cast<HANDLE>(_get_osfhandle(fd));
-
- OVERLAPPED ol;
- memset(&ol, 0, sizeof(OVERLAPPED));
- ol.Offset = offset;
-
- DWORD rt;
- if (!ReadFile(handle, buf, static_cast<DWORD>(size), &rt, &ol)) {
- errno = GetLastError();
- return -1;
- }
- return static_cast<ssize_t>(rt);
-}
- #else
- #error("pread() undefined: unknown environment")
- #endif
-#endif
-
-namespace orc {
-#ifdef HAS_DOUBLE_TO_STRING
- std::string to_string(double val) {
- return std::to_string(val);
- }
-#else
- std::string to_string(double val) {
- return std::to_string(static_cast<long double>(val));
- }
-#endif
-
-#ifdef HAS_INT64_TO_STRING
- std::string to_string(int64_t val) {
- return std::to_string(val);
- }
-#else
- std::string to_string(int64_t val) {
- return std::to_string(static_cast<long long int>(val));
- }
-#endif
-}
+/**
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include "Adaptor.hh"
+#include <sstream>
+#include <iomanip>
+
+#ifndef HAS_STOLL
+namespace std {
+ int64_t std::stoll(std::string str) {
+ int64_t val = 0;
+ stringstream ss;
+ ss << str;
+ ss >> val;
+ return val;
+ }
+}
+#endif
+
+#ifndef HAS_STRPTIME
+char* strptime(const char* s, const char* f, struct tm* tm) {
+ std::istringstream input(s);
+ input.imbue(std::locale(setlocale(LC_ALL, nullptr)));
+ input >> std::get_time(tm, f);
+ if (input.fail()) return nullptr;
+ return (char*)(s + input.tellg());
+}
+#endif
+
+#ifndef HAS_PREAD
+ #ifdef _WIN32
+#include <Windows.h>
+#include <io.h>
+ssize_t pread(int fd, void* buf, size_t size, off_t offset) {
+ auto handle = reinterpret_cast<HANDLE>(_get_osfhandle(fd));
+
+ OVERLAPPED ol;
+ memset(&ol, 0, sizeof(OVERLAPPED));
+ ol.Offset = offset;
+
+ DWORD rt;
+ if (!ReadFile(handle, buf, static_cast<DWORD>(size), &rt, &ol)) {
+ errno = GetLastError();
+ return -1;
+ }
+ return static_cast<ssize_t>(rt);
+}
+ #else
+ #error("pread() undefined: unknown environment")
+ #endif
+#endif
+
+namespace orc {
+#ifdef HAS_DOUBLE_TO_STRING
+ std::string to_string(double val) {
+ return std::to_string(val);
+ }
+#else
+ std::string to_string(double val) {
+ return std::to_string(static_cast<long double>(val));
+ }
+#endif
+
+#ifdef HAS_INT64_TO_STRING
+ std::string to_string(int64_t val) {
+ return std::to_string(val);
+ }
+#else
+ std::string to_string(int64_t val) {
+ return std::to_string(static_cast<long long int>(val));
+ }
+#endif
+}
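Note (not part of the diff): Adaptor.cc above falls back to a std::get_time-based strptime only when HAS_STRPTIME is not defined. The following minimal standalone sketch shows the behaviour that fallback is expected to reproduce; the date string and format are illustrative, and on POSIX systems the libc strptime would normally be used instead.

// Illustrative sketch of the std::get_time parsing used by the strptime
// fallback in Adaptor.cc; values are examples only.
#include <ctime>
#include <iomanip>
#include <iostream>
#include <sstream>

int main() {
  std::tm tm = {};
  std::istringstream input("2022-02-10 16:45:37");
  // Equivalent parse to strptime(s, "%Y-%m-%d %H:%M:%S", &tm)
  input >> std::get_time(&tm, "%Y-%m-%d %H:%M:%S");
  if (!input.fail()) {
    std::cout << "year=" << tm.tm_year + 1900
              << " month=" << tm.tm_mon + 1
              << " day=" << tm.tm_mday << "\n";
  }
  return 0;
}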
diff --git a/contrib/libs/apache/orc/c++/src/Adaptor.hh b/contrib/libs/apache/orc/c++/src/Adaptor.hh
index a91b9c894d..2d6be71faa 100644
--- a/contrib/libs/apache/orc/c++/src/Adaptor.hh
+++ b/contrib/libs/apache/orc/c++/src/Adaptor.hh
@@ -1,175 +1,175 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ADAPTER_HH
-#define ADAPTER_HH
-
-/* #undef INT64_IS_LL */
-#define HAS_CONSTEXPR
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ADAPTER_HH
+#define ADAPTER_HH
+
+/* #undef INT64_IS_LL */
+#define HAS_CONSTEXPR
#ifndef _MSC_VER
-#define HAS_PREAD
-#endif
-#define HAS_STRPTIME
-#define HAS_STOLL
-#define HAS_DIAGNOSTIC_PUSH
-#define HAS_DOUBLE_TO_STRING
-#define HAS_INT64_TO_STRING
-#define HAS_PRE_1970
+#define HAS_PREAD
+#endif
+#define HAS_STRPTIME
+#define HAS_STOLL
+#define HAS_DIAGNOSTIC_PUSH
+#define HAS_DOUBLE_TO_STRING
+#define HAS_INT64_TO_STRING
+#define HAS_PRE_1970
#define HAS_POST_2038
-#define HAS_STD_ISNAN
-#define HAS_STD_MUTEX
+#define HAS_STD_ISNAN
+#define HAS_STD_MUTEX
#ifndef _MSC_VER
#define HAS_BUILTIN_OVERFLOW_CHECK
#endif
-/* #undef NEEDS_REDUNDANT_MOVE */
-/* #undef NEEDS_Z_PREFIX */
-
-#include "orc/orc-config.hh"
-#include <string>
-
-#ifdef _MSC_VER
-#include <BaseTsd.h>
-typedef SSIZE_T ssize_t;
-#define timegm(tm) _mkgmtime(tm)
-#define gmtime_r(timep, result) (gmtime_s(result, timep) ? NULL : result)
-#define asctime_r(tm, buf) (asctime_s(buf, 26, tm) ? NULL : buf)
-#endif
-
-#ifndef HAS_STOLL
- // A poor man's stoll that converts str to a long long int base 10
- namespace std {
- int64_t stoll(std::string str);
- }
-#endif
-
-#ifndef HAS_STRPTIME
- char* strptime(const char* buf, const char* format, struct tm* tm);
-#endif
-
-#ifndef HAS_PREAD
- ssize_t pread(int fd, void* buf, size_t count, off_t offset);
-#endif
-
-#ifdef INT64_IS_LL
- #define INT64_FORMAT_STRING "ll"
-#else
- #define INT64_FORMAT_STRING "l"
-#endif
-
-#ifndef ORC_CXX_HAS_NOEXCEPT
- #define noexcept ORC_NOEXCEPT
-#endif
-
-#ifndef ORC_CXX_HAS_OVERRIDE
- #define override ORC_OVERRIDE
-#endif
-
-#ifdef HAS_DIAGNOSTIC_PUSH
- #ifdef __clang__
- #define DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
- #define DIAGNOSTIC_POP _Pragma("clang diagnostic pop")
- #elif defined(__GNUC__)
- #define DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push")
- #define DIAGNOSTIC_POP _Pragma("GCC diagnostic pop")
- #elif defined(_MSC_VER)
- #define DIAGNOSTIC_PUSH __pragma(warning(push))
- #define DIAGNOSTIC_POP __pragma(warning(pop))
- #else
- #error("Unknown compiler")
- #endif
-#else
- #define DIAGNOSTIC_PUSH
- #define DIAGNOSTIC_POP
-#endif
-
-#define PRAGMA(TXT) _Pragma(#TXT)
-
- #define DIAGNOSTIC_IGNORE(XXX)
-
-#ifndef ORC_CXX_HAS_UNIQUE_PTR
- #define unique_ptr auto_ptr
-#endif
-
-#ifndef UINT32_MAX
- #define UINT32_MAX 0xffffffff
-#endif
-
-#ifndef INT64_MAX
- #define INT64_MAX 0x7fffffffffffffff
-#endif
-
-#ifndef INT64_MIN
- #define INT64_MIN (-0x7fffffffffffffff - 1)
-#endif
-
-#define GTEST_LANG_CXX11 0
-
-#ifdef NEEDS_REDUNDANT_MOVE
- #define REDUNDANT_MOVE(XXX) std::move(XXX)
-#else
- #define REDUNDANT_MOVE(XXX) XXX
-#endif
-
-#ifndef HAS_STD_ISNAN
- #include <math.h>
- #define std::isnan(XXX) isnan(XXX)
-#else
- #include <cmath>
-#endif
-
-#ifndef HAS_STD_MUTEX
- #include <pthread.h>
- namespace orc {
- /**
- * Lock guard for pthread_mutex_t object using RAII
- * The Lock is automatically release when exiting current scope.
- */
- class LockORC {
- public:
- explicit LockORC(pthread_mutex_t& mutex) : mutex_ref_(mutex) {
- pthread_mutex_lock(&mutex_ref_);
- }
- ~LockORC() { pthread_mutex_unlock(&mutex_ref_); }
- private:
- // no default constructor
- LockORC();
- // prohibit copying
- LockORC(const LockORC&);
- LockORC& operator=(const LockORC&);
-
- pthread_mutex_t& mutex_ref_;
- };
- }
- #define std::mutex pthread_mutex_t
- #define std::lock_guard<std::mutex> LockORC
-#else
- #include <mutex>
-#endif
-
-#ifdef NEEDS_Z_PREFIX
-#define Z_PREFIX 1
-#endif
-
-namespace orc {
- std::string to_string(double val);
- std::string to_string(int64_t val);
-}
-
+/* #undef NEEDS_REDUNDANT_MOVE */
+/* #undef NEEDS_Z_PREFIX */
+
+#include "orc/orc-config.hh"
+#include <string>
+
+#ifdef _MSC_VER
+#include <BaseTsd.h>
+typedef SSIZE_T ssize_t;
+#define timegm(tm) _mkgmtime(tm)
+#define gmtime_r(timep, result) (gmtime_s(result, timep) ? NULL : result)
+#define asctime_r(tm, buf) (asctime_s(buf, 26, tm) ? NULL : buf)
+#endif
+
+#ifndef HAS_STOLL
+ // A poor man's stoll that converts str to a long long int base 10
+ namespace std {
+ int64_t stoll(std::string str);
+ }
+#endif
+
+#ifndef HAS_STRPTIME
+ char* strptime(const char* buf, const char* format, struct tm* tm);
+#endif
+
+#ifndef HAS_PREAD
+ ssize_t pread(int fd, void* buf, size_t count, off_t offset);
+#endif
+
+#ifdef INT64_IS_LL
+ #define INT64_FORMAT_STRING "ll"
+#else
+ #define INT64_FORMAT_STRING "l"
+#endif
+
+#ifndef ORC_CXX_HAS_NOEXCEPT
+ #define noexcept ORC_NOEXCEPT
+#endif
+
+#ifndef ORC_CXX_HAS_OVERRIDE
+ #define override ORC_OVERRIDE
+#endif
+
+#ifdef HAS_DIAGNOSTIC_PUSH
+ #ifdef __clang__
+ #define DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
+ #define DIAGNOSTIC_POP _Pragma("clang diagnostic pop")
+ #elif defined(__GNUC__)
+ #define DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push")
+ #define DIAGNOSTIC_POP _Pragma("GCC diagnostic pop")
+ #elif defined(_MSC_VER)
+ #define DIAGNOSTIC_PUSH __pragma(warning(push))
+ #define DIAGNOSTIC_POP __pragma(warning(pop))
+ #else
+ #error("Unknown compiler")
+ #endif
+#else
+ #define DIAGNOSTIC_PUSH
+ #define DIAGNOSTIC_POP
+#endif
+
+#define PRAGMA(TXT) _Pragma(#TXT)
+
+ #define DIAGNOSTIC_IGNORE(XXX)
+
+#ifndef ORC_CXX_HAS_UNIQUE_PTR
+ #define unique_ptr auto_ptr
+#endif
+
+#ifndef UINT32_MAX
+ #define UINT32_MAX 0xffffffff
+#endif
+
+#ifndef INT64_MAX
+ #define INT64_MAX 0x7fffffffffffffff
+#endif
+
+#ifndef INT64_MIN
+ #define INT64_MIN (-0x7fffffffffffffff - 1)
+#endif
+
+#define GTEST_LANG_CXX11 0
+
+#ifdef NEEDS_REDUNDANT_MOVE
+ #define REDUNDANT_MOVE(XXX) std::move(XXX)
+#else
+ #define REDUNDANT_MOVE(XXX) XXX
+#endif
+
+#ifndef HAS_STD_ISNAN
+ #include <math.h>
+ #define std::isnan(XXX) isnan(XXX)
+#else
+ #include <cmath>
+#endif
+
+#ifndef HAS_STD_MUTEX
+ #include <pthread.h>
+ namespace orc {
+ /**
+ * Lock guard for pthread_mutex_t object using RAII
+ * The Lock is automatically release when exiting current scope.
+ */
+ class LockORC {
+ public:
+ explicit LockORC(pthread_mutex_t& mutex) : mutex_ref_(mutex) {
+ pthread_mutex_lock(&mutex_ref_);
+ }
+ ~LockORC() { pthread_mutex_unlock(&mutex_ref_); }
+ private:
+ // no default constructor
+ LockORC();
+ // prohibit copying
+ LockORC(const LockORC&);
+ LockORC& operator=(const LockORC&);
+
+ pthread_mutex_t& mutex_ref_;
+ };
+ }
+ #define std::mutex pthread_mutex_t
+ #define std::lock_guard<std::mutex> LockORC
+#else
+ #include <mutex>
+#endif
+
+#ifdef NEEDS_Z_PREFIX
+#define Z_PREFIX 1
+#endif
+
+namespace orc {
+ std::string to_string(double val);
+ std::string to_string(int64_t val);
+}
+
#ifdef HAS_BUILTIN_OVERFLOW_CHECK
#define multiplyExact !__builtin_mul_overflow
#define addExact !__builtin_add_overflow
@@ -204,8 +204,8 @@ namespace orc {
}
#endif
-#ifndef HAS_CONSTEXPR
-#define constexpr const
-#endif
-
-#endif /* ADAPTER_HH */
+#ifndef HAS_CONSTEXPR
+#define constexpr const
+#endif
+
+#endif /* ADAPTER_HH */
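Note (not part of the diff): the tail of the Adaptor.hh hunk defines multiplyExact and addExact on top of the GCC/Clang overflow builtins when HAS_BUILTIN_OVERFLOW_CHECK is set. A minimal sketch of how such macros behave, assuming a GCC or Clang compiler; the macro definitions are repeated here only so the snippet is self-contained.

// Sketch: multiplyExact/addExact evaluate to true only when the operation
// does not overflow the result type (here int64_t). Assumes GCC/Clang.
#include <cstdint>
#include <cstdio>

#define multiplyExact !__builtin_mul_overflow
#define addExact !__builtin_add_overflow

int main() {
  int64_t result = 0;
  bool ok = multiplyExact(INT64_MAX, int64_t{2}, &result);
  std::printf("multiply overflowed: %s\n", ok ? "no" : "yes");  // "yes"
  ok = addExact(int64_t{40}, int64_t{2}, &result);
  std::printf("40 + 2 = %lld (ok=%d)\n",
              static_cast<long long>(result), static_cast<int>(ok));
  return 0;
}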
diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.cc b/contrib/libs/apache/orc/c++/src/BloomFilter.cc
index 8a1f1880e7..8ec0acda8c 100644
--- a/contrib/libs/apache/orc/c++/src/BloomFilter.cc
+++ b/contrib/libs/apache/orc/c++/src/BloomFilter.cc
@@ -1,328 +1,328 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "BloomFilter.hh"
-#include "Murmur3.hh"
-
-namespace orc {
-
- constexpr uint64_t BITS_OF_LONG = 64;
- constexpr uint8_t SHIFT_6_BITS = 6;
- constexpr uint8_t SHIFT_3_BITS = 3;
-
- static bool isLittleEndian() {
- static union { uint32_t i; char c[4]; } num = { 0x01020304 };
- return num.c[0] == 4;
- }
-
- /**
- * Implementation of BitSet
- */
- BitSet::BitSet(uint64_t numBits) {
- mData.resize(static_cast<size_t>(ceil(
- static_cast<double>(numBits) / BITS_OF_LONG)), 0);
- }
-
- BitSet::BitSet(const uint64_t * bits, uint64_t numBits) {
- // caller should make sure numBits is multiple of 64
- mData.resize(numBits >> SHIFT_6_BITS, 0);
- memcpy(mData.data(), bits, numBits >> SHIFT_3_BITS);
- }
-
- void BitSet::set(uint64_t index) {
- mData[index >> SHIFT_6_BITS] |= (1ULL << (index % BITS_OF_LONG));
- }
-
- bool BitSet::get(uint64_t index) {
- return (mData[index >> SHIFT_6_BITS] & (1ULL << (index % BITS_OF_LONG))) != 0;
- }
-
- uint64_t BitSet::bitSize() {
- return mData.size() << SHIFT_6_BITS;
- }
-
- void BitSet::merge(const BitSet& other) {
- if (mData.size() != other.mData.size()) {
- std::stringstream ss;
- ss << "BitSet must be of equal length ("
- << mData.size() << " != " << other.mData.size() << ")";
- throw std::logic_error(ss.str());
- }
-
- for (size_t i = 0; i != mData.size(); i++) {
- mData[i] |= other.mData[i];
- }
- }
-
- void BitSet::clear() {
- memset(mData.data(), 0, sizeof(uint64_t) * mData.size());
- }
-
- const uint64_t * BitSet::getData() const {
- return mData.data();
- }
-
- bool BitSet::operator==(const BitSet& other) const {
- return mData == other.mData;
- }
-
- /**
- * Helper functions
- */
- void checkArgument(bool expression, const std::string& message) {
- if (!expression) {
- throw std::logic_error(message);
- }
- }
-
- int32_t optimalNumOfHashFunctions(uint64_t expectedEntries, uint64_t numBits) {
- double n = static_cast<double>(expectedEntries);
- return std::max<int32_t>(1, static_cast<int32_t>(
- std::round(static_cast<double>(numBits) / n * std::log(2.0))));
- }
-
- int32_t optimalNumOfBits(uint64_t expectedEntries, double fpp) {
- double n = static_cast<double>(expectedEntries);
- return static_cast<int32_t>(-n * std::log(fpp) / (std::log(2.0) * std::log(2.0)));
- }
-
- // We use the trick mentioned in "Less Hashing, Same Performance:
- // Building a Better Bloom Filter" by Kirsch et.al. From abstract
- // 'only two hash functions are necessary to effectively implement
- // a Bloom filter without any loss in the asymptotic false positive
- // probability'
- // Lets split up 64-bit hashcode into two 32-bit hash codes and employ
- // the technique mentioned in the above paper
- inline uint64_t getBytesHash(const char * data, int64_t length) {
- if (data == nullptr) {
- return Murmur3::NULL_HASHCODE;
- }
-
- return Murmur3::hash64(reinterpret_cast<const uint8_t *>(data),
- static_cast<uint32_t>(length));
- }
-
- /**
- * Implementation of BloomFilter
- */
- BloomFilterImpl::BloomFilterImpl(uint64_t expectedEntries, double fpp) {
- checkArgument(expectedEntries > 0,
- "expectedEntries should be > 0");
- checkArgument(fpp > 0.0 && fpp < 1.0,
- "False positive probability should be > 0.0 & < 1.0");
-
- uint64_t nb = static_cast<uint64_t>(optimalNumOfBits(expectedEntries, fpp));
- // make 'mNumBits' multiple of 64
- mNumBits = nb + (BITS_OF_LONG - (nb % BITS_OF_LONG));
- mNumHashFunctions = optimalNumOfHashFunctions(expectedEntries, mNumBits);
- mBitSet.reset(new BitSet(mNumBits));
- }
-
- void BloomFilterImpl::addBytes(const char * data, int64_t length) {
- uint64_t hash64 = getBytesHash(data, length);
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BloomFilter.hh"
+#include "Murmur3.hh"
+
+namespace orc {
+
+ constexpr uint64_t BITS_OF_LONG = 64;
+ constexpr uint8_t SHIFT_6_BITS = 6;
+ constexpr uint8_t SHIFT_3_BITS = 3;
+
+ static bool isLittleEndian() {
+ static union { uint32_t i; char c[4]; } num = { 0x01020304 };
+ return num.c[0] == 4;
+ }
+
+ /**
+ * Implementation of BitSet
+ */
+ BitSet::BitSet(uint64_t numBits) {
+ mData.resize(static_cast<size_t>(ceil(
+ static_cast<double>(numBits) / BITS_OF_LONG)), 0);
+ }
+
+ BitSet::BitSet(const uint64_t * bits, uint64_t numBits) {
+ // caller should make sure numBits is multiple of 64
+ mData.resize(numBits >> SHIFT_6_BITS, 0);
+ memcpy(mData.data(), bits, numBits >> SHIFT_3_BITS);
+ }
+
+ void BitSet::set(uint64_t index) {
+ mData[index >> SHIFT_6_BITS] |= (1ULL << (index % BITS_OF_LONG));
+ }
+
+ bool BitSet::get(uint64_t index) {
+ return (mData[index >> SHIFT_6_BITS] & (1ULL << (index % BITS_OF_LONG))) != 0;
+ }
+
+ uint64_t BitSet::bitSize() {
+ return mData.size() << SHIFT_6_BITS;
+ }
+
+ void BitSet::merge(const BitSet& other) {
+ if (mData.size() != other.mData.size()) {
+ std::stringstream ss;
+ ss << "BitSet must be of equal length ("
+ << mData.size() << " != " << other.mData.size() << ")";
+ throw std::logic_error(ss.str());
+ }
+
+ for (size_t i = 0; i != mData.size(); i++) {
+ mData[i] |= other.mData[i];
+ }
+ }
+
+ void BitSet::clear() {
+ memset(mData.data(), 0, sizeof(uint64_t) * mData.size());
+ }
+
+ const uint64_t * BitSet::getData() const {
+ return mData.data();
+ }
+
+ bool BitSet::operator==(const BitSet& other) const {
+ return mData == other.mData;
+ }
+
+ /**
+ * Helper functions
+ */
+ void checkArgument(bool expression, const std::string& message) {
+ if (!expression) {
+ throw std::logic_error(message);
+ }
+ }
+
+ int32_t optimalNumOfHashFunctions(uint64_t expectedEntries, uint64_t numBits) {
+ double n = static_cast<double>(expectedEntries);
+ return std::max<int32_t>(1, static_cast<int32_t>(
+ std::round(static_cast<double>(numBits) / n * std::log(2.0))));
+ }
+
+ int32_t optimalNumOfBits(uint64_t expectedEntries, double fpp) {
+ double n = static_cast<double>(expectedEntries);
+ return static_cast<int32_t>(-n * std::log(fpp) / (std::log(2.0) * std::log(2.0)));
+ }
+
+ // We use the trick mentioned in "Less Hashing, Same Performance:
+ // Building a Better Bloom Filter" by Kirsch et.al. From abstract
+ // 'only two hash functions are necessary to effectively implement
+ // a Bloom filter without any loss in the asymptotic false positive
+ // probability'
+ // Lets split up 64-bit hashcode into two 32-bit hash codes and employ
+ // the technique mentioned in the above paper
+ inline uint64_t getBytesHash(const char * data, int64_t length) {
+ if (data == nullptr) {
+ return Murmur3::NULL_HASHCODE;
+ }
+
+ return Murmur3::hash64(reinterpret_cast<const uint8_t *>(data),
+ static_cast<uint32_t>(length));
+ }
+
+ /**
+ * Implementation of BloomFilter
+ */
+ BloomFilterImpl::BloomFilterImpl(uint64_t expectedEntries, double fpp) {
+ checkArgument(expectedEntries > 0,
+ "expectedEntries should be > 0");
+ checkArgument(fpp > 0.0 && fpp < 1.0,
+ "False positive probability should be > 0.0 & < 1.0");
+
+ uint64_t nb = static_cast<uint64_t>(optimalNumOfBits(expectedEntries, fpp));
+ // make 'mNumBits' multiple of 64
+ mNumBits = nb + (BITS_OF_LONG - (nb % BITS_OF_LONG));
+ mNumHashFunctions = optimalNumOfHashFunctions(expectedEntries, mNumBits);
+ mBitSet.reset(new BitSet(mNumBits));
+ }
+
+ void BloomFilterImpl::addBytes(const char * data, int64_t length) {
+ uint64_t hash64 = getBytesHash(data, length);
addHash(static_cast<int64_t>(hash64));
- }
-
- void BloomFilterImpl::addLong(int64_t data) {
+ }
+
+ void BloomFilterImpl::addLong(int64_t data) {
addHash(getLongHash(data));
- }
-
- bool BloomFilterImpl::testBytes(const char * data, int64_t length) const {
- uint64_t hash64 = getBytesHash(data, length);
+ }
+
+ bool BloomFilterImpl::testBytes(const char * data, int64_t length) const {
+ uint64_t hash64 = getBytesHash(data, length);
return testHash(static_cast<int64_t>(hash64));
- }
-
- bool BloomFilterImpl::testLong(int64_t data) const {
+ }
+
+ bool BloomFilterImpl::testLong(int64_t data) const {
return testHash(getLongHash(data));
- }
-
- uint64_t BloomFilterImpl::sizeInBytes() const {
- return getBitSize() >> SHIFT_3_BITS;
- }
-
- uint64_t BloomFilterImpl::getBitSize() const {
- return mBitSet->bitSize();
- }
-
- int32_t BloomFilterImpl::getNumHashFunctions() const {
- return mNumHashFunctions;
- }
-
- DIAGNOSTIC_PUSH
-
-#if defined(__clang__)
- DIAGNOSTIC_IGNORE("-Wundefined-reinterpret-cast")
-#endif
-
-#if defined(__GNUC__)
- DIAGNOSTIC_IGNORE("-Wstrict-aliasing")
-#endif
-
- // caller should make sure input proto::BloomFilter is valid since
- // no check will be performed in the following constructor
- BloomFilterImpl::BloomFilterImpl(const proto::BloomFilter& bloomFilter) {
- mNumHashFunctions = static_cast<int32_t>(bloomFilter.numhashfunctions());
-
- const std::string& bitsetStr = bloomFilter.utf8bitset();
- mNumBits = bitsetStr.size() << SHIFT_3_BITS;
- checkArgument(mNumBits % BITS_OF_LONG == 0, "numBits should be multiple of 64!");
-
- const uint64_t * bitset = reinterpret_cast<const uint64_t *>(bitsetStr.data());
- if (isLittleEndian()) {
- mBitSet.reset(new BitSet(bitset, mNumBits));
- } else {
- std::vector<uint64_t> longs(mNumBits >> SHIFT_6_BITS);
- for (size_t i = 0; i != longs.size(); ++i) {
- // convert little-endian to big-endian
- const uint64_t src = bitset[i];
- uint64_t& dst = longs[i];
- for (size_t bit = 0; bit != 64; bit += 8) {
- dst |= (((src & (0xFFu << bit)) >> bit) << (56 - bit));
- }
- }
-
- mBitSet.reset(new BitSet(longs.data(), mNumBits));
- }
- }
-
- void BloomFilterImpl::addDouble(double data) {
- addLong(reinterpret_cast<int64_t&>(data));
- }
-
- bool BloomFilterImpl::testDouble(double data) const{
- return testLong(reinterpret_cast<int64_t&>(data));
- }
-
- DIAGNOSTIC_POP
-
+ }
+
+ uint64_t BloomFilterImpl::sizeInBytes() const {
+ return getBitSize() >> SHIFT_3_BITS;
+ }
+
+ uint64_t BloomFilterImpl::getBitSize() const {
+ return mBitSet->bitSize();
+ }
+
+ int32_t BloomFilterImpl::getNumHashFunctions() const {
+ return mNumHashFunctions;
+ }
+
+ DIAGNOSTIC_PUSH
+
+#if defined(__clang__)
+ DIAGNOSTIC_IGNORE("-Wundefined-reinterpret-cast")
+#endif
+
+#if defined(__GNUC__)
+ DIAGNOSTIC_IGNORE("-Wstrict-aliasing")
+#endif
+
+ // caller should make sure input proto::BloomFilter is valid since
+ // no check will be performed in the following constructor
+ BloomFilterImpl::BloomFilterImpl(const proto::BloomFilter& bloomFilter) {
+ mNumHashFunctions = static_cast<int32_t>(bloomFilter.numhashfunctions());
+
+ const std::string& bitsetStr = bloomFilter.utf8bitset();
+ mNumBits = bitsetStr.size() << SHIFT_3_BITS;
+ checkArgument(mNumBits % BITS_OF_LONG == 0, "numBits should be multiple of 64!");
+
+ const uint64_t * bitset = reinterpret_cast<const uint64_t *>(bitsetStr.data());
+ if (isLittleEndian()) {
+ mBitSet.reset(new BitSet(bitset, mNumBits));
+ } else {
+ std::vector<uint64_t> longs(mNumBits >> SHIFT_6_BITS);
+ for (size_t i = 0; i != longs.size(); ++i) {
+ // convert little-endian to big-endian
+ const uint64_t src = bitset[i];
+ uint64_t& dst = longs[i];
+ for (size_t bit = 0; bit != 64; bit += 8) {
+ dst |= (((src & (0xFFu << bit)) >> bit) << (56 - bit));
+ }
+ }
+
+ mBitSet.reset(new BitSet(longs.data(), mNumBits));
+ }
+ }
+
+ void BloomFilterImpl::addDouble(double data) {
+ addLong(reinterpret_cast<int64_t&>(data));
+ }
+
+ bool BloomFilterImpl::testDouble(double data) const{
+ return testLong(reinterpret_cast<int64_t&>(data));
+ }
+
+ DIAGNOSTIC_POP
+
void BloomFilterImpl::addHash(int64_t hash64) {
- int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
+ int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
// In Java codes, we use "hash64 >>> 32" which is an unsigned shift op.
// So we cast hash64 to uint64_t here for an unsigned right shift.
int32_t hash2 = static_cast<int32_t>(static_cast<uint64_t>(hash64) >> 32);
-
- for (int32_t i = 1; i <= mNumHashFunctions; ++i) {
- int32_t combinedHash = hash1 + i * hash2;
- // hashcode should be positive, flip all the bits if it's negative
- if (combinedHash < 0) {
- combinedHash = ~combinedHash;
- }
- uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits;
- mBitSet->set(pos);
- }
- }
-
+
+ for (int32_t i = 1; i <= mNumHashFunctions; ++i) {
+ int32_t combinedHash = hash1 + i * hash2;
+ // hashcode should be positive, flip all the bits if it's negative
+ if (combinedHash < 0) {
+ combinedHash = ~combinedHash;
+ }
+ uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits;
+ mBitSet->set(pos);
+ }
+ }
+
bool BloomFilterImpl::testHash(int64_t hash64) const{
- int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
+ int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
// In Java codes, we use "hash64 >>> 32" which is an unsigned shift op.
// So we cast hash64 to uint64_t here for an unsigned right shift.
int32_t hash2 = static_cast<int32_t>(static_cast<uint64_t>(hash64) >> 32);
-
- for (int32_t i = 1; i <= mNumHashFunctions; ++i) {
- int32_t combinedHash = hash1 + i * hash2;
- // hashcode should be positive, flip all the bits if it's negative
- if (combinedHash < 0) {
- combinedHash = ~combinedHash;
- }
- uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits;
- if (!mBitSet->get(pos)) {
- return false;
- }
- }
- return true;
- }
-
- void BloomFilterImpl::merge(const BloomFilterImpl& other) {
- if (mNumBits != other.mNumBits || mNumHashFunctions != other.mNumHashFunctions) {
- std::stringstream ss;
- ss << "BloomFilters are not compatible for merging: "
- << "this: numBits:" << mNumBits
- << ",numHashFunctions:" << mNumHashFunctions
- << ", that: numBits:" << other.mNumBits
- << ",numHashFunctions:" << other.mNumHashFunctions;
- throw std::logic_error(ss.str());
- }
-
- mBitSet->merge(*other.mBitSet);
- }
-
- void BloomFilterImpl::reset() {
- mBitSet->clear();
- }
-
- void BloomFilterImpl::serialize(proto::BloomFilter& bloomFilter) const {
- bloomFilter.set_numhashfunctions(static_cast<uint32_t>(mNumHashFunctions));
-
- // According to ORC standard, the encoding is a sequence of bytes with
- // a little endian encoding in the utf8bitset field.
- if (isLittleEndian()) {
- // bytes are already organized in little endian; thus no conversion needed
- const char * bitset = reinterpret_cast<const char *>(mBitSet->getData());
- bloomFilter.set_utf8bitset(bitset, sizeInBytes());
- } else {
- std::vector<uint64_t> bitset(sizeInBytes() / sizeof(uint64_t), 0);
- const uint64_t * longs = mBitSet->getData();
- for (size_t i = 0; i != bitset.size(); ++i) {
- uint64_t& dst = bitset[i];
- const uint64_t src = longs[i];
- // convert big-endian to little-endian
- for (size_t bit = 0; bit != 64; bit += 8) {
- dst |= (((src & (0xFFu << bit)) >> bit) << (56 - bit));
- }
- }
- bloomFilter.set_utf8bitset(bitset.data(), sizeInBytes());
- }
- }
-
- bool BloomFilterImpl::operator==(const BloomFilterImpl& other) const {
- return mNumBits == other.mNumBits &&
- mNumHashFunctions == other.mNumHashFunctions &&
- *mBitSet == *other.mBitSet;
- }
-
- BloomFilter::~BloomFilter() {
- // PASS
- }
-
- std::unique_ptr<BloomFilter> BloomFilterUTF8Utils::deserialize(
- const proto::Stream_Kind& streamKind,
- const proto::ColumnEncoding& encoding,
- const proto::BloomFilter& bloomFilter) {
-
- std::unique_ptr<BloomFilter> ret(nullptr);
-
- // only BLOOM_FILTER_UTF8 is supported
- if (streamKind != proto::Stream_Kind_BLOOM_FILTER_UTF8) {
- return ret;
- }
-
- // make sure we don't use unknown encodings or original timestamp encodings
- if (!encoding.has_bloomencoding() || encoding.bloomencoding() != 1) {
- return ret;
- }
-
- // make sure all required fields exist
- if (!bloomFilter.has_numhashfunctions() || !bloomFilter.has_utf8bitset()) {
- return ret;
- }
-
- ret.reset(new BloomFilterImpl(bloomFilter));
- return ret;
- }
-
-}
+
+ for (int32_t i = 1; i <= mNumHashFunctions; ++i) {
+ int32_t combinedHash = hash1 + i * hash2;
+ // hashcode should be positive, flip all the bits if it's negative
+ if (combinedHash < 0) {
+ combinedHash = ~combinedHash;
+ }
+ uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits;
+ if (!mBitSet->get(pos)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ void BloomFilterImpl::merge(const BloomFilterImpl& other) {
+ if (mNumBits != other.mNumBits || mNumHashFunctions != other.mNumHashFunctions) {
+ std::stringstream ss;
+ ss << "BloomFilters are not compatible for merging: "
+ << "this: numBits:" << mNumBits
+ << ",numHashFunctions:" << mNumHashFunctions
+ << ", that: numBits:" << other.mNumBits
+ << ",numHashFunctions:" << other.mNumHashFunctions;
+ throw std::logic_error(ss.str());
+ }
+
+ mBitSet->merge(*other.mBitSet);
+ }
+
+ void BloomFilterImpl::reset() {
+ mBitSet->clear();
+ }
+
+ void BloomFilterImpl::serialize(proto::BloomFilter& bloomFilter) const {
+ bloomFilter.set_numhashfunctions(static_cast<uint32_t>(mNumHashFunctions));
+
+ // According to ORC standard, the encoding is a sequence of bytes with
+ // a little endian encoding in the utf8bitset field.
+ if (isLittleEndian()) {
+ // bytes are already organized in little endian; thus no conversion needed
+ const char * bitset = reinterpret_cast<const char *>(mBitSet->getData());
+ bloomFilter.set_utf8bitset(bitset, sizeInBytes());
+ } else {
+ std::vector<uint64_t> bitset(sizeInBytes() / sizeof(uint64_t), 0);
+ const uint64_t * longs = mBitSet->getData();
+ for (size_t i = 0; i != bitset.size(); ++i) {
+ uint64_t& dst = bitset[i];
+ const uint64_t src = longs[i];
+ // convert big-endian to little-endian
+ for (size_t bit = 0; bit != 64; bit += 8) {
+ dst |= (((src & (0xFFu << bit)) >> bit) << (56 - bit));
+ }
+ }
+ bloomFilter.set_utf8bitset(bitset.data(), sizeInBytes());
+ }
+ }
+
+ bool BloomFilterImpl::operator==(const BloomFilterImpl& other) const {
+ return mNumBits == other.mNumBits &&
+ mNumHashFunctions == other.mNumHashFunctions &&
+ *mBitSet == *other.mBitSet;
+ }
+
+ BloomFilter::~BloomFilter() {
+ // PASS
+ }
+
+ std::unique_ptr<BloomFilter> BloomFilterUTF8Utils::deserialize(
+ const proto::Stream_Kind& streamKind,
+ const proto::ColumnEncoding& encoding,
+ const proto::BloomFilter& bloomFilter) {
+
+ std::unique_ptr<BloomFilter> ret(nullptr);
+
+ // only BLOOM_FILTER_UTF8 is supported
+ if (streamKind != proto::Stream_Kind_BLOOM_FILTER_UTF8) {
+ return ret;
+ }
+
+ // make sure we don't use unknown encodings or original timestamp encodings
+ if (!encoding.has_bloomencoding() || encoding.bloomencoding() != 1) {
+ return ret;
+ }
+
+ // make sure all required fields exist
+ if (!bloomFilter.has_numhashfunctions() || !bloomFilter.has_utf8bitset()) {
+ return ret;
+ }
+
+ ret.reset(new BloomFilterImpl(bloomFilter));
+ return ret;
+ }
+
+}
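Note (not part of the diff): addHash/testHash in BloomFilter.cc above follow the Kirsch-Mitzenmacher "two hash functions are enough" scheme, splitting one 64-bit Murmur3 hash into two 32-bit halves. A standalone sketch of that bit-position computation, mirroring the arithmetic in the diff (including its use of signed int32 combination); the function name and parameters are illustrative only.

// Sketch of the double-hashing position derivation used by addHash/testHash.
#include <cstdint>
#include <vector>

std::vector<uint64_t> bloomPositions(int64_t hash64,
                                     int32_t numHashFunctions,
                                     uint64_t numBits) {
  int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
  // unsigned right shift, matching Java's "hash64 >>> 32"
  int32_t hash2 = static_cast<int32_t>(static_cast<uint64_t>(hash64) >> 32);
  std::vector<uint64_t> positions;
  for (int32_t i = 1; i <= numHashFunctions; ++i) {
    int32_t combined = hash1 + i * hash2;
    if (combined < 0) {
      combined = ~combined;  // keep the derived index non-negative
    }
    positions.push_back(static_cast<uint64_t>(combined) % numBits);
  }
  return positions;
}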
diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.hh b/contrib/libs/apache/orc/c++/src/BloomFilter.hh
index cf18a46fd9..ab2006bdae 100644
--- a/contrib/libs/apache/orc/c++/src/BloomFilter.hh
+++ b/contrib/libs/apache/orc/c++/src/BloomFilter.hh
@@ -1,197 +1,197 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ORC_BLOOMFILTER_IMPL_HH
-#define ORC_BLOOMFILTER_IMPL_HH
-
-#include "orc/BloomFilter.hh"
-#include "wrap/orc-proto-wrapper.hh"
-
-#include <cmath>
-#include <sstream>
-#include <vector>
-
-namespace orc {
-
- /**
- * Bare metal bit set implementation. For performance reasons, this implementation does not check
- * for index bounds nor expand the bit set size if the specified index is greater than the size.
- */
- class BitSet {
- public:
- /**
- * Creates an empty BitSet
- *
- * @param numBits - number of bits used
- */
- BitSet(uint64_t numBits);
-
- /**
- * Creates BitSet from serialized uint64_t buffer
- *
- * @param bits - serialized uint64_t buffer of bitset
- * @param numBits - number of bits used
- */
- BitSet(const uint64_t * bits, uint64_t numBits);
-
- /**
- * Sets the bit at specified index.
- *
- * @param index - position
- */
- void set(uint64_t index);
-
- /**
- * Returns true if the bit is set in the specified index.
- *
- * @param index - position
- * @return - value at the bit position
- */
- bool get(uint64_t index);
-
- /**
- * Number of bits
- */
- uint64_t bitSize();
-
- /**
- * Combines the two BitSets using bitwise OR.
- */
- void merge(const BitSet& other);
-
- /**
- * Clears the bit set.
- */
- void clear();
-
- /**
- * Gets underlying raw data
- */
- const uint64_t * getData() const;
-
- /**
- * Compares two BitSets
- */
- bool operator==(const BitSet& other) const;
-
- private:
- std::vector<uint64_t> mData;
- };
-
- /**
- * BloomFilter is a probabilistic data structure for set membership check.
- * BloomFilters are highly space efficient when compared to using a HashSet.
- * Because of the probabilistic nature of bloom filter false positive (element
- * not present in bloom filter but test() says true) are possible but false
- * negatives are not possible (if element is present then test() will never
- * say false). The false positive probability is configurable (default: 5%)
- * depending on which storage requirement may increase or decrease. Lower the
- * false positive probability greater is the space requirement.
- *
- * Bloom filters are sensitive to number of elements that will be inserted in
- * the bloom filter. During the creation of bloom filter expected number of
- * entries must be specified. If the number of insertions exceed the specified
- * initial number of entries then false positive probability will increase
- * accordingly.
- *
- * Internally, this implementation of bloom filter uses Murmur3 fast
- * non-cryptographic hash algorithm. Although Murmur2 is slightly faster than
- * Murmur3 in Java, it suffers from hash collisions for specific sequence of
- * repeating bytes. Check the following link for more info
- * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw
- *
- * Note that this class is here for backwards compatibility, because it uses
- * the JVM default character set for strings. All new users should
- * BloomFilterUtf8, which always uses UTF8 for the encoding.
- */
- class BloomFilterImpl : public BloomFilter {
- public:
- /**
- * Creates an empty BloomFilter
- *
- * @param expectedEntries - number of entries it will hold
- * @param fpp - false positive probability
- */
- BloomFilterImpl(uint64_t expectedEntries, double fpp=DEFAULT_FPP);
-
- /**
- * Creates a BloomFilter by deserializing the proto-buf version
- *
- * caller should make sure input proto::BloomFilter is valid
- */
- BloomFilterImpl(const proto::BloomFilter& bloomFilter);
-
- /**
- * Adds a new element to the BloomFilter
- */
- void addBytes(const char * data, int64_t length);
- void addLong(int64_t data);
- void addDouble(double data);
-
- /**
- * Test if the element exists in BloomFilter
- */
- bool testBytes(const char * data, int64_t length) const override;
- bool testLong(int64_t data) const override;
- bool testDouble(double data) const override;
-
- uint64_t sizeInBytes() const;
- uint64_t getBitSize() const;
- int32_t getNumHashFunctions() const;
-
- void merge(const BloomFilterImpl& other);
-
- void reset();
-
- bool operator==(const BloomFilterImpl& other) const;
-
- private:
- friend struct BloomFilterUTF8Utils;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_BLOOMFILTER_IMPL_HH
+#define ORC_BLOOMFILTER_IMPL_HH
+
+#include "orc/BloomFilter.hh"
+#include "wrap/orc-proto-wrapper.hh"
+
+#include <cmath>
+#include <sstream>
+#include <vector>
+
+namespace orc {
+
+ /**
+ * Bare metal bit set implementation. For performance reasons, this implementation does not check
+ * for index bounds nor expand the bit set size if the specified index is greater than the size.
+ */
+ class BitSet {
+ public:
+ /**
+ * Creates an empty BitSet
+ *
+ * @param numBits - number of bits used
+ */
+ BitSet(uint64_t numBits);
+
+ /**
+ * Creates BitSet from serialized uint64_t buffer
+ *
+ * @param bits - serialized uint64_t buffer of bitset
+ * @param numBits - number of bits used
+ */
+ BitSet(const uint64_t * bits, uint64_t numBits);
+
+ /**
+ * Sets the bit at specified index.
+ *
+ * @param index - position
+ */
+ void set(uint64_t index);
+
+ /**
+ * Returns true if the bit is set in the specified index.
+ *
+ * @param index - position
+ * @return - value at the bit position
+ */
+ bool get(uint64_t index);
+
+ /**
+ * Number of bits
+ */
+ uint64_t bitSize();
+
+ /**
+ * Combines the two BitSets using bitwise OR.
+ */
+ void merge(const BitSet& other);
+
+ /**
+ * Clears the bit set.
+ */
+ void clear();
+
+ /**
+ * Gets underlying raw data
+ */
+ const uint64_t * getData() const;
+
+ /**
+ * Compares two BitSets
+ */
+ bool operator==(const BitSet& other) const;
+
+ private:
+ std::vector<uint64_t> mData;
+ };
+
+ /**
+ * BloomFilter is a probabilistic data structure for set membership check.
+ * BloomFilters are highly space efficient when compared to using a HashSet.
+ * Because of the probabilistic nature of bloom filter false positive (element
+ * not present in bloom filter but test() says true) are possible but false
+ * negatives are not possible (if element is present then test() will never
+ * say false). The false positive probability is configurable (default: 5%)
+ * depending on which storage requirement may increase or decrease. Lower the
+ * false positive probability greater is the space requirement.
+ *
+ * Bloom filters are sensitive to number of elements that will be inserted in
+ * the bloom filter. During the creation of bloom filter expected number of
+ * entries must be specified. If the number of insertions exceed the specified
+ * initial number of entries then false positive probability will increase
+ * accordingly.
+ *
+ * Internally, this implementation of bloom filter uses Murmur3 fast
+ * non-cryptographic hash algorithm. Although Murmur2 is slightly faster than
+ * Murmur3 in Java, it suffers from hash collisions for specific sequence of
+ * repeating bytes. Check the following link for more info
+ * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw
+ *
+ * Note that this class is here for backwards compatibility, because it uses
+ * the JVM default character set for strings. All new users should
+ * BloomFilterUtf8, which always uses UTF8 for the encoding.
+ */
+ class BloomFilterImpl : public BloomFilter {
+ public:
+ /**
+ * Creates an empty BloomFilter
+ *
+ * @param expectedEntries - number of entries it will hold
+ * @param fpp - false positive probability
+ */
+ BloomFilterImpl(uint64_t expectedEntries, double fpp=DEFAULT_FPP);
+
+ /**
+ * Creates a BloomFilter by deserializing the proto-buf version
+ *
+ * caller should make sure input proto::BloomFilter is valid
+ */
+ BloomFilterImpl(const proto::BloomFilter& bloomFilter);
+
+ /**
+ * Adds a new element to the BloomFilter
+ */
+ void addBytes(const char * data, int64_t length);
+ void addLong(int64_t data);
+ void addDouble(double data);
+
+ /**
+ * Test if the element exists in BloomFilter
+ */
+ bool testBytes(const char * data, int64_t length) const override;
+ bool testLong(int64_t data) const override;
+ bool testDouble(double data) const override;
+
+ uint64_t sizeInBytes() const;
+ uint64_t getBitSize() const;
+ int32_t getNumHashFunctions() const;
+
+ void merge(const BloomFilterImpl& other);
+
+ void reset();
+
+ bool operator==(const BloomFilterImpl& other) const;
+
+ private:
+ friend struct BloomFilterUTF8Utils;
friend class TestBloomFilter_testBloomFilterBasicOperations_Test;
-
- // compute k hash values from hash64 and set bits
+
+ // compute k hash values from hash64 and set bits
void addHash(int64_t hash64);
-
- // compute k hash values from hash64 and check bits
+
+ // compute k hash values from hash64 and check bits
bool testHash(int64_t hash64) const;
-
- void serialize(proto::BloomFilter& bloomFilter) const;
-
- private:
- static constexpr double DEFAULT_FPP = 0.05;
- uint64_t mNumBits;
- int32_t mNumHashFunctions;
- std::unique_ptr<BitSet> mBitSet;
- };
-
- struct BloomFilterUTF8Utils {
- // serialize BloomFilter in protobuf
- static void serialize(const BloomFilterImpl& in, proto::BloomFilter& out) {
- in.serialize(out);
- }
-
- // deserialize BloomFilter from protobuf
- static std::unique_ptr<BloomFilter>
- deserialize(const proto::Stream_Kind& streamKind,
- const proto::ColumnEncoding& columnEncoding,
- const proto::BloomFilter& bloomFilter);
- };
-
+
+ void serialize(proto::BloomFilter& bloomFilter) const;
+
+ private:
+ static constexpr double DEFAULT_FPP = 0.05;
+ uint64_t mNumBits;
+ int32_t mNumHashFunctions;
+ std::unique_ptr<BitSet> mBitSet;
+ };
+
+ struct BloomFilterUTF8Utils {
+ // serialize BloomFilter in protobuf
+ static void serialize(const BloomFilterImpl& in, proto::BloomFilter& out) {
+ in.serialize(out);
+ }
+
+ // deserialize BloomFilter from protobuf
+ static std::unique_ptr<BloomFilter>
+ deserialize(const proto::Stream_Kind& streamKind,
+ const proto::ColumnEncoding& columnEncoding,
+ const proto::BloomFilter& bloomFilter);
+ };
+
// Thomas Wang's integer hash function
// http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
// Put this in header file so tests can use it as well.
@@ -205,6 +205,6 @@ namespace orc {
key = key + (key << 31);
return key;
}
-}
-
-#endif //ORC_BLOOMFILTER_IMPL_HH
+}
+
+#endif //ORC_BLOOMFILTER_IMPL_HH
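Note (not part of the diff): a hypothetical usage sketch of the BloomFilterImpl interface declared in BloomFilter.hh above, assuming the orc headers are on the include path; the entry count and fpp are illustrative, and the key point is that testLong never yields a false negative while false positives remain possible.

// Sketch: building and probing a BloomFilterImpl.
#include "BloomFilter.hh"

void bloomFilterExample() {
  orc::BloomFilterImpl filter(/*expectedEntries=*/10000, /*fpp=*/0.05);
  filter.addLong(42);
  filter.addBytes("orc", 3);
  bool present = filter.testLong(42);     // always true: no false negatives
  bool absent  = filter.testLong(12345);  // usually false; may be a false positive
  (void)present;
  (void)absent;
}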
diff --git a/contrib/libs/apache/orc/c++/src/ByteRLE.cc b/contrib/libs/apache/orc/c++/src/ByteRLE.cc
index ee1a4575dc..30f5148b7c 100644
--- a/contrib/libs/apache/orc/c++/src/ByteRLE.cc
+++ b/contrib/libs/apache/orc/c++/src/ByteRLE.cc
@@ -1,626 +1,626 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <algorithm>
-#include <iostream>
-#include <string.h>
-#include <utility>
-
-#include "ByteRLE.hh"
-#include "orc/Exceptions.hh"
-
-namespace orc {
-
- const int MINIMUM_REPEAT = 3;
- const int MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT;
- const int MAX_LITERAL_SIZE = 128;
-
- ByteRleEncoder::~ByteRleEncoder() {
- // PASS
- }
-
- class ByteRleEncoderImpl : public ByteRleEncoder {
- public:
- ByteRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output);
- virtual ~ByteRleEncoderImpl() override;
-
- /**
- * Encode the next batch of values.
- * @param data to be encoded
- * @param numValues the number of values to be encoded
- * @param notNull If the pointer is null, all values are read. If the
- * pointer is not null, positions that are false are skipped.
- */
- virtual void add(const char* data, uint64_t numValues,
- const char* notNull) override;
-
- /**
- * Get size of buffer used so far.
- */
- virtual uint64_t getBufferSize() const override;
-
- /**
- * Flush underlying BufferedOutputStream.
- */
- virtual uint64_t flush() override;
-
- virtual void recordPosition(PositionRecorder* recorder) const override;
-
- protected:
- std::unique_ptr<BufferedOutputStream> outputStream;
- char* literals;
- int numLiterals;
- bool repeat;
- int tailRunLength;
- int bufferPosition;
- int bufferLength;
- char* buffer;
-
- void writeByte(char c);
- void writeValues();
- void write(char c);
- };
-
- ByteRleEncoderImpl::ByteRleEncoderImpl(
- std::unique_ptr<BufferedOutputStream> output)
- : outputStream(std::move(output)) {
- literals = new char[MAX_LITERAL_SIZE];
- numLiterals = 0;
- tailRunLength = 0;
- repeat = false;
- bufferPosition = 0;
- bufferLength = 0;
- buffer = nullptr;
- }
-
- ByteRleEncoderImpl::~ByteRleEncoderImpl() {
- // PASS
- delete [] literals;
- }
-
- void ByteRleEncoderImpl::writeByte(char c) {
- if (bufferPosition == bufferLength) {
- int addedSize = 0;
- if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) {
- throw std::bad_alloc();
- }
- bufferPosition = 0;
- bufferLength = addedSize;
- }
- buffer[bufferPosition++] = c;
- }
-
- void ByteRleEncoderImpl::add(
- const char* data,
- uint64_t numValues,
- const char* notNull) {
- for (uint64_t i = 0; i < numValues; ++i) {
- if (!notNull || notNull[i]) {
- write(data[i]);
- }
- }
- }
-
- void ByteRleEncoderImpl::writeValues() {
- if (numLiterals != 0) {
- if (repeat) {
- writeByte(
- static_cast<char>(numLiterals - static_cast<int>(MINIMUM_REPEAT)));
- writeByte(literals[0]);
- } else {
- writeByte(static_cast<char>(-numLiterals));
- for (int i = 0; i < numLiterals; ++i) {
- writeByte(literals[i]);
- }
- }
- repeat = false;
- tailRunLength = 0;
- numLiterals = 0;
- }
- }
-
- uint64_t ByteRleEncoderImpl::flush() {
- writeValues();
- outputStream->BackUp(bufferLength - bufferPosition);
- uint64_t dataSize = outputStream->flush();
- bufferLength = bufferPosition = 0;
- return dataSize;
- }
-
- void ByteRleEncoderImpl::write(char value) {
- if (numLiterals == 0) {
- literals[numLiterals++] = value;
- tailRunLength = 1;
- } else if (repeat) {
- if (value == literals[0]) {
- numLiterals += 1;
- if (numLiterals == MAXIMUM_REPEAT) {
- writeValues();
- }
- } else {
- writeValues();
- literals[numLiterals++] = value;
- tailRunLength = 1;
- }
- } else {
- if (value == literals[numLiterals - 1]) {
- tailRunLength += 1;
- } else {
- tailRunLength = 1;
- }
- if (tailRunLength == MINIMUM_REPEAT) {
- if (numLiterals + 1 == MINIMUM_REPEAT) {
- repeat = true;
- numLiterals += 1;
- } else {
- numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1);
- writeValues();
- literals[0] = value;
- repeat = true;
- numLiterals = MINIMUM_REPEAT;
- }
- } else {
- literals[numLiterals++] = value;
- if (numLiterals == MAX_LITERAL_SIZE) {
- writeValues();
- }
- }
- }
- }
-
- uint64_t ByteRleEncoderImpl::getBufferSize() const {
- return outputStream->getSize();
- }
-
- void ByteRleEncoderImpl::recordPosition(PositionRecorder *recorder) const {
- uint64_t flushedSize = outputStream->getSize();
- uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition);
- if (outputStream->isCompressed()) {
- // start of the compression chunk in the stream
- recorder->add(flushedSize);
- // number of decompressed bytes that need to be consumed
- recorder->add(unflushedSize);
- } else {
- flushedSize -= static_cast<uint64_t>(bufferLength);
- // byte offset of the RLE run’s start location
- recorder->add(flushedSize + unflushedSize);
- }
- recorder->add(static_cast<uint64_t>(numLiterals));
- }
-
- std::unique_ptr<ByteRleEncoder> createByteRleEncoder
- (std::unique_ptr<BufferedOutputStream> output) {
- return std::unique_ptr<ByteRleEncoder>(new ByteRleEncoderImpl
- (std::move(output)));
- }
-
- class BooleanRleEncoderImpl : public ByteRleEncoderImpl {
- public:
- BooleanRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output);
- virtual ~BooleanRleEncoderImpl() override;
-
- /**
- * Encode the next batch of values
- * @param data to be encoded
- * @param numValues the number of values to be encoded
- * @param notNull If the pointer is null, all values are read. If the
- * pointer is not null, positions that are false are skipped.
- */
- virtual void add(const char* data, uint64_t numValues,
- const char* notNull) override;
-
- /**
- * Flushing underlying BufferedOutputStream
- */
- virtual uint64_t flush() override;
-
- virtual void recordPosition(PositionRecorder* recorder) const override;
-
- private:
- int bitsRemained;
- char current;
-
- };
-
- BooleanRleEncoderImpl::BooleanRleEncoderImpl(
- std::unique_ptr<BufferedOutputStream> output)
- : ByteRleEncoderImpl(std::move(output)) {
- bitsRemained = 8;
- current = static_cast<char>(0);
- }
-
- BooleanRleEncoderImpl::~BooleanRleEncoderImpl() {
- // PASS
- }
-
- void BooleanRleEncoderImpl::add(
- const char* data,
- uint64_t numValues,
- const char* notNull) {
- for (uint64_t i = 0; i < numValues; ++i) {
- if (bitsRemained == 0) {
- write(current);
- current = static_cast<char>(0);
- bitsRemained = 8;
- }
- if (!notNull || notNull[i]) {
- if (!data || data[i]) {
- current =
- static_cast<char>(current | (0x80 >> (8 - bitsRemained)));
- }
- --bitsRemained;
- }
- }
- if (bitsRemained == 0) {
- write(current);
- current = static_cast<char>(0);
- bitsRemained = 8;
- }
- }
-
- uint64_t BooleanRleEncoderImpl::flush() {
- if (bitsRemained != 8) {
- write(current);
- }
- bitsRemained = 8;
- current = static_cast<char>(0);
- return ByteRleEncoderImpl::flush();
- }
-
- void BooleanRleEncoderImpl::recordPosition(PositionRecorder* recorder) const {
- ByteRleEncoderImpl::recordPosition(recorder);
- recorder->add(static_cast<uint64_t>(8 - bitsRemained));
- }
-
- std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder
- (std::unique_ptr<BufferedOutputStream> output) {
- BooleanRleEncoderImpl* encoder =
- new BooleanRleEncoderImpl(std::move(output)) ;
- return std::unique_ptr<ByteRleEncoder>(
- reinterpret_cast<ByteRleEncoder*>(encoder));
- }
-
- ByteRleDecoder::~ByteRleDecoder() {
- // PASS
- }
-
- class ByteRleDecoderImpl: public ByteRleDecoder {
- public:
- ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input);
-
- virtual ~ByteRleDecoderImpl();
-
- /**
- * Seek to a particular spot.
- */
- virtual void seek(PositionProvider&);
-
- /**
- * Seek over a given number of values.
- */
- virtual void skip(uint64_t numValues);
-
- /**
- * Read a number of values into the batch.
- */
- virtual void next(char* data, uint64_t numValues, char* notNull);
-
- protected:
- inline void nextBuffer();
- inline signed char readByte();
- inline void readHeader();
-
- std::unique_ptr<SeekableInputStream> inputStream;
- size_t remainingValues;
- char value;
- const char* bufferStart;
- const char* bufferEnd;
- bool repeating;
- };
-
- void ByteRleDecoderImpl::nextBuffer() {
- int bufferLength;
- const void* bufferPointer;
- bool result = inputStream->Next(&bufferPointer, &bufferLength);
- if (!result) {
- throw ParseError("bad read in nextBuffer");
- }
- bufferStart = static_cast<const char*>(bufferPointer);
- bufferEnd = bufferStart + bufferLength;
- }
-
- signed char ByteRleDecoderImpl::readByte() {
- if (bufferStart == bufferEnd) {
- nextBuffer();
- }
- return *(bufferStart++);
- }
-
- void ByteRleDecoderImpl::readHeader() {
- signed char ch = readByte();
- if (ch < 0) {
- remainingValues = static_cast<size_t>(-ch);
- repeating = false;
- } else {
- remainingValues = static_cast<size_t>(ch) + MINIMUM_REPEAT;
- repeating = true;
- value = readByte();
- }
- }
-
- ByteRleDecoderImpl::ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream>
- input) {
- inputStream = std::move(input);
- repeating = false;
- remainingValues = 0;
- value = 0;
- bufferStart = nullptr;
- bufferEnd = nullptr;
- }
-
- ByteRleDecoderImpl::~ByteRleDecoderImpl() {
- // PASS
- }
-
- void ByteRleDecoderImpl::seek(PositionProvider& location) {
- // move the input stream
- inputStream->seek(location);
- // force a re-read from the stream
- bufferEnd = bufferStart;
- // read a new header
- readHeader();
- // skip ahead the given number of records
- ByteRleDecoderImpl::skip(location.next());
- }
-
- void ByteRleDecoderImpl::skip(uint64_t numValues) {
- while (numValues > 0) {
- if (remainingValues == 0) {
- readHeader();
- }
- size_t count = std::min(static_cast<size_t>(numValues), remainingValues);
- remainingValues -= count;
- numValues -= count;
- // for literals we need to skip over count bytes, which may involve
- // reading from the underlying stream
- if (!repeating) {
- size_t consumedBytes = count;
- while (consumedBytes > 0) {
- if (bufferStart == bufferEnd) {
- nextBuffer();
- }
- size_t skipSize = std::min(static_cast<size_t>(consumedBytes),
- static_cast<size_t>(bufferEnd -
- bufferStart));
- bufferStart += skipSize;
- consumedBytes -= skipSize;
- }
- }
- }
- }
-
- void ByteRleDecoderImpl::next(char* data, uint64_t numValues,
- char* notNull) {
- uint64_t position = 0;
- // skip over null values
- while (notNull && position < numValues && !notNull[position]) {
- position += 1;
- }
- while (position < numValues) {
- // if we are out of values, read more
- if (remainingValues == 0) {
- readHeader();
- }
- // how many do we read out of this block?
- size_t count = std::min(static_cast<size_t>(numValues - position),
- remainingValues);
- uint64_t consumed = 0;
- if (repeating) {
- if (notNull) {
- for(uint64_t i=0; i < count; ++i) {
- if (notNull[position + i]) {
- data[position + i] = value;
- consumed += 1;
- }
- }
- } else {
- memset(data + position, value, count);
- consumed = count;
- }
- } else {
- if (notNull) {
- for(uint64_t i=0; i < count; ++i) {
- if (notNull[position + i]) {
- data[position + i] = readByte();
- consumed += 1;
- }
- }
- } else {
- uint64_t i = 0;
- while (i < count) {
- if (bufferStart == bufferEnd) {
- nextBuffer();
- }
- uint64_t copyBytes =
- std::min(static_cast<uint64_t>(count - i),
- static_cast<uint64_t>(bufferEnd - bufferStart));
- memcpy(data + position + i, bufferStart, copyBytes);
- bufferStart += copyBytes;
- i += copyBytes;
- }
- consumed = count;
- }
- }
- remainingValues -= consumed;
- position += count;
- // skip over any null values
- while (notNull && position < numValues && !notNull[position]) {
- position += 1;
- }
- }
- }
-
- std::unique_ptr<ByteRleDecoder> createByteRleDecoder
- (std::unique_ptr<SeekableInputStream> input) {
- return std::unique_ptr<ByteRleDecoder>(new ByteRleDecoderImpl
- (std::move(input)));
- }
-
- class BooleanRleDecoderImpl: public ByteRleDecoderImpl {
- public:
- BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input);
-
- virtual ~BooleanRleDecoderImpl();
-
- /**
- * Seek to a particular spot.
- */
- virtual void seek(PositionProvider&);
-
- /**
- * Seek over a given number of values.
- */
- virtual void skip(uint64_t numValues);
-
- /**
- * Read a number of values into the batch.
- */
- virtual void next(char* data, uint64_t numValues, char* notNull);
-
- protected:
- size_t remainingBits;
- char lastByte;
- };
-
- BooleanRleDecoderImpl::BooleanRleDecoderImpl
- (std::unique_ptr<SeekableInputStream> input
- ): ByteRleDecoderImpl(std::move(input)) {
- remainingBits = 0;
- lastByte = 0;
- }
-
- BooleanRleDecoderImpl::~BooleanRleDecoderImpl() {
- // PASS
- }
-
- void BooleanRleDecoderImpl::seek(PositionProvider& location) {
- ByteRleDecoderImpl::seek(location);
- uint64_t consumed = location.next();
- remainingBits = 0;
- if (consumed > 8) {
- throw ParseError("bad position");
- }
- if (consumed != 0) {
- remainingBits = 8 - consumed;
- ByteRleDecoderImpl::next(&lastByte, 1, nullptr);
- }
- }
-
- void BooleanRleDecoderImpl::skip(uint64_t numValues) {
- if (numValues <= remainingBits) {
- remainingBits -= numValues;
- } else {
- numValues -= remainingBits;
- uint64_t bytesSkipped = numValues / 8;
- ByteRleDecoderImpl::skip(bytesSkipped);
- if (numValues % 8 != 0) {
- ByteRleDecoderImpl::next(&lastByte, 1, nullptr);
- remainingBits = 8 - (numValues % 8);
- } else {
- remainingBits = 0;
- }
- }
- }
-
- void BooleanRleDecoderImpl::next(char* data, uint64_t numValues,
- char* notNull) {
- // next spot to fill in
- uint64_t position = 0;
-
- // use up any remaining bits
- if (notNull) {
- while(remainingBits > 0 && position < numValues) {
- if (notNull[position]) {
- remainingBits -= 1;
- data[position] = (static_cast<unsigned char>(lastByte) >>
- remainingBits) & 0x1;
- } else {
- data[position] = 0;
- }
- position += 1;
- }
- } else {
- while(remainingBits > 0 && position < numValues) {
- remainingBits -= 1;
- data[position++] = (static_cast<unsigned char>(lastByte) >>
- remainingBits) & 0x1;
- }
- }
-
- // count the number of nonNulls remaining
- uint64_t nonNulls = numValues - position;
- if (notNull) {
- for(uint64_t i=position; i < numValues; ++i) {
- if (!notNull[i]) {
- nonNulls -= 1;
- }
- }
- }
-
- // fill in the remaining values
- if (nonNulls == 0) {
- while (position < numValues) {
- data[position++] = 0;
- }
- } else if (position < numValues) {
- // read the new bytes into the array
- uint64_t bytesRead = (nonNulls + 7) / 8;
- ByteRleDecoderImpl::next(data + position, bytesRead, nullptr);
- lastByte = data[position + bytesRead - 1];
- remainingBits = bytesRead * 8 - nonNulls;
- // expand the array backwards so that we don't clobber the data
- uint64_t bitsLeft = bytesRead * 8 - remainingBits;
- if (notNull) {
- for(int64_t i=static_cast<int64_t>(numValues) - 1;
- i >= static_cast<int64_t>(position); --i) {
- if (notNull[i]) {
- uint64_t shiftPosn = (-bitsLeft) % 8;
- data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1;
- bitsLeft -= 1;
- } else {
- data[i] = 0;
- }
- }
- } else {
- for(int64_t i=static_cast<int64_t>(numValues) - 1;
- i >= static_cast<int64_t>(position); --i, --bitsLeft) {
- uint64_t shiftPosn = (-bitsLeft) % 8;
- data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1;
- }
- }
- }
- }
-
- std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder
- (std::unique_ptr<SeekableInputStream> input) {
- BooleanRleDecoderImpl* decoder =
- new BooleanRleDecoderImpl(std::move(input));
- return std::unique_ptr<ByteRleDecoder>(
- reinterpret_cast<ByteRleDecoder*>(decoder));
- }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <iostream>
+#include <string.h>
+#include <utility>
+
+#include "ByteRLE.hh"
+#include "orc/Exceptions.hh"
+
+namespace orc {
+
+ const int MINIMUM_REPEAT = 3;
+ const int MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT;
+ const int MAX_LITERAL_SIZE = 128;
+
+ ByteRleEncoder::~ByteRleEncoder() {
+ // PASS
+ }
+
+ class ByteRleEncoderImpl : public ByteRleEncoder {
+ public:
+ ByteRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output);
+ virtual ~ByteRleEncoderImpl() override;
+
+ /**
+ * Encode the next batch of values.
+ * @param data to be encoded
+ * @param numValues the number of values to be encoded
+ * @param notNull If the pointer is null, all values are read. If the
+ * pointer is not null, positions that are false are skipped.
+ */
+ virtual void add(const char* data, uint64_t numValues,
+ const char* notNull) override;
+
+ /**
+ * Get size of buffer used so far.
+ */
+ virtual uint64_t getBufferSize() const override;
+
+ /**
+ * Flush underlying BufferedOutputStream.
+ */
+ virtual uint64_t flush() override;
+
+ virtual void recordPosition(PositionRecorder* recorder) const override;
+
+ protected:
+ std::unique_ptr<BufferedOutputStream> outputStream;
+ char* literals;
+ int numLiterals;
+ bool repeat;
+ int tailRunLength;
+ int bufferPosition;
+ int bufferLength;
+ char* buffer;
+
+ void writeByte(char c);
+ void writeValues();
+ void write(char c);
+ };
+
+ ByteRleEncoderImpl::ByteRleEncoderImpl(
+ std::unique_ptr<BufferedOutputStream> output)
+ : outputStream(std::move(output)) {
+ literals = new char[MAX_LITERAL_SIZE];
+ numLiterals = 0;
+ tailRunLength = 0;
+ repeat = false;
+ bufferPosition = 0;
+ bufferLength = 0;
+ buffer = nullptr;
+ }
+
+ ByteRleEncoderImpl::~ByteRleEncoderImpl() {
+ // PASS
+ delete [] literals;
+ }
+
+ void ByteRleEncoderImpl::writeByte(char c) {
+ if (bufferPosition == bufferLength) {
+ int addedSize = 0;
+ if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) {
+ throw std::bad_alloc();
+ }
+ bufferPosition = 0;
+ bufferLength = addedSize;
+ }
+ buffer[bufferPosition++] = c;
+ }
+
+ void ByteRleEncoderImpl::add(
+ const char* data,
+ uint64_t numValues,
+ const char* notNull) {
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!notNull || notNull[i]) {
+ write(data[i]);
+ }
+ }
+ }
+
+ void ByteRleEncoderImpl::writeValues() {
+ if (numLiterals != 0) {
+ if (repeat) {
+ writeByte(
+ static_cast<char>(numLiterals - static_cast<int>(MINIMUM_REPEAT)));
+ writeByte(literals[0]);
+ } else {
+ writeByte(static_cast<char>(-numLiterals));
+ for (int i = 0; i < numLiterals; ++i) {
+ writeByte(literals[i]);
+ }
+ }
+ repeat = false;
+ tailRunLength = 0;
+ numLiterals = 0;
+ }
+ }
+
+ uint64_t ByteRleEncoderImpl::flush() {
+ writeValues();
+ outputStream->BackUp(bufferLength - bufferPosition);
+ uint64_t dataSize = outputStream->flush();
+ bufferLength = bufferPosition = 0;
+ return dataSize;
+ }
+
+ void ByteRleEncoderImpl::write(char value) {
+ if (numLiterals == 0) {
+ literals[numLiterals++] = value;
+ tailRunLength = 1;
+ } else if (repeat) {
+ if (value == literals[0]) {
+ numLiterals += 1;
+ if (numLiterals == MAXIMUM_REPEAT) {
+ writeValues();
+ }
+ } else {
+ writeValues();
+ literals[numLiterals++] = value;
+ tailRunLength = 1;
+ }
+ } else {
+ if (value == literals[numLiterals - 1]) {
+ tailRunLength += 1;
+ } else {
+ tailRunLength = 1;
+ }
+ if (tailRunLength == MINIMUM_REPEAT) {
+ if (numLiterals + 1 == MINIMUM_REPEAT) {
+ repeat = true;
+ numLiterals += 1;
+ } else {
+ numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1);
+ writeValues();
+ literals[0] = value;
+ repeat = true;
+ numLiterals = MINIMUM_REPEAT;
+ }
+ } else {
+ literals[numLiterals++] = value;
+ if (numLiterals == MAX_LITERAL_SIZE) {
+ writeValues();
+ }
+ }
+ }
+ }
+
+ uint64_t ByteRleEncoderImpl::getBufferSize() const {
+ return outputStream->getSize();
+ }
+
+ void ByteRleEncoderImpl::recordPosition(PositionRecorder *recorder) const {
+ uint64_t flushedSize = outputStream->getSize();
+ uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition);
+ if (outputStream->isCompressed()) {
+ // start of the compression chunk in the stream
+ recorder->add(flushedSize);
+ // number of decompressed bytes that need to be consumed
+ recorder->add(unflushedSize);
+ } else {
+ flushedSize -= static_cast<uint64_t>(bufferLength);
+ // byte offset of the RLE run’s start location
+ recorder->add(flushedSize + unflushedSize);
+ }
+ recorder->add(static_cast<uint64_t>(numLiterals));
+ }
+
+ std::unique_ptr<ByteRleEncoder> createByteRleEncoder
+ (std::unique_ptr<BufferedOutputStream> output) {
+ return std::unique_ptr<ByteRleEncoder>(new ByteRleEncoderImpl
+ (std::move(output)));
+ }
+
+ class BooleanRleEncoderImpl : public ByteRleEncoderImpl {
+ public:
+ BooleanRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output);
+ virtual ~BooleanRleEncoderImpl() override;
+
+ /**
+ * Encode the next batch of values
+ * @param data to be encoded
+ * @param numValues the number of values to be encoded
+ * @param notNull If the pointer is null, all values are read. If the
+ * pointer is not null, positions that are false are skipped.
+ */
+ virtual void add(const char* data, uint64_t numValues,
+ const char* notNull) override;
+
+ /**
+ * Flushing underlying BufferedOutputStream
+ */
+ virtual uint64_t flush() override;
+
+ virtual void recordPosition(PositionRecorder* recorder) const override;
+
+ private:
+ int bitsRemained;
+ char current;
+
+ };
+
+ BooleanRleEncoderImpl::BooleanRleEncoderImpl(
+ std::unique_ptr<BufferedOutputStream> output)
+ : ByteRleEncoderImpl(std::move(output)) {
+ bitsRemained = 8;
+ current = static_cast<char>(0);
+ }
+
+ BooleanRleEncoderImpl::~BooleanRleEncoderImpl() {
+ // PASS
+ }
+
+ void BooleanRleEncoderImpl::add(
+ const char* data,
+ uint64_t numValues,
+ const char* notNull) {
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (bitsRemained == 0) {
+ write(current);
+ current = static_cast<char>(0);
+ bitsRemained = 8;
+ }
+ if (!notNull || notNull[i]) {
+ if (!data || data[i]) {
+ current =
+ static_cast<char>(current | (0x80 >> (8 - bitsRemained)));
+ }
+ --bitsRemained;
+ }
+ }
+ if (bitsRemained == 0) {
+ write(current);
+ current = static_cast<char>(0);
+ bitsRemained = 8;
+ }
+ }
+
+ uint64_t BooleanRleEncoderImpl::flush() {
+ if (bitsRemained != 8) {
+ write(current);
+ }
+ bitsRemained = 8;
+ current = static_cast<char>(0);
+ return ByteRleEncoderImpl::flush();
+ }
+
+ void BooleanRleEncoderImpl::recordPosition(PositionRecorder* recorder) const {
+ ByteRleEncoderImpl::recordPosition(recorder);
+ recorder->add(static_cast<uint64_t>(8 - bitsRemained));
+ }
+
+ std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder
+ (std::unique_ptr<BufferedOutputStream> output) {
+ BooleanRleEncoderImpl* encoder =
+ new BooleanRleEncoderImpl(std::move(output)) ;
+ return std::unique_ptr<ByteRleEncoder>(
+ reinterpret_cast<ByteRleEncoder*>(encoder));
+ }
+
+ ByteRleDecoder::~ByteRleDecoder() {
+ // PASS
+ }
+
+ class ByteRleDecoderImpl: public ByteRleDecoder {
+ public:
+ ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input);
+
+ virtual ~ByteRleDecoderImpl();
+
+ /**
+ * Seek to a particular spot.
+ */
+ virtual void seek(PositionProvider&);
+
+ /**
+ * Seek over a given number of values.
+ */
+ virtual void skip(uint64_t numValues);
+
+ /**
+ * Read a number of values into the batch.
+ */
+ virtual void next(char* data, uint64_t numValues, char* notNull);
+
+ protected:
+ inline void nextBuffer();
+ inline signed char readByte();
+ inline void readHeader();
+
+ std::unique_ptr<SeekableInputStream> inputStream;
+ size_t remainingValues;
+ char value;
+ const char* bufferStart;
+ const char* bufferEnd;
+ bool repeating;
+ };
+
+ void ByteRleDecoderImpl::nextBuffer() {
+ int bufferLength;
+ const void* bufferPointer;
+ bool result = inputStream->Next(&bufferPointer, &bufferLength);
+ if (!result) {
+ throw ParseError("bad read in nextBuffer");
+ }
+ bufferStart = static_cast<const char*>(bufferPointer);
+ bufferEnd = bufferStart + bufferLength;
+ }
+
+ signed char ByteRleDecoderImpl::readByte() {
+ if (bufferStart == bufferEnd) {
+ nextBuffer();
+ }
+ return *(bufferStart++);
+ }
+
+ void ByteRleDecoderImpl::readHeader() {
+ signed char ch = readByte();
+ if (ch < 0) {
+ remainingValues = static_cast<size_t>(-ch);
+ repeating = false;
+ } else {
+ remainingValues = static_cast<size_t>(ch) + MINIMUM_REPEAT;
+ repeating = true;
+ value = readByte();
+ }
+ }
+
+ ByteRleDecoderImpl::ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream>
+ input) {
+ inputStream = std::move(input);
+ repeating = false;
+ remainingValues = 0;
+ value = 0;
+ bufferStart = nullptr;
+ bufferEnd = nullptr;
+ }
+
+ ByteRleDecoderImpl::~ByteRleDecoderImpl() {
+ // PASS
+ }
+
+ void ByteRleDecoderImpl::seek(PositionProvider& location) {
+ // move the input stream
+ inputStream->seek(location);
+ // force a re-read from the stream
+ bufferEnd = bufferStart;
+ // read a new header
+ readHeader();
+ // skip ahead the given number of records
+ ByteRleDecoderImpl::skip(location.next());
+ }
+
+ void ByteRleDecoderImpl::skip(uint64_t numValues) {
+ while (numValues > 0) {
+ if (remainingValues == 0) {
+ readHeader();
+ }
+ size_t count = std::min(static_cast<size_t>(numValues), remainingValues);
+ remainingValues -= count;
+ numValues -= count;
+ // for literals we need to skip over count bytes, which may involve
+ // reading from the underlying stream
+ if (!repeating) {
+ size_t consumedBytes = count;
+ while (consumedBytes > 0) {
+ if (bufferStart == bufferEnd) {
+ nextBuffer();
+ }
+ size_t skipSize = std::min(static_cast<size_t>(consumedBytes),
+ static_cast<size_t>(bufferEnd -
+ bufferStart));
+ bufferStart += skipSize;
+ consumedBytes -= skipSize;
+ }
+ }
+ }
+ }
+
+ void ByteRleDecoderImpl::next(char* data, uint64_t numValues,
+ char* notNull) {
+ uint64_t position = 0;
+ // skip over null values
+ while (notNull && position < numValues && !notNull[position]) {
+ position += 1;
+ }
+ while (position < numValues) {
+ // if we are out of values, read more
+ if (remainingValues == 0) {
+ readHeader();
+ }
+ // how many do we read out of this block?
+ size_t count = std::min(static_cast<size_t>(numValues - position),
+ remainingValues);
+ uint64_t consumed = 0;
+ if (repeating) {
+ if (notNull) {
+ for(uint64_t i=0; i < count; ++i) {
+ if (notNull[position + i]) {
+ data[position + i] = value;
+ consumed += 1;
+ }
+ }
+ } else {
+ memset(data + position, value, count);
+ consumed = count;
+ }
+ } else {
+ if (notNull) {
+ for(uint64_t i=0; i < count; ++i) {
+ if (notNull[position + i]) {
+ data[position + i] = readByte();
+ consumed += 1;
+ }
+ }
+ } else {
+ uint64_t i = 0;
+ while (i < count) {
+ if (bufferStart == bufferEnd) {
+ nextBuffer();
+ }
+ uint64_t copyBytes =
+ std::min(static_cast<uint64_t>(count - i),
+ static_cast<uint64_t>(bufferEnd - bufferStart));
+ memcpy(data + position + i, bufferStart, copyBytes);
+ bufferStart += copyBytes;
+ i += copyBytes;
+ }
+ consumed = count;
+ }
+ }
+ remainingValues -= consumed;
+ position += count;
+ // skip over any null values
+ while (notNull && position < numValues && !notNull[position]) {
+ position += 1;
+ }
+ }
+ }
+
+ std::unique_ptr<ByteRleDecoder> createByteRleDecoder
+ (std::unique_ptr<SeekableInputStream> input) {
+ return std::unique_ptr<ByteRleDecoder>(new ByteRleDecoderImpl
+ (std::move(input)));
+ }
+
+ class BooleanRleDecoderImpl: public ByteRleDecoderImpl {
+ public:
+ BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input);
+
+ virtual ~BooleanRleDecoderImpl();
+
+ /**
+ * Seek to a particular spot.
+ */
+ virtual void seek(PositionProvider&);
+
+ /**
+ * Seek over a given number of values.
+ */
+ virtual void skip(uint64_t numValues);
+
+ /**
+ * Read a number of values into the batch.
+ */
+ virtual void next(char* data, uint64_t numValues, char* notNull);
+
+ protected:
+ size_t remainingBits;
+ char lastByte;
+ };
+
+ BooleanRleDecoderImpl::BooleanRleDecoderImpl
+ (std::unique_ptr<SeekableInputStream> input
+ ): ByteRleDecoderImpl(std::move(input)) {
+ remainingBits = 0;
+ lastByte = 0;
+ }
+
+ BooleanRleDecoderImpl::~BooleanRleDecoderImpl() {
+ // PASS
+ }
+
+ void BooleanRleDecoderImpl::seek(PositionProvider& location) {
+ ByteRleDecoderImpl::seek(location);
+ uint64_t consumed = location.next();
+ remainingBits = 0;
+ if (consumed > 8) {
+ throw ParseError("bad position");
+ }
+ if (consumed != 0) {
+ remainingBits = 8 - consumed;
+ ByteRleDecoderImpl::next(&lastByte, 1, nullptr);
+ }
+ }
+
+ void BooleanRleDecoderImpl::skip(uint64_t numValues) {
+ if (numValues <= remainingBits) {
+ remainingBits -= numValues;
+ } else {
+ numValues -= remainingBits;
+ uint64_t bytesSkipped = numValues / 8;
+ ByteRleDecoderImpl::skip(bytesSkipped);
+ if (numValues % 8 != 0) {
+ ByteRleDecoderImpl::next(&lastByte, 1, nullptr);
+ remainingBits = 8 - (numValues % 8);
+ } else {
+ remainingBits = 0;
+ }
+ }
+ }
+
+ void BooleanRleDecoderImpl::next(char* data, uint64_t numValues,
+ char* notNull) {
+ // next spot to fill in
+ uint64_t position = 0;
+
+ // use up any remaining bits
+ if (notNull) {
+ while(remainingBits > 0 && position < numValues) {
+ if (notNull[position]) {
+ remainingBits -= 1;
+ data[position] = (static_cast<unsigned char>(lastByte) >>
+ remainingBits) & 0x1;
+ } else {
+ data[position] = 0;
+ }
+ position += 1;
+ }
+ } else {
+ while(remainingBits > 0 && position < numValues) {
+ remainingBits -= 1;
+ data[position++] = (static_cast<unsigned char>(lastByte) >>
+ remainingBits) & 0x1;
+ }
+ }
+
+ // count the number of nonNulls remaining
+ uint64_t nonNulls = numValues - position;
+ if (notNull) {
+ for(uint64_t i=position; i < numValues; ++i) {
+ if (!notNull[i]) {
+ nonNulls -= 1;
+ }
+ }
+ }
+
+ // fill in the remaining values
+ if (nonNulls == 0) {
+ while (position < numValues) {
+ data[position++] = 0;
+ }
+ } else if (position < numValues) {
+ // read the new bytes into the array
+ uint64_t bytesRead = (nonNulls + 7) / 8;
+ ByteRleDecoderImpl::next(data + position, bytesRead, nullptr);
+ lastByte = data[position + bytesRead - 1];
+ remainingBits = bytesRead * 8 - nonNulls;
+ // expand the array backwards so that we don't clobber the data
+ uint64_t bitsLeft = bytesRead * 8 - remainingBits;
+ if (notNull) {
+ for(int64_t i=static_cast<int64_t>(numValues) - 1;
+ i >= static_cast<int64_t>(position); --i) {
+ if (notNull[i]) {
+ uint64_t shiftPosn = (-bitsLeft) % 8;
+ data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1;
+ bitsLeft -= 1;
+ } else {
+ data[i] = 0;
+ }
+ }
+ } else {
+ for(int64_t i=static_cast<int64_t>(numValues) - 1;
+ i >= static_cast<int64_t>(position); --i, --bitsLeft) {
+ uint64_t shiftPosn = (-bitsLeft) % 8;
+ data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1;
+ }
+ }
+ }
+ }
+
+ std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder
+ (std::unique_ptr<SeekableInputStream> input) {
+ BooleanRleDecoderImpl* decoder =
+ new BooleanRleDecoderImpl(std::move(input));
+ return std::unique_ptr<ByteRleDecoder>(
+ reinterpret_cast<ByteRleDecoder*>(decoder));
+ }
+}
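
The run/literal framing used by writeValues() and readHeader() above can be restated outside of the ORC classes: a non-negative control byte h introduces a run of h + MINIMUM_REPEAT copies of the byte that follows, while a negative control byte -n is followed by n literal bytes. The following standalone sketch (not part of the ORC sources; decodeByteRle and its sample input are hypothetical) decodes such a stream under that assumption.

    // Standalone sketch mirroring the control-byte convention of ByteRLE.cc:
    // header >= 0  -> run of (header + MINIMUM_REPEAT) copies of the next byte
    // header <  0  -> (-header) literal bytes follow
    #include <iostream>
    #include <string>
    #include <vector>

    static const int MINIMUM_REPEAT = 3;   // same constant as in ByteRLE.cc

    // Decode one byte-RLE stream into raw bytes (hypothetical helper).
    std::vector<char> decodeByteRle(const std::vector<signed char>& encoded) {
      std::vector<char> out;
      size_t pos = 0;
      while (pos < encoded.size()) {
        signed char header = encoded[pos++];
        if (header >= 0) {                          // repeated run
          size_t runLength = static_cast<size_t>(header) + MINIMUM_REPEAT;
          char value = static_cast<char>(encoded[pos++]);
          out.insert(out.end(), runLength, value);
        } else {                                    // literal run
          size_t literalCount = static_cast<size_t>(-header);
          for (size_t i = 0; i < literalCount; ++i) {
            out.push_back(static_cast<char>(encoded[pos++]));
          }
        }
      }
      return out;
    }

    int main() {
      // Header 2  -> run of 2 + MINIMUM_REPEAT = 5 copies of 'a';
      // header -3 -> 3 literal bytes 'x', 'y', 'z'.
      std::vector<signed char> encoded = {2, 'a', -3, 'x', 'y', 'z'};
      std::vector<char> decoded = decodeByteRle(encoded);
      std::cout << std::string(decoded.begin(), decoded.end()) << "\n";  // aaaaaxyz
      return 0;
    }
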
diff --git a/contrib/libs/apache/orc/c++/src/ByteRLE.hh b/contrib/libs/apache/orc/c++/src/ByteRLE.hh
index 71ca579cd3..b799675aee 100644
--- a/contrib/libs/apache/orc/c++/src/ByteRLE.hh
+++ b/contrib/libs/apache/orc/c++/src/ByteRLE.hh
@@ -1,117 +1,117 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ORC_BYTE_RLE_HH
-#define ORC_BYTE_RLE_HH
-
-#include <memory>
-
-#include "io/InputStream.hh"
-#include "io/OutputStream.hh"
-
-namespace orc {
-
- class ByteRleEncoder {
- public:
- virtual ~ByteRleEncoder();
-
- /**
- * Encode the next batch of values
- * @param data to be encoded
- * @param numValues the number of values to be encoded
- * @param notNull If the pointer is null, all values are read. If the
- * pointer is not null, positions that are false are skipped.
- */
- virtual void add(const char* data, uint64_t numValues,
- const char* notNull) = 0;
-
- /**
- * Get size of buffer used so far.
- */
- virtual uint64_t getBufferSize() const = 0;
-
- /**
- * Flushing underlying output stream
- */
- virtual uint64_t flush() = 0;
-
- /**
- * record current position
- * @param recorder use the recorder to record current positions
- */
- virtual void recordPosition(PositionRecorder* recorder) const = 0;
- };
-
- class ByteRleDecoder {
- public:
- virtual ~ByteRleDecoder();
-
- /**
- * Seek to a particular spot.
- */
- virtual void seek(PositionProvider&) = 0;
-
- /**
- * Seek over a given number of values.
- */
- virtual void skip(uint64_t numValues) = 0;
-
- /**
- * Read a number of values into the batch.
- * @param data the array to read into
- * @param numValues the number of values to read
- * @param notNull If the pointer is null, all values are read. If the
- * pointer is not null, positions that are false are skipped.
- */
- virtual void next(char* data, uint64_t numValues, char* notNull) = 0;
- };
-
- /**
- * Create a byte RLE encoder.
- * @param output the output stream to write to
- */
- std::unique_ptr<ByteRleEncoder> createByteRleEncoder
- (std::unique_ptr<BufferedOutputStream> output);
-
- /**
- * Create a boolean RLE encoder.
- * @param output the output stream to write to
- */
- std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder
- (std::unique_ptr<BufferedOutputStream> output);
-
- /**
- * Create a byte RLE decoder.
- * @param input the input stream to read from
- */
- std::unique_ptr<ByteRleDecoder> createByteRleDecoder
- (std::unique_ptr<SeekableInputStream> input);
-
- /**
- * Create a boolean RLE decoder.
- *
- * Unlike the other RLE decoders, the boolean decoder sets the data to 0
- * if the value is masked by notNull. This is required for the notNull stream
- * processing to properly apply multiple masks from nested types.
- * @param input the input stream to read from
- */
- std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder
- (std::unique_ptr<SeekableInputStream> input);
-}
-
-#endif
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_BYTE_RLE_HH
+#define ORC_BYTE_RLE_HH
+
+#include <memory>
+
+#include "io/InputStream.hh"
+#include "io/OutputStream.hh"
+
+namespace orc {
+
+ class ByteRleEncoder {
+ public:
+ virtual ~ByteRleEncoder();
+
+ /**
+ * Encode the next batch of values
+ * @param data to be encoded
+ * @param numValues the number of values to be encoded
+ * @param notNull If the pointer is null, all values are read. If the
+ * pointer is not null, positions that are false are skipped.
+ */
+ virtual void add(const char* data, uint64_t numValues,
+ const char* notNull) = 0;
+
+ /**
+ * Get size of buffer used so far.
+ */
+ virtual uint64_t getBufferSize() const = 0;
+
+ /**
+ * Flushing underlying output stream
+ */
+ virtual uint64_t flush() = 0;
+
+ /**
+ * record current position
+ * @param recorder use the recorder to record current positions
+ */
+ virtual void recordPosition(PositionRecorder* recorder) const = 0;
+ };
+
+ class ByteRleDecoder {
+ public:
+ virtual ~ByteRleDecoder();
+
+ /**
+ * Seek to a particular spot.
+ */
+ virtual void seek(PositionProvider&) = 0;
+
+ /**
+ * Seek over a given number of values.
+ */
+ virtual void skip(uint64_t numValues) = 0;
+
+ /**
+ * Read a number of values into the batch.
+ * @param data the array to read into
+ * @param numValues the number of values to read
+ * @param notNull If the pointer is null, all values are read. If the
+ * pointer is not null, positions that are false are skipped.
+ */
+ virtual void next(char* data, uint64_t numValues, char* notNull) = 0;
+ };
+
+ /**
+ * Create a byte RLE encoder.
+ * @param output the output stream to write to
+ */
+ std::unique_ptr<ByteRleEncoder> createByteRleEncoder
+ (std::unique_ptr<BufferedOutputStream> output);
+
+ /**
+ * Create a boolean RLE encoder.
+ * @param output the output stream to write to
+ */
+ std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder
+ (std::unique_ptr<BufferedOutputStream> output);
+
+ /**
+ * Create a byte RLE decoder.
+ * @param input the input stream to read from
+ */
+ std::unique_ptr<ByteRleDecoder> createByteRleDecoder
+ (std::unique_ptr<SeekableInputStream> input);
+
+ /**
+ * Create a boolean RLE decoder.
+ *
+ * Unlike the other RLE decoders, the boolean decoder sets the data to 0
+ * if the value is masked by notNull. This is required for the notNull stream
+ * processing to properly apply multiple masks from nested types.
+ * @param input the input stream to read from
+ */
+ std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder
+ (std::unique_ptr<SeekableInputStream> input);
+}
+
+#endif
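
The boolean encoder declared here (and implemented in ByteRLE.cc above) packs eight boolean values into one byte, most significant bit first, and then feeds that byte stream to the byte-RLE layer. A minimal standalone sketch of that bit layout, assuming no null mask; packBooleansMsbFirst is a hypothetical helper, not an ORC API.

    // Sketch of the MSB-first bit packing performed by BooleanRleEncoderImpl::add.
    #include <cstdio>
    #include <vector>

    std::vector<unsigned char> packBooleansMsbFirst(const std::vector<bool>& values) {
      std::vector<unsigned char> bytes;
      unsigned char current = 0;
      int bitsRemained = 8;                       // matches the member name used in ByteRLE.cc
      for (bool v : values) {
        if (v) {
          // First value lands in bit 0x80, the next in 0x40, and so on.
          current |= static_cast<unsigned char>(0x80 >> (8 - bitsRemained));
        }
        if (--bitsRemained == 0) {                // byte is full, emit it
          bytes.push_back(current);
          current = 0;
          bitsRemained = 8;
        }
      }
      if (bitsRemained != 8) {                    // flush a partially filled byte
        bytes.push_back(current);
      }
      return bytes;
    }

    int main() {
      // 1,0,1,1,0,0,0,0 -> 0b10110000 = 0xB0
      std::vector<bool> bits = {true, false, true, true, false, false, false, false};
      for (unsigned char b : packBooleansMsbFirst(bits)) {
        std::printf("0x%02X\n", b);
      }
      return 0;
    }
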
diff --git a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc
index b4b5860cad..91c2904038 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc
+++ b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc
@@ -1,747 +1,747 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "orc/ColumnPrinter.hh"
-#include "orc/orc-config.hh"
-
-#include "Adaptor.hh"
-
-#include <limits>
-#include <sstream>
-#include <stdexcept>
-#include <time.h>
-#include <typeinfo>
-
-#ifdef __clang__
- #pragma clang diagnostic ignored "-Wformat-security"
-#endif
-
-namespace orc {
-
- class VoidColumnPrinter: public ColumnPrinter {
- public:
- VoidColumnPrinter(std::string&);
- ~VoidColumnPrinter() override {}
- void printRow(uint64_t rowId) override;
- void reset(const ColumnVectorBatch& batch) override;
- };
-
- class BooleanColumnPrinter: public ColumnPrinter {
- private:
- const int64_t* data;
- public:
- BooleanColumnPrinter(std::string&);
- ~BooleanColumnPrinter() override {}
- void printRow(uint64_t rowId) override;
- void reset(const ColumnVectorBatch& batch) override;
- };
-
- class LongColumnPrinter: public ColumnPrinter {
- private:
- const int64_t* data;
- public:
- LongColumnPrinter(std::string&);
- ~LongColumnPrinter() override {}
- void printRow(uint64_t rowId) override;
- void reset(const ColumnVectorBatch& batch) override;
- };
-
- class DoubleColumnPrinter: public ColumnPrinter {
- private:
- const double* data;
- const bool isFloat;
-
- public:
- DoubleColumnPrinter(std::string&, const Type& type);
- virtual ~DoubleColumnPrinter() override {}
- void printRow(uint64_t rowId) override;
- void reset(const ColumnVectorBatch& batch) override;
- };
-
- class TimestampColumnPrinter: public ColumnPrinter {
- private:
- const int64_t* seconds;
- const int64_t* nanoseconds;
-
- public:
- TimestampColumnPrinter(std::string&);
- ~TimestampColumnPrinter() override {}
- void printRow(uint64_t rowId) override;
- void reset(const ColumnVectorBatch& batch) override;
- };
-
- class DateColumnPrinter: public ColumnPrinter {
- private:
- const int64_t* data;
-
- public:
- DateColumnPrinter(std::string&);
- ~DateColumnPrinter() override {}
- void printRow(uint64_t rowId) override;
- void reset(const ColumnVectorBatch& batch) override;
- };
-
- class Decimal64ColumnPrinter: public ColumnPrinter {
- private:
- const int64_t* data;
- int32_t scale;
- public:
- Decimal64ColumnPrinter(std::string&);
- ~Decimal64ColumnPrinter() override {}
- void printRow(uint64_t rowId) override;
- void reset(const ColumnVectorBatch& batch) override;
- };
-
- class Decimal128ColumnPrinter: public ColumnPrinter {
- private:
- const Int128* data;
- int32_t scale;
- public:
- Decimal128ColumnPrinter(std::string&);
- ~Decimal128ColumnPrinter() override {}
- void printRow(uint64_t rowId) override;
- void reset(const ColumnVectorBatch& batch) override;
- };
-
- class StringColumnPrinter: public ColumnPrinter {
- private:
- const char* const * start;
- const int64_t* length;
- public:
- StringColumnPrinter(std::string&);
- virtual ~StringColumnPrinter() override {}
- void printRow(uint64_t rowId) override;
- void reset(const ColumnVectorBatch& batch) override;
- };
-
- class BinaryColumnPrinter: public ColumnPrinter {
- private:
- const char* const * start;
- const int64_t* length;
- public:
- BinaryColumnPrinter(std::string&);
- virtual ~BinaryColumnPrinter() override {}
- void printRow(uint64_t rowId) override;
- void reset(const ColumnVectorBatch& batch) override;
- };
-
- class ListColumnPrinter: public ColumnPrinter {
- private:
- const int64_t* offsets;
- std::unique_ptr<ColumnPrinter> elementPrinter;
-
- public:
- ListColumnPrinter(std::string&, const Type& type);
- virtual ~ListColumnPrinter() override {}
- void printRow(uint64_t rowId) override;
- void reset(const ColumnVectorBatch& batch) override;
- };
-
- class MapColumnPrinter: public ColumnPrinter {
- private:
- const int64_t* offsets;
- std::unique_ptr<ColumnPrinter> keyPrinter;
- std::unique_ptr<ColumnPrinter> elementPrinter;
-
- public:
- MapColumnPrinter(std::string&, const Type& type);
- virtual ~MapColumnPrinter() override {}
- void printRow(uint64_t rowId) override;
- void reset(const ColumnVectorBatch& batch) override;
- };
-
- class UnionColumnPrinter: public ColumnPrinter {
- private:
- const unsigned char *tags;
- const uint64_t* offsets;
- std::vector<ColumnPrinter*> fieldPrinter;
-
- public:
- UnionColumnPrinter(std::string&, const Type& type);
- virtual ~UnionColumnPrinter() override;
- void printRow(uint64_t rowId) override;
- void reset(const ColumnVectorBatch& batch) override;
- };
-
- class StructColumnPrinter: public ColumnPrinter {
- private:
- std::vector<ColumnPrinter*> fieldPrinter;
- std::vector<std::string> fieldNames;
- public:
- StructColumnPrinter(std::string&, const Type& type);
- virtual ~StructColumnPrinter() override;
- void printRow(uint64_t rowId) override;
- void reset(const ColumnVectorBatch& batch) override;
- };
-
- void writeChar(std::string& file, char ch) {
- file += ch;
- }
-
- void writeString(std::string& file, const char *ptr) {
- size_t len = strlen(ptr);
- file.append(ptr, len);
- }
-
- ColumnPrinter::ColumnPrinter(std::string& _buffer
- ): buffer(_buffer) {
- notNull = nullptr;
- hasNulls = false;
- }
-
- ColumnPrinter::~ColumnPrinter() {
- // PASS
- }
-
- void ColumnPrinter::reset(const ColumnVectorBatch& batch) {
- hasNulls = batch.hasNulls;
- if (hasNulls) {
- notNull = batch.notNull.data();
- } else {
- notNull = nullptr ;
- }
- }
-
- std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer,
- const Type* type) {
- ColumnPrinter *result = nullptr;
- if (type == nullptr) {
- result = new VoidColumnPrinter(buffer);
- } else {
- switch(static_cast<int64_t>(type->getKind())) {
- case BOOLEAN:
- result = new BooleanColumnPrinter(buffer);
- break;
-
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- result = new LongColumnPrinter(buffer);
- break;
-
- case FLOAT:
- case DOUBLE:
- result = new DoubleColumnPrinter(buffer, *type);
- break;
-
- case STRING:
- case VARCHAR :
- case CHAR:
- result = new StringColumnPrinter(buffer);
- break;
-
- case BINARY:
- result = new BinaryColumnPrinter(buffer);
- break;
-
- case TIMESTAMP:
- result = new TimestampColumnPrinter(buffer);
- break;
-
- case LIST:
- result = new ListColumnPrinter(buffer, *type);
- break;
-
- case MAP:
- result = new MapColumnPrinter(buffer, *type);
- break;
-
- case STRUCT:
- result = new StructColumnPrinter(buffer, *type);
- break;
-
- case DECIMAL:
- if (type->getPrecision() == 0 || type->getPrecision() > 18) {
- result = new Decimal128ColumnPrinter(buffer);
- } else {
- result = new Decimal64ColumnPrinter(buffer);
- }
- break;
-
- case DATE:
- result = new DateColumnPrinter(buffer);
- break;
-
- case UNION:
- result = new UnionColumnPrinter(buffer, *type);
- break;
-
- default:
- throw std::logic_error("unknown batch type");
- }
- }
- return std::unique_ptr<ColumnPrinter>(result);
- }
-
- VoidColumnPrinter::VoidColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer) {
- // PASS
- }
-
- void VoidColumnPrinter::reset(const ColumnVectorBatch&) {
- // PASS
- }
-
- void VoidColumnPrinter::printRow(uint64_t) {
- writeString(buffer, "null");
- }
-
- LongColumnPrinter::LongColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- data(nullptr) {
- // PASS
- }
-
- void LongColumnPrinter::reset(const ColumnVectorBatch& batch) {
- ColumnPrinter::reset(batch);
- data = dynamic_cast<const LongVectorBatch&>(batch).data.data();
- }
-
- void LongColumnPrinter::printRow(uint64_t rowId) {
- if (hasNulls && !notNull[rowId]) {
- writeString(buffer, "null");
- } else {
- char numBuffer[64];
- snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d",
- static_cast<int64_t >(data[rowId]));
- writeString(buffer, numBuffer);
- }
- }
-
- DoubleColumnPrinter::DoubleColumnPrinter(std::string& _buffer,
- const Type& type
- ): ColumnPrinter(_buffer),
- data(nullptr),
- isFloat(type.getKind() == FLOAT){
- // PASS
- }
-
- void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) {
- ColumnPrinter::reset(batch);
- data = dynamic_cast<const DoubleVectorBatch&>(batch).data.data();
- }
-
- void DoubleColumnPrinter::printRow(uint64_t rowId) {
- if (hasNulls && !notNull[rowId]) {
- writeString(buffer, "null");
- } else {
- char numBuffer[64];
- snprintf(numBuffer, sizeof(numBuffer), isFloat ? "%.7g" : "%.14g",
- data[rowId]);
- writeString(buffer, numBuffer);
- }
- }
-
- Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- data(nullptr),
- scale(0) {
- // PASS
- }
-
- void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) {
- ColumnPrinter::reset(batch);
- data = dynamic_cast<const Decimal64VectorBatch&>(batch).values.data();
- scale = dynamic_cast<const Decimal64VectorBatch&>(batch).scale;
- }
-
- std::string toDecimalString(int64_t value, int32_t scale) {
- std::stringstream buffer;
- if (scale == 0) {
- buffer << value;
- return buffer.str();
- }
- std::string sign = "";
- if (value < 0) {
- sign = "-";
- value = -value;
- }
- buffer << value;
- std::string str = buffer.str();
- int32_t len = static_cast<int32_t>(str.length());
- if (len > scale) {
- return sign + str.substr(0, static_cast<size_t>(len - scale)) + "." +
- str.substr(static_cast<size_t>(len - scale),
- static_cast<size_t>(scale));
- } else if (len == scale) {
- return sign + "0." + str;
- } else {
- std::string result = sign + "0.";
- for(int32_t i=0; i < scale - len; ++i) {
- result += "0";
- }
- return result + str;
- }
- }
-
- void Decimal64ColumnPrinter::printRow(uint64_t rowId) {
- if (hasNulls && !notNull[rowId]) {
- writeString(buffer, "null");
- } else {
- writeString(buffer, toDecimalString(data[rowId], scale).c_str());
- }
- }
-
- Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- data(nullptr),
- scale(0) {
- // PASS
- }
-
- void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) {
- ColumnPrinter::reset(batch);
- data = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data();
- scale = dynamic_cast<const Decimal128VectorBatch&>(batch).scale;
- }
-
- void Decimal128ColumnPrinter::printRow(uint64_t rowId) {
- if (hasNulls && !notNull[rowId]) {
- writeString(buffer, "null");
- } else {
- writeString(buffer, data[rowId].toDecimalString(scale).c_str());
- }
- }
-
- StringColumnPrinter::StringColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- start(nullptr),
- length(nullptr) {
- // PASS
- }
-
- void StringColumnPrinter::reset(const ColumnVectorBatch& batch) {
- ColumnPrinter::reset(batch);
- start = dynamic_cast<const StringVectorBatch&>(batch).data.data();
- length = dynamic_cast<const StringVectorBatch&>(batch).length.data();
- }
-
- void StringColumnPrinter::printRow(uint64_t rowId) {
- if (hasNulls && !notNull[rowId]) {
- writeString(buffer, "null");
- } else {
- writeChar(buffer, '"');
- for(int64_t i=0; i < length[rowId]; ++i) {
- char ch = static_cast<char>(start[rowId][i]);
- switch (ch) {
- case '\\':
- writeString(buffer, "\\\\");
- break;
- case '\b':
- writeString(buffer, "\\b");
- break;
- case '\f':
- writeString(buffer, "\\f");
- break;
- case '\n':
- writeString(buffer, "\\n");
- break;
- case '\r':
- writeString(buffer, "\\r");
- break;
- case '\t':
- writeString(buffer, "\\t");
- break;
- case '"':
- writeString(buffer, "\\\"");
- break;
- default:
- writeChar(buffer, ch);
- break;
- }
- }
- writeChar(buffer, '"');
- }
- }
-
- ListColumnPrinter::ListColumnPrinter(std::string& _buffer,
- const Type& type
- ): ColumnPrinter(_buffer),
- offsets(nullptr) {
- elementPrinter = createColumnPrinter(buffer, type.getSubtype(0));
- }
-
- void ListColumnPrinter::reset(const ColumnVectorBatch& batch) {
- ColumnPrinter::reset(batch);
- offsets = dynamic_cast<const ListVectorBatch&>(batch).offsets.data();
- elementPrinter->reset(*dynamic_cast<const ListVectorBatch&>(batch).
- elements);
- }
-
- void ListColumnPrinter::printRow(uint64_t rowId) {
- if (hasNulls && !notNull[rowId]) {
- writeString(buffer, "null");
- } else {
- writeChar(buffer, '[');
- for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) {
- if (i != offsets[rowId]) {
- writeString(buffer, ", ");
- }
- elementPrinter->printRow(static_cast<uint64_t>(i));
- }
- writeChar(buffer, ']');
- }
- }
-
- MapColumnPrinter::MapColumnPrinter(std::string& _buffer,
- const Type& type
- ): ColumnPrinter(_buffer),
- offsets(nullptr) {
- keyPrinter = createColumnPrinter(buffer, type.getSubtype(0));
- elementPrinter = createColumnPrinter(buffer, type.getSubtype(1));
- }
-
- void MapColumnPrinter::reset(const ColumnVectorBatch& batch) {
- ColumnPrinter::reset(batch);
- const MapVectorBatch& myBatch = dynamic_cast<const MapVectorBatch&>(batch);
- offsets = myBatch.offsets.data();
- keyPrinter->reset(*myBatch.keys);
- elementPrinter->reset(*myBatch.elements);
- }
-
- void MapColumnPrinter::printRow(uint64_t rowId) {
- if (hasNulls && !notNull[rowId]) {
- writeString(buffer, "null");
- } else {
- writeChar(buffer, '[');
- for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) {
- if (i != offsets[rowId]) {
- writeString(buffer, ", ");
- }
- writeString(buffer, "{\"key\": ");
- keyPrinter->printRow(static_cast<uint64_t>(i));
- writeString(buffer, ", \"value\": ");
- elementPrinter->printRow(static_cast<uint64_t>(i));
- writeChar(buffer, '}');
- }
- writeChar(buffer, ']');
- }
- }
-
- UnionColumnPrinter::UnionColumnPrinter(std::string& _buffer,
- const Type& type
- ): ColumnPrinter(_buffer),
- tags(nullptr),
- offsets(nullptr) {
- for(unsigned int i=0; i < type.getSubtypeCount(); ++i) {
- fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i))
- .release());
- }
- }
-
- UnionColumnPrinter::~UnionColumnPrinter() {
- for (size_t i = 0; i < fieldPrinter.size(); i++) {
- delete fieldPrinter[i];
- }
- }
-
- void UnionColumnPrinter::reset(const ColumnVectorBatch& batch) {
- ColumnPrinter::reset(batch);
- const UnionVectorBatch& unionBatch =
- dynamic_cast<const UnionVectorBatch&>(batch);
- tags = unionBatch.tags.data();
- offsets = unionBatch.offsets.data();
- for(size_t i=0; i < fieldPrinter.size(); ++i) {
- fieldPrinter[i]->reset(*(unionBatch.children[i]));
- }
- }
-
- void UnionColumnPrinter::printRow(uint64_t rowId) {
- if (hasNulls && !notNull[rowId]) {
- writeString(buffer, "null");
- } else {
- writeString(buffer, "{\"tag\": ");
- char numBuffer[64];
- snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d",
- static_cast<int64_t>(tags[rowId]));
- writeString(buffer, numBuffer);
- writeString(buffer, ", \"value\": ");
- fieldPrinter[tags[rowId]]->printRow(offsets[rowId]);
- writeChar(buffer, '}');
- }
- }
-
- StructColumnPrinter::StructColumnPrinter(std::string& _buffer,
- const Type& type
- ): ColumnPrinter(_buffer) {
- for(unsigned int i=0; i < type.getSubtypeCount(); ++i) {
- fieldNames.push_back(type.getFieldName(i));
- fieldPrinter.push_back(createColumnPrinter(buffer,
- type.getSubtype(i))
- .release());
- }
- }
-
- StructColumnPrinter::~StructColumnPrinter() {
- for (size_t i = 0; i < fieldPrinter.size(); i++) {
- delete fieldPrinter[i];
- }
- }
-
- void StructColumnPrinter::reset(const ColumnVectorBatch& batch) {
- ColumnPrinter::reset(batch);
- const StructVectorBatch& structBatch =
- dynamic_cast<const StructVectorBatch&>(batch);
- for(size_t i=0; i < fieldPrinter.size(); ++i) {
- fieldPrinter[i]->reset(*(structBatch.fields[i]));
- }
- }
-
- void StructColumnPrinter::printRow(uint64_t rowId) {
- if (hasNulls && !notNull[rowId]) {
- writeString(buffer, "null");
- } else {
- writeChar(buffer, '{');
- for(unsigned int i=0; i < fieldPrinter.size(); ++i) {
- if (i != 0) {
- writeString(buffer, ", ");
- }
- writeChar(buffer, '"');
- writeString(buffer, fieldNames[i].c_str());
- writeString(buffer, "\": ");
- fieldPrinter[i]->printRow(rowId);
- }
- writeChar(buffer, '}');
- }
- }
-
- DateColumnPrinter::DateColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- data(nullptr) {
- // PASS
- }
-
- void DateColumnPrinter::printRow(uint64_t rowId) {
- if (hasNulls && !notNull[rowId]) {
- writeString(buffer, "null");
- } else {
- const time_t timeValue = data[rowId] * 24 * 60 * 60;
- struct tm tmValue;
- gmtime_r(&timeValue, &tmValue);
- char timeBuffer[11];
- strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d", &tmValue);
- writeChar(buffer, '"');
- writeString(buffer, timeBuffer);
- writeChar(buffer, '"');
- }
- }
-
- void DateColumnPrinter::reset(const ColumnVectorBatch& batch) {
- ColumnPrinter::reset(batch);
- data = dynamic_cast<const LongVectorBatch&>(batch).data.data();
- }
-
- BooleanColumnPrinter::BooleanColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- data(nullptr) {
- // PASS
- }
-
- void BooleanColumnPrinter::printRow(uint64_t rowId) {
- if (hasNulls && !notNull[rowId]) {
- writeString(buffer, "null");
- } else {
- writeString(buffer, (data[rowId] ? "true" : "false"));
- }
- }
-
- void BooleanColumnPrinter::reset(const ColumnVectorBatch& batch) {
- ColumnPrinter::reset(batch);
- data = dynamic_cast<const LongVectorBatch&>(batch).data.data();
- }
-
- BinaryColumnPrinter::BinaryColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- start(nullptr),
- length(nullptr) {
- // PASS
- }
-
- void BinaryColumnPrinter::printRow(uint64_t rowId) {
- if (hasNulls && !notNull[rowId]) {
- writeString(buffer, "null");
- } else {
- writeChar(buffer, '[');
- for(int64_t i=0; i < length[rowId]; ++i) {
- if (i != 0) {
- writeString(buffer, ", ");
- }
- char numBuffer[64];
- snprintf(numBuffer, sizeof(numBuffer), "%d",
- (static_cast<const int>(start[rowId][i]) & 0xff));
- writeString(buffer, numBuffer);
- }
- writeChar(buffer, ']');
- }
- }
-
- void BinaryColumnPrinter::reset(const ColumnVectorBatch& batch) {
- ColumnPrinter::reset(batch);
- start = dynamic_cast<const StringVectorBatch&>(batch).data.data();
- length = dynamic_cast<const StringVectorBatch&>(batch).length.data();
- }
-
- TimestampColumnPrinter::TimestampColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- seconds(nullptr),
- nanoseconds(nullptr) {
- // PASS
- }
-
- void TimestampColumnPrinter::printRow(uint64_t rowId) {
- const int64_t NANO_DIGITS = 9;
- if (hasNulls && !notNull[rowId]) {
- writeString(buffer, "null");
- } else {
- int64_t nanos = nanoseconds[rowId];
- time_t secs = static_cast<time_t>(seconds[rowId]);
- struct tm tmValue;
- gmtime_r(&secs, &tmValue);
- char timeBuffer[20];
- strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
- writeChar(buffer, '"');
- writeString(buffer, timeBuffer);
- writeChar(buffer, '.');
- // remove trailing zeros off the back of the nanos value.
- int64_t zeroDigits = 0;
- if (nanos == 0) {
- zeroDigits = 8;
- } else {
- while (nanos % 10 == 0) {
- nanos /= 10;
- zeroDigits += 1;
- }
- }
- char numBuffer[64];
- snprintf(numBuffer, sizeof(numBuffer),
- "%0*" INT64_FORMAT_STRING "d\"",
- static_cast<int>(NANO_DIGITS - zeroDigits),
- static_cast<int64_t >(nanos));
- writeString(buffer, numBuffer);
- }
- }
-
- void TimestampColumnPrinter::reset(const ColumnVectorBatch& batch) {
- ColumnPrinter::reset(batch);
- const TimestampVectorBatch& ts =
- dynamic_cast<const TimestampVectorBatch&>(batch);
- seconds = ts.data.data();
- nanoseconds = ts.nanoseconds.data();
- }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/ColumnPrinter.hh"
+#include "orc/orc-config.hh"
+
+#include "Adaptor.hh"
+
+#include <limits>
+#include <sstream>
+#include <stdexcept>
+#include <time.h>
+#include <typeinfo>
+
+#ifdef __clang__
+ #pragma clang diagnostic ignored "-Wformat-security"
+#endif
+
+namespace orc {
+
+ class VoidColumnPrinter: public ColumnPrinter {
+ public:
+ VoidColumnPrinter(std::string&);
+ ~VoidColumnPrinter() override {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+ };
+
+ class BooleanColumnPrinter: public ColumnPrinter {
+ private:
+ const int64_t* data;
+ public:
+ BooleanColumnPrinter(std::string&);
+ ~BooleanColumnPrinter() override {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+ };
+
+ class LongColumnPrinter: public ColumnPrinter {
+ private:
+ const int64_t* data;
+ public:
+ LongColumnPrinter(std::string&);
+ ~LongColumnPrinter() override {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+ };
+
+ class DoubleColumnPrinter: public ColumnPrinter {
+ private:
+ const double* data;
+ const bool isFloat;
+
+ public:
+ DoubleColumnPrinter(std::string&, const Type& type);
+ virtual ~DoubleColumnPrinter() override {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+ };
+
+ class TimestampColumnPrinter: public ColumnPrinter {
+ private:
+ const int64_t* seconds;
+ const int64_t* nanoseconds;
+
+ public:
+ TimestampColumnPrinter(std::string&);
+ ~TimestampColumnPrinter() override {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+ };
+
+ class DateColumnPrinter: public ColumnPrinter {
+ private:
+ const int64_t* data;
+
+ public:
+ DateColumnPrinter(std::string&);
+ ~DateColumnPrinter() override {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+ };
+
+ class Decimal64ColumnPrinter: public ColumnPrinter {
+ private:
+ const int64_t* data;
+ int32_t scale;
+ public:
+ Decimal64ColumnPrinter(std::string&);
+ ~Decimal64ColumnPrinter() override {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+ };
+
+ class Decimal128ColumnPrinter: public ColumnPrinter {
+ private:
+ const Int128* data;
+ int32_t scale;
+ public:
+ Decimal128ColumnPrinter(std::string&);
+ ~Decimal128ColumnPrinter() override {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+ };
+
+ class StringColumnPrinter: public ColumnPrinter {
+ private:
+ const char* const * start;
+ const int64_t* length;
+ public:
+ StringColumnPrinter(std::string&);
+ virtual ~StringColumnPrinter() override {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+ };
+
+ class BinaryColumnPrinter: public ColumnPrinter {
+ private:
+ const char* const * start;
+ const int64_t* length;
+ public:
+ BinaryColumnPrinter(std::string&);
+ virtual ~BinaryColumnPrinter() override {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+ };
+
+ class ListColumnPrinter: public ColumnPrinter {
+ private:
+ const int64_t* offsets;
+ std::unique_ptr<ColumnPrinter> elementPrinter;
+
+ public:
+ ListColumnPrinter(std::string&, const Type& type);
+ virtual ~ListColumnPrinter() override {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+ };
+
+ class MapColumnPrinter: public ColumnPrinter {
+ private:
+ const int64_t* offsets;
+ std::unique_ptr<ColumnPrinter> keyPrinter;
+ std::unique_ptr<ColumnPrinter> elementPrinter;
+
+ public:
+ MapColumnPrinter(std::string&, const Type& type);
+ virtual ~MapColumnPrinter() override {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+ };
+
+ class UnionColumnPrinter: public ColumnPrinter {
+ private:
+ const unsigned char *tags;
+ const uint64_t* offsets;
+ std::vector<ColumnPrinter*> fieldPrinter;
+
+ public:
+ UnionColumnPrinter(std::string&, const Type& type);
+ virtual ~UnionColumnPrinter() override;
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+ };
+
+ class StructColumnPrinter: public ColumnPrinter {
+ private:
+ std::vector<ColumnPrinter*> fieldPrinter;
+ std::vector<std::string> fieldNames;
+ public:
+ StructColumnPrinter(std::string&, const Type& type);
+ virtual ~StructColumnPrinter() override;
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+ };
+
+ void writeChar(std::string& file, char ch) {
+ file += ch;
+ }
+
+ void writeString(std::string& file, const char *ptr) {
+ size_t len = strlen(ptr);
+ file.append(ptr, len);
+ }
+
+ ColumnPrinter::ColumnPrinter(std::string& _buffer
+ ): buffer(_buffer) {
+ notNull = nullptr;
+ hasNulls = false;
+ }
+
+ ColumnPrinter::~ColumnPrinter() {
+ // PASS
+ }
+
+ void ColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ hasNulls = batch.hasNulls;
+ if (hasNulls) {
+ notNull = batch.notNull.data();
+ } else {
+      notNull = nullptr;
+ }
+ }
+
+ std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer,
+ const Type* type) {
+ ColumnPrinter *result = nullptr;
+ if (type == nullptr) {
+ result = new VoidColumnPrinter(buffer);
+ } else {
+ switch(static_cast<int64_t>(type->getKind())) {
+ case BOOLEAN:
+ result = new BooleanColumnPrinter(buffer);
+ break;
+
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ result = new LongColumnPrinter(buffer);
+ break;
+
+ case FLOAT:
+ case DOUBLE:
+ result = new DoubleColumnPrinter(buffer, *type);
+ break;
+
+ case STRING:
+      case VARCHAR:
+ case CHAR:
+ result = new StringColumnPrinter(buffer);
+ break;
+
+ case BINARY:
+ result = new BinaryColumnPrinter(buffer);
+ break;
+
+ case TIMESTAMP:
+ result = new TimestampColumnPrinter(buffer);
+ break;
+
+ case LIST:
+ result = new ListColumnPrinter(buffer, *type);
+ break;
+
+ case MAP:
+ result = new MapColumnPrinter(buffer, *type);
+ break;
+
+ case STRUCT:
+ result = new StructColumnPrinter(buffer, *type);
+ break;
+
+ case DECIMAL:
+ if (type->getPrecision() == 0 || type->getPrecision() > 18) {
+ result = new Decimal128ColumnPrinter(buffer);
+ } else {
+ result = new Decimal64ColumnPrinter(buffer);
+ }
+ break;
+
+ case DATE:
+ result = new DateColumnPrinter(buffer);
+ break;
+
+ case UNION:
+ result = new UnionColumnPrinter(buffer, *type);
+ break;
+
+ default:
+ throw std::logic_error("unknown batch type");
+ }
+ }
+ return std::unique_ptr<ColumnPrinter>(result);
+ }
+
+ VoidColumnPrinter::VoidColumnPrinter(std::string& _buffer
+ ): ColumnPrinter(_buffer) {
+ // PASS
+ }
+
+ void VoidColumnPrinter::reset(const ColumnVectorBatch&) {
+ // PASS
+ }
+
+ void VoidColumnPrinter::printRow(uint64_t) {
+ writeString(buffer, "null");
+ }
+
+ LongColumnPrinter::LongColumnPrinter(std::string& _buffer
+ ): ColumnPrinter(_buffer),
+ data(nullptr) {
+ // PASS
+ }
+
+ void LongColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);
+ data = dynamic_cast<const LongVectorBatch&>(batch).data.data();
+ }
+
+ void LongColumnPrinter::printRow(uint64_t rowId) {
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {
+ char numBuffer[64];
+ snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d",
+ static_cast<int64_t >(data[rowId]));
+ writeString(buffer, numBuffer);
+ }
+ }
+
+ DoubleColumnPrinter::DoubleColumnPrinter(std::string& _buffer,
+ const Type& type
+ ): ColumnPrinter(_buffer),
+ data(nullptr),
+ isFloat(type.getKind() == FLOAT){
+ // PASS
+ }
+
+ void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);
+ data = dynamic_cast<const DoubleVectorBatch&>(batch).data.data();
+ }
+
+ void DoubleColumnPrinter::printRow(uint64_t rowId) {
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {
+ char numBuffer[64];
+ snprintf(numBuffer, sizeof(numBuffer), isFloat ? "%.7g" : "%.14g",
+ data[rowId]);
+ writeString(buffer, numBuffer);
+ }
+ }
+
+ Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& _buffer
+ ): ColumnPrinter(_buffer),
+ data(nullptr),
+ scale(0) {
+ // PASS
+ }
+
+ void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);
+ data = dynamic_cast<const Decimal64VectorBatch&>(batch).values.data();
+ scale = dynamic_cast<const Decimal64VectorBatch&>(batch).scale;
+ }
+
+ std::string toDecimalString(int64_t value, int32_t scale) {
+ std::stringstream buffer;
+ if (scale == 0) {
+ buffer << value;
+ return buffer.str();
+ }
+ std::string sign = "";
+ if (value < 0) {
+ sign = "-";
+ value = -value;
+ }
+ buffer << value;
+ std::string str = buffer.str();
+ int32_t len = static_cast<int32_t>(str.length());
+ if (len > scale) {
+ return sign + str.substr(0, static_cast<size_t>(len - scale)) + "." +
+ str.substr(static_cast<size_t>(len - scale),
+ static_cast<size_t>(scale));
+ } else if (len == scale) {
+ return sign + "0." + str;
+ } else {
+ std::string result = sign + "0.";
+ for(int32_t i=0; i < scale - len; ++i) {
+ result += "0";
+ }
+ return result + str;
+ }
+ }
+
+ void Decimal64ColumnPrinter::printRow(uint64_t rowId) {
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {
+ writeString(buffer, toDecimalString(data[rowId], scale).c_str());
+ }
+ }
+
+ Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& _buffer
+ ): ColumnPrinter(_buffer),
+ data(nullptr),
+ scale(0) {
+ // PASS
+ }
+
+ void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);
+ data = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data();
+ scale = dynamic_cast<const Decimal128VectorBatch&>(batch).scale;
+ }
+
+ void Decimal128ColumnPrinter::printRow(uint64_t rowId) {
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {
+ writeString(buffer, data[rowId].toDecimalString(scale).c_str());
+ }
+ }
+
+ StringColumnPrinter::StringColumnPrinter(std::string& _buffer
+ ): ColumnPrinter(_buffer),
+ start(nullptr),
+ length(nullptr) {
+ // PASS
+ }
+
+ void StringColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);
+ start = dynamic_cast<const StringVectorBatch&>(batch).data.data();
+ length = dynamic_cast<const StringVectorBatch&>(batch).length.data();
+ }
+
+ void StringColumnPrinter::printRow(uint64_t rowId) {
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {
+ writeChar(buffer, '"');
+ for(int64_t i=0; i < length[rowId]; ++i) {
+ char ch = static_cast<char>(start[rowId][i]);
+ switch (ch) {
+ case '\\':
+ writeString(buffer, "\\\\");
+ break;
+ case '\b':
+ writeString(buffer, "\\b");
+ break;
+ case '\f':
+ writeString(buffer, "\\f");
+ break;
+ case '\n':
+ writeString(buffer, "\\n");
+ break;
+ case '\r':
+ writeString(buffer, "\\r");
+ break;
+ case '\t':
+ writeString(buffer, "\\t");
+ break;
+ case '"':
+ writeString(buffer, "\\\"");
+ break;
+ default:
+ writeChar(buffer, ch);
+ break;
+ }
+ }
+ writeChar(buffer, '"');
+ }
+ }
+
+ ListColumnPrinter::ListColumnPrinter(std::string& _buffer,
+ const Type& type
+ ): ColumnPrinter(_buffer),
+ offsets(nullptr) {
+ elementPrinter = createColumnPrinter(buffer, type.getSubtype(0));
+ }
+
+ void ListColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);
+ offsets = dynamic_cast<const ListVectorBatch&>(batch).offsets.data();
+ elementPrinter->reset(*dynamic_cast<const ListVectorBatch&>(batch).
+ elements);
+ }
+
+ void ListColumnPrinter::printRow(uint64_t rowId) {
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {
+ writeChar(buffer, '[');
+ for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) {
+ if (i != offsets[rowId]) {
+ writeString(buffer, ", ");
+ }
+ elementPrinter->printRow(static_cast<uint64_t>(i));
+ }
+ writeChar(buffer, ']');
+ }
+ }
+
+ MapColumnPrinter::MapColumnPrinter(std::string& _buffer,
+ const Type& type
+ ): ColumnPrinter(_buffer),
+ offsets(nullptr) {
+ keyPrinter = createColumnPrinter(buffer, type.getSubtype(0));
+ elementPrinter = createColumnPrinter(buffer, type.getSubtype(1));
+ }
+
+ void MapColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);
+ const MapVectorBatch& myBatch = dynamic_cast<const MapVectorBatch&>(batch);
+ offsets = myBatch.offsets.data();
+ keyPrinter->reset(*myBatch.keys);
+ elementPrinter->reset(*myBatch.elements);
+ }
+
+ void MapColumnPrinter::printRow(uint64_t rowId) {
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {
+ writeChar(buffer, '[');
+ for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) {
+ if (i != offsets[rowId]) {
+ writeString(buffer, ", ");
+ }
+ writeString(buffer, "{\"key\": ");
+ keyPrinter->printRow(static_cast<uint64_t>(i));
+ writeString(buffer, ", \"value\": ");
+ elementPrinter->printRow(static_cast<uint64_t>(i));
+ writeChar(buffer, '}');
+ }
+ writeChar(buffer, ']');
+ }
+ }
+
+ UnionColumnPrinter::UnionColumnPrinter(std::string& _buffer,
+ const Type& type
+ ): ColumnPrinter(_buffer),
+ tags(nullptr),
+ offsets(nullptr) {
+ for(unsigned int i=0; i < type.getSubtypeCount(); ++i) {
+ fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i))
+ .release());
+ }
+ }
+
+ UnionColumnPrinter::~UnionColumnPrinter() {
+ for (size_t i = 0; i < fieldPrinter.size(); i++) {
+ delete fieldPrinter[i];
+ }
+ }
+
+ void UnionColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);
+ const UnionVectorBatch& unionBatch =
+ dynamic_cast<const UnionVectorBatch&>(batch);
+ tags = unionBatch.tags.data();
+ offsets = unionBatch.offsets.data();
+ for(size_t i=0; i < fieldPrinter.size(); ++i) {
+ fieldPrinter[i]->reset(*(unionBatch.children[i]));
+ }
+ }
+
+ void UnionColumnPrinter::printRow(uint64_t rowId) {
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {
+ writeString(buffer, "{\"tag\": ");
+ char numBuffer[64];
+ snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d",
+ static_cast<int64_t>(tags[rowId]));
+ writeString(buffer, numBuffer);
+ writeString(buffer, ", \"value\": ");
+ fieldPrinter[tags[rowId]]->printRow(offsets[rowId]);
+ writeChar(buffer, '}');
+ }
+ }
+
+ StructColumnPrinter::StructColumnPrinter(std::string& _buffer,
+ const Type& type
+ ): ColumnPrinter(_buffer) {
+ for(unsigned int i=0; i < type.getSubtypeCount(); ++i) {
+ fieldNames.push_back(type.getFieldName(i));
+ fieldPrinter.push_back(createColumnPrinter(buffer,
+ type.getSubtype(i))
+ .release());
+ }
+ }
+
+ StructColumnPrinter::~StructColumnPrinter() {
+ for (size_t i = 0; i < fieldPrinter.size(); i++) {
+ delete fieldPrinter[i];
+ }
+ }
+
+ void StructColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);
+ const StructVectorBatch& structBatch =
+ dynamic_cast<const StructVectorBatch&>(batch);
+ for(size_t i=0; i < fieldPrinter.size(); ++i) {
+ fieldPrinter[i]->reset(*(structBatch.fields[i]));
+ }
+ }
+
+ void StructColumnPrinter::printRow(uint64_t rowId) {
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {
+ writeChar(buffer, '{');
+ for(unsigned int i=0; i < fieldPrinter.size(); ++i) {
+ if (i != 0) {
+ writeString(buffer, ", ");
+ }
+ writeChar(buffer, '"');
+ writeString(buffer, fieldNames[i].c_str());
+ writeString(buffer, "\": ");
+ fieldPrinter[i]->printRow(rowId);
+ }
+ writeChar(buffer, '}');
+ }
+ }
+
+ DateColumnPrinter::DateColumnPrinter(std::string& _buffer
+ ): ColumnPrinter(_buffer),
+ data(nullptr) {
+ // PASS
+ }
+
+ void DateColumnPrinter::printRow(uint64_t rowId) {
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {
+ const time_t timeValue = data[rowId] * 24 * 60 * 60;
+ struct tm tmValue;
+ gmtime_r(&timeValue, &tmValue);
+ char timeBuffer[11];
+ strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d", &tmValue);
+ writeChar(buffer, '"');
+ writeString(buffer, timeBuffer);
+ writeChar(buffer, '"');
+ }
+ }
+
+ void DateColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);
+ data = dynamic_cast<const LongVectorBatch&>(batch).data.data();
+ }
+
+ BooleanColumnPrinter::BooleanColumnPrinter(std::string& _buffer
+ ): ColumnPrinter(_buffer),
+ data(nullptr) {
+ // PASS
+ }
+
+ void BooleanColumnPrinter::printRow(uint64_t rowId) {
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {
+ writeString(buffer, (data[rowId] ? "true" : "false"));
+ }
+ }
+
+ void BooleanColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);
+ data = dynamic_cast<const LongVectorBatch&>(batch).data.data();
+ }
+
+ BinaryColumnPrinter::BinaryColumnPrinter(std::string& _buffer
+ ): ColumnPrinter(_buffer),
+ start(nullptr),
+ length(nullptr) {
+ // PASS
+ }
+
+ void BinaryColumnPrinter::printRow(uint64_t rowId) {
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {
+ writeChar(buffer, '[');
+ for(int64_t i=0; i < length[rowId]; ++i) {
+ if (i != 0) {
+ writeString(buffer, ", ");
+ }
+ char numBuffer[64];
+ snprintf(numBuffer, sizeof(numBuffer), "%d",
+ (static_cast<const int>(start[rowId][i]) & 0xff));
+ writeString(buffer, numBuffer);
+ }
+ writeChar(buffer, ']');
+ }
+ }
+
+ void BinaryColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);
+ start = dynamic_cast<const StringVectorBatch&>(batch).data.data();
+ length = dynamic_cast<const StringVectorBatch&>(batch).length.data();
+ }
+
+ TimestampColumnPrinter::TimestampColumnPrinter(std::string& _buffer
+ ): ColumnPrinter(_buffer),
+ seconds(nullptr),
+ nanoseconds(nullptr) {
+ // PASS
+ }
+
+ void TimestampColumnPrinter::printRow(uint64_t rowId) {
+ const int64_t NANO_DIGITS = 9;
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {
+ int64_t nanos = nanoseconds[rowId];
+ time_t secs = static_cast<time_t>(seconds[rowId]);
+ struct tm tmValue;
+ gmtime_r(&secs, &tmValue);
+ char timeBuffer[20];
+ strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
+ writeChar(buffer, '"');
+ writeString(buffer, timeBuffer);
+ writeChar(buffer, '.');
+ // remove trailing zeros off the back of the nanos value.
+ int64_t zeroDigits = 0;
+ if (nanos == 0) {
+ zeroDigits = 8;
+ } else {
+ while (nanos % 10 == 0) {
+ nanos /= 10;
+ zeroDigits += 1;
+ }
+ }
+ char numBuffer[64];
+ snprintf(numBuffer, sizeof(numBuffer),
+ "%0*" INT64_FORMAT_STRING "d\"",
+ static_cast<int>(NANO_DIGITS - zeroDigits),
+ static_cast<int64_t >(nanos));
+ writeString(buffer, numBuffer);
+ }
+ }
+
+ void TimestampColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);
+ const TimestampVectorBatch& ts =
+ dynamic_cast<const TimestampVectorBatch&>(batch);
+ seconds = ts.data.data();
+ nanoseconds = ts.nanoseconds.data();
+ }
+}
diff --git a/contrib/libs/apache/orc/c++/src/ColumnReader.cc b/contrib/libs/apache/orc/c++/src/ColumnReader.cc
index 8cf660be11..aa891f5074 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnReader.cc
+++ b/contrib/libs/apache/orc/c++/src/ColumnReader.cc
@@ -1,1836 +1,1836 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "orc/Int128.hh"
-
-#include "Adaptor.hh"
-#include "ByteRLE.hh"
-#include "ColumnReader.hh"
-#include "orc/Exceptions.hh"
-#include "RLE.hh"
-
-#include <math.h>
-#include <iostream>
-
-namespace orc {
-
- StripeStreams::~StripeStreams() {
- // PASS
- }
-
- inline RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) {
- switch (static_cast<int64_t>(kind)) {
- case proto::ColumnEncoding_Kind_DIRECT:
- case proto::ColumnEncoding_Kind_DICTIONARY:
- return RleVersion_1;
- case proto::ColumnEncoding_Kind_DIRECT_V2:
- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
- return RleVersion_2;
- default:
- throw ParseError("Unknown encoding in convertRleVersion");
- }
- }
-
- ColumnReader::ColumnReader(const Type& type,
- StripeStreams& stripe
- ): columnId(type.getColumnId()),
- memoryPool(stripe.getMemoryPool()) {
- std::unique_ptr<SeekableInputStream> stream =
- stripe.getStream(columnId, proto::Stream_Kind_PRESENT, true);
- if (stream.get()) {
- notNullDecoder = createBooleanRleDecoder(std::move(stream));
- }
- }
-
- ColumnReader::~ColumnReader() {
- // PASS
- }
-
- uint64_t ColumnReader::skip(uint64_t numValues) {
- ByteRleDecoder* decoder = notNullDecoder.get();
- if (decoder) {
- // page through the values that we want to skip
- // and count how many are non-null
- const size_t MAX_BUFFER_SIZE = 32768;
- size_t bufferSize = std::min(MAX_BUFFER_SIZE,
- static_cast<size_t>(numValues));
- char buffer[MAX_BUFFER_SIZE];
- uint64_t remaining = numValues;
- while (remaining > 0) {
- uint64_t chunkSize =
- std::min(remaining,
- static_cast<uint64_t>(bufferSize));
- decoder->next(buffer, chunkSize, nullptr);
- remaining -= chunkSize;
- for(uint64_t i=0; i < chunkSize; ++i) {
- if (!buffer[i]) {
- numValues -= 1;
- }
- }
- }
- }
- return numValues;
- }
-
- void ColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* incomingMask) {
- if (numValues > rowBatch.capacity) {
- rowBatch.resize(numValues);
- }
- rowBatch.numElements = numValues;
- ByteRleDecoder* decoder = notNullDecoder.get();
- if (decoder) {
- char* notNullArray = rowBatch.notNull.data();
- decoder->next(notNullArray, numValues, incomingMask);
- // check to see if there are nulls in this batch
- for(uint64_t i=0; i < numValues; ++i) {
- if (!notNullArray[i]) {
- rowBatch.hasNulls = true;
- return;
- }
- }
- } else if (incomingMask) {
- // If we don't have a notNull stream, copy the incomingMask
- rowBatch.hasNulls = true;
- memcpy(rowBatch.notNull.data(), incomingMask, numValues);
- return;
- }
- rowBatch.hasNulls = false;
- }
-
- void ColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
- if (notNullDecoder.get()) {
- notNullDecoder->seek(positions.at(columnId));
- }
- }
-
- /**
- * Expand an array of bytes in place to the corresponding array of longs.
-   * Has to work backwards so that the data isn't clobbered during the
- * expansion.
- * @param buffer the array of chars and array of longs that need to be
- * expanded
- * @param numValues the number of bytes to convert to longs
- */
- void expandBytesToLongs(int64_t* buffer, uint64_t numValues) {
- for(size_t i=numValues - 1; i < numValues; --i) {
- buffer[i] = reinterpret_cast<char *>(buffer)[i];
- }
- }
-
- class BooleanColumnReader: public ColumnReader {
- private:
- std::unique_ptr<orc::ByteRleDecoder> rle;
-
- public:
-    BooleanColumnReader(const Type& type, StripeStreams& stripe);
- ~BooleanColumnReader() override;
-
- uint64_t skip(uint64_t numValues) override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
-
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
- };
-
- BooleanColumnReader::BooleanColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe){
- std::unique_ptr<SeekableInputStream> stream =
- stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (stream == nullptr)
- throw ParseError("DATA stream not found in Boolean column");
- rle = createBooleanRleDecoder(std::move(stream));
- }
-
- BooleanColumnReader::~BooleanColumnReader() {
- // PASS
- }
-
- uint64_t BooleanColumnReader::skip(uint64_t numValues) {
- numValues = ColumnReader::skip(numValues);
- rle->skip(numValues);
- return numValues;
- }
-
- void BooleanColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- // Since the byte rle places the output in a char* instead of long*,
- // we cheat here and use the long* and then expand it in a second pass.
- int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data();
- rle->next(reinterpret_cast<char*>(ptr),
- numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
- expandBytesToLongs(ptr, numValues);
- }
-
- void BooleanColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
- ColumnReader::seekToRowGroup(positions);
- rle->seek(positions.at(columnId));
- }
-
- class ByteColumnReader: public ColumnReader {
- private:
- std::unique_ptr<orc::ByteRleDecoder> rle;
-
- public:
-    ByteColumnReader(const Type& type, StripeStreams& stripe);
- ~ByteColumnReader() override;
-
- uint64_t skip(uint64_t numValues) override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
-
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
- };
-
- ByteColumnReader::ByteColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe){
- std::unique_ptr<SeekableInputStream> stream =
- stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (stream == nullptr)
- throw ParseError("DATA stream not found in Byte column");
- rle = createByteRleDecoder(std::move(stream));
- }
-
- ByteColumnReader::~ByteColumnReader() {
- // PASS
- }
-
- uint64_t ByteColumnReader::skip(uint64_t numValues) {
- numValues = ColumnReader::skip(numValues);
- rle->skip(numValues);
- return numValues;
- }
-
- void ByteColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- // Since the byte rle places the output in a char* instead of long*,
- // we cheat here and use the long* and then expand it in a second pass.
- int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data();
- rle->next(reinterpret_cast<char*>(ptr),
- numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
- expandBytesToLongs(ptr, numValues);
- }
-
- void ByteColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
- ColumnReader::seekToRowGroup(positions);
- rle->seek(positions.at(columnId));
- }
-
- class IntegerColumnReader: public ColumnReader {
- protected:
- std::unique_ptr<orc::RleDecoder> rle;
-
- public:
- IntegerColumnReader(const Type& type, StripeStreams& stripe);
- ~IntegerColumnReader() override;
-
- uint64_t skip(uint64_t numValues) override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
-
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
- };
-
- IntegerColumnReader::IntegerColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
- RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
- std::unique_ptr<SeekableInputStream> stream =
- stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (stream == nullptr)
- throw ParseError("DATA stream not found in Integer column");
- rle = createRleDecoder(std::move(stream), true, vers, memoryPool);
- }
-
- IntegerColumnReader::~IntegerColumnReader() {
- // PASS
- }
-
- uint64_t IntegerColumnReader::skip(uint64_t numValues) {
- numValues = ColumnReader::skip(numValues);
- rle->skip(numValues);
- return numValues;
- }
-
- void IntegerColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- rle->next(dynamic_cast<LongVectorBatch&>(rowBatch).data.data(),
- numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
- }
-
- void IntegerColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
- ColumnReader::seekToRowGroup(positions);
- rle->seek(positions.at(columnId));
- }
-
- class TimestampColumnReader: public ColumnReader {
- private:
- std::unique_ptr<orc::RleDecoder> secondsRle;
- std::unique_ptr<orc::RleDecoder> nanoRle;
- const Timezone& writerTimezone;
- const int64_t epochOffset;
-
- public:
- TimestampColumnReader(const Type& type, StripeStreams& stripe);
- ~TimestampColumnReader() override;
-
- uint64_t skip(uint64_t numValues) override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
-
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
- };
-
-
- TimestampColumnReader::TimestampColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe),
- writerTimezone(stripe.getWriterTimezone()),
- epochOffset(writerTimezone.getEpoch()) {
- RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
- std::unique_ptr<SeekableInputStream> stream =
- stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (stream == nullptr)
- throw ParseError("DATA stream not found in Timestamp column");
- secondsRle = createRleDecoder(std::move(stream), true, vers, memoryPool);
- stream = stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true);
- if (stream == nullptr)
- throw ParseError("SECONDARY stream not found in Timestamp column");
- nanoRle = createRleDecoder(std::move(stream), false, vers, memoryPool);
- }
-
- TimestampColumnReader::~TimestampColumnReader() {
- // PASS
- }
-
- uint64_t TimestampColumnReader::skip(uint64_t numValues) {
- numValues = ColumnReader::skip(numValues);
- secondsRle->skip(numValues);
- nanoRle->skip(numValues);
- return numValues;
- }
-
- void TimestampColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- TimestampVectorBatch& timestampBatch =
- dynamic_cast<TimestampVectorBatch&>(rowBatch);
- int64_t *secsBuffer = timestampBatch.data.data();
- secondsRle->next(secsBuffer, numValues, notNull);
- int64_t *nanoBuffer = timestampBatch.nanoseconds.data();
- nanoRle->next(nanoBuffer, numValues, notNull);
-
- // Construct the values
- for(uint64_t i=0; i < numValues; i++) {
- if (notNull == nullptr || notNull[i]) {
- uint64_t zeros = nanoBuffer[i] & 0x7;
- nanoBuffer[i] >>= 3;
- if (zeros != 0) {
- for(uint64_t j = 0; j <= zeros; ++j) {
- nanoBuffer[i] *= 10;
- }
- }
- int64_t writerTime = secsBuffer[i] + epochOffset;
- secsBuffer[i] = writerTimezone.convertToUTC(writerTime);
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Int128.hh"
+
+#include "Adaptor.hh"
+#include "ByteRLE.hh"
+#include "ColumnReader.hh"
+#include "orc/Exceptions.hh"
+#include "RLE.hh"
+
+#include <math.h>
+#include <iostream>
+
+namespace orc {
+
+ StripeStreams::~StripeStreams() {
+ // PASS
+ }
+
+ inline RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) {
+ switch (static_cast<int64_t>(kind)) {
+ case proto::ColumnEncoding_Kind_DIRECT:
+ case proto::ColumnEncoding_Kind_DICTIONARY:
+ return RleVersion_1;
+ case proto::ColumnEncoding_Kind_DIRECT_V2:
+ case proto::ColumnEncoding_Kind_DICTIONARY_V2:
+ return RleVersion_2;
+ default:
+ throw ParseError("Unknown encoding in convertRleVersion");
+ }
+ }
+
+ ColumnReader::ColumnReader(const Type& type,
+ StripeStreams& stripe
+ ): columnId(type.getColumnId()),
+ memoryPool(stripe.getMemoryPool()) {
+ std::unique_ptr<SeekableInputStream> stream =
+ stripe.getStream(columnId, proto::Stream_Kind_PRESENT, true);
+ if (stream.get()) {
+ notNullDecoder = createBooleanRleDecoder(std::move(stream));
+ }
+ }
+
+ ColumnReader::~ColumnReader() {
+ // PASS
+ }
+
+ uint64_t ColumnReader::skip(uint64_t numValues) {
+ ByteRleDecoder* decoder = notNullDecoder.get();
+ if (decoder) {
+ // page through the values that we want to skip
+ // and count how many are non-null
+ const size_t MAX_BUFFER_SIZE = 32768;
+ size_t bufferSize = std::min(MAX_BUFFER_SIZE,
+ static_cast<size_t>(numValues));
+ char buffer[MAX_BUFFER_SIZE];
+ uint64_t remaining = numValues;
+ while (remaining > 0) {
+ uint64_t chunkSize =
+ std::min(remaining,
+ static_cast<uint64_t>(bufferSize));
+ decoder->next(buffer, chunkSize, nullptr);
+ remaining -= chunkSize;
+ for(uint64_t i=0; i < chunkSize; ++i) {
+ if (!buffer[i]) {
+ numValues -= 1;
+ }
+ }
+ }
+ }
+ return numValues;
+ }
+
+ void ColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char* incomingMask) {
+ if (numValues > rowBatch.capacity) {
+ rowBatch.resize(numValues);
+ }
+ rowBatch.numElements = numValues;
+ ByteRleDecoder* decoder = notNullDecoder.get();
+ if (decoder) {
+ char* notNullArray = rowBatch.notNull.data();
+ decoder->next(notNullArray, numValues, incomingMask);
+ // check to see if there are nulls in this batch
+ for(uint64_t i=0; i < numValues; ++i) {
+ if (!notNullArray[i]) {
+ rowBatch.hasNulls = true;
+ return;
+ }
+ }
+ } else if (incomingMask) {
+ // If we don't have a notNull stream, copy the incomingMask
+ rowBatch.hasNulls = true;
+ memcpy(rowBatch.notNull.data(), incomingMask, numValues);
+ return;
+ }
+ rowBatch.hasNulls = false;
+ }
+
+ void ColumnReader::seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
+ if (notNullDecoder.get()) {
+ notNullDecoder->seek(positions.at(columnId));
+ }
+ }
+
+ /**
+ * Expand an array of bytes in place to the corresponding array of longs.
+   * Has to work backwards so that the data isn't clobbered during the
+ * expansion.
+ * @param buffer the array of chars and array of longs that need to be
+ * expanded
+ * @param numValues the number of bytes to convert to longs
+ */
+ void expandBytesToLongs(int64_t* buffer, uint64_t numValues) {
+ for(size_t i=numValues - 1; i < numValues; --i) {
+ buffer[i] = reinterpret_cast<char *>(buffer)[i];
+ }
+ }
+
+ class BooleanColumnReader: public ColumnReader {
+ private:
+ std::unique_ptr<orc::ByteRleDecoder> rle;
+
+ public:
+    BooleanColumnReader(const Type& type, StripeStreams& stripe);
+ ~BooleanColumnReader() override;
+
+ uint64_t skip(uint64_t numValues) override;
+
+ void next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char* notNull) override;
+
+ void seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ };
+
+ BooleanColumnReader::BooleanColumnReader(const Type& type,
+ StripeStreams& stripe
+ ): ColumnReader(type, stripe){
+ std::unique_ptr<SeekableInputStream> stream =
+ stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+ if (stream == nullptr)
+ throw ParseError("DATA stream not found in Boolean column");
+ rle = createBooleanRleDecoder(std::move(stream));
+ }
+
+ BooleanColumnReader::~BooleanColumnReader() {
+ // PASS
+ }
+
+ uint64_t BooleanColumnReader::skip(uint64_t numValues) {
+ numValues = ColumnReader::skip(numValues);
+ rle->skip(numValues);
+ return numValues;
+ }
+
+ void BooleanColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ // Since the byte rle places the output in a char* instead of long*,
+ // we cheat here and use the long* and then expand it in a second pass.
+ int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data();
+ rle->next(reinterpret_cast<char*>(ptr),
+ numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
+ expandBytesToLongs(ptr, numValues);
+ }
+
+ void BooleanColumnReader::seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
+ ColumnReader::seekToRowGroup(positions);
+ rle->seek(positions.at(columnId));
+ }
+
+ class ByteColumnReader: public ColumnReader {
+ private:
+ std::unique_ptr<orc::ByteRleDecoder> rle;
+
+ public:
+    ByteColumnReader(const Type& type, StripeStreams& stripe);
+ ~ByteColumnReader() override;
+
+ uint64_t skip(uint64_t numValues) override;
+
+ void next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char* notNull) override;
+
+ void seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ };
+
+ ByteColumnReader::ByteColumnReader(const Type& type,
+ StripeStreams& stripe
+ ): ColumnReader(type, stripe){
+ std::unique_ptr<SeekableInputStream> stream =
+ stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+ if (stream == nullptr)
+ throw ParseError("DATA stream not found in Byte column");
+ rle = createByteRleDecoder(std::move(stream));
+ }
+
+ ByteColumnReader::~ByteColumnReader() {
+ // PASS
+ }
+
+ uint64_t ByteColumnReader::skip(uint64_t numValues) {
+ numValues = ColumnReader::skip(numValues);
+ rle->skip(numValues);
+ return numValues;
+ }
+
+ void ByteColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ // Since the byte rle places the output in a char* instead of long*,
+ // we cheat here and use the long* and then expand it in a second pass.
+ int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data();
+ rle->next(reinterpret_cast<char*>(ptr),
+ numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
+ expandBytesToLongs(ptr, numValues);
+ }
+
+ void ByteColumnReader::seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
+ ColumnReader::seekToRowGroup(positions);
+ rle->seek(positions.at(columnId));
+ }
+
+ class IntegerColumnReader: public ColumnReader {
+ protected:
+ std::unique_ptr<orc::RleDecoder> rle;
+
+ public:
+ IntegerColumnReader(const Type& type, StripeStreams& stripe);
+ ~IntegerColumnReader() override;
+
+ uint64_t skip(uint64_t numValues) override;
+
+ void next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char* notNull) override;
+
+ void seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ };
+
+ IntegerColumnReader::IntegerColumnReader(const Type& type,
+ StripeStreams& stripe
+ ): ColumnReader(type, stripe) {
+ RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
+ std::unique_ptr<SeekableInputStream> stream =
+ stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+ if (stream == nullptr)
+ throw ParseError("DATA stream not found in Integer column");
+ rle = createRleDecoder(std::move(stream), true, vers, memoryPool);
+ }
+
+ IntegerColumnReader::~IntegerColumnReader() {
+ // PASS
+ }
+
+ uint64_t IntegerColumnReader::skip(uint64_t numValues) {
+ numValues = ColumnReader::skip(numValues);
+ rle->skip(numValues);
+ return numValues;
+ }
+
+ void IntegerColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ rle->next(dynamic_cast<LongVectorBatch&>(rowBatch).data.data(),
+ numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
+ }
+
+ void IntegerColumnReader::seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
+ ColumnReader::seekToRowGroup(positions);
+ rle->seek(positions.at(columnId));
+ }
+
+ class TimestampColumnReader: public ColumnReader {
+ private:
+ std::unique_ptr<orc::RleDecoder> secondsRle;
+ std::unique_ptr<orc::RleDecoder> nanoRle;
+ const Timezone& writerTimezone;
+ const int64_t epochOffset;
+
+ public:
+ TimestampColumnReader(const Type& type, StripeStreams& stripe);
+ ~TimestampColumnReader() override;
+
+ uint64_t skip(uint64_t numValues) override;
+
+ void next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char* notNull) override;
+
+ void seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ };
+
+
+ TimestampColumnReader::TimestampColumnReader(const Type& type,
+ StripeStreams& stripe
+ ): ColumnReader(type, stripe),
+ writerTimezone(stripe.getWriterTimezone()),
+ epochOffset(writerTimezone.getEpoch()) {
+ RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
+ std::unique_ptr<SeekableInputStream> stream =
+ stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+ if (stream == nullptr)
+ throw ParseError("DATA stream not found in Timestamp column");
+ secondsRle = createRleDecoder(std::move(stream), true, vers, memoryPool);
+ stream = stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true);
+ if (stream == nullptr)
+ throw ParseError("SECONDARY stream not found in Timestamp column");
+ nanoRle = createRleDecoder(std::move(stream), false, vers, memoryPool);
+ }
+
+ TimestampColumnReader::~TimestampColumnReader() {
+ // PASS
+ }
+
+ uint64_t TimestampColumnReader::skip(uint64_t numValues) {
+ numValues = ColumnReader::skip(numValues);
+ secondsRle->skip(numValues);
+ nanoRle->skip(numValues);
+ return numValues;
+ }
+
+ void TimestampColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
+ TimestampVectorBatch& timestampBatch =
+ dynamic_cast<TimestampVectorBatch&>(rowBatch);
+ int64_t *secsBuffer = timestampBatch.data.data();
+ secondsRle->next(secsBuffer, numValues, notNull);
+ int64_t *nanoBuffer = timestampBatch.nanoseconds.data();
+ nanoRle->next(nanoBuffer, numValues, notNull);
+
+ // Construct the values
+ for(uint64_t i=0; i < numValues; i++) {
+ if (notNull == nullptr || notNull[i]) {
+ uint64_t zeros = nanoBuffer[i] & 0x7;
+ nanoBuffer[i] >>= 3;
+ if (zeros != 0) {
+ for(uint64_t j = 0; j <= zeros; ++j) {
+ nanoBuffer[i] *= 10;
+ }
+ }
+ int64_t writerTime = secsBuffer[i] + epochOffset;
+ secsBuffer[i] = writerTimezone.convertToUTC(writerTime);
if (secsBuffer[i] < 0 && nanoBuffer[i] > 999999) {
- secsBuffer[i] -= 1;
- }
- }
- }
- }
-
- void TimestampColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
- ColumnReader::seekToRowGroup(positions);
- secondsRle->seek(positions.at(columnId));
- nanoRle->seek(positions.at(columnId));
- }
-
- class DoubleColumnReader: public ColumnReader {
- public:
- DoubleColumnReader(const Type& type, StripeStreams& stripe);
- ~DoubleColumnReader() override;
-
- uint64_t skip(uint64_t numValues) override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
-
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
-
- private:
- std::unique_ptr<SeekableInputStream> inputStream;
- TypeKind columnKind;
-    const uint64_t bytesPerValue;
- const char *bufferPointer;
- const char *bufferEnd;
-
- unsigned char readByte() {
- if (bufferPointer == bufferEnd) {
- int length;
- if (!inputStream->Next
- (reinterpret_cast<const void**>(&bufferPointer), &length)) {
- throw ParseError("bad read in DoubleColumnReader::next()");
- }
- bufferEnd = bufferPointer + length;
- }
- return static_cast<unsigned char>(*(bufferPointer++));
- }
-
- double readDouble() {
- int64_t bits = 0;
- for (uint64_t i=0; i < 8; i++) {
- bits |= static_cast<int64_t>(readByte()) << (i*8);
- }
- double *result = reinterpret_cast<double*>(&bits);
- return *result;
- }
-
- double readFloat() {
- int32_t bits = 0;
- for (uint64_t i=0; i < 4; i++) {
- bits |= readByte() << (i*8);
- }
- float *result = reinterpret_cast<float*>(&bits);
- return static_cast<double>(*result);
- }
- };
-
- DoubleColumnReader::DoubleColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe),
- columnKind(type.getKind()),
- bytesPerValue((type.getKind() ==
- FLOAT) ? 4 : 8),
- bufferPointer(nullptr),
- bufferEnd(nullptr) {
- inputStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (inputStream == nullptr)
- throw ParseError("DATA stream not found in Double column");
- }
-
- DoubleColumnReader::~DoubleColumnReader() {
- // PASS
- }
-
- uint64_t DoubleColumnReader::skip(uint64_t numValues) {
- numValues = ColumnReader::skip(numValues);
-
- if (static_cast<size_t>(bufferEnd - bufferPointer) >=
- bytesPerValue * numValues) {
- bufferPointer += bytesPerValue * numValues;
- } else {
- size_t sizeToSkip = bytesPerValue * numValues -
- static_cast<size_t>(bufferEnd - bufferPointer);
- const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max());
- while (sizeToSkip != 0) {
- size_t step = sizeToSkip > cap ? cap : sizeToSkip;
- inputStream->Skip(static_cast<int>(step));
- sizeToSkip -= step;
- }
- bufferEnd = nullptr;
- bufferPointer = nullptr;
- }
-
- return numValues;
- }
-
- void DoubleColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- // update the notNull from the parent class
- notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- double* outArray = dynamic_cast<DoubleVectorBatch&>(rowBatch).data.data();
-
- if (columnKind == FLOAT) {
- if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
- if (notNull[i]) {
- outArray[i] = readFloat();
- }
- }
- } else {
- for(size_t i=0; i < numValues; ++i) {
- outArray[i] = readFloat();
- }
- }
- } else {
- if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
- if (notNull[i]) {
- outArray[i] = readDouble();
- }
- }
- } else {
- for(size_t i=0; i < numValues; ++i) {
- outArray[i] = readDouble();
- }
- }
- }
- }
-
- void readFully(char* buffer, int64_t bufferSize, SeekableInputStream* stream) {
- int64_t posn = 0;
- while (posn < bufferSize) {
- const void* chunk;
- int length;
- if (!stream->Next(&chunk, &length)) {
- throw ParseError("bad read in readFully");
- }
- if (posn + length > bufferSize) {
- throw ParseError("Corrupt dictionary blob in StringDictionaryColumn");
- }
- memcpy(buffer + posn, chunk, static_cast<size_t>(length));
- posn += length;
- }
- }
-
- void DoubleColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
- ColumnReader::seekToRowGroup(positions);
- inputStream->seek(positions.at(columnId));
- }
-
- class StringDictionaryColumnReader: public ColumnReader {
- private:
- std::shared_ptr<StringDictionary> dictionary;
- std::unique_ptr<RleDecoder> rle;
-
- public:
-    StringDictionaryColumnReader(const Type& type, StripeStreams& stripe);
- ~StringDictionaryColumnReader() override;
-
- uint64_t skip(uint64_t numValues) override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
-
- void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
-
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
- };
-
- StringDictionaryColumnReader::StringDictionaryColumnReader
- (const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe),
- dictionary(new StringDictionary(stripe.getMemoryPool())) {
- RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId)
- .kind());
- uint32_t dictSize = stripe.getEncoding(columnId).dictionarysize();
- rle = createRleDecoder(stripe.getStream(columnId,
- proto::Stream_Kind_DATA,
- true),
- false, rleVersion, memoryPool);
- std::unique_ptr<RleDecoder> lengthDecoder =
- createRleDecoder(stripe.getStream(columnId,
- proto::Stream_Kind_LENGTH,
- false),
- false, rleVersion, memoryPool);
- dictionary->dictionaryOffset.resize(dictSize + 1);
- int64_t* lengthArray = dictionary->dictionaryOffset.data();
- lengthDecoder->next(lengthArray + 1, dictSize, nullptr);
- lengthArray[0] = 0;
- for(uint32_t i = 1; i < dictSize + 1; ++i) {
- lengthArray[i] += lengthArray[i - 1];
- }
- dictionary->dictionaryBlob.resize(
- static_cast<uint64_t>(lengthArray[dictSize]));
- std::unique_ptr<SeekableInputStream> blobStream =
- stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false);
- readFully(
- dictionary->dictionaryBlob.data(),
- lengthArray[dictSize],
- blobStream.get());
- }
-
- StringDictionaryColumnReader::~StringDictionaryColumnReader() {
- // PASS
- }
-
- uint64_t StringDictionaryColumnReader::skip(uint64_t numValues) {
- numValues = ColumnReader::skip(numValues);
- rle->skip(numValues);
- return numValues;
- }
-
- void StringDictionaryColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- // update the notNull from the parent class
- notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch);
- char *blob = dictionary->dictionaryBlob.data();
- int64_t *dictionaryOffsets = dictionary->dictionaryOffset.data();
- char **outputStarts = byteBatch.data.data();
- int64_t *outputLengths = byteBatch.length.data();
- rle->next(outputLengths, numValues, notNull);
- uint64_t dictionaryCount = dictionary->dictionaryOffset.size() - 1;
- if (notNull) {
- for(uint64_t i=0; i < numValues; ++i) {
- if (notNull[i]) {
- int64_t entry = outputLengths[i];
- if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount ) {
- throw ParseError("Entry index out of range in StringDictionaryColumn");
- }
- outputStarts[i] = blob + dictionaryOffsets[entry];
- outputLengths[i] = dictionaryOffsets[entry+1] -
- dictionaryOffsets[entry];
- }
- }
- } else {
- for(uint64_t i=0; i < numValues; ++i) {
- int64_t entry = outputLengths[i];
- if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount) {
- throw ParseError("Entry index out of range in StringDictionaryColumn");
- }
- outputStarts[i] = blob + dictionaryOffsets[entry];
- outputLengths[i] = dictionaryOffsets[entry+1] -
- dictionaryOffsets[entry];
- }
- }
- }
-
- void StringDictionaryColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- rowBatch.isEncoded = true;
-
- EncodedStringVectorBatch& batch = dynamic_cast<EncodedStringVectorBatch&>(rowBatch);
- batch.dictionary = this->dictionary;
-
- // Length buffer is reused to save dictionary entry ids
- rle->next(batch.index.data(), numValues, notNull);
- }
-
- void StringDictionaryColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
- ColumnReader::seekToRowGroup(positions);
- rle->seek(positions.at(columnId));
- }
-
-
- class StringDirectColumnReader: public ColumnReader {
- private:
- std::unique_ptr<RleDecoder> lengthRle;
- std::unique_ptr<SeekableInputStream> blobStream;
- const char *lastBuffer;
- size_t lastBufferLength;
-
- /**
- * Compute the total length of the values.
- * @param lengths the array of lengths
- * @param notNull the array of notNull flags
- * @param numValues the lengths of the arrays
- * @return the total number of bytes for the non-null values
- */
- size_t computeSize(const int64_t *lengths, const char *notNull,
- uint64_t numValues);
-
- public:
-    StringDirectColumnReader(const Type& type, StripeStreams& stripe);
- ~StringDirectColumnReader() override;
-
- uint64_t skip(uint64_t numValues) override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
-
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
- };
-
- StringDirectColumnReader::StringDirectColumnReader
- (const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
- RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId)
- .kind());
- std::unique_ptr<SeekableInputStream> stream =
- stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
- if (stream == nullptr)
- throw ParseError("LENGTH stream not found in StringDirectColumn");
- lengthRle = createRleDecoder(
- std::move(stream), false, rleVersion, memoryPool);
- blobStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (blobStream == nullptr)
- throw ParseError("DATA stream not found in StringDirectColumn");
- lastBuffer = nullptr;
- lastBufferLength = 0;
- }
-
- StringDirectColumnReader::~StringDirectColumnReader() {
- // PASS
- }
-
- uint64_t StringDirectColumnReader::skip(uint64_t numValues) {
- const size_t BUFFER_SIZE = 1024;
- numValues = ColumnReader::skip(numValues);
- int64_t buffer[BUFFER_SIZE];
- uint64_t done = 0;
- size_t totalBytes = 0;
-    // read the lengths, so we know how many bytes to skip
- while (done < numValues) {
- uint64_t step = std::min(BUFFER_SIZE,
- static_cast<size_t>(numValues - done));
- lengthRle->next(buffer, step, nullptr);
- totalBytes += computeSize(buffer, nullptr, step);
- done += step;
- }
- if (totalBytes <= lastBufferLength) {
- // subtract the needed bytes from the ones left over
- lastBufferLength -= totalBytes;
- lastBuffer += totalBytes;
- } else {
- // move the stream forward after accounting for the buffered bytes
- totalBytes -= lastBufferLength;
- const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max());
- while (totalBytes != 0) {
- size_t step = totalBytes > cap ? cap : totalBytes;
- blobStream->Skip(static_cast<int>(step));
- totalBytes -= step;
- }
- lastBufferLength = 0;
- lastBuffer = nullptr;
- }
- return numValues;
- }
-
- size_t StringDirectColumnReader::computeSize(const int64_t* lengths,
- const char* notNull,
- uint64_t numValues) {
- size_t totalLength = 0;
- if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
- if (notNull[i]) {
- totalLength += static_cast<size_t>(lengths[i]);
- }
- }
- } else {
- for(size_t i=0; i < numValues; ++i) {
- totalLength += static_cast<size_t>(lengths[i]);
- }
- }
- return totalLength;
- }
-
- void StringDirectColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- // update the notNull from the parent class
- notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch);
- char **startPtr = byteBatch.data.data();
- int64_t *lengthPtr = byteBatch.length.data();
-
- // read the length vector
- lengthRle->next(lengthPtr, numValues, notNull);
-
- // figure out the total length of data we need from the blob stream
- const size_t totalLength = computeSize(lengthPtr, notNull, numValues);
-
- // Load data from the blob stream into our buffer until we have enough
- // to get the rest directly out of the stream's buffer.
- size_t bytesBuffered = 0;
- byteBatch.blob.resize(totalLength);
- char *ptr= byteBatch.blob.data();
- while (bytesBuffered + lastBufferLength < totalLength) {
- memcpy(ptr + bytesBuffered, lastBuffer, lastBufferLength);
- bytesBuffered += lastBufferLength;
- const void* readBuffer;
- int readLength;
- if (!blobStream->Next(&readBuffer, &readLength)) {
- throw ParseError("failed to read in StringDirectColumnReader.next");
- }
- lastBuffer = static_cast<const char*>(readBuffer);
- lastBufferLength = static_cast<size_t>(readLength);
- }
-
- if (bytesBuffered < totalLength) {
- size_t moreBytes = totalLength - bytesBuffered;
- memcpy(ptr + bytesBuffered, lastBuffer, moreBytes);
- lastBuffer += moreBytes;
- lastBufferLength -= moreBytes;
- }
-
- size_t filledSlots = 0;
- ptr = byteBatch.blob.data();
- if (notNull) {
- while (filledSlots < numValues) {
- if (notNull[filledSlots]) {
- startPtr[filledSlots] = const_cast<char*>(ptr);
- ptr += lengthPtr[filledSlots];
- }
- filledSlots += 1;
- }
- } else {
- while (filledSlots < numValues) {
- startPtr[filledSlots] = const_cast<char*>(ptr);
- ptr += lengthPtr[filledSlots];
- filledSlots += 1;
- }
- }
- }
-
- void StringDirectColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
- ColumnReader::seekToRowGroup(positions);
- blobStream->seek(positions.at(columnId));
- lengthRle->seek(positions.at(columnId));
- }
-
- class StructColumnReader: public ColumnReader {
- private:
- std::vector<ColumnReader*> children;
-
- public:
-    StructColumnReader(const Type& type, StripeStreams& stripe);
- ~StructColumnReader() override;
-
- uint64_t skip(uint64_t numValues) override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
-
- void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
-
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
-
- private:
- template<bool encoded>
- void nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull);
- };
-
- StructColumnReader::StructColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
- // count the number of selected sub-columns
- const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
- switch (static_cast<int64_t>(stripe.getEncoding(columnId).kind())) {
- case proto::ColumnEncoding_Kind_DIRECT:
- for(unsigned int i=0; i < type.getSubtypeCount(); ++i) {
- const Type& child = *type.getSubtype(i);
- if (selectedColumns[static_cast<uint64_t>(child.getColumnId())]) {
- children.push_back(buildReader(child, stripe).release());
- }
- }
- break;
- case proto::ColumnEncoding_Kind_DIRECT_V2:
- case proto::ColumnEncoding_Kind_DICTIONARY:
- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
- default:
- throw ParseError("Unknown encoding for StructColumnReader");
- }
- }
-
- StructColumnReader::~StructColumnReader() {
- for (size_t i=0; i<children.size(); i++) {
- delete children[i];
- }
- }
-
- uint64_t StructColumnReader::skip(uint64_t numValues) {
- numValues = ColumnReader::skip(numValues);
- for(std::vector<ColumnReader*>::iterator ptr=children.begin(); ptr != children.end(); ++ptr) {
- (*ptr)->skip(numValues);
- }
- return numValues;
- }
-
- void StructColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- nextInternal<false>(rowBatch, numValues, notNull);
- }
-
- void StructColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- nextInternal<true>(rowBatch, numValues, notNull);
- }
-
- template<bool encoded>
- void StructColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- uint64_t i=0;
- notNull = rowBatch.hasNulls? rowBatch.notNull.data() : nullptr;
- for(std::vector<ColumnReader*>::iterator ptr=children.begin();
- ptr != children.end(); ++ptr, ++i) {
- if (encoded) {
- (*ptr)->nextEncoded(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]),
- numValues, notNull);
- } else {
- (*ptr)->next(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]),
- numValues, notNull);
- }
- }
- }
-
- void StructColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
- ColumnReader::seekToRowGroup(positions);
-
- for(std::vector<ColumnReader*>::iterator ptr = children.begin();
- ptr != children.end();
- ++ptr) {
- (*ptr)->seekToRowGroup(positions);
- }
- }
-
- class ListColumnReader: public ColumnReader {
- private:
- std::unique_ptr<ColumnReader> child;
- std::unique_ptr<RleDecoder> rle;
-
- public:
-    ListColumnReader(const Type& type, StripeStreams& stripe);
- ~ListColumnReader() override;
-
- uint64_t skip(uint64_t numValues) override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
-
- void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
-
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
-
- private:
- template<bool encoded>
- void nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull);
- };
-
- ListColumnReader::ListColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
- // count the number of selected sub-columns
- const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
- RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
- std::unique_ptr<SeekableInputStream> stream =
- stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
- if (stream == nullptr)
- throw ParseError("LENGTH stream not found in List column");
- rle = createRleDecoder(std::move(stream), false, vers, memoryPool);
- const Type& childType = *type.getSubtype(0);
- if (selectedColumns[static_cast<uint64_t>(childType.getColumnId())]) {
- child = buildReader(childType, stripe);
- }
- }
-
- ListColumnReader::~ListColumnReader() {
- // PASS
- }
-
- uint64_t ListColumnReader::skip(uint64_t numValues) {
- numValues = ColumnReader::skip(numValues);
- ColumnReader *childReader = child.get();
- if (childReader) {
- const uint64_t BUFFER_SIZE = 1024;
- int64_t buffer[BUFFER_SIZE];
- uint64_t childrenElements = 0;
- uint64_t lengthsRead = 0;
- while (lengthsRead < numValues) {
- uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
- rle->next(buffer, chunk, nullptr);
- for(size_t i=0; i < chunk; ++i) {
- childrenElements += static_cast<size_t>(buffer[i]);
- }
- lengthsRead += chunk;
- }
- childReader->skip(childrenElements);
- } else {
- rle->skip(numValues);
- }
- return numValues;
- }
-
- void ListColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- nextInternal<false>(rowBatch, numValues, notNull);
- }
-
- void ListColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- nextInternal<true>(rowBatch, numValues, notNull);
- }
-
- template<bool encoded>
- void ListColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- ListVectorBatch &listBatch = dynamic_cast<ListVectorBatch&>(rowBatch);
- int64_t* offsets = listBatch.offsets.data();
- notNull = listBatch.hasNulls ? listBatch.notNull.data() : nullptr;
- rle->next(offsets, numValues, notNull);
- uint64_t totalChildren = 0;
- if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
- if (notNull[i]) {
- uint64_t tmp = static_cast<uint64_t>(offsets[i]);
- offsets[i] = static_cast<int64_t>(totalChildren);
- totalChildren += tmp;
- } else {
- offsets[i] = static_cast<int64_t>(totalChildren);
- }
- }
- } else {
- for(size_t i=0; i < numValues; ++i) {
- uint64_t tmp = static_cast<uint64_t>(offsets[i]);
- offsets[i] = static_cast<int64_t>(totalChildren);
- totalChildren += tmp;
- }
- }
- offsets[numValues] = static_cast<int64_t>(totalChildren);
- ColumnReader *childReader = child.get();
- if (childReader) {
- if (encoded) {
- childReader->nextEncoded(*(listBatch.elements.get()), totalChildren, nullptr);
- } else {
- childReader->next(*(listBatch.elements.get()), totalChildren, nullptr);
- }
- }
- }
-
- void ListColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
- ColumnReader::seekToRowGroup(positions);
- rle->seek(positions.at(columnId));
- if (child.get()) {
- child->seekToRowGroup(positions);
- }
- }
-
- class MapColumnReader: public ColumnReader {
- private:
- std::unique_ptr<ColumnReader> keyReader;
- std::unique_ptr<ColumnReader> elementReader;
- std::unique_ptr<RleDecoder> rle;
-
- public:
-    MapColumnReader(const Type& type, StripeStreams& stripe);
- ~MapColumnReader() override;
-
- uint64_t skip(uint64_t numValues) override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
-
- void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
-
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
-
- private:
- template<bool encoded>
- void nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull);
- };
-
- MapColumnReader::MapColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
- // Determine if the key and/or value columns are selected
- const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
- RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
- std::unique_ptr<SeekableInputStream> stream =
- stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
- if (stream == nullptr)
- throw ParseError("LENGTH stream not found in Map column");
- rle = createRleDecoder(std::move(stream), false, vers, memoryPool);
- const Type& keyType = *type.getSubtype(0);
- if (selectedColumns[static_cast<uint64_t>(keyType.getColumnId())]) {
- keyReader = buildReader(keyType, stripe);
- }
- const Type& elementType = *type.getSubtype(1);
- if (selectedColumns[static_cast<uint64_t>(elementType.getColumnId())]) {
- elementReader = buildReader(elementType, stripe);
- }
- }
-
- MapColumnReader::~MapColumnReader() {
- // PASS
- }
-
- uint64_t MapColumnReader::skip(uint64_t numValues) {
- numValues = ColumnReader::skip(numValues);
- ColumnReader *rawKeyReader = keyReader.get();
- ColumnReader *rawElementReader = elementReader.get();
- if (rawKeyReader || rawElementReader) {
- const uint64_t BUFFER_SIZE = 1024;
- int64_t buffer[BUFFER_SIZE];
- uint64_t childrenElements = 0;
- uint64_t lengthsRead = 0;
- while (lengthsRead < numValues) {
- uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
- rle->next(buffer, chunk, nullptr);
- for(size_t i=0; i < chunk; ++i) {
- childrenElements += static_cast<size_t>(buffer[i]);
- }
- lengthsRead += chunk;
- }
- if (rawKeyReader) {
- rawKeyReader->skip(childrenElements);
- }
- if (rawElementReader) {
- rawElementReader->skip(childrenElements);
- }
- } else {
- rle->skip(numValues);
- }
- return numValues;
- }
-
- void MapColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull)
- {
- nextInternal<false>(rowBatch, numValues, notNull);
- }
-
- void MapColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull)
- {
- nextInternal<true>(rowBatch, numValues, notNull);
- }
-
- template<bool encoded>
- void MapColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- MapVectorBatch &mapBatch = dynamic_cast<MapVectorBatch&>(rowBatch);
- int64_t* offsets = mapBatch.offsets.data();
- notNull = mapBatch.hasNulls ? mapBatch.notNull.data() : nullptr;
- rle->next(offsets, numValues, notNull);
- uint64_t totalChildren = 0;
- if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
- if (notNull[i]) {
- uint64_t tmp = static_cast<uint64_t>(offsets[i]);
- offsets[i] = static_cast<int64_t>(totalChildren);
- totalChildren += tmp;
- } else {
- offsets[i] = static_cast<int64_t>(totalChildren);
- }
- }
- } else {
- for(size_t i=0; i < numValues; ++i) {
- uint64_t tmp = static_cast<uint64_t>(offsets[i]);
- offsets[i] = static_cast<int64_t>(totalChildren);
- totalChildren += tmp;
- }
- }
- offsets[numValues] = static_cast<int64_t>(totalChildren);
- ColumnReader *rawKeyReader = keyReader.get();
- if (rawKeyReader) {
- if (encoded) {
- rawKeyReader->nextEncoded(*(mapBatch.keys.get()), totalChildren, nullptr);
- } else {
- rawKeyReader->next(*(mapBatch.keys.get()), totalChildren, nullptr);
- }
- }
- ColumnReader *rawElementReader = elementReader.get();
- if (rawElementReader) {
- if (encoded) {
- rawElementReader->nextEncoded(*(mapBatch.elements.get()), totalChildren, nullptr);
- } else {
- rawElementReader->next(*(mapBatch.elements.get()), totalChildren, nullptr);
- }
- }
- }
-
- void MapColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
- ColumnReader::seekToRowGroup(positions);
- rle->seek(positions.at(columnId));
- if (keyReader.get()) {
- keyReader->seekToRowGroup(positions);
- }
- if (elementReader.get()) {
- elementReader->seekToRowGroup(positions);
- }
- }
-
- class UnionColumnReader: public ColumnReader {
- private:
- std::unique_ptr<ByteRleDecoder> rle;
- std::vector<ColumnReader*> childrenReader;
- std::vector<int64_t> childrenCounts;
- uint64_t numChildren;
-
- public:
-    UnionColumnReader(const Type& type, StripeStreams& stripe);
- ~UnionColumnReader() override;
-
- uint64_t skip(uint64_t numValues) override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
-
- void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
-
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
-
- private:
- template<bool encoded>
- void nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull);
- };
-
- UnionColumnReader::UnionColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
- numChildren = type.getSubtypeCount();
- childrenReader.resize(numChildren);
- childrenCounts.resize(numChildren);
-
- std::unique_ptr<SeekableInputStream> stream =
- stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (stream == nullptr)
-        throw ParseError("DATA stream not found in Union column");
- rle = createByteRleDecoder(std::move(stream));
- // figure out which types are selected
- const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
- for(unsigned int i=0; i < numChildren; ++i) {
- const Type &child = *type.getSubtype(i);
- if (selectedColumns[static_cast<size_t>(child.getColumnId())]) {
- childrenReader[i] = buildReader(child, stripe).release();
- }
- }
- }
-
- UnionColumnReader::~UnionColumnReader() {
- for(std::vector<ColumnReader*>::iterator itr = childrenReader.begin();
- itr != childrenReader.end(); ++itr) {
- delete *itr;
- }
- }
-
- uint64_t UnionColumnReader::skip(uint64_t numValues) {
- numValues = ColumnReader::skip(numValues);
- const uint64_t BUFFER_SIZE = 1024;
- char buffer[BUFFER_SIZE];
- uint64_t lengthsRead = 0;
- int64_t *counts = childrenCounts.data();
- memset(counts, 0, sizeof(int64_t) * numChildren);
- while (lengthsRead < numValues) {
- uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
- rle->next(buffer, chunk, nullptr);
- for(size_t i=0; i < chunk; ++i) {
- counts[static_cast<size_t>(buffer[i])] += 1;
- }
- lengthsRead += chunk;
- }
- for(size_t i=0; i < numChildren; ++i) {
- if (counts[i] != 0 && childrenReader[i] != nullptr) {
- childrenReader[i]->skip(static_cast<uint64_t>(counts[i]));
- }
- }
- return numValues;
- }
-
- void UnionColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- nextInternal<false>(rowBatch, numValues, notNull);
- }
-
- void UnionColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- nextInternal<true>(rowBatch, numValues, notNull);
- }
-
- template<bool encoded>
- void UnionColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- UnionVectorBatch &unionBatch = dynamic_cast<UnionVectorBatch&>(rowBatch);
- uint64_t* offsets = unionBatch.offsets.data();
- int64_t* counts = childrenCounts.data();
- memset(counts, 0, sizeof(int64_t) * numChildren);
- unsigned char* tags = unionBatch.tags.data();
- notNull = unionBatch.hasNulls ? unionBatch.notNull.data() : nullptr;
- rle->next(reinterpret_cast<char *>(tags), numValues, notNull);
- // set the offsets for each row
- if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
- if (notNull[i]) {
- offsets[i] =
- static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++);
- }
- }
- } else {
- for(size_t i=0; i < numValues; ++i) {
- offsets[i] =
- static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++);
- }
- }
- // read the right number of each child column
- for(size_t i=0; i < numChildren; ++i) {
- if (childrenReader[i] != nullptr) {
- if (encoded) {
- childrenReader[i]->nextEncoded(*(unionBatch.children[i]),
- static_cast<uint64_t>(counts[i]), nullptr);
- } else {
- childrenReader[i]->next(*(unionBatch.children[i]),
- static_cast<uint64_t>(counts[i]), nullptr);
- }
- }
- }
- }
-
- void UnionColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
- ColumnReader::seekToRowGroup(positions);
- rle->seek(positions.at(columnId));
- for(size_t i = 0; i < numChildren; ++i) {
- if (childrenReader[i] != nullptr) {
- childrenReader[i]->seekToRowGroup(positions);
- }
- }
- }
-
- /**
- * Destructively convert the number from zigzag encoding to the
- * natural signed representation.
- */
- void unZigZagInt128(Int128& value) {
- bool needsNegate = value.getLowBits() & 1;
- value >>= 1;
- if (needsNegate) {
- value.negate();
- value -= 1;
- }
- }
-
- class Decimal64ColumnReader: public ColumnReader {
- public:
- static const uint32_t MAX_PRECISION_64 = 18;
- static const uint32_t MAX_PRECISION_128 = 38;
- static const int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1];
-
- protected:
- std::unique_ptr<SeekableInputStream> valueStream;
- int32_t precision;
- int32_t scale;
- const char* buffer;
- const char* bufferEnd;
-
- std::unique_ptr<RleDecoder> scaleDecoder;
-
- /**
- * Read the valueStream for more bytes.
- */
- void readBuffer() {
- while (buffer == bufferEnd) {
- int length;
- if (!valueStream->Next(reinterpret_cast<const void**>(&buffer),
- &length)) {
- throw ParseError("Read past end of stream in Decimal64ColumnReader "+
- valueStream->getName());
- }
- bufferEnd = buffer + length;
- }
- }
-
- void readInt64(int64_t& value, int32_t currentScale) {
- value = 0;
- size_t offset = 0;
- while (true) {
- readBuffer();
- unsigned char ch = static_cast<unsigned char>(*(buffer++));
- value |= static_cast<uint64_t>(ch & 0x7f) << offset;
- offset += 7;
- if (!(ch & 0x80)) {
- break;
- }
- }
- value = unZigZag(static_cast<uint64_t>(value));
- if (scale > currentScale &&
- static_cast<uint64_t>(scale - currentScale) <= MAX_PRECISION_64) {
- value *= POWERS_OF_TEN[scale - currentScale];
- } else if (scale < currentScale &&
- static_cast<uint64_t>(currentScale - scale) <= MAX_PRECISION_64) {
- value /= POWERS_OF_TEN[currentScale - scale];
- } else if (scale != currentScale) {
- throw ParseError("Decimal scale out of range");
- }
- }
-
- public:
-    Decimal64ColumnReader(const Type& type, StripeStreams& stripe);
- ~Decimal64ColumnReader() override;
-
- uint64_t skip(uint64_t numValues) override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
-
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
- };
- const uint32_t Decimal64ColumnReader::MAX_PRECISION_64;
- const uint32_t Decimal64ColumnReader::MAX_PRECISION_128;
- const int64_t Decimal64ColumnReader::POWERS_OF_TEN[MAX_PRECISION_64 + 1]=
- {1,
- 10,
- 100,
- 1000,
- 10000,
- 100000,
- 1000000,
- 10000000,
- 100000000,
- 1000000000,
- 10000000000,
- 100000000000,
- 1000000000000,
- 10000000000000,
- 100000000000000,
- 1000000000000000,
- 10000000000000000,
- 100000000000000000,
- 1000000000000000000};
-
- Decimal64ColumnReader::Decimal64ColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
- scale = static_cast<int32_t>(type.getScale());
- precision = static_cast<int32_t>(type.getPrecision());
- valueStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (valueStream == nullptr)
- throw ParseError("DATA stream not found in Decimal64Column");
- buffer = nullptr;
- bufferEnd = nullptr;
- RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
- std::unique_ptr<SeekableInputStream> stream =
- stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true);
- if (stream == nullptr)
- throw ParseError("SECONDARY stream not found in Decimal64Column");
- scaleDecoder = createRleDecoder(std::move(stream), true, vers, memoryPool);
- }
-
- Decimal64ColumnReader::~Decimal64ColumnReader() {
- // PASS
- }
-
- uint64_t Decimal64ColumnReader::skip(uint64_t numValues) {
- numValues = ColumnReader::skip(numValues);
- uint64_t skipped = 0;
- while (skipped < numValues) {
- readBuffer();
- if (!(0x80 & *(buffer++))) {
- skipped += 1;
- }
- }
- scaleDecoder->skip(numValues);
- return numValues;
- }
-
- void Decimal64ColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- Decimal64VectorBatch &batch =
- dynamic_cast<Decimal64VectorBatch&>(rowBatch);
- int64_t* values = batch.values.data();
- // read the next group of scales
- int64_t* scaleBuffer = batch.readScales.data();
- scaleDecoder->next(scaleBuffer, numValues, notNull);
- batch.precision = precision;
- batch.scale = scale;
- if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
- if (notNull[i]) {
- readInt64(values[i], static_cast<int32_t>(scaleBuffer[i]));
- }
- }
- } else {
- for(size_t i=0; i < numValues; ++i) {
- readInt64(values[i], static_cast<int32_t>(scaleBuffer[i]));
- }
- }
- }
-
- void scaleInt128(Int128& value, uint32_t scale, uint32_t currentScale) {
- if (scale > currentScale) {
- while(scale > currentScale) {
- uint32_t scaleAdjust =
- std::min(Decimal64ColumnReader::MAX_PRECISION_64,
- scale - currentScale);
- value *= Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust];
- currentScale += scaleAdjust;
- }
- } else if (scale < currentScale) {
- Int128 remainder;
- while(currentScale > scale) {
- uint32_t scaleAdjust =
- std::min(Decimal64ColumnReader::MAX_PRECISION_64,
- currentScale - scale);
- value = value.divide(Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust],
- remainder);
- currentScale -= scaleAdjust;
- }
- }
- }
-
- void Decimal64ColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
- ColumnReader::seekToRowGroup(positions);
- valueStream->seek(positions.at(columnId));
- scaleDecoder->seek(positions.at(columnId));
- }
-
- class Decimal128ColumnReader: public Decimal64ColumnReader {
- public:
-    Decimal128ColumnReader(const Type& type, StripeStreams& stripe);
- ~Decimal128ColumnReader() override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
-
- private:
- void readInt128(Int128& value, int32_t currentScale) {
- value = 0;
- Int128 work;
- uint32_t offset = 0;
- while (true) {
- readBuffer();
- unsigned char ch = static_cast<unsigned char>(*(buffer++));
- work = ch & 0x7f;
- work <<= offset;
- value |= work;
- offset += 7;
- if (!(ch & 0x80)) {
- break;
- }
- }
- unZigZagInt128(value);
- scaleInt128(value, static_cast<uint32_t>(scale),
- static_cast<uint32_t>(currentScale));
- }
- };
-
- Decimal128ColumnReader::Decimal128ColumnReader
- (const Type& type,
- StripeStreams& stripe
- ): Decimal64ColumnReader(type, stripe) {
- // PASS
- }
-
- Decimal128ColumnReader::~Decimal128ColumnReader() {
- // PASS
- }
-
- void Decimal128ColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- Decimal128VectorBatch &batch =
- dynamic_cast<Decimal128VectorBatch&>(rowBatch);
- Int128* values = batch.values.data();
- // read the next group of scales
- int64_t* scaleBuffer = batch.readScales.data();
- scaleDecoder->next(scaleBuffer, numValues, notNull);
- batch.precision = precision;
- batch.scale = scale;
- if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
- if (notNull[i]) {
- readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]));
- }
- }
- } else {
- for(size_t i=0; i < numValues; ++i) {
- readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]));
- }
- }
- }
-
- class DecimalHive11ColumnReader: public Decimal64ColumnReader {
- private:
- bool throwOnOverflow;
- std::ostream* errorStream;
-
- /**
- * Read an Int128 from the stream and correct it to the desired scale.
- */
- bool readInt128(Int128& value, int32_t currentScale) {
- // -/+ 99999999999999999999999999999999999999
- static const Int128 MIN_VALUE(-0x4b3b4ca85a86c47b, 0xf675ddc000000001);
- static const Int128 MAX_VALUE( 0x4b3b4ca85a86c47a, 0x098a223fffffffff);
-
- value = 0;
- Int128 work;
- uint32_t offset = 0;
- bool result = true;
- while (true) {
- readBuffer();
- unsigned char ch = static_cast<unsigned char>(*(buffer++));
- work = ch & 0x7f;
- // If we have read more than 128 bits, we flag the error, but keep
- // reading bytes so the stream isn't thrown off.
- if (offset > 128 || (offset == 126 && work > 3)) {
- result = false;
- }
- work <<= offset;
- value |= work;
- offset += 7;
- if (!(ch & 0x80)) {
- break;
- }
- }
-
- if (!result) {
- return result;
- }
- unZigZagInt128(value);
- scaleInt128(value, static_cast<uint32_t>(scale),
- static_cast<uint32_t>(currentScale));
- return value >= MIN_VALUE && value <= MAX_VALUE;
- }
-
- public:
-    DecimalHive11ColumnReader(const Type& type, StripeStreams& stripe);
- ~DecimalHive11ColumnReader() override;
-
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
- };
-
- DecimalHive11ColumnReader::DecimalHive11ColumnReader
- (const Type& type,
- StripeStreams& stripe
- ): Decimal64ColumnReader(type, stripe) {
- scale = stripe.getForcedScaleOnHive11Decimal();
- throwOnOverflow = stripe.getThrowOnHive11DecimalOverflow();
- errorStream = stripe.getErrorStream();
- }
-
- DecimalHive11ColumnReader::~DecimalHive11ColumnReader() {
- // PASS
- }
-
- void DecimalHive11ColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
- ColumnReader::next(rowBatch, numValues, notNull);
- notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- Decimal128VectorBatch &batch =
- dynamic_cast<Decimal128VectorBatch&>(rowBatch);
- Int128* values = batch.values.data();
- // read the next group of scales
- int64_t* scaleBuffer = batch.readScales.data();
-
- scaleDecoder->next(scaleBuffer, numValues, notNull);
-
- batch.precision = precision;
- batch.scale = scale;
- if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
- if (notNull[i]) {
- if (!readInt128(values[i],
- static_cast<int32_t>(scaleBuffer[i]))) {
- if (throwOnOverflow) {
- throw ParseError("Hive 0.11 decimal was more than 38 digits.");
- } else {
- *errorStream << "Warning: "
- << "Hive 0.11 decimal with more than 38 digits "
- << "replaced by NULL.\n";
- notNull[i] = false;
- }
- }
- }
- }
- } else {
- for(size_t i=0; i < numValues; ++i) {
- if (!readInt128(values[i],
- static_cast<int32_t>(scaleBuffer[i]))) {
- if (throwOnOverflow) {
- throw ParseError("Hive 0.11 decimal was more than 38 digits.");
- } else {
- *errorStream << "Warning: "
- << "Hive 0.11 decimal with more than 38 digits "
- << "replaced by NULL.\n";
- batch.hasNulls = true;
- batch.notNull[i] = false;
- }
- }
- }
- }
- }
-
- /**
- * Create a reader for the given stripe.
- */
- std::unique_ptr<ColumnReader> buildReader(const Type& type,
- StripeStreams& stripe) {
- switch (static_cast<int64_t>(type.getKind())) {
- case DATE:
- case INT:
- case LONG:
- case SHORT:
- return std::unique_ptr<ColumnReader>(
- new IntegerColumnReader(type, stripe));
- case BINARY:
- case CHAR:
- case STRING:
- case VARCHAR:
- switch (static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())){
- case proto::ColumnEncoding_Kind_DICTIONARY:
- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
- return std::unique_ptr<ColumnReader>(
- new StringDictionaryColumnReader(type, stripe));
- case proto::ColumnEncoding_Kind_DIRECT:
- case proto::ColumnEncoding_Kind_DIRECT_V2:
- return std::unique_ptr<ColumnReader>(
- new StringDirectColumnReader(type, stripe));
- default:
- throw NotImplementedYet("buildReader unhandled string encoding");
- }
-
- case BOOLEAN:
- return std::unique_ptr<ColumnReader>(
- new BooleanColumnReader(type, stripe));
-
- case BYTE:
- return std::unique_ptr<ColumnReader>(
- new ByteColumnReader(type, stripe));
-
- case LIST:
- return std::unique_ptr<ColumnReader>(
- new ListColumnReader(type, stripe));
-
- case MAP:
- return std::unique_ptr<ColumnReader>(
- new MapColumnReader(type, stripe));
-
- case UNION:
- return std::unique_ptr<ColumnReader>(
- new UnionColumnReader(type, stripe));
-
- case STRUCT:
- return std::unique_ptr<ColumnReader>(
- new StructColumnReader(type, stripe));
-
- case FLOAT:
- case DOUBLE:
- return std::unique_ptr<ColumnReader>(
- new DoubleColumnReader(type, stripe));
-
- case TIMESTAMP:
- return std::unique_ptr<ColumnReader>
- (new TimestampColumnReader(type, stripe));
-
- case DECIMAL:
- // is this a Hive 0.11 or 0.12 file?
- if (type.getPrecision() == 0) {
- return std::unique_ptr<ColumnReader>
- (new DecimalHive11ColumnReader(type, stripe));
-
- // can we represent the values using int64_t?
- } else if (type.getPrecision() <=
- Decimal64ColumnReader::MAX_PRECISION_64) {
- return std::unique_ptr<ColumnReader>
- (new Decimal64ColumnReader(type, stripe));
-
- // otherwise we use the Int128 implementation
- } else {
- return std::unique_ptr<ColumnReader>
- (new Decimal128ColumnReader(type, stripe));
- }
-
- default:
- throw NotImplementedYet("buildReader unhandled type");
- }
- }
-
-}
+ secsBuffer[i] -= 1;
+ }
+ }
+ }
+ }
+
+ void TimestampColumnReader::seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
+ ColumnReader::seekToRowGroup(positions);
+ secondsRle->seek(positions.at(columnId));
+ nanoRle->seek(positions.at(columnId));
+ }
+
+ class DoubleColumnReader: public ColumnReader {
+ public:
+ DoubleColumnReader(const Type& type, StripeStreams& stripe);
+ ~DoubleColumnReader() override;
+
+ uint64_t skip(uint64_t numValues) override;
+
+ void next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char* notNull) override;
+
+ void seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) override;
+
+ private:
+ std::unique_ptr<SeekableInputStream> inputStream;
+ TypeKind columnKind;
+    const uint64_t bytesPerValue;
+ const char *bufferPointer;
+ const char *bufferEnd;
+
+ unsigned char readByte() {
+ if (bufferPointer == bufferEnd) {
+ int length;
+ if (!inputStream->Next
+ (reinterpret_cast<const void**>(&bufferPointer), &length)) {
+ throw ParseError("bad read in DoubleColumnReader::next()");
+ }
+ bufferEnd = bufferPointer + length;
+ }
+ return static_cast<unsigned char>(*(bufferPointer++));
+ }
+
+ double readDouble() {
+ int64_t bits = 0;
+ for (uint64_t i=0; i < 8; i++) {
+ bits |= static_cast<int64_t>(readByte()) << (i*8);
+ }
+ double *result = reinterpret_cast<double*>(&bits);
+ return *result;
+ }
+
+ double readFloat() {
+ int32_t bits = 0;
+ for (uint64_t i=0; i < 4; i++) {
+ bits |= readByte() << (i*8);
+ }
+ float *result = reinterpret_cast<float*>(&bits);
+ return static_cast<double>(*result);
+ }
+ };
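+
+  // Illustrative note: readFloat/readDouble reassemble little-endian IEEE-754
+  // values byte by byte. For example, the four bytes 0x00 0x00 0x80 0x3F
+  // combine to the bit pattern 0x3F800000, which readFloat returns as 1.0.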
+
+ DoubleColumnReader::DoubleColumnReader(const Type& type,
+ StripeStreams& stripe
+ ): ColumnReader(type, stripe),
+ columnKind(type.getKind()),
+ bytesPerValue((type.getKind() ==
+ FLOAT) ? 4 : 8),
+ bufferPointer(nullptr),
+ bufferEnd(nullptr) {
+ inputStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+ if (inputStream == nullptr)
+ throw ParseError("DATA stream not found in Double column");
+ }
+
+ DoubleColumnReader::~DoubleColumnReader() {
+ // PASS
+ }
+
+ uint64_t DoubleColumnReader::skip(uint64_t numValues) {
+ numValues = ColumnReader::skip(numValues);
+
+ if (static_cast<size_t>(bufferEnd - bufferPointer) >=
+ bytesPerValue * numValues) {
+ bufferPointer += bytesPerValue * numValues;
+ } else {
+ size_t sizeToSkip = bytesPerValue * numValues -
+ static_cast<size_t>(bufferEnd - bufferPointer);
+ const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max());
+ while (sizeToSkip != 0) {
+ size_t step = sizeToSkip > cap ? cap : sizeToSkip;
+ inputStream->Skip(static_cast<int>(step));
+ sizeToSkip -= step;
+ }
+ bufferEnd = nullptr;
+ bufferPointer = nullptr;
+ }
+
+ return numValues;
+ }
+
+ void DoubleColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ // update the notNull from the parent class
+ notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
+ double* outArray = dynamic_cast<DoubleVectorBatch&>(rowBatch).data.data();
+
+ if (columnKind == FLOAT) {
+ if (notNull) {
+ for(size_t i=0; i < numValues; ++i) {
+ if (notNull[i]) {
+ outArray[i] = readFloat();
+ }
+ }
+ } else {
+ for(size_t i=0; i < numValues; ++i) {
+ outArray[i] = readFloat();
+ }
+ }
+ } else {
+ if (notNull) {
+ for(size_t i=0; i < numValues; ++i) {
+ if (notNull[i]) {
+ outArray[i] = readDouble();
+ }
+ }
+ } else {
+ for(size_t i=0; i < numValues; ++i) {
+ outArray[i] = readDouble();
+ }
+ }
+ }
+ }
+
+ void readFully(char* buffer, int64_t bufferSize, SeekableInputStream* stream) {
+ int64_t posn = 0;
+ while (posn < bufferSize) {
+ const void* chunk;
+ int length;
+ if (!stream->Next(&chunk, &length)) {
+ throw ParseError("bad read in readFully");
+ }
+ if (posn + length > bufferSize) {
+ throw ParseError("Corrupt dictionary blob in StringDictionaryColumn");
+ }
+ memcpy(buffer + posn, chunk, static_cast<size_t>(length));
+ posn += length;
+ }
+ }
+
+ void DoubleColumnReader::seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
+ ColumnReader::seekToRowGroup(positions);
+ inputStream->seek(positions.at(columnId));
+ }
+
+ class StringDictionaryColumnReader: public ColumnReader {
+ private:
+ std::shared_ptr<StringDictionary> dictionary;
+ std::unique_ptr<RleDecoder> rle;
+
+ public:
+    StringDictionaryColumnReader(const Type& type, StripeStreams& stripe);
+ ~StringDictionaryColumnReader() override;
+
+ uint64_t skip(uint64_t numValues) override;
+
+ void next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) override;
+
+ void nextEncoded(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char* notNull) override;
+
+ void seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ };
+
+ StringDictionaryColumnReader::StringDictionaryColumnReader
+ (const Type& type,
+ StripeStreams& stripe
+ ): ColumnReader(type, stripe),
+ dictionary(new StringDictionary(stripe.getMemoryPool())) {
+ RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId)
+ .kind());
+ uint32_t dictSize = stripe.getEncoding(columnId).dictionarysize();
+ rle = createRleDecoder(stripe.getStream(columnId,
+ proto::Stream_Kind_DATA,
+ true),
+ false, rleVersion, memoryPool);
+ std::unique_ptr<RleDecoder> lengthDecoder =
+ createRleDecoder(stripe.getStream(columnId,
+ proto::Stream_Kind_LENGTH,
+ false),
+ false, rleVersion, memoryPool);
+ dictionary->dictionaryOffset.resize(dictSize + 1);
+ int64_t* lengthArray = dictionary->dictionaryOffset.data();
+ lengthDecoder->next(lengthArray + 1, dictSize, nullptr);
+ lengthArray[0] = 0;
+ for(uint32_t i = 1; i < dictSize + 1; ++i) {
+ lengthArray[i] += lengthArray[i - 1];
+ }
+ dictionary->dictionaryBlob.resize(
+ static_cast<uint64_t>(lengthArray[dictSize]));
+ std::unique_ptr<SeekableInputStream> blobStream =
+ stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false);
+ readFully(
+ dictionary->dictionaryBlob.data(),
+ lengthArray[dictSize],
+ blobStream.get());
+ }
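+
+  // Illustrative example (hypothetical dictionary): decoded lengths {3, 3, 4}
+  // become the prefix-sum offsets {0, 3, 6, 10}, so entry 1 of the blob
+  // "catdogfish" starts at offset 3 with length 6 - 3 = 3, i.e. "dog".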
+
+ StringDictionaryColumnReader::~StringDictionaryColumnReader() {
+ // PASS
+ }
+
+ uint64_t StringDictionaryColumnReader::skip(uint64_t numValues) {
+ numValues = ColumnReader::skip(numValues);
+ rle->skip(numValues);
+ return numValues;
+ }
+
+ void StringDictionaryColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ // update the notNull from the parent class
+ notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
+ StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch);
+ char *blob = dictionary->dictionaryBlob.data();
+ int64_t *dictionaryOffsets = dictionary->dictionaryOffset.data();
+ char **outputStarts = byteBatch.data.data();
+ int64_t *outputLengths = byteBatch.length.data();
+ rle->next(outputLengths, numValues, notNull);
+ uint64_t dictionaryCount = dictionary->dictionaryOffset.size() - 1;
+ if (notNull) {
+ for(uint64_t i=0; i < numValues; ++i) {
+ if (notNull[i]) {
+ int64_t entry = outputLengths[i];
+ if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount ) {
+ throw ParseError("Entry index out of range in StringDictionaryColumn");
+ }
+ outputStarts[i] = blob + dictionaryOffsets[entry];
+ outputLengths[i] = dictionaryOffsets[entry+1] -
+ dictionaryOffsets[entry];
+ }
+ }
+ } else {
+ for(uint64_t i=0; i < numValues; ++i) {
+ int64_t entry = outputLengths[i];
+ if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount) {
+ throw ParseError("Entry index out of range in StringDictionaryColumn");
+ }
+ outputStarts[i] = blob + dictionaryOffsets[entry];
+ outputLengths[i] = dictionaryOffsets[entry+1] -
+ dictionaryOffsets[entry];
+ }
+ }
+ }
+
+ void StringDictionaryColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char* notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
+ rowBatch.isEncoded = true;
+
+ EncodedStringVectorBatch& batch = dynamic_cast<EncodedStringVectorBatch&>(rowBatch);
+ batch.dictionary = this->dictionary;
+
+ // Length buffer is reused to save dictionary entry ids
+ rle->next(batch.index.data(), numValues, notNull);
+ }
+
+ void StringDictionaryColumnReader::seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
+ ColumnReader::seekToRowGroup(positions);
+ rle->seek(positions.at(columnId));
+ }
+
+
+ class StringDirectColumnReader: public ColumnReader {
+ private:
+ std::unique_ptr<RleDecoder> lengthRle;
+ std::unique_ptr<SeekableInputStream> blobStream;
+ const char *lastBuffer;
+ size_t lastBufferLength;
+
+ /**
+ * Compute the total length of the values.
+ * @param lengths the array of lengths
+ * @param notNull the array of notNull flags
+ * @param numValues the lengths of the arrays
+ * @return the total number of bytes for the non-null values
+ */
+ size_t computeSize(const int64_t *lengths, const char *notNull,
+ uint64_t numValues);
+
+ public:
+    StringDirectColumnReader(const Type& type, StripeStreams& stripe);
+ ~StringDirectColumnReader() override;
+
+ uint64_t skip(uint64_t numValues) override;
+
+ void next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) override;
+
+ void seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ };
+
+ StringDirectColumnReader::StringDirectColumnReader
+ (const Type& type,
+ StripeStreams& stripe
+ ): ColumnReader(type, stripe) {
+ RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId)
+ .kind());
+ std::unique_ptr<SeekableInputStream> stream =
+ stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
+ if (stream == nullptr)
+ throw ParseError("LENGTH stream not found in StringDirectColumn");
+ lengthRle = createRleDecoder(
+ std::move(stream), false, rleVersion, memoryPool);
+ blobStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+ if (blobStream == nullptr)
+ throw ParseError("DATA stream not found in StringDirectColumn");
+ lastBuffer = nullptr;
+ lastBufferLength = 0;
+ }
+
+ StringDirectColumnReader::~StringDirectColumnReader() {
+ // PASS
+ }
+
+ uint64_t StringDirectColumnReader::skip(uint64_t numValues) {
+ const size_t BUFFER_SIZE = 1024;
+ numValues = ColumnReader::skip(numValues);
+ int64_t buffer[BUFFER_SIZE];
+ uint64_t done = 0;
+ size_t totalBytes = 0;
+    // read the lengths, so we know how many bytes to skip
+ while (done < numValues) {
+ uint64_t step = std::min(BUFFER_SIZE,
+ static_cast<size_t>(numValues - done));
+ lengthRle->next(buffer, step, nullptr);
+ totalBytes += computeSize(buffer, nullptr, step);
+ done += step;
+ }
+ if (totalBytes <= lastBufferLength) {
+ // subtract the needed bytes from the ones left over
+ lastBufferLength -= totalBytes;
+ lastBuffer += totalBytes;
+ } else {
+ // move the stream forward after accounting for the buffered bytes
+ totalBytes -= lastBufferLength;
+ const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max());
+ while (totalBytes != 0) {
+ size_t step = totalBytes > cap ? cap : totalBytes;
+ blobStream->Skip(static_cast<int>(step));
+ totalBytes -= step;
+ }
+ lastBufferLength = 0;
+ lastBuffer = nullptr;
+ }
+ return numValues;
+ }
+
+ size_t StringDirectColumnReader::computeSize(const int64_t* lengths,
+ const char* notNull,
+ uint64_t numValues) {
+ size_t totalLength = 0;
+ if (notNull) {
+ for(size_t i=0; i < numValues; ++i) {
+ if (notNull[i]) {
+ totalLength += static_cast<size_t>(lengths[i]);
+ }
+ }
+ } else {
+ for(size_t i=0; i < numValues; ++i) {
+ totalLength += static_cast<size_t>(lengths[i]);
+ }
+ }
+ return totalLength;
+ }
+
+ void StringDirectColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ // update the notNull from the parent class
+ notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
+ StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch);
+ char **startPtr = byteBatch.data.data();
+ int64_t *lengthPtr = byteBatch.length.data();
+
+ // read the length vector
+ lengthRle->next(lengthPtr, numValues, notNull);
+
+ // figure out the total length of data we need from the blob stream
+ const size_t totalLength = computeSize(lengthPtr, notNull, numValues);
+
+ // Load data from the blob stream into our buffer until we have enough
+ // to get the rest directly out of the stream's buffer.
+ size_t bytesBuffered = 0;
+ byteBatch.blob.resize(totalLength);
+ char *ptr= byteBatch.blob.data();
+ while (bytesBuffered + lastBufferLength < totalLength) {
+ memcpy(ptr + bytesBuffered, lastBuffer, lastBufferLength);
+ bytesBuffered += lastBufferLength;
+ const void* readBuffer;
+ int readLength;
+ if (!blobStream->Next(&readBuffer, &readLength)) {
+ throw ParseError("failed to read in StringDirectColumnReader.next");
+ }
+ lastBuffer = static_cast<const char*>(readBuffer);
+ lastBufferLength = static_cast<size_t>(readLength);
+ }
+
+ if (bytesBuffered < totalLength) {
+ size_t moreBytes = totalLength - bytesBuffered;
+ memcpy(ptr + bytesBuffered, lastBuffer, moreBytes);
+ lastBuffer += moreBytes;
+ lastBufferLength -= moreBytes;
+ }
+
+ size_t filledSlots = 0;
+ ptr = byteBatch.blob.data();
+ if (notNull) {
+ while (filledSlots < numValues) {
+ if (notNull[filledSlots]) {
+ startPtr[filledSlots] = const_cast<char*>(ptr);
+ ptr += lengthPtr[filledSlots];
+ }
+ filledSlots += 1;
+ }
+ } else {
+ while (filledSlots < numValues) {
+ startPtr[filledSlots] = const_cast<char*>(ptr);
+ ptr += lengthPtr[filledSlots];
+ filledSlots += 1;
+ }
+ }
+ }
+
+ void StringDirectColumnReader::seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
+ ColumnReader::seekToRowGroup(positions);
+ blobStream->seek(positions.at(columnId));
+ lengthRle->seek(positions.at(columnId));
+ }
+
+ class StructColumnReader: public ColumnReader {
+ private:
+ std::vector<ColumnReader*> children;
+
+ public:
+    StructColumnReader(const Type& type, StripeStreams& stripe);
+ ~StructColumnReader() override;
+
+ uint64_t skip(uint64_t numValues) override;
+
+ void next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) override;
+
+ void nextEncoded(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) override;
+
+ void seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) override;
+
+ private:
+ template<bool encoded>
+ void nextInternal(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull);
+ };
+
+ StructColumnReader::StructColumnReader(const Type& type,
+ StripeStreams& stripe
+ ): ColumnReader(type, stripe) {
+ // count the number of selected sub-columns
+ const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
+ switch (static_cast<int64_t>(stripe.getEncoding(columnId).kind())) {
+ case proto::ColumnEncoding_Kind_DIRECT:
+ for(unsigned int i=0; i < type.getSubtypeCount(); ++i) {
+ const Type& child = *type.getSubtype(i);
+ if (selectedColumns[static_cast<uint64_t>(child.getColumnId())]) {
+ children.push_back(buildReader(child, stripe).release());
+ }
+ }
+ break;
+ case proto::ColumnEncoding_Kind_DIRECT_V2:
+ case proto::ColumnEncoding_Kind_DICTIONARY:
+ case proto::ColumnEncoding_Kind_DICTIONARY_V2:
+ default:
+ throw ParseError("Unknown encoding for StructColumnReader");
+ }
+ }
+
+ StructColumnReader::~StructColumnReader() {
+ for (size_t i=0; i<children.size(); i++) {
+ delete children[i];
+ }
+ }
+
+ uint64_t StructColumnReader::skip(uint64_t numValues) {
+ numValues = ColumnReader::skip(numValues);
+ for(std::vector<ColumnReader*>::iterator ptr=children.begin(); ptr != children.end(); ++ptr) {
+ (*ptr)->skip(numValues);
+ }
+ return numValues;
+ }
+
+ void StructColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ nextInternal<false>(rowBatch, numValues, notNull);
+ }
+
+ void StructColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ nextInternal<true>(rowBatch, numValues, notNull);
+ }
+
+ template<bool encoded>
+ void StructColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ uint64_t i=0;
+ notNull = rowBatch.hasNulls? rowBatch.notNull.data() : nullptr;
+ for(std::vector<ColumnReader*>::iterator ptr=children.begin();
+ ptr != children.end(); ++ptr, ++i) {
+ if (encoded) {
+ (*ptr)->nextEncoded(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]),
+ numValues, notNull);
+ } else {
+ (*ptr)->next(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]),
+ numValues, notNull);
+ }
+ }
+ }
+
+ void StructColumnReader::seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
+ ColumnReader::seekToRowGroup(positions);
+
+ for(std::vector<ColumnReader*>::iterator ptr = children.begin();
+ ptr != children.end();
+ ++ptr) {
+ (*ptr)->seekToRowGroup(positions);
+ }
+ }
+
+ class ListColumnReader: public ColumnReader {
+ private:
+ std::unique_ptr<ColumnReader> child;
+ std::unique_ptr<RleDecoder> rle;
+
+ public:
+    ListColumnReader(const Type& type, StripeStreams& stripe);
+ ~ListColumnReader() override;
+
+ uint64_t skip(uint64_t numValues) override;
+
+ void next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) override;
+
+ void nextEncoded(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) override;
+
+ void seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) override;
+
+ private:
+ template<bool encoded>
+ void nextInternal(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull);
+ };
+
+ ListColumnReader::ListColumnReader(const Type& type,
+ StripeStreams& stripe
+ ): ColumnReader(type, stripe) {
+ // count the number of selected sub-columns
+ const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
+ RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
+ std::unique_ptr<SeekableInputStream> stream =
+ stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
+ if (stream == nullptr)
+ throw ParseError("LENGTH stream not found in List column");
+ rle = createRleDecoder(std::move(stream), false, vers, memoryPool);
+ const Type& childType = *type.getSubtype(0);
+ if (selectedColumns[static_cast<uint64_t>(childType.getColumnId())]) {
+ child = buildReader(childType, stripe);
+ }
+ }
+
+ ListColumnReader::~ListColumnReader() {
+ // PASS
+ }
+
+ uint64_t ListColumnReader::skip(uint64_t numValues) {
+ numValues = ColumnReader::skip(numValues);
+ ColumnReader *childReader = child.get();
+ if (childReader) {
+ const uint64_t BUFFER_SIZE = 1024;
+ int64_t buffer[BUFFER_SIZE];
+ uint64_t childrenElements = 0;
+ uint64_t lengthsRead = 0;
+ while (lengthsRead < numValues) {
+ uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
+ rle->next(buffer, chunk, nullptr);
+ for(size_t i=0; i < chunk; ++i) {
+ childrenElements += static_cast<size_t>(buffer[i]);
+ }
+ lengthsRead += chunk;
+ }
+ childReader->skip(childrenElements);
+ } else {
+ rle->skip(numValues);
+ }
+ return numValues;
+ }
+
+ void ListColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ nextInternal<false>(rowBatch, numValues, notNull);
+ }
+
+ void ListColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ nextInternal<true>(rowBatch, numValues, notNull);
+ }
+
+ template<bool encoded>
+ void ListColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ ListVectorBatch &listBatch = dynamic_cast<ListVectorBatch&>(rowBatch);
+ int64_t* offsets = listBatch.offsets.data();
+ notNull = listBatch.hasNulls ? listBatch.notNull.data() : nullptr;
+ rle->next(offsets, numValues, notNull);
+ uint64_t totalChildren = 0;
+ if (notNull) {
+ for(size_t i=0; i < numValues; ++i) {
+ if (notNull[i]) {
+ uint64_t tmp = static_cast<uint64_t>(offsets[i]);
+ offsets[i] = static_cast<int64_t>(totalChildren);
+ totalChildren += tmp;
+ } else {
+ offsets[i] = static_cast<int64_t>(totalChildren);
+ }
+ }
+ } else {
+ for(size_t i=0; i < numValues; ++i) {
+ uint64_t tmp = static_cast<uint64_t>(offsets[i]);
+ offsets[i] = static_cast<int64_t>(totalChildren);
+ totalChildren += tmp;
+ }
+ }
+ offsets[numValues] = static_cast<int64_t>(totalChildren);
+ ColumnReader *childReader = child.get();
+ if (childReader) {
+ if (encoded) {
+ childReader->nextEncoded(*(listBatch.elements.get()), totalChildren, nullptr);
+ } else {
+ childReader->next(*(listBatch.elements.get()), totalChildren, nullptr);
+ }
+ }
+ }
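+
+  // Illustrative example (hypothetical lengths): decoded lengths {2, 0, 3}
+  // are rewritten in place as start offsets {0, 2, 2}, and offsets[3] is set
+  // to 5, the total number of elements then read from the child column.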
+
+ void ListColumnReader::seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
+ ColumnReader::seekToRowGroup(positions);
+ rle->seek(positions.at(columnId));
+ if (child.get()) {
+ child->seekToRowGroup(positions);
+ }
+ }
+
+ class MapColumnReader: public ColumnReader {
+ private:
+ std::unique_ptr<ColumnReader> keyReader;
+ std::unique_ptr<ColumnReader> elementReader;
+ std::unique_ptr<RleDecoder> rle;
+
+ public:
+    MapColumnReader(const Type& type, StripeStreams& stripe);
+ ~MapColumnReader() override;
+
+ uint64_t skip(uint64_t numValues) override;
+
+ void next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) override;
+
+ void nextEncoded(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) override;
+
+ void seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) override;
+
+ private:
+ template<bool encoded>
+ void nextInternal(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull);
+ };
+
+ MapColumnReader::MapColumnReader(const Type& type,
+ StripeStreams& stripe
+ ): ColumnReader(type, stripe) {
+ // Determine if the key and/or value columns are selected
+ const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
+ RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
+ std::unique_ptr<SeekableInputStream> stream =
+ stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
+ if (stream == nullptr)
+ throw ParseError("LENGTH stream not found in Map column");
+ rle = createRleDecoder(std::move(stream), false, vers, memoryPool);
+ const Type& keyType = *type.getSubtype(0);
+ if (selectedColumns[static_cast<uint64_t>(keyType.getColumnId())]) {
+ keyReader = buildReader(keyType, stripe);
+ }
+ const Type& elementType = *type.getSubtype(1);
+ if (selectedColumns[static_cast<uint64_t>(elementType.getColumnId())]) {
+ elementReader = buildReader(elementType, stripe);
+ }
+ }
+
+ MapColumnReader::~MapColumnReader() {
+ // PASS
+ }
+
+ uint64_t MapColumnReader::skip(uint64_t numValues) {
+ numValues = ColumnReader::skip(numValues);
+ ColumnReader *rawKeyReader = keyReader.get();
+ ColumnReader *rawElementReader = elementReader.get();
+ if (rawKeyReader || rawElementReader) {
+ const uint64_t BUFFER_SIZE = 1024;
+ int64_t buffer[BUFFER_SIZE];
+ uint64_t childrenElements = 0;
+ uint64_t lengthsRead = 0;
+ while (lengthsRead < numValues) {
+ uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
+ rle->next(buffer, chunk, nullptr);
+ for(size_t i=0; i < chunk; ++i) {
+ childrenElements += static_cast<size_t>(buffer[i]);
+ }
+ lengthsRead += chunk;
+ }
+ if (rawKeyReader) {
+ rawKeyReader->skip(childrenElements);
+ }
+ if (rawElementReader) {
+ rawElementReader->skip(childrenElements);
+ }
+ } else {
+ rle->skip(numValues);
+ }
+ return numValues;
+ }
+
+ void MapColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull)
+ {
+ nextInternal<false>(rowBatch, numValues, notNull);
+ }
+
+ void MapColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull)
+ {
+ nextInternal<true>(rowBatch, numValues, notNull);
+ }
+
+ template<bool encoded>
+ void MapColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ MapVectorBatch &mapBatch = dynamic_cast<MapVectorBatch&>(rowBatch);
+ int64_t* offsets = mapBatch.offsets.data();
+ notNull = mapBatch.hasNulls ? mapBatch.notNull.data() : nullptr;
+ rle->next(offsets, numValues, notNull);
+ uint64_t totalChildren = 0;
+ if (notNull) {
+ for(size_t i=0; i < numValues; ++i) {
+ if (notNull[i]) {
+ uint64_t tmp = static_cast<uint64_t>(offsets[i]);
+ offsets[i] = static_cast<int64_t>(totalChildren);
+ totalChildren += tmp;
+ } else {
+ offsets[i] = static_cast<int64_t>(totalChildren);
+ }
+ }
+ } else {
+ for(size_t i=0; i < numValues; ++i) {
+ uint64_t tmp = static_cast<uint64_t>(offsets[i]);
+ offsets[i] = static_cast<int64_t>(totalChildren);
+ totalChildren += tmp;
+ }
+ }
+ offsets[numValues] = static_cast<int64_t>(totalChildren);
+ ColumnReader *rawKeyReader = keyReader.get();
+ if (rawKeyReader) {
+ if (encoded) {
+ rawKeyReader->nextEncoded(*(mapBatch.keys.get()), totalChildren, nullptr);
+ } else {
+ rawKeyReader->next(*(mapBatch.keys.get()), totalChildren, nullptr);
+ }
+ }
+ ColumnReader *rawElementReader = elementReader.get();
+ if (rawElementReader) {
+ if (encoded) {
+ rawElementReader->nextEncoded(*(mapBatch.elements.get()), totalChildren, nullptr);
+ } else {
+ rawElementReader->next(*(mapBatch.elements.get()), totalChildren, nullptr);
+ }
+ }
+ }
+
+ void MapColumnReader::seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
+ ColumnReader::seekToRowGroup(positions);
+ rle->seek(positions.at(columnId));
+ if (keyReader.get()) {
+ keyReader->seekToRowGroup(positions);
+ }
+ if (elementReader.get()) {
+ elementReader->seekToRowGroup(positions);
+ }
+ }
+
+ class UnionColumnReader: public ColumnReader {
+ private:
+ std::unique_ptr<ByteRleDecoder> rle;
+ std::vector<ColumnReader*> childrenReader;
+ std::vector<int64_t> childrenCounts;
+ uint64_t numChildren;
+
+ public:
+    UnionColumnReader(const Type& type, StripeStreams& stripe);
+ ~UnionColumnReader() override;
+
+ uint64_t skip(uint64_t numValues) override;
+
+ void next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) override;
+
+ void nextEncoded(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) override;
+
+ void seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) override;
+
+ private:
+ template<bool encoded>
+ void nextInternal(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull);
+ };
+
+ UnionColumnReader::UnionColumnReader(const Type& type,
+ StripeStreams& stripe
+ ): ColumnReader(type, stripe) {
+ numChildren = type.getSubtypeCount();
+ childrenReader.resize(numChildren);
+ childrenCounts.resize(numChildren);
+
+ std::unique_ptr<SeekableInputStream> stream =
+ stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+ if (stream == nullptr)
+        throw ParseError("DATA stream not found in Union column");
+ rle = createByteRleDecoder(std::move(stream));
+ // figure out which types are selected
+ const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
+ for(unsigned int i=0; i < numChildren; ++i) {
+ const Type &child = *type.getSubtype(i);
+ if (selectedColumns[static_cast<size_t>(child.getColumnId())]) {
+ childrenReader[i] = buildReader(child, stripe).release();
+ }
+ }
+ }
+
+ UnionColumnReader::~UnionColumnReader() {
+ for(std::vector<ColumnReader*>::iterator itr = childrenReader.begin();
+ itr != childrenReader.end(); ++itr) {
+ delete *itr;
+ }
+ }
+
+ uint64_t UnionColumnReader::skip(uint64_t numValues) {
+ numValues = ColumnReader::skip(numValues);
+ const uint64_t BUFFER_SIZE = 1024;
+ char buffer[BUFFER_SIZE];
+ uint64_t lengthsRead = 0;
+ int64_t *counts = childrenCounts.data();
+ memset(counts, 0, sizeof(int64_t) * numChildren);
+ while (lengthsRead < numValues) {
+ uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
+ rle->next(buffer, chunk, nullptr);
+ for(size_t i=0; i < chunk; ++i) {
+ counts[static_cast<size_t>(buffer[i])] += 1;
+ }
+ lengthsRead += chunk;
+ }
+ for(size_t i=0; i < numChildren; ++i) {
+ if (counts[i] != 0 && childrenReader[i] != nullptr) {
+ childrenReader[i]->skip(static_cast<uint64_t>(counts[i]));
+ }
+ }
+ return numValues;
+ }
+
+ void UnionColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ nextInternal<false>(rowBatch, numValues, notNull);
+ }
+
+ void UnionColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ nextInternal<true>(rowBatch, numValues, notNull);
+ }
+
+ template<bool encoded>
+ void UnionColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ UnionVectorBatch &unionBatch = dynamic_cast<UnionVectorBatch&>(rowBatch);
+ uint64_t* offsets = unionBatch.offsets.data();
+ int64_t* counts = childrenCounts.data();
+ memset(counts, 0, sizeof(int64_t) * numChildren);
+ unsigned char* tags = unionBatch.tags.data();
+ notNull = unionBatch.hasNulls ? unionBatch.notNull.data() : nullptr;
+ rle->next(reinterpret_cast<char *>(tags), numValues, notNull);
+ // set the offsets for each row
+ if (notNull) {
+ for(size_t i=0; i < numValues; ++i) {
+ if (notNull[i]) {
+ offsets[i] =
+ static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++);
+ }
+ }
+ } else {
+ for(size_t i=0; i < numValues; ++i) {
+ offsets[i] =
+ static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++);
+ }
+ }
+ // read the right number of each child column
+ for(size_t i=0; i < numChildren; ++i) {
+ if (childrenReader[i] != nullptr) {
+ if (encoded) {
+ childrenReader[i]->nextEncoded(*(unionBatch.children[i]),
+ static_cast<uint64_t>(counts[i]), nullptr);
+ } else {
+ childrenReader[i]->next(*(unionBatch.children[i]),
+ static_cast<uint64_t>(counts[i]), nullptr);
+ }
+ }
+ }
+ }
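+
+  // Illustrative example (hypothetical tags): for tags {0, 1, 0} the offsets
+  // become {0, 0, 1} (each row's position within its variant's child batch)
+  // and the per-child counts are {2, 1}, so child 0 reads two values and
+  // child 1 reads one.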
+
+ void UnionColumnReader::seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
+ ColumnReader::seekToRowGroup(positions);
+ rle->seek(positions.at(columnId));
+ for(size_t i = 0; i < numChildren; ++i) {
+ if (childrenReader[i] != nullptr) {
+ childrenReader[i]->seekToRowGroup(positions);
+ }
+ }
+ }
+
+ /**
+ * Destructively convert the number from zigzag encoding to the
+ * natural signed representation.
+ */
+ void unZigZagInt128(Int128& value) {
+ bool needsNegate = value.getLowBits() & 1;
+ value >>= 1;
+ if (needsNegate) {
+ value.negate();
+ value -= 1;
+ }
+ }
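+
+  // Worked example (illustrative only): zigzag packs the sign into the low
+  // bit so that small magnitudes stay small when varint-encoded; encoded
+  // 0, 1, 2, 3, 4 decode to 0, -1, 1, -2, 2. For encoded 3: the low bit is
+  // set, 3 >> 1 == 1, negate to -1, subtract 1, giving -2.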
+
+ class Decimal64ColumnReader: public ColumnReader {
+ public:
+ static const uint32_t MAX_PRECISION_64 = 18;
+ static const uint32_t MAX_PRECISION_128 = 38;
+ static const int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1];
+
+ protected:
+ std::unique_ptr<SeekableInputStream> valueStream;
+ int32_t precision;
+ int32_t scale;
+ const char* buffer;
+ const char* bufferEnd;
+
+ std::unique_ptr<RleDecoder> scaleDecoder;
+
+ /**
+ * Read the valueStream for more bytes.
+ */
+ void readBuffer() {
+ while (buffer == bufferEnd) {
+ int length;
+ if (!valueStream->Next(reinterpret_cast<const void**>(&buffer),
+ &length)) {
+ throw ParseError("Read past end of stream in Decimal64ColumnReader "+
+ valueStream->getName());
+ }
+ bufferEnd = buffer + length;
+ }
+ }
+
+ void readInt64(int64_t& value, int32_t currentScale) {
+ value = 0;
+ size_t offset = 0;
+ while (true) {
+ readBuffer();
+ unsigned char ch = static_cast<unsigned char>(*(buffer++));
+ value |= static_cast<uint64_t>(ch & 0x7f) << offset;
+ offset += 7;
+ if (!(ch & 0x80)) {
+ break;
+ }
+ }
+ value = unZigZag(static_cast<uint64_t>(value));
+ if (scale > currentScale &&
+ static_cast<uint64_t>(scale - currentScale) <= MAX_PRECISION_64) {
+ value *= POWERS_OF_TEN[scale - currentScale];
+ } else if (scale < currentScale &&
+ static_cast<uint64_t>(currentScale - scale) <= MAX_PRECISION_64) {
+ value /= POWERS_OF_TEN[currentScale - scale];
+ } else if (scale != currentScale) {
+ throw ParseError("Decimal scale out of range");
+ }
+ }
+
+ public:
+ Decimal64ColumnReader(const Type& type, StripeStreams& stripe);
+ ~Decimal64ColumnReader() override;
+
+ uint64_t skip(uint64_t numValues) override;
+
+ void next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) override;
+
+ void seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ };
+ const uint32_t Decimal64ColumnReader::MAX_PRECISION_64;
+ const uint32_t Decimal64ColumnReader::MAX_PRECISION_128;
+ const int64_t Decimal64ColumnReader::POWERS_OF_TEN[MAX_PRECISION_64 + 1]=
+ {1,
+ 10,
+ 100,
+ 1000,
+ 10000,
+ 100000,
+ 1000000,
+ 10000000,
+ 100000000,
+ 1000000000,
+ 10000000000,
+ 100000000000,
+ 1000000000000,
+ 10000000000000,
+ 100000000000000,
+ 1000000000000000,
+ 10000000000000000,
+ 100000000000000000,
+ 1000000000000000000};
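// A minimal sketch of the rescaling that readInt64 performs with this table,
// using hypothetical values: a number decoded at scale 2 is widened to the
// column's declared scale 4 by multiplying by 10^(4-2).
#include <cstdint>
#include <cassert>

static int64_t rescaleSketch(int64_t value, int32_t currentScale,
                             int32_t targetScale) {
  static const int64_t POW10[] = {1, 10, 100, 1000, 10000};
  if (targetScale >= currentScale) {
    return value * POW10[targetScale - currentScale];  // add trailing digits
  }
  return value / POW10[currentScale - targetScale];    // drop trailing digits
}

static void rescaleExample() {
  assert(rescaleSketch(12345, 2, 4) == 1234500);  // 123.45 -> 123.4500
  assert(rescaleSketch(12345, 4, 2) == 123);      // 1.2345 -> 1.23 (truncated)
}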
+
+ Decimal64ColumnReader::Decimal64ColumnReader(const Type& type,
+ StripeStreams& stripe
+ ): ColumnReader(type, stripe) {
+ scale = static_cast<int32_t>(type.getScale());
+ precision = static_cast<int32_t>(type.getPrecision());
+ valueStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+ if (valueStream == nullptr)
+ throw ParseError("DATA stream not found in Decimal64Column");
+ buffer = nullptr;
+ bufferEnd = nullptr;
+ RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
+ std::unique_ptr<SeekableInputStream> stream =
+ stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true);
+ if (stream == nullptr)
+ throw ParseError("SECONDARY stream not found in Decimal64Column");
+ scaleDecoder = createRleDecoder(std::move(stream), true, vers, memoryPool);
+ }
+
+ Decimal64ColumnReader::~Decimal64ColumnReader() {
+ // PASS
+ }
+
+ uint64_t Decimal64ColumnReader::skip(uint64_t numValues) {
+ numValues = ColumnReader::skip(numValues);
+ uint64_t skipped = 0;
+ while (skipped < numValues) {
+ readBuffer();
+ if (!(0x80 & *(buffer++))) {
+ skipped += 1;
+ }
+ }
+ scaleDecoder->skip(numValues);
+ return numValues;
+ }
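// A minimal sketch of the varint framing that skip() relies on: each encoded
// value ends at the first byte whose high bit is clear, so counting those
// terminator bytes counts values. The byte sequence below is hypothetical.
#include <cstddef>
#include <cassert>

static size_t countVarintsSketch(const unsigned char* data, size_t len) {
  size_t values = 0;
  for (size_t i = 0; i < len; ++i) {
    if (!(data[i] & 0x80)) {  // terminating byte of one varint
      ++values;
    }
  }
  return values;
}

static void varintSkipSketch() {
  // two values: {0xAC, 0x02} encodes one varint, {0x05} encodes another
  const unsigned char bytes[] = {0xAC, 0x02, 0x05};
  assert(countVarintsSketch(bytes, sizeof(bytes)) == 2);
}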
+
+ void Decimal64ColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
+ Decimal64VectorBatch &batch =
+ dynamic_cast<Decimal64VectorBatch&>(rowBatch);
+ int64_t* values = batch.values.data();
+ // read the next group of scales
+ int64_t* scaleBuffer = batch.readScales.data();
+ scaleDecoder->next(scaleBuffer, numValues, notNull);
+ batch.precision = precision;
+ batch.scale = scale;
+ if (notNull) {
+ for(size_t i=0; i < numValues; ++i) {
+ if (notNull[i]) {
+ readInt64(values[i], static_cast<int32_t>(scaleBuffer[i]));
+ }
+ }
+ } else {
+ for(size_t i=0; i < numValues; ++i) {
+ readInt64(values[i], static_cast<int32_t>(scaleBuffer[i]));
+ }
+ }
+ }
+
+ void scaleInt128(Int128& value, uint32_t scale, uint32_t currentScale) {
+ if (scale > currentScale) {
+ while(scale > currentScale) {
+ uint32_t scaleAdjust =
+ std::min(Decimal64ColumnReader::MAX_PRECISION_64,
+ scale - currentScale);
+ value *= Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust];
+ currentScale += scaleAdjust;
+ }
+ } else if (scale < currentScale) {
+ Int128 remainder;
+ while(currentScale > scale) {
+ uint32_t scaleAdjust =
+ std::min(Decimal64ColumnReader::MAX_PRECISION_64,
+ currentScale - scale);
+ value = value.divide(Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust],
+ remainder);
+ currentScale -= scaleAdjust;
+ }
+ }
+ }
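// A minimal sketch of the chunked scaling above, using the GCC/Clang
// __int128 extension with hypothetical values: a scale difference of 25 is
// applied as 10^18 followed by 10^7, since POWERS_OF_TEN stops at 10^18.
#include <cassert>

static void chunkedScaleSketch() {
  unsigned __int128 value = 7;
  const unsigned __int128 p18 = 1000000000000000000ULL;  // 10^18
  const unsigned __int128 p7 = 10000000ULL;              // 10^7
  value *= p18;
  value *= p7;                    // value is now 7 * 10^25
  assert(value / p7 / p18 == 7);  // undoing both steps recovers 7
}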
+
+ void Decimal64ColumnReader::seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
+ ColumnReader::seekToRowGroup(positions);
+ valueStream->seek(positions.at(columnId));
+ scaleDecoder->seek(positions.at(columnId));
+ }
+
+ class Decimal128ColumnReader: public Decimal64ColumnReader {
+ public:
+ Decimal128ColumnReader(const Type& type, StripeStreams& stripe);
+ ~Decimal128ColumnReader() override;
+
+ void next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) override;
+
+ private:
+ void readInt128(Int128& value, int32_t currentScale) {
+ value = 0;
+ Int128 work;
+ uint32_t offset = 0;
+ while (true) {
+ readBuffer();
+ unsigned char ch = static_cast<unsigned char>(*(buffer++));
+ work = ch & 0x7f;
+ work <<= offset;
+ value |= work;
+ offset += 7;
+ if (!(ch & 0x80)) {
+ break;
+ }
+ }
+ unZigZagInt128(value);
+ scaleInt128(value, static_cast<uint32_t>(scale),
+ static_cast<uint32_t>(currentScale));
+ }
+ };
+
+ Decimal128ColumnReader::Decimal128ColumnReader
+ (const Type& type,
+ StripeStreams& stripe
+ ): Decimal64ColumnReader(type, stripe) {
+ // PASS
+ }
+
+ Decimal128ColumnReader::~Decimal128ColumnReader() {
+ // PASS
+ }
+
+ void Decimal128ColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
+ Decimal128VectorBatch &batch =
+ dynamic_cast<Decimal128VectorBatch&>(rowBatch);
+ Int128* values = batch.values.data();
+ // read the next group of scales
+ int64_t* scaleBuffer = batch.readScales.data();
+ scaleDecoder->next(scaleBuffer, numValues, notNull);
+ batch.precision = precision;
+ batch.scale = scale;
+ if (notNull) {
+ for(size_t i=0; i < numValues; ++i) {
+ if (notNull[i]) {
+ readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]));
+ }
+ }
+ } else {
+ for(size_t i=0; i < numValues; ++i) {
+ readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]));
+ }
+ }
+ }
+
+ class DecimalHive11ColumnReader: public Decimal64ColumnReader {
+ private:
+ bool throwOnOverflow;
+ std::ostream* errorStream;
+
+ /**
+ * Read an Int128 from the stream and correct it to the desired scale.
+ */
+ bool readInt128(Int128& value, int32_t currentScale) {
+ // -/+ 99999999999999999999999999999999999999
+ static const Int128 MIN_VALUE(-0x4b3b4ca85a86c47b, 0xf675ddc000000001);
+ static const Int128 MAX_VALUE( 0x4b3b4ca85a86c47a, 0x098a223fffffffff);
+
+ value = 0;
+ Int128 work;
+ uint32_t offset = 0;
+ bool result = true;
+ while (true) {
+ readBuffer();
+ unsigned char ch = static_cast<unsigned char>(*(buffer++));
+ work = ch & 0x7f;
+ // If we have read more than 128 bits, we flag the error, but keep
+ // reading bytes so the stream isn't thrown off.
+ if (offset > 128 || (offset == 126 && work > 3)) {
+ result = false;
+ }
+ work <<= offset;
+ value |= work;
+ offset += 7;
+ if (!(ch & 0x80)) {
+ break;
+ }
+ }
+
+ if (!result) {
+ return result;
+ }
+ unZigZagInt128(value);
+ scaleInt128(value, static_cast<uint32_t>(scale),
+ static_cast<uint32_t>(currentScale));
+ return value >= MIN_VALUE && value <= MAX_VALUE;
+ }
+
+ public:
+ DecimalHive11ColumnReader(const Type& type, StripeStreams& stripe);
+ ~DecimalHive11ColumnReader() override;
+
+ void next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) override;
+ };
+
+ DecimalHive11ColumnReader::DecimalHive11ColumnReader
+ (const Type& type,
+ StripeStreams& stripe
+ ): Decimal64ColumnReader(type, stripe) {
+ scale = stripe.getForcedScaleOnHive11Decimal();
+ throwOnOverflow = stripe.getThrowOnHive11DecimalOverflow();
+ errorStream = stripe.getErrorStream();
+ }
+
+ DecimalHive11ColumnReader::~DecimalHive11ColumnReader() {
+ // PASS
+ }
+
+ void DecimalHive11ColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char *notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
+ Decimal128VectorBatch &batch =
+ dynamic_cast<Decimal128VectorBatch&>(rowBatch);
+ Int128* values = batch.values.data();
+ // read the next group of scales
+ int64_t* scaleBuffer = batch.readScales.data();
+
+ scaleDecoder->next(scaleBuffer, numValues, notNull);
+
+ batch.precision = precision;
+ batch.scale = scale;
+ if (notNull) {
+ for(size_t i=0; i < numValues; ++i) {
+ if (notNull[i]) {
+ if (!readInt128(values[i],
+ static_cast<int32_t>(scaleBuffer[i]))) {
+ if (throwOnOverflow) {
+ throw ParseError("Hive 0.11 decimal was more than 38 digits.");
+ } else {
+ *errorStream << "Warning: "
+ << "Hive 0.11 decimal with more than 38 digits "
+ << "replaced by NULL.\n";
+ notNull[i] = false;
+ }
+ }
+ }
+ }
+ } else {
+ for(size_t i=0; i < numValues; ++i) {
+ if (!readInt128(values[i],
+ static_cast<int32_t>(scaleBuffer[i]))) {
+ if (throwOnOverflow) {
+ throw ParseError("Hive 0.11 decimal was more than 38 digits.");
+ } else {
+ *errorStream << "Warning: "
+ << "Hive 0.11 decimal with more than 38 digits "
+ << "replaced by NULL.\n";
+ batch.hasNulls = true;
+ batch.notNull[i] = false;
+ }
+ }
+ }
+ }
+ }
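// A minimal sketch of the 38-digit bound enforced above, using the GCC/Clang
// __int128 extension: the largest magnitude a Hive 0.11 decimal may hold is
// 10^38 - 1, i.e. thirty-eight 9s.
#include <cassert>

static void hive11BoundSketch() {
  const unsigned __int128 p19 = 10000000000000000000ULL;  // 10^19
  unsigned __int128 maxValue = p19 * p19 - 1;             // 10^38 - 1
  int digits = 0;
  for (unsigned __int128 v = maxValue; v != 0; v /= 10) {
    ++digits;
  }
  assert(digits == 38);
}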
+
+ /**
+ * Create a reader for the given stripe.
+ */
+ std::unique_ptr<ColumnReader> buildReader(const Type& type,
+ StripeStreams& stripe) {
+ switch (static_cast<int64_t>(type.getKind())) {
+ case DATE:
+ case INT:
+ case LONG:
+ case SHORT:
+ return std::unique_ptr<ColumnReader>(
+ new IntegerColumnReader(type, stripe));
+ case BINARY:
+ case CHAR:
+ case STRING:
+ case VARCHAR:
+ switch (static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())){
+ case proto::ColumnEncoding_Kind_DICTIONARY:
+ case proto::ColumnEncoding_Kind_DICTIONARY_V2:
+ return std::unique_ptr<ColumnReader>(
+ new StringDictionaryColumnReader(type, stripe));
+ case proto::ColumnEncoding_Kind_DIRECT:
+ case proto::ColumnEncoding_Kind_DIRECT_V2:
+ return std::unique_ptr<ColumnReader>(
+ new StringDirectColumnReader(type, stripe));
+ default:
+ throw NotImplementedYet("buildReader unhandled string encoding");
+ }
+
+ case BOOLEAN:
+ return std::unique_ptr<ColumnReader>(
+ new BooleanColumnReader(type, stripe));
+
+ case BYTE:
+ return std::unique_ptr<ColumnReader>(
+ new ByteColumnReader(type, stripe));
+
+ case LIST:
+ return std::unique_ptr<ColumnReader>(
+ new ListColumnReader(type, stripe));
+
+ case MAP:
+ return std::unique_ptr<ColumnReader>(
+ new MapColumnReader(type, stripe));
+
+ case UNION:
+ return std::unique_ptr<ColumnReader>(
+ new UnionColumnReader(type, stripe));
+
+ case STRUCT:
+ return std::unique_ptr<ColumnReader>(
+ new StructColumnReader(type, stripe));
+
+ case FLOAT:
+ case DOUBLE:
+ return std::unique_ptr<ColumnReader>(
+ new DoubleColumnReader(type, stripe));
+
+ case TIMESTAMP:
+ return std::unique_ptr<ColumnReader>
+ (new TimestampColumnReader(type, stripe));
+
+ case DECIMAL:
+ // is this a Hive 0.11 or 0.12 file?
+ if (type.getPrecision() == 0) {
+ return std::unique_ptr<ColumnReader>
+ (new DecimalHive11ColumnReader(type, stripe));
+
+ // can we represent the values using int64_t?
+ } else if (type.getPrecision() <=
+ Decimal64ColumnReader::MAX_PRECISION_64) {
+ return std::unique_ptr<ColumnReader>
+ (new Decimal64ColumnReader(type, stripe));
+
+ // otherwise we use the Int128 implementation
+ } else {
+ return std::unique_ptr<ColumnReader>
+ (new Decimal128ColumnReader(type, stripe));
+ }
+
+ default:
+ throw NotImplementedYet("buildReader unhandled type");
+ }
+ }
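// A minimal sketch of the DECIMAL dispatch rule above, with hypothetical
// precisions: precision 0 marks a Hive 0.11 file, up to 18 digits fits in
// int64_t, and anything larger needs the Int128-based reader.
#include <cstdint>
#include <string>
#include <cassert>

static std::string decimalReaderKindSketch(uint64_t precision) {
  if (precision == 0) {
    return "DecimalHive11ColumnReader";
  } else if (precision <= 18) {  // Decimal64ColumnReader::MAX_PRECISION_64
    return "Decimal64ColumnReader";
  }
  return "Decimal128ColumnReader";
}

static void decimalDispatchSketch() {
  assert(decimalReaderKindSketch(0) == "DecimalHive11ColumnReader");
  assert(decimalReaderKindSketch(10) == "Decimal64ColumnReader");
  assert(decimalReaderKindSketch(38) == "Decimal128ColumnReader");
}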
+
+}
diff --git a/contrib/libs/apache/orc/c++/src/ColumnReader.hh b/contrib/libs/apache/orc/c++/src/ColumnReader.hh
index 0c64e5b80f..5023cdfab5 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnReader.hh
+++ b/contrib/libs/apache/orc/c++/src/ColumnReader.hh
@@ -1,156 +1,156 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ORC_COLUMN_READER_HH
-#define ORC_COLUMN_READER_HH
-
-#include <unordered_map>
-
-#include "orc/Vector.hh"
-
-#include "ByteRLE.hh"
-#include "Compression.hh"
-#include "Timezone.hh"
-#include "wrap/orc-proto-wrapper.hh"
-
-namespace orc {
-
- class StripeStreams {
- public:
- virtual ~StripeStreams();
-
- /**
- * Get the array of booleans for which columns are selected.
- * @return the address of an array which contains true at the index of
- * each columnId is selected.
- */
- virtual const std::vector<bool> getSelectedColumns() const = 0;
-
- /**
- * Get the encoding for the given column for this stripe.
- */
- virtual proto::ColumnEncoding getEncoding(uint64_t columnId) const = 0;
-
- /**
- * Get the stream for the given column/kind in this stripe.
- * @param columnId the id of the column
- * @param kind the kind of the stream
- * @param shouldStream whether the reader should page the stream in
- * @return the new stream
- */
- virtual std::unique_ptr<SeekableInputStream>
- getStream(uint64_t columnId,
- proto::Stream_Kind kind,
- bool shouldStream) const = 0;
-
- /**
- * Get the memory pool for this reader.
- */
- virtual MemoryPool& getMemoryPool() const = 0;
-
- /**
- * Get the writer's timezone, so that we can convert their dates correctly.
- */
- virtual const Timezone& getWriterTimezone() const = 0;
-
- /**
- * Get the error stream.
- * @return a pointer to the stream that should get error messages
- */
- virtual std::ostream* getErrorStream() const = 0;
-
- /**
- * Should the reader throw when the scale overflows while reading Hive 0.11
- * decimals.
- * @return true if it should throw
- */
- virtual bool getThrowOnHive11DecimalOverflow() const = 0;
-
- /**
- * What is the scale forced on the Hive 0.11 decimals?
- * @return the number of scale digits
- */
- virtual int32_t getForcedScaleOnHive11Decimal() const = 0;
- };
-
- /**
- * The interface for reading ORC data types.
- */
- class ColumnReader {
- protected:
- std::unique_ptr<ByteRleDecoder> notNullDecoder;
- uint64_t columnId;
- MemoryPool& memoryPool;
-
- public:
- ColumnReader(const Type& type, StripeStreams& stripe);
-
- virtual ~ColumnReader();
-
- /**
- * Skip the specified number of rows.
- * @param numValues the number of values to skip
- * @return the number of non-null values skipped
- */
- virtual uint64_t skip(uint64_t numValues);
-
- /**
- * Read the next group of values into this rowBatch.
- * @param rowBatch the memory to read into.
- * @param numValues the number of values to read
- * @param notNull if null, all values are not null. Otherwise, it is
- * a mask (with at least numValues bytes) for which values to
- * set.
- */
- virtual void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull);
-
- /**
- * Read the next group of values without decoding
- * @param rowBatch the memory to read into.
- * @param numValues the number of values to read
- * @param notNull if null, all values are not null. Otherwise, it is
- * a mask (with at least numValues bytes) for which values to
- * set.
- */
- virtual void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull)
- {
- rowBatch.isEncoded = false;
- next(rowBatch, numValues, notNull);
- }
-
- /**
- * Seek to beginning of a row group in the current stripe
- * @param positions a list of PositionProviders storing the positions
- */
- virtual void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions);
-
- };
-
- /**
- * Create a reader for the given stripe.
- */
- std::unique_ptr<ColumnReader> buildReader(const Type& type,
- StripeStreams& stripe);
-}
-
-#endif
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_COLUMN_READER_HH
+#define ORC_COLUMN_READER_HH
+
+#include <unordered_map>
+
+#include "orc/Vector.hh"
+
+#include "ByteRLE.hh"
+#include "Compression.hh"
+#include "Timezone.hh"
+#include "wrap/orc-proto-wrapper.hh"
+
+namespace orc {
+
+ class StripeStreams {
+ public:
+ virtual ~StripeStreams();
+
+ /**
+ * Get the array of booleans for which columns are selected.
+ * @return the address of an array which contains true at the index of
+ * each columnId is selected.
+ */
+ virtual const std::vector<bool> getSelectedColumns() const = 0;
+
+ /**
+ * Get the encoding for the given column for this stripe.
+ */
+ virtual proto::ColumnEncoding getEncoding(uint64_t columnId) const = 0;
+
+ /**
+ * Get the stream for the given column/kind in this stripe.
+ * @param columnId the id of the column
+ * @param kind the kind of the stream
+ * @param shouldStream whether the reader should page the stream in
+ * @return the new stream
+ */
+ virtual std::unique_ptr<SeekableInputStream>
+ getStream(uint64_t columnId,
+ proto::Stream_Kind kind,
+ bool shouldStream) const = 0;
+
+ /**
+ * Get the memory pool for this reader.
+ */
+ virtual MemoryPool& getMemoryPool() const = 0;
+
+ /**
+ * Get the writer's timezone, so that we can convert their dates correctly.
+ */
+ virtual const Timezone& getWriterTimezone() const = 0;
+
+ /**
+ * Get the error stream.
+ * @return a pointer to the stream that should get error messages
+ */
+ virtual std::ostream* getErrorStream() const = 0;
+
+ /**
+ * Should the reader throw when the scale overflows while reading Hive 0.11
+ * decimals.
+ * @return true if it should throw
+ */
+ virtual bool getThrowOnHive11DecimalOverflow() const = 0;
+
+ /**
+ * What is the scale forced on the Hive 0.11 decimals?
+ * @return the number of scale digits
+ */
+ virtual int32_t getForcedScaleOnHive11Decimal() const = 0;
+ };
+
+ /**
+ * The interface for reading ORC data types.
+ */
+ class ColumnReader {
+ protected:
+ std::unique_ptr<ByteRleDecoder> notNullDecoder;
+ uint64_t columnId;
+ MemoryPool& memoryPool;
+
+ public:
+ ColumnReader(const Type& type, StripeStreams& stripe);
+
+ virtual ~ColumnReader();
+
+ /**
+ * Skip the specified number of rows.
+ * @param numValues the number of values to skip
+ * @return the number of non-null values skipped
+ */
+ virtual uint64_t skip(uint64_t numValues);
+
+ /**
+ * Read the next group of values into this rowBatch.
+ * @param rowBatch the memory to read into.
+ * @param numValues the number of values to read
+ * @param notNull if null, all values are not null. Otherwise, it is
+ * a mask (with at least numValues bytes) for which values to
+ * set.
+ */
+ virtual void next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char* notNull);
+
+ /**
+ * Read the next group of values without decoding
+ * @param rowBatch the memory to read into.
+ * @param numValues the number of values to read
+ * @param notNull if null, all values are not null. Otherwise, it is
+ * a mask (with at least numValues bytes) for which values to
+ * set.
+ */
+ virtual void nextEncoded(ColumnVectorBatch& rowBatch,
+ uint64_t numValues,
+ char* notNull)
+ {
+ rowBatch.isEncoded = false;
+ next(rowBatch, numValues, notNull);
+ }
+
+ /**
+ * Seek to beginning of a row group in the current stripe
+ * @param positions a list of PositionProviders storing the positions
+ */
+ virtual void seekToRowGroup(
+ std::unordered_map<uint64_t, PositionProvider>& positions);
+
+ };
+
+ /**
+ * Create a reader for the given stripe.
+ */
+ std::unique_ptr<ColumnReader> buildReader(const Type& type,
+ StripeStreams& stripe);
+}
+
+#endif
diff --git a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc
index 1408a15457..8d4d00cc61 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc
+++ b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc
@@ -1,3013 +1,3013 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "orc/Int128.hh"
-#include "orc/Writer.hh"
-
-#include "ByteRLE.hh"
-#include "ColumnWriter.hh"
-#include "RLE.hh"
-#include "Statistics.hh"
-#include "Timezone.hh"
-
-namespace orc {
- StreamsFactory::~StreamsFactory() {
- //PASS
- }
-
- class StreamsFactoryImpl : public StreamsFactory {
- public:
- StreamsFactoryImpl(
- const WriterOptions& writerOptions,
- OutputStream* outputStream) :
- options(writerOptions),
- outStream(outputStream) {
- }
-
- virtual std::unique_ptr<BufferedOutputStream>
- createStream(proto::Stream_Kind kind) const override;
- private:
- const WriterOptions& options;
- OutputStream* outStream;
- };
-
- std::unique_ptr<BufferedOutputStream> StreamsFactoryImpl::createStream(
- proto::Stream_Kind) const {
- // In the future, we can decide compression strategy and modifier
- // based on stream kind. But for now we just use the settings from
- // WriterOptions.
- return createCompressor(
- options.getCompression(),
- outStream,
- options.getCompressionStrategy(),
- // BufferedOutputStream initial capacity
- 1 * 1024 * 1024,
- options.getCompressionBlockSize(),
- *options.getMemoryPool());
- }
-
- std::unique_ptr<StreamsFactory> createStreamsFactory(
- const WriterOptions& options,
- OutputStream* outStream) {
- return std::unique_ptr<StreamsFactory>(
- new StreamsFactoryImpl(options, outStream));
- }
-
- RowIndexPositionRecorder::~RowIndexPositionRecorder() {
- // PASS
- }
-
- proto::ColumnEncoding_Kind RleVersionMapper(RleVersion rleVersion)
- {
- switch (rleVersion)
- {
- case RleVersion_1:
- return proto::ColumnEncoding_Kind_DIRECT;
- case RleVersion_2:
- return proto::ColumnEncoding_Kind_DIRECT_V2;
- default:
- throw InvalidArgument("Invalid param");
- }
- }
-
- ColumnWriter::ColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- columnId(type.getColumnId()),
- colIndexStatistics(),
- colStripeStatistics(),
- colFileStatistics(),
- enableIndex(options.getEnableIndex()),
- rowIndex(),
- rowIndexEntry(),
- rowIndexPosition(),
- enableBloomFilter(false),
- memPool(*options.getMemoryPool()),
- indexStream(),
- bloomFilterStream() {
-
- std::unique_ptr<BufferedOutputStream> presentStream =
- factory.createStream(proto::Stream_Kind_PRESENT);
- notNullEncoder = createBooleanRleEncoder(std::move(presentStream));
-
- colIndexStatistics = createColumnStatistics(type);
- colStripeStatistics = createColumnStatistics(type);
- colFileStatistics = createColumnStatistics(type);
-
- if (enableIndex) {
- rowIndex = std::unique_ptr<proto::RowIndex>(new proto::RowIndex());
- rowIndexEntry =
- std::unique_ptr<proto::RowIndexEntry>(new proto::RowIndexEntry());
- rowIndexPosition = std::unique_ptr<RowIndexPositionRecorder>(
- new RowIndexPositionRecorder(*rowIndexEntry));
- indexStream =
- factory.createStream(proto::Stream_Kind_ROW_INDEX);
-
- // BloomFilters for non-UTF8 strings and non-UTC timestamps are not supported
- if (options.isColumnUseBloomFilter(columnId)
- && options.getBloomFilterVersion() == BloomFilterVersion::UTF8) {
- enableBloomFilter = true;
- bloomFilter.reset(new BloomFilterImpl(
- options.getRowIndexStride(), options.getBloomFilterFPP()));
- bloomFilterIndex.reset(new proto::BloomFilterIndex());
- bloomFilterStream = factory.createStream(proto::Stream_Kind_BLOOM_FILTER_UTF8);
- }
- }
- }
-
- ColumnWriter::~ColumnWriter() {
- // PASS
- }
-
- void ColumnWriter::add(ColumnVectorBatch& batch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- notNullEncoder->add(batch.notNull.data() + offset, numValues, incomingMask);
- }
-
- void ColumnWriter::flush(std::vector<proto::Stream>& streams) {
- proto::Stream stream;
- stream.set_kind(proto::Stream_Kind_PRESENT);
- stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(notNullEncoder->flush());
- streams.push_back(stream);
- }
-
- uint64_t ColumnWriter::getEstimatedSize() const {
- return notNullEncoder->getBufferSize();
- }
-
- void ColumnWriter::getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
- getProtoBufStatistics(stats, colStripeStatistics.get());
- }
-
- void ColumnWriter::mergeStripeStatsIntoFileStats() {
- colFileStatistics->merge(*colStripeStatistics);
- colStripeStatistics->reset();
- }
-
- void ColumnWriter::mergeRowGroupStatsIntoStripeStats() {
- colStripeStatistics->merge(*colIndexStatistics);
- colIndexStatistics->reset();
- }
-
- void ColumnWriter::getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
- getProtoBufStatistics(stats, colFileStatistics.get());
- }
-
- void ColumnWriter::createRowIndexEntry() {
- proto::ColumnStatistics *indexStats = rowIndexEntry->mutable_statistics();
- colIndexStatistics->toProtoBuf(*indexStats);
-
- *rowIndex->add_entry() = *rowIndexEntry;
-
- rowIndexEntry->clear_positions();
- rowIndexEntry->clear_statistics();
-
- colStripeStatistics->merge(*colIndexStatistics);
- colIndexStatistics->reset();
-
- addBloomFilterEntry();
-
- recordPosition();
- }
-
- void ColumnWriter::addBloomFilterEntry() {
- if (enableBloomFilter) {
- BloomFilterUTF8Utils::serialize(*bloomFilter, *bloomFilterIndex->add_bloomfilter());
- bloomFilter->reset();
- }
- }
-
- void ColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const {
- // write row index to output stream
- rowIndex->SerializeToZeroCopyStream(indexStream.get());
-
- // construct row index stream
- proto::Stream stream;
- stream.set_kind(proto::Stream_Kind_ROW_INDEX);
- stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(indexStream->flush());
- streams.push_back(stream);
-
- // write BLOOM_FILTER_UTF8 stream
- if (enableBloomFilter) {
- if (!bloomFilterIndex->SerializeToZeroCopyStream(bloomFilterStream.get())) {
- throw std::logic_error("Failed to write bloom filter stream.");
- }
- stream.set_kind(proto::Stream_Kind_BLOOM_FILTER_UTF8);
- stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(bloomFilterStream->flush());
- streams.push_back(stream);
- }
- }
-
- void ColumnWriter::recordPosition() const {
- notNullEncoder->recordPosition(rowIndexPosition.get());
- }
-
- void ColumnWriter::reset() {
- if (enableIndex) {
- // clear row index
- rowIndex->clear_entry();
- rowIndexEntry->clear_positions();
- rowIndexEntry->clear_statistics();
-
- // write current positions
- recordPosition();
- }
-
- if (enableBloomFilter) {
- bloomFilter->reset();
- bloomFilterIndex->clear_bloomfilter();
- }
- }
-
- void ColumnWriter::writeDictionary() {
- // PASS
- }
-
- class StructColumnWriter : public ColumnWriter {
- public:
- StructColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
- ~StructColumnWriter() override;
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
-
- virtual void flush(std::vector<proto::Stream>& streams) override;
-
- virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
-
- virtual void getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
-
- virtual void getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
-
- virtual void mergeStripeStatsIntoFileStats() override;
-
- virtual void mergeRowGroupStatsIntoStripeStats() override;
-
- virtual void createRowIndexEntry() override;
-
- virtual void writeIndex(
- std::vector<proto::Stream> &streams) const override;
-
- virtual void writeDictionary() override;
-
- virtual void reset() override;
-
- private:
- std::vector<ColumnWriter *> children;
- };
-
- StructColumnWriter::StructColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options) {
- for(unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
- const Type& child = *type.getSubtype(i);
- children.push_back(buildWriter(child, factory, options).release());
- }
-
- if (enableIndex) {
- recordPosition();
- }
- }
-
- StructColumnWriter::~StructColumnWriter() {
- for (uint32_t i = 0; i < children.size(); ++i) {
- delete children[i];
- }
- }
-
- void StructColumnWriter::add(
- ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- const StructVectorBatch* structBatch =
- dynamic_cast<const StructVectorBatch *>(&rowBatch);
- if (structBatch == nullptr) {
- throw InvalidArgument("Failed to cast to StructVectorBatch");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- const char* notNull = structBatch->hasNulls ?
- structBatch->notNull.data() + offset : nullptr;
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->add(*structBatch->fields[i], offset, numValues, notNull);
- }
-
- // update stats
- if (!notNull) {
- colIndexStatistics->increase(numValues);
- } else {
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (notNull[i]) {
- ++count;
- }
- }
- colIndexStatistics->increase(count);
- if (count < numValues) {
- colIndexStatistics->setHasNull(true);
- }
- }
- }
-
- void StructColumnWriter::flush(std::vector<proto::Stream>& streams) {
- ColumnWriter::flush(streams);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->flush(streams);
- }
- }
-
- void StructColumnWriter::writeIndex(
- std::vector<proto::Stream> &streams) const {
- ColumnWriter::writeIndex(streams);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->writeIndex(streams);
- }
- }
-
- uint64_t StructColumnWriter::getEstimatedSize() const {
- uint64_t size = ColumnWriter::getEstimatedSize();
- for (uint32_t i = 0; i < children.size(); ++i) {
- size += children[i]->getEstimatedSize();
- }
- return size;
- }
-
- void StructColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
- proto::ColumnEncoding encoding;
- encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
- encoding.set_dictionarysize(0);
- encodings.push_back(encoding);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->getColumnEncoding(encodings);
- }
- }
-
- void StructColumnWriter::getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
- ColumnWriter::getStripeStatistics(stats);
-
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->getStripeStatistics(stats);
- }
- }
-
- void StructColumnWriter::mergeStripeStatsIntoFileStats() {
- ColumnWriter::mergeStripeStatsIntoFileStats();
-
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->mergeStripeStatsIntoFileStats();
- }
- }
-
- void StructColumnWriter::getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
- ColumnWriter::getFileStatistics(stats);
-
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->getFileStatistics(stats);
- }
- }
-
- void StructColumnWriter::mergeRowGroupStatsIntoStripeStats() {
- ColumnWriter::mergeRowGroupStatsIntoStripeStats();
-
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->mergeRowGroupStatsIntoStripeStats();
- }
- }
-
- void StructColumnWriter::createRowIndexEntry() {
- ColumnWriter::createRowIndexEntry();
-
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->createRowIndexEntry();
- }
- }
-
- void StructColumnWriter::reset() {
- ColumnWriter::reset();
-
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->reset();
- }
- }
-
- void StructColumnWriter::writeDictionary() {
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->writeDictionary();
- }
- }
-
- class IntegerColumnWriter : public ColumnWriter {
- public:
- IntegerColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
-
- virtual void flush(std::vector<proto::Stream>& streams) override;
-
- virtual uint64_t getEstimatedSize() const override;
-
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
-
- virtual void recordPosition() const override;
-
- protected:
- std::unique_ptr<RleEncoder> rleEncoder;
-
- private:
- RleVersion rleVersion;
- };
-
- IntegerColumnWriter::IntegerColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()) {
- std::unique_ptr<BufferedOutputStream> dataStream =
- factory.createStream(proto::Stream_Kind_DATA);
- rleEncoder = createRleEncoder(
- std::move(dataStream),
- true,
- rleVersion,
- memPool,
- options.getAlignedBitpacking());
-
- if (enableIndex) {
- recordPosition();
- }
- }
-
- void IntegerColumnWriter::add(
- ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- const LongVectorBatch* longBatch =
- dynamic_cast<const LongVectorBatch*>(&rowBatch);
- if (longBatch == nullptr) {
- throw InvalidArgument("Failed to cast to LongVectorBatch");
- }
- IntegerColumnStatisticsImpl* intStats =
- dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get());
- if (intStats == nullptr) {
- throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- const int64_t* data = longBatch->data.data() + offset;
- const char* notNull = longBatch->hasNulls ?
- longBatch->notNull.data() + offset : nullptr;
-
- rleEncoder->add(data, numValues, notNull);
-
- // update stats
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (notNull == nullptr || notNull[i]) {
- ++count;
- if (enableBloomFilter) {
- bloomFilter->addLong(data[i]);
- }
- intStats->update(data[i], 1);
- }
- }
- intStats->increase(count);
- if (count < numValues) {
- intStats->setHasNull(true);
- }
- }
-
- void IntegerColumnWriter::flush(std::vector<proto::Stream>& streams) {
- ColumnWriter::flush(streams);
-
- proto::Stream stream;
- stream.set_kind(proto::Stream_Kind_DATA);
- stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(rleEncoder->flush());
- streams.push_back(stream);
- }
-
- uint64_t IntegerColumnWriter::getEstimatedSize() const {
- uint64_t size = ColumnWriter::getEstimatedSize();
- size += rleEncoder->getBufferSize();
- return size;
- }
-
- void IntegerColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
- proto::ColumnEncoding encoding;
- encoding.set_kind(RleVersionMapper(rleVersion));
- encoding.set_dictionarysize(0);
- if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
- }
- encodings.push_back(encoding);
- }
-
- void IntegerColumnWriter::recordPosition() const {
- ColumnWriter::recordPosition();
- rleEncoder->recordPosition(rowIndexPosition.get());
- }
-
- class ByteColumnWriter : public ColumnWriter {
- public:
- ByteColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
-
- virtual void flush(std::vector<proto::Stream>& streams) override;
-
- virtual uint64_t getEstimatedSize() const override;
-
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
-
- virtual void recordPosition() const override;
-
- private:
- std::unique_ptr<ByteRleEncoder> byteRleEncoder;
- };
-
- ByteColumnWriter::ByteColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options) {
- std::unique_ptr<BufferedOutputStream> dataStream =
- factory.createStream(proto::Stream_Kind_DATA);
- byteRleEncoder = createByteRleEncoder(std::move(dataStream));
-
- if (enableIndex) {
- recordPosition();
- }
- }
-
- void ByteColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch);
- if (byteBatch == nullptr) {
- throw InvalidArgument("Failed to cast to LongVectorBatch");
- }
- IntegerColumnStatisticsImpl* intStats =
- dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get());
- if (intStats == nullptr) {
- throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- int64_t* data = byteBatch->data.data() + offset;
- const char* notNull = byteBatch->hasNulls ?
- byteBatch->notNull.data() + offset : nullptr;
-
- char* byteData = reinterpret_cast<char*>(data);
- for (uint64_t i = 0; i < numValues; ++i) {
- byteData[i] = static_cast<char>(data[i]);
- }
- byteRleEncoder->add(byteData, numValues, notNull);
-
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (notNull == nullptr || notNull[i]) {
- ++count;
- if (enableBloomFilter) {
- bloomFilter->addLong(data[i]);
- }
- intStats->update(static_cast<int64_t>(byteData[i]), 1);
- }
- }
- intStats->increase(count);
- if (count < numValues) {
- intStats->setHasNull(true);
- }
- }
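// A minimal sketch of the in-place narrowing used above, with hypothetical
// values: the int64_t batch is reused as a byte buffer by writing each
// value's low byte at index i, which never overtakes the values still to be
// read.
#include <cstdint>
#include <cassert>

static void narrowInPlaceSketch() {
  int64_t data[3] = {65, 66, 200};
  char* byteData = reinterpret_cast<char*>(data);
  for (uint64_t i = 0; i < 3; ++i) {
    byteData[i] = static_cast<char>(data[i]);
  }
  assert(byteData[0] == 'A' && byteData[1] == 'B');
  assert(static_cast<unsigned char>(byteData[2]) == 200);
}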
-
- void ByteColumnWriter::flush(std::vector<proto::Stream>& streams) {
- ColumnWriter::flush(streams);
-
- proto::Stream stream;
- stream.set_kind(proto::Stream_Kind_DATA);
- stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(byteRleEncoder->flush());
- streams.push_back(stream);
- }
-
- uint64_t ByteColumnWriter::getEstimatedSize() const {
- uint64_t size = ColumnWriter::getEstimatedSize();
- size += byteRleEncoder->getBufferSize();
- return size;
- }
-
- void ByteColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
- proto::ColumnEncoding encoding;
- encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
- encoding.set_dictionarysize(0);
- if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
- }
- encodings.push_back(encoding);
- }
-
- void ByteColumnWriter::recordPosition() const {
- ColumnWriter::recordPosition();
- byteRleEncoder->recordPosition(rowIndexPosition.get());
- }
-
- class BooleanColumnWriter : public ColumnWriter {
- public:
- BooleanColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
-
- virtual void flush(std::vector<proto::Stream>& streams) override;
-
- virtual uint64_t getEstimatedSize() const override;
-
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
-
- virtual void recordPosition() const override;
-
- private:
- std::unique_ptr<ByteRleEncoder> rleEncoder;
- };
-
- BooleanColumnWriter::BooleanColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options) {
- std::unique_ptr<BufferedOutputStream> dataStream =
- factory.createStream(proto::Stream_Kind_DATA);
- rleEncoder = createBooleanRleEncoder(std::move(dataStream));
-
- if (enableIndex) {
- recordPosition();
- }
- }
-
- void BooleanColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch);
- if (byteBatch == nullptr) {
- throw InvalidArgument("Failed to cast to LongVectorBatch");
- }
- BooleanColumnStatisticsImpl* boolStats =
- dynamic_cast<BooleanColumnStatisticsImpl*>(colIndexStatistics.get());
- if (boolStats == nullptr) {
- throw InvalidArgument("Failed to cast to BooleanColumnStatisticsImpl");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- int64_t* data = byteBatch->data.data() + offset;
- const char* notNull = byteBatch->hasNulls ?
- byteBatch->notNull.data() + offset : nullptr;
-
- char* byteData = reinterpret_cast<char*>(data);
- for (uint64_t i = 0; i < numValues; ++i) {
- byteData[i] = static_cast<char>(data[i]);
- }
- rleEncoder->add(byteData, numValues, notNull);
-
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (notNull == nullptr || notNull[i]) {
- ++count;
- if (enableBloomFilter) {
- bloomFilter->addLong(data[i]);
- }
- boolStats->update(byteData[i] != 0, 1);
- }
- }
- boolStats->increase(count);
- if (count < numValues) {
- boolStats->setHasNull(true);
- }
- }
-
- void BooleanColumnWriter::flush(std::vector<proto::Stream>& streams) {
- ColumnWriter::flush(streams);
-
- proto::Stream stream;
- stream.set_kind(proto::Stream_Kind_DATA);
- stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(rleEncoder->flush());
- streams.push_back(stream);
- }
-
- uint64_t BooleanColumnWriter::getEstimatedSize() const {
- uint64_t size = ColumnWriter::getEstimatedSize();
- size += rleEncoder->getBufferSize();
- return size;
- }
-
- void BooleanColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
- proto::ColumnEncoding encoding;
- encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
- encoding.set_dictionarysize(0);
- if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
- }
- encodings.push_back(encoding);
- }
-
- void BooleanColumnWriter::recordPosition() const {
- ColumnWriter::recordPosition();
- rleEncoder->recordPosition(rowIndexPosition.get());
- }
-
- class DoubleColumnWriter : public ColumnWriter {
- public:
- DoubleColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options,
- bool isFloat);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
-
- virtual void flush(std::vector<proto::Stream>& streams) override;
-
- virtual uint64_t getEstimatedSize() const override;
-
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
-
- virtual void recordPosition() const override;
-
- private:
- bool isFloat;
- std::unique_ptr<AppendOnlyBufferedStream> dataStream;
- DataBuffer<char> buffer;
- };
-
- DoubleColumnWriter::DoubleColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options,
- bool isFloatType) :
- ColumnWriter(type, factory, options),
- isFloat(isFloatType),
- buffer(*options.getMemoryPool()) {
- dataStream.reset(new AppendOnlyBufferedStream(
- factory.createStream(proto::Stream_Kind_DATA)));
- buffer.resize(isFloat ? 4 : 8);
-
- if (enableIndex) {
- recordPosition();
- }
- }
-
- // Floating point types are stored using IEEE 754 floating point bit layout.
- // Float columns use 4 bytes per value and double columns use 8 bytes.
- template <typename FLOAT_TYPE, typename INTEGER_TYPE>
- inline void encodeFloatNum(FLOAT_TYPE input, char* output) {
- INTEGER_TYPE* intBits = reinterpret_cast<INTEGER_TYPE*>(&input);
- for (size_t i = 0; i < sizeof(INTEGER_TYPE); ++i) {
- output[i] = static_cast<char>(((*intBits) >> (8 * i)) & 0xff);
- }
- }
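// A minimal sketch of the little-endian IEEE-754 serialization above, using
// memcpy instead of the pointer cast; the value 1.0f is hypothetical.
#include <cstring>
#include <cstdint>
#include <cassert>

static void encodeFloatSketch() {
  const float input = 1.0f;                  // bit pattern 0x3f800000
  uint32_t bits;
  std::memcpy(&bits, &input, sizeof(bits));  // grab the raw IEEE-754 bits
  char output[4];
  for (size_t i = 0; i < sizeof(bits); ++i) {
    output[i] = static_cast<char>((bits >> (8 * i)) & 0xff);  // low byte first
  }
  assert(static_cast<unsigned char>(output[0]) == 0x00);
  assert(static_cast<unsigned char>(output[2]) == 0x80);
  assert(static_cast<unsigned char>(output[3]) == 0x3f);
}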
-
- void DoubleColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- const DoubleVectorBatch* dblBatch =
- dynamic_cast<const DoubleVectorBatch*>(&rowBatch);
- if (dblBatch == nullptr) {
- throw InvalidArgument("Failed to cast to DoubleVectorBatch");
- }
- DoubleColumnStatisticsImpl* doubleStats =
- dynamic_cast<DoubleColumnStatisticsImpl*>(colIndexStatistics.get());
- if (doubleStats == nullptr) {
- throw InvalidArgument("Failed to cast to DoubleColumnStatisticsImpl");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- const double* doubleData = dblBatch->data.data() + offset;
- const char* notNull = dblBatch->hasNulls ?
- dblBatch->notNull.data() + offset : nullptr;
-
- size_t bytes = isFloat ? 4 : 8;
- char* data = buffer.data();
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (!notNull || notNull[i]) {
- if (isFloat) {
- encodeFloatNum<float, int32_t>(static_cast<float>(doubleData[i]), data);
- } else {
- encodeFloatNum<double, int64_t>(doubleData[i], data);
- }
- dataStream->write(data, bytes);
- ++count;
- if (enableBloomFilter) {
- bloomFilter->addDouble(doubleData[i]);
- }
- doubleStats->update(doubleData[i]);
- }
- }
- doubleStats->increase(count);
- if (count < numValues) {
- doubleStats->setHasNull(true);
- }
- }
-
- void DoubleColumnWriter::flush(std::vector<proto::Stream>& streams) {
- ColumnWriter::flush(streams);
-
- proto::Stream stream;
- stream.set_kind(proto::Stream_Kind_DATA);
- stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(dataStream->flush());
- streams.push_back(stream);
- }
-
- uint64_t DoubleColumnWriter::getEstimatedSize() const {
- uint64_t size = ColumnWriter::getEstimatedSize();
- size += dataStream->getSize();
- return size;
- }
-
- void DoubleColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
- proto::ColumnEncoding encoding;
- encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
- encoding.set_dictionarysize(0);
- if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
- }
- encodings.push_back(encoding);
- }
-
- void DoubleColumnWriter::recordPosition() const {
- ColumnWriter::recordPosition();
- dataStream->recordPosition(rowIndexPosition.get());
- }
-
- /**
- * Implementation of increasing sorted string dictionary
- */
- class SortedStringDictionary {
- public:
- struct DictEntry {
- DictEntry(const char * str, size_t len):data(str),length(len) {}
- const char * data;
- size_t length;
- };
-
- SortedStringDictionary():totalLength(0) {}
-
- // insert a new string into dictionary, return its insertion order
- size_t insert(const char * data, size_t len);
-
- // write dictionary data & length to output buffer
- void flush(AppendOnlyBufferedStream * dataStream,
- RleEncoder * lengthEncoder) const;
-
- // reorder input index buffer from insertion order to dictionary order
- void reorder(std::vector<int64_t>& idxBuffer) const;
-
- // get dict entries in insertion order
- void getEntriesInInsertionOrder(std::vector<const DictEntry *>&) const;
-
- // return count of entries
- size_t size() const;
-
- // return total length of strings in the dictionary
- uint64_t length() const;
-
- void clear();
-
- private:
- struct LessThan {
- bool operator()(const DictEntry& left, const DictEntry& right) const {
- int ret = memcmp(left.data, right.data, std::min(left.length, right.length));
- if (ret != 0) {
- return ret < 0;
- }
- return left.length < right.length;
- }
- };
-
- std::map<DictEntry, size_t, LessThan> dict;
- std::vector<std::vector<char>> data;
- uint64_t totalLength;
-
- // use friend class here to avoid being bothered by const function calls
- friend class StringColumnWriter;
- friend class CharColumnWriter;
- friend class VarCharColumnWriter;
- // store indexes of insertion order in the dictionary for not-null rows
- std::vector<int64_t> idxInDictBuffer;
- };
-
- // insert a new string into dictionary, return its insertion order
- size_t SortedStringDictionary::insert(const char * str, size_t len) {
- auto ret = dict.insert({DictEntry(str, len), dict.size()});
- if (ret.second) {
- // make a copy to internal storage
- data.push_back(std::vector<char>(len));
- memcpy(data.back().data(), str, len);
- // update dictionary entry to link pointer to internal storage
- DictEntry * entry = const_cast<DictEntry *>(&(ret.first->first));
- entry->data = data.back().data();
- totalLength += len;
- }
- return ret.first->second;
- }
-
- // write dictionary data & length to output buffer
- void SortedStringDictionary::flush(AppendOnlyBufferedStream * dataStream,
- RleEncoder * lengthEncoder) const {
- for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
- dataStream->write(it->first.data, it->first.length);
- lengthEncoder->write(static_cast<int64_t>(it->first.length));
- }
- }
-
- /**
- * Reorder input index buffer from insertion order to dictionary order
- *
- * We require this function because string values are buffered by their
- * indexes in insertion order. Only after the entire dictionary is complete
- * can we obtain their sorted positions in the dictionary, since the ORC
- * specification demands that the dictionary be ordered. Therefore this
- * function transforms the indexes from insertion order to dictionary value
- * order for the final output.
- */
- void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const {
- // iterate the dictionary to get mapping from insertion order to value order
- std::vector<size_t> mapping(dict.size());
- size_t dictIdx = 0;
- for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
- mapping[it->second] = dictIdx++;
- }
-
- // do the transformation
- for (size_t i = 0; i != idxBuffer.size(); ++i) {
- idxBuffer[i] = static_cast<int64_t>(
- mapping[static_cast<size_t>(idxBuffer[i])]);
- }
- }
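// A minimal sketch of the reorder step above, with hypothetical strings:
// rows buffer insertion-order ids ("banana"=0, "apple"=1) and the remap
// rewrites them as positions in the sorted dictionary ("apple"=0, "banana"=1).
#include <cstdint>
#include <map>
#include <string>
#include <vector>
#include <cassert>

static void dictionaryReorderSketch() {
  std::map<std::string, size_t> dict = {{"banana", 0}, {"apple", 1}};
  std::vector<int64_t> idxBuffer = {0, 1, 0};  // rows: banana, apple, banana

  // mapping[insertion order] = position in sorted (dictionary) order
  std::vector<size_t> mapping(dict.size());
  size_t dictIdx = 0;
  for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
    mapping[it->second] = dictIdx++;
  }
  for (size_t i = 0; i != idxBuffer.size(); ++i) {
    idxBuffer[i] = static_cast<int64_t>(mapping[static_cast<size_t>(idxBuffer[i])]);
  }
  assert((idxBuffer == std::vector<int64_t>{1, 0, 1}));
}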
-
- // get dict entries in insertion order
- void SortedStringDictionary::getEntriesInInsertionOrder(
- std::vector<const DictEntry *>& entries) const {
- entries.resize(dict.size());
- for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
- entries[it->second] = &(it->first);
- }
- }
-
- // return count of entries
- size_t SortedStringDictionary::size() const {
- return dict.size();
- }
-
- // return total length of strings in the dictionary
- uint64_t SortedStringDictionary::length() const {
- return totalLength;
- }
-
- void SortedStringDictionary::clear() {
- totalLength = 0;
- data.clear();
- dict.clear();
- }
-
- class StringColumnWriter : public ColumnWriter {
- public:
- StringColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
-
- virtual void flush(std::vector<proto::Stream>& streams) override;
-
- virtual uint64_t getEstimatedSize() const override;
-
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
-
- virtual void recordPosition() const override;
-
- virtual void createRowIndexEntry() override;
-
- virtual void writeDictionary() override;
-
- virtual void reset() override;
-
- private:
- /**
- * dictionary related functions
- */
- bool checkDictionaryKeyRatio();
- void createDirectStreams();
- void createDictStreams();
- void deleteDictStreams();
- void fallbackToDirectEncoding();
-
- protected:
- RleVersion rleVersion;
- bool useCompression;
- const StreamsFactory& streamsFactory;
- bool alignedBitPacking;
-
- // direct encoding streams
- std::unique_ptr<RleEncoder> directLengthEncoder;
- std::unique_ptr<AppendOnlyBufferedStream> directDataStream;
-
- // dictionary encoding streams
- std::unique_ptr<RleEncoder> dictDataEncoder;
- std::unique_ptr<RleEncoder> dictLengthEncoder;
- std::unique_ptr<AppendOnlyBufferedStream> dictStream;
-
- /**
- * dictionary related variables
- */
- SortedStringDictionary dictionary;
- // whether or not dictionary checking is done
- bool doneDictionaryCheck;
- // whether or not it should be used
- bool useDictionary;
- // the ratio of distinct keys to buffered non-null rows must not exceed this threshold
- double dictSizeThreshold;
-
- // record start row of each row group; null rows are skipped
- mutable std::vector<size_t> startOfRowGroups;
- };
-
- StringColumnWriter::StringColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()),
- useCompression(options.getCompression() != CompressionKind_NONE),
- streamsFactory(factory),
- alignedBitPacking(options.getAlignedBitpacking()),
- doneDictionaryCheck(false),
- useDictionary(options.getEnableDictionary()),
- dictSizeThreshold(options.getDictionaryKeySizeThreshold()){
- if (type.getKind() == TypeKind::BINARY) {
- useDictionary = false;
- doneDictionaryCheck = true;
- }
-
- if (useDictionary) {
- createDictStreams();
- } else {
- doneDictionaryCheck = true;
- createDirectStreams();
- }
-
- if (enableIndex) {
- recordPosition();
- }
- }
-
- void StringColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- const StringVectorBatch* stringBatch =
- dynamic_cast<const StringVectorBatch*>(&rowBatch);
- if (stringBatch == nullptr) {
- throw InvalidArgument("Failed to cast to StringVectorBatch");
- }
-
- StringColumnStatisticsImpl* strStats =
- dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get());
- if (strStats == nullptr) {
- throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- char *const * data = stringBatch->data.data() + offset;
- const int64_t* length = stringBatch->length.data() + offset;
- const char* notNull = stringBatch->hasNulls ?
- stringBatch->notNull.data() + offset : nullptr;
-
- if (!useDictionary){
- directLengthEncoder->add(length, numValues, notNull);
- }
-
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (!notNull || notNull[i]) {
- const size_t len = static_cast<size_t>(length[i]);
- if (useDictionary) {
- size_t index = dictionary.insert(data[i], len);
- dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index));
- } else {
- directDataStream->write(data[i], len);
- }
- if (enableBloomFilter) {
- bloomFilter->addBytes(data[i], static_cast<int64_t>(len));
- }
- strStats->update(data[i], len);
- ++count;
- }
- }
- strStats->increase(count);
- if (count < numValues) {
- strStats->setHasNull(true);
- }
- }
-
- void StringColumnWriter::flush(std::vector<proto::Stream>& streams) {
- ColumnWriter::flush(streams);
-
- if (useDictionary) {
- proto::Stream data;
- data.set_kind(proto::Stream_Kind_DATA);
- data.set_column(static_cast<uint32_t>(columnId));
- data.set_length(dictDataEncoder->flush());
- streams.push_back(data);
-
- proto::Stream dict;
- dict.set_kind(proto::Stream_Kind_DICTIONARY_DATA);
- dict.set_column(static_cast<uint32_t>(columnId));
- dict.set_length(dictStream->flush());
- streams.push_back(dict);
-
- proto::Stream length;
- length.set_kind(proto::Stream_Kind_LENGTH);
- length.set_column(static_cast<uint32_t>(columnId));
- length.set_length(dictLengthEncoder->flush());
- streams.push_back(length);
- } else {
- proto::Stream length;
- length.set_kind(proto::Stream_Kind_LENGTH);
- length.set_column(static_cast<uint32_t>(columnId));
- length.set_length(directLengthEncoder->flush());
- streams.push_back(length);
-
- proto::Stream data;
- data.set_kind(proto::Stream_Kind_DATA);
- data.set_column(static_cast<uint32_t>(columnId));
- data.set_length(directDataStream->flush());
- streams.push_back(data);
- }
- }
-
- uint64_t StringColumnWriter::getEstimatedSize() const {
- uint64_t size = ColumnWriter::getEstimatedSize();
- if (!useDictionary) {
- size += directLengthEncoder->getBufferSize();
- size += directDataStream->getSize();
- } else {
- size += dictionary.length();
- size += dictionary.size() * sizeof(int32_t);
- size += dictionary.idxInDictBuffer.size() * sizeof(int32_t);
- if (useCompression) {
- size /= 3; // estimated ratio is 3:1
- }
- }
- return size;
- }
-
- void StringColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
- proto::ColumnEncoding encoding;
- if (!useDictionary) {
- encoding.set_kind(rleVersion == RleVersion_1 ?
- proto::ColumnEncoding_Kind_DIRECT :
- proto::ColumnEncoding_Kind_DIRECT_V2);
- } else {
- encoding.set_kind(rleVersion == RleVersion_1 ?
- proto::ColumnEncoding_Kind_DICTIONARY :
- proto::ColumnEncoding_Kind_DICTIONARY_V2);
- }
- encoding.set_dictionarysize(static_cast<uint32_t>(dictionary.size()));
- if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
- }
- encodings.push_back(encoding);
- }
-
- void StringColumnWriter::recordPosition() const {
- ColumnWriter::recordPosition();
- if (!useDictionary) {
- directDataStream->recordPosition(rowIndexPosition.get());
- directLengthEncoder->recordPosition(rowIndexPosition.get());
- } else {
- if (enableIndex) {
- startOfRowGroups.push_back(dictionary.idxInDictBuffer.size());
- }
- }
- }
-
- bool StringColumnWriter::checkDictionaryKeyRatio() {
- if (!doneDictionaryCheck) {
- useDictionary = dictionary.size() <= static_cast<size_t>(
- static_cast<double>(dictionary.idxInDictBuffer.size()) * dictSizeThreshold);
- doneDictionaryCheck = true;
- }
-
- return useDictionary;
- }
-
- void StringColumnWriter::createRowIndexEntry() {
- if (useDictionary && !doneDictionaryCheck) {
- if (!checkDictionaryKeyRatio()) {
- fallbackToDirectEncoding();
- }
- }
- ColumnWriter::createRowIndexEntry();
- }
-
- void StringColumnWriter::reset() {
- ColumnWriter::reset();
-
- dictionary.clear();
- dictionary.idxInDictBuffer.resize(0);
- startOfRowGroups.clear();
- startOfRowGroups.push_back(0);
- }
-
- void StringColumnWriter::createDirectStreams() {
- std::unique_ptr<BufferedOutputStream> directLengthStream =
- streamsFactory.createStream(proto::Stream_Kind_LENGTH);
- directLengthEncoder = createRleEncoder(std::move(directLengthStream),
- false,
- rleVersion,
- memPool,
- alignedBitPacking);
- directDataStream.reset(new AppendOnlyBufferedStream(
- streamsFactory.createStream(proto::Stream_Kind_DATA)));
- }
-
- void StringColumnWriter::createDictStreams() {
- std::unique_ptr<BufferedOutputStream> dictDataStream =
- streamsFactory.createStream(proto::Stream_Kind_DATA);
- dictDataEncoder = createRleEncoder(std::move(dictDataStream),
- false,
- rleVersion,
- memPool,
- alignedBitPacking);
- std::unique_ptr<BufferedOutputStream> dictLengthStream =
- streamsFactory.createStream(proto::Stream_Kind_LENGTH);
- dictLengthEncoder = createRleEncoder(std::move(dictLengthStream),
- false,
- rleVersion,
- memPool,
- alignedBitPacking);
- dictStream.reset(new AppendOnlyBufferedStream(
- streamsFactory.createStream(proto::Stream_Kind_DICTIONARY_DATA)));
- }
-
- void StringColumnWriter::deleteDictStreams() {
- dictDataEncoder.reset(nullptr);
- dictLengthEncoder.reset(nullptr);
- dictStream.reset(nullptr);
-
- dictionary.clear();
- dictionary.idxInDictBuffer.clear();
- startOfRowGroups.clear();
- }
-
- void StringColumnWriter::writeDictionary() {
- if (useDictionary && !doneDictionaryCheck) {
- // when index is disabled, dictionary check happens while writing 1st stripe
- if (!checkDictionaryKeyRatio()) {
- fallbackToDirectEncoding();
- return;
- }
- }
-
- if (useDictionary) {
- // flush dictionary data & length streams
- dictionary.flush(dictStream.get(), dictLengthEncoder.get());
-
- // convert index from insertion order to dictionary order
- dictionary.reorder(dictionary.idxInDictBuffer);
-
- // write data sequences
- int64_t * data = dictionary.idxInDictBuffer.data();
- if (enableIndex) {
- size_t prevOffset = 0;
- for (size_t i = 0; i < startOfRowGroups.size(); ++i) {
- // write sequences in batch for a row group stride
- size_t offset = startOfRowGroups[i];
- dictDataEncoder->add(data + prevOffset, offset - prevOffset, nullptr);
-
- // update index positions
- int rowGroupId = static_cast<int>(i);
- proto::RowIndexEntry* indexEntry =
- (rowGroupId < rowIndex->entry_size()) ?
- rowIndex->mutable_entry(rowGroupId) : rowIndexEntry.get();
-
- // add positions for direct streams
- RowIndexPositionRecorder recorder(*indexEntry);
- dictDataEncoder->recordPosition(&recorder);
-
- prevOffset = offset;
- }
-
- dictDataEncoder->add(data + prevOffset,
- dictionary.idxInDictBuffer.size() - prevOffset,
- nullptr);
- } else {
- dictDataEncoder->add(data, dictionary.idxInDictBuffer.size(), nullptr);
- }
- }
- }
-
- void StringColumnWriter::fallbackToDirectEncoding() {
- createDirectStreams();
-
- if (enableIndex) {
- // fallback happens at the 1st row group;
- // simply complete positions for direct streams
- proto::RowIndexEntry * indexEntry = rowIndexEntry.get();
- RowIndexPositionRecorder recorder(*indexEntry);
- directDataStream->recordPosition(&recorder);
- directLengthEncoder->recordPosition(&recorder);
- }
-
- // get dictionary entries in insertion order
- std::vector<const SortedStringDictionary::DictEntry *> entries;
- dictionary.getEntriesInInsertionOrder(entries);
-
-    // re-emit each buffered row through the direct streams using its dictionary entry
- const SortedStringDictionary::DictEntry * dictEntry = nullptr;
- for (uint64_t i = 0; i != dictionary.idxInDictBuffer.size(); ++i) {
- // write one row data in direct encoding
- dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer[i])];
- directDataStream->write(dictEntry->data, dictEntry->length);
- directLengthEncoder->write(static_cast<int64_t>(dictEntry->length));
- }
-
- deleteDictStreams();
- }
-
- struct Utf8Utils {
- /**
-     * Counts how many utf-8 chars are in the input data
- */
- static uint64_t charLength(const char * data, uint64_t length) {
- uint64_t chars = 0;
- for (uint64_t i = 0; i < length; i++) {
- if (isUtfStartByte(data[i])) {
- chars++;
- }
- }
- return chars;
- }
-
- /**
- * Return the number of bytes required to read at most maxCharLength
- * characters in full from a utf-8 encoded byte array provided
- * by data. This does not validate utf-8 data, but
- * operates correctly on already valid utf-8 data.
- *
- * @param maxCharLength number of characters required
- * @param data the bytes of UTF-8
- * @param length the length of data to truncate
- */
- static uint64_t truncateBytesTo(uint64_t maxCharLength,
- const char * data,
- uint64_t length) {
- uint64_t chars = 0;
- if (length <= maxCharLength) {
- return length;
- }
- for (uint64_t i = 0; i < length; i++) {
- if (isUtfStartByte(data[i])) {
- chars++;
- }
- if (chars > maxCharLength) {
- return i;
- }
- }
- // everything fits
- return length;
- }
-
- /**
- * Checks if b is the first byte of a UTF-8 character.
- */
- inline static bool isUtfStartByte(char b) {
- return (b & 0xC0) != 0x80;
- }
-
- /**
- * Find the start of the last character that ends in the current string.
- * @param text the bytes of the utf-8
- * @param from the first byte location
- * @param until the last byte location
- * @return the index of the last character
- */
- static uint64_t findLastCharacter(const char * text, uint64_t from, uint64_t until) {
- uint64_t posn = until;
-      /* we don't expect characters longer than 5 bytes */
- while (posn >= from) {
- if (isUtfStartByte(text[posn])) {
- return posn;
- }
- posn -= 1;
- }
- /* beginning of a valid char not found */
- throw std::logic_error(
- "Could not truncate string, beginning of a valid char not found");
- }
- };
-
- class CharColumnWriter : public StringColumnWriter {
- public:
- CharColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- StringColumnWriter(type, factory, options),
- maxLength(type.getMaximumLength()),
- padBuffer(*options.getMemoryPool()) {
-      // a utf-8 character is currently at most 4 bytes long, but could be up to 6
- padBuffer.resize(maxLength * 6);
- }
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
-
- private:
- uint64_t maxLength;
- DataBuffer<char> padBuffer;
- };
-
- void CharColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
- if (charsBatch == nullptr) {
- throw InvalidArgument("Failed to cast to StringVectorBatch");
- }
-
- StringColumnStatisticsImpl* strStats =
- dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get());
- if (strStats == nullptr) {
- throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- char** data = charsBatch->data.data() + offset;
- int64_t* length = charsBatch->length.data() + offset;
- const char* notNull = charsBatch->hasNulls ?
- charsBatch->notNull.data() + offset : nullptr;
-
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (!notNull || notNull[i]) {
- const char * charData = nullptr;
- uint64_t originLength = static_cast<uint64_t>(length[i]);
- uint64_t charLength = Utf8Utils::charLength(data[i], originLength);
- if (charLength >= maxLength) {
- charData = data[i];
- length[i] = static_cast<int64_t>(
- Utf8Utils::truncateBytesTo(maxLength, data[i], originLength));
- } else {
- charData = padBuffer.data();
- // the padding is exactly 1 byte per char
- length[i] = length[i] + static_cast<int64_t>(maxLength - charLength);
- memcpy(padBuffer.data(), data[i], originLength);
- memset(padBuffer.data() + originLength,
- ' ',
- static_cast<size_t>(length[i]) - originLength);
- }
-
- if (useDictionary) {
- size_t index = dictionary.insert(charData, static_cast<size_t>(length[i]));
- dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index));
- } else {
- directDataStream->write(charData, static_cast<size_t>(length[i]));
- }
-
- if (enableBloomFilter) {
- bloomFilter->addBytes(data[i], length[i]);
- }
- strStats->update(charData, static_cast<size_t>(length[i]));
- ++count;
- }
- }
-
- if (!useDictionary) {
- directLengthEncoder->add(length, numValues, notNull);
- }
-
- strStats->increase(count);
- if (count < numValues) {
- strStats->setHasNull(true);
- }
- }
-
- class VarCharColumnWriter : public StringColumnWriter {
- public:
- VarCharColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- StringColumnWriter(type, factory, options),
- maxLength(type.getMaximumLength()) {
- // PASS
- }
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
-
- private:
- uint64_t maxLength;
- };
-
- void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
- if (charsBatch == nullptr) {
- throw InvalidArgument("Failed to cast to StringVectorBatch");
- }
-
- StringColumnStatisticsImpl* strStats =
- dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get());
- if (strStats == nullptr) {
- throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- char* const* data = charsBatch->data.data() + offset;
- int64_t* length = charsBatch->length.data() + offset;
- const char* notNull = charsBatch->hasNulls ?
- charsBatch->notNull.data() + offset : nullptr;
-
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (!notNull || notNull[i]) {
- uint64_t itemLength = Utf8Utils::truncateBytesTo(
- maxLength, data[i], static_cast<uint64_t>(length[i]));
- length[i] = static_cast<int64_t>(itemLength);
-
- if (useDictionary) {
- size_t index = dictionary.insert(data[i], static_cast<size_t>(length[i]));
- dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index));
- } else {
- directDataStream->write(data[i], static_cast<size_t>(length[i]));
- }
-
- if (enableBloomFilter) {
- bloomFilter->addBytes(data[i], length[i]);
- }
- strStats->update(data[i], static_cast<size_t>(length[i]));
- ++count;
- }
- }
-
- if (!useDictionary) {
- directLengthEncoder->add(length, numValues, notNull);
- }
-
- strStats->increase(count);
- if (count < numValues) {
- strStats->setHasNull(true);
- }
- }
-
- class BinaryColumnWriter : public StringColumnWriter {
- public:
- BinaryColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- StringColumnWriter(type, factory, options) {
- // PASS
- }
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
- };
-
- void BinaryColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- StringVectorBatch* binBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
- if (binBatch == nullptr) {
- throw InvalidArgument("Failed to cast to StringVectorBatch");
- }
-
- BinaryColumnStatisticsImpl* binStats =
- dynamic_cast<BinaryColumnStatisticsImpl*>(colIndexStatistics.get());
- if (binStats == nullptr) {
- throw InvalidArgument("Failed to cast to BinaryColumnStatisticsImpl");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- char** data = binBatch->data.data() + offset;
- int64_t* length = binBatch->length.data() + offset;
- const char* notNull = binBatch->hasNulls ?
- binBatch->notNull.data() + offset : nullptr;
-
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- uint64_t unsignedLength = static_cast<uint64_t>(length[i]);
- if (!notNull || notNull[i]) {
- directDataStream->write(data[i], unsignedLength);
-
- binStats->update(unsignedLength);
- ++count;
- }
- }
- directLengthEncoder->add(length, numValues, notNull);
- binStats->increase(count);
- if (count < numValues) {
- binStats->setHasNull(true);
- }
- }
-
- class TimestampColumnWriter : public ColumnWriter {
- public:
- TimestampColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
-
- virtual void flush(std::vector<proto::Stream>& streams) override;
-
- virtual uint64_t getEstimatedSize() const override;
-
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
-
- virtual void recordPosition() const override;
-
- protected:
- std::unique_ptr<RleEncoder> secRleEncoder, nanoRleEncoder;
-
- private:
- RleVersion rleVersion;
- const Timezone& timezone;
- };
-
- TimestampColumnWriter::TimestampColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()),
- timezone(getTimezoneByName("GMT")){
- std::unique_ptr<BufferedOutputStream> dataStream =
- factory.createStream(proto::Stream_Kind_DATA);
- std::unique_ptr<BufferedOutputStream> secondaryStream =
- factory.createStream(proto::Stream_Kind_SECONDARY);
- secRleEncoder = createRleEncoder(std::move(dataStream),
- true,
- rleVersion,
- memPool,
- options.getAlignedBitpacking());
- nanoRleEncoder = createRleEncoder(std::move(secondaryStream),
- false,
- rleVersion,
- memPool,
- options.getAlignedBitpacking());
-
- if (enableIndex) {
- recordPosition();
- }
- }
-
- // Because the number of nanoseconds often has a large number of trailing zeros,
- // the number has trailing decimal zero digits removed and the last three bits
- // are used to record how many zeros were removed if the trailing zeros are
- // more than 2. Thus 1000 nanoseconds would be serialized as 0x0a and
- // 100000 would be serialized as 0x0c.
- static int64_t formatNano(int64_t nanos) {
- if (nanos == 0) {
- return 0;
- } else if (nanos % 100 != 0) {
- return (nanos) << 3;
- } else {
- nanos /= 100;
- int64_t trailingZeros = 1;
- while (nanos % 10 == 0 && trailingZeros < 7) {
- nanos /= 10;
- trailingZeros += 1;
- }
- return (nanos) << 3 | trailingZeros;
- }
- }
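-
-  // Illustrative sketch, not part of the original file: a hypothetical
-  // parseNano that inverts formatNano by restoring the removed trailing
-  // zeros recorded in the low 3 bits, e.g. parseNano(0x0a) == 1000 and
-  // parseNano(0x0c) == 100000.
-  //
-  //   static int64_t parseNano(int64_t serialized) {
-  //     int64_t zeros = serialized & 0x07;
-  //     int64_t nanos = serialized >> 3;
-  //     if (zeros != 0) {
-  //       for (int64_t i = 0; i <= zeros; ++i) {
-  //         nanos *= 10;
-  //       }
-  //     }
-  //     return nanos;
-  //   }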
-
- void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- TimestampVectorBatch* tsBatch =
- dynamic_cast<TimestampVectorBatch*>(&rowBatch);
- if (tsBatch == nullptr) {
- throw InvalidArgument("Failed to cast to TimestampVectorBatch");
- }
-
- TimestampColumnStatisticsImpl* tsStats =
- dynamic_cast<TimestampColumnStatisticsImpl*>(colIndexStatistics.get());
- if (tsStats == nullptr) {
- throw InvalidArgument("Failed to cast to TimestampColumnStatisticsImpl");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- const char* notNull = tsBatch->hasNulls ?
- tsBatch->notNull.data() + offset : nullptr;
- int64_t *secs = tsBatch->data.data() + offset;
- int64_t *nanos = tsBatch->nanoseconds.data() + offset;
-
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (notNull == nullptr || notNull[i]) {
- // TimestampVectorBatch already stores data in UTC
- int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000;
- ++count;
- if (enableBloomFilter) {
- bloomFilter->addLong(millsUTC);
- }
- tsStats->update(millsUTC);
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Int128.hh"
+#include "orc/Writer.hh"
+
+#include "ByteRLE.hh"
+#include "ColumnWriter.hh"
+#include "RLE.hh"
+#include "Statistics.hh"
+#include "Timezone.hh"
+
+namespace orc {
+ StreamsFactory::~StreamsFactory() {
+ //PASS
+ }
+
+ class StreamsFactoryImpl : public StreamsFactory {
+ public:
+ StreamsFactoryImpl(
+ const WriterOptions& writerOptions,
+ OutputStream* outputStream) :
+ options(writerOptions),
+ outStream(outputStream) {
+ }
+
+ virtual std::unique_ptr<BufferedOutputStream>
+ createStream(proto::Stream_Kind kind) const override;
+ private:
+ const WriterOptions& options;
+ OutputStream* outStream;
+ };
+
+ std::unique_ptr<BufferedOutputStream> StreamsFactoryImpl::createStream(
+ proto::Stream_Kind) const {
+ // In the future, we can decide compression strategy and modifier
+ // based on stream kind. But for now we just use the setting from
+    // WriterOptions.
+ return createCompressor(
+ options.getCompression(),
+ outStream,
+ options.getCompressionStrategy(),
+ // BufferedOutputStream initial capacity
+ 1 * 1024 * 1024,
+ options.getCompressionBlockSize(),
+ *options.getMemoryPool());
+ }
+
+ std::unique_ptr<StreamsFactory> createStreamsFactory(
+ const WriterOptions& options,
+ OutputStream* outStream) {
+ return std::unique_ptr<StreamsFactory>(
+ new StreamsFactoryImpl(options, outStream));
+ }
+
+ RowIndexPositionRecorder::~RowIndexPositionRecorder() {
+ // PASS
+ }
+
+ proto::ColumnEncoding_Kind RleVersionMapper(RleVersion rleVersion)
+ {
+ switch (rleVersion)
+ {
+ case RleVersion_1:
+ return proto::ColumnEncoding_Kind_DIRECT;
+ case RleVersion_2:
+ return proto::ColumnEncoding_Kind_DIRECT_V2;
+ default:
+ throw InvalidArgument("Invalid param");
+ }
+ }
+
+ ColumnWriter::ColumnWriter(
+ const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) :
+ columnId(type.getColumnId()),
+ colIndexStatistics(),
+ colStripeStatistics(),
+ colFileStatistics(),
+ enableIndex(options.getEnableIndex()),
+ rowIndex(),
+ rowIndexEntry(),
+ rowIndexPosition(),
+ enableBloomFilter(false),
+ memPool(*options.getMemoryPool()),
+ indexStream(),
+ bloomFilterStream() {
+
+ std::unique_ptr<BufferedOutputStream> presentStream =
+ factory.createStream(proto::Stream_Kind_PRESENT);
+ notNullEncoder = createBooleanRleEncoder(std::move(presentStream));
+
+ colIndexStatistics = createColumnStatistics(type);
+ colStripeStatistics = createColumnStatistics(type);
+ colFileStatistics = createColumnStatistics(type);
+
+ if (enableIndex) {
+ rowIndex = std::unique_ptr<proto::RowIndex>(new proto::RowIndex());
+ rowIndexEntry =
+ std::unique_ptr<proto::RowIndexEntry>(new proto::RowIndexEntry());
+ rowIndexPosition = std::unique_ptr<RowIndexPositionRecorder>(
+ new RowIndexPositionRecorder(*rowIndexEntry));
+ indexStream =
+ factory.createStream(proto::Stream_Kind_ROW_INDEX);
+
+ // BloomFilters for non-UTF8 strings and non-UTC timestamps are not supported
+ if (options.isColumnUseBloomFilter(columnId)
+ && options.getBloomFilterVersion() == BloomFilterVersion::UTF8) {
+ enableBloomFilter = true;
+ bloomFilter.reset(new BloomFilterImpl(
+ options.getRowIndexStride(), options.getBloomFilterFPP()));
+ bloomFilterIndex.reset(new proto::BloomFilterIndex());
+ bloomFilterStream = factory.createStream(proto::Stream_Kind_BLOOM_FILTER_UTF8);
+ }
+ }
+ }
+
+ ColumnWriter::~ColumnWriter() {
+ // PASS
+ }
+
+ void ColumnWriter::add(ColumnVectorBatch& batch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ notNullEncoder->add(batch.notNull.data() + offset, numValues, incomingMask);
+ }
+
+ void ColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ proto::Stream stream;
+ stream.set_kind(proto::Stream_Kind_PRESENT);
+ stream.set_column(static_cast<uint32_t>(columnId));
+ stream.set_length(notNullEncoder->flush());
+ streams.push_back(stream);
+ }
+
+ uint64_t ColumnWriter::getEstimatedSize() const {
+ return notNullEncoder->getBufferSize();
+ }
+
+ void ColumnWriter::getStripeStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const {
+ getProtoBufStatistics(stats, colStripeStatistics.get());
+ }
+
+ void ColumnWriter::mergeStripeStatsIntoFileStats() {
+ colFileStatistics->merge(*colStripeStatistics);
+ colStripeStatistics->reset();
+ }
+
+ void ColumnWriter::mergeRowGroupStatsIntoStripeStats() {
+ colStripeStatistics->merge(*colIndexStatistics);
+ colIndexStatistics->reset();
+ }
+
+ void ColumnWriter::getFileStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const {
+ getProtoBufStatistics(stats, colFileStatistics.get());
+ }
+
+ void ColumnWriter::createRowIndexEntry() {
+ proto::ColumnStatistics *indexStats = rowIndexEntry->mutable_statistics();
+ colIndexStatistics->toProtoBuf(*indexStats);
+
+ *rowIndex->add_entry() = *rowIndexEntry;
+
+ rowIndexEntry->clear_positions();
+ rowIndexEntry->clear_statistics();
+
+ colStripeStatistics->merge(*colIndexStatistics);
+ colIndexStatistics->reset();
+
+ addBloomFilterEntry();
+
+ recordPosition();
+ }
+
+ void ColumnWriter::addBloomFilterEntry() {
+ if (enableBloomFilter) {
+ BloomFilterUTF8Utils::serialize(*bloomFilter, *bloomFilterIndex->add_bloomfilter());
+ bloomFilter->reset();
+ }
+ }
+
+ void ColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const {
+ // write row index to output stream
+ rowIndex->SerializeToZeroCopyStream(indexStream.get());
+
+ // construct row index stream
+ proto::Stream stream;
+ stream.set_kind(proto::Stream_Kind_ROW_INDEX);
+ stream.set_column(static_cast<uint32_t>(columnId));
+ stream.set_length(indexStream->flush());
+ streams.push_back(stream);
+
+ // write BLOOM_FILTER_UTF8 stream
+ if (enableBloomFilter) {
+ if (!bloomFilterIndex->SerializeToZeroCopyStream(bloomFilterStream.get())) {
+ throw std::logic_error("Failed to write bloom filter stream.");
+ }
+ stream.set_kind(proto::Stream_Kind_BLOOM_FILTER_UTF8);
+ stream.set_column(static_cast<uint32_t>(columnId));
+ stream.set_length(bloomFilterStream->flush());
+ streams.push_back(stream);
+ }
+ }
+
+ void ColumnWriter::recordPosition() const {
+ notNullEncoder->recordPosition(rowIndexPosition.get());
+ }
+
+ void ColumnWriter::reset() {
+ if (enableIndex) {
+ // clear row index
+ rowIndex->clear_entry();
+ rowIndexEntry->clear_positions();
+ rowIndexEntry->clear_statistics();
+
+ // write current positions
+ recordPosition();
+ }
+
+ if (enableBloomFilter) {
+ bloomFilter->reset();
+ bloomFilterIndex->clear_bloomfilter();
+ }
+ }
+
+ void ColumnWriter::writeDictionary() {
+ // PASS
+ }
+
+ class StructColumnWriter : public ColumnWriter {
+ public:
+ StructColumnWriter(
+ const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options);
+ ~StructColumnWriter() override;
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+
+ virtual void flush(std::vector<proto::Stream>& streams) override;
+
+ virtual uint64_t getEstimatedSize() const override;
+ virtual void getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const override;
+
+ virtual void getStripeStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const override;
+
+ virtual void getFileStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const override;
+
+ virtual void mergeStripeStatsIntoFileStats() override;
+
+ virtual void mergeRowGroupStatsIntoStripeStats() override;
+
+ virtual void createRowIndexEntry() override;
+
+ virtual void writeIndex(
+ std::vector<proto::Stream> &streams) const override;
+
+ virtual void writeDictionary() override;
+
+ virtual void reset() override;
+
+ private:
+ std::vector<ColumnWriter *> children;
+ };
+
+ StructColumnWriter::StructColumnWriter(
+ const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) :
+ ColumnWriter(type, factory, options) {
+ for(unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
+ const Type& child = *type.getSubtype(i);
+ children.push_back(buildWriter(child, factory, options).release());
+ }
+
+ if (enableIndex) {
+ recordPosition();
+ }
+ }
+
+ StructColumnWriter::~StructColumnWriter() {
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ delete children[i];
+ }
+ }
+
+ void StructColumnWriter::add(
+ ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ const StructVectorBatch* structBatch =
+ dynamic_cast<const StructVectorBatch *>(&rowBatch);
+ if (structBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to StructVectorBatch");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+ const char* notNull = structBatch->hasNulls ?
+ structBatch->notNull.data() + offset : nullptr;
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->add(*structBatch->fields[i], offset, numValues, notNull);
+ }
+
+ // update stats
+ if (!notNull) {
+ colIndexStatistics->increase(numValues);
+ } else {
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (notNull[i]) {
+ ++count;
+ }
+ }
+ colIndexStatistics->increase(count);
+ if (count < numValues) {
+ colIndexStatistics->setHasNull(true);
+ }
+ }
+ }
+
+ void StructColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ ColumnWriter::flush(streams);
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->flush(streams);
+ }
+ }
+
+ void StructColumnWriter::writeIndex(
+ std::vector<proto::Stream> &streams) const {
+ ColumnWriter::writeIndex(streams);
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->writeIndex(streams);
+ }
+ }
+
+ uint64_t StructColumnWriter::getEstimatedSize() const {
+ uint64_t size = ColumnWriter::getEstimatedSize();
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ size += children[i]->getEstimatedSize();
+ }
+ return size;
+ }
+
+ void StructColumnWriter::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
+ proto::ColumnEncoding encoding;
+ encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
+ encoding.set_dictionarysize(0);
+ encodings.push_back(encoding);
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->getColumnEncoding(encodings);
+ }
+ }
+
+ void StructColumnWriter::getStripeStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const {
+ ColumnWriter::getStripeStatistics(stats);
+
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->getStripeStatistics(stats);
+ }
+ }
+
+ void StructColumnWriter::mergeStripeStatsIntoFileStats() {
+ ColumnWriter::mergeStripeStatsIntoFileStats();
+
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->mergeStripeStatsIntoFileStats();
+ }
+ }
+
+ void StructColumnWriter::getFileStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const {
+ ColumnWriter::getFileStatistics(stats);
+
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->getFileStatistics(stats);
+ }
+ }
+
+ void StructColumnWriter::mergeRowGroupStatsIntoStripeStats() {
+ ColumnWriter::mergeRowGroupStatsIntoStripeStats();
+
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->mergeRowGroupStatsIntoStripeStats();
+ }
+ }
+
+ void StructColumnWriter::createRowIndexEntry() {
+ ColumnWriter::createRowIndexEntry();
+
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->createRowIndexEntry();
+ }
+ }
+
+ void StructColumnWriter::reset() {
+ ColumnWriter::reset();
+
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->reset();
+ }
+ }
+
+ void StructColumnWriter::writeDictionary() {
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->writeDictionary();
+ }
+ }
+
+ class IntegerColumnWriter : public ColumnWriter {
+ public:
+ IntegerColumnWriter(
+ const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options);
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+
+ virtual void flush(std::vector<proto::Stream>& streams) override;
+
+ virtual uint64_t getEstimatedSize() const override;
+
+ virtual void getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const override;
+
+ virtual void recordPosition() const override;
+
+ protected:
+ std::unique_ptr<RleEncoder> rleEncoder;
+
+ private:
+ RleVersion rleVersion;
+ };
+
+ IntegerColumnWriter::IntegerColumnWriter(
+ const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) :
+ ColumnWriter(type, factory, options),
+ rleVersion(options.getRleVersion()) {
+ std::unique_ptr<BufferedOutputStream> dataStream =
+ factory.createStream(proto::Stream_Kind_DATA);
+ rleEncoder = createRleEncoder(
+ std::move(dataStream),
+ true,
+ rleVersion,
+ memPool,
+ options.getAlignedBitpacking());
+
+ if (enableIndex) {
+ recordPosition();
+ }
+ }
+
+ void IntegerColumnWriter::add(
+ ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ const LongVectorBatch* longBatch =
+ dynamic_cast<const LongVectorBatch*>(&rowBatch);
+ if (longBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to LongVectorBatch");
+ }
+ IntegerColumnStatisticsImpl* intStats =
+ dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (intStats == nullptr) {
+ throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ const int64_t* data = longBatch->data.data() + offset;
+ const char* notNull = longBatch->hasNulls ?
+ longBatch->notNull.data() + offset : nullptr;
+
+ rleEncoder->add(data, numValues, notNull);
+
+ // update stats
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (notNull == nullptr || notNull[i]) {
+ ++count;
+ if (enableBloomFilter) {
+ bloomFilter->addLong(data[i]);
+ }
+ intStats->update(data[i], 1);
+ }
+ }
+ intStats->increase(count);
+ if (count < numValues) {
+ intStats->setHasNull(true);
+ }
+ }
+
+ void IntegerColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ ColumnWriter::flush(streams);
+
+ proto::Stream stream;
+ stream.set_kind(proto::Stream_Kind_DATA);
+ stream.set_column(static_cast<uint32_t>(columnId));
+ stream.set_length(rleEncoder->flush());
+ streams.push_back(stream);
+ }
+
+ uint64_t IntegerColumnWriter::getEstimatedSize() const {
+ uint64_t size = ColumnWriter::getEstimatedSize();
+ size += rleEncoder->getBufferSize();
+ return size;
+ }
+
+ void IntegerColumnWriter::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
+ proto::ColumnEncoding encoding;
+ encoding.set_kind(RleVersionMapper(rleVersion));
+ encoding.set_dictionarysize(0);
+ if (enableBloomFilter) {
+ encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ }
+ encodings.push_back(encoding);
+ }
+
+ void IntegerColumnWriter::recordPosition() const {
+ ColumnWriter::recordPosition();
+ rleEncoder->recordPosition(rowIndexPosition.get());
+ }
+
+ class ByteColumnWriter : public ColumnWriter {
+ public:
+ ByteColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options);
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+
+ virtual void flush(std::vector<proto::Stream>& streams) override;
+
+ virtual uint64_t getEstimatedSize() const override;
+
+ virtual void getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const override;
+
+ virtual void recordPosition() const override;
+
+ private:
+ std::unique_ptr<ByteRleEncoder> byteRleEncoder;
+ };
+
+ ByteColumnWriter::ByteColumnWriter(
+ const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) :
+ ColumnWriter(type, factory, options) {
+ std::unique_ptr<BufferedOutputStream> dataStream =
+ factory.createStream(proto::Stream_Kind_DATA);
+ byteRleEncoder = createByteRleEncoder(std::move(dataStream));
+
+ if (enableIndex) {
+ recordPosition();
+ }
+ }
+
+ void ByteColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch);
+ if (byteBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to LongVectorBatch");
+ }
+ IntegerColumnStatisticsImpl* intStats =
+ dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (intStats == nullptr) {
+ throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ int64_t* data = byteBatch->data.data() + offset;
+ const char* notNull = byteBatch->hasNulls ?
+ byteBatch->notNull.data() + offset : nullptr;
+
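+    // Note: the loop below narrows each int64 value into the leading bytes of
+    // the same buffer. The write to byteData[i] lands at byte i, after data[i]
+    // has already been read from bytes [8 * i, 8 * i + 8), so the in-place
+    // conversion never overwrites a value this loop still needs.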
+ char* byteData = reinterpret_cast<char*>(data);
+ for (uint64_t i = 0; i < numValues; ++i) {
+ byteData[i] = static_cast<char>(data[i]);
+ }
+ byteRleEncoder->add(byteData, numValues, notNull);
+
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (notNull == nullptr || notNull[i]) {
+ ++count;
+ if (enableBloomFilter) {
+ bloomFilter->addLong(data[i]);
+ }
+ intStats->update(static_cast<int64_t>(byteData[i]), 1);
+ }
+ }
+ intStats->increase(count);
+ if (count < numValues) {
+ intStats->setHasNull(true);
+ }
+ }
+
+ void ByteColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ ColumnWriter::flush(streams);
+
+ proto::Stream stream;
+ stream.set_kind(proto::Stream_Kind_DATA);
+ stream.set_column(static_cast<uint32_t>(columnId));
+ stream.set_length(byteRleEncoder->flush());
+ streams.push_back(stream);
+ }
+
+ uint64_t ByteColumnWriter::getEstimatedSize() const {
+ uint64_t size = ColumnWriter::getEstimatedSize();
+ size += byteRleEncoder->getBufferSize();
+ return size;
+ }
+
+ void ByteColumnWriter::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
+ proto::ColumnEncoding encoding;
+ encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
+ encoding.set_dictionarysize(0);
+ if (enableBloomFilter) {
+ encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ }
+ encodings.push_back(encoding);
+ }
+
+ void ByteColumnWriter::recordPosition() const {
+ ColumnWriter::recordPosition();
+ byteRleEncoder->recordPosition(rowIndexPosition.get());
+ }
+
+ class BooleanColumnWriter : public ColumnWriter {
+ public:
+ BooleanColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options);
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+
+ virtual void flush(std::vector<proto::Stream>& streams) override;
+
+ virtual uint64_t getEstimatedSize() const override;
+
+ virtual void getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const override;
+
+ virtual void recordPosition() const override;
+
+ private:
+ std::unique_ptr<ByteRleEncoder> rleEncoder;
+ };
+
+ BooleanColumnWriter::BooleanColumnWriter(
+ const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) :
+ ColumnWriter(type, factory, options) {
+ std::unique_ptr<BufferedOutputStream> dataStream =
+ factory.createStream(proto::Stream_Kind_DATA);
+ rleEncoder = createBooleanRleEncoder(std::move(dataStream));
+
+ if (enableIndex) {
+ recordPosition();
+ }
+ }
+
+ void BooleanColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch);
+ if (byteBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to LongVectorBatch");
+ }
+ BooleanColumnStatisticsImpl* boolStats =
+ dynamic_cast<BooleanColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (boolStats == nullptr) {
+ throw InvalidArgument("Failed to cast to BooleanColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ int64_t* data = byteBatch->data.data() + offset;
+ const char* notNull = byteBatch->hasNulls ?
+ byteBatch->notNull.data() + offset : nullptr;
+
+ char* byteData = reinterpret_cast<char*>(data);
+ for (uint64_t i = 0; i < numValues; ++i) {
+ byteData[i] = static_cast<char>(data[i]);
+ }
+ rleEncoder->add(byteData, numValues, notNull);
+
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (notNull == nullptr || notNull[i]) {
+ ++count;
+ if (enableBloomFilter) {
+ bloomFilter->addLong(data[i]);
+ }
+ boolStats->update(byteData[i] != 0, 1);
+ }
+ }
+ boolStats->increase(count);
+ if (count < numValues) {
+ boolStats->setHasNull(true);
+ }
+ }
+
+ void BooleanColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ ColumnWriter::flush(streams);
+
+ proto::Stream stream;
+ stream.set_kind(proto::Stream_Kind_DATA);
+ stream.set_column(static_cast<uint32_t>(columnId));
+ stream.set_length(rleEncoder->flush());
+ streams.push_back(stream);
+ }
+
+ uint64_t BooleanColumnWriter::getEstimatedSize() const {
+ uint64_t size = ColumnWriter::getEstimatedSize();
+ size += rleEncoder->getBufferSize();
+ return size;
+ }
+
+ void BooleanColumnWriter::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
+ proto::ColumnEncoding encoding;
+ encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
+ encoding.set_dictionarysize(0);
+ if (enableBloomFilter) {
+ encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ }
+ encodings.push_back(encoding);
+ }
+
+ void BooleanColumnWriter::recordPosition() const {
+ ColumnWriter::recordPosition();
+ rleEncoder->recordPosition(rowIndexPosition.get());
+ }
+
+ class DoubleColumnWriter : public ColumnWriter {
+ public:
+ DoubleColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options,
+ bool isFloat);
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+
+ virtual void flush(std::vector<proto::Stream>& streams) override;
+
+ virtual uint64_t getEstimatedSize() const override;
+
+ virtual void getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const override;
+
+ virtual void recordPosition() const override;
+
+ private:
+ bool isFloat;
+ std::unique_ptr<AppendOnlyBufferedStream> dataStream;
+ DataBuffer<char> buffer;
+ };
+
+ DoubleColumnWriter::DoubleColumnWriter(
+ const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options,
+ bool isFloatType) :
+ ColumnWriter(type, factory, options),
+ isFloat(isFloatType),
+ buffer(*options.getMemoryPool()) {
+ dataStream.reset(new AppendOnlyBufferedStream(
+ factory.createStream(proto::Stream_Kind_DATA)));
+ buffer.resize(isFloat ? 4 : 8);
+
+ if (enableIndex) {
+ recordPosition();
+ }
+ }
+
+ // Floating point types are stored using IEEE 754 floating point bit layout.
+ // Float columns use 4 bytes per value and double columns use 8 bytes.
+ template <typename FLOAT_TYPE, typename INTEGER_TYPE>
+ inline void encodeFloatNum(FLOAT_TYPE input, char* output) {
+ INTEGER_TYPE* intBits = reinterpret_cast<INTEGER_TYPE*>(&input);
+ for (size_t i = 0; i < sizeof(INTEGER_TYPE); ++i) {
+ output[i] = static_cast<char>(((*intBits) >> (8 * i)) & 0xff);
+ }
+ }
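+
+  // For example (illustrative values, not from the sources): 1.0f has the
+  // IEEE 754 bit pattern 0x3F800000, so encodeFloatNum<float, int32_t> emits
+  // the little-endian byte sequence 0x00 0x00 0x80 0x3F.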
+
+ void DoubleColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ const DoubleVectorBatch* dblBatch =
+ dynamic_cast<const DoubleVectorBatch*>(&rowBatch);
+ if (dblBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to DoubleVectorBatch");
+ }
+ DoubleColumnStatisticsImpl* doubleStats =
+ dynamic_cast<DoubleColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (doubleStats == nullptr) {
+ throw InvalidArgument("Failed to cast to DoubleColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ const double* doubleData = dblBatch->data.data() + offset;
+ const char* notNull = dblBatch->hasNulls ?
+ dblBatch->notNull.data() + offset : nullptr;
+
+ size_t bytes = isFloat ? 4 : 8;
+ char* data = buffer.data();
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!notNull || notNull[i]) {
+ if (isFloat) {
+ encodeFloatNum<float, int32_t>(static_cast<float>(doubleData[i]), data);
+ } else {
+ encodeFloatNum<double, int64_t>(doubleData[i], data);
+ }
+ dataStream->write(data, bytes);
+ ++count;
+ if (enableBloomFilter) {
+ bloomFilter->addDouble(doubleData[i]);
+ }
+ doubleStats->update(doubleData[i]);
+ }
+ }
+ doubleStats->increase(count);
+ if (count < numValues) {
+ doubleStats->setHasNull(true);
+ }
+ }
+
+ void DoubleColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ ColumnWriter::flush(streams);
+
+ proto::Stream stream;
+ stream.set_kind(proto::Stream_Kind_DATA);
+ stream.set_column(static_cast<uint32_t>(columnId));
+ stream.set_length(dataStream->flush());
+ streams.push_back(stream);
+ }
+
+ uint64_t DoubleColumnWriter::getEstimatedSize() const {
+ uint64_t size = ColumnWriter::getEstimatedSize();
+ size += dataStream->getSize();
+ return size;
+ }
+
+ void DoubleColumnWriter::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
+ proto::ColumnEncoding encoding;
+ encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
+ encoding.set_dictionarysize(0);
+ if (enableBloomFilter) {
+ encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ }
+ encodings.push_back(encoding);
+ }
+
+ void DoubleColumnWriter::recordPosition() const {
+ ColumnWriter::recordPosition();
+ dataStream->recordPosition(rowIndexPosition.get());
+ }
+
+ /**
+ * Implementation of increasing sorted string dictionary
+ */
+ class SortedStringDictionary {
+ public:
+ struct DictEntry {
+ DictEntry(const char * str, size_t len):data(str),length(len) {}
+ const char * data;
+ size_t length;
+ };
+
+ SortedStringDictionary():totalLength(0) {}
+
+ // insert a new string into dictionary, return its insertion order
+ size_t insert(const char * data, size_t len);
+
+ // write dictionary data & length to output buffer
+ void flush(AppendOnlyBufferedStream * dataStream,
+ RleEncoder * lengthEncoder) const;
+
+ // reorder input index buffer from insertion order to dictionary order
+ void reorder(std::vector<int64_t>& idxBuffer) const;
+
+ // get dict entries in insertion order
+ void getEntriesInInsertionOrder(std::vector<const DictEntry *>&) const;
+
+ // return count of entries
+ size_t size() const;
+
+    // return total length of strings in the dictionary
+ uint64_t length() const;
+
+ void clear();
+
+ private:
+ struct LessThan {
+ bool operator()(const DictEntry& left, const DictEntry& right) const {
+ int ret = memcmp(left.data, right.data, std::min(left.length, right.length));
+ if (ret != 0) {
+ return ret < 0;
+ }
+ return left.length < right.length;
+ }
+ };
+
+ std::map<DictEntry, size_t, LessThan> dict;
+ std::vector<std::vector<char>> data;
+ uint64_t totalLength;
+
+ // use friend class here to avoid being bothered by const function calls
+ friend class StringColumnWriter;
+ friend class CharColumnWriter;
+ friend class VarCharColumnWriter;
+ // store indexes of insertion order in the dictionary for not-null rows
+ std::vector<int64_t> idxInDictBuffer;
+ };
+
+ // insert a new string into dictionary, return its insertion order
+ size_t SortedStringDictionary::insert(const char * str, size_t len) {
+ auto ret = dict.insert({DictEntry(str, len), dict.size()});
+ if (ret.second) {
+ // make a copy to internal storage
+ data.push_back(std::vector<char>(len));
+ memcpy(data.back().data(), str, len);
+ // update dictionary entry to link pointer to internal storage
+ DictEntry * entry = const_cast<DictEntry *>(&(ret.first->first));
+ entry->data = data.back().data();
+ totalLength += len;
+ }
+ return ret.first->second;
+ }
+
+ // write dictionary data & length to output buffer
+ void SortedStringDictionary::flush(AppendOnlyBufferedStream * dataStream,
+ RleEncoder * lengthEncoder) const {
+ for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
+ dataStream->write(it->first.data, it->first.length);
+ lengthEncoder->write(static_cast<int64_t>(it->first.length));
+ }
+ }
+
+ /**
+ * Reorder input index buffer from insertion order to dictionary order
+ *
+ * We require this function because string values are buffered by indexes
+   * in their insertion order. Only when the entire dictionary is complete
+   * can we obtain their sorted positions, because the ORC specification
+   * requires the dictionary to be ordered. Therefore this function transforms
+   * the indexes from insertion order to dictionary value order for the
+   * final output.
+ */
+ void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const {
+ // iterate the dictionary to get mapping from insertion order to value order
+ std::vector<size_t> mapping(dict.size());
+ size_t dictIdx = 0;
+ for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
+ mapping[it->second] = dictIdx++;
+ }
+
+ // do the transformation
+ for (size_t i = 0; i != idxBuffer.size(); ++i) {
+ idxBuffer[i] = static_cast<int64_t>(
+ mapping[static_cast<size_t>(idxBuffer[i])]);
+ }
+ }
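+
+  // Worked example (illustrative, not from the sources): inserting "pear",
+  // "apple", "pear", "banana" buffers the insertion-order indexes {0, 1, 0, 2}.
+  // The sorted dictionary is {"apple": 0, "banana": 1, "pear": 2}, so the
+  // mapping above is 0 -> 2, 1 -> 0, 2 -> 1 and the reordered buffer becomes
+  // {2, 0, 2, 1}.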
+
+ // get dict entries in insertion order
+ void SortedStringDictionary::getEntriesInInsertionOrder(
+ std::vector<const DictEntry *>& entries) const {
+ entries.resize(dict.size());
+ for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
+ entries[it->second] = &(it->first);
+ }
+ }
+
+ // return count of entries
+ size_t SortedStringDictionary::size() const {
+ return dict.size();
+ }
+
+  // return total length of strings in the dictionary
+ uint64_t SortedStringDictionary::length() const {
+ return totalLength;
+ }
+
+ void SortedStringDictionary::clear() {
+ totalLength = 0;
+ data.clear();
+ dict.clear();
+ }
+
+ class StringColumnWriter : public ColumnWriter {
+ public:
+ StringColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options);
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+
+ virtual void flush(std::vector<proto::Stream>& streams) override;
+
+ virtual uint64_t getEstimatedSize() const override;
+
+ virtual void getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const override;
+
+ virtual void recordPosition() const override;
+
+ virtual void createRowIndexEntry() override;
+
+ virtual void writeDictionary() override;
+
+ virtual void reset() override;
+
+ private:
+ /**
+ * dictionary related functions
+ */
+ bool checkDictionaryKeyRatio();
+ void createDirectStreams();
+ void createDictStreams();
+ void deleteDictStreams();
+ void fallbackToDirectEncoding();
+
+ protected:
+ RleVersion rleVersion;
+ bool useCompression;
+ const StreamsFactory& streamsFactory;
+ bool alignedBitPacking;
+
+ // direct encoding streams
+ std::unique_ptr<RleEncoder> directLengthEncoder;
+ std::unique_ptr<AppendOnlyBufferedStream> directDataStream;
+
+ // dictionary encoding streams
+ std::unique_ptr<RleEncoder> dictDataEncoder;
+ std::unique_ptr<RleEncoder> dictLengthEncoder;
+ std::unique_ptr<AppendOnlyBufferedStream> dictStream;
+
+ /**
+ * dictionary related variables
+ */
+ SortedStringDictionary dictionary;
+ // whether or not dictionary checking is done
+ bool doneDictionaryCheck;
+    // whether or not dictionary encoding should be used
+ bool useDictionary;
+    // distinct dictionary keys must not exceed this fraction of non-null values
+ double dictSizeThreshold;
+
+ // record start row of each row group; null rows are skipped
+ mutable std::vector<size_t> startOfRowGroups;
+ };
+
+ StringColumnWriter::StringColumnWriter(
+ const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) :
+ ColumnWriter(type, factory, options),
+ rleVersion(options.getRleVersion()),
+ useCompression(options.getCompression() != CompressionKind_NONE),
+ streamsFactory(factory),
+ alignedBitPacking(options.getAlignedBitpacking()),
+ doneDictionaryCheck(false),
+ useDictionary(options.getEnableDictionary()),
+ dictSizeThreshold(options.getDictionaryKeySizeThreshold()){
+ if (type.getKind() == TypeKind::BINARY) {
+ useDictionary = false;
+ doneDictionaryCheck = true;
+ }
+
+ if (useDictionary) {
+ createDictStreams();
+ } else {
+ doneDictionaryCheck = true;
+ createDirectStreams();
+ }
+
+ if (enableIndex) {
+ recordPosition();
+ }
+ }
+
+ void StringColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ const StringVectorBatch* stringBatch =
+ dynamic_cast<const StringVectorBatch*>(&rowBatch);
+ if (stringBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to StringVectorBatch");
+ }
+
+ StringColumnStatisticsImpl* strStats =
+ dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (strStats == nullptr) {
+ throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ char *const * data = stringBatch->data.data() + offset;
+ const int64_t* length = stringBatch->length.data() + offset;
+ const char* notNull = stringBatch->hasNulls ?
+ stringBatch->notNull.data() + offset : nullptr;
+
+ if (!useDictionary){
+ directLengthEncoder->add(length, numValues, notNull);
+ }
+
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!notNull || notNull[i]) {
+ const size_t len = static_cast<size_t>(length[i]);
+ if (useDictionary) {
+ size_t index = dictionary.insert(data[i], len);
+ dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index));
+ } else {
+ directDataStream->write(data[i], len);
+ }
+ if (enableBloomFilter) {
+ bloomFilter->addBytes(data[i], static_cast<int64_t>(len));
+ }
+ strStats->update(data[i], len);
+ ++count;
+ }
+ }
+ strStats->increase(count);
+ if (count < numValues) {
+ strStats->setHasNull(true);
+ }
+ }
+
+ void StringColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ ColumnWriter::flush(streams);
+
+ if (useDictionary) {
+ proto::Stream data;
+ data.set_kind(proto::Stream_Kind_DATA);
+ data.set_column(static_cast<uint32_t>(columnId));
+ data.set_length(dictDataEncoder->flush());
+ streams.push_back(data);
+
+ proto::Stream dict;
+ dict.set_kind(proto::Stream_Kind_DICTIONARY_DATA);
+ dict.set_column(static_cast<uint32_t>(columnId));
+ dict.set_length(dictStream->flush());
+ streams.push_back(dict);
+
+ proto::Stream length;
+ length.set_kind(proto::Stream_Kind_LENGTH);
+ length.set_column(static_cast<uint32_t>(columnId));
+ length.set_length(dictLengthEncoder->flush());
+ streams.push_back(length);
+ } else {
+ proto::Stream length;
+ length.set_kind(proto::Stream_Kind_LENGTH);
+ length.set_column(static_cast<uint32_t>(columnId));
+ length.set_length(directLengthEncoder->flush());
+ streams.push_back(length);
+
+ proto::Stream data;
+ data.set_kind(proto::Stream_Kind_DATA);
+ data.set_column(static_cast<uint32_t>(columnId));
+ data.set_length(directDataStream->flush());
+ streams.push_back(data);
+ }
+ }
+
+ uint64_t StringColumnWriter::getEstimatedSize() const {
+ uint64_t size = ColumnWriter::getEstimatedSize();
+ if (!useDictionary) {
+ size += directLengthEncoder->getBufferSize();
+ size += directDataStream->getSize();
+ } else {
+ size += dictionary.length();
+ size += dictionary.size() * sizeof(int32_t);
+ size += dictionary.idxInDictBuffer.size() * sizeof(int32_t);
+ if (useCompression) {
+ size /= 3; // estimated ratio is 3:1
+ }
+ }
+ return size;
+ }
+
+ void StringColumnWriter::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
+ proto::ColumnEncoding encoding;
+ if (!useDictionary) {
+ encoding.set_kind(rleVersion == RleVersion_1 ?
+ proto::ColumnEncoding_Kind_DIRECT :
+ proto::ColumnEncoding_Kind_DIRECT_V2);
+ } else {
+ encoding.set_kind(rleVersion == RleVersion_1 ?
+ proto::ColumnEncoding_Kind_DICTIONARY :
+ proto::ColumnEncoding_Kind_DICTIONARY_V2);
+ }
+ encoding.set_dictionarysize(static_cast<uint32_t>(dictionary.size()));
+ if (enableBloomFilter) {
+ encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ }
+ encodings.push_back(encoding);
+ }
+
+ void StringColumnWriter::recordPosition() const {
+ ColumnWriter::recordPosition();
+ if (!useDictionary) {
+ directDataStream->recordPosition(rowIndexPosition.get());
+ directLengthEncoder->recordPosition(rowIndexPosition.get());
+ } else {
+ if (enableIndex) {
+ startOfRowGroups.push_back(dictionary.idxInDictBuffer.size());
+ }
+ }
+ }
+
+ bool StringColumnWriter::checkDictionaryKeyRatio() {
+ if (!doneDictionaryCheck) {
+ useDictionary = dictionary.size() <= static_cast<size_t>(
+ static_cast<double>(dictionary.idxInDictBuffer.size()) * dictSizeThreshold);
+ doneDictionaryCheck = true;
+ }
+
+ return useDictionary;
+ }
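+
+  // For instance (hypothetical numbers, not from the sources): with
+  // dictSizeThreshold = 0.5 and 10000 buffered non-null values in
+  // idxInDictBuffer, dictionary encoding is kept only while the dictionary
+  // holds at most 10000 * 0.5 = 5000 distinct keys; otherwise the writer
+  // falls back to direct encoding.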
+
+ void StringColumnWriter::createRowIndexEntry() {
+ if (useDictionary && !doneDictionaryCheck) {
+ if (!checkDictionaryKeyRatio()) {
+ fallbackToDirectEncoding();
+ }
+ }
+ ColumnWriter::createRowIndexEntry();
+ }
+
+ void StringColumnWriter::reset() {
+ ColumnWriter::reset();
+
+ dictionary.clear();
+ dictionary.idxInDictBuffer.resize(0);
+ startOfRowGroups.clear();
+ startOfRowGroups.push_back(0);
+ }
+
+ void StringColumnWriter::createDirectStreams() {
+ std::unique_ptr<BufferedOutputStream> directLengthStream =
+ streamsFactory.createStream(proto::Stream_Kind_LENGTH);
+ directLengthEncoder = createRleEncoder(std::move(directLengthStream),
+ false,
+ rleVersion,
+ memPool,
+ alignedBitPacking);
+ directDataStream.reset(new AppendOnlyBufferedStream(
+ streamsFactory.createStream(proto::Stream_Kind_DATA)));
+ }
+
+ void StringColumnWriter::createDictStreams() {
+ std::unique_ptr<BufferedOutputStream> dictDataStream =
+ streamsFactory.createStream(proto::Stream_Kind_DATA);
+ dictDataEncoder = createRleEncoder(std::move(dictDataStream),
+ false,
+ rleVersion,
+ memPool,
+ alignedBitPacking);
+ std::unique_ptr<BufferedOutputStream> dictLengthStream =
+ streamsFactory.createStream(proto::Stream_Kind_LENGTH);
+ dictLengthEncoder = createRleEncoder(std::move(dictLengthStream),
+ false,
+ rleVersion,
+ memPool,
+ alignedBitPacking);
+ dictStream.reset(new AppendOnlyBufferedStream(
+ streamsFactory.createStream(proto::Stream_Kind_DICTIONARY_DATA)));
+ }
+
+ void StringColumnWriter::deleteDictStreams() {
+ dictDataEncoder.reset(nullptr);
+ dictLengthEncoder.reset(nullptr);
+ dictStream.reset(nullptr);
+
+ dictionary.clear();
+ dictionary.idxInDictBuffer.clear();
+ startOfRowGroups.clear();
+ }
+
+ void StringColumnWriter::writeDictionary() {
+ if (useDictionary && !doneDictionaryCheck) {
+ // when index is disabled, dictionary check happens while writing 1st stripe
+ if (!checkDictionaryKeyRatio()) {
+ fallbackToDirectEncoding();
+ return;
+ }
+ }
+
+ if (useDictionary) {
+ // flush dictionary data & length streams
+ dictionary.flush(dictStream.get(), dictLengthEncoder.get());
+
+ // convert index from insertion order to dictionary order
+ dictionary.reorder(dictionary.idxInDictBuffer);
+
+ // write data sequences
+ int64_t * data = dictionary.idxInDictBuffer.data();
+ if (enableIndex) {
+ size_t prevOffset = 0;
+ for (size_t i = 0; i < startOfRowGroups.size(); ++i) {
+ // write sequences in batch for a row group stride
+ size_t offset = startOfRowGroups[i];
+ dictDataEncoder->add(data + prevOffset, offset - prevOffset, nullptr);
+
+ // update index positions
+ int rowGroupId = static_cast<int>(i);
+ proto::RowIndexEntry* indexEntry =
+ (rowGroupId < rowIndex->entry_size()) ?
+ rowIndex->mutable_entry(rowGroupId) : rowIndexEntry.get();
+
+ // add positions for direct streams
+ RowIndexPositionRecorder recorder(*indexEntry);
+ dictDataEncoder->recordPosition(&recorder);
+
+ prevOffset = offset;
+ }
+
+ dictDataEncoder->add(data + prevOffset,
+ dictionary.idxInDictBuffer.size() - prevOffset,
+ nullptr);
+ } else {
+ dictDataEncoder->add(data, dictionary.idxInDictBuffer.size(), nullptr);
+ }
+ }
+ }
+
+ void StringColumnWriter::fallbackToDirectEncoding() {
+ createDirectStreams();
+
+ if (enableIndex) {
+ // fallback happens at the 1st row group;
+ // simply complete positions for direct streams
+ proto::RowIndexEntry * indexEntry = rowIndexEntry.get();
+ RowIndexPositionRecorder recorder(*indexEntry);
+ directDataStream->recordPosition(&recorder);
+ directLengthEncoder->recordPosition(&recorder);
+ }
+
+ // get dictionary entries in insertion order
+ std::vector<const SortedStringDictionary::DictEntry *> entries;
+ dictionary.getEntriesInInsertionOrder(entries);
+
+      // write each row's data and length back out using direct encoding
+ const SortedStringDictionary::DictEntry * dictEntry = nullptr;
+ for (uint64_t i = 0; i != dictionary.idxInDictBuffer.size(); ++i) {
+ // write one row data in direct encoding
+ dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer[i])];
+ directDataStream->write(dictEntry->data, dictEntry->length);
+ directLengthEncoder->write(static_cast<int64_t>(dictEntry->length));
+ }
+
+ deleteDictStreams();
+ }
+
+ struct Utf8Utils {
+ /**
+      * Counts how many UTF-8 characters the input data contains
+ */
+ static uint64_t charLength(const char * data, uint64_t length) {
+ uint64_t chars = 0;
+ for (uint64_t i = 0; i < length; i++) {
+ if (isUtfStartByte(data[i])) {
+ chars++;
+ }
+ }
+ return chars;
+ }
+
+ /**
+ * Return the number of bytes required to read at most maxCharLength
+ * characters in full from a utf-8 encoded byte array provided
+ * by data. This does not validate utf-8 data, but
+ * operates correctly on already valid utf-8 data.
+ *
+ * @param maxCharLength number of characters required
+ * @param data the bytes of UTF-8
+      * @param length the length of the data in bytes
+ */
+ static uint64_t truncateBytesTo(uint64_t maxCharLength,
+ const char * data,
+ uint64_t length) {
+ uint64_t chars = 0;
+ if (length <= maxCharLength) {
+ return length;
+ }
+ for (uint64_t i = 0; i < length; i++) {
+ if (isUtfStartByte(data[i])) {
+ chars++;
+ }
+ if (chars > maxCharLength) {
+ return i;
+ }
+ }
+ // everything fits
+ return length;
+ }
+
+ /**
+ * Checks if b is the first byte of a UTF-8 character.
+ */
+ inline static bool isUtfStartByte(char b) {
+ return (b & 0xC0) != 0x80;
+ }
+
+ /**
+      * Find the start of the last character that ends within the given range.
+      * @param text the UTF-8 encoded bytes
+      * @param from the first byte position of the range
+      * @param until the last byte position of the range
+      * @return the index of the first byte of the last character
+ */
+ static uint64_t findLastCharacter(const char * text, uint64_t from, uint64_t until) {
+ uint64_t posn = until;
+        /* we do not expect characters longer than 5 bytes */
+ while (posn >= from) {
+ if (isUtfStartByte(text[posn])) {
+ return posn;
+ }
+ posn -= 1;
+ }
+ /* beginning of a valid char not found */
+ throw std::logic_error(
+ "Could not truncate string, beginning of a valid char not found");
+ }
+ };
+
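[Editorial aside, not part of the ORC sources] A self-contained sketch of the start-byte rule Utf8Utils relies on: UTF-8 continuation bytes always match the bit pattern 10xxxxxx, so counting the bytes that do not match it counts whole characters:

    #include <cassert>
    #include <cstdint>
    #include <string>

    static bool isStartByte(char b) { return (b & 0xC0) != 0x80; }

    static uint64_t countChars(const std::string& s) {
      uint64_t chars = 0;
      for (char b : s) {
        if (isStartByte(b)) ++chars;        // one start byte per character
      }
      return chars;
    }

    int main() {
      assert(countChars("abc") == 3);           // ASCII: 1 byte per character
      assert(countChars("\xC3\xA9") == 1);      // U+00E9: 2 bytes, 1 character
      assert(countChars("\xE2\x82\xAC") == 1);  // U+20AC: 3 bytes, 1 character
      return 0;
    }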
+ class CharColumnWriter : public StringColumnWriter {
+ public:
+ CharColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) :
+ StringColumnWriter(type, factory, options),
+ maxLength(type.getMaximumLength()),
+ padBuffer(*options.getMemoryPool()) {
+        // a UTF-8 character is currently at most 4 bytes long, but could be up to 6
+ padBuffer.resize(maxLength * 6);
+ }
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+
+ private:
+ uint64_t maxLength;
+ DataBuffer<char> padBuffer;
+ };
+
+ void CharColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
+ if (charsBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to StringVectorBatch");
+ }
+
+ StringColumnStatisticsImpl* strStats =
+ dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (strStats == nullptr) {
+ throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ char** data = charsBatch->data.data() + offset;
+ int64_t* length = charsBatch->length.data() + offset;
+ const char* notNull = charsBatch->hasNulls ?
+ charsBatch->notNull.data() + offset : nullptr;
+
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!notNull || notNull[i]) {
+ const char * charData = nullptr;
+ uint64_t originLength = static_cast<uint64_t>(length[i]);
+ uint64_t charLength = Utf8Utils::charLength(data[i], originLength);
+ if (charLength >= maxLength) {
+ charData = data[i];
+ length[i] = static_cast<int64_t>(
+ Utf8Utils::truncateBytesTo(maxLength, data[i], originLength));
+ } else {
+ charData = padBuffer.data();
+            // each padding character (a space) is exactly 1 byte
+ length[i] = length[i] + static_cast<int64_t>(maxLength - charLength);
+ memcpy(padBuffer.data(), data[i], originLength);
+ memset(padBuffer.data() + originLength,
+ ' ',
+ static_cast<size_t>(length[i]) - originLength);
+ }
+
+ if (useDictionary) {
+ size_t index = dictionary.insert(charData, static_cast<size_t>(length[i]));
+ dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index));
+ } else {
+ directDataStream->write(charData, static_cast<size_t>(length[i]));
+ }
+
+ if (enableBloomFilter) {
+ bloomFilter->addBytes(data[i], length[i]);
+ }
+ strStats->update(charData, static_cast<size_t>(length[i]));
+ ++count;
+ }
+ }
+
+ if (!useDictionary) {
+ directLengthEncoder->add(length, numValues, notNull);
+ }
+
+ strStats->increase(count);
+ if (count < numValues) {
+ strStats->setHasNull(true);
+ }
+ }
+
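[Editorial aside] A sketch of the CHAR(n) behaviour implemented above, using a hypothetical padOrTruncate() helper: values holding at least maxLength characters are truncated to maxLength characters, shorter values are padded with one space byte per missing character:

    #include <cassert>
    #include <cstdint>
    #include <string>

    static bool isStartByte(char b) { return (b & 0xC0) != 0x80; }

    static std::string padOrTruncate(const std::string& v, uint64_t maxChars) {
      uint64_t chars = 0;
      for (char b : v) if (isStartByte(b)) ++chars;
      if (chars >= maxChars) {
        uint64_t seen = 0;
        for (size_t i = 0; i < v.size(); ++i) {
          if (isStartByte(v[i])) {
            if (seen == maxChars) return v.substr(0, i);   // keep first maxChars characters
            ++seen;
          }
        }
        return v;                                          // exactly maxChars characters
      }
      return v + std::string(maxChars - chars, ' ');       // pad: 1 byte per missing character
    }

    int main() {
      assert(padOrTruncate("ab", 4) == "ab  ");
      assert(padOrTruncate("abcdef", 4) == "abcd");
      assert(padOrTruncate("\xC3\xA9", 3) == "\xC3\xA9  ");  // 2-byte char + 2 spaces = 4 bytes
      return 0;
    }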
+ class VarCharColumnWriter : public StringColumnWriter {
+ public:
+ VarCharColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) :
+ StringColumnWriter(type, factory, options),
+ maxLength(type.getMaximumLength()) {
+ // PASS
+ }
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+
+ private:
+ uint64_t maxLength;
+ };
+
+ void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
+ if (charsBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to StringVectorBatch");
+ }
+
+ StringColumnStatisticsImpl* strStats =
+ dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (strStats == nullptr) {
+ throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ char* const* data = charsBatch->data.data() + offset;
+ int64_t* length = charsBatch->length.data() + offset;
+ const char* notNull = charsBatch->hasNulls ?
+ charsBatch->notNull.data() + offset : nullptr;
+
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!notNull || notNull[i]) {
+ uint64_t itemLength = Utf8Utils::truncateBytesTo(
+ maxLength, data[i], static_cast<uint64_t>(length[i]));
+ length[i] = static_cast<int64_t>(itemLength);
+
+ if (useDictionary) {
+ size_t index = dictionary.insert(data[i], static_cast<size_t>(length[i]));
+ dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index));
+ } else {
+ directDataStream->write(data[i], static_cast<size_t>(length[i]));
+ }
+
+ if (enableBloomFilter) {
+ bloomFilter->addBytes(data[i], length[i]);
+ }
+ strStats->update(data[i], static_cast<size_t>(length[i]));
+ ++count;
+ }
+ }
+
+ if (!useDictionary) {
+ directLengthEncoder->add(length, numValues, notNull);
+ }
+
+ strStats->increase(count);
+ if (count < numValues) {
+ strStats->setHasNull(true);
+ }
+ }
+
+ class BinaryColumnWriter : public StringColumnWriter {
+ public:
+ BinaryColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) :
+ StringColumnWriter(type, factory, options) {
+ // PASS
+ }
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+ };
+
+ void BinaryColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ StringVectorBatch* binBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
+ if (binBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to StringVectorBatch");
+ }
+
+ BinaryColumnStatisticsImpl* binStats =
+ dynamic_cast<BinaryColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (binStats == nullptr) {
+ throw InvalidArgument("Failed to cast to BinaryColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ char** data = binBatch->data.data() + offset;
+ int64_t* length = binBatch->length.data() + offset;
+ const char* notNull = binBatch->hasNulls ?
+ binBatch->notNull.data() + offset : nullptr;
+
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ uint64_t unsignedLength = static_cast<uint64_t>(length[i]);
+ if (!notNull || notNull[i]) {
+ directDataStream->write(data[i], unsignedLength);
+
+ binStats->update(unsignedLength);
+ ++count;
+ }
+ }
+ directLengthEncoder->add(length, numValues, notNull);
+ binStats->increase(count);
+ if (count < numValues) {
+ binStats->setHasNull(true);
+ }
+ }
+
+ class TimestampColumnWriter : public ColumnWriter {
+ public:
+ TimestampColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options);
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+
+ virtual void flush(std::vector<proto::Stream>& streams) override;
+
+ virtual uint64_t getEstimatedSize() const override;
+
+ virtual void getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const override;
+
+ virtual void recordPosition() const override;
+
+ protected:
+ std::unique_ptr<RleEncoder> secRleEncoder, nanoRleEncoder;
+
+ private:
+ RleVersion rleVersion;
+ const Timezone& timezone;
+ };
+
+ TimestampColumnWriter::TimestampColumnWriter(
+ const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) :
+ ColumnWriter(type, factory, options),
+ rleVersion(options.getRleVersion()),
+ timezone(getTimezoneByName("GMT")){
+ std::unique_ptr<BufferedOutputStream> dataStream =
+ factory.createStream(proto::Stream_Kind_DATA);
+ std::unique_ptr<BufferedOutputStream> secondaryStream =
+ factory.createStream(proto::Stream_Kind_SECONDARY);
+ secRleEncoder = createRleEncoder(std::move(dataStream),
+ true,
+ rleVersion,
+ memPool,
+ options.getAlignedBitpacking());
+ nanoRleEncoder = createRleEncoder(std::move(secondaryStream),
+ false,
+ rleVersion,
+ memPool,
+ options.getAlignedBitpacking());
+
+ if (enableIndex) {
+ recordPosition();
+ }
+ }
+
+ // Because the number of nanoseconds often has a large number of trailing zeros,
+ // the number has trailing decimal zero digits removed and the last three bits
+ // are used to record how many zeros were removed if the trailing zeros are
+ // more than 2. Thus 1000 nanoseconds would be serialized as 0x0a and
+ // 100000 would be serialized as 0x0c.
+ static int64_t formatNano(int64_t nanos) {
+ if (nanos == 0) {
+ return 0;
+ } else if (nanos % 100 != 0) {
+ return (nanos) << 3;
+ } else {
+ nanos /= 100;
+ int64_t trailingZeros = 1;
+ while (nanos % 10 == 0 && trailingZeros < 7) {
+ nanos /= 10;
+ trailingZeros += 1;
+ }
+ return (nanos) << 3 | trailingZeros;
+ }
+ }
+
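[Editorial aside] A small verification sketch of the trailing-zero scheme described above; it reimplements the same arithmetic and checks the two values quoted in the comment:

    #include <cassert>
    #include <cstdint>

    static int64_t formatNanoSketch(int64_t nanos) {
      if (nanos == 0) return 0;
      if (nanos % 100 != 0) return nanos << 3;   // fewer than 2 trailing zeros: no marker
      nanos /= 100;
      int64_t removed = 1;                       // an encoded '1' means 2 zeros removed
      while (nanos % 10 == 0 && removed < 7) {
        nanos /= 10;
        ++removed;
      }
      return (nanos << 3) | removed;
    }

    int main() {
      assert(formatNanoSketch(0) == 0);
      assert(formatNanoSketch(1000) == 0x0a);       // 1 << 3 | 2
      assert(formatNanoSketch(100000) == 0x0c);     // 1 << 3 | 4
      assert(formatNanoSketch(123456789) == (123456789LL << 3));
      return 0;
    }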
+ void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ TimestampVectorBatch* tsBatch =
+ dynamic_cast<TimestampVectorBatch*>(&rowBatch);
+ if (tsBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to TimestampVectorBatch");
+ }
+
+ TimestampColumnStatisticsImpl* tsStats =
+ dynamic_cast<TimestampColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (tsStats == nullptr) {
+ throw InvalidArgument("Failed to cast to TimestampColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ const char* notNull = tsBatch->hasNulls ?
+ tsBatch->notNull.data() + offset : nullptr;
+ int64_t *secs = tsBatch->data.data() + offset;
+ int64_t *nanos = tsBatch->nanoseconds.data() + offset;
+
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (notNull == nullptr || notNull[i]) {
+ // TimestampVectorBatch already stores data in UTC
+ int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000;
+ ++count;
+ if (enableBloomFilter) {
+ bloomFilter->addLong(millsUTC);
+ }
+ tsStats->update(millsUTC);
+
if (secs[i] < 0 && nanos[i] > 999999) {
- secs[i] += 1;
- }
-
- secs[i] -= timezone.getEpoch();
- nanos[i] = formatNano(nanos[i]);
- }
- }
- tsStats->increase(count);
- if (count < numValues) {
- tsStats->setHasNull(true);
- }
-
- secRleEncoder->add(secs, numValues, notNull);
- nanoRleEncoder->add(nanos, numValues, notNull);
- }
-
- void TimestampColumnWriter::flush(std::vector<proto::Stream>& streams) {
- ColumnWriter::flush(streams);
-
- proto::Stream dataStream;
- dataStream.set_kind(proto::Stream_Kind_DATA);
- dataStream.set_column(static_cast<uint32_t>(columnId));
- dataStream.set_length(secRleEncoder->flush());
- streams.push_back(dataStream);
-
- proto::Stream secondaryStream;
- secondaryStream.set_kind(proto::Stream_Kind_SECONDARY);
- secondaryStream.set_column(static_cast<uint32_t>(columnId));
- secondaryStream.set_length(nanoRleEncoder->flush());
- streams.push_back(secondaryStream);
- }
-
- uint64_t TimestampColumnWriter::getEstimatedSize() const {
- uint64_t size = ColumnWriter::getEstimatedSize();
- size += secRleEncoder->getBufferSize();
- size += nanoRleEncoder->getBufferSize();
- return size;
- }
-
- void TimestampColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
- proto::ColumnEncoding encoding;
- encoding.set_kind(RleVersionMapper(rleVersion));
- encoding.set_dictionarysize(0);
- if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
- }
- encodings.push_back(encoding);
- }
-
- void TimestampColumnWriter::recordPosition() const {
- ColumnWriter::recordPosition();
- secRleEncoder->recordPosition(rowIndexPosition.get());
- nanoRleEncoder->recordPosition(rowIndexPosition.get());
- }
-
- class DateColumnWriter : public IntegerColumnWriter {
- public:
- DateColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
- };
-
- DateColumnWriter::DateColumnWriter(
- const Type &type,
- const StreamsFactory &factory,
- const WriterOptions &options) :
- IntegerColumnWriter(type, factory, options) {
- // PASS
- }
-
- void DateColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- const LongVectorBatch* longBatch =
- dynamic_cast<const LongVectorBatch*>(&rowBatch);
- if (longBatch == nullptr) {
- throw InvalidArgument("Failed to cast to LongVectorBatch");
- }
-
- DateColumnStatisticsImpl* dateStats =
- dynamic_cast<DateColumnStatisticsImpl*>(colIndexStatistics.get());
- if (dateStats == nullptr) {
- throw InvalidArgument("Failed to cast to DateColumnStatisticsImpl");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- const int64_t* data = longBatch->data.data() + offset;
- const char* notNull = longBatch->hasNulls ?
- longBatch->notNull.data() + offset : nullptr;
-
- rleEncoder->add(data, numValues, notNull);
-
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (!notNull || notNull[i]) {
- ++count;
- dateStats->update(static_cast<int32_t>(data[i]));
- if (enableBloomFilter) {
- bloomFilter->addLong(data[i]);
- }
- }
- }
- dateStats->increase(count);
- if (count < numValues) {
- dateStats->setHasNull(true);
- }
- }
-
- class Decimal64ColumnWriter : public ColumnWriter {
- public:
- static const uint32_t MAX_PRECISION_64 = 18;
- static const uint32_t MAX_PRECISION_128 = 38;
-
- Decimal64ColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
-
- virtual void flush(std::vector<proto::Stream>& streams) override;
-
- virtual uint64_t getEstimatedSize() const override;
-
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
-
- virtual void recordPosition() const override;
-
- protected:
- RleVersion rleVersion;
- uint64_t precision;
- uint64_t scale;
- std::unique_ptr<AppendOnlyBufferedStream> valueStream;
- std::unique_ptr<RleEncoder> scaleEncoder;
-
- private:
- char buffer[10];
- };
-
- Decimal64ColumnWriter::Decimal64ColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()),
- precision(type.getPrecision()),
- scale(type.getScale()) {
- valueStream.reset(new AppendOnlyBufferedStream(
- factory.createStream(proto::Stream_Kind_DATA)));
- std::unique_ptr<BufferedOutputStream> scaleStream =
- factory.createStream(proto::Stream_Kind_SECONDARY);
- scaleEncoder = createRleEncoder(std::move(scaleStream),
- true,
- rleVersion,
- memPool,
- options.getAlignedBitpacking());
-
- if (enableIndex) {
- recordPosition();
- }
- }
-
- void Decimal64ColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- const Decimal64VectorBatch* decBatch =
- dynamic_cast<const Decimal64VectorBatch*>(&rowBatch);
- if (decBatch == nullptr) {
- throw InvalidArgument("Failed to cast to Decimal64VectorBatch");
- }
-
- DecimalColumnStatisticsImpl* decStats =
- dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
- if (decStats == nullptr) {
- throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- const char* notNull = decBatch->hasNulls ?
- decBatch->notNull.data() + offset : nullptr;
- const int64_t* values = decBatch->values.data() + offset;
-
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (!notNull || notNull[i]) {
- int64_t val = zigZag(values[i]);
- char* data = buffer;
- while (true) {
- if ((val & ~0x7f) == 0) {
- *(data++) = (static_cast<char>(val));
- break;
- } else {
- *(data++) = static_cast<char>(0x80 | (val & 0x7f));
- // cast val to unsigned so as to force 0-fill right shift
- val = (static_cast<uint64_t>(val) >> 7);
- }
- }
- valueStream->write(buffer, static_cast<size_t>(data - buffer));
- ++count;
- if (enableBloomFilter) {
- std::string decimal = Decimal(
- values[i], static_cast<int32_t>(scale)).toString();
- bloomFilter->addBytes(
- decimal.c_str(), static_cast<int64_t>(decimal.size()));
- }
- decStats->update(Decimal(values[i], static_cast<int32_t>(scale)));
- }
- }
- decStats->increase(count);
- if (count < numValues) {
- decStats->setHasNull(true);
- }
- std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale));
- scaleEncoder->add(scales.data(), numValues, notNull);
- }
-
- void Decimal64ColumnWriter::flush(std::vector<proto::Stream>& streams) {
- ColumnWriter::flush(streams);
-
- proto::Stream dataStream;
- dataStream.set_kind(proto::Stream_Kind_DATA);
- dataStream.set_column(static_cast<uint32_t>(columnId));
- dataStream.set_length(valueStream->flush());
- streams.push_back(dataStream);
-
- proto::Stream secondaryStream;
- secondaryStream.set_kind(proto::Stream_Kind_SECONDARY);
- secondaryStream.set_column(static_cast<uint32_t>(columnId));
- secondaryStream.set_length(scaleEncoder->flush());
- streams.push_back(secondaryStream);
- }
-
- uint64_t Decimal64ColumnWriter::getEstimatedSize() const {
- uint64_t size = ColumnWriter::getEstimatedSize();
- size += valueStream->getSize();
- size += scaleEncoder->getBufferSize();
- return size;
- }
-
- void Decimal64ColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
- proto::ColumnEncoding encoding;
- encoding.set_kind(RleVersionMapper(rleVersion));
- encoding.set_dictionarysize(0);
- if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
- }
- encodings.push_back(encoding);
- }
-
- void Decimal64ColumnWriter::recordPosition() const {
- ColumnWriter::recordPosition();
- valueStream->recordPosition(rowIndexPosition.get());
- scaleEncoder->recordPosition(rowIndexPosition.get());
- }
-
- class Decimal128ColumnWriter : public Decimal64ColumnWriter {
- public:
- Decimal128ColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
-
- private:
- char buffer[20];
- };
-
- Decimal128ColumnWriter::Decimal128ColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- Decimal64ColumnWriter(type, factory, options) {
- // PASS
- }
-
- // Zigzag encoding moves the sign bit to the least significant bit using the
-  // expression (val << 1) ^ (val >> 63) and derives its name from the fact that
- // positive and negative numbers alternate once encoded.
- Int128 zigZagInt128(const Int128& value) {
- bool isNegative = value < 0;
- Int128 val = value.abs();
- val <<= 1;
- if (isNegative) {
- val -= 1;
- }
- return val;
- }
-
- void Decimal128ColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- const Decimal128VectorBatch* decBatch =
- dynamic_cast<const Decimal128VectorBatch*>(&rowBatch);
- if (decBatch == nullptr) {
- throw InvalidArgument("Failed to cast to Decimal128VectorBatch");
- }
-
- DecimalColumnStatisticsImpl* decStats =
- dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
- if (decStats == nullptr) {
- throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- const char* notNull = decBatch->hasNulls ?
- decBatch->notNull.data() + offset : nullptr;
- const Int128* values = decBatch->values.data() + offset;
-
- // The current encoding of decimal columns stores the integer representation
- // of the value as an unbounded length zigzag encoded base 128 varint.
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (!notNull || notNull[i]) {
- Int128 val = zigZagInt128(values[i]);
- char* data = buffer;
- while (true) {
- if ((val & ~0x7f) == 0) {
- *(data++) = (static_cast<char>(val.getLowBits()));
- break;
- } else {
- *(data++) = static_cast<char>(0x80 | (val.getLowBits() & 0x7f));
- val >>= 7;
- }
- }
- valueStream->write(buffer, static_cast<size_t>(data - buffer));
-
- ++count;
- if (enableBloomFilter) {
- std::string decimal = Decimal(
- values[i], static_cast<int32_t>(scale)).toString();
- bloomFilter->addBytes(
- decimal.c_str(), static_cast<int64_t>(decimal.size()));
- }
- decStats->update(Decimal(values[i], static_cast<int32_t>(scale)));
- }
- }
- decStats->increase(count);
- if (count < numValues) {
- decStats->setHasNull(true);
- }
- std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale));
- scaleEncoder->add(scales.data(), numValues, notNull);
- }
-
- class ListColumnWriter : public ColumnWriter {
- public:
- ListColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
- ~ListColumnWriter() override;
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
-
- virtual void flush(std::vector<proto::Stream>& streams) override;
-
- virtual uint64_t getEstimatedSize() const override;
-
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
-
- virtual void getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
-
- virtual void getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
-
- virtual void mergeStripeStatsIntoFileStats() override;
-
- virtual void mergeRowGroupStatsIntoStripeStats() override;
-
- virtual void createRowIndexEntry() override;
-
- virtual void writeIndex(
- std::vector<proto::Stream> &streams) const override;
-
- virtual void recordPosition() const override;
-
- virtual void writeDictionary() override;
-
- virtual void reset() override;
-
- private:
- std::unique_ptr<RleEncoder> lengthEncoder;
- RleVersion rleVersion;
- std::unique_ptr<ColumnWriter> child;
- };
-
- ListColumnWriter::ListColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()){
-
- std::unique_ptr<BufferedOutputStream> lengthStream =
- factory.createStream(proto::Stream_Kind_LENGTH);
- lengthEncoder = createRleEncoder(std::move(lengthStream),
- false,
- rleVersion,
- memPool,
- options.getAlignedBitpacking());
-
- if (type.getSubtypeCount() == 1) {
- child = buildWriter(*type.getSubtype(0), factory, options);
- }
-
- if (enableIndex) {
- recordPosition();
- }
- }
-
- ListColumnWriter::~ListColumnWriter() {
- // PASS
- }
-
- void ListColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- ListVectorBatch* listBatch = dynamic_cast<ListVectorBatch*>(&rowBatch);
- if (listBatch == nullptr) {
- throw InvalidArgument("Failed to cast to ListVectorBatch");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- int64_t* offsets = listBatch->offsets.data() + offset;
- const char* notNull = listBatch->hasNulls ?
- listBatch->notNull.data() + offset : nullptr;
-
- uint64_t elemOffset = static_cast<uint64_t>(offsets[0]);
- uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]);
-
- // translate offsets to lengths
- for (uint64_t i = 0; i != numValues; ++i) {
- offsets[i] = offsets[i + 1] - offsets[i];
- }
-
- // unnecessary to deal with null as elements are packed together
- if (child.get()) {
- child->add(*listBatch->elements, elemOffset, totalNumValues, nullptr);
- }
- lengthEncoder->add(offsets, numValues, notNull);
-
- if (enableIndex) {
- if (!notNull) {
- colIndexStatistics->increase(numValues);
- } else {
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (notNull[i]) {
- ++count;
- if (enableBloomFilter) {
- bloomFilter->addLong(offsets[i]);
- }
- }
- }
- colIndexStatistics->increase(count);
- if (count < numValues) {
- colIndexStatistics->setHasNull(true);
- }
- }
- }
- }
-
- void ListColumnWriter::flush(std::vector<proto::Stream>& streams) {
- ColumnWriter::flush(streams);
-
- proto::Stream stream;
- stream.set_kind(proto::Stream_Kind_LENGTH);
- stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(lengthEncoder->flush());
- streams.push_back(stream);
-
- if (child.get()) {
- child->flush(streams);
- }
- }
-
- void ListColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const {
- ColumnWriter::writeIndex(streams);
- if (child.get()) {
- child->writeIndex(streams);
- }
- }
-
- uint64_t ListColumnWriter::getEstimatedSize() const {
- uint64_t size = ColumnWriter::getEstimatedSize();
- if (child.get()) {
- size += lengthEncoder->getBufferSize();
- size += child->getEstimatedSize();
- }
- return size;
- }
-
- void ListColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
- proto::ColumnEncoding encoding;
- encoding.set_kind(RleVersionMapper(rleVersion));
- encoding.set_dictionarysize(0);
- if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
- }
- encodings.push_back(encoding);
- if (child.get()) {
- child->getColumnEncoding(encodings);
- }
- }
-
- void ListColumnWriter::getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
- ColumnWriter::getStripeStatistics(stats);
- if (child.get()) {
- child->getStripeStatistics(stats);
- }
- }
-
- void ListColumnWriter::mergeStripeStatsIntoFileStats() {
- ColumnWriter::mergeStripeStatsIntoFileStats();
- if (child.get()) {
- child->mergeStripeStatsIntoFileStats();
- }
- }
-
- void ListColumnWriter::getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
- ColumnWriter::getFileStatistics(stats);
- if (child.get()) {
- child->getFileStatistics(stats);
- }
- }
-
- void ListColumnWriter::mergeRowGroupStatsIntoStripeStats() {
- ColumnWriter::mergeRowGroupStatsIntoStripeStats();
- if (child.get()) {
- child->mergeRowGroupStatsIntoStripeStats();
- }
- }
-
- void ListColumnWriter::createRowIndexEntry() {
- ColumnWriter::createRowIndexEntry();
- if (child.get()) {
- child->createRowIndexEntry();
- }
- }
-
- void ListColumnWriter::recordPosition() const {
- ColumnWriter::recordPosition();
- lengthEncoder->recordPosition(rowIndexPosition.get());
- }
-
- void ListColumnWriter::reset() {
- ColumnWriter::reset();
- if (child) {
- child->reset();
- }
- }
-
- void ListColumnWriter::writeDictionary() {
- if (child) {
- child->writeDictionary();
- }
- }
-
- class MapColumnWriter : public ColumnWriter {
- public:
- MapColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
- ~MapColumnWriter() override;
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
-
- virtual void flush(std::vector<proto::Stream>& streams) override;
-
- virtual uint64_t getEstimatedSize() const override;
-
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
-
- virtual void getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
-
- virtual void getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
-
- virtual void mergeStripeStatsIntoFileStats() override;
-
- virtual void mergeRowGroupStatsIntoStripeStats() override;
-
- virtual void createRowIndexEntry() override;
-
- virtual void writeIndex(
- std::vector<proto::Stream> &streams) const override;
-
- virtual void recordPosition() const override;
-
- virtual void writeDictionary() override;
-
- virtual void reset() override;
-
- private:
- std::unique_ptr<ColumnWriter> keyWriter;
- std::unique_ptr<ColumnWriter> elemWriter;
- std::unique_ptr<RleEncoder> lengthEncoder;
- RleVersion rleVersion;
- };
-
- MapColumnWriter::MapColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()){
- std::unique_ptr<BufferedOutputStream> lengthStream =
- factory.createStream(proto::Stream_Kind_LENGTH);
- lengthEncoder = createRleEncoder(std::move(lengthStream),
- false,
- rleVersion,
- memPool,
- options.getAlignedBitpacking());
-
- if (type.getSubtypeCount() > 0) {
- keyWriter = buildWriter(*type.getSubtype(0), factory, options);
- }
-
- if (type.getSubtypeCount() > 1) {
- elemWriter = buildWriter(*type.getSubtype(1), factory, options);
- }
-
- if (enableIndex) {
- recordPosition();
- }
- }
-
- MapColumnWriter::~MapColumnWriter() {
- // PASS
- }
-
- void MapColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- MapVectorBatch* mapBatch = dynamic_cast<MapVectorBatch*>(&rowBatch);
- if (mapBatch == nullptr) {
- throw InvalidArgument("Failed to cast to MapVectorBatch");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- int64_t* offsets = mapBatch->offsets.data() + offset;
- const char* notNull = mapBatch->hasNulls ?
- mapBatch->notNull.data() + offset : nullptr;
-
- uint64_t elemOffset = static_cast<uint64_t>(offsets[0]);
- uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]);
-
- // translate offsets to lengths
- for (uint64_t i = 0; i != numValues; ++i) {
- offsets[i] = offsets[i + 1] - offsets[i];
- }
-
- lengthEncoder->add(offsets, numValues, notNull);
-
- // unnecessary to deal with null as keys and values are packed together
- if (keyWriter.get()) {
- keyWriter->add(*mapBatch->keys, elemOffset, totalNumValues, nullptr);
- }
- if (elemWriter.get()) {
- elemWriter->add(*mapBatch->elements, elemOffset, totalNumValues, nullptr);
- }
-
- if (enableIndex) {
- if (!notNull) {
- colIndexStatistics->increase(numValues);
- } else {
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (notNull[i]) {
- ++count;
- if (enableBloomFilter) {
- bloomFilter->addLong(offsets[i]);
- }
- }
- }
- colIndexStatistics->increase(count);
- if (count < numValues) {
- colIndexStatistics->setHasNull(true);
- }
- }
- }
- }
-
- void MapColumnWriter::flush(std::vector<proto::Stream>& streams) {
- ColumnWriter::flush(streams);
-
- proto::Stream stream;
- stream.set_kind(proto::Stream_Kind_LENGTH);
- stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(lengthEncoder->flush());
- streams.push_back(stream);
-
- if (keyWriter.get()) {
- keyWriter->flush(streams);
- }
- if (elemWriter.get()) {
- elemWriter->flush(streams);
- }
- }
-
- void MapColumnWriter::writeIndex(
- std::vector<proto::Stream> &streams) const {
- ColumnWriter::writeIndex(streams);
- if (keyWriter.get()) {
- keyWriter->writeIndex(streams);
- }
- if (elemWriter.get()) {
- elemWriter->writeIndex(streams);
- }
- }
-
- uint64_t MapColumnWriter::getEstimatedSize() const {
- uint64_t size = ColumnWriter::getEstimatedSize();
- size += lengthEncoder->getBufferSize();
- if (keyWriter.get()) {
- size += keyWriter->getEstimatedSize();
- }
- if (elemWriter.get()) {
- size += elemWriter->getEstimatedSize();
- }
- return size;
- }
-
- void MapColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
- proto::ColumnEncoding encoding;
- encoding.set_kind(RleVersionMapper(rleVersion));
- encoding.set_dictionarysize(0);
- if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
- }
- encodings.push_back(encoding);
- if (keyWriter.get()) {
- keyWriter->getColumnEncoding(encodings);
- }
- if (elemWriter.get()) {
- elemWriter->getColumnEncoding(encodings);
- }
- }
-
- void MapColumnWriter::getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
- ColumnWriter::getStripeStatistics(stats);
- if (keyWriter.get()) {
- keyWriter->getStripeStatistics(stats);
- }
- if (elemWriter.get()) {
- elemWriter->getStripeStatistics(stats);
- }
- }
-
- void MapColumnWriter::mergeStripeStatsIntoFileStats() {
- ColumnWriter::mergeStripeStatsIntoFileStats();
- if (keyWriter.get()) {
- keyWriter->mergeStripeStatsIntoFileStats();
- }
- if (elemWriter.get()) {
- elemWriter->mergeStripeStatsIntoFileStats();
- }
- }
-
- void MapColumnWriter::getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
- ColumnWriter::getFileStatistics(stats);
- if (keyWriter.get()) {
- keyWriter->getFileStatistics(stats);
- }
- if (elemWriter.get()) {
- elemWriter->getFileStatistics(stats);
- }
- }
-
- void MapColumnWriter::mergeRowGroupStatsIntoStripeStats() {
- ColumnWriter::mergeRowGroupStatsIntoStripeStats();
- if (keyWriter.get()) {
- keyWriter->mergeRowGroupStatsIntoStripeStats();
- }
- if (elemWriter.get()) {
- elemWriter->mergeRowGroupStatsIntoStripeStats();
- }
- }
-
- void MapColumnWriter::createRowIndexEntry() {
- ColumnWriter::createRowIndexEntry();
- if (keyWriter.get()) {
- keyWriter->createRowIndexEntry();
- }
- if (elemWriter.get()) {
- elemWriter->createRowIndexEntry();
- }
- }
-
- void MapColumnWriter::recordPosition() const {
- ColumnWriter::recordPosition();
- lengthEncoder->recordPosition(rowIndexPosition.get());
- }
-
- void MapColumnWriter::reset() {
- ColumnWriter::reset();
- if (keyWriter) {
- keyWriter->reset();
- }
- if (elemWriter) {
- elemWriter->reset();
- }
- }
-
- void MapColumnWriter::writeDictionary() {
- if (keyWriter) {
- keyWriter->writeDictionary();
- }
- if (elemWriter) {
- elemWriter->writeDictionary();
- }
- }
-
- class UnionColumnWriter : public ColumnWriter {
- public:
- UnionColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
- ~UnionColumnWriter() override;
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
-
- virtual void flush(std::vector<proto::Stream>& streams) override;
-
- virtual uint64_t getEstimatedSize() const override;
-
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
-
- virtual void getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
-
- virtual void getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
-
- virtual void mergeStripeStatsIntoFileStats() override;
-
- virtual void mergeRowGroupStatsIntoStripeStats() override;
-
- virtual void createRowIndexEntry() override;
-
- virtual void writeIndex(
- std::vector<proto::Stream> &streams) const override;
-
- virtual void recordPosition() const override;
-
- virtual void writeDictionary() override;
-
- virtual void reset() override;
-
- private:
- std::unique_ptr<ByteRleEncoder> rleEncoder;
- std::vector<ColumnWriter*> children;
- };
-
- UnionColumnWriter::UnionColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options) {
-
- std::unique_ptr<BufferedOutputStream> dataStream =
- factory.createStream(proto::Stream_Kind_DATA);
- rleEncoder = createByteRleEncoder(std::move(dataStream));
-
- for (uint64_t i = 0; i != type.getSubtypeCount(); ++i) {
- children.push_back(buildWriter(*type.getSubtype(i),
- factory,
- options).release());
- }
-
- if (enableIndex) {
- recordPosition();
- }
- }
-
- UnionColumnWriter::~UnionColumnWriter() {
- for (uint32_t i = 0; i < children.size(); ++i) {
- delete children[i];
- }
- }
-
- void UnionColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- UnionVectorBatch* unionBatch = dynamic_cast<UnionVectorBatch*>(&rowBatch);
- if (unionBatch == nullptr) {
- throw InvalidArgument("Failed to cast to UnionVectorBatch");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- const char* notNull = unionBatch->hasNulls ?
- unionBatch->notNull.data() + offset : nullptr;
- unsigned char * tags = unionBatch->tags.data() + offset;
- uint64_t * offsets = unionBatch->offsets.data() + offset;
-
- std::vector<int64_t> childOffset(children.size(), -1);
- std::vector<uint64_t> childLength(children.size(), 0);
-
- for (uint64_t i = 0; i != numValues; ++i) {
- if (childOffset[tags[i]] == -1) {
- childOffset[tags[i]] = static_cast<int64_t>(offsets[i]);
- }
- ++childLength[tags[i]];
- }
-
- rleEncoder->add(reinterpret_cast<char*>(tags), numValues, notNull);
-
- for (uint32_t i = 0; i < children.size(); ++i) {
- if (childLength[i] > 0) {
- children[i]->add(*unionBatch->children[i],
- static_cast<uint64_t>(childOffset[i]),
- childLength[i], nullptr);
- }
- }
-
- // update stats
- if (enableIndex) {
- if (!notNull) {
- colIndexStatistics->increase(numValues);
- } else {
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (notNull[i]) {
- ++count;
- if (enableBloomFilter) {
- bloomFilter->addLong(tags[i]);
- }
- }
- }
- colIndexStatistics->increase(count);
- if (count < numValues) {
- colIndexStatistics->setHasNull(true);
- }
- }
- }
- }
-
- void UnionColumnWriter::flush(std::vector<proto::Stream>& streams) {
- ColumnWriter::flush(streams);
-
- proto::Stream stream;
- stream.set_kind(proto::Stream_Kind_DATA);
- stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(rleEncoder->flush());
- streams.push_back(stream);
-
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->flush(streams);
- }
- }
-
- void UnionColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const {
- ColumnWriter::writeIndex(streams);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->writeIndex(streams);
- }
- }
-
- uint64_t UnionColumnWriter::getEstimatedSize() const {
- uint64_t size = ColumnWriter::getEstimatedSize();
- size += rleEncoder->getBufferSize();
- for (uint32_t i = 0; i < children.size(); ++i) {
- size += children[i]->getEstimatedSize();
- }
- return size;
- }
-
- void UnionColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
- proto::ColumnEncoding encoding;
- encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
- encoding.set_dictionarysize(0);
- if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
- }
- encodings.push_back(encoding);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->getColumnEncoding(encodings);
- }
- }
-
- void UnionColumnWriter::getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
- ColumnWriter::getStripeStatistics(stats);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->getStripeStatistics(stats);
- }
- }
-
- void UnionColumnWriter::mergeStripeStatsIntoFileStats() {
- ColumnWriter::mergeStripeStatsIntoFileStats();
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->mergeStripeStatsIntoFileStats();
- }
- }
-
- void UnionColumnWriter::getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
- ColumnWriter::getFileStatistics(stats);
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->getFileStatistics(stats);
- }
- }
-
- void UnionColumnWriter::mergeRowGroupStatsIntoStripeStats() {
- ColumnWriter::mergeRowGroupStatsIntoStripeStats();
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->mergeRowGroupStatsIntoStripeStats();
- }
- }
-
- void UnionColumnWriter::createRowIndexEntry() {
- ColumnWriter::createRowIndexEntry();
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->createRowIndexEntry();
- }
- }
-
- void UnionColumnWriter::recordPosition() const {
- ColumnWriter::recordPosition();
- rleEncoder->recordPosition(rowIndexPosition.get());
- }
-
- void UnionColumnWriter::reset() {
- ColumnWriter::reset();
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->reset();
- }
- }
-
- void UnionColumnWriter::writeDictionary() {
- for (uint32_t i = 0; i < children.size(); ++i) {
- children[i]->writeDictionary();
- }
- }
-
- std::unique_ptr<ColumnWriter> buildWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) {
- switch (static_cast<int64_t>(type.getKind())) {
- case STRUCT:
- return std::unique_ptr<ColumnWriter>(
- new StructColumnWriter(
- type,
- factory,
- options));
- case INT:
- case LONG:
- case SHORT:
- return std::unique_ptr<ColumnWriter>(
- new IntegerColumnWriter(
- type,
- factory,
- options));
- case BYTE:
- return std::unique_ptr<ColumnWriter>(
- new ByteColumnWriter(
- type,
- factory,
- options));
- case BOOLEAN:
- return std::unique_ptr<ColumnWriter>(
- new BooleanColumnWriter(
- type,
- factory,
- options));
- case DOUBLE:
- return std::unique_ptr<ColumnWriter>(
- new DoubleColumnWriter(
- type,
- factory,
- options,
- false));
- case FLOAT:
- return std::unique_ptr<ColumnWriter>(
- new DoubleColumnWriter(
- type,
- factory,
- options,
- true));
- case BINARY:
- return std::unique_ptr<ColumnWriter>(
- new BinaryColumnWriter(
- type,
- factory,
- options));
- case STRING:
- return std::unique_ptr<ColumnWriter>(
- new StringColumnWriter(
- type,
- factory,
- options));
- case CHAR:
- return std::unique_ptr<ColumnWriter>(
- new CharColumnWriter(
- type,
- factory,
- options));
- case VARCHAR:
- return std::unique_ptr<ColumnWriter>(
- new VarCharColumnWriter(
- type,
- factory,
- options));
- case DATE:
- return std::unique_ptr<ColumnWriter>(
- new DateColumnWriter(
- type,
- factory,
- options));
- case TIMESTAMP:
- return std::unique_ptr<ColumnWriter>(
- new TimestampColumnWriter(
- type,
- factory,
- options));
- case DECIMAL:
- if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_64) {
- return std::unique_ptr<ColumnWriter>(
- new Decimal64ColumnWriter(
- type,
- factory,
- options));
- } else if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_128) {
- return std::unique_ptr<ColumnWriter>(
- new Decimal128ColumnWriter(
- type,
- factory,
- options));
- } else {
- throw NotImplementedYet("Decimal precision more than 38 is not "
- "supported");
- }
- case LIST:
- return std::unique_ptr<ColumnWriter>(
- new ListColumnWriter(
- type,
- factory,
- options));
- case MAP:
- return std::unique_ptr<ColumnWriter>(
- new MapColumnWriter(
- type,
- factory,
- options));
- case UNION:
- return std::unique_ptr<ColumnWriter>(
- new UnionColumnWriter(
- type,
- factory,
- options));
- default:
- throw NotImplementedYet("Type is not supported yet for creating "
- "ColumnWriter.");
- }
- }
-}
+ secs[i] += 1;
+ }
+
+ secs[i] -= timezone.getEpoch();
+ nanos[i] = formatNano(nanos[i]);
+ }
+ }
+ tsStats->increase(count);
+ if (count < numValues) {
+ tsStats->setHasNull(true);
+ }
+
+ secRleEncoder->add(secs, numValues, notNull);
+ nanoRleEncoder->add(nanos, numValues, notNull);
+ }
+
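[Editorial aside] A sketch of how one non-null row is split by the add() method above, assuming the usual ORC epoch of 2015-01-01 00:00:00 UTC for timezone.getEpoch():

    #include <cassert>
    #include <cstdint>

    int main() {
      const int64_t orcEpoch = 1420070400LL;  // 2015-01-01 00:00:00 UTC (assumed)
      int64_t secs  = 1420070401LL;           // one second past that epoch
      int64_t nanos = 100000;                 // 0.0001 s
      // statistics and bloom filters are fed milliseconds since the Unix epoch
      int64_t millisUTC = secs * 1000 + nanos / 1000000;
      assert(millisUTC == 1420070401000LL);
      // the DATA stream stores seconds relative to the ORC epoch
      secs -= orcEpoch;
      assert(secs == 1);
      // the SECONDARY stream stores formatNano(100000), i.e. (1 << 3) | 4 == 0x0c
      return 0;
    }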
+ void TimestampColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ ColumnWriter::flush(streams);
+
+ proto::Stream dataStream;
+ dataStream.set_kind(proto::Stream_Kind_DATA);
+ dataStream.set_column(static_cast<uint32_t>(columnId));
+ dataStream.set_length(secRleEncoder->flush());
+ streams.push_back(dataStream);
+
+ proto::Stream secondaryStream;
+ secondaryStream.set_kind(proto::Stream_Kind_SECONDARY);
+ secondaryStream.set_column(static_cast<uint32_t>(columnId));
+ secondaryStream.set_length(nanoRleEncoder->flush());
+ streams.push_back(secondaryStream);
+ }
+
+ uint64_t TimestampColumnWriter::getEstimatedSize() const {
+ uint64_t size = ColumnWriter::getEstimatedSize();
+ size += secRleEncoder->getBufferSize();
+ size += nanoRleEncoder->getBufferSize();
+ return size;
+ }
+
+ void TimestampColumnWriter::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
+ proto::ColumnEncoding encoding;
+ encoding.set_kind(RleVersionMapper(rleVersion));
+ encoding.set_dictionarysize(0);
+ if (enableBloomFilter) {
+ encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ }
+ encodings.push_back(encoding);
+ }
+
+ void TimestampColumnWriter::recordPosition() const {
+ ColumnWriter::recordPosition();
+ secRleEncoder->recordPosition(rowIndexPosition.get());
+ nanoRleEncoder->recordPosition(rowIndexPosition.get());
+ }
+
+ class DateColumnWriter : public IntegerColumnWriter {
+ public:
+ DateColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options);
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+ };
+
+ DateColumnWriter::DateColumnWriter(
+ const Type &type,
+ const StreamsFactory &factory,
+ const WriterOptions &options) :
+ IntegerColumnWriter(type, factory, options) {
+ // PASS
+ }
+
+ void DateColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ const LongVectorBatch* longBatch =
+ dynamic_cast<const LongVectorBatch*>(&rowBatch);
+ if (longBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to LongVectorBatch");
+ }
+
+ DateColumnStatisticsImpl* dateStats =
+ dynamic_cast<DateColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (dateStats == nullptr) {
+ throw InvalidArgument("Failed to cast to DateColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ const int64_t* data = longBatch->data.data() + offset;
+ const char* notNull = longBatch->hasNulls ?
+ longBatch->notNull.data() + offset : nullptr;
+
+ rleEncoder->add(data, numValues, notNull);
+
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!notNull || notNull[i]) {
+ ++count;
+ dateStats->update(static_cast<int32_t>(data[i]));
+ if (enableBloomFilter) {
+ bloomFilter->addLong(data[i]);
+ }
+ }
+ }
+ dateStats->increase(count);
+ if (count < numValues) {
+ dateStats->setHasNull(true);
+ }
+ }
+
+ class Decimal64ColumnWriter : public ColumnWriter {
+ public:
+ static const uint32_t MAX_PRECISION_64 = 18;
+ static const uint32_t MAX_PRECISION_128 = 38;
+
+ Decimal64ColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options);
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+
+ virtual void flush(std::vector<proto::Stream>& streams) override;
+
+ virtual uint64_t getEstimatedSize() const override;
+
+ virtual void getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const override;
+
+ virtual void recordPosition() const override;
+
+ protected:
+ RleVersion rleVersion;
+ uint64_t precision;
+ uint64_t scale;
+ std::unique_ptr<AppendOnlyBufferedStream> valueStream;
+ std::unique_ptr<RleEncoder> scaleEncoder;
+
+ private:
+ char buffer[10];
+ };
+
+ Decimal64ColumnWriter::Decimal64ColumnWriter(
+ const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) :
+ ColumnWriter(type, factory, options),
+ rleVersion(options.getRleVersion()),
+ precision(type.getPrecision()),
+ scale(type.getScale()) {
+ valueStream.reset(new AppendOnlyBufferedStream(
+ factory.createStream(proto::Stream_Kind_DATA)));
+ std::unique_ptr<BufferedOutputStream> scaleStream =
+ factory.createStream(proto::Stream_Kind_SECONDARY);
+ scaleEncoder = createRleEncoder(std::move(scaleStream),
+ true,
+ rleVersion,
+ memPool,
+ options.getAlignedBitpacking());
+
+ if (enableIndex) {
+ recordPosition();
+ }
+ }
+
+ void Decimal64ColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ const Decimal64VectorBatch* decBatch =
+ dynamic_cast<const Decimal64VectorBatch*>(&rowBatch);
+ if (decBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to Decimal64VectorBatch");
+ }
+
+ DecimalColumnStatisticsImpl* decStats =
+ dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (decStats == nullptr) {
+ throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ const char* notNull = decBatch->hasNulls ?
+ decBatch->notNull.data() + offset : nullptr;
+ const int64_t* values = decBatch->values.data() + offset;
+
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!notNull || notNull[i]) {
+ int64_t val = zigZag(values[i]);
+ char* data = buffer;
+ while (true) {
+ if ((val & ~0x7f) == 0) {
+ *(data++) = (static_cast<char>(val));
+ break;
+ } else {
+ *(data++) = static_cast<char>(0x80 | (val & 0x7f));
+ // cast val to unsigned so as to force 0-fill right shift
+ val = (static_cast<uint64_t>(val) >> 7);
+ }
+ }
+ valueStream->write(buffer, static_cast<size_t>(data - buffer));
+ ++count;
+ if (enableBloomFilter) {
+ std::string decimal = Decimal(
+ values[i], static_cast<int32_t>(scale)).toString();
+ bloomFilter->addBytes(
+ decimal.c_str(), static_cast<int64_t>(decimal.size()));
+ }
+ decStats->update(Decimal(values[i], static_cast<int32_t>(scale)));
+ }
+ }
+ decStats->increase(count);
+ if (count < numValues) {
+ decStats->setHasNull(true);
+ }
+ std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale));
+ scaleEncoder->add(scales.data(), numValues, notNull);
+ }
+
+ void Decimal64ColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ ColumnWriter::flush(streams);
+
+ proto::Stream dataStream;
+ dataStream.set_kind(proto::Stream_Kind_DATA);
+ dataStream.set_column(static_cast<uint32_t>(columnId));
+ dataStream.set_length(valueStream->flush());
+ streams.push_back(dataStream);
+
+ proto::Stream secondaryStream;
+ secondaryStream.set_kind(proto::Stream_Kind_SECONDARY);
+ secondaryStream.set_column(static_cast<uint32_t>(columnId));
+ secondaryStream.set_length(scaleEncoder->flush());
+ streams.push_back(secondaryStream);
+ }
+
+ uint64_t Decimal64ColumnWriter::getEstimatedSize() const {
+ uint64_t size = ColumnWriter::getEstimatedSize();
+ size += valueStream->getSize();
+ size += scaleEncoder->getBufferSize();
+ return size;
+ }
+
+ void Decimal64ColumnWriter::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
+ proto::ColumnEncoding encoding;
+ encoding.set_kind(RleVersionMapper(rleVersion));
+ encoding.set_dictionarysize(0);
+ if (enableBloomFilter) {
+ encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ }
+ encodings.push_back(encoding);
+ }
+
+ void Decimal64ColumnWriter::recordPosition() const {
+ ColumnWriter::recordPosition();
+ valueStream->recordPosition(rowIndexPosition.get());
+ scaleEncoder->recordPosition(rowIndexPosition.get());
+ }
+
+ class Decimal128ColumnWriter : public Decimal64ColumnWriter {
+ public:
+ Decimal128ColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options);
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+
+ private:
+ char buffer[20];
+ };
+
+ Decimal128ColumnWriter::Decimal128ColumnWriter(
+ const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) :
+ Decimal64ColumnWriter(type, factory, options) {
+ // PASS
+ }
+
+ // Zigzag encoding moves the sign bit to the least significant bit using the
+  // expression (val << 1) ^ (val >> 63) and derives its name from the fact that
+ // positive and negative numbers alternate once encoded.
+ Int128 zigZagInt128(const Int128& value) {
+ bool isNegative = value < 0;
+ Int128 val = value.abs();
+ val <<= 1;
+ if (isNegative) {
+ val -= 1;
+ }
+ return val;
+ }
+
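[Editorial aside] The same mapping for 64-bit values, following the formula quoted in the comment above (an illustration, not the library's own zigZag() helper): small magnitudes of either sign become small unsigned values.

    #include <cassert>
    #include <cstdint>

    static uint64_t zigZag64(int64_t val) {
      return (static_cast<uint64_t>(val) << 1) ^ static_cast<uint64_t>(val >> 63);
    }

    int main() {
      assert(zigZag64(0)  == 0);
      assert(zigZag64(-1) == 1);
      assert(zigZag64(1)  == 2);
      assert(zigZag64(-2) == 3);
      assert(zigZag64(2)  == 4);
      return 0;
    }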
+ void Decimal128ColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ const Decimal128VectorBatch* decBatch =
+ dynamic_cast<const Decimal128VectorBatch*>(&rowBatch);
+ if (decBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to Decimal128VectorBatch");
+ }
+
+ DecimalColumnStatisticsImpl* decStats =
+ dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (decStats == nullptr) {
+ throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ const char* notNull = decBatch->hasNulls ?
+ decBatch->notNull.data() + offset : nullptr;
+ const Int128* values = decBatch->values.data() + offset;
+
+ // The current encoding of decimal columns stores the integer representation
+ // of the value as an unbounded length zigzag encoded base 128 varint.
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!notNull || notNull[i]) {
+ Int128 val = zigZagInt128(values[i]);
+ char* data = buffer;
+ while (true) {
+ if ((val & ~0x7f) == 0) {
+ *(data++) = (static_cast<char>(val.getLowBits()));
+ break;
+ } else {
+ *(data++) = static_cast<char>(0x80 | (val.getLowBits() & 0x7f));
+ val >>= 7;
+ }
+ }
+ valueStream->write(buffer, static_cast<size_t>(data - buffer));
+
+ ++count;
+ if (enableBloomFilter) {
+ std::string decimal = Decimal(
+ values[i], static_cast<int32_t>(scale)).toString();
+ bloomFilter->addBytes(
+ decimal.c_str(), static_cast<int64_t>(decimal.size()));
+ }
+ decStats->update(Decimal(values[i], static_cast<int32_t>(scale)));
+ }
+ }
+ decStats->increase(count);
+ if (count < numValues) {
+ decStats->setHasNull(true);
+ }
+ std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale));
+ scaleEncoder->add(scales.data(), numValues, notNull);
+ }
+
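[Editorial aside] A sketch of the unbounded base-128 varint layout that both decimal writers emit into the DATA stream (hypothetical toVarint() helper, not from the library): 7 payload bits per byte, with the high bit set on every byte except the last.

    #include <cassert>
    #include <cstdint>
    #include <string>

    static std::string toVarint(uint64_t val) {
      std::string out;
      while (true) {
        if ((val & ~0x7fULL) == 0) {
          out.push_back(static_cast<char>(val));   // final byte: high bit clear
          return out;
        }
        out.push_back(static_cast<char>(0x80 | (val & 0x7f)));
        val >>= 7;
      }
    }

    int main() {
      assert(toVarint(0x7f) == "\x7f");                        // one byte
      assert(toVarint(0x80) == std::string("\x80\x01", 2));    // 128 -> two bytes
      assert(toVarint(300)  == std::string("\xac\x02", 2));    // 300 = 0b10'0101100
      return 0;
    }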
+ class ListColumnWriter : public ColumnWriter {
+ public:
+ ListColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options);
+ ~ListColumnWriter() override;
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+
+ virtual void flush(std::vector<proto::Stream>& streams) override;
+
+ virtual uint64_t getEstimatedSize() const override;
+
+ virtual void getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const override;
+
+ virtual void getStripeStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const override;
+
+ virtual void getFileStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const override;
+
+ virtual void mergeStripeStatsIntoFileStats() override;
+
+ virtual void mergeRowGroupStatsIntoStripeStats() override;
+
+ virtual void createRowIndexEntry() override;
+
+ virtual void writeIndex(
+ std::vector<proto::Stream> &streams) const override;
+
+ virtual void recordPosition() const override;
+
+ virtual void writeDictionary() override;
+
+ virtual void reset() override;
+
+ private:
+ std::unique_ptr<RleEncoder> lengthEncoder;
+ RleVersion rleVersion;
+ std::unique_ptr<ColumnWriter> child;
+ };
+
+ ListColumnWriter::ListColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) :
+ ColumnWriter(type, factory, options),
+ rleVersion(options.getRleVersion()){
+
+ std::unique_ptr<BufferedOutputStream> lengthStream =
+ factory.createStream(proto::Stream_Kind_LENGTH);
+ lengthEncoder = createRleEncoder(std::move(lengthStream),
+ false,
+ rleVersion,
+ memPool,
+ options.getAlignedBitpacking());
+
+ if (type.getSubtypeCount() == 1) {
+ child = buildWriter(*type.getSubtype(0), factory, options);
+ }
+
+ if (enableIndex) {
+ recordPosition();
+ }
+ }
+
+ ListColumnWriter::~ListColumnWriter() {
+ // PASS
+ }
+
+ void ListColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ ListVectorBatch* listBatch = dynamic_cast<ListVectorBatch*>(&rowBatch);
+ if (listBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to ListVectorBatch");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ int64_t* offsets = listBatch->offsets.data() + offset;
+ const char* notNull = listBatch->hasNulls ?
+ listBatch->notNull.data() + offset : nullptr;
+
+ uint64_t elemOffset = static_cast<uint64_t>(offsets[0]);
+ uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]);
+
+ // translate offsets to lengths
+ for (uint64_t i = 0; i != numValues; ++i) {
+ offsets[i] = offsets[i + 1] - offsets[i];
+ }
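+    // For example (illustration only): batch offsets [0, 2, 2, 5] for three rows
+    // become lengths [2, 0, 3]; elemOffset is 0 and totalNumValues is 5 for the
+    // child add below.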
+
+ // unnecessary to deal with null as elements are packed together
+ if (child.get()) {
+ child->add(*listBatch->elements, elemOffset, totalNumValues, nullptr);
+ }
+ lengthEncoder->add(offsets, numValues, notNull);
+
+ if (enableIndex) {
+ if (!notNull) {
+ colIndexStatistics->increase(numValues);
+ } else {
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (notNull[i]) {
+ ++count;
+ if (enableBloomFilter) {
+ bloomFilter->addLong(offsets[i]);
+ }
+ }
+ }
+ colIndexStatistics->increase(count);
+ if (count < numValues) {
+ colIndexStatistics->setHasNull(true);
+ }
+ }
+ }
+ }
+
+ void ListColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ ColumnWriter::flush(streams);
+
+ proto::Stream stream;
+ stream.set_kind(proto::Stream_Kind_LENGTH);
+ stream.set_column(static_cast<uint32_t>(columnId));
+ stream.set_length(lengthEncoder->flush());
+ streams.push_back(stream);
+
+ if (child.get()) {
+ child->flush(streams);
+ }
+ }
+
+ void ListColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const {
+ ColumnWriter::writeIndex(streams);
+ if (child.get()) {
+ child->writeIndex(streams);
+ }
+ }
+
+ uint64_t ListColumnWriter::getEstimatedSize() const {
+ uint64_t size = ColumnWriter::getEstimatedSize();
+ if (child.get()) {
+ size += lengthEncoder->getBufferSize();
+ size += child->getEstimatedSize();
+ }
+ return size;
+ }
+
+ void ListColumnWriter::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
+ proto::ColumnEncoding encoding;
+ encoding.set_kind(RleVersionMapper(rleVersion));
+ encoding.set_dictionarysize(0);
+ if (enableBloomFilter) {
+ encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ }
+ encodings.push_back(encoding);
+ if (child.get()) {
+ child->getColumnEncoding(encodings);
+ }
+ }
+
+ void ListColumnWriter::getStripeStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const {
+ ColumnWriter::getStripeStatistics(stats);
+ if (child.get()) {
+ child->getStripeStatistics(stats);
+ }
+ }
+
+ void ListColumnWriter::mergeStripeStatsIntoFileStats() {
+ ColumnWriter::mergeStripeStatsIntoFileStats();
+ if (child.get()) {
+ child->mergeStripeStatsIntoFileStats();
+ }
+ }
+
+ void ListColumnWriter::getFileStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const {
+ ColumnWriter::getFileStatistics(stats);
+ if (child.get()) {
+ child->getFileStatistics(stats);
+ }
+ }
+
+ void ListColumnWriter::mergeRowGroupStatsIntoStripeStats() {
+ ColumnWriter::mergeRowGroupStatsIntoStripeStats();
+ if (child.get()) {
+ child->mergeRowGroupStatsIntoStripeStats();
+ }
+ }
+
+ void ListColumnWriter::createRowIndexEntry() {
+ ColumnWriter::createRowIndexEntry();
+ if (child.get()) {
+ child->createRowIndexEntry();
+ }
+ }
+
+ void ListColumnWriter::recordPosition() const {
+ ColumnWriter::recordPosition();
+ lengthEncoder->recordPosition(rowIndexPosition.get());
+ }
+
+ void ListColumnWriter::reset() {
+ ColumnWriter::reset();
+ if (child) {
+ child->reset();
+ }
+ }
+
+ void ListColumnWriter::writeDictionary() {
+ if (child) {
+ child->writeDictionary();
+ }
+ }
+
+ class MapColumnWriter : public ColumnWriter {
+ public:
+ MapColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options);
+ ~MapColumnWriter() override;
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+
+ virtual void flush(std::vector<proto::Stream>& streams) override;
+
+ virtual uint64_t getEstimatedSize() const override;
+
+ virtual void getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const override;
+
+ virtual void getStripeStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const override;
+
+ virtual void getFileStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const override;
+
+ virtual void mergeStripeStatsIntoFileStats() override;
+
+ virtual void mergeRowGroupStatsIntoStripeStats() override;
+
+ virtual void createRowIndexEntry() override;
+
+ virtual void writeIndex(
+ std::vector<proto::Stream> &streams) const override;
+
+ virtual void recordPosition() const override;
+
+ virtual void writeDictionary() override;
+
+ virtual void reset() override;
+
+ private:
+ std::unique_ptr<ColumnWriter> keyWriter;
+ std::unique_ptr<ColumnWriter> elemWriter;
+ std::unique_ptr<RleEncoder> lengthEncoder;
+ RleVersion rleVersion;
+ };
+
+ MapColumnWriter::MapColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) :
+ ColumnWriter(type, factory, options),
+ rleVersion(options.getRleVersion()){
+ std::unique_ptr<BufferedOutputStream> lengthStream =
+ factory.createStream(proto::Stream_Kind_LENGTH);
+ lengthEncoder = createRleEncoder(std::move(lengthStream),
+ false,
+ rleVersion,
+ memPool,
+ options.getAlignedBitpacking());
+
+ if (type.getSubtypeCount() > 0) {
+ keyWriter = buildWriter(*type.getSubtype(0), factory, options);
+ }
+
+ if (type.getSubtypeCount() > 1) {
+ elemWriter = buildWriter(*type.getSubtype(1), factory, options);
+ }
+
+ if (enableIndex) {
+ recordPosition();
+ }
+ }
+
+ MapColumnWriter::~MapColumnWriter() {
+ // PASS
+ }
+
+ void MapColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ MapVectorBatch* mapBatch = dynamic_cast<MapVectorBatch*>(&rowBatch);
+ if (mapBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to MapVectorBatch");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ int64_t* offsets = mapBatch->offsets.data() + offset;
+ const char* notNull = mapBatch->hasNulls ?
+ mapBatch->notNull.data() + offset : nullptr;
+
+ uint64_t elemOffset = static_cast<uint64_t>(offsets[0]);
+ uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]);
+
+ // translate offsets to lengths
+ for (uint64_t i = 0; i != numValues; ++i) {
+ offsets[i] = offsets[i + 1] - offsets[i];
+ }
+
+ lengthEncoder->add(offsets, numValues, notNull);
+
+ // unnecessary to deal with null as keys and values are packed together
+ if (keyWriter.get()) {
+ keyWriter->add(*mapBatch->keys, elemOffset, totalNumValues, nullptr);
+ }
+ if (elemWriter.get()) {
+ elemWriter->add(*mapBatch->elements, elemOffset, totalNumValues, nullptr);
+ }
+
+ if (enableIndex) {
+ if (!notNull) {
+ colIndexStatistics->increase(numValues);
+ } else {
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (notNull[i]) {
+ ++count;
+ if (enableBloomFilter) {
+ bloomFilter->addLong(offsets[i]);
+ }
+ }
+ }
+ colIndexStatistics->increase(count);
+ if (count < numValues) {
+ colIndexStatistics->setHasNull(true);
+ }
+ }
+ }
+ }
+
+ void MapColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ ColumnWriter::flush(streams);
+
+ proto::Stream stream;
+ stream.set_kind(proto::Stream_Kind_LENGTH);
+ stream.set_column(static_cast<uint32_t>(columnId));
+ stream.set_length(lengthEncoder->flush());
+ streams.push_back(stream);
+
+ if (keyWriter.get()) {
+ keyWriter->flush(streams);
+ }
+ if (elemWriter.get()) {
+ elemWriter->flush(streams);
+ }
+ }
+
+ void MapColumnWriter::writeIndex(
+ std::vector<proto::Stream> &streams) const {
+ ColumnWriter::writeIndex(streams);
+ if (keyWriter.get()) {
+ keyWriter->writeIndex(streams);
+ }
+ if (elemWriter.get()) {
+ elemWriter->writeIndex(streams);
+ }
+ }
+
+ uint64_t MapColumnWriter::getEstimatedSize() const {
+ uint64_t size = ColumnWriter::getEstimatedSize();
+ size += lengthEncoder->getBufferSize();
+ if (keyWriter.get()) {
+ size += keyWriter->getEstimatedSize();
+ }
+ if (elemWriter.get()) {
+ size += elemWriter->getEstimatedSize();
+ }
+ return size;
+ }
+
+ void MapColumnWriter::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
+ proto::ColumnEncoding encoding;
+ encoding.set_kind(RleVersionMapper(rleVersion));
+ encoding.set_dictionarysize(0);
+ if (enableBloomFilter) {
+ encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ }
+ encodings.push_back(encoding);
+ if (keyWriter.get()) {
+ keyWriter->getColumnEncoding(encodings);
+ }
+ if (elemWriter.get()) {
+ elemWriter->getColumnEncoding(encodings);
+ }
+ }
+
+ void MapColumnWriter::getStripeStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const {
+ ColumnWriter::getStripeStatistics(stats);
+ if (keyWriter.get()) {
+ keyWriter->getStripeStatistics(stats);
+ }
+ if (elemWriter.get()) {
+ elemWriter->getStripeStatistics(stats);
+ }
+ }
+
+ void MapColumnWriter::mergeStripeStatsIntoFileStats() {
+ ColumnWriter::mergeStripeStatsIntoFileStats();
+ if (keyWriter.get()) {
+ keyWriter->mergeStripeStatsIntoFileStats();
+ }
+ if (elemWriter.get()) {
+ elemWriter->mergeStripeStatsIntoFileStats();
+ }
+ }
+
+ void MapColumnWriter::getFileStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const {
+ ColumnWriter::getFileStatistics(stats);
+ if (keyWriter.get()) {
+ keyWriter->getFileStatistics(stats);
+ }
+ if (elemWriter.get()) {
+ elemWriter->getFileStatistics(stats);
+ }
+ }
+
+ void MapColumnWriter::mergeRowGroupStatsIntoStripeStats() {
+ ColumnWriter::mergeRowGroupStatsIntoStripeStats();
+ if (keyWriter.get()) {
+ keyWriter->mergeRowGroupStatsIntoStripeStats();
+ }
+ if (elemWriter.get()) {
+ elemWriter->mergeRowGroupStatsIntoStripeStats();
+ }
+ }
+
+ void MapColumnWriter::createRowIndexEntry() {
+ ColumnWriter::createRowIndexEntry();
+ if (keyWriter.get()) {
+ keyWriter->createRowIndexEntry();
+ }
+ if (elemWriter.get()) {
+ elemWriter->createRowIndexEntry();
+ }
+ }
+
+ void MapColumnWriter::recordPosition() const {
+ ColumnWriter::recordPosition();
+ lengthEncoder->recordPosition(rowIndexPosition.get());
+ }
+
+ void MapColumnWriter::reset() {
+ ColumnWriter::reset();
+ if (keyWriter) {
+ keyWriter->reset();
+ }
+ if (elemWriter) {
+ elemWriter->reset();
+ }
+ }
+
+ void MapColumnWriter::writeDictionary() {
+ if (keyWriter) {
+ keyWriter->writeDictionary();
+ }
+ if (elemWriter) {
+ elemWriter->writeDictionary();
+ }
+ }
+
+ class UnionColumnWriter : public ColumnWriter {
+ public:
+ UnionColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options);
+ ~UnionColumnWriter() override;
+
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) override;
+
+ virtual void flush(std::vector<proto::Stream>& streams) override;
+
+ virtual uint64_t getEstimatedSize() const override;
+
+ virtual void getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const override;
+
+ virtual void getStripeStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const override;
+
+ virtual void getFileStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const override;
+
+ virtual void mergeStripeStatsIntoFileStats() override;
+
+ virtual void mergeRowGroupStatsIntoStripeStats() override;
+
+ virtual void createRowIndexEntry() override;
+
+ virtual void writeIndex(
+ std::vector<proto::Stream> &streams) const override;
+
+ virtual void recordPosition() const override;
+
+ virtual void writeDictionary() override;
+
+ virtual void reset() override;
+
+ private:
+ std::unique_ptr<ByteRleEncoder> rleEncoder;
+ std::vector<ColumnWriter*> children;
+ };
+
+ UnionColumnWriter::UnionColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) :
+ ColumnWriter(type, factory, options) {
+
+ std::unique_ptr<BufferedOutputStream> dataStream =
+ factory.createStream(proto::Stream_Kind_DATA);
+ rleEncoder = createByteRleEncoder(std::move(dataStream));
+
+ for (uint64_t i = 0; i != type.getSubtypeCount(); ++i) {
+ children.push_back(buildWriter(*type.getSubtype(i),
+ factory,
+ options).release());
+ }
+
+ if (enableIndex) {
+ recordPosition();
+ }
+ }
+
+ UnionColumnWriter::~UnionColumnWriter() {
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ delete children[i];
+ }
+ }
+
+ void UnionColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* incomingMask) {
+ UnionVectorBatch* unionBatch = dynamic_cast<UnionVectorBatch*>(&rowBatch);
+ if (unionBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to UnionVectorBatch");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ const char* notNull = unionBatch->hasNulls ?
+ unionBatch->notNull.data() + offset : nullptr;
+ unsigned char * tags = unionBatch->tags.data() + offset;
+ uint64_t * offsets = unionBatch->offsets.data() + offset;
+
+ std::vector<int64_t> childOffset(children.size(), -1);
+ std::vector<uint64_t> childLength(children.size(), 0);
+
+ for (uint64_t i = 0; i != numValues; ++i) {
+ if (childOffset[tags[i]] == -1) {
+ childOffset[tags[i]] = static_cast<int64_t>(offsets[i]);
+ }
+ ++childLength[tags[i]];
+ }
+
+ rleEncoder->add(reinterpret_cast<char*>(tags), numValues, notNull);
+
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ if (childLength[i] > 0) {
+ children[i]->add(*unionBatch->children[i],
+ static_cast<uint64_t>(childOffset[i]),
+ childLength[i], nullptr);
+ }
+ }
+
+ // update stats
+ if (enableIndex) {
+ if (!notNull) {
+ colIndexStatistics->increase(numValues);
+ } else {
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (notNull[i]) {
+ ++count;
+ if (enableBloomFilter) {
+ bloomFilter->addLong(tags[i]);
+ }
+ }
+ }
+ colIndexStatistics->increase(count);
+ if (count < numValues) {
+ colIndexStatistics->setHasNull(true);
+ }
+ }
+ }
+ }
+
+ void UnionColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ ColumnWriter::flush(streams);
+
+ proto::Stream stream;
+ stream.set_kind(proto::Stream_Kind_DATA);
+ stream.set_column(static_cast<uint32_t>(columnId));
+ stream.set_length(rleEncoder->flush());
+ streams.push_back(stream);
+
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->flush(streams);
+ }
+ }
+
+ void UnionColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const {
+ ColumnWriter::writeIndex(streams);
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->writeIndex(streams);
+ }
+ }
+
+ uint64_t UnionColumnWriter::getEstimatedSize() const {
+ uint64_t size = ColumnWriter::getEstimatedSize();
+ size += rleEncoder->getBufferSize();
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ size += children[i]->getEstimatedSize();
+ }
+ return size;
+ }
+
+ void UnionColumnWriter::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
+ proto::ColumnEncoding encoding;
+ encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
+ encoding.set_dictionarysize(0);
+ if (enableBloomFilter) {
+ encoding.set_bloomencoding(BloomFilterVersion::UTF8);
+ }
+ encodings.push_back(encoding);
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->getColumnEncoding(encodings);
+ }
+ }
+
+ void UnionColumnWriter::getStripeStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const {
+ ColumnWriter::getStripeStatistics(stats);
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->getStripeStatistics(stats);
+ }
+ }
+
+ void UnionColumnWriter::mergeStripeStatsIntoFileStats() {
+ ColumnWriter::mergeStripeStatsIntoFileStats();
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->mergeStripeStatsIntoFileStats();
+ }
+ }
+
+ void UnionColumnWriter::getFileStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const {
+ ColumnWriter::getFileStatistics(stats);
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->getFileStatistics(stats);
+ }
+ }
+
+ void UnionColumnWriter::mergeRowGroupStatsIntoStripeStats() {
+ ColumnWriter::mergeRowGroupStatsIntoStripeStats();
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->mergeRowGroupStatsIntoStripeStats();
+ }
+ }
+
+ void UnionColumnWriter::createRowIndexEntry() {
+ ColumnWriter::createRowIndexEntry();
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->createRowIndexEntry();
+ }
+ }
+
+ void UnionColumnWriter::recordPosition() const {
+ ColumnWriter::recordPosition();
+ rleEncoder->recordPosition(rowIndexPosition.get());
+ }
+
+ void UnionColumnWriter::reset() {
+ ColumnWriter::reset();
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->reset();
+ }
+ }
+
+ void UnionColumnWriter::writeDictionary() {
+ for (uint32_t i = 0; i < children.size(); ++i) {
+ children[i]->writeDictionary();
+ }
+ }
+
+ std::unique_ptr<ColumnWriter> buildWriter(
+ const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options) {
+ switch (static_cast<int64_t>(type.getKind())) {
+ case STRUCT:
+ return std::unique_ptr<ColumnWriter>(
+ new StructColumnWriter(
+ type,
+ factory,
+ options));
+ case INT:
+ case LONG:
+ case SHORT:
+ return std::unique_ptr<ColumnWriter>(
+ new IntegerColumnWriter(
+ type,
+ factory,
+ options));
+ case BYTE:
+ return std::unique_ptr<ColumnWriter>(
+ new ByteColumnWriter(
+ type,
+ factory,
+ options));
+ case BOOLEAN:
+ return std::unique_ptr<ColumnWriter>(
+ new BooleanColumnWriter(
+ type,
+ factory,
+ options));
+ case DOUBLE:
+ return std::unique_ptr<ColumnWriter>(
+ new DoubleColumnWriter(
+ type,
+ factory,
+ options,
+ false));
+ case FLOAT:
+ return std::unique_ptr<ColumnWriter>(
+ new DoubleColumnWriter(
+ type,
+ factory,
+ options,
+ true));
+ case BINARY:
+ return std::unique_ptr<ColumnWriter>(
+ new BinaryColumnWriter(
+ type,
+ factory,
+ options));
+ case STRING:
+ return std::unique_ptr<ColumnWriter>(
+ new StringColumnWriter(
+ type,
+ factory,
+ options));
+ case CHAR:
+ return std::unique_ptr<ColumnWriter>(
+ new CharColumnWriter(
+ type,
+ factory,
+ options));
+ case VARCHAR:
+ return std::unique_ptr<ColumnWriter>(
+ new VarCharColumnWriter(
+ type,
+ factory,
+ options));
+ case DATE:
+ return std::unique_ptr<ColumnWriter>(
+ new DateColumnWriter(
+ type,
+ factory,
+ options));
+ case TIMESTAMP:
+ return std::unique_ptr<ColumnWriter>(
+ new TimestampColumnWriter(
+ type,
+ factory,
+ options));
+ case DECIMAL:
+ if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_64) {
+ return std::unique_ptr<ColumnWriter>(
+ new Decimal64ColumnWriter(
+ type,
+ factory,
+ options));
+ } else if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_128) {
+ return std::unique_ptr<ColumnWriter>(
+ new Decimal128ColumnWriter(
+ type,
+ factory,
+ options));
+ } else {
+ throw NotImplementedYet("Decimal precision more than 38 is not "
+ "supported");
+ }
+ case LIST:
+ return std::unique_ptr<ColumnWriter>(
+ new ListColumnWriter(
+ type,
+ factory,
+ options));
+ case MAP:
+ return std::unique_ptr<ColumnWriter>(
+ new MapColumnWriter(
+ type,
+ factory,
+ options));
+ case UNION:
+ return std::unique_ptr<ColumnWriter>(
+ new UnionColumnWriter(
+ type,
+ factory,
+ options));
+ default:
+ throw NotImplementedYet("Type is not supported yet for creating "
+ "ColumnWriter.");
+ }
+ }
+}
diff --git a/contrib/libs/apache/orc/c++/src/ColumnWriter.hh b/contrib/libs/apache/orc/c++/src/ColumnWriter.hh
index cbbb5d00dc..4d7d71cb37 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnWriter.hh
+++ b/contrib/libs/apache/orc/c++/src/ColumnWriter.hh
@@ -1,221 +1,221 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ORC_COLUMN_WRITER_HH
-#define ORC_COLUMN_WRITER_HH
-
-#include "orc/Vector.hh"
-
-#include "BloomFilter.hh"
-#include "ByteRLE.hh"
-#include "Compression.hh"
-#include "orc/Exceptions.hh"
-#include "Statistics.hh"
-
-#include "wrap/orc-proto-wrapper.hh"
-
-namespace orc {
-
- class StreamsFactory {
- public:
- virtual ~StreamsFactory();
-
- /**
- * Get the stream for the given column/kind in this stripe.
- * @param kind the kind of the stream
- * @return the buffered output stream
- */
- virtual std::unique_ptr<BufferedOutputStream>
- createStream(proto::Stream_Kind kind) const = 0;
- };
-
- std::unique_ptr<StreamsFactory> createStreamsFactory(
- const WriterOptions& options,
- OutputStream * outStream);
-
- /**
- * record stream positions for row index
- */
- class RowIndexPositionRecorder : public PositionRecorder {
- public:
- virtual ~RowIndexPositionRecorder() override;
-
- RowIndexPositionRecorder(proto::RowIndexEntry& entry):
- rowIndexEntry(entry) {}
-
- virtual void add(uint64_t pos) override {
- rowIndexEntry.add_positions(pos);
- }
-
- private:
- proto::RowIndexEntry& rowIndexEntry;
- };
-
- /**
- * The interface for writing ORC data types.
- */
- class ColumnWriter {
- protected:
- std::unique_ptr<ByteRleEncoder> notNullEncoder;
- uint64_t columnId;
- std::unique_ptr<MutableColumnStatistics> colIndexStatistics;
- std::unique_ptr<MutableColumnStatistics> colStripeStatistics;
- std::unique_ptr<MutableColumnStatistics> colFileStatistics;
-
- bool enableIndex;
- // row index for this column, contains all RowIndexEntries in 1 stripe
- std::unique_ptr<proto::RowIndex> rowIndex;
- std::unique_ptr<proto::RowIndexEntry> rowIndexEntry;
- std::unique_ptr<RowIndexPositionRecorder> rowIndexPosition;
-
- // bloom filters are recorded per row group
- bool enableBloomFilter;
- std::unique_ptr<BloomFilterImpl> bloomFilter;
- std::unique_ptr<proto::BloomFilterIndex> bloomFilterIndex;
-
- public:
- ColumnWriter(const Type& type, const StreamsFactory& factory,
- const WriterOptions& options);
-
- virtual ~ColumnWriter();
-
- /**
- * Write the next group of values from this rowBatch.
- * @param rowBatch the row batch data to write
- * @param offset the starting point of row batch to write
- * @param numValues the number of values to write
- * @param incomingMask if null, all values are not null. Otherwise, it is
- * a mask (with at least numValues bytes) for which
- * values to write.
- */
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char * incomingMask);
- /**
- * Flush column writer output streams.
- * @param streams vector to store streams generated by flush()
- */
- virtual void flush(std::vector<proto::Stream>& streams);
-
- /**
- * Get estimated size of buffer used.
- * @return estimated size of buffer used
- */
- virtual uint64_t getEstimatedSize() const;
-
- /**
- * Get the encoding used by the writer for this column.
- * @param encodings vector to store the returned ColumnEncoding info
- */
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const = 0;
-
- /**
- * Get the stripe statistics for this column.
- * @param stats vector to store the returned stripe statistics
- */
- virtual void getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const;
-
- /**
- * Get the file statistics for this column.
- * @param stats vector to store the returned file statistics
- */
- virtual void getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const;
-
- /**
- * Merge index stats into stripe stats and reset index stats.
- */
- virtual void mergeRowGroupStatsIntoStripeStats();
-
- /**
- * Merge stripe stats into file stats and reset stripe stats.
- */
- virtual void mergeStripeStatsIntoFileStats();
-
- /**
- * Create a row index entry with the previous location and the current
- * index statistics. Also merges the index statistics into the stripe
- * statistics before they are cleared. Finally, it records the start of the
- * next index and ensures all of the children columns also create an entry.
- */
- virtual void createRowIndexEntry();
-
- /**
- * Create a new BloomFilter entry and add the previous one to BloomFilterIndex
- */
- virtual void addBloomFilterEntry();
-
- /**
- * Write row index streams for this column.
- * @param streams output list of ROW_INDEX streams
- */
- virtual void writeIndex(std::vector<proto::Stream> &streams) const;
-
- /**
- * Record positions for index.
- *
- * This function is called by createRowIndexEntry() and ColumnWriter's
- * constructor. So base classes do not need to call inherited classes'
- * recordPosition() function.
- */
- virtual void recordPosition() const;
-
- /**
- * Reset positions for index.
- */
- virtual void reset();
-
- /**
- * Write dictionary to streams for string columns
- */
- virtual void writeDictionary();
-
- protected:
- /**
- * Utility function to translate ColumnStatistics into protobuf form and
- * add it to output list.
- * @param statsList output list for protobuf stats
- * @param stats ColumnStatistics to be transformed and added
- */
- void getProtoBufStatistics(
- std::vector<proto::ColumnStatistics>& statsList,
- const MutableColumnStatistics* stats) const {
- proto::ColumnStatistics pbStats;
- stats->toProtoBuf(pbStats);
- statsList.push_back(pbStats);
- }
-
- protected:
- MemoryPool& memPool;
- std::unique_ptr<BufferedOutputStream> indexStream;
- std::unique_ptr<BufferedOutputStream> bloomFilterStream;
- };
-
- /**
- * Create a writer for the given type.
- */
- std::unique_ptr<ColumnWriter> buildWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
-}
-
-#endif
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_COLUMN_WRITER_HH
+#define ORC_COLUMN_WRITER_HH
+
+#include "orc/Vector.hh"
+
+#include "BloomFilter.hh"
+#include "ByteRLE.hh"
+#include "Compression.hh"
+#include "orc/Exceptions.hh"
+#include "Statistics.hh"
+
+#include "wrap/orc-proto-wrapper.hh"
+
+namespace orc {
+
+ class StreamsFactory {
+ public:
+ virtual ~StreamsFactory();
+
+ /**
+ * Get the stream for the given column/kind in this stripe.
+ * @param kind the kind of the stream
+ * @return the buffered output stream
+ */
+ virtual std::unique_ptr<BufferedOutputStream>
+ createStream(proto::Stream_Kind kind) const = 0;
+ };
+
+ std::unique_ptr<StreamsFactory> createStreamsFactory(
+ const WriterOptions& options,
+ OutputStream * outStream);
+
+ /**
+ * record stream positions for row index
+ */
+ class RowIndexPositionRecorder : public PositionRecorder {
+ public:
+ virtual ~RowIndexPositionRecorder() override;
+
+ RowIndexPositionRecorder(proto::RowIndexEntry& entry):
+ rowIndexEntry(entry) {}
+
+ virtual void add(uint64_t pos) override {
+ rowIndexEntry.add_positions(pos);
+ }
+
+ private:
+ proto::RowIndexEntry& rowIndexEntry;
+ };
+
+ /**
+ * The interface for writing ORC data types.
+ */
+ class ColumnWriter {
+ protected:
+ std::unique_ptr<ByteRleEncoder> notNullEncoder;
+ uint64_t columnId;
+ std::unique_ptr<MutableColumnStatistics> colIndexStatistics;
+ std::unique_ptr<MutableColumnStatistics> colStripeStatistics;
+ std::unique_ptr<MutableColumnStatistics> colFileStatistics;
+
+ bool enableIndex;
+ // row index for this column, contains all RowIndexEntries in 1 stripe
+ std::unique_ptr<proto::RowIndex> rowIndex;
+ std::unique_ptr<proto::RowIndexEntry> rowIndexEntry;
+ std::unique_ptr<RowIndexPositionRecorder> rowIndexPosition;
+
+ // bloom filters are recorded per row group
+ bool enableBloomFilter;
+ std::unique_ptr<BloomFilterImpl> bloomFilter;
+ std::unique_ptr<proto::BloomFilterIndex> bloomFilterIndex;
+
+ public:
+ ColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options);
+
+ virtual ~ColumnWriter();
+
+ /**
+ * Write the next group of values from this rowBatch.
+ * @param rowBatch the row batch data to write
+ * @param offset the starting point of row batch to write
+ * @param numValues the number of values to write
+ * @param incomingMask if null, all values are not null. Otherwise, it is
+ * a mask (with at least numValues bytes) for which
+ * values to write.
+ */
+ virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues,
+ const char * incomingMask);
+ /**
+ * Flush column writer output streams.
+ * @param streams vector to store streams generated by flush()
+ */
+ virtual void flush(std::vector<proto::Stream>& streams);
+
+ /**
+ * Get estimated size of buffer used.
+ * @return estimated size of buffer used
+ */
+ virtual uint64_t getEstimatedSize() const;
+
+ /**
+ * Get the encoding used by the writer for this column.
+ * @param encodings vector to store the returned ColumnEncoding info
+ */
+ virtual void getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const = 0;
+
+ /**
+ * Get the stripe statistics for this column.
+ * @param stats vector to store the returned stripe statistics
+ */
+ virtual void getStripeStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const;
+
+ /**
+ * Get the file statistics for this column.
+ * @param stats vector to store the returned file statistics
+ */
+ virtual void getFileStatistics(
+ std::vector<proto::ColumnStatistics>& stats) const;
+
+ /**
+ * Merge index stats into stripe stats and reset index stats.
+ */
+ virtual void mergeRowGroupStatsIntoStripeStats();
+
+ /**
+ * Merge stripe stats into file stats and reset stripe stats.
+ */
+ virtual void mergeStripeStatsIntoFileStats();
+
+ /**
+ * Create a row index entry with the previous location and the current
+ * index statistics. Also merges the index statistics into the stripe
+ * statistics before they are cleared. Finally, it records the start of the
+ * next index and ensures all of the children columns also create an entry.
+ */
+ virtual void createRowIndexEntry();
+
+ /**
+ * Create a new BloomFilter entry and add the previous one to BloomFilterIndex
+ */
+ virtual void addBloomFilterEntry();
+
+ /**
+ * Write row index streams for this column.
+ * @param streams output list of ROW_INDEX streams
+ */
+ virtual void writeIndex(std::vector<proto::Stream> &streams) const;
+
+ /**
+ * Record positions for index.
+ *
+ * This function is called by createRowIndexEntry() and ColumnWriter's
+ * constructor. So base classes do not need to call inherited classes'
+ * recordPosition() function.
+ */
+ virtual void recordPosition() const;
+
+ /**
+ * Reset positions for index.
+ */
+ virtual void reset();
+
+ /**
+ * Write dictionary to streams for string columns
+ */
+ virtual void writeDictionary();
+
+ protected:
+ /**
+ * Utility function to translate ColumnStatistics into protobuf form and
+ * add it to output list.
+ * @param statsList output list for protobuf stats
+ * @param stats ColumnStatistics to be transformed and added
+ */
+ void getProtoBufStatistics(
+ std::vector<proto::ColumnStatistics>& statsList,
+ const MutableColumnStatistics* stats) const {
+ proto::ColumnStatistics pbStats;
+ stats->toProtoBuf(pbStats);
+ statsList.push_back(pbStats);
+ }
+
+ protected:
+ MemoryPool& memPool;
+ std::unique_ptr<BufferedOutputStream> indexStream;
+ std::unique_ptr<BufferedOutputStream> bloomFilterStream;
+ };
+
+ /**
+ * Create a writer for the given type.
+ */
+ std::unique_ptr<ColumnWriter> buildWriter(
+ const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options);
+}
+
+#endif
diff --git a/contrib/libs/apache/orc/c++/src/Common.cc b/contrib/libs/apache/orc/c++/src/Common.cc
index dbf073797e..e50f085d30 100644
--- a/contrib/libs/apache/orc/c++/src/Common.cc
+++ b/contrib/libs/apache/orc/c++/src/Common.cc
@@ -1,75 +1,75 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "orc/Common.hh"
-
-#include <sstream>
-
-namespace orc {
-
- std::string compressionKindToString(CompressionKind kind) {
- switch (static_cast<int>(kind)) {
- case CompressionKind_NONE:
- return "none";
- case CompressionKind_ZLIB:
- return "zlib";
- case CompressionKind_SNAPPY:
- return "snappy";
- case CompressionKind_LZO:
- return "lzo";
- case CompressionKind_LZ4:
- return "lz4";
- case CompressionKind_ZSTD:
- return "zstd";
- }
- std::stringstream buffer;
- buffer << "unknown - " << kind;
- return buffer.str();
- }
-
- std::string writerVersionToString(WriterVersion version) {
- switch (static_cast<int>(version)) {
- case WriterVersion_ORIGINAL:
- return "original";
- case WriterVersion_HIVE_8732:
- return "HIVE-8732";
- case WriterVersion_HIVE_4243:
- return "HIVE-4243";
- case WriterVersion_HIVE_12055:
- return "HIVE-12055";
- case WriterVersion_HIVE_13083:
- return "HIVE-13083";
- case WriterVersion_ORC_101:
- return "ORC-101";
- case WriterVersion_ORC_135:
- return "ORC-135";
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Common.hh"
+
+#include <sstream>
+
+namespace orc {
+
+ std::string compressionKindToString(CompressionKind kind) {
+ switch (static_cast<int>(kind)) {
+ case CompressionKind_NONE:
+ return "none";
+ case CompressionKind_ZLIB:
+ return "zlib";
+ case CompressionKind_SNAPPY:
+ return "snappy";
+ case CompressionKind_LZO:
+ return "lzo";
+ case CompressionKind_LZ4:
+ return "lz4";
+ case CompressionKind_ZSTD:
+ return "zstd";
+ }
+ std::stringstream buffer;
+ buffer << "unknown - " << kind;
+ return buffer.str();
+ }
+
+ std::string writerVersionToString(WriterVersion version) {
+ switch (static_cast<int>(version)) {
+ case WriterVersion_ORIGINAL:
+ return "original";
+ case WriterVersion_HIVE_8732:
+ return "HIVE-8732";
+ case WriterVersion_HIVE_4243:
+ return "HIVE-4243";
+ case WriterVersion_HIVE_12055:
+ return "HIVE-12055";
+ case WriterVersion_HIVE_13083:
+ return "HIVE-13083";
+ case WriterVersion_ORC_101:
+ return "ORC-101";
+ case WriterVersion_ORC_135:
+ return "ORC-135";
case WriterVersion_ORC_517:
return "ORC-517";
case WriterVersion_ORC_203:
return "ORC-203";
case WriterVersion_ORC_14:
return "ORC-14";
- }
- std::stringstream buffer;
- buffer << "future - " << version;
- return buffer.str();
- }
-
+ }
+ std::stringstream buffer;
+ buffer << "future - " << version;
+ return buffer.str();
+ }
+
std::string writerIdToString(uint32_t id) {
switch (id) {
case ORC_JAVA_WRITER:
@@ -90,59 +90,59 @@ namespace orc {
}
}
- std::string streamKindToString(StreamKind kind) {
- switch (static_cast<int>(kind)) {
- case StreamKind_PRESENT:
- return "present";
- case StreamKind_DATA:
- return "data";
- case StreamKind_LENGTH:
- return "length";
- case StreamKind_DICTIONARY_DATA:
- return "dictionary";
- case StreamKind_DICTIONARY_COUNT:
- return "dictionary count";
- case StreamKind_SECONDARY:
- return "secondary";
- case StreamKind_ROW_INDEX:
- return "index";
- case StreamKind_BLOOM_FILTER:
- return "bloom";
- }
- std::stringstream buffer;
- buffer << "unknown - " << kind;
- return buffer.str();
- }
-
- std::string columnEncodingKindToString(ColumnEncodingKind kind) {
- switch (static_cast<int>(kind)) {
- case ColumnEncodingKind_DIRECT:
- return "direct";
- case ColumnEncodingKind_DICTIONARY:
- return "dictionary";
- case ColumnEncodingKind_DIRECT_V2:
- return "direct rle2";
- case ColumnEncodingKind_DICTIONARY_V2:
- return "dictionary rle2";
- }
- std::stringstream buffer;
- buffer << "unknown - " << kind;
- return buffer.str();
- }
-
- std::string FileVersion::toString() const {
- std::stringstream ss;
- ss << getMajor() << '.' << getMinor();
- return ss.str();
- }
-
- const FileVersion& FileVersion::v_0_11(){
- static FileVersion version(0,11);
- return version;
- }
-
- const FileVersion& FileVersion::v_0_12(){
- static FileVersion version(0,12);
- return version;
- }
-}
+ std::string streamKindToString(StreamKind kind) {
+ switch (static_cast<int>(kind)) {
+ case StreamKind_PRESENT:
+ return "present";
+ case StreamKind_DATA:
+ return "data";
+ case StreamKind_LENGTH:
+ return "length";
+ case StreamKind_DICTIONARY_DATA:
+ return "dictionary";
+ case StreamKind_DICTIONARY_COUNT:
+ return "dictionary count";
+ case StreamKind_SECONDARY:
+ return "secondary";
+ case StreamKind_ROW_INDEX:
+ return "index";
+ case StreamKind_BLOOM_FILTER:
+ return "bloom";
+ }
+ std::stringstream buffer;
+ buffer << "unknown - " << kind;
+ return buffer.str();
+ }
+
+ std::string columnEncodingKindToString(ColumnEncodingKind kind) {
+ switch (static_cast<int>(kind)) {
+ case ColumnEncodingKind_DIRECT:
+ return "direct";
+ case ColumnEncodingKind_DICTIONARY:
+ return "dictionary";
+ case ColumnEncodingKind_DIRECT_V2:
+ return "direct rle2";
+ case ColumnEncodingKind_DICTIONARY_V2:
+ return "dictionary rle2";
+ }
+ std::stringstream buffer;
+ buffer << "unknown - " << kind;
+ return buffer.str();
+ }
+
+ std::string FileVersion::toString() const {
+ std::stringstream ss;
+ ss << getMajor() << '.' << getMinor();
+ return ss.str();
+ }
+
+ const FileVersion& FileVersion::v_0_11(){
+ static FileVersion version(0,11);
+ return version;
+ }
+
+ const FileVersion& FileVersion::v_0_12(){
+ static FileVersion version(0,12);
+ return version;
+ }
+}
diff --git a/contrib/libs/apache/orc/c++/src/Compression.cc b/contrib/libs/apache/orc/c++/src/Compression.cc
index 4278ed7aae..057641ec1f 100644
--- a/contrib/libs/apache/orc/c++/src/Compression.cc
+++ b/contrib/libs/apache/orc/c++/src/Compression.cc
@@ -1,1071 +1,1071 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Adaptor.hh"
-#include "Compression.hh"
-#include "orc/Exceptions.hh"
-#include "LzoDecompressor.hh"
-#include "lz4.h"
-
-#include <algorithm>
-#include <iomanip>
-#include <iostream>
-#include <sstream>
-
-#include "zlib.h"
-#include "zstd.h"
-
-#include "wrap/snappy-wrapper.h"
-
-#ifndef ZSTD_CLEVEL_DEFAULT
-#define ZSTD_CLEVEL_DEFAULT 3
-#endif
-
-namespace orc {
-
- class CompressionStreamBase: public BufferedOutputStream {
- public:
- CompressionStreamBase(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool);
-
- virtual bool Next(void** data, int*size) override = 0;
- virtual void BackUp(int count) override;
-
- virtual std::string getName() const override = 0;
- virtual uint64_t flush() override;
-
- virtual bool isCompressed() const override { return true; }
- virtual uint64_t getSize() const override;
-
- protected:
- void writeHeader(char * buffer, size_t compressedSize, bool original) {
- buffer[0] = static_cast<char>((compressedSize << 1) + (original ? 1 : 0));
- buffer[1] = static_cast<char>(compressedSize >> 7);
- buffer[2] = static_cast<char>(compressedSize >> 15);
- }
-
- // ensure enough room for compression block header
- void ensureHeader();
-
- // Buffer to hold uncompressed data until user calls Next()
- DataBuffer<unsigned char> rawInputBuffer;
-
- // Compress level
- int level;
-
- // Compressed data output buffer
- char * outputBuffer;
-
- // Size for compressionBuffer
- int bufferSize;
-
- // Compress output position
- int outputPosition;
-
- // Compress output buffer size
- int outputSize;
- };
-
- CompressionStreamBase::CompressionStreamBase(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool) :
- BufferedOutputStream(pool,
- outStream,
- capacity,
- blockSize),
- rawInputBuffer(pool, blockSize),
- level(compressionLevel),
- outputBuffer(nullptr),
- bufferSize(0),
- outputPosition(0),
- outputSize(0) {
- // PASS
- }
-
- void CompressionStreamBase::BackUp(int count) {
- if (count > bufferSize) {
- throw std::logic_error("Can't backup that much!");
- }
- bufferSize -= count;
- }
-
- uint64_t CompressionStreamBase::flush() {
- void * data;
- int size;
- if (!Next(&data, &size)) {
- throw std::runtime_error("Failed to flush compression buffer.");
- }
- BufferedOutputStream::BackUp(outputSize - outputPosition);
- bufferSize = outputSize = outputPosition = 0;
- return BufferedOutputStream::flush();
- }
-
- uint64_t CompressionStreamBase::getSize() const {
- return BufferedOutputStream::getSize() -
- static_cast<uint64_t>(outputSize - outputPosition);
- }
-
- void CompressionStreamBase::ensureHeader() {
- // adjust 3 bytes for the compression header
- if (outputPosition + 3 >= outputSize) {
- int newPosition = outputPosition + 3 - outputSize;
- if (!BufferedOutputStream::Next(
- reinterpret_cast<void **>(&outputBuffer),
- &outputSize)) {
- throw std::runtime_error(
- "Failed to get next output buffer from output stream.");
- }
- outputPosition = newPosition;
- } else {
- outputPosition += 3;
- }
- }
-
- /**
- * Streaming compression base class
- */
- class CompressionStream: public CompressionStreamBase {
- public:
- CompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool);
-
- virtual bool Next(void** data, int*size) override;
- virtual std::string getName() const override = 0;
-
- protected:
- // return total compressed size
- virtual uint64_t doStreamingCompression() = 0;
- };
-
- CompressionStream::CompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool) :
- CompressionStreamBase(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool) {
- // PASS
- }
-
- bool CompressionStream::Next(void** data, int*size) {
- if (bufferSize != 0) {
- ensureHeader();
-
- uint64_t totalCompressedSize = doStreamingCompression();
-
- char * header = outputBuffer + outputPosition - totalCompressedSize - 3;
- if (totalCompressedSize >= static_cast<unsigned long>(bufferSize)) {
- writeHeader(header, static_cast<size_t>(bufferSize), true);
- memcpy(
- header + 3,
- rawInputBuffer.data(),
- static_cast<size_t>(bufferSize));
-
- int backup = static_cast<int>(totalCompressedSize) - bufferSize;
- BufferedOutputStream::BackUp(backup);
- outputPosition -= backup;
- outputSize -= backup;
- } else {
- writeHeader(header, totalCompressedSize, false);
- }
- }
-
- *data = rawInputBuffer.data();
- *size = static_cast<int>(rawInputBuffer.size());
- bufferSize = *size;
-
- return true;
- }
-
- class ZlibCompressionStream: public CompressionStream {
- public:
- ZlibCompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool);
-
- virtual ~ZlibCompressionStream() override {
- end();
- }
-
- virtual std::string getName() const override;
-
- protected:
- virtual uint64_t doStreamingCompression() override;
-
- private:
- void init();
- void end();
- z_stream strm;
- };
-
- ZlibCompressionStream::ZlibCompressionStream(
- OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool)
- : CompressionStream(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool) {
- init();
- }
-
- uint64_t ZlibCompressionStream::doStreamingCompression() {
- if (deflateReset(&strm) != Z_OK) {
- throw std::runtime_error("Failed to reset inflate.");
- }
-
- strm.avail_in = static_cast<unsigned int>(bufferSize);
- strm.next_in = rawInputBuffer.data();
-
- do {
- if (outputPosition >= outputSize) {
- if (!BufferedOutputStream::Next(
- reinterpret_cast<void **>(&outputBuffer),
- &outputSize)) {
- throw std::runtime_error(
- "Failed to get next output buffer from output stream.");
- }
- outputPosition = 0;
- }
- strm.next_out = reinterpret_cast<unsigned char *>
- (outputBuffer + outputPosition);
- strm.avail_out = static_cast<unsigned int>
- (outputSize - outputPosition);
-
- int ret = deflate(&strm, Z_FINISH);
- outputPosition = outputSize - static_cast<int>(strm.avail_out);
-
- if (ret == Z_STREAM_END) {
- break;
- } else if (ret == Z_OK) {
- // needs more buffer so will continue the loop
- } else {
- throw std::runtime_error("Failed to deflate input data.");
- }
- } while (strm.avail_out == 0);
-
- return strm.total_out;
- }
-
- std::string ZlibCompressionStream::getName() const {
- return "ZlibCompressionStream";
- }
-
-DIAGNOSTIC_PUSH
-
-#if defined(__GNUC__) || defined(__clang__)
- DIAGNOSTIC_IGNORE("-Wold-style-cast")
-#endif
-
- void ZlibCompressionStream::init() {
- strm.zalloc = nullptr;
- strm.zfree = nullptr;
- strm.opaque = nullptr;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Adaptor.hh"
+#include "Compression.hh"
+#include "orc/Exceptions.hh"
+#include "LzoDecompressor.hh"
+#include "lz4.h"
+
+#include <algorithm>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+
+#include "zlib.h"
+#include "zstd.h"
+
+#include "wrap/snappy-wrapper.h"
+
+#ifndef ZSTD_CLEVEL_DEFAULT
+#define ZSTD_CLEVEL_DEFAULT 3
+#endif
+
+namespace orc {
+
+ class CompressionStreamBase: public BufferedOutputStream {
+ public:
+ CompressionStreamBase(OutputStream * outStream,
+ int compressionLevel,
+ uint64_t capacity,
+ uint64_t blockSize,
+ MemoryPool& pool);
+
+ virtual bool Next(void** data, int*size) override = 0;
+ virtual void BackUp(int count) override;
+
+ virtual std::string getName() const override = 0;
+ virtual uint64_t flush() override;
+
+ virtual bool isCompressed() const override { return true; }
+ virtual uint64_t getSize() const override;
+
+ protected:
+ void writeHeader(char * buffer, size_t compressedSize, bool original) {
+ buffer[0] = static_cast<char>((compressedSize << 1) + (original ? 1 : 0));
+ buffer[1] = static_cast<char>(compressedSize >> 7);
+ buffer[2] = static_cast<char>(compressedSize >> 15);
+ }
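+
+      // For example, writeHeader() for a 100,000 byte compressed chunk emits the
+      // little-endian bytes 0x40 0x0d 0x03 (100000 << 1 == 0x30d40); a set low
+      // bit instead marks the chunk as stored uncompressed ("original").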
+
+ // ensure enough room for compression block header
+ void ensureHeader();
+
+ // Buffer to hold uncompressed data until user calls Next()
+ DataBuffer<unsigned char> rawInputBuffer;
+
+ // Compress level
+ int level;
+
+ // Compressed data output buffer
+ char * outputBuffer;
+
+ // Size for compressionBuffer
+ int bufferSize;
+
+ // Compress output position
+ int outputPosition;
+
+ // Compress output buffer size
+ int outputSize;
+ };
+
+ CompressionStreamBase::CompressionStreamBase(OutputStream * outStream,
+ int compressionLevel,
+ uint64_t capacity,
+ uint64_t blockSize,
+ MemoryPool& pool) :
+ BufferedOutputStream(pool,
+ outStream,
+ capacity,
+ blockSize),
+ rawInputBuffer(pool, blockSize),
+ level(compressionLevel),
+ outputBuffer(nullptr),
+ bufferSize(0),
+ outputPosition(0),
+ outputSize(0) {
+ // PASS
+ }
+
+ void CompressionStreamBase::BackUp(int count) {
+ if (count > bufferSize) {
+ throw std::logic_error("Can't backup that much!");
+ }
+ bufferSize -= count;
+ }
+
+ uint64_t CompressionStreamBase::flush() {
+ void * data;
+ int size;
+ if (!Next(&data, &size)) {
+ throw std::runtime_error("Failed to flush compression buffer.");
+ }
+ BufferedOutputStream::BackUp(outputSize - outputPosition);
+ bufferSize = outputSize = outputPosition = 0;
+ return BufferedOutputStream::flush();
+ }
+
+ uint64_t CompressionStreamBase::getSize() const {
+ return BufferedOutputStream::getSize() -
+ static_cast<uint64_t>(outputSize - outputPosition);
+ }
+
+ void CompressionStreamBase::ensureHeader() {
+ // adjust 3 bytes for the compression header
+ if (outputPosition + 3 >= outputSize) {
+ int newPosition = outputPosition + 3 - outputSize;
+ if (!BufferedOutputStream::Next(
+ reinterpret_cast<void **>(&outputBuffer),
+ &outputSize)) {
+ throw std::runtime_error(
+ "Failed to get next output buffer from output stream.");
+ }
+ outputPosition = newPosition;
+ } else {
+ outputPosition += 3;
+ }
+ }
+
+ /**
+ * Streaming compression base class
+ */
+ class CompressionStream: public CompressionStreamBase {
+ public:
+ CompressionStream(OutputStream * outStream,
+ int compressionLevel,
+ uint64_t capacity,
+ uint64_t blockSize,
+ MemoryPool& pool);
+
+ virtual bool Next(void** data, int*size) override;
+ virtual std::string getName() const override = 0;
+
+ protected:
+ // return total compressed size
+ virtual uint64_t doStreamingCompression() = 0;
+ };
+
+ CompressionStream::CompressionStream(OutputStream * outStream,
+ int compressionLevel,
+ uint64_t capacity,
+ uint64_t blockSize,
+ MemoryPool& pool) :
+ CompressionStreamBase(outStream,
+ compressionLevel,
+ capacity,
+ blockSize,
+ pool) {
+ // PASS
+ }
+
+ bool CompressionStream::Next(void** data, int*size) {
+ if (bufferSize != 0) {
+ ensureHeader();
+
+ uint64_t totalCompressedSize = doStreamingCompression();
+
+ char * header = outputBuffer + outputPosition - totalCompressedSize - 3;
+ if (totalCompressedSize >= static_cast<unsigned long>(bufferSize)) {
+ writeHeader(header, static_cast<size_t>(bufferSize), true);
+ memcpy(
+ header + 3,
+ rawInputBuffer.data(),
+ static_cast<size_t>(bufferSize));
+
+ int backup = static_cast<int>(totalCompressedSize) - bufferSize;
+ BufferedOutputStream::BackUp(backup);
+ outputPosition -= backup;
+ outputSize -= backup;
+ } else {
+ writeHeader(header, totalCompressedSize, false);
+ }
+ }
+
+ *data = rawInputBuffer.data();
+ *size = static_cast<int>(rawInputBuffer.size());
+ bufferSize = *size;
+
+ return true;
+ }
+
+ class ZlibCompressionStream: public CompressionStream {
+ public:
+ ZlibCompressionStream(OutputStream * outStream,
+ int compressionLevel,
+ uint64_t capacity,
+ uint64_t blockSize,
+ MemoryPool& pool);
+
+ virtual ~ZlibCompressionStream() override {
+ end();
+ }
+
+ virtual std::string getName() const override;
+
+ protected:
+ virtual uint64_t doStreamingCompression() override;
+
+ private:
+ void init();
+ void end();
+ z_stream strm;
+ };
+
+ ZlibCompressionStream::ZlibCompressionStream(
+ OutputStream * outStream,
+ int compressionLevel,
+ uint64_t capacity,
+ uint64_t blockSize,
+ MemoryPool& pool)
+ : CompressionStream(outStream,
+ compressionLevel,
+ capacity,
+ blockSize,
+ pool) {
+ init();
+ }
+
+ uint64_t ZlibCompressionStream::doStreamingCompression() {
+ if (deflateReset(&strm) != Z_OK) {
+ throw std::runtime_error("Failed to reset inflate.");
+ }
+
+ strm.avail_in = static_cast<unsigned int>(bufferSize);
+ strm.next_in = rawInputBuffer.data();
+
+ do {
+ if (outputPosition >= outputSize) {
+ if (!BufferedOutputStream::Next(
+ reinterpret_cast<void **>(&outputBuffer),
+ &outputSize)) {
+ throw std::runtime_error(
+ "Failed to get next output buffer from output stream.");
+ }
+ outputPosition = 0;
+ }
+ strm.next_out = reinterpret_cast<unsigned char *>
+ (outputBuffer + outputPosition);
+ strm.avail_out = static_cast<unsigned int>
+ (outputSize - outputPosition);
+
+ int ret = deflate(&strm, Z_FINISH);
+ outputPosition = outputSize - static_cast<int>(strm.avail_out);
+
+ if (ret == Z_STREAM_END) {
+ break;
+ } else if (ret == Z_OK) {
+ // needs more buffer so will continue the loop
+ } else {
+ throw std::runtime_error("Failed to deflate input data.");
+ }
+ } while (strm.avail_out == 0);
+
+ return strm.total_out;
+ }
+
+ std::string ZlibCompressionStream::getName() const {
+ return "ZlibCompressionStream";
+ }
+
+DIAGNOSTIC_PUSH
+
+#if defined(__GNUC__) || defined(__clang__)
+ DIAGNOSTIC_IGNORE("-Wold-style-cast")
+#endif
+
+ void ZlibCompressionStream::init() {
+ strm.zalloc = nullptr;
+ strm.zfree = nullptr;
+ strm.opaque = nullptr;
strm.next_in = nullptr;
-
- if (deflateInit2(&strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY)
- != Z_OK) {
- throw std::runtime_error("Error while calling deflateInit2() for zlib.");
- }
- }
-
- void ZlibCompressionStream::end() {
- (void)deflateEnd(&strm);
- }
-
-DIAGNOSTIC_PUSH
-
- enum DecompressState { DECOMPRESS_HEADER,
- DECOMPRESS_START,
- DECOMPRESS_CONTINUE,
- DECOMPRESS_ORIGINAL,
- DECOMPRESS_EOF};
-
- class ZlibDecompressionStream: public SeekableInputStream {
- public:
- ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& pool);
- virtual ~ZlibDecompressionStream() override;
- virtual bool Next(const void** data, int*size) override;
- virtual void BackUp(int count) override;
- virtual bool Skip(int count) override;
- virtual int64_t ByteCount() const override;
- virtual void seek(PositionProvider& position) override;
- virtual std::string getName() const override;
-
- private:
- void readBuffer(bool failOnEof) {
- int length;
- if (!input->Next(reinterpret_cast<const void**>(&inputBuffer),
- &length)) {
- if (failOnEof) {
- throw ParseError("Read past EOF in "
- "ZlibDecompressionStream::readBuffer");
- }
- state = DECOMPRESS_EOF;
- inputBuffer = nullptr;
- inputBufferEnd = nullptr;
- } else {
- inputBufferEnd = inputBuffer + length;
- }
- }
-
- uint32_t readByte(bool failOnEof) {
- if (inputBuffer == inputBufferEnd) {
- readBuffer(failOnEof);
- if (state == DECOMPRESS_EOF) {
- return 0;
- }
- }
- return static_cast<unsigned char>(*(inputBuffer++));
- }
-
- void readHeader() {
- uint32_t header = readByte(false);
- if (state != DECOMPRESS_EOF) {
- header |= readByte(true) << 8;
- header |= readByte(true) << 16;
- if (header & 1) {
- state = DECOMPRESS_ORIGINAL;
- } else {
- state = DECOMPRESS_START;
- }
- remainingLength = header >> 1;
- } else {
- remainingLength = 0;
- }
- }
-
- MemoryPool& pool;
- const size_t blockSize;
- std::unique_ptr<SeekableInputStream> input;
- z_stream zstream;
- DataBuffer<char> buffer;
-
- // the current state
- DecompressState state;
-
- // the start of the current buffer
- // This pointer is not owned by us. It is either owned by zstream or
- // the underlying stream.
- const char* outputBuffer;
- // the size of the current buffer
- size_t outputBufferLength;
- // the size of the current chunk
- size_t remainingLength;
-
- // the last buffer returned from the input
- const char *inputBuffer;
- const char *inputBufferEnd;
-
- // roughly the number of bytes returned
- off_t bytesReturned;
- };
-
-DIAGNOSTIC_PUSH
-
-#if defined(__GNUC__) || defined(__clang__)
- DIAGNOSTIC_IGNORE("-Wold-style-cast")
-#endif
-
- ZlibDecompressionStream::ZlibDecompressionStream
- (std::unique_ptr<SeekableInputStream> inStream,
- size_t _blockSize,
- MemoryPool& _pool
- ): pool(_pool),
- blockSize(_blockSize),
- buffer(pool, _blockSize) {
- input.reset(inStream.release());
- zstream.next_in = nullptr;
- zstream.avail_in = 0;
- zstream.zalloc = nullptr;
- zstream.zfree = nullptr;
- zstream.opaque = nullptr;
- zstream.next_out = reinterpret_cast<Bytef*>(buffer.data());
- zstream.avail_out = static_cast<uInt>(blockSize);
- int64_t result = inflateInit2(&zstream, -15);
- switch (result) {
- case Z_OK:
- break;
- case Z_MEM_ERROR:
- throw std::logic_error("Memory error from inflateInit2");
- case Z_VERSION_ERROR:
- throw std::logic_error("Version error from inflateInit2");
- case Z_STREAM_ERROR:
- throw std::logic_error("Stream error from inflateInit2");
- default:
- throw std::logic_error("Unknown error from inflateInit2");
- }
- outputBuffer = nullptr;
- outputBufferLength = 0;
- remainingLength = 0;
- state = DECOMPRESS_HEADER;
- inputBuffer = nullptr;
- inputBufferEnd = nullptr;
- bytesReturned = 0;
- }
-
-DIAGNOSTIC_POP
-
- ZlibDecompressionStream::~ZlibDecompressionStream() {
- int64_t result = inflateEnd(&zstream);
- if (result != Z_OK) {
- // really can't throw in destructors
- std::cout << "Error in ~ZlibDecompressionStream() " << result << "\n";
- }
- }
-
- bool ZlibDecompressionStream::Next(const void** data, int*size) {
- // if the user pushed back, return them the partial buffer
- if (outputBufferLength) {
- *data = outputBuffer;
- *size = static_cast<int>(outputBufferLength);
- outputBuffer += outputBufferLength;
- outputBufferLength = 0;
- return true;
- }
- if (state == DECOMPRESS_HEADER || remainingLength == 0) {
- readHeader();
- }
- if (state == DECOMPRESS_EOF) {
- return false;
- }
- if (inputBuffer == inputBufferEnd) {
- readBuffer(true);
- }
- size_t availSize =
- std::min(static_cast<size_t>(inputBufferEnd - inputBuffer),
- remainingLength);
- if (state == DECOMPRESS_ORIGINAL) {
- *data = inputBuffer;
- *size = static_cast<int>(availSize);
- outputBuffer = inputBuffer + availSize;
- outputBufferLength = 0;
- } else if (state == DECOMPRESS_START) {
- zstream.next_in =
- reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
- zstream.avail_in = static_cast<uInt>(availSize);
- outputBuffer = buffer.data();
- zstream.next_out =
- reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer));
- zstream.avail_out = static_cast<uInt>(blockSize);
- if (inflateReset(&zstream) != Z_OK) {
- throw std::logic_error("Bad inflateReset in "
- "ZlibDecompressionStream::Next");
- }
- int64_t result;
- do {
- result = inflate(&zstream, availSize == remainingLength ? Z_FINISH :
- Z_SYNC_FLUSH);
- switch (result) {
- case Z_OK:
- remainingLength -= availSize;
- inputBuffer += availSize;
- readBuffer(true);
- availSize =
- std::min(static_cast<size_t>(inputBufferEnd - inputBuffer),
- remainingLength);
- zstream.next_in =
- reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
- zstream.avail_in = static_cast<uInt>(availSize);
- break;
- case Z_STREAM_END:
- break;
- case Z_BUF_ERROR:
- throw std::logic_error("Buffer error in "
- "ZlibDecompressionStream::Next");
- case Z_DATA_ERROR:
- throw std::logic_error("Data error in "
- "ZlibDecompressionStream::Next");
- case Z_STREAM_ERROR:
- throw std::logic_error("Stream error in "
- "ZlibDecompressionStream::Next");
- default:
- throw std::logic_error("Unknown error in "
- "ZlibDecompressionStream::Next");
- }
- } while (result != Z_STREAM_END);
- *size = static_cast<int>(blockSize - zstream.avail_out);
- *data = outputBuffer;
- outputBufferLength = 0;
- outputBuffer += *size;
- } else {
- throw std::logic_error("Unknown compression state in "
- "ZlibDecompressionStream::Next");
- }
- inputBuffer += availSize;
- remainingLength -= availSize;
- bytesReturned += *size;
- return true;
- }
-
- void ZlibDecompressionStream::BackUp(int count) {
- if (outputBuffer == nullptr || outputBufferLength != 0) {
- throw std::logic_error("Backup without previous Next in "
- "ZlibDecompressionStream");
- }
- outputBuffer -= static_cast<size_t>(count);
- outputBufferLength = static_cast<size_t>(count);
- bytesReturned -= count;
- }
-
- bool ZlibDecompressionStream::Skip(int count) {
- bytesReturned += count;
- // this is a stupid implementation for now.
- // should skip entire blocks without decompressing
- while (count > 0) {
- const void *ptr;
- int len;
- if (!Next(&ptr, &len)) {
- return false;
- }
- if (len > count) {
- BackUp(len - count);
- count = 0;
- } else {
- count -= len;
- }
- }
- return true;
- }
-
- int64_t ZlibDecompressionStream::ByteCount() const {
- return bytesReturned;
- }
-
- void ZlibDecompressionStream::seek(PositionProvider& position) {
- // clear state to force seek to read from the right position
- state = DECOMPRESS_HEADER;
- outputBuffer = nullptr;
- outputBufferLength = 0;
- remainingLength = 0;
- inputBuffer = nullptr;
- inputBufferEnd = nullptr;
-
- input->seek(position);
- bytesReturned = static_cast<off_t>(input->ByteCount());
- if (!Skip(static_cast<int>(position.next()))) {
- throw ParseError("Bad skip in ZlibDecompressionStream::seek");
- }
- }
-
- std::string ZlibDecompressionStream::getName() const {
- std::ostringstream result;
- result << "zlib(" << input->getName() << ")";
- return result.str();
- }
-
- class BlockDecompressionStream: public SeekableInputStream {
- public:
- BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& pool);
-
- virtual ~BlockDecompressionStream() override {}
- virtual bool Next(const void** data, int*size) override;
- virtual void BackUp(int count) override;
- virtual bool Skip(int count) override;
- virtual int64_t ByteCount() const override;
- virtual void seek(PositionProvider& position) override;
- virtual std::string getName() const override = 0;
-
- protected:
- virtual uint64_t decompress(const char *input, uint64_t length,
- char *output, size_t maxOutputLength) = 0;
-
- std::string getStreamName() const {
- return input->getName();
- }
-
- private:
- void readBuffer(bool failOnEof) {
- int length;
- if (!input->Next(reinterpret_cast<const void**>(&inputBufferPtr),
- &length)) {
- if (failOnEof) {
- throw ParseError(getName() + " read past EOF");
- }
- state = DECOMPRESS_EOF;
- inputBufferPtr = nullptr;
- inputBufferPtrEnd = nullptr;
- } else {
- inputBufferPtrEnd = inputBufferPtr + length;
- }
- }
-
- uint32_t readByte(bool failOnEof) {
- if (inputBufferPtr == inputBufferPtrEnd) {
- readBuffer(failOnEof);
- if (state == DECOMPRESS_EOF) {
- return 0;
- }
- }
- return static_cast<unsigned char>(*(inputBufferPtr++));
- }
-
- void readHeader() {
- uint32_t header = readByte(false);
- if (state != DECOMPRESS_EOF) {
- header |= readByte(true) << 8;
- header |= readByte(true) << 16;
- if (header & 1) {
- state = DECOMPRESS_ORIGINAL;
- } else {
- state = DECOMPRESS_START;
- }
- remainingLength = header >> 1;
- } else {
- remainingLength = 0;
- }
- }
-
- std::unique_ptr<SeekableInputStream> input;
- MemoryPool& pool;
-
- // may need to stitch together multiple input buffers
- // to give snappy a contiguous block
- DataBuffer<char> inputBuffer;
-
- // uncompressed output
- DataBuffer<char> outputBuffer;
-
- // the current state
- DecompressState state;
-
- // the start of the current output buffer
- const char* outputBufferPtr;
- // the size of the current output buffer
- size_t outputBufferLength;
-
- // the size of the current chunk
- size_t remainingLength;
-
- // the last buffer returned from the input
- const char *inputBufferPtr;
- const char *inputBufferPtrEnd;
-
- // bytes returned by this stream
- off_t bytesReturned;
- };
-
- BlockDecompressionStream::BlockDecompressionStream
- (std::unique_ptr<SeekableInputStream> inStream,
- size_t bufferSize,
- MemoryPool& _pool
- ) : pool(_pool),
- inputBuffer(pool, bufferSize),
- outputBuffer(pool, bufferSize),
- state(DECOMPRESS_HEADER),
- outputBufferPtr(nullptr),
- outputBufferLength(0),
- remainingLength(0),
- inputBufferPtr(nullptr),
- inputBufferPtrEnd(nullptr),
- bytesReturned(0) {
- input.reset(inStream.release());
- }
-
- bool BlockDecompressionStream::Next(const void** data, int*size) {
- // if the user pushed back, return them the partial buffer
- if (outputBufferLength) {
- *data = outputBufferPtr;
- *size = static_cast<int>(outputBufferLength);
- outputBufferPtr += outputBufferLength;
- bytesReturned += static_cast<off_t>(outputBufferLength);
- outputBufferLength = 0;
- return true;
- }
- if (state == DECOMPRESS_HEADER || remainingLength == 0) {
- readHeader();
- }
- if (state == DECOMPRESS_EOF) {
- return false;
- }
- if (inputBufferPtr == inputBufferPtrEnd) {
- readBuffer(true);
- }
-
- size_t availSize =
- std::min(static_cast<size_t>(inputBufferPtrEnd - inputBufferPtr),
- remainingLength);
- if (state == DECOMPRESS_ORIGINAL) {
- *data = inputBufferPtr;
- *size = static_cast<int>(availSize);
- outputBufferPtr = inputBufferPtr + availSize;
- outputBufferLength = 0;
- inputBufferPtr += availSize;
- remainingLength -= availSize;
- } else if (state == DECOMPRESS_START) {
- // Get contiguous bytes of compressed block.
- const char *compressed = inputBufferPtr;
- if (remainingLength == availSize) {
- inputBufferPtr += availSize;
- } else {
- // Did not read enough from input.
- if (inputBuffer.capacity() < remainingLength) {
- inputBuffer.resize(remainingLength);
- }
- ::memcpy(inputBuffer.data(), inputBufferPtr, availSize);
- inputBufferPtr += availSize;
- compressed = inputBuffer.data();
-
- for (size_t pos = availSize; pos < remainingLength; ) {
- readBuffer(true);
- size_t avail =
- std::min(static_cast<size_t>(inputBufferPtrEnd -
- inputBufferPtr),
- remainingLength - pos);
- ::memcpy(inputBuffer.data() + pos, inputBufferPtr, avail);
- pos += avail;
- inputBufferPtr += avail;
- }
- }
-
- outputBufferLength = decompress(compressed, remainingLength,
- outputBuffer.data(),
- outputBuffer.capacity());
-
- remainingLength = 0;
- state = DECOMPRESS_HEADER;
- *data = outputBuffer.data();
- *size = static_cast<int>(outputBufferLength);
- outputBufferPtr = outputBuffer.data() + outputBufferLength;
- outputBufferLength = 0;
- }
-
- bytesReturned += *size;
- return true;
- }
-
- void BlockDecompressionStream::BackUp(int count) {
- if (outputBufferPtr == nullptr || outputBufferLength != 0) {
- throw std::logic_error("Backup without previous Next in "+getName());
- }
- outputBufferPtr -= static_cast<size_t>(count);
- outputBufferLength = static_cast<size_t>(count);
- bytesReturned -= count;
- }
-
- bool BlockDecompressionStream::Skip(int count) {
- bytesReturned += count;
- // this is a stupid implementation for now.
- // should skip entire blocks without decompressing
- while (count > 0) {
- const void *ptr;
- int len;
- if (!Next(&ptr, &len)) {
- return false;
- }
- if (len > count) {
- BackUp(len - count);
- count = 0;
- } else {
- count -= len;
- }
- }
- return true;
- }
-
- int64_t BlockDecompressionStream::ByteCount() const {
- return bytesReturned;
- }
-
- void BlockDecompressionStream::seek(PositionProvider& position) {
- // clear state to force seek to read from the right position
- state = DECOMPRESS_HEADER;
- outputBufferPtr = nullptr;
- outputBufferLength = 0;
- remainingLength = 0;
- inputBufferPtr = nullptr;
- inputBufferPtrEnd = nullptr;
-
- input->seek(position);
- if (!Skip(static_cast<int>(position.next()))) {
- throw ParseError("Bad skip in " + getName());
- }
- }
-
- class SnappyDecompressionStream: public BlockDecompressionStream {
- public:
- SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& pool
- ): BlockDecompressionStream
- (std::move(inStream),
- blockSize,
- pool) {
- // PASS
- }
-
- std::string getName() const override {
- std::ostringstream result;
- result << "snappy(" << getStreamName() << ")";
- return result.str();
- }
-
- protected:
- virtual uint64_t decompress(const char *input, uint64_t length,
- char *output, size_t maxOutputLength
- ) override;
- };
-
- uint64_t SnappyDecompressionStream::decompress(const char *input,
- uint64_t length,
- char *output,
- size_t maxOutputLength) {
- size_t outLength;
- if (!snappy::GetUncompressedLength(input, length, &outLength)) {
- throw ParseError("SnappyDecompressionStream choked on corrupt input");
- }
-
- if (outLength > maxOutputLength) {
- throw std::logic_error("Snappy length exceeds block size");
- }
-
- if (!snappy::RawUncompress(input, length, output)) {
- throw ParseError("SnappyDecompressionStream choked on corrupt input");
- }
- return outLength;
- }
-
- class LzoDecompressionStream: public BlockDecompressionStream {
- public:
- LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& pool
- ): BlockDecompressionStream
- (std::move(inStream),
- blockSize,
- pool) {
- // PASS
- }
-
- std::string getName() const override {
- std::ostringstream result;
- result << "lzo(" << getStreamName() << ")";
- return result.str();
- }
-
- protected:
- virtual uint64_t decompress(const char *input, uint64_t length,
- char *output, size_t maxOutputLength
- ) override;
- };
-
- uint64_t LzoDecompressionStream::decompress(const char *input,
- uint64_t length,
- char *output,
- size_t maxOutputLength) {
- return lzoDecompress(input, input + length, output,
- output + maxOutputLength);
- }
-
- class Lz4DecompressionStream: public BlockDecompressionStream {
- public:
- Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& pool
- ): BlockDecompressionStream
- (std::move(inStream),
- blockSize,
- pool) {
- // PASS
- }
-
- std::string getName() const override {
- std::ostringstream result;
- result << "lz4(" << getStreamName() << ")";
- return result.str();
- }
-
- protected:
- virtual uint64_t decompress(const char *input, uint64_t length,
- char *output, size_t maxOutputLength
- ) override;
- };
-
- uint64_t Lz4DecompressionStream::decompress(const char *input,
- uint64_t length,
- char *output,
- size_t maxOutputLength) {
- int result = LZ4_decompress_safe(input, output, static_cast<int>(length),
- static_cast<int>(maxOutputLength));
- if (result < 0) {
- throw ParseError(getName() + " - failed to decompress");
- }
- return static_cast<uint64_t>(result);
- }
-
- /**
- * Block compression base class
- */
- class BlockCompressionStream: public CompressionStreamBase {
- public:
- BlockCompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool)
- : CompressionStreamBase(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool)
- , compressorBuffer(pool) {
- // PASS
- }
-
- virtual bool Next(void** data, int*size) override;
- virtual std::string getName() const override = 0;
-
- protected:
- // compresses a block and returns the compressed size
- virtual uint64_t doBlockCompression() = 0;
-
- // return maximum possible compression size for allocating space for
- // compressorBuffer below
- virtual uint64_t estimateMaxCompressionSize() = 0;
-
- // should allocate max possible compressed size
- DataBuffer<unsigned char> compressorBuffer;
- };
-
- bool BlockCompressionStream::Next(void** data, int*size) {
- if (bufferSize != 0) {
- ensureHeader();
-
- // perform compression
- size_t totalCompressedSize = doBlockCompression();
-
- const unsigned char * dataToWrite = nullptr;
- int totalSizeToWrite = 0;
- char * header = outputBuffer + outputPosition - 3;
-
- if (totalCompressedSize >= static_cast<size_t>(bufferSize)) {
- writeHeader(header, static_cast<size_t>(bufferSize), true);
- dataToWrite = rawInputBuffer.data();
- totalSizeToWrite = bufferSize;
- } else {
- writeHeader(header, totalCompressedSize, false);
- dataToWrite = compressorBuffer.data();
- totalSizeToWrite = static_cast<int>(totalCompressedSize);
- }
-
- char * dst = header + 3;
- while (totalSizeToWrite > 0) {
- if (outputPosition == outputSize) {
- if (!BufferedOutputStream::Next(reinterpret_cast<void **>(&outputBuffer),
- &outputSize)) {
- throw std::logic_error(
- "Failed to get next output buffer from output stream.");
- }
- outputPosition = 0;
- dst = outputBuffer;
- } else if (outputPosition > outputSize) {
- // this is unlikely to happen, but we have seen a few cases on zstd v1.1.0
- throw std::logic_error("Write to an out-of-bound place!");
- }
-
- int sizeToWrite = std::min(totalSizeToWrite, outputSize - outputPosition);
- std::memcpy(dst, dataToWrite, static_cast<size_t>(sizeToWrite));
-
- outputPosition += sizeToWrite;
- dataToWrite += sizeToWrite;
- totalSizeToWrite -= sizeToWrite;
- dst += sizeToWrite;
- }
- }
-
- *data = rawInputBuffer.data();
- *size = static_cast<int>(rawInputBuffer.size());
- bufferSize = *size;
- compressorBuffer.resize(estimateMaxCompressionSize());
-
- return true;
- }
-
- /**
- * ZSTD block compression
- */
- class ZSTDCompressionStream: public BlockCompressionStream {
- public:
- ZSTDCompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool)
- : BlockCompressionStream(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool) {
+
+ if (deflateInit2(&strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY)
+ != Z_OK) {
+ throw std::runtime_error("Error while calling deflateInit2() for zlib.");
+ }
+ }
+
+ void ZlibCompressionStream::end() {
+ (void)deflateEnd(&strm);
+ }
+
+DIAGNOSTIC_POP
+
+ enum DecompressState { DECOMPRESS_HEADER,
+ DECOMPRESS_START,
+ DECOMPRESS_CONTINUE,
+ DECOMPRESS_ORIGINAL,
+ DECOMPRESS_EOF};
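+ // State meanings as used below: DECOMPRESS_HEADER - a chunk header must
+ // be read next; DECOMPRESS_ORIGINAL - the current chunk is stored
+ // uncompressed; DECOMPRESS_START - the current chunk is compressed and
+ // still has to be decompressed; DECOMPRESS_EOF - the underlying stream
+ // is exhausted.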
+
+ class ZlibDecompressionStream: public SeekableInputStream {
+ public:
+ ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+ size_t blockSize,
+ MemoryPool& pool);
+ virtual ~ZlibDecompressionStream() override;
+ virtual bool Next(const void** data, int*size) override;
+ virtual void BackUp(int count) override;
+ virtual bool Skip(int count) override;
+ virtual int64_t ByteCount() const override;
+ virtual void seek(PositionProvider& position) override;
+ virtual std::string getName() const override;
+
+ private:
+ void readBuffer(bool failOnEof) {
+ int length;
+ if (!input->Next(reinterpret_cast<const void**>(&inputBuffer),
+ &length)) {
+ if (failOnEof) {
+ throw ParseError("Read past EOF in "
+ "ZlibDecompressionStream::readBuffer");
+ }
+ state = DECOMPRESS_EOF;
+ inputBuffer = nullptr;
+ inputBufferEnd = nullptr;
+ } else {
+ inputBufferEnd = inputBuffer + length;
+ }
+ }
+
+ uint32_t readByte(bool failOnEof) {
+ if (inputBuffer == inputBufferEnd) {
+ readBuffer(failOnEof);
+ if (state == DECOMPRESS_EOF) {
+ return 0;
+ }
+ }
+ return static_cast<unsigned char>(*(inputBuffer++));
+ }
+
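+ // The 3-byte chunk header decoded below follows the ORC spec: a
+ // little-endian 24-bit value whose low bit marks an "original"
+ // (uncompressed) chunk and whose upper 23 bits hold the chunk length,
+ // i.e. header == (length << 1) | isOriginal. For example, a 100,000
+ // byte compressed chunk is encoded as the bytes 0x40 0x0d 0x03.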
+ void readHeader() {
+ uint32_t header = readByte(false);
+ if (state != DECOMPRESS_EOF) {
+ header |= readByte(true) << 8;
+ header |= readByte(true) << 16;
+ if (header & 1) {
+ state = DECOMPRESS_ORIGINAL;
+ } else {
+ state = DECOMPRESS_START;
+ }
+ remainingLength = header >> 1;
+ } else {
+ remainingLength = 0;
+ }
+ }
+
+ MemoryPool& pool;
+ const size_t blockSize;
+ std::unique_ptr<SeekableInputStream> input;
+ z_stream zstream;
+ DataBuffer<char> buffer;
+
+ // the current state
+ DecompressState state;
+
+ // the start of the current buffer
+ // This pointer is not owned by us. It is either owned by zstream or
+ // the underlying stream.
+ const char* outputBuffer;
+ // the size of the current buffer
+ size_t outputBufferLength;
+ // the size of the current chunk
+ size_t remainingLength;
+
+ // the last buffer returned from the input
+ const char *inputBuffer;
+ const char *inputBufferEnd;
+
+ // roughly the number of bytes returned
+ off_t bytesReturned;
+ };
+
+DIAGNOSTIC_PUSH
+
+#if defined(__GNUC__) || defined(__clang__)
+ DIAGNOSTIC_IGNORE("-Wold-style-cast")
+#endif
+
+ ZlibDecompressionStream::ZlibDecompressionStream
+ (std::unique_ptr<SeekableInputStream> inStream,
+ size_t _blockSize,
+ MemoryPool& _pool
+ ): pool(_pool),
+ blockSize(_blockSize),
+ buffer(pool, _blockSize) {
+ input.reset(inStream.release());
+ zstream.next_in = nullptr;
+ zstream.avail_in = 0;
+ zstream.zalloc = nullptr;
+ zstream.zfree = nullptr;
+ zstream.opaque = nullptr;
+ zstream.next_out = reinterpret_cast<Bytef*>(buffer.data());
+ zstream.avail_out = static_cast<uInt>(blockSize);
+ int64_t result = inflateInit2(&zstream, -15);
+ switch (result) {
+ case Z_OK:
+ break;
+ case Z_MEM_ERROR:
+ throw std::logic_error("Memory error from inflateInit2");
+ case Z_VERSION_ERROR:
+ throw std::logic_error("Version error from inflateInit2");
+ case Z_STREAM_ERROR:
+ throw std::logic_error("Stream error from inflateInit2");
+ default:
+ throw std::logic_error("Unknown error from inflateInit2");
+ }
+ outputBuffer = nullptr;
+ outputBufferLength = 0;
+ remainingLength = 0;
+ state = DECOMPRESS_HEADER;
+ inputBuffer = nullptr;
+ inputBufferEnd = nullptr;
+ bytesReturned = 0;
+ }
+
+DIAGNOSTIC_POP
+
+ ZlibDecompressionStream::~ZlibDecompressionStream() {
+ int64_t result = inflateEnd(&zstream);
+ if (result != Z_OK) {
+ // really can't throw in destructors
+ std::cout << "Error in ~ZlibDecompressionStream() " << result << "\n";
+ }
+ }
+
+ bool ZlibDecompressionStream::Next(const void** data, int*size) {
+ // if the user pushed back, return them the partial buffer
+ if (outputBufferLength) {
+ *data = outputBuffer;
+ *size = static_cast<int>(outputBufferLength);
+ outputBuffer += outputBufferLength;
+ outputBufferLength = 0;
+ return true;
+ }
+ if (state == DECOMPRESS_HEADER || remainingLength == 0) {
+ readHeader();
+ }
+ if (state == DECOMPRESS_EOF) {
+ return false;
+ }
+ if (inputBuffer == inputBufferEnd) {
+ readBuffer(true);
+ }
+ size_t availSize =
+ std::min(static_cast<size_t>(inputBufferEnd - inputBuffer),
+ remainingLength);
+ if (state == DECOMPRESS_ORIGINAL) {
+ *data = inputBuffer;
+ *size = static_cast<int>(availSize);
+ outputBuffer = inputBuffer + availSize;
+ outputBufferLength = 0;
+ } else if (state == DECOMPRESS_START) {
+ zstream.next_in =
+ reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
+ zstream.avail_in = static_cast<uInt>(availSize);
+ outputBuffer = buffer.data();
+ zstream.next_out =
+ reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer));
+ zstream.avail_out = static_cast<uInt>(blockSize);
+ if (inflateReset(&zstream) != Z_OK) {
+ throw std::logic_error("Bad inflateReset in "
+ "ZlibDecompressionStream::Next");
+ }
+ int64_t result;
+ do {
+ result = inflate(&zstream, availSize == remainingLength ? Z_FINISH :
+ Z_SYNC_FLUSH);
+ switch (result) {
+ case Z_OK:
+ remainingLength -= availSize;
+ inputBuffer += availSize;
+ readBuffer(true);
+ availSize =
+ std::min(static_cast<size_t>(inputBufferEnd - inputBuffer),
+ remainingLength);
+ zstream.next_in =
+ reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
+ zstream.avail_in = static_cast<uInt>(availSize);
+ break;
+ case Z_STREAM_END:
+ break;
+ case Z_BUF_ERROR:
+ throw std::logic_error("Buffer error in "
+ "ZlibDecompressionStream::Next");
+ case Z_DATA_ERROR:
+ throw std::logic_error("Data error in "
+ "ZlibDecompressionStream::Next");
+ case Z_STREAM_ERROR:
+ throw std::logic_error("Stream error in "
+ "ZlibDecompressionStream::Next");
+ default:
+ throw std::logic_error("Unknown error in "
+ "ZlibDecompressionStream::Next");
+ }
+ } while (result != Z_STREAM_END);
+ *size = static_cast<int>(blockSize - zstream.avail_out);
+ *data = outputBuffer;
+ outputBufferLength = 0;
+ outputBuffer += *size;
+ } else {
+ throw std::logic_error("Unknown compression state in "
+ "ZlibDecompressionStream::Next");
+ }
+ inputBuffer += availSize;
+ remainingLength -= availSize;
+ bytesReturned += *size;
+ return true;
+ }
+
+ void ZlibDecompressionStream::BackUp(int count) {
+ if (outputBuffer == nullptr || outputBufferLength != 0) {
+ throw std::logic_error("Backup without previous Next in "
+ "ZlibDecompressionStream");
+ }
+ outputBuffer -= static_cast<size_t>(count);
+ outputBufferLength = static_cast<size_t>(count);
+ bytesReturned -= count;
+ }
+
+ bool ZlibDecompressionStream::Skip(int count) {
+ bytesReturned += count;
+ // this is a stupid implementation for now.
+ // should skip entire blocks without decompressing
+ while (count > 0) {
+ const void *ptr;
+ int len;
+ if (!Next(&ptr, &len)) {
+ return false;
+ }
+ if (len > count) {
+ BackUp(len - count);
+ count = 0;
+ } else {
+ count -= len;
+ }
+ }
+ return true;
+ }
+
+ int64_t ZlibDecompressionStream::ByteCount() const {
+ return bytesReturned;
+ }
+
+ void ZlibDecompressionStream::seek(PositionProvider& position) {
+ // clear state to force seek to read from the right position
+ state = DECOMPRESS_HEADER;
+ outputBuffer = nullptr;
+ outputBufferLength = 0;
+ remainingLength = 0;
+ inputBuffer = nullptr;
+ inputBufferEnd = nullptr;
+
+ input->seek(position);
+ bytesReturned = static_cast<off_t>(input->ByteCount());
+ if (!Skip(static_cast<int>(position.next()))) {
+ throw ParseError("Bad skip in ZlibDecompressionStream::seek");
+ }
+ }
+
+ std::string ZlibDecompressionStream::getName() const {
+ std::ostringstream result;
+ result << "zlib(" << input->getName() << ")";
+ return result.str();
+ }
+
+ class BlockDecompressionStream: public SeekableInputStream {
+ public:
+ BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+ size_t blockSize,
+ MemoryPool& pool);
+
+ virtual ~BlockDecompressionStream() override {}
+ virtual bool Next(const void** data, int*size) override;
+ virtual void BackUp(int count) override;
+ virtual bool Skip(int count) override;
+ virtual int64_t ByteCount() const override;
+ virtual void seek(PositionProvider& position) override;
+ virtual std::string getName() const override = 0;
+
+ protected:
+ virtual uint64_t decompress(const char *input, uint64_t length,
+ char *output, size_t maxOutputLength) = 0;
+
+ std::string getStreamName() const {
+ return input->getName();
+ }
+
+ private:
+ void readBuffer(bool failOnEof) {
+ int length;
+ if (!input->Next(reinterpret_cast<const void**>(&inputBufferPtr),
+ &length)) {
+ if (failOnEof) {
+ throw ParseError(getName() + " read past EOF");
+ }
+ state = DECOMPRESS_EOF;
+ inputBufferPtr = nullptr;
+ inputBufferPtrEnd = nullptr;
+ } else {
+ inputBufferPtrEnd = inputBufferPtr + length;
+ }
+ }
+
+ uint32_t readByte(bool failOnEof) {
+ if (inputBufferPtr == inputBufferPtrEnd) {
+ readBuffer(failOnEof);
+ if (state == DECOMPRESS_EOF) {
+ return 0;
+ }
+ }
+ return static_cast<unsigned char>(*(inputBufferPtr++));
+ }
+
+ void readHeader() {
+ uint32_t header = readByte(false);
+ if (state != DECOMPRESS_EOF) {
+ header |= readByte(true) << 8;
+ header |= readByte(true) << 16;
+ if (header & 1) {
+ state = DECOMPRESS_ORIGINAL;
+ } else {
+ state = DECOMPRESS_START;
+ }
+ remainingLength = header >> 1;
+ } else {
+ remainingLength = 0;
+ }
+ }
+
+ std::unique_ptr<SeekableInputStream> input;
+ MemoryPool& pool;
+
+ // may need to stitch together multiple input buffers
+ // to give snappy a contiguous block
+ DataBuffer<char> inputBuffer;
+
+ // uncompressed output
+ DataBuffer<char> outputBuffer;
+
+ // the current state
+ DecompressState state;
+
+ // the start of the current output buffer
+ const char* outputBufferPtr;
+ // the size of the current output buffer
+ size_t outputBufferLength;
+
+ // the size of the current chunk
+ size_t remainingLength;
+
+ // the last buffer returned from the input
+ const char *inputBufferPtr;
+ const char *inputBufferPtrEnd;
+
+ // bytes returned by this stream
+ off_t bytesReturned;
+ };
+
+ BlockDecompressionStream::BlockDecompressionStream
+ (std::unique_ptr<SeekableInputStream> inStream,
+ size_t bufferSize,
+ MemoryPool& _pool
+ ) : pool(_pool),
+ inputBuffer(pool, bufferSize),
+ outputBuffer(pool, bufferSize),
+ state(DECOMPRESS_HEADER),
+ outputBufferPtr(nullptr),
+ outputBufferLength(0),
+ remainingLength(0),
+ inputBufferPtr(nullptr),
+ inputBufferPtrEnd(nullptr),
+ bytesReturned(0) {
+ input.reset(inStream.release());
+ }
+
+ bool BlockDecompressionStream::Next(const void** data, int*size) {
+ // if the user pushed back, return them the partial buffer
+ if (outputBufferLength) {
+ *data = outputBufferPtr;
+ *size = static_cast<int>(outputBufferLength);
+ outputBufferPtr += outputBufferLength;
+ bytesReturned += static_cast<off_t>(outputBufferLength);
+ outputBufferLength = 0;
+ return true;
+ }
+ if (state == DECOMPRESS_HEADER || remainingLength == 0) {
+ readHeader();
+ }
+ if (state == DECOMPRESS_EOF) {
+ return false;
+ }
+ if (inputBufferPtr == inputBufferPtrEnd) {
+ readBuffer(true);
+ }
+
+ size_t availSize =
+ std::min(static_cast<size_t>(inputBufferPtrEnd - inputBufferPtr),
+ remainingLength);
+ if (state == DECOMPRESS_ORIGINAL) {
+ *data = inputBufferPtr;
+ *size = static_cast<int>(availSize);
+ outputBufferPtr = inputBufferPtr + availSize;
+ outputBufferLength = 0;
+ inputBufferPtr += availSize;
+ remainingLength -= availSize;
+ } else if (state == DECOMPRESS_START) {
+ // Get contiguous bytes of compressed block.
+ const char *compressed = inputBufferPtr;
+ if (remainingLength == availSize) {
+ inputBufferPtr += availSize;
+ } else {
+ // Did not read enough from input.
+ if (inputBuffer.capacity() < remainingLength) {
+ inputBuffer.resize(remainingLength);
+ }
+ ::memcpy(inputBuffer.data(), inputBufferPtr, availSize);
+ inputBufferPtr += availSize;
+ compressed = inputBuffer.data();
+
+ for (size_t pos = availSize; pos < remainingLength; ) {
+ readBuffer(true);
+ size_t avail =
+ std::min(static_cast<size_t>(inputBufferPtrEnd -
+ inputBufferPtr),
+ remainingLength - pos);
+ ::memcpy(inputBuffer.data() + pos, inputBufferPtr, avail);
+ pos += avail;
+ inputBufferPtr += avail;
+ }
+ }
+
+ outputBufferLength = decompress(compressed, remainingLength,
+ outputBuffer.data(),
+ outputBuffer.capacity());
+
+ remainingLength = 0;
+ state = DECOMPRESS_HEADER;
+ *data = outputBuffer.data();
+ *size = static_cast<int>(outputBufferLength);
+ outputBufferPtr = outputBuffer.data() + outputBufferLength;
+ outputBufferLength = 0;
+ }
+
+ bytesReturned += *size;
+ return true;
+ }
+
+ void BlockDecompressionStream::BackUp(int count) {
+ if (outputBufferPtr == nullptr || outputBufferLength != 0) {
+ throw std::logic_error("Backup without previous Next in "+getName());
+ }
+ outputBufferPtr -= static_cast<size_t>(count);
+ outputBufferLength = static_cast<size_t>(count);
+ bytesReturned -= count;
+ }
+
+ bool BlockDecompressionStream::Skip(int count) {
+ bytesReturned += count;
+ // this is a stupid implementation for now.
+ // should skip entire blocks without decompressing
+ while (count > 0) {
+ const void *ptr;
+ int len;
+ if (!Next(&ptr, &len)) {
+ return false;
+ }
+ if (len > count) {
+ BackUp(len - count);
+ count = 0;
+ } else {
+ count -= len;
+ }
+ }
+ return true;
+ }
+
+ int64_t BlockDecompressionStream::ByteCount() const {
+ return bytesReturned;
+ }
+
+ void BlockDecompressionStream::seek(PositionProvider& position) {
+ // clear state to force seek to read from the right position
+ state = DECOMPRESS_HEADER;
+ outputBufferPtr = nullptr;
+ outputBufferLength = 0;
+ remainingLength = 0;
+ inputBufferPtr = nullptr;
+ inputBufferPtrEnd = nullptr;
+
+ input->seek(position);
+ if (!Skip(static_cast<int>(position.next()))) {
+ throw ParseError("Bad skip in " + getName());
+ }
+ }
+
+ class SnappyDecompressionStream: public BlockDecompressionStream {
+ public:
+ SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+ size_t blockSize,
+ MemoryPool& pool
+ ): BlockDecompressionStream
+ (std::move(inStream),
+ blockSize,
+ pool) {
+ // PASS
+ }
+
+ std::string getName() const override {
+ std::ostringstream result;
+ result << "snappy(" << getStreamName() << ")";
+ return result.str();
+ }
+
+ protected:
+ virtual uint64_t decompress(const char *input, uint64_t length,
+ char *output, size_t maxOutputLength
+ ) override;
+ };
+
+ uint64_t SnappyDecompressionStream::decompress(const char *input,
+ uint64_t length,
+ char *output,
+ size_t maxOutputLength) {
+ size_t outLength;
+ if (!snappy::GetUncompressedLength(input, length, &outLength)) {
+ throw ParseError("SnappyDecompressionStream choked on corrupt input");
+ }
+
+ if (outLength > maxOutputLength) {
+ throw std::logic_error("Snappy length exceeds block size");
+ }
+
+ if (!snappy::RawUncompress(input, length, output)) {
+ throw ParseError("SnappyDecompressionStream choked on corrupt input");
+ }
+ return outLength;
+ }
+
+ class LzoDecompressionStream: public BlockDecompressionStream {
+ public:
+ LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+ size_t blockSize,
+ MemoryPool& pool
+ ): BlockDecompressionStream
+ (std::move(inStream),
+ blockSize,
+ pool) {
+ // PASS
+ }
+
+ std::string getName() const override {
+ std::ostringstream result;
+ result << "lzo(" << getStreamName() << ")";
+ return result.str();
+ }
+
+ protected:
+ virtual uint64_t decompress(const char *input, uint64_t length,
+ char *output, size_t maxOutputLength
+ ) override;
+ };
+
+ uint64_t LzoDecompressionStream::decompress(const char *input,
+ uint64_t length,
+ char *output,
+ size_t maxOutputLength) {
+ return lzoDecompress(input, input + length, output,
+ output + maxOutputLength);
+ }
+
+ class Lz4DecompressionStream: public BlockDecompressionStream {
+ public:
+ Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+ size_t blockSize,
+ MemoryPool& pool
+ ): BlockDecompressionStream
+ (std::move(inStream),
+ blockSize,
+ pool) {
+ // PASS
+ }
+
+ std::string getName() const override {
+ std::ostringstream result;
+ result << "lz4(" << getStreamName() << ")";
+ return result.str();
+ }
+
+ protected:
+ virtual uint64_t decompress(const char *input, uint64_t length,
+ char *output, size_t maxOutputLength
+ ) override;
+ };
+
+ uint64_t Lz4DecompressionStream::decompress(const char *input,
+ uint64_t length,
+ char *output,
+ size_t maxOutputLength) {
+ int result = LZ4_decompress_safe(input, output, static_cast<int>(length),
+ static_cast<int>(maxOutputLength));
+ if (result < 0) {
+ throw ParseError(getName() + " - failed to decompress");
+ }
+ return static_cast<uint64_t>(result);
+ }
+
+ /**
+ * Block compression base class
+ */
+ class BlockCompressionStream: public CompressionStreamBase {
+ public:
+ BlockCompressionStream(OutputStream * outStream,
+ int compressionLevel,
+ uint64_t capacity,
+ uint64_t blockSize,
+ MemoryPool& pool)
+ : CompressionStreamBase(outStream,
+ compressionLevel,
+ capacity,
+ blockSize,
+ pool)
+ , compressorBuffer(pool) {
+ // PASS
+ }
+
+ virtual bool Next(void** data, int*size) override;
+ virtual std::string getName() const override = 0;
+
+ protected:
+ // compresses a block and returns the compressed size
+ virtual uint64_t doBlockCompression() = 0;
+
+ // return maximum possible compression size for allocating space for
+ // compressorBuffer below
+ virtual uint64_t estimateMaxCompressionSize() = 0;
+
+ // should allocate max possible compressed size
+ DataBuffer<unsigned char> compressorBuffer;
+ };
+
+ bool BlockCompressionStream::Next(void** data, int*size) {
+ if (bufferSize != 0) {
+ ensureHeader();
+
+ // perform compression
+ size_t totalCompressedSize = doBlockCompression();
+
+ const unsigned char * dataToWrite = nullptr;
+ int totalSizeToWrite = 0;
+ char * header = outputBuffer + outputPosition - 3;
+
+ if (totalCompressedSize >= static_cast<size_t>(bufferSize)) {
+ writeHeader(header, static_cast<size_t>(bufferSize), true);
+ dataToWrite = rawInputBuffer.data();
+ totalSizeToWrite = bufferSize;
+ } else {
+ writeHeader(header, totalCompressedSize, false);
+ dataToWrite = compressorBuffer.data();
+ totalSizeToWrite = static_cast<int>(totalCompressedSize);
+ }
+
+ char * dst = header + 3;
+ while (totalSizeToWrite > 0) {
+ if (outputPosition == outputSize) {
+ if (!BufferedOutputStream::Next(reinterpret_cast<void **>(&outputBuffer),
+ &outputSize)) {
+ throw std::logic_error(
+ "Failed to get next output buffer from output stream.");
+ }
+ outputPosition = 0;
+ dst = outputBuffer;
+ } else if (outputPosition > outputSize) {
+ // this is unlikely to happen, but we have seen a few cases on zstd v1.1.0
+ throw std::logic_error("Write to an out-of-bound place!");
+ }
+
+ int sizeToWrite = std::min(totalSizeToWrite, outputSize - outputPosition);
+ std::memcpy(dst, dataToWrite, static_cast<size_t>(sizeToWrite));
+
+ outputPosition += sizeToWrite;
+ dataToWrite += sizeToWrite;
+ totalSizeToWrite -= sizeToWrite;
+ dst += sizeToWrite;
+ }
+ }
+
+ *data = rawInputBuffer.data();
+ *size = static_cast<int>(rawInputBuffer.size());
+ bufferSize = *size;
+ compressorBuffer.resize(estimateMaxCompressionSize());
+
+ return true;
+ }
+
+ /**
+ * ZSTD block compression
+ */
+ class ZSTDCompressionStream: public BlockCompressionStream {
+ public:
+ ZSTDCompressionStream(OutputStream * outStream,
+ int compressionLevel,
+ uint64_t capacity,
+ uint64_t blockSize,
+ MemoryPool& pool)
+ : BlockCompressionStream(outStream,
+ compressionLevel,
+ capacity,
+ blockSize,
+ pool) {
this->init();
- }
-
- virtual std::string getName() const override {
- return "ZstdCompressionStream";
- }
+ }
+
+ virtual std::string getName() const override {
+ return "ZstdCompressionStream";
+ }
virtual ~ZSTDCompressionStream() override {
this->end();
}
-
- protected:
- virtual uint64_t doBlockCompression() override;
-
- virtual uint64_t estimateMaxCompressionSize() override {
- return ZSTD_compressBound(static_cast<size_t>(bufferSize));
- }
+
+ protected:
+ virtual uint64_t doBlockCompression() override;
+
+ virtual uint64_t estimateMaxCompressionSize() override {
+ return ZSTD_compressBound(static_cast<size_t>(bufferSize));
+ }
private:
void init();
void end();
ZSTD_CCtx *cctx;
- };
-
- uint64_t ZSTDCompressionStream::doBlockCompression() {
+ };
+
+ uint64_t ZSTDCompressionStream::doBlockCompression() {
return ZSTD_compressCCtx(cctx,
compressorBuffer.data(),
compressorBuffer.size(),
rawInputBuffer.data(),
static_cast<size_t>(bufferSize),
level);
- }
+ }
DIAGNOSTIC_PUSH
-
+
#if defined(__GNUC__) || defined(__clang__)
DIAGNOSTIC_IGNORE("-Wold-style-cast")
#endif
@@ -1086,53 +1086,53 @@ DIAGNOSTIC_PUSH
DIAGNOSTIC_PUSH
- /**
- * ZSTD block decompression
- */
- class ZSTDDecompressionStream: public BlockDecompressionStream {
- public:
- ZSTDDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& pool)
- : BlockDecompressionStream(std::move(inStream),
- blockSize,
- pool) {
+ /**
+ * ZSTD block decompression
+ */
+ class ZSTDDecompressionStream: public BlockDecompressionStream {
+ public:
+ ZSTDDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+ size_t blockSize,
+ MemoryPool& pool)
+ : BlockDecompressionStream(std::move(inStream),
+ blockSize,
+ pool) {
this->init();
- }
-
+ }
+
virtual ~ZSTDDecompressionStream() override {
this->end();
}
- std::string getName() const override {
- std::ostringstream result;
- result << "zstd(" << getStreamName() << ")";
- return result.str();
- }
-
- protected:
- virtual uint64_t decompress(const char *input,
- uint64_t length,
- char *output,
- size_t maxOutputLength) override;
+ std::string getName() const override {
+ std::ostringstream result;
+ result << "zstd(" << getStreamName() << ")";
+ return result.str();
+ }
+
+ protected:
+ virtual uint64_t decompress(const char *input,
+ uint64_t length,
+ char *output,
+ size_t maxOutputLength) override;
private:
void init();
void end();
ZSTD_DCtx *dctx;
- };
-
- uint64_t ZSTDDecompressionStream::decompress(const char *input,
- uint64_t length,
- char *output,
- size_t maxOutputLength) {
+ };
+
+ uint64_t ZSTDDecompressionStream::decompress(const char *input,
+ uint64_t length,
+ char *output,
+ size_t maxOutputLength) {
return static_cast<uint64_t>(ZSTD_decompressDCtx(dctx,
output,
maxOutputLength,
input,
length));
- }
-
+ }
+
DIAGNOSTIC_PUSH
#if defined(__GNUC__) || defined(__clang__)
@@ -1155,71 +1155,71 @@ DIAGNOSTIC_PUSH
DIAGNOSTIC_PUSH
- std::unique_ptr<BufferedOutputStream>
- createCompressor(
- CompressionKind kind,
- OutputStream * outStream,
- CompressionStrategy strategy,
- uint64_t bufferCapacity,
- uint64_t compressionBlockSize,
- MemoryPool& pool) {
- switch (static_cast<int64_t>(kind)) {
- case CompressionKind_NONE: {
- return std::unique_ptr<BufferedOutputStream>
- (new BufferedOutputStream(
- pool, outStream, bufferCapacity, compressionBlockSize));
- }
- case CompressionKind_ZLIB: {
- int level = (strategy == CompressionStrategy_SPEED) ?
- Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION;
- return std::unique_ptr<BufferedOutputStream>
- (new ZlibCompressionStream(
- outStream, level, bufferCapacity, compressionBlockSize, pool));
- }
- case CompressionKind_ZSTD: {
- int level = (strategy == CompressionStrategy_SPEED) ?
- 1 : ZSTD_CLEVEL_DEFAULT;
- return std::unique_ptr<BufferedOutputStream>
- (new ZSTDCompressionStream(
- outStream, level, bufferCapacity, compressionBlockSize, pool));
- }
- case CompressionKind_SNAPPY:
- case CompressionKind_LZO:
- case CompressionKind_LZ4:
- default:
- throw NotImplementedYet("compression codec");
- }
- }
-
- std::unique_ptr<SeekableInputStream>
- createDecompressor(CompressionKind kind,
- std::unique_ptr<SeekableInputStream> input,
- uint64_t blockSize,
- MemoryPool& pool) {
- switch (static_cast<int64_t>(kind)) {
- case CompressionKind_NONE:
- return REDUNDANT_MOVE(input);
- case CompressionKind_ZLIB:
- return std::unique_ptr<SeekableInputStream>
- (new ZlibDecompressionStream(std::move(input), blockSize, pool));
- case CompressionKind_SNAPPY:
- return std::unique_ptr<SeekableInputStream>
- (new SnappyDecompressionStream(std::move(input), blockSize, pool));
- case CompressionKind_LZO:
- return std::unique_ptr<SeekableInputStream>
- (new LzoDecompressionStream(std::move(input), blockSize, pool));
- case CompressionKind_LZ4:
- return std::unique_ptr<SeekableInputStream>
- (new Lz4DecompressionStream(std::move(input), blockSize, pool));
- case CompressionKind_ZSTD:
- return std::unique_ptr<SeekableInputStream>
- (new ZSTDDecompressionStream(std::move(input), blockSize, pool));
- default: {
- std::ostringstream buffer;
- buffer << "Unknown compression codec " << kind;
- throw NotImplementedYet(buffer.str());
- }
- }
- }
-
-}
+ std::unique_ptr<BufferedOutputStream>
+ createCompressor(
+ CompressionKind kind,
+ OutputStream * outStream,
+ CompressionStrategy strategy,
+ uint64_t bufferCapacity,
+ uint64_t compressionBlockSize,
+ MemoryPool& pool) {
+ switch (static_cast<int64_t>(kind)) {
+ case CompressionKind_NONE: {
+ return std::unique_ptr<BufferedOutputStream>
+ (new BufferedOutputStream(
+ pool, outStream, bufferCapacity, compressionBlockSize));
+ }
+ case CompressionKind_ZLIB: {
+ int level = (strategy == CompressionStrategy_SPEED) ?
+ Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION;
+ return std::unique_ptr<BufferedOutputStream>
+ (new ZlibCompressionStream(
+ outStream, level, bufferCapacity, compressionBlockSize, pool));
+ }
+ case CompressionKind_ZSTD: {
+ int level = (strategy == CompressionStrategy_SPEED) ?
+ 1 : ZSTD_CLEVEL_DEFAULT;
+ return std::unique_ptr<BufferedOutputStream>
+ (new ZSTDCompressionStream(
+ outStream, level, bufferCapacity, compressionBlockSize, pool));
+ }
+ case CompressionKind_SNAPPY:
+ case CompressionKind_LZO:
+ case CompressionKind_LZ4:
+ default:
+ throw NotImplementedYet("compression codec");
+ }
+ }
+
+ std::unique_ptr<SeekableInputStream>
+ createDecompressor(CompressionKind kind,
+ std::unique_ptr<SeekableInputStream> input,
+ uint64_t blockSize,
+ MemoryPool& pool) {
+ switch (static_cast<int64_t>(kind)) {
+ case CompressionKind_NONE:
+ return REDUNDANT_MOVE(input);
+ case CompressionKind_ZLIB:
+ return std::unique_ptr<SeekableInputStream>
+ (new ZlibDecompressionStream(std::move(input), blockSize, pool));
+ case CompressionKind_SNAPPY:
+ return std::unique_ptr<SeekableInputStream>
+ (new SnappyDecompressionStream(std::move(input), blockSize, pool));
+ case CompressionKind_LZO:
+ return std::unique_ptr<SeekableInputStream>
+ (new LzoDecompressionStream(std::move(input), blockSize, pool));
+ case CompressionKind_LZ4:
+ return std::unique_ptr<SeekableInputStream>
+ (new Lz4DecompressionStream(std::move(input), blockSize, pool));
+ case CompressionKind_ZSTD:
+ return std::unique_ptr<SeekableInputStream>
+ (new ZSTDDecompressionStream(std::move(input), blockSize, pool));
+ default: {
+ std::ostringstream buffer;
+ buffer << "Unknown compression codec " << kind;
+ throw NotImplementedYet(buffer.str());
+ }
+ }
+ }
+
+}
diff --git a/contrib/libs/apache/orc/c++/src/Compression.hh b/contrib/libs/apache/orc/c++/src/Compression.hh
index ff79377d83..84e85bddaf 100644
--- a/contrib/libs/apache/orc/c++/src/Compression.hh
+++ b/contrib/libs/apache/orc/c++/src/Compression.hh
@@ -1,58 +1,58 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ORC_COMPRESSION_HH
-#define ORC_COMPRESSION_HH
-
-#include "io/InputStream.hh"
-#include "io/OutputStream.hh"
-
-namespace orc {
-
- /**
- * Create a decompressor for the given compression kind.
- * @param kind the compression type to implement
- * @param input the input stream that is the underlying source
- * @param bufferSize the maximum size of the buffer
- * @param pool the memory pool
- */
- std::unique_ptr<SeekableInputStream>
- createDecompressor(CompressionKind kind,
- std::unique_ptr<SeekableInputStream> input,
- uint64_t bufferSize,
- MemoryPool& pool);
-
- /**
- * Create a compressor for the given compression kind.
- * @param kind the compression type to implement
- * @param outStream the output stream that is the underlying target
- * @param strategy compression strategy
- * @param bufferCapacity compression stream buffer total capacity
- * @param compressionBlockSize compression buffer block size
- * @param pool the memory pool
- */
- std::unique_ptr<BufferedOutputStream>
- createCompressor(CompressionKind kind,
- OutputStream * outStream,
- CompressionStrategy strategy,
- uint64_t bufferCapacity,
- uint64_t compressionBlockSize,
- MemoryPool& pool);
-}
-
-#endif
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_COMPRESSION_HH
+#define ORC_COMPRESSION_HH
+
+#include "io/InputStream.hh"
+#include "io/OutputStream.hh"
+
+namespace orc {
+
+ /**
+ * Create a decompressor for the given compression kind.
+ * @param kind the compression type to implement
+ * @param input the input stream that is the underlying source
+ * @param bufferSize the maximum size of the buffer
+ * @param pool the memory pool
+ */
+ std::unique_ptr<SeekableInputStream>
+ createDecompressor(CompressionKind kind,
+ std::unique_ptr<SeekableInputStream> input,
+ uint64_t bufferSize,
+ MemoryPool& pool);
+
+ /**
+ * Create a compressor for the given compression kind.
+ * @param kind the compression type to implement
+ * @param outStream the output stream that is the underlying target
+ * @param strategy compression strategy
+ * @param bufferCapacity compression stream buffer total capacity
+ * @param compressionBlockSize compression buffer block size
+ * @param pool the memory pool
+ */
+ std::unique_ptr<BufferedOutputStream>
+ createCompressor(CompressionKind kind,
+ OutputStream * outStream,
+ CompressionStrategy strategy,
+ uint64_t bufferCapacity,
+ uint64_t compressionBlockSize,
+ MemoryPool& pool);
+}
+
+#endif
diff --git a/contrib/libs/apache/orc/c++/src/Exceptions.cc b/contrib/libs/apache/orc/c++/src/Exceptions.cc
index 2077b27df4..f721c05a88 100644
--- a/contrib/libs/apache/orc/c++/src/Exceptions.cc
+++ b/contrib/libs/apache/orc/c++/src/Exceptions.cc
@@ -1,78 +1,78 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "orc/Exceptions.hh"
-
-namespace orc {
-
- NotImplementedYet::NotImplementedYet(const std::string& what_arg
- ) : logic_error(what_arg) {
- // PASS
- }
-
- NotImplementedYet::NotImplementedYet(const char* what_arg
- ) :logic_error(what_arg) {
- // PASS
- }
-
- NotImplementedYet::NotImplementedYet(const NotImplementedYet& error
- ): logic_error(error) {
- // PASS
- }
-
- NotImplementedYet::~NotImplementedYet() ORC_NOEXCEPT {
- // PASS
- }
-
- ParseError::ParseError(const std::string& what_arg
- ): runtime_error(what_arg) {
- // PASS
- }
-
- ParseError::ParseError(const char* what_arg
- ): runtime_error(what_arg) {
- // PASS
- }
-
- ParseError::ParseError(const ParseError& error): runtime_error(error) {
- // PASS
- }
-
- ParseError::~ParseError() ORC_NOEXCEPT {
- // PASS
- }
-
- InvalidArgument::InvalidArgument(const std::string& what_arg
- ): runtime_error(what_arg) {
- // PASS
- }
-
- InvalidArgument::InvalidArgument(const char* what_arg
- ): runtime_error(what_arg) {
- // PASS
- }
-
- InvalidArgument::InvalidArgument(const InvalidArgument& error
- ): runtime_error(error) {
- // PASS
- }
-
- InvalidArgument::~InvalidArgument() ORC_NOEXCEPT {
- // PASS
- }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+
+namespace orc {
+
+ NotImplementedYet::NotImplementedYet(const std::string& what_arg
+ ) : logic_error(what_arg) {
+ // PASS
+ }
+
+ NotImplementedYet::NotImplementedYet(const char* what_arg
+ ) :logic_error(what_arg) {
+ // PASS
+ }
+
+ NotImplementedYet::NotImplementedYet(const NotImplementedYet& error
+ ): logic_error(error) {
+ // PASS
+ }
+
+ NotImplementedYet::~NotImplementedYet() ORC_NOEXCEPT {
+ // PASS
+ }
+
+ ParseError::ParseError(const std::string& what_arg
+ ): runtime_error(what_arg) {
+ // PASS
+ }
+
+ ParseError::ParseError(const char* what_arg
+ ): runtime_error(what_arg) {
+ // PASS
+ }
+
+ ParseError::ParseError(const ParseError& error): runtime_error(error) {
+ // PASS
+ }
+
+ ParseError::~ParseError() ORC_NOEXCEPT {
+ // PASS
+ }
+
+ InvalidArgument::InvalidArgument(const std::string& what_arg
+ ): runtime_error(what_arg) {
+ // PASS
+ }
+
+ InvalidArgument::InvalidArgument(const char* what_arg
+ ): runtime_error(what_arg) {
+ // PASS
+ }
+
+ InvalidArgument::InvalidArgument(const InvalidArgument& error
+ ): runtime_error(error) {
+ // PASS
+ }
+
+ InvalidArgument::~InvalidArgument() ORC_NOEXCEPT {
+ // PASS
+ }
+}
diff --git a/contrib/libs/apache/orc/c++/src/Int128.cc b/contrib/libs/apache/orc/c++/src/Int128.cc
index 433e6fa193..96266e855c 100644
--- a/contrib/libs/apache/orc/c++/src/Int128.cc
+++ b/contrib/libs/apache/orc/c++/src/Int128.cc
@@ -1,494 +1,494 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "orc/Int128.hh"
-#include "Adaptor.hh"
-
-#include <algorithm>
-#include <iomanip>
-#include <iostream>
-#include <sstream>
-
-namespace orc {
-
- Int128 Int128::maximumValue() {
- return Int128(0x7fffffffffffffff, 0xfffffffffffffff);
- }
-
- Int128 Int128::minimumValue() {
- return Int128(static_cast<int64_t>(0x8000000000000000), 0x0);
- }
-
- Int128::Int128(const std::string& str) {
- lowbits = 0;
- highbits = 0;
- size_t length = str.length();
- if (length > 0) {
- bool isNegative = str[0] == '-';
- size_t posn = isNegative ? 1 : 0;
- while (posn < length) {
- size_t group = std::min(static_cast<size_t>(18), length - posn);
- int64_t chunk = std::stoll(str.substr(posn, group));
- int64_t multiple = 1;
- for(size_t i=0; i < group; ++i) {
- multiple *= 10;
- }
- *this *= multiple;
- *this += chunk;
- posn += group;
- }
- if (isNegative) {
- negate();
- }
- }
- }
-
- Int128& Int128::operator*=(const Int128 &right) {
- const uint64_t INT_MASK = 0xffffffff;
- const uint64_t CARRY_BIT = INT_MASK + 1;
-
- // Break the left and right numbers into 32 bit chunks
- // so that we can multiply them without overflow.
- uint64_t L0 = static_cast<uint64_t>(highbits) >> 32;
- uint64_t L1 = static_cast<uint64_t>(highbits) & INT_MASK;
- uint64_t L2 = lowbits >> 32;
- uint64_t L3 = lowbits & INT_MASK;
- uint64_t R0 = static_cast<uint64_t>(right.highbits) >> 32;
- uint64_t R1 = static_cast<uint64_t>(right.highbits) & INT_MASK;
- uint64_t R2 = right.lowbits >> 32;
- uint64_t R3 = right.lowbits & INT_MASK;
-
- uint64_t product = L3 * R3;
- lowbits = product & INT_MASK;
- uint64_t sum = product >> 32;
- product = L2 * R3;
- sum += product;
- highbits = sum < product ? CARRY_BIT : 0;
- product = L3 * R2;
- sum += product;
- if (sum < product) {
- highbits += CARRY_BIT;
- }
- lowbits += sum << 32;
- highbits += static_cast<int64_t>(sum >> 32);
- highbits += L1 * R3 + L2 * R2 + L3 * R1;
- highbits += (L0 * R3 + L1 * R2 + L2 * R1 + L3 * R0) << 32;
- return *this;
- }
-
- /**
- * Expands the given value into an array of ints so that we can work on
- * it. The array will be converted to an absolute value and the wasNegative
- * flag will be set appropriately. The array will remove leading zeros from
- * the value.
- * @param array an array of length 4 to set with the value
- * @param wasNegative a flag for whether the value was originally negative
- * @result the output length of the array
- */
- int64_t Int128::fillInArray(uint32_t* array, bool &wasNegative) const {
- uint64_t high;
- uint64_t low;
- if (highbits < 0) {
- low = ~lowbits + 1;
- high = static_cast<uint64_t>(~highbits);
- if (low == 0) {
- high += 1;
- }
- wasNegative = true;
- } else {
- low = lowbits;
- high = static_cast<uint64_t>(highbits);
- wasNegative = false;
- }
- if (high != 0) {
- if (high > UINT32_MAX) {
- array[0] = static_cast<uint32_t>(high >> 32);
- array[1] = static_cast<uint32_t>(high);
- array[2] = static_cast<uint32_t>(low >> 32);
- array[3] = static_cast<uint32_t>(low);
- return 4;
- } else {
- array[0] = static_cast<uint32_t>(high);
- array[1] = static_cast<uint32_t>(low >> 32);
- array[2] = static_cast<uint32_t>(low);
- return 3;
- }
- } else if (low >= UINT32_MAX) {
- array[0] = static_cast<uint32_t>(low >> 32);
- array[1] = static_cast<uint32_t>(low);
- return 2;
- } else if (low == 0) {
- return 0;
- } else {
- array[0] = static_cast<uint32_t>(low);
- return 1;
- }
- }
-
-
- /**
- * Find last set bit in a 32 bit integer. Bit 1 is the LSB and bit 32 is
- * the MSB. We can replace this with the bsrq asm instruction on x64.
- */
- int64_t fls(uint32_t x) {
- int64_t bitpos = 0;
- while (x) {
- x >>= 1;
- bitpos += 1;
- }
- return bitpos;
- }
-
- /**
- * Shift the number in the array left by bits positions.
- * @param array the number to shift, must have length elements
- * @param length the number of entries in the array
- * @param bits the number of bits to shift (0 <= bits < 32)
- */
- void shiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) {
- if (length > 0 && bits != 0) {
- for(int64_t i=0; i < length-1; ++i) {
- array[i] = (array[i] << bits) | (array[i+1] >> (32 - bits));
- }
- array[length-1] <<= bits;
- }
- }
-
- /**
- * Shift the number in the array right by bits positions.
- * @param array the number to shift, must have length elements
- * @param length the number of entries in the array
- * @param bits the number of bits to shift (0 <= bits < 32)
- */
- void shiftArrayRight(uint32_t* array, int64_t length, int64_t bits) {
- if (length > 0 && bits != 0) {
- for(int64_t i=length-1; i > 0; --i) {
- array[i] = (array[i] >> bits) | (array[i-1] << (32 - bits));
- }
- array[0] >>= bits;
- }
- }
-
- /**
- * Fix the signs of the result and remainder at the end of the division
- * based on the signs of the dividend and divisor.
- */
- void fixDivisionSigns(Int128 &result, Int128 &remainder,
- bool dividendWasNegative, bool divisorWasNegative) {
- if (dividendWasNegative != divisorWasNegative) {
- result.negate();
- }
- if (dividendWasNegative) {
- remainder.negate();
- }
- }
-
- /**
- * Build an Int128 from a list of ints.
- */
- void buildFromArray(Int128& value, uint32_t* array, int64_t length) {
- switch (length) {
- case 0:
- value = 0;
- break;
- case 1:
- value = array[0];
- break;
- case 2:
- value = Int128(0, (static_cast<uint64_t>(array[0]) << 32) + array[1]);
- break;
- case 3:
- value = Int128(array[0],
- (static_cast<uint64_t>(array[1]) << 32) + array[2]);
- break;
- case 4:
- value = Int128((static_cast<int64_t>(array[0]) << 32) + array[1],
- (static_cast<uint64_t>(array[2]) << 32) + array[3]);
- break;
- case 5:
- if (array[0] != 0) {
- throw std::logic_error("Can't build Int128 with 5 ints.");
- }
- value = Int128((static_cast<int64_t>(array[1]) << 32) + array[2],
- (static_cast<uint64_t>(array[3]) << 32) + array[4]);
- break;
- default:
- throw std::logic_error("Unsupported length for building Int128");
- }
- }
-
- /**
- * Do a division where the divisor fits into a single 32 bit value.
- */
- Int128 singleDivide(uint32_t* dividend, int64_t dividendLength,
- uint32_t divisor, Int128& remainder,
- bool dividendWasNegative, bool divisorWasNegative) {
- uint64_t r = 0;
- uint32_t resultArray[5];
- for(int64_t j=0; j < dividendLength; j++) {
- r <<= 32;
- r += dividend[j];
- resultArray[j] = static_cast<uint32_t>(r / divisor);
- r %= divisor;
- }
- Int128 result;
- buildFromArray(result, resultArray, dividendLength);
- remainder = static_cast<int64_t>(r);
- fixDivisionSigns(result, remainder, dividendWasNegative,
- divisorWasNegative);
- return result;
- }
-
- Int128 Int128::divide(const Int128 &divisor, Int128 &remainder) const {
- // Split the dividend and divisor into integer pieces so that we can
- // work on them.
- uint32_t dividendArray[5];
- uint32_t divisorArray[4];
- bool dividendWasNegative;
- bool divisorWasNegative;
- // leave an extra zero before the dividend
- dividendArray[0] = 0;
- int64_t dividendLength = fillInArray(dividendArray + 1, dividendWasNegative)+1;
- int64_t divisorLength = divisor.fillInArray(divisorArray, divisorWasNegative);
-
- // Handle some of the easy cases.
- if (dividendLength <= divisorLength) {
- remainder = *this;
- return 0;
- } else if (divisorLength == 0) {
- throw std::range_error("Division by 0 in Int128");
- } else if (divisorLength == 1) {
- return singleDivide(dividendArray, dividendLength, divisorArray[0],
- remainder, dividendWasNegative, divisorWasNegative);
- }
-
- int64_t resultLength = dividendLength - divisorLength;
- uint32_t resultArray[4];
-
- // Normalize by shifting both by a multiple of 2 so that
- // the digit guessing is better. The requirement is that
- // divisorArray[0] is greater than 2**31.
- int64_t normalizeBits = 32 - fls(divisorArray[0]);
- shiftArrayLeft(divisorArray, divisorLength, normalizeBits);
- shiftArrayLeft(dividendArray, dividendLength, normalizeBits);
-
- // compute each digit in the result
- for(int64_t j=0; j < resultLength; ++j) {
- // Guess the next digit. At worst it is two too large
- uint32_t guess = UINT32_MAX;
- uint64_t highDividend = static_cast<uint64_t>(dividendArray[j]) << 32 |
- dividendArray[j+1];
- if (dividendArray[j] != divisorArray[0]) {
- guess = static_cast<uint32_t>(highDividend / divisorArray[0]);
- }
-
- // catch all of the cases where guess is two too large and most of the
- // cases where it is one too large
- uint32_t rhat =
- static_cast<uint32_t>(highDividend - guess *
- static_cast<uint64_t>(divisorArray[0]));
- while (static_cast<uint64_t>(divisorArray[1]) * guess >
- (static_cast<uint64_t>(rhat) << 32) + dividendArray[j+2]) {
- guess -= 1;
- rhat += divisorArray[0];
- if (static_cast<uint64_t>(rhat) < divisorArray[0]) {
- break;
- }
- }
-
- // subtract off the guess * divisor from the dividend
- uint64_t mult = 0;
- for(int64_t i=divisorLength-1; i >= 0; --i) {
- mult += static_cast<uint64_t>(guess) * divisorArray[i];
- uint32_t prev = dividendArray[j+i+1];
- dividendArray[j+i+1] -= static_cast<uint32_t>(mult);
- mult >>= 32;
- if (dividendArray[j+i+1] > prev) {
- mult += 1;
- }
- }
- uint32_t prev = dividendArray[j];
- dividendArray[j] -= static_cast<uint32_t>(mult);
-
- // if guess was too big, we add back divisor
- if (dividendArray[j] > prev) {
- guess -= 1;
- uint32_t carry = 0;
- for(int64_t i=divisorLength-1; i >= 0; --i) {
- uint64_t sum = static_cast<uint64_t>(divisorArray[i]) +
- dividendArray[j+i+1] + carry;
- dividendArray[j+i+1] = static_cast<uint32_t>(sum);
- carry = static_cast<uint32_t>(sum >> 32);
- }
- dividendArray[j] += carry;
- }
-
- resultArray[j] = guess;
- }
-
- // denormalize the remainder
- shiftArrayRight(dividendArray, dividendLength, normalizeBits);
-
- // return result and remainder
- Int128 result;
- buildFromArray(result, resultArray, resultLength);
- buildFromArray(remainder, dividendArray, dividendLength);
- fixDivisionSigns(result, remainder,
- dividendWasNegative, divisorWasNegative);
- return result;
- }
-
- std::string Int128::toString() const {
- // 10**18 - the largest power of 10 less than 63 bits
- const Int128 tenTo18(0xde0b6b3a7640000);
- // 10**36
- const Int128 tenTo36(0xc097ce7bc90715, 0xb34b9f1000000000);
- Int128 remainder;
- std::stringstream buf;
- bool needFill = false;
-
- // get anything above 10**36 and print it
- Int128 top = divide(tenTo36, remainder);
- if (top != 0) {
- buf << top.toLong();
- remainder.abs();
- needFill = true;
- }
-
- // now get anything above 10**18 and print it
- Int128 tail;
- top = remainder.divide(tenTo18, tail);
- if (needFill || top != 0) {
- if (needFill) {
- buf << std::setw(18) << std::setfill('0');
- } else {
- needFill = true;
- tail.abs();
- }
- buf << top.toLong();
- }
-
- // finally print the tail, which is less than 10**18
- if (needFill) {
- buf << std::setw(18) << std::setfill('0');
- }
- buf << tail.toLong();
- return buf.str();
- }
-
- std::string Int128::toDecimalString(int32_t scale) const {
- std::string str = toString();
- if (scale == 0) {
- return str;
- } else if (*this < 0) {
- int32_t len = static_cast<int32_t>(str.length());
- if (len - 1 > scale) {
- return str.substr(0, static_cast<size_t>(len - scale)) + "." +
- str.substr(static_cast<size_t>(len - scale),
- static_cast<size_t>(scale));
- } else if (len - 1 == scale) {
- return "-0." + str.substr(1, std::string::npos);
- } else {
- std::string result = "-0.";
- for(int32_t i=0; i < scale - len + 1; ++i) {
- result += "0";
- }
- return result + str.substr(1, std::string::npos);
- }
- } else {
- int32_t len = static_cast<int32_t>(str.length());
- if (len > scale) {
- return str.substr(0, static_cast<size_t>(len - scale)) + "." +
- str.substr(static_cast<size_t>(len - scale),
- static_cast<size_t>(scale));
- } else if (len == scale) {
- return "0." + str;
- } else {
- std::string result = "0.";
- for(int32_t i=0; i < scale - len; ++i) {
- result += "0";
- }
- return result + str;
- }
- }
- }
-
- std::string Int128::toHexString() const {
- std::stringstream buf;
- buf << std::hex << "0x"
- << std::setw(16) << std::setfill('0') << highbits
- << std::setw(16) << std::setfill('0') << lowbits;
- return buf.str();
- }
-
- const static int32_t MAX_PRECISION_64 = 18;
- const static int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1] =
- {1,
- 10,
- 100,
- 1000,
- 10000,
- 100000,
- 1000000,
- 10000000,
- 100000000,
- 1000000000,
- 10000000000,
- 100000000000,
- 1000000000000,
- 10000000000000,
- 100000000000000,
- 1000000000000000,
- 10000000000000000,
- 100000000000000000,
- 1000000000000000000};
-
- Int128 scaleUpInt128ByPowerOfTen(Int128 value,
- int32_t power,
- bool &overflow) {
- overflow = false;
- Int128 remainder;
-
- while (power > 0) {
- int32_t step = std::min(power, MAX_PRECISION_64);
- if (value > 0 && Int128::maximumValue().divide(POWERS_OF_TEN[step], remainder) < value) {
- overflow = true;
- return Int128::maximumValue();
- } else if (value < 0 && Int128::minimumValue().divide(POWERS_OF_TEN[step], remainder) > value) {
- overflow = true;
- return Int128::minimumValue();
- }
-
- value *= POWERS_OF_TEN[step];
- power -= step;
- }
-
- return value;
- }
-
- Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power) {
- Int128 remainder;
- while (power > 0) {
- int32_t step = std::min(std::abs(power), MAX_PRECISION_64);
- value = value.divide(POWERS_OF_TEN[step], remainder);
- power -= step;
- }
- return value;
- }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Int128.hh"
+#include "Adaptor.hh"
+
+#include <algorithm>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+
+namespace orc {
+
+ Int128 Int128::maximumValue() {
+ return Int128(0x7fffffffffffffff, 0xfffffffffffffff);
+ }
+
+ Int128 Int128::minimumValue() {
+ return Int128(static_cast<int64_t>(0x8000000000000000), 0x0);
+ }
+
+ Int128::Int128(const std::string& str) {
+ lowbits = 0;
+ highbits = 0;
+ size_t length = str.length();
+ if (length > 0) {
+ bool isNegative = str[0] == '-';
+ size_t posn = isNegative ? 1 : 0;
+ while (posn < length) {
+ size_t group = std::min(static_cast<size_t>(18), length - posn);
+ int64_t chunk = std::stoll(str.substr(posn, group));
+ int64_t multiple = 1;
+ for(size_t i=0; i < group; ++i) {
+ multiple *= 10;
+ }
+ *this *= multiple;
+ *this += chunk;
+ posn += group;
+ }
+ if (isNegative) {
+ negate();
+ }
+ }
+ }
+
+ Int128& Int128::operator*=(const Int128 &right) {
+ const uint64_t INT_MASK = 0xffffffff;
+ const uint64_t CARRY_BIT = INT_MASK + 1;
+
+ // Break the left and right numbers into 32 bit chunks
+ // so that we can multiply them without overflow.
+ uint64_t L0 = static_cast<uint64_t>(highbits) >> 32;
+ uint64_t L1 = static_cast<uint64_t>(highbits) & INT_MASK;
+ uint64_t L2 = lowbits >> 32;
+ uint64_t L3 = lowbits & INT_MASK;
+ uint64_t R0 = static_cast<uint64_t>(right.highbits) >> 32;
+ uint64_t R1 = static_cast<uint64_t>(right.highbits) & INT_MASK;
+ uint64_t R2 = right.lowbits >> 32;
+ uint64_t R3 = right.lowbits & INT_MASK;
+
+ uint64_t product = L3 * R3;
+ lowbits = product & INT_MASK;
+ uint64_t sum = product >> 32;
+ product = L2 * R3;
+ sum += product;
+ highbits = sum < product ? CARRY_BIT : 0;
+ product = L3 * R2;
+ sum += product;
+ if (sum < product) {
+ highbits += CARRY_BIT;
+ }
+ lowbits += sum << 32;
+ highbits += static_cast<int64_t>(sum >> 32);
+ highbits += L1 * R3 + L2 * R2 + L3 * R1;
+ highbits += (L0 * R3 + L1 * R2 + L2 * R1 + L3 * R0) << 32;
+ return *this;
+ }
+
+ /**
+ * Expands the given value into an array of ints so that we can work on
+ * it. The array will be converted to an absolute value and the wasNegative
+ * flag will be set appropriately. The array will remove leading zeros from
+ * the value.
+ * @param array an array of length 4 to set with the value
+ * @param wasNegative a flag for whether the value was originally negative
+ * @result the output length of the array
+ */
+ int64_t Int128::fillInArray(uint32_t* array, bool &wasNegative) const {
+ uint64_t high;
+ uint64_t low;
+ if (highbits < 0) {
+ low = ~lowbits + 1;
+ high = static_cast<uint64_t>(~highbits);
+ if (low == 0) {
+ high += 1;
+ }
+ wasNegative = true;
+ } else {
+ low = lowbits;
+ high = static_cast<uint64_t>(highbits);
+ wasNegative = false;
+ }
+ if (high != 0) {
+ if (high > UINT32_MAX) {
+ array[0] = static_cast<uint32_t>(high >> 32);
+ array[1] = static_cast<uint32_t>(high);
+ array[2] = static_cast<uint32_t>(low >> 32);
+ array[3] = static_cast<uint32_t>(low);
+ return 4;
+ } else {
+ array[0] = static_cast<uint32_t>(high);
+ array[1] = static_cast<uint32_t>(low >> 32);
+ array[2] = static_cast<uint32_t>(low);
+ return 3;
+ }
+ } else if (low >= UINT32_MAX) {
+ array[0] = static_cast<uint32_t>(low >> 32);
+ array[1] = static_cast<uint32_t>(low);
+ return 2;
+ } else if (low == 0) {
+ return 0;
+ } else {
+ array[0] = static_cast<uint32_t>(low);
+ return 1;
+ }
+ }
+
+
+ /**
+ * Find last set bit in a 32 bit integer. Bit 1 is the LSB and bit 32 is
+ * the MSB. We can replace this with the bsrq asm instruction on x64.
+ */
+ int64_t fls(uint32_t x) {
+ int64_t bitpos = 0;
+ while (x) {
+ x >>= 1;
+ bitpos += 1;
+ }
+ return bitpos;
+ }
+
+ /**
+ * Shift the number in the array left by bits positions.
+ * @param array the number to shift, must have length elements
+ * @param length the number of entries in the array
+ * @param bits the number of bits to shift (0 <= bits < 32)
+ */
+ void shiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) {
+ if (length > 0 && bits != 0) {
+ for(int64_t i=0; i < length-1; ++i) {
+ array[i] = (array[i] << bits) | (array[i+1] >> (32 - bits));
+ }
+ array[length-1] <<= bits;
+ }
+ }
+
+ /**
+ * Shift the number in the array right by bits positions.
+ * @param array the number to shift, must have length elements
+ * @param length the number of entries in the array
+ * @param bits the number of bits to shift (0 <= bits < 32)
+ */
+ void shiftArrayRight(uint32_t* array, int64_t length, int64_t bits) {
+ if (length > 0 && bits != 0) {
+ for(int64_t i=length-1; i > 0; --i) {
+ array[i] = (array[i] >> bits) | (array[i-1] << (32 - bits));
+ }
+ array[0] >>= bits;
+ }
+ }
+
+ /**
+ * Fix the signs of the result and remainder at the end of the division
+ * based on the signs of the dividend and divisor.
+ */
+ void fixDivisionSigns(Int128 &result, Int128 &remainder,
+ bool dividendWasNegative, bool divisorWasNegative) {
+ if (dividendWasNegative != divisorWasNegative) {
+ result.negate();
+ }
+ if (dividendWasNegative) {
+ remainder.negate();
+ }
+ }
+
+ /**
+ * Build an Int128 from a list of ints.
+ */
+ void buildFromArray(Int128& value, uint32_t* array, int64_t length) {
+ switch (length) {
+ case 0:
+ value = 0;
+ break;
+ case 1:
+ value = array[0];
+ break;
+ case 2:
+ value = Int128(0, (static_cast<uint64_t>(array[0]) << 32) + array[1]);
+ break;
+ case 3:
+ value = Int128(array[0],
+ (static_cast<uint64_t>(array[1]) << 32) + array[2]);
+ break;
+ case 4:
+ value = Int128((static_cast<int64_t>(array[0]) << 32) + array[1],
+ (static_cast<uint64_t>(array[2]) << 32) + array[3]);
+ break;
+ case 5:
+ if (array[0] != 0) {
+ throw std::logic_error("Can't build Int128 with 5 ints.");
+ }
+ value = Int128((static_cast<int64_t>(array[1]) << 32) + array[2],
+ (static_cast<uint64_t>(array[3]) << 32) + array[4]);
+ break;
+ default:
+ throw std::logic_error("Unsupported length for building Int128");
+ }
+ }
+
+ /**
+ * Do a division where the divisor fits into a single 32 bit value.
+ */
+ Int128 singleDivide(uint32_t* dividend, int64_t dividendLength,
+ uint32_t divisor, Int128& remainder,
+ bool dividendWasNegative, bool divisorWasNegative) {
+ uint64_t r = 0;
+ uint32_t resultArray[5];
+ for(int64_t j=0; j < dividendLength; j++) {
+ r <<= 32;
+ r += dividend[j];
+ resultArray[j] = static_cast<uint32_t>(r / divisor);
+ r %= divisor;
+ }
+ Int128 result;
+ buildFromArray(result, resultArray, dividendLength);
+ remainder = static_cast<int64_t>(r);
+ fixDivisionSigns(result, remainder, dividendWasNegative,
+ divisorWasNegative);
+ return result;
+ }
+
+ Int128 Int128::divide(const Int128 &divisor, Int128 &remainder) const {
+ // Split the dividend and divisor into integer pieces so that we can
+ // work on them.
+ uint32_t dividendArray[5];
+ uint32_t divisorArray[4];
+ bool dividendWasNegative;
+ bool divisorWasNegative;
+ // leave an extra zero before the dividend
+ dividendArray[0] = 0;
+ int64_t dividendLength = fillInArray(dividendArray + 1, dividendWasNegative)+1;
+ int64_t divisorLength = divisor.fillInArray(divisorArray, divisorWasNegative);
+
+ // Handle some of the easy cases.
+ if (dividendLength <= divisorLength) {
+ remainder = *this;
+ return 0;
+ } else if (divisorLength == 0) {
+ throw std::range_error("Division by 0 in Int128");
+ } else if (divisorLength == 1) {
+ return singleDivide(dividendArray, dividendLength, divisorArray[0],
+ remainder, dividendWasNegative, divisorWasNegative);
+ }
+
+ int64_t resultLength = dividendLength - divisorLength;
+ uint32_t resultArray[4];
+
+ // Normalize by shifting both by a multiple of 2 so that
+ // the digit guessing is better. The requirement is that
+ // divisorArray[0] is greater than 2**31.
+ int64_t normalizeBits = 32 - fls(divisorArray[0]);
+ shiftArrayLeft(divisorArray, divisorLength, normalizeBits);
+ shiftArrayLeft(dividendArray, dividendLength, normalizeBits);
+
+ // compute each digit in the result
+ for(int64_t j=0; j < resultLength; ++j) {
+ // Guess the next digit. At worst it is two too large
+ uint32_t guess = UINT32_MAX;
+ uint64_t highDividend = static_cast<uint64_t>(dividendArray[j]) << 32 |
+ dividendArray[j+1];
+ if (dividendArray[j] != divisorArray[0]) {
+ guess = static_cast<uint32_t>(highDividend / divisorArray[0]);
+ }
+
+ // catch all of the cases where guess is two too large and most of the
+ // cases where it is one too large
+ uint32_t rhat =
+ static_cast<uint32_t>(highDividend - guess *
+ static_cast<uint64_t>(divisorArray[0]));
+ while (static_cast<uint64_t>(divisorArray[1]) * guess >
+ (static_cast<uint64_t>(rhat) << 32) + dividendArray[j+2]) {
+ guess -= 1;
+ rhat += divisorArray[0];
+ if (static_cast<uint64_t>(rhat) < divisorArray[0]) {
+ break;
+ }
+ }
+
+ // subtract off the guess * divisor from the dividend
+ uint64_t mult = 0;
+ for(int64_t i=divisorLength-1; i >= 0; --i) {
+ mult += static_cast<uint64_t>(guess) * divisorArray[i];
+ uint32_t prev = dividendArray[j+i+1];
+ dividendArray[j+i+1] -= static_cast<uint32_t>(mult);
+ mult >>= 32;
+ if (dividendArray[j+i+1] > prev) {
+ mult += 1;
+ }
+ }
+ uint32_t prev = dividendArray[j];
+ dividendArray[j] -= static_cast<uint32_t>(mult);
+
+ // if guess was too big, we add back divisor
+ if (dividendArray[j] > prev) {
+ guess -= 1;
+ uint32_t carry = 0;
+ for(int64_t i=divisorLength-1; i >= 0; --i) {
+ uint64_t sum = static_cast<uint64_t>(divisorArray[i]) +
+ dividendArray[j+i+1] + carry;
+ dividendArray[j+i+1] = static_cast<uint32_t>(sum);
+ carry = static_cast<uint32_t>(sum >> 32);
+ }
+ dividendArray[j] += carry;
+ }
+
+ resultArray[j] = guess;
+ }
+
+ // denormalize the remainder
+ shiftArrayRight(dividendArray, dividendLength, normalizeBits);
+
+ // return result and remainder
+ Int128 result;
+ buildFromArray(result, resultArray, resultLength);
+ buildFromArray(remainder, dividendArray, dividendLength);
+ fixDivisionSigns(result, remainder,
+ dividendWasNegative, divisorWasNegative);
+ return result;
+ }
+
+ std::string Int128::toString() const {
+ // 10**18 - the largest power of 10 less than 63 bits
+ const Int128 tenTo18(0xde0b6b3a7640000);
+ // 10**36
+ const Int128 tenTo36(0xc097ce7bc90715, 0xb34b9f1000000000);
+ Int128 remainder;
+ std::stringstream buf;
+ bool needFill = false;
+
+ // get anything above 10**36 and print it
+ Int128 top = divide(tenTo36, remainder);
+ if (top != 0) {
+ buf << top.toLong();
+ remainder.abs();
+ needFill = true;
+ }
+
+ // now get anything above 10**18 and print it
+ Int128 tail;
+ top = remainder.divide(tenTo18, tail);
+ if (needFill || top != 0) {
+ if (needFill) {
+ buf << std::setw(18) << std::setfill('0');
+ } else {
+ needFill = true;
+ tail.abs();
+ }
+ buf << top.toLong();
+ }
+
+ // finally print the tail, which is less than 10**18
+ if (needFill) {
+ buf << std::setw(18) << std::setfill('0');
+ }
+ buf << tail.toLong();
+ return buf.str();
+ }
+
+ std::string Int128::toDecimalString(int32_t scale) const {
+ std::string str = toString();
+ if (scale == 0) {
+ return str;
+ } else if (*this < 0) {
+ int32_t len = static_cast<int32_t>(str.length());
+ if (len - 1 > scale) {
+ return str.substr(0, static_cast<size_t>(len - scale)) + "." +
+ str.substr(static_cast<size_t>(len - scale),
+ static_cast<size_t>(scale));
+ } else if (len - 1 == scale) {
+ return "-0." + str.substr(1, std::string::npos);
+ } else {
+ std::string result = "-0.";
+ for(int32_t i=0; i < scale - len + 1; ++i) {
+ result += "0";
+ }
+ return result + str.substr(1, std::string::npos);
+ }
+ } else {
+ int32_t len = static_cast<int32_t>(str.length());
+ if (len > scale) {
+ return str.substr(0, static_cast<size_t>(len - scale)) + "." +
+ str.substr(static_cast<size_t>(len - scale),
+ static_cast<size_t>(scale));
+ } else if (len == scale) {
+ return "0." + str;
+ } else {
+ std::string result = "0.";
+ for(int32_t i=0; i < scale - len; ++i) {
+ result += "0";
+ }
+ return result + str;
+ }
+ }
+ }
+
+ std::string Int128::toHexString() const {
+ std::stringstream buf;
+ buf << std::hex << "0x"
+ << std::setw(16) << std::setfill('0') << highbits
+ << std::setw(16) << std::setfill('0') << lowbits;
+ return buf.str();
+ }
+
+ const static int32_t MAX_PRECISION_64 = 18;
+ const static int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1] =
+ {1,
+ 10,
+ 100,
+ 1000,
+ 10000,
+ 100000,
+ 1000000,
+ 10000000,
+ 100000000,
+ 1000000000,
+ 10000000000,
+ 100000000000,
+ 1000000000000,
+ 10000000000000,
+ 100000000000000,
+ 1000000000000000,
+ 10000000000000000,
+ 100000000000000000,
+ 1000000000000000000};
+
+ Int128 scaleUpInt128ByPowerOfTen(Int128 value,
+ int32_t power,
+ bool &overflow) {
+ overflow = false;
+ Int128 remainder;
+
+ while (power > 0) {
+ int32_t step = std::min(power, MAX_PRECISION_64);
+ if (value > 0 && Int128::maximumValue().divide(POWERS_OF_TEN[step], remainder) < value) {
+ overflow = true;
+ return Int128::maximumValue();
+ } else if (value < 0 && Int128::minimumValue().divide(POWERS_OF_TEN[step], remainder) > value) {
+ overflow = true;
+ return Int128::minimumValue();
+ }
+
+ value *= POWERS_OF_TEN[step];
+ power -= step;
+ }
+
+ return value;
+ }
+
+ Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power) {
+ Int128 remainder;
+ while (power > 0) {
+ int32_t step = std::min(std::abs(power), MAX_PRECISION_64);
+ value = value.divide(POWERS_OF_TEN[step], remainder);
+ power -= step;
+ }
+ return value;
+ }
+
+}
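
A minimal usage sketch for the Int128 helpers restored above (not part of the diff itself): it assumes the declarations in orc/Int128.hh, including the free function scaleUpInt128ByPowerOfTen, and linking against the ORC C++ library.

#include "orc/Int128.hh"
#include <iostream>

int main() {
  // The string constructor parses the digits in 18-digit chunks.
  orc::Int128 value("123456789012345678901234567");

  // Rescale by 10**3; overflow is reported through the flag instead of wrapping.
  bool overflow = false;
  orc::Int128 scaled = orc::scaleUpInt128ByPowerOfTen(value, 3, overflow);

  // toDecimalString(5) places the decimal point five digits from the right.
  std::cout << scaled.toDecimalString(5) << "\n"
            << scaled.toHexString() << "\n"
            << (overflow ? "overflow" : "ok") << "\n";
  return 0;
}

toDecimalString only inserts the decimal point; adjusting the stored integer for a new scale is the caller's job, which is why the scale helpers report overflow explicitly.
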
diff --git a/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc b/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc
index d1ba183aeb..7bf91dee13 100644
--- a/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc
+++ b/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc
@@ -1,391 +1,391 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Adaptor.hh"
-#include "Compression.hh"
-#include "orc/Exceptions.hh"
-
-#include <string>
-
-namespace orc {
-
- static const int32_t DEC_32_TABLE[] = {4, 1, 2, 1, 4, 4, 4, 4};
- static const int32_t DEC_64_TABLE[] = {0, 0, 0, -1, 0, 1, 2, 3};
-
- static const int32_t SIZE_OF_SHORT = 2;
- static const int32_t SIZE_OF_INT = 4;
- static const int32_t SIZE_OF_LONG = 8;
-
- static std::string toHex(uint64_t val) {
- std::ostringstream out;
- out << "0x" << std::hex << val;
- return out.str();
- }
-
- static std::string toString(int64_t val) {
- std::ostringstream out;
- out << val;
- return out.str();
- }
-
- class MalformedInputException: public ParseError {
- public:
- MalformedInputException(int64_t off
- ) :ParseError("MalformedInputException at " +
- toString(off)) {
- }
-
- MalformedInputException(int64_t off, const std::string& msg
- ): ParseError("MalformedInputException " + msg +
- " at " + toString(off)) {
- }
-
- MalformedInputException(const MalformedInputException& other
- ): ParseError(other.what()) {
- }
-
- virtual ~MalformedInputException() noexcept;
- };
-
- MalformedInputException::~MalformedInputException() noexcept {
- // PASS
- }
-
- uint64_t lzoDecompress(const char *inputAddress,
- const char *inputLimit,
- char *outputAddress,
- char *outputLimit) {
- // nothing compresses to nothing
- if (inputAddress == inputLimit) {
- return 0;
- }
-
- // maximum offset in buffers to which it's safe to write long-at-a-time
- char * const fastOutputLimit = outputLimit - SIZE_OF_LONG;
-
- // LZO can concatenate two blocks together, so decode until the input data is
- // consumed
- const char *input = inputAddress;
- char *output = outputAddress;
- while (input < inputLimit) {
- //
- // Note: For safety some of the code below may stop decoding early or
- // skip decoding, because input is not available. This makes the code
- // safe, and since LZO requires an explicit "stop" command, the decoder
- // will still throw an exception.
- //
-
- bool firstCommand = true;
- uint32_t lastLiteralLength = 0;
- while (true) {
- if (input >= inputLimit) {
- throw MalformedInputException(input - inputAddress);
- }
- uint32_t command = *(input++) & 0xFF;
- if (command == 0x11) {
- break;
- }
-
- // Commands are described using a bit pattern notation:
- // 0: bit is not set
- // 1: bit is set
- // L: part of literal length
- // P: part of match offset position
- // M: part of match length
- // ?: see documentation in command decoder
-
- int32_t matchLength;
- int32_t matchOffset;
- uint32_t literalLength;
- if ((command & 0xf0) == 0) {
- if (lastLiteralLength == 0) {
- // 0b0000_LLLL (0bLLLL_LLLL)*
-
- // copy offset :: fixed
- // 0
- matchOffset = 0;
-
- // copy length :: fixed
- // 0
- matchLength = 0;
-
- // literal length - 3 :: variable bits :: valid range [4..]
- // 3 + variableLength(command bits [0..3], 4)
- literalLength = command & 0xf;
- if (literalLength == 0) {
- literalLength = 0xf;
-
- uint32_t nextByte = 0;
- while (input < inputLimit &&
- (nextByte = *(input++) & 0xFF) == 0) {
- literalLength += 0xff;
- }
- literalLength += nextByte;
- }
- literalLength += 3;
- } else if (lastLiteralLength <= 3) {
- // 0b0000_PPLL 0bPPPP_PPPP
-
- // copy length: fixed
- // 3
- matchLength = 3;
-
- // copy offset :: 12 bits :: valid range [2048..3071]
- // [0..1] from command [2..3]
- // [2..9] from trailer [0..7]
- // [10] unset
- // [11] set
- if (input >= inputLimit) {
- throw MalformedInputException(input - inputAddress);
- }
- matchOffset = (command & 0xc) >> 2;
- matchOffset |= (*(input++) & 0xFF) << 2;
- matchOffset |= 0x800;
-
- // literal length :: 2 bits :: valid range [0..3]
- // [0..1] from command [0..1]
- literalLength = (command & 0x3);
- } else {
- // 0b0000_PPLL 0bPPPP_PPPP
-
- // copy length :: fixed
- // 2
- matchLength = 2;
-
- // copy offset :: 10 bits :: valid range [0..1023]
- // [0..1] from command [2..3]
- // [2..9] from trailer [0..7]
- if (input >= inputLimit) {
- throw MalformedInputException(input - inputAddress);
- }
- matchOffset = (command & 0xc) >> 2;
- matchOffset |= (*(input++) & 0xFF) << 2;
-
- // literal length :: 2 bits :: valid range [0..3]
- // [0..1] from command [0..1]
- literalLength = (command & 0x3);
- }
- } else if (firstCommand) {
- // first command has special handling when high nibble is set
- matchLength = 0;
- matchOffset = 0;
- literalLength = command - 17;
- } else if ((command & 0xf0) == 0x10) {
- // 0b0001_?MMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL
-
- // copy length - 2 :: variable bits :: valid range [3..]
- // 2 + variableLength(command bits [0..2], 3)
- matchLength = command & 0x7;
- if (matchLength == 0) {
- matchLength = 0x7;
-
- int32_t nextByte = 0;
- while (input < inputLimit &&
- (nextByte = *(input++) & 0xFF) == 0) {
- matchLength += 0xff;
- }
- matchLength += nextByte;
- }
- matchLength += 2;
-
- // read trailer
- if (input + SIZE_OF_SHORT > inputLimit) {
- throw MalformedInputException(input - inputAddress);
- }
- uint32_t trailer = *reinterpret_cast<const uint16_t*>(input) & 0xFFFF;
- input += SIZE_OF_SHORT;
-
- // copy offset :: 16 bits :: valid range [32767..49151]
- // [0..13] from trailer [2..15]
- // [14] if command bit [3] unset
- // [15] if command bit [3] set
- matchOffset = trailer >> 2;
- if ((command & 0x8) == 0) {
- matchOffset |= 0x4000;
- } else {
- matchOffset |= 0x8000;
- }
- matchOffset--;
-
- // literal length :: 2 bits :: valid range [0..3]
- // [0..1] from trailer [0..1]
- literalLength = trailer & 0x3;
- } else if ((command & 0xe0) == 0x20) {
- // 0b001M_MMMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL
-
- // copy length - 2 :: variable bits :: valid range [3..]
- // 2 + variableLength(command bits [0..4], 5)
- matchLength = command & 0x1f;
- if (matchLength == 0) {
- matchLength = 0x1f;
-
- int nextByte = 0;
- while (input < inputLimit &&
- (nextByte = *(input++) & 0xFF) == 0) {
- matchLength += 0xff;
- }
- matchLength += nextByte;
- }
- matchLength += 2;
-
- // read trailer
- if (input + SIZE_OF_SHORT > inputLimit) {
- throw MalformedInputException(input - inputAddress);
- }
- int32_t trailer = *reinterpret_cast<const int16_t*>(input) & 0xFFFF;
- input += SIZE_OF_SHORT;
-
- // copy offset :: 14 bits :: valid range [0..16383]
- // [0..13] from trailer [2..15]
- matchOffset = trailer >> 2;
-
- // literal length :: 2 bits :: valid range [0..3]
- // [0..1] from trailer [0..1]
- literalLength = trailer & 0x3;
- } else if ((command & 0xc0) != 0) {
- // 0bMMMP_PPLL 0bPPPP_PPPP
-
- // copy length - 1 :: 3 bits :: valid range [1..8]
- // [0..2] from command [5..7]
- // add 1
- matchLength = (command & 0xe0) >> 5;
- matchLength += 1;
-
- // copy offset :: 11 bits :: valid range [0..4095]
- // [0..2] from command [2..4]
- // [3..10] from trailer [0..7]
- if (input >= inputLimit) {
- throw MalformedInputException(input - inputAddress);
- }
- matchOffset = (command & 0x1c) >> 2;
- matchOffset |= (*(input++) & 0xFF) << 3;
-
- // literal length :: 2 bits :: valid range [0..3]
- // [0..1] from command [0..1]
- literalLength = (command & 0x3);
- } else {
- throw MalformedInputException(input - inputAddress - 1,
- "Invalid LZO command " +
- toHex(command));
- }
- firstCommand = false;
-
- // copy match
- if (matchLength != 0) {
- // lzo encodes match offset minus one
- matchOffset++;
-
- char *matchAddress = output - matchOffset;
- if (matchAddress < outputAddress ||
- output + matchLength > outputLimit) {
- throw MalformedInputException(input - inputAddress);
- }
- char *matchOutputLimit = output + matchLength;
-
- if (output > fastOutputLimit) {
- // slow match copy
- while (output < matchOutputLimit) {
- *(output++) = *(matchAddress++);
- }
- } else {
- // copy repeated sequence
- if (matchOffset < SIZE_OF_LONG) {
- // 8 bytes apart so that we can copy long-at-a-time below
- int32_t increment32 = DEC_32_TABLE[matchOffset];
- int32_t decrement64 = DEC_64_TABLE[matchOffset];
-
- output[0] = *matchAddress;
- output[1] = *(matchAddress + 1);
- output[2] = *(matchAddress + 2);
- output[3] = *(matchAddress + 3);
- output += SIZE_OF_INT;
- matchAddress += increment32;
-
- *reinterpret_cast<int32_t*>(output) =
- *reinterpret_cast<int32_t*>(matchAddress);
- output += SIZE_OF_INT;
- matchAddress -= decrement64;
- } else {
- *reinterpret_cast<int64_t*>(output) =
- *reinterpret_cast<int64_t*>(matchAddress);
- matchAddress += SIZE_OF_LONG;
- output += SIZE_OF_LONG;
- }
-
- if (matchOutputLimit >= fastOutputLimit) {
- if (matchOutputLimit > outputLimit) {
- throw MalformedInputException(input - inputAddress);
- }
-
- while (output < fastOutputLimit) {
- *reinterpret_cast<int64_t*>(output) =
- *reinterpret_cast<int64_t*>(matchAddress);
- matchAddress += SIZE_OF_LONG;
- output += SIZE_OF_LONG;
- }
-
- while (output < matchOutputLimit) {
- *(output++) = *(matchAddress++);
- }
- } else {
- while (output < matchOutputLimit) {
- *reinterpret_cast<int64_t*>(output) =
- *reinterpret_cast<int64_t*>(matchAddress);
- matchAddress += SIZE_OF_LONG;
- output += SIZE_OF_LONG;
- }
- }
- }
- output = matchOutputLimit; // correction in case we over-copied
- }
-
- // copy literal
- char *literalOutputLimit = output + literalLength;
- if (literalOutputLimit > fastOutputLimit ||
- input + literalLength > inputLimit - SIZE_OF_LONG) {
- if (literalOutputLimit > outputLimit) {
- throw MalformedInputException(input - inputAddress);
- }
-
- // slow, precise copy
- memcpy(output, input, literalLength);
- input += literalLength;
- output += literalLength;
- } else {
- // fast copy. We may over-copy but there's enough room in input
- // and output to not overrun them
- do {
- *reinterpret_cast<int64_t*>(output) =
- *reinterpret_cast<const int64_t*>(input);
- input += SIZE_OF_LONG;
- output += SIZE_OF_LONG;
- } while (output < literalOutputLimit);
- // adjust index if we over-copied
- input -= (output - literalOutputLimit);
- output = literalOutputLimit;
- }
- lastLiteralLength = literalLength;
- }
-
- if (input + SIZE_OF_SHORT > inputLimit &&
- *reinterpret_cast<const int16_t*>(input) != 0) {
- throw MalformedInputException(input - inputAddress);
- }
- input += SIZE_OF_SHORT;
- }
-
- return static_cast<uint64_t>(output - outputAddress);
- }
-
-}
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Adaptor.hh"
+#include "Compression.hh"
+#include "orc/Exceptions.hh"
+
+#include <string>
+
+namespace orc {
+
+ static const int32_t DEC_32_TABLE[] = {4, 1, 2, 1, 4, 4, 4, 4};
+ static const int32_t DEC_64_TABLE[] = {0, 0, 0, -1, 0, 1, 2, 3};
+
+ static const int32_t SIZE_OF_SHORT = 2;
+ static const int32_t SIZE_OF_INT = 4;
+ static const int32_t SIZE_OF_LONG = 8;
+
+ static std::string toHex(uint64_t val) {
+ std::ostringstream out;
+ out << "0x" << std::hex << val;
+ return out.str();
+ }
+
+ static std::string toString(int64_t val) {
+ std::ostringstream out;
+ out << val;
+ return out.str();
+ }
+
+ class MalformedInputException: public ParseError {
+ public:
+ MalformedInputException(int64_t off
+ ) :ParseError("MalformedInputException at " +
+ toString(off)) {
+ }
+
+ MalformedInputException(int64_t off, const std::string& msg
+ ): ParseError("MalformedInputException " + msg +
+ " at " + toString(off)) {
+ }
+
+ MalformedInputException(const MalformedInputException& other
+ ): ParseError(other.what()) {
+ }
+
+ virtual ~MalformedInputException() noexcept;
+ };
+
+ MalformedInputException::~MalformedInputException() noexcept {
+ // PASS
+ }
+
+ uint64_t lzoDecompress(const char *inputAddress,
+ const char *inputLimit,
+ char *outputAddress,
+ char *outputLimit) {
+ // nothing compresses to nothing
+ if (inputAddress == inputLimit) {
+ return 0;
+ }
+
+ // maximum offset in buffers to which it's safe to write long-at-a-time
+ char * const fastOutputLimit = outputLimit - SIZE_OF_LONG;
+
+ // LZO can concatenate two blocks together, so decode until the input data is
+ // consumed
+ const char *input = inputAddress;
+ char *output = outputAddress;
+ while (input < inputLimit) {
+ //
+ // Note: For safety some of the code below may stop decoding early or
+ // skip decoding, because input is not available. This makes the code
+ // safe, and since LZO requires an explicit "stop" command, the decoder
+ // will still throw an exception.
+ //
+
+ bool firstCommand = true;
+ uint32_t lastLiteralLength = 0;
+ while (true) {
+ if (input >= inputLimit) {
+ throw MalformedInputException(input - inputAddress);
+ }
+ uint32_t command = *(input++) & 0xFF;
+ if (command == 0x11) {
+ break;
+ }
+
+ // Commands are described using a bit pattern notation:
+ // 0: bit is not set
+ // 1: bit is set
+ // L: part of literal length
+ // P: part of match offset position
+ // M: part of match length
+ // ?: see documentation in command decoder
+
+ int32_t matchLength;
+ int32_t matchOffset;
+ uint32_t literalLength;
+ if ((command & 0xf0) == 0) {
+ if (lastLiteralLength == 0) {
+ // 0b0000_LLLL (0bLLLL_LLLL)*
+
+ // copy offset :: fixed
+ // 0
+ matchOffset = 0;
+
+ // copy length :: fixed
+ // 0
+ matchLength = 0;
+
+ // literal length - 3 :: variable bits :: valid range [4..]
+ // 3 + variableLength(command bits [0..3], 4)
+ literalLength = command & 0xf;
+ if (literalLength == 0) {
+ literalLength = 0xf;
+
+ uint32_t nextByte = 0;
+ while (input < inputLimit &&
+ (nextByte = *(input++) & 0xFF) == 0) {
+ literalLength += 0xff;
+ }
+ literalLength += nextByte;
+ }
+ literalLength += 3;
+ } else if (lastLiteralLength <= 3) {
+ // 0b0000_PPLL 0bPPPP_PPPP
+
+ // copy length: fixed
+ // 3
+ matchLength = 3;
+
+ // copy offset :: 12 bits :: valid range [2048..3071]
+ // [0..1] from command [2..3]
+ // [2..9] from trailer [0..7]
+ // [10] unset
+ // [11] set
+ if (input >= inputLimit) {
+ throw MalformedInputException(input - inputAddress);
+ }
+ matchOffset = (command & 0xc) >> 2;
+ matchOffset |= (*(input++) & 0xFF) << 2;
+ matchOffset |= 0x800;
+
+ // literal length :: 2 bits :: valid range [0..3]
+ // [0..1] from command [0..1]
+ literalLength = (command & 0x3);
+ } else {
+ // 0b0000_PPLL 0bPPPP_PPPP
+
+ // copy length :: fixed
+ // 2
+ matchLength = 2;
+
+ // copy offset :: 10 bits :: valid range [0..1023]
+ // [0..1] from command [2..3]
+ // [2..9] from trailer [0..7]
+ if (input >= inputLimit) {
+ throw MalformedInputException(input - inputAddress);
+ }
+ matchOffset = (command & 0xc) >> 2;
+ matchOffset |= (*(input++) & 0xFF) << 2;
+
+ // literal length :: 2 bits :: valid range [0..3]
+ // [0..1] from command [0..1]
+ literalLength = (command & 0x3);
+ }
+ } else if (firstCommand) {
+ // first command has special handling when high nibble is set
+ matchLength = 0;
+ matchOffset = 0;
+ literalLength = command - 17;
+ } else if ((command & 0xf0) == 0x10) {
+ // 0b0001_?MMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL
+
+ // copy length - 2 :: variable bits :: valid range [3..]
+ // 2 + variableLength(command bits [0..2], 3)
+ matchLength = command & 0x7;
+ if (matchLength == 0) {
+ matchLength = 0x7;
+
+ int32_t nextByte = 0;
+ while (input < inputLimit &&
+ (nextByte = *(input++) & 0xFF) == 0) {
+ matchLength += 0xff;
+ }
+ matchLength += nextByte;
+ }
+ matchLength += 2;
+
+ // read trailer
+ if (input + SIZE_OF_SHORT > inputLimit) {
+ throw MalformedInputException(input - inputAddress);
+ }
+ uint32_t trailer = *reinterpret_cast<const uint16_t*>(input) & 0xFFFF;
+ input += SIZE_OF_SHORT;
+
+ // copy offset :: 16 bits :: valid range [32767..49151]
+ // [0..13] from trailer [2..15]
+ // [14] if command bit [3] unset
+ // [15] if command bit [3] set
+ matchOffset = trailer >> 2;
+ if ((command & 0x8) == 0) {
+ matchOffset |= 0x4000;
+ } else {
+ matchOffset |= 0x8000;
+ }
+ matchOffset--;
+
+ // literal length :: 2 bits :: valid range [0..3]
+ // [0..1] from trailer [0..1]
+ literalLength = trailer & 0x3;
+ } else if ((command & 0xe0) == 0x20) {
+ // 0b001M_MMMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL
+
+ // copy length - 2 :: variable bits :: valid range [3..]
+ // 2 + variableLength(command bits [0..4], 5)
+ matchLength = command & 0x1f;
+ if (matchLength == 0) {
+ matchLength = 0x1f;
+
+ int nextByte = 0;
+ while (input < inputLimit &&
+ (nextByte = *(input++) & 0xFF) == 0) {
+ matchLength += 0xff;
+ }
+ matchLength += nextByte;
+ }
+ matchLength += 2;
+
+ // read trailer
+ if (input + SIZE_OF_SHORT > inputLimit) {
+ throw MalformedInputException(input - inputAddress);
+ }
+ int32_t trailer = *reinterpret_cast<const int16_t*>(input) & 0xFFFF;
+ input += SIZE_OF_SHORT;
+
+ // copy offset :: 14 bits :: valid range [0..16383]
+ // [0..13] from trailer [2..15]
+ matchOffset = trailer >> 2;
+
+ // literal length :: 2 bits :: valid range [0..3]
+ // [0..1] from trailer [0..1]
+ literalLength = trailer & 0x3;
+ } else if ((command & 0xc0) != 0) {
+ // 0bMMMP_PPLL 0bPPPP_PPPP
+
+ // copy length - 1 :: 3 bits :: valid range [1..8]
+ // [0..2] from command [5..7]
+ // add 1
+ matchLength = (command & 0xe0) >> 5;
+ matchLength += 1;
+
+ // copy offset :: 11 bits :: valid range [0..4095]
+ // [0..2] from command [2..4]
+ // [3..10] from trailer [0..7]
+ if (input >= inputLimit) {
+ throw MalformedInputException(input - inputAddress);
+ }
+ matchOffset = (command & 0x1c) >> 2;
+ matchOffset |= (*(input++) & 0xFF) << 3;
+
+ // literal length :: 2 bits :: valid range [0..3]
+ // [0..1] from command [0..1]
+ literalLength = (command & 0x3);
+ } else {
+ throw MalformedInputException(input - inputAddress - 1,
+ "Invalid LZO command " +
+ toHex(command));
+ }
+ firstCommand = false;
+
+ // copy match
+ if (matchLength != 0) {
+ // lzo encodes match offset minus one
+ matchOffset++;
+
+ char *matchAddress = output - matchOffset;
+ if (matchAddress < outputAddress ||
+ output + matchLength > outputLimit) {
+ throw MalformedInputException(input - inputAddress);
+ }
+ char *matchOutputLimit = output + matchLength;
+
+ if (output > fastOutputLimit) {
+ // slow match copy
+ while (output < matchOutputLimit) {
+ *(output++) = *(matchAddress++);
+ }
+ } else {
+ // copy repeated sequence
+ if (matchOffset < SIZE_OF_LONG) {
+ // 8 bytes apart so that we can copy long-at-a-time below
+ int32_t increment32 = DEC_32_TABLE[matchOffset];
+ int32_t decrement64 = DEC_64_TABLE[matchOffset];
+
+ output[0] = *matchAddress;
+ output[1] = *(matchAddress + 1);
+ output[2] = *(matchAddress + 2);
+ output[3] = *(matchAddress + 3);
+ output += SIZE_OF_INT;
+ matchAddress += increment32;
+
+ *reinterpret_cast<int32_t*>(output) =
+ *reinterpret_cast<int32_t*>(matchAddress);
+ output += SIZE_OF_INT;
+ matchAddress -= decrement64;
+ } else {
+ *reinterpret_cast<int64_t*>(output) =
+ *reinterpret_cast<int64_t*>(matchAddress);
+ matchAddress += SIZE_OF_LONG;
+ output += SIZE_OF_LONG;
+ }
+
+ if (matchOutputLimit >= fastOutputLimit) {
+ if (matchOutputLimit > outputLimit) {
+ throw MalformedInputException(input - inputAddress);
+ }
+
+ while (output < fastOutputLimit) {
+ *reinterpret_cast<int64_t*>(output) =
+ *reinterpret_cast<int64_t*>(matchAddress);
+ matchAddress += SIZE_OF_LONG;
+ output += SIZE_OF_LONG;
+ }
+
+ while (output < matchOutputLimit) {
+ *(output++) = *(matchAddress++);
+ }
+ } else {
+ while (output < matchOutputLimit) {
+ *reinterpret_cast<int64_t*>(output) =
+ *reinterpret_cast<int64_t*>(matchAddress);
+ matchAddress += SIZE_OF_LONG;
+ output += SIZE_OF_LONG;
+ }
+ }
+ }
+ output = matchOutputLimit; // correction in case we over-copied
+ }
+
+ // copy literal
+ char *literalOutputLimit = output + literalLength;
+ if (literalOutputLimit > fastOutputLimit ||
+ input + literalLength > inputLimit - SIZE_OF_LONG) {
+ if (literalOutputLimit > outputLimit) {
+ throw MalformedInputException(input - inputAddress);
+ }
+
+ // slow, precise copy
+ memcpy(output, input, literalLength);
+ input += literalLength;
+ output += literalLength;
+ } else {
+ // fast copy. We may over-copy but there's enough room in input
+ // and output to not overrun them
+ do {
+ *reinterpret_cast<int64_t*>(output) =
+ *reinterpret_cast<const int64_t*>(input);
+ input += SIZE_OF_LONG;
+ output += SIZE_OF_LONG;
+ } while (output < literalOutputLimit);
+ // adjust index if we over-copied
+ input -= (output - literalOutputLimit);
+ output = literalOutputLimit;
+ }
+ lastLiteralLength = literalLength;
+ }
+
+ if (input + SIZE_OF_SHORT > inputLimit &&
+ *reinterpret_cast<const int16_t*>(input) != 0) {
+ throw MalformedInputException(input - inputAddress);
+ }
+ input += SIZE_OF_SHORT;
+ }
+
+ return static_cast<uint64_t>(output - outputAddress);
+ }
+
+}
diff --git a/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh b/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh
index 9de8537dd8..32d8085174 100644
--- a/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh
+++ b/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh
@@ -1,42 +1,42 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ORC_LZO_HH
-#define ORC_LZO_HH
-
-#include "orc/OrcFile.hh"
-
-#include "Adaptor.hh"
-
-namespace orc {
-
- /**
- * Decompress the bytes into the output buffer.
- * @param inputAddress the start of the input
- * @param inputLimit one past the last byte of the input
- * @param outputAddress the start of the output buffer
- * @param outputLimit one past the last byte of the output buffer
- * @result the number of bytes decompressed
- */
- uint64_t lzoDecompress(const char *inputAddress,
- const char *inputLimit,
- char *outputAddress,
- char *outputLimit);
-}
-
-#endif
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_LZO_HH
+#define ORC_LZO_HH
+
+#include "orc/OrcFile.hh"
+
+#include "Adaptor.hh"
+
+namespace orc {
+
+ /**
+ * Decompress the bytes into the output buffer.
+ * @param inputAddress the start of the input
+ * @param inputLimit one past the last byte of the input
+ * @param outputAddress the start of the output buffer
+ * @param outputLimit one past the last byte of the output buffer
+ * @result the number of bytes decompressed
+ */
+ uint64_t lzoDecompress(const char *inputAddress,
+ const char *inputLimit,
+ char *outputAddress,
+ char *outputLimit);
+}
+
+#endif
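
A minimal call sketch for the lzoDecompress entry point declared above (not part of the diff): it assumes compilation inside the ORC source tree where LzoDecompressor.hh is visible, and that the compressed block and its decompressed size are already known from the ORC stream being read; malformed input surfaces as an orc::ParseError.

#include "LzoDecompressor.hh"

#include <cstdint>
#include <vector>

std::vector<char> decompressBlock(const std::vector<char>& compressed,
                                  uint64_t decompressedSize) {
  std::vector<char> output(decompressedSize);
  // Both limit pointers point one past the last valid byte, matching the
  // doc comment; the return value is the number of bytes actually written.
  uint64_t produced = orc::lzoDecompress(compressed.data(),
                                         compressed.data() + compressed.size(),
                                         output.data(),
                                         output.data() + output.size());
  output.resize(produced);
  return output;
}
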
diff --git a/contrib/libs/apache/orc/c++/src/MemoryPool.cc b/contrib/libs/apache/orc/c++/src/MemoryPool.cc
index ecfb295bae..178e9cc316 100644
--- a/contrib/libs/apache/orc/c++/src/MemoryPool.cc
+++ b/contrib/libs/apache/orc/c++/src/MemoryPool.cc
@@ -1,244 +1,244 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "orc/Int128.hh"
-#include "orc/MemoryPool.hh"
-
-#include "Adaptor.hh"
-
-#include <cstdlib>
-#include <iostream>
-#include <string.h>
-
-namespace orc {
-
- MemoryPool::~MemoryPool() {
- // PASS
- }
-
- class MemoryPoolImpl: public MemoryPool {
- public:
- virtual ~MemoryPoolImpl() override;
-
- char* malloc(uint64_t size) override;
- void free(char* p) override;
- };
-
- char* MemoryPoolImpl::malloc(uint64_t size) {
- return static_cast<char*>(std::malloc(size));
- }
-
- void MemoryPoolImpl::free(char* p) {
- std::free(p);
- }
-
- MemoryPoolImpl::~MemoryPoolImpl() {
- // PASS
- }
-
- template <class T>
- DataBuffer<T>::DataBuffer(MemoryPool& pool,
- uint64_t newSize
- ): memoryPool(pool),
- buf(nullptr),
- currentSize(0),
- currentCapacity(0) {
- resize(newSize);
- }
-
- template <class T>
- DataBuffer<T>::DataBuffer(DataBuffer<T>&& buffer
- ) noexcept:
- memoryPool(buffer.memoryPool),
- buf(buffer.buf),
- currentSize(buffer.currentSize),
- currentCapacity(buffer.currentCapacity) {
- buffer.buf = nullptr;
- buffer.currentSize = 0;
- buffer.currentCapacity = 0;
- }
-
- template <class T>
- DataBuffer<T>::~DataBuffer(){
- for(uint64_t i=currentSize; i > 0; --i) {
- (buf + i - 1)->~T();
- }
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
- }
- }
-
- template <class T>
- void DataBuffer<T>::resize(uint64_t newSize) {
- reserve(newSize);
- if (currentSize > newSize) {
- for(uint64_t i=currentSize; i > newSize; --i) {
- (buf + i - 1)->~T();
- }
- } else if (newSize > currentSize) {
- for(uint64_t i=currentSize; i < newSize; ++i) {
- new (buf + i) T();
- }
- }
- currentSize = newSize;
- }
-
- template <class T>
- void DataBuffer<T>::reserve(uint64_t newCapacity){
- if (newCapacity > currentCapacity || !buf) {
- if (buf) {
- T* buf_old = buf;
- buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity));
- memcpy(buf, buf_old, sizeof(T) * currentSize);
- memoryPool.free(reinterpret_cast<char*>(buf_old));
- } else {
- buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity));
- }
- currentCapacity = newCapacity;
- }
- }
-
- // Specializations for char
-
- template <>
- DataBuffer<char>::~DataBuffer(){
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
- }
- }
-
- template <>
- void DataBuffer<char>::resize(uint64_t newSize) {
- reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, newSize - currentSize);
- }
- currentSize = newSize;
- }
-
- // Specializations for char*
-
- template <>
- DataBuffer<char*>::~DataBuffer(){
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
- }
- }
-
- template <>
- void DataBuffer<char*>::resize(uint64_t newSize) {
- reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(char*));
- }
- currentSize = newSize;
- }
-
- // Specializations for double
-
- template <>
- DataBuffer<double>::~DataBuffer(){
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
- }
- }
-
- template <>
- void DataBuffer<double>::resize(uint64_t newSize) {
- reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(double));
- }
- currentSize = newSize;
- }
-
- // Specializations for int64_t
-
- template <>
- DataBuffer<int64_t>::~DataBuffer(){
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
- }
- }
-
- template <>
- void DataBuffer<int64_t>::resize(uint64_t newSize) {
- reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int64_t));
- }
- currentSize = newSize;
- }
-
- // Specializations for uint64_t
-
- template <>
- DataBuffer<uint64_t>::~DataBuffer(){
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
- }
- }
-
- template <>
- void DataBuffer<uint64_t>::resize(uint64_t newSize) {
- reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(uint64_t));
- }
- currentSize = newSize;
- }
-
- // Specializations for unsigned char
-
- template <>
- DataBuffer<unsigned char>::~DataBuffer(){
- if (buf) {
- memoryPool.free(reinterpret_cast<char*>(buf));
- }
- }
-
- template <>
- void DataBuffer<unsigned char>::resize(uint64_t newSize) {
- reserve(newSize);
- if (newSize > currentSize) {
- memset(buf + currentSize, 0, newSize - currentSize);
- }
- currentSize = newSize;
- }
-
- #ifdef __clang__
- #pragma clang diagnostic ignored "-Wweak-template-vtables"
- #endif
-
- template class DataBuffer<char>;
- template class DataBuffer<char*>;
- template class DataBuffer<double>;
- template class DataBuffer<Int128>;
- template class DataBuffer<int64_t>;
- template class DataBuffer<uint64_t>;
- template class DataBuffer<unsigned char>;
-
- #ifdef __clang__
- #pragma clang diagnostic ignored "-Wexit-time-destructors"
- #endif
-
- MemoryPool* getDefaultPool() {
- static MemoryPoolImpl internal;
- return &internal;
- }
-} // namespace orc
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Int128.hh"
+#include "orc/MemoryPool.hh"
+
+#include "Adaptor.hh"
+
+#include <cstdlib>
+#include <iostream>
+#include <string.h>
+
+namespace orc {
+
+ MemoryPool::~MemoryPool() {
+ // PASS
+ }
+
+ class MemoryPoolImpl: public MemoryPool {
+ public:
+ virtual ~MemoryPoolImpl() override;
+
+ char* malloc(uint64_t size) override;
+ void free(char* p) override;
+ };
+
+ char* MemoryPoolImpl::malloc(uint64_t size) {
+ return static_cast<char*>(std::malloc(size));
+ }
+
+ void MemoryPoolImpl::free(char* p) {
+ std::free(p);
+ }
+
+ MemoryPoolImpl::~MemoryPoolImpl() {
+ // PASS
+ }
+
+ template <class T>
+ DataBuffer<T>::DataBuffer(MemoryPool& pool,
+ uint64_t newSize
+ ): memoryPool(pool),
+ buf(nullptr),
+ currentSize(0),
+ currentCapacity(0) {
+ resize(newSize);
+ }
+
+ template <class T>
+ DataBuffer<T>::DataBuffer(DataBuffer<T>&& buffer
+ ) noexcept:
+ memoryPool(buffer.memoryPool),
+ buf(buffer.buf),
+ currentSize(buffer.currentSize),
+ currentCapacity(buffer.currentCapacity) {
+ buffer.buf = nullptr;
+ buffer.currentSize = 0;
+ buffer.currentCapacity = 0;
+ }
+
+ template <class T>
+ DataBuffer<T>::~DataBuffer(){
+ for(uint64_t i=currentSize; i > 0; --i) {
+ (buf + i - 1)->~T();
+ }
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <class T>
+ void DataBuffer<T>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (currentSize > newSize) {
+ for(uint64_t i=currentSize; i > newSize; --i) {
+ (buf + i - 1)->~T();
+ }
+ } else if (newSize > currentSize) {
+ for(uint64_t i=currentSize; i < newSize; ++i) {
+ new (buf + i) T();
+ }
+ }
+ currentSize = newSize;
+ }
+
+ template <class T>
+ void DataBuffer<T>::reserve(uint64_t newCapacity){
+ if (newCapacity > currentCapacity || !buf) {
+ if (buf) {
+ T* buf_old = buf;
+ buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity));
+ memcpy(buf, buf_old, sizeof(T) * currentSize);
+ memoryPool.free(reinterpret_cast<char*>(buf_old));
+ } else {
+ buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity));
+ }
+ currentCapacity = newCapacity;
+ }
+ }
+
+ // Specializations for char
+
+ template <>
+ DataBuffer<char>::~DataBuffer(){
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<char>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, newSize - currentSize);
+ }
+ currentSize = newSize;
+ }
+
+ // Specializations for char*
+
+ template <>
+ DataBuffer<char*>::~DataBuffer(){
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<char*>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(char*));
+ }
+ currentSize = newSize;
+ }
+
+ // Specializations for double
+
+ template <>
+ DataBuffer<double>::~DataBuffer(){
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<double>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(double));
+ }
+ currentSize = newSize;
+ }
+
+ // Specializations for int64_t
+
+ template <>
+ DataBuffer<int64_t>::~DataBuffer(){
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<int64_t>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int64_t));
+ }
+ currentSize = newSize;
+ }
+
+ // Specializations for uint64_t
+
+ template <>
+ DataBuffer<uint64_t>::~DataBuffer(){
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<uint64_t>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(uint64_t));
+ }
+ currentSize = newSize;
+ }
+
+ // Specializations for unsigned char
+
+ template <>
+ DataBuffer<unsigned char>::~DataBuffer(){
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<unsigned char>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, newSize - currentSize);
+ }
+ currentSize = newSize;
+ }
+
+ #ifdef __clang__
+ #pragma clang diagnostic ignored "-Wweak-template-vtables"
+ #endif
+
+ template class DataBuffer<char>;
+ template class DataBuffer<char*>;
+ template class DataBuffer<double>;
+ template class DataBuffer<Int128>;
+ template class DataBuffer<int64_t>;
+ template class DataBuffer<uint64_t>;
+ template class DataBuffer<unsigned char>;
+
+ #ifdef __clang__
+ #pragma clang diagnostic ignored "-Wexit-time-destructors"
+ #endif
+
+ MemoryPool* getDefaultPool() {
+ static MemoryPoolImpl internal;
+ return &internal;
+ }
+} // namespace orc
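
The restored MemoryPool.cc above pairs raw allocations from MemoryPool with explicit element lifetimes: DataBuffer<T>::resize() placement-constructs or destroys elements as needed, reserve() only grows the raw storage and memcpy's the old contents across, and the char/numeric specializations skip constructors in favour of memset. A minimal usage sketch against the public header, assuming only the documented DataBuffer members (size(), operator[]) and getDefaultPool():

#include "orc/MemoryPool.hh"

#include <cstdint>
#include <iostream>

int main() {
  orc::MemoryPool* pool = orc::getDefaultPool();   // process-wide default pool
  orc::DataBuffer<int64_t> values(*pool, 4);       // 4 elements, zero-initialized
  for (uint64_t i = 0; i < values.size(); ++i) {
    values[i] = static_cast<int64_t>(i) * 10;
  }
  values.resize(8);                                // new tail elements are zeroed
  std::cout << values[3] << " " << values[7] << "\n";  // prints "30 0"
  return 0;
}
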
diff --git a/contrib/libs/apache/orc/c++/src/Murmur3.cc b/contrib/libs/apache/orc/c++/src/Murmur3.cc
index b45bd6d492..63cf797a04 100644
--- a/contrib/libs/apache/orc/c++/src/Murmur3.cc
+++ b/contrib/libs/apache/orc/c++/src/Murmur3.cc
@@ -1,98 +1,98 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Adaptor.hh"
-#include "Murmur3.hh"
-
-#define ROTL64(x, r) ((x << r) | (x >> (64 - r)))
-
-namespace orc {
-
- inline uint64_t rotl64 ( uint64_t x, int8_t r ) {
- return (x << r) | (x >> (64 - r));
- }
-
- inline uint64_t Murmur3::fmix64(uint64_t value) {
- value ^= (value >> 33);
- value *= 0xff51afd7ed558ccdL;
- value ^= (value >> 33);
- value *= 0xc4ceb9fe1a85ec53L;
- value ^= (value >> 33);
- return value;
- }
-
- uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len) {
- return hash64(data, len, DEFAULT_SEED);
- }
-
- DIAGNOSTIC_PUSH
-
-#if defined(__clang__)
- DIAGNOSTIC_IGNORE("-Wimplicit-fallthrough")
-#endif
-
- uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len, uint32_t seed) {
- uint64_t h = seed;
- uint32_t blocks = len >> 3;
-
- const uint64_t* src = reinterpret_cast<const uint64_t*>(data);
- uint64_t c1 = 0x87c37b91114253d5L;
- uint64_t c2 = 0x4cf5ad432745937fL;
- for (uint32_t i = 0; i < blocks; i++) {
- uint64_t k = src[i];
- k *= c1;
- k = ROTL64(k, 31);
- k *= c2;
-
- h ^= k;
- h = ROTL64(h, 27);
- h = h * 5 + 0x52dce729;
- }
-
- uint64_t k = 0;
- uint32_t idx = blocks << 3;
- switch (len - idx) {
- case 7:
- k ^= static_cast<uint64_t>(data[idx + 6]) << 48;
- case 6:
- k ^= static_cast<uint64_t>(data[idx + 5]) << 40;
- case 5:
- k ^= static_cast<uint64_t>(data[idx + 4]) << 32;
- case 4:
- k ^= static_cast<uint64_t>(data[idx + 3]) << 24;
- case 3:
- k ^= static_cast<uint64_t>(data[idx + 2]) << 16;
- case 2:
- k ^= static_cast<uint64_t>(data[idx + 1]) << 8;
- case 1:
- k ^= static_cast<uint64_t>(data[idx + 0]);
-
- k *= c1;
- k = ROTL64(k, 31);
- k *= c2;
- h ^= k;
- }
-
- h ^= len;
- h = fmix64(h);
- return h;
- }
-
- DIAGNOSTIC_POP
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Adaptor.hh"
+#include "Murmur3.hh"
+
+#define ROTL64(x, r) ((x << r) | (x >> (64 - r)))
+
+namespace orc {
+
+ inline uint64_t rotl64 ( uint64_t x, int8_t r ) {
+ return (x << r) | (x >> (64 - r));
+ }
+
+ inline uint64_t Murmur3::fmix64(uint64_t value) {
+ value ^= (value >> 33);
+ value *= 0xff51afd7ed558ccdL;
+ value ^= (value >> 33);
+ value *= 0xc4ceb9fe1a85ec53L;
+ value ^= (value >> 33);
+ return value;
+ }
+
+ uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len) {
+ return hash64(data, len, DEFAULT_SEED);
+ }
+
+ DIAGNOSTIC_PUSH
+
+#if defined(__clang__)
+ DIAGNOSTIC_IGNORE("-Wimplicit-fallthrough")
+#endif
+
+ uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len, uint32_t seed) {
+ uint64_t h = seed;
+ uint32_t blocks = len >> 3;
+
+ const uint64_t* src = reinterpret_cast<const uint64_t*>(data);
+ uint64_t c1 = 0x87c37b91114253d5L;
+ uint64_t c2 = 0x4cf5ad432745937fL;
+ for (uint32_t i = 0; i < blocks; i++) {
+ uint64_t k = src[i];
+ k *= c1;
+ k = ROTL64(k, 31);
+ k *= c2;
+
+ h ^= k;
+ h = ROTL64(h, 27);
+ h = h * 5 + 0x52dce729;
+ }
+
+ uint64_t k = 0;
+ uint32_t idx = blocks << 3;
+ switch (len - idx) {
+ case 7:
+ k ^= static_cast<uint64_t>(data[idx + 6]) << 48;
+ case 6:
+ k ^= static_cast<uint64_t>(data[idx + 5]) << 40;
+ case 5:
+ k ^= static_cast<uint64_t>(data[idx + 4]) << 32;
+ case 4:
+ k ^= static_cast<uint64_t>(data[idx + 3]) << 24;
+ case 3:
+ k ^= static_cast<uint64_t>(data[idx + 2]) << 16;
+ case 2:
+ k ^= static_cast<uint64_t>(data[idx + 1]) << 8;
+ case 1:
+ k ^= static_cast<uint64_t>(data[idx + 0]);
+
+ k *= c1;
+ k = ROTL64(k, 31);
+ k *= c2;
+ h ^= k;
+ }
+
+ h ^= len;
+ h = fmix64(h);
+ return h;
+ }
+
+ DIAGNOSTIC_POP
+
+}
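
hash64() consumes the input eight bytes at a time, mixes each block with the two Murmur3 constants, and then folds the remaining 1-7 tail bytes through the same steps; the deliberate case fall-through in the tail switch is why the clang -Wimplicit-fallthrough warning is suppressed around it. A small hedged sketch of calling the public overload (which applies DEFAULT_SEED), using the internal header directly; the key string is illustrative:

#include "Murmur3.hh"

#include <cstring>
#include <iostream>

int main() {
  const char* key = "orc-bloom-filter-key";  // illustrative input
  uint64_t h = orc::Murmur3::hash64(
      reinterpret_cast<const uint8_t*>(key),
      static_cast<uint32_t>(std::strlen(key)));
  std::cout << std::hex << h << "\n";
  return 0;
}
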
diff --git a/contrib/libs/apache/orc/c++/src/Murmur3.hh b/contrib/libs/apache/orc/c++/src/Murmur3.hh
index 02391811b0..9cf1de138f 100644
--- a/contrib/libs/apache/orc/c++/src/Murmur3.hh
+++ b/contrib/libs/apache/orc/c++/src/Murmur3.hh
@@ -1,40 +1,40 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ORC_MURMUR3_HH
-#define ORC_MURMUR3_HH
-
-#include "orc/orc-config.hh"
-
-namespace orc {
-
- class Murmur3 {
- public:
- static const uint32_t DEFAULT_SEED = 104729;
- static const uint64_t NULL_HASHCODE = 2862933555777941757LL;
-
- static uint64_t hash64(const uint8_t *data, uint32_t len);
-
- private:
- static uint64_t fmix64(uint64_t value);
- static uint64_t hash64(const uint8_t* data, uint32_t len, uint32_t seed);
- };
-
-}
-
-#endif //ORC_MURMUR3_HH
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_MURMUR3_HH
+#define ORC_MURMUR3_HH
+
+#include "orc/orc-config.hh"
+
+namespace orc {
+
+ class Murmur3 {
+ public:
+ static const uint32_t DEFAULT_SEED = 104729;
+ static const uint64_t NULL_HASHCODE = 2862933555777941757LL;
+
+ static uint64_t hash64(const uint8_t *data, uint32_t len);
+
+ private:
+ static uint64_t fmix64(uint64_t value);
+ static uint64_t hash64(const uint8_t* data, uint32_t len, uint32_t seed);
+ };
+
+}
+
+#endif //ORC_MURMUR3_HH
diff --git a/contrib/libs/apache/orc/c++/src/Options.hh b/contrib/libs/apache/orc/c++/src/Options.hh
index 795e166138..ee9982cdc2 100644
--- a/contrib/libs/apache/orc/c++/src/Options.hh
+++ b/contrib/libs/apache/orc/c++/src/Options.hh
@@ -1,258 +1,258 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ORC_OPTIONS_HH
-#define ORC_OPTIONS_HH
-
-#include "orc/Int128.hh"
-#include "orc/OrcFile.hh"
-#include "orc/Reader.hh"
-
-#include <limits>
-
-namespace orc {
-
- enum ColumnSelection {
- ColumnSelection_NONE = 0,
- ColumnSelection_NAMES = 1,
- ColumnSelection_FIELD_IDS = 2,
- ColumnSelection_TYPE_IDS = 3,
- };
-
-/**
- * ReaderOptions Implementation
- */
- struct ReaderOptionsPrivate {
- uint64_t tailLocation;
- std::ostream* errorStream;
- MemoryPool* memoryPool;
- std::string serializedTail;
-
- ReaderOptionsPrivate() {
- tailLocation = std::numeric_limits<uint64_t>::max();
- errorStream = &std::cerr;
- memoryPool = getDefaultPool();
- }
- };
-
- ReaderOptions::ReaderOptions():
- privateBits(std::unique_ptr<ReaderOptionsPrivate>
- (new ReaderOptionsPrivate())) {
- // PASS
- }
-
- ReaderOptions::ReaderOptions(const ReaderOptions& rhs):
- privateBits(std::unique_ptr<ReaderOptionsPrivate>
- (new ReaderOptionsPrivate(*(rhs.privateBits.get())))) {
- // PASS
- }
-
- ReaderOptions::ReaderOptions(ReaderOptions& rhs) {
- // swap privateBits with rhs
- ReaderOptionsPrivate* l = privateBits.release();
- privateBits.reset(rhs.privateBits.release());
- rhs.privateBits.reset(l);
- }
-
- ReaderOptions& ReaderOptions::operator=(const ReaderOptions& rhs) {
- if (this != &rhs) {
- privateBits.reset(new ReaderOptionsPrivate(*(rhs.privateBits.get())));
- }
- return *this;
- }
-
- ReaderOptions::~ReaderOptions() {
- // PASS
- }
-
- ReaderOptions& ReaderOptions::setMemoryPool(MemoryPool& pool) {
- privateBits->memoryPool = &pool;
- return *this;
- }
-
- MemoryPool* ReaderOptions::getMemoryPool() const{
- return privateBits->memoryPool;
- }
-
- ReaderOptions& ReaderOptions::setTailLocation(uint64_t offset) {
- privateBits->tailLocation = offset;
- return *this;
- }
-
- uint64_t ReaderOptions::getTailLocation() const {
- return privateBits->tailLocation;
- }
-
- ReaderOptions& ReaderOptions::setSerializedFileTail(const std::string& value
- ) {
- privateBits->serializedTail = value;
- return *this;
- }
-
- std::string ReaderOptions::getSerializedFileTail() const {
- return privateBits->serializedTail;
- }
-
- ReaderOptions& ReaderOptions::setErrorStream(std::ostream& stream) {
- privateBits->errorStream = &stream;
- return *this;
- }
-
- std::ostream* ReaderOptions::getErrorStream() const {
- return privateBits->errorStream;
- }
-
-/**
- * RowReaderOptions Implementation
- */
-
- struct RowReaderOptionsPrivate {
- ColumnSelection selection;
- std::list<uint64_t> includedColumnIndexes;
- std::list<std::string> includedColumnNames;
- uint64_t dataStart;
- uint64_t dataLength;
- bool throwOnHive11DecimalOverflow;
- int32_t forcedScaleOnHive11Decimal;
- bool enableLazyDecoding;
-
- RowReaderOptionsPrivate() {
- selection = ColumnSelection_NONE;
- dataStart = 0;
- dataLength = std::numeric_limits<uint64_t>::max();
- throwOnHive11DecimalOverflow = true;
- forcedScaleOnHive11Decimal = 6;
- enableLazyDecoding = false;
- }
- };
-
- RowReaderOptions::RowReaderOptions():
- privateBits(std::unique_ptr<RowReaderOptionsPrivate>
- (new RowReaderOptionsPrivate())) {
- // PASS
- }
-
- RowReaderOptions::RowReaderOptions(const RowReaderOptions& rhs):
- privateBits(std::unique_ptr<RowReaderOptionsPrivate>
- (new RowReaderOptionsPrivate(*(rhs.privateBits.get())))) {
- // PASS
- }
-
- RowReaderOptions::RowReaderOptions(RowReaderOptions& rhs) {
- // swap privateBits with rhs
- RowReaderOptionsPrivate* l = privateBits.release();
- privateBits.reset(rhs.privateBits.release());
- rhs.privateBits.reset(l);
- }
-
- RowReaderOptions& RowReaderOptions::operator=(const RowReaderOptions& rhs) {
- if (this != &rhs) {
- privateBits.reset(new RowReaderOptionsPrivate(*(rhs.privateBits.get())));
- }
- return *this;
- }
-
- RowReaderOptions::~RowReaderOptions() {
- // PASS
- }
-
- RowReaderOptions& RowReaderOptions::include(const std::list<uint64_t>& include) {
- privateBits->selection = ColumnSelection_FIELD_IDS;
- privateBits->includedColumnIndexes.assign(include.begin(), include.end());
- privateBits->includedColumnNames.clear();
- return *this;
- }
-
- RowReaderOptions& RowReaderOptions::include(const std::list<std::string>& include) {
- privateBits->selection = ColumnSelection_NAMES;
- privateBits->includedColumnNames.assign(include.begin(), include.end());
- privateBits->includedColumnIndexes.clear();
- return *this;
- }
-
- RowReaderOptions& RowReaderOptions::includeTypes(const std::list<uint64_t>& types) {
- privateBits->selection = ColumnSelection_TYPE_IDS;
- privateBits->includedColumnIndexes.assign(types.begin(), types.end());
- privateBits->includedColumnNames.clear();
- return *this;
- }
-
- RowReaderOptions& RowReaderOptions::range(uint64_t offset, uint64_t length) {
- privateBits->dataStart = offset;
- privateBits->dataLength = length;
- return *this;
- }
-
- bool RowReaderOptions::getIndexesSet() const {
- return privateBits->selection == ColumnSelection_FIELD_IDS;
- }
-
- bool RowReaderOptions::getTypeIdsSet() const {
- return privateBits->selection == ColumnSelection_TYPE_IDS;
- }
-
- const std::list<uint64_t>& RowReaderOptions::getInclude() const {
- return privateBits->includedColumnIndexes;
- }
-
- bool RowReaderOptions::getNamesSet() const {
- return privateBits->selection == ColumnSelection_NAMES;
- }
-
- const std::list<std::string>& RowReaderOptions::getIncludeNames() const {
- return privateBits->includedColumnNames;
- }
-
- uint64_t RowReaderOptions::getOffset() const {
- return privateBits->dataStart;
- }
-
- uint64_t RowReaderOptions::getLength() const {
- return privateBits->dataLength;
- }
-
- RowReaderOptions& RowReaderOptions::throwOnHive11DecimalOverflow(bool shouldThrow){
- privateBits->throwOnHive11DecimalOverflow = shouldThrow;
- return *this;
- }
-
- bool RowReaderOptions::getThrowOnHive11DecimalOverflow() const {
- return privateBits->throwOnHive11DecimalOverflow;
- }
-
- RowReaderOptions& RowReaderOptions::forcedScaleOnHive11Decimal(int32_t forcedScale
- ) {
- privateBits->forcedScaleOnHive11Decimal = forcedScale;
- return *this;
- }
-
- int32_t RowReaderOptions::getForcedScaleOnHive11Decimal() const {
- return privateBits->forcedScaleOnHive11Decimal;
- }
-
- bool RowReaderOptions::getEnableLazyDecoding() const {
- return privateBits->enableLazyDecoding;
- }
-
- RowReaderOptions& RowReaderOptions::setEnableLazyDecoding(bool enable) {
- privateBits->enableLazyDecoding = enable;
- return *this;
- }
-}
-
-#endif
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_OPTIONS_HH
+#define ORC_OPTIONS_HH
+
+#include "orc/Int128.hh"
+#include "orc/OrcFile.hh"
+#include "orc/Reader.hh"
+
+#include <limits>
+
+namespace orc {
+
+ enum ColumnSelection {
+ ColumnSelection_NONE = 0,
+ ColumnSelection_NAMES = 1,
+ ColumnSelection_FIELD_IDS = 2,
+ ColumnSelection_TYPE_IDS = 3,
+ };
+
+/**
+ * ReaderOptions Implementation
+ */
+ struct ReaderOptionsPrivate {
+ uint64_t tailLocation;
+ std::ostream* errorStream;
+ MemoryPool* memoryPool;
+ std::string serializedTail;
+
+ ReaderOptionsPrivate() {
+ tailLocation = std::numeric_limits<uint64_t>::max();
+ errorStream = &std::cerr;
+ memoryPool = getDefaultPool();
+ }
+ };
+
+ ReaderOptions::ReaderOptions():
+ privateBits(std::unique_ptr<ReaderOptionsPrivate>
+ (new ReaderOptionsPrivate())) {
+ // PASS
+ }
+
+ ReaderOptions::ReaderOptions(const ReaderOptions& rhs):
+ privateBits(std::unique_ptr<ReaderOptionsPrivate>
+ (new ReaderOptionsPrivate(*(rhs.privateBits.get())))) {
+ // PASS
+ }
+
+ ReaderOptions::ReaderOptions(ReaderOptions& rhs) {
+ // swap privateBits with rhs
+ ReaderOptionsPrivate* l = privateBits.release();
+ privateBits.reset(rhs.privateBits.release());
+ rhs.privateBits.reset(l);
+ }
+
+ ReaderOptions& ReaderOptions::operator=(const ReaderOptions& rhs) {
+ if (this != &rhs) {
+ privateBits.reset(new ReaderOptionsPrivate(*(rhs.privateBits.get())));
+ }
+ return *this;
+ }
+
+ ReaderOptions::~ReaderOptions() {
+ // PASS
+ }
+
+ ReaderOptions& ReaderOptions::setMemoryPool(MemoryPool& pool) {
+ privateBits->memoryPool = &pool;
+ return *this;
+ }
+
+ MemoryPool* ReaderOptions::getMemoryPool() const{
+ return privateBits->memoryPool;
+ }
+
+ ReaderOptions& ReaderOptions::setTailLocation(uint64_t offset) {
+ privateBits->tailLocation = offset;
+ return *this;
+ }
+
+ uint64_t ReaderOptions::getTailLocation() const {
+ return privateBits->tailLocation;
+ }
+
+ ReaderOptions& ReaderOptions::setSerializedFileTail(const std::string& value
+ ) {
+ privateBits->serializedTail = value;
+ return *this;
+ }
+
+ std::string ReaderOptions::getSerializedFileTail() const {
+ return privateBits->serializedTail;
+ }
+
+ ReaderOptions& ReaderOptions::setErrorStream(std::ostream& stream) {
+ privateBits->errorStream = &stream;
+ return *this;
+ }
+
+ std::ostream* ReaderOptions::getErrorStream() const {
+ return privateBits->errorStream;
+ }
+
+/**
+ * RowReaderOptions Implementation
+ */
+
+ struct RowReaderOptionsPrivate {
+ ColumnSelection selection;
+ std::list<uint64_t> includedColumnIndexes;
+ std::list<std::string> includedColumnNames;
+ uint64_t dataStart;
+ uint64_t dataLength;
+ bool throwOnHive11DecimalOverflow;
+ int32_t forcedScaleOnHive11Decimal;
+ bool enableLazyDecoding;
+
+ RowReaderOptionsPrivate() {
+ selection = ColumnSelection_NONE;
+ dataStart = 0;
+ dataLength = std::numeric_limits<uint64_t>::max();
+ throwOnHive11DecimalOverflow = true;
+ forcedScaleOnHive11Decimal = 6;
+ enableLazyDecoding = false;
+ }
+ };
+
+ RowReaderOptions::RowReaderOptions():
+ privateBits(std::unique_ptr<RowReaderOptionsPrivate>
+ (new RowReaderOptionsPrivate())) {
+ // PASS
+ }
+
+ RowReaderOptions::RowReaderOptions(const RowReaderOptions& rhs):
+ privateBits(std::unique_ptr<RowReaderOptionsPrivate>
+ (new RowReaderOptionsPrivate(*(rhs.privateBits.get())))) {
+ // PASS
+ }
+
+ RowReaderOptions::RowReaderOptions(RowReaderOptions& rhs) {
+ // swap privateBits with rhs
+ RowReaderOptionsPrivate* l = privateBits.release();
+ privateBits.reset(rhs.privateBits.release());
+ rhs.privateBits.reset(l);
+ }
+
+ RowReaderOptions& RowReaderOptions::operator=(const RowReaderOptions& rhs) {
+ if (this != &rhs) {
+ privateBits.reset(new RowReaderOptionsPrivate(*(rhs.privateBits.get())));
+ }
+ return *this;
+ }
+
+ RowReaderOptions::~RowReaderOptions() {
+ // PASS
+ }
+
+ RowReaderOptions& RowReaderOptions::include(const std::list<uint64_t>& include) {
+ privateBits->selection = ColumnSelection_FIELD_IDS;
+ privateBits->includedColumnIndexes.assign(include.begin(), include.end());
+ privateBits->includedColumnNames.clear();
+ return *this;
+ }
+
+ RowReaderOptions& RowReaderOptions::include(const std::list<std::string>& include) {
+ privateBits->selection = ColumnSelection_NAMES;
+ privateBits->includedColumnNames.assign(include.begin(), include.end());
+ privateBits->includedColumnIndexes.clear();
+ return *this;
+ }
+
+ RowReaderOptions& RowReaderOptions::includeTypes(const std::list<uint64_t>& types) {
+ privateBits->selection = ColumnSelection_TYPE_IDS;
+ privateBits->includedColumnIndexes.assign(types.begin(), types.end());
+ privateBits->includedColumnNames.clear();
+ return *this;
+ }
+
+ RowReaderOptions& RowReaderOptions::range(uint64_t offset, uint64_t length) {
+ privateBits->dataStart = offset;
+ privateBits->dataLength = length;
+ return *this;
+ }
+
+ bool RowReaderOptions::getIndexesSet() const {
+ return privateBits->selection == ColumnSelection_FIELD_IDS;
+ }
+
+ bool RowReaderOptions::getTypeIdsSet() const {
+ return privateBits->selection == ColumnSelection_TYPE_IDS;
+ }
+
+ const std::list<uint64_t>& RowReaderOptions::getInclude() const {
+ return privateBits->includedColumnIndexes;
+ }
+
+ bool RowReaderOptions::getNamesSet() const {
+ return privateBits->selection == ColumnSelection_NAMES;
+ }
+
+ const std::list<std::string>& RowReaderOptions::getIncludeNames() const {
+ return privateBits->includedColumnNames;
+ }
+
+ uint64_t RowReaderOptions::getOffset() const {
+ return privateBits->dataStart;
+ }
+
+ uint64_t RowReaderOptions::getLength() const {
+ return privateBits->dataLength;
+ }
+
+ RowReaderOptions& RowReaderOptions::throwOnHive11DecimalOverflow(bool shouldThrow){
+ privateBits->throwOnHive11DecimalOverflow = shouldThrow;
+ return *this;
+ }
+
+ bool RowReaderOptions::getThrowOnHive11DecimalOverflow() const {
+ return privateBits->throwOnHive11DecimalOverflow;
+ }
+
+ RowReaderOptions& RowReaderOptions::forcedScaleOnHive11Decimal(int32_t forcedScale
+ ) {
+ privateBits->forcedScaleOnHive11Decimal = forcedScale;
+ return *this;
+ }
+
+ int32_t RowReaderOptions::getForcedScaleOnHive11Decimal() const {
+ return privateBits->forcedScaleOnHive11Decimal;
+ }
+
+ bool RowReaderOptions::getEnableLazyDecoding() const {
+ return privateBits->enableLazyDecoding;
+ }
+
+ RowReaderOptions& RowReaderOptions::setEnableLazyDecoding(bool enable) {
+ privateBits->enableLazyDecoding = enable;
+ return *this;
+ }
+}
+
+#endif
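
Both option classes above use the pimpl idiom: the public object owns only a unique_ptr to a private struct, copying deep-copies that struct, and every setter returns *this so calls can be chained. A hedged configuration sketch; the numeric values are illustrative, not recommended defaults:

#include "orc/OrcFile.hh"

#include <iostream>
#include <list>

void configureOptions(orc::ReaderOptions& readerOpts,
                      orc::RowReaderOptions& rowOpts) {
  readerOpts.setErrorStream(std::cerr)
            .setTailLocation(1024 * 1024);      // hypothetical tail offset
  rowOpts.include(std::list<uint64_t>{0, 2})    // select columns by field id
         .range(0, 64 * 1024 * 1024)            // only scan rows in the first 64 MiB
         .setEnableLazyDecoding(true);
}
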
diff --git a/contrib/libs/apache/orc/c++/src/OrcFile.cc b/contrib/libs/apache/orc/c++/src/OrcFile.cc
index a0158bbadf..5856db692e 100644
--- a/contrib/libs/apache/orc/c++/src/OrcFile.cc
+++ b/contrib/libs/apache/orc/c++/src/OrcFile.cc
@@ -1,184 +1,184 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Adaptor.hh"
-#include "orc/OrcFile.hh"
-#include "orc/Exceptions.hh"
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <sys/stat.h>
-#include <string.h>
-
-#ifdef _MSC_VER
-#include <io.h>
-#define S_IRUSR _S_IREAD
-#define S_IWUSR _S_IWRITE
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Adaptor.hh"
+#include "orc/OrcFile.hh"
+#include "orc/Exceptions.hh"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <string.h>
+
+#ifdef _MSC_VER
+#include <io.h>
+#define S_IRUSR _S_IREAD
+#define S_IWUSR _S_IWRITE
#define stat _stat64
#define fstat _fstat64
-#else
-#include <unistd.h>
-#define O_BINARY 0
-#endif
-
-namespace orc {
-
- class FileInputStream : public InputStream {
- private:
- std::string filename;
- int file;
- uint64_t totalLength;
-
- public:
- FileInputStream(std::string _filename) {
- filename = _filename;
- file = open(filename.c_str(), O_BINARY | O_RDONLY);
- if (file == -1) {
- throw ParseError("Can't open " + filename);
- }
- struct stat fileStat;
- if (fstat(file, &fileStat) == -1) {
- throw ParseError("Can't stat " + filename);
- }
- totalLength = static_cast<uint64_t>(fileStat.st_size);
- }
-
- ~FileInputStream() override;
-
- uint64_t getLength() const override {
- return totalLength;
- }
-
- uint64_t getNaturalReadSize() const override {
- return 128 * 1024;
- }
-
- void read(void* buf,
- uint64_t length,
- uint64_t offset) override {
- if (!buf) {
- throw ParseError("Buffer is null");
- }
- ssize_t bytesRead = pread(file, buf, length, static_cast<off_t>(offset));
-
- if (bytesRead == -1) {
- throw ParseError("Bad read of " + filename);
- }
- if (static_cast<uint64_t>(bytesRead) != length) {
- throw ParseError("Short read of " + filename);
- }
- }
-
- const std::string& getName() const override {
- return filename;
- }
- };
-
- FileInputStream::~FileInputStream() {
- close(file);
- }
-
- std::unique_ptr<InputStream> readFile(const std::string& path) {
-#ifdef BUILD_LIBHDFSPP
- if(strncmp (path.c_str(), "hdfs://", 7) == 0){
- return orc::readHdfsFile(std::string(path));
- } else {
-#endif
- return orc::readLocalFile(std::string(path));
-#ifdef BUILD_LIBHDFSPP
- }
-#endif
- }
-
- std::unique_ptr<InputStream> readLocalFile(const std::string& path) {
- return std::unique_ptr<InputStream>(new FileInputStream(path));
- }
-
- OutputStream::~OutputStream() {
- // PASS
- };
-
- class FileOutputStream : public OutputStream {
- private:
- std::string filename;
- int file;
- uint64_t bytesWritten;
- bool closed;
-
- public:
- FileOutputStream(std::string _filename) {
- bytesWritten = 0;
- filename = _filename;
- closed = false;
- file = open(
- filename.c_str(),
- O_BINARY | O_CREAT | O_WRONLY | O_TRUNC,
- S_IRUSR | S_IWUSR);
- if (file == -1) {
- throw ParseError("Can't open " + filename);
- }
- }
-
- ~FileOutputStream() override;
-
- uint64_t getLength() const override {
- return bytesWritten;
- }
-
- uint64_t getNaturalWriteSize() const override {
- return 128 * 1024;
- }
-
- void write(const void* buf, size_t length) override {
- if (closed) {
- throw std::logic_error("Cannot write to closed stream.");
- }
- ssize_t bytesWrite = ::write(file, buf, length);
- if (bytesWrite == -1) {
- throw ParseError("Bad write of " + filename);
- }
- if (static_cast<uint64_t>(bytesWrite) != length) {
- throw ParseError("Short write of " + filename);
- }
- bytesWritten += static_cast<uint64_t>(bytesWrite);
- }
-
- const std::string& getName() const override {
- return filename;
- }
-
- void close() override {
- if (!closed) {
- ::close(file);
- closed = true;
- }
- }
- };
-
- FileOutputStream::~FileOutputStream() {
- if (!closed) {
- ::close(file);
- closed = true;
- }
- }
-
- std::unique_ptr<OutputStream> writeLocalFile(const std::string& path) {
- return std::unique_ptr<OutputStream>(new FileOutputStream(path));
- }
-}
+#else
+#include <unistd.h>
+#define O_BINARY 0
+#endif
+
+namespace orc {
+
+ class FileInputStream : public InputStream {
+ private:
+ std::string filename;
+ int file;
+ uint64_t totalLength;
+
+ public:
+ FileInputStream(std::string _filename) {
+ filename = _filename;
+ file = open(filename.c_str(), O_BINARY | O_RDONLY);
+ if (file == -1) {
+ throw ParseError("Can't open " + filename);
+ }
+ struct stat fileStat;
+ if (fstat(file, &fileStat) == -1) {
+ throw ParseError("Can't stat " + filename);
+ }
+ totalLength = static_cast<uint64_t>(fileStat.st_size);
+ }
+
+ ~FileInputStream() override;
+
+ uint64_t getLength() const override {
+ return totalLength;
+ }
+
+ uint64_t getNaturalReadSize() const override {
+ return 128 * 1024;
+ }
+
+ void read(void* buf,
+ uint64_t length,
+ uint64_t offset) override {
+ if (!buf) {
+ throw ParseError("Buffer is null");
+ }
+ ssize_t bytesRead = pread(file, buf, length, static_cast<off_t>(offset));
+
+ if (bytesRead == -1) {
+ throw ParseError("Bad read of " + filename);
+ }
+ if (static_cast<uint64_t>(bytesRead) != length) {
+ throw ParseError("Short read of " + filename);
+ }
+ }
+
+ const std::string& getName() const override {
+ return filename;
+ }
+ };
+
+ FileInputStream::~FileInputStream() {
+ close(file);
+ }
+
+ std::unique_ptr<InputStream> readFile(const std::string& path) {
+#ifdef BUILD_LIBHDFSPP
+ if(strncmp (path.c_str(), "hdfs://", 7) == 0){
+ return orc::readHdfsFile(std::string(path));
+ } else {
+#endif
+ return orc::readLocalFile(std::string(path));
+#ifdef BUILD_LIBHDFSPP
+ }
+#endif
+ }
+
+ std::unique_ptr<InputStream> readLocalFile(const std::string& path) {
+ return std::unique_ptr<InputStream>(new FileInputStream(path));
+ }
+
+ OutputStream::~OutputStream() {
+ // PASS
+ };
+
+ class FileOutputStream : public OutputStream {
+ private:
+ std::string filename;
+ int file;
+ uint64_t bytesWritten;
+ bool closed;
+
+ public:
+ FileOutputStream(std::string _filename) {
+ bytesWritten = 0;
+ filename = _filename;
+ closed = false;
+ file = open(
+ filename.c_str(),
+ O_BINARY | O_CREAT | O_WRONLY | O_TRUNC,
+ S_IRUSR | S_IWUSR);
+ if (file == -1) {
+ throw ParseError("Can't open " + filename);
+ }
+ }
+
+ ~FileOutputStream() override;
+
+ uint64_t getLength() const override {
+ return bytesWritten;
+ }
+
+ uint64_t getNaturalWriteSize() const override {
+ return 128 * 1024;
+ }
+
+ void write(const void* buf, size_t length) override {
+ if (closed) {
+ throw std::logic_error("Cannot write to closed stream.");
+ }
+ ssize_t bytesWrite = ::write(file, buf, length);
+ if (bytesWrite == -1) {
+ throw ParseError("Bad write of " + filename);
+ }
+ if (static_cast<uint64_t>(bytesWrite) != length) {
+ throw ParseError("Short write of " + filename);
+ }
+ bytesWritten += static_cast<uint64_t>(bytesWrite);
+ }
+
+ const std::string& getName() const override {
+ return filename;
+ }
+
+ void close() override {
+ if (!closed) {
+ ::close(file);
+ closed = true;
+ }
+ }
+ };
+
+ FileOutputStream::~FileOutputStream() {
+ if (!closed) {
+ ::close(file);
+ closed = true;
+ }
+ }
+
+ std::unique_ptr<OutputStream> writeLocalFile(const std::string& path) {
+ return std::unique_ptr<OutputStream>(new FileOutputStream(path));
+ }
+}
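
FileInputStream and FileOutputStream are thin wrappers over POSIX file descriptors (with MSVC substitutes for stat/fstat), exposed only through the readFile/readLocalFile/writeLocalFile factories. A hedged round-trip sketch through those factories; the path is illustrative:

#include "orc/OrcFile.hh"

#include <memory>
#include <vector>

int main() {
  // Write a few bytes through the OutputStream interface.
  std::unique_ptr<orc::OutputStream> out = orc::writeLocalFile("/tmp/orc-io-demo.bin");
  const char payload[] = "hello orc";
  out->write(payload, sizeof(payload));
  out->close();

  // Read them back through the InputStream interface.
  std::unique_ptr<orc::InputStream> in = orc::readLocalFile("/tmp/orc-io-demo.bin");
  std::vector<char> buffer(in->getLength());
  in->read(buffer.data(), buffer.size(), 0);
  return 0;
}
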
diff --git a/contrib/libs/apache/orc/c++/src/RLE.cc b/contrib/libs/apache/orc/c++/src/RLE.cc
index 21f9082216..ea0181deaf 100644
--- a/contrib/libs/apache/orc/c++/src/RLE.cc
+++ b/contrib/libs/apache/orc/c++/src/RLE.cc
@@ -1,121 +1,121 @@
-/**
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include "RLEv1.hh"
-#include "RLEv2.hh"
-#include "orc/Exceptions.hh"
-
-namespace orc {
-
- RleEncoder::~RleEncoder() {
- // PASS
- }
-
- RleDecoder::~RleDecoder() {
- // PASS
- }
-
- std::unique_ptr<RleEncoder> createRleEncoder
- (std::unique_ptr<BufferedOutputStream> output,
- bool isSigned,
- RleVersion version,
- MemoryPool&,
- bool alignedBitpacking) {
- switch (static_cast<int64_t>(version)) {
- case RleVersion_1:
- // We don't have std::make_unique() yet.
- return std::unique_ptr<RleEncoder>(new RleEncoderV1(std::move(output),
- isSigned));
- case RleVersion_2:
- return std::unique_ptr<RleEncoder>(new RleEncoderV2(std::move(output),
- isSigned, alignedBitpacking));
- default:
- throw NotImplementedYet("Not implemented yet");
- }
- }
-
- std::unique_ptr<RleDecoder> createRleDecoder
- (std::unique_ptr<SeekableInputStream> input,
- bool isSigned,
- RleVersion version,
- MemoryPool& pool) {
- switch (static_cast<int64_t>(version)) {
- case RleVersion_1:
- // We don't have std::make_unique() yet.
- return std::unique_ptr<RleDecoder>(new RleDecoderV1(std::move(input),
- isSigned));
- case RleVersion_2:
- return std::unique_ptr<RleDecoder>(new RleDecoderV2(std::move(input),
- isSigned, pool));
- default:
- throw NotImplementedYet("Not implemented yet");
- }
- }
-
- void RleEncoder::add(const int64_t* data, uint64_t numValues,
- const char* notNull) {
- for (uint64_t i = 0; i < numValues; ++i) {
- if (!notNull || notNull[i]) {
- write(data[i]);
- }
- }
- }
-
- void RleEncoder::writeVslong(int64_t val) {
- writeVulong((val << 1) ^ (val >> 63));
- }
-
- void RleEncoder::writeVulong(int64_t val) {
- while (true) {
- if ((val & ~0x7f) == 0) {
- writeByte(static_cast<char>(val));
- return;
- } else {
- writeByte(static_cast<char>(0x80 | (val & 0x7f)));
- // cast val to unsigned so as to force 0-fill right shift
- val = (static_cast<uint64_t>(val) >> 7);
- }
- }
- }
-
- void RleEncoder::writeByte(char c) {
- if (bufferPosition == bufferLength) {
- int addedSize = 0;
- if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) {
- throw std::bad_alloc();
- }
- bufferPosition = 0;
- bufferLength = static_cast<size_t>(addedSize);
- }
- buffer[bufferPosition++] = c;
- }
-
- void RleEncoder::recordPosition(PositionRecorder* recorder) const {
- uint64_t flushedSize = outputStream->getSize();
- uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition);
- if (outputStream->isCompressed()) {
- recorder->add(flushedSize);
- recorder->add(unflushedSize);
- } else {
- flushedSize -= static_cast<uint64_t>(bufferLength);
- recorder->add(flushedSize + unflushedSize);
- }
- recorder->add(static_cast<uint64_t>(numLiterals));
- }
-
-} // namespace orc
+/**
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include "RLEv1.hh"
+#include "RLEv2.hh"
+#include "orc/Exceptions.hh"
+
+namespace orc {
+
+ RleEncoder::~RleEncoder() {
+ // PASS
+ }
+
+ RleDecoder::~RleDecoder() {
+ // PASS
+ }
+
+ std::unique_ptr<RleEncoder> createRleEncoder
+ (std::unique_ptr<BufferedOutputStream> output,
+ bool isSigned,
+ RleVersion version,
+ MemoryPool&,
+ bool alignedBitpacking) {
+ switch (static_cast<int64_t>(version)) {
+ case RleVersion_1:
+ // We don't have std::make_unique() yet.
+ return std::unique_ptr<RleEncoder>(new RleEncoderV1(std::move(output),
+ isSigned));
+ case RleVersion_2:
+ return std::unique_ptr<RleEncoder>(new RleEncoderV2(std::move(output),
+ isSigned, alignedBitpacking));
+ default:
+ throw NotImplementedYet("Not implemented yet");
+ }
+ }
+
+ std::unique_ptr<RleDecoder> createRleDecoder
+ (std::unique_ptr<SeekableInputStream> input,
+ bool isSigned,
+ RleVersion version,
+ MemoryPool& pool) {
+ switch (static_cast<int64_t>(version)) {
+ case RleVersion_1:
+ // We don't have std::make_unique() yet.
+ return std::unique_ptr<RleDecoder>(new RleDecoderV1(std::move(input),
+ isSigned));
+ case RleVersion_2:
+ return std::unique_ptr<RleDecoder>(new RleDecoderV2(std::move(input),
+ isSigned, pool));
+ default:
+ throw NotImplementedYet("Not implemented yet");
+ }
+ }
+
+ void RleEncoder::add(const int64_t* data, uint64_t numValues,
+ const char* notNull) {
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!notNull || notNull[i]) {
+ write(data[i]);
+ }
+ }
+ }
+
+ void RleEncoder::writeVslong(int64_t val) {
+ writeVulong((val << 1) ^ (val >> 63));
+ }
+
+ void RleEncoder::writeVulong(int64_t val) {
+ while (true) {
+ if ((val & ~0x7f) == 0) {
+ writeByte(static_cast<char>(val));
+ return;
+ } else {
+ writeByte(static_cast<char>(0x80 | (val & 0x7f)));
+ // cast val to unsigned so as to force 0-fill right shift
+ val = (static_cast<uint64_t>(val) >> 7);
+ }
+ }
+ }
+
+ void RleEncoder::writeByte(char c) {
+ if (bufferPosition == bufferLength) {
+ int addedSize = 0;
+ if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) {
+ throw std::bad_alloc();
+ }
+ bufferPosition = 0;
+ bufferLength = static_cast<size_t>(addedSize);
+ }
+ buffer[bufferPosition++] = c;
+ }
+
+ void RleEncoder::recordPosition(PositionRecorder* recorder) const {
+ uint64_t flushedSize = outputStream->getSize();
+ uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition);
+ if (outputStream->isCompressed()) {
+ recorder->add(flushedSize);
+ recorder->add(unflushedSize);
+ } else {
+ flushedSize -= static_cast<uint64_t>(bufferLength);
+ recorder->add(flushedSize + unflushedSize);
+ }
+ recorder->add(static_cast<uint64_t>(numLiterals));
+ }
+
+} // namespace orc
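
The two helpers at the heart of the encoder are zig-zag mapping (writeVslong) and base-128 varints (writeVulong): zig-zag folds signed values onto small unsigned ones so that -1 costs as little as 1, and the varint then emits seven payload bits per byte with the high bit as a continuation flag. A self-contained sketch of the same transforms, independent of the encoder classes:

#include <cstdint>
#include <iostream>
#include <vector>

// Zig-zag: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...
uint64_t zigZag(int64_t v) {
  return (static_cast<uint64_t>(v) << 1) ^ static_cast<uint64_t>(v >> 63);
}

// Base-128 varint: 7 payload bits per byte, high bit set on all but the last byte.
std::vector<uint8_t> encodeVulong(uint64_t v) {
  std::vector<uint8_t> out;
  while (v >= 0x80) {
    out.push_back(static_cast<uint8_t>(0x80 | (v & 0x7f)));
    v >>= 7;
  }
  out.push_back(static_cast<uint8_t>(v));
  return out;
}

int main() {
  for (int64_t v : {0LL, -1LL, 63LL, -64LL, 300LL}) {
    std::cout << v << " -> " << encodeVulong(zigZag(v)).size() << " byte(s)\n";
  }
  return 0;
}
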
diff --git a/contrib/libs/apache/orc/c++/src/RLE.hh b/contrib/libs/apache/orc/c++/src/RLE.hh
index 6822bd812e..ec0330559e 100644
--- a/contrib/libs/apache/orc/c++/src/RLE.hh
+++ b/contrib/libs/apache/orc/c++/src/RLE.hh
@@ -1,155 +1,155 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ORC_RLE_HH
-#define ORC_RLE_HH
-
-#include "io/InputStream.hh"
-#include "io/OutputStream.hh"
-
-#include <memory>
-
-namespace orc {
-
- inline int64_t zigZag(int64_t value) {
- return (value << 1) ^ (value >> 63);
- }
-
- inline int64_t unZigZag(uint64_t value) {
- return value >> 1 ^ -(value & 1);
- }
-
- class RleEncoder {
- public:
- // must be non-inline!
- virtual ~RleEncoder();
-
- RleEncoder(
- std::unique_ptr<BufferedOutputStream> outStream,
- bool hasSigned):
- outputStream(std::move(outStream)),
- bufferPosition(0),
- bufferLength(0),
- numLiterals(0),
- isSigned(hasSigned),
- buffer(nullptr){
- //pass
- }
-
- /**
- * Encode the next batch of values.
- * @param data the array to read from
- * @param numValues the number of values to write
- * @param notNull If the pointer is null, all values are read. If the
- * pointer is not null, positions that are false are skipped.
- */
- virtual void add(const int64_t* data, uint64_t numValues,
- const char* notNull);
-
- /**
- * Get size of buffer used so far.
- */
- uint64_t getBufferSize() const {
- return outputStream->getSize();
- }
-
- /**
- * Flushing underlying BufferedOutputStream
- */
- virtual uint64_t flush() = 0;
-
- /**
- * record current position
- * @param recorder use the recorder to record current positions
- */
- virtual void recordPosition(PositionRecorder* recorder) const;
-
- virtual void write(int64_t val) = 0;
-
- protected:
- std::unique_ptr<BufferedOutputStream> outputStream;
- size_t bufferPosition;
- size_t bufferLength;
- size_t numLiterals;
- int64_t* literals;
- bool isSigned;
- char* buffer;
-
- virtual void writeByte(char c);
-
- virtual void writeVulong(int64_t val);
-
- virtual void writeVslong(int64_t val);
- };
-
- class RleDecoder {
- public:
- // must be non-inline!
- virtual ~RleDecoder();
-
- /**
- * Seek to a particular spot.
- */
- virtual void seek(PositionProvider&) = 0;
-
- /**
- * Seek over a given number of values.
- */
- virtual void skip(uint64_t numValues) = 0;
-
- /**
- * Read a number of values into the batch.
- * @param data the array to read into
- * @param numValues the number of values to read
- * @param notNull If the pointer is null, all values are read. If the
- * pointer is not null, positions that are false are skipped.
- */
- virtual void next(int64_t* data, uint64_t numValues,
- const char* notNull) = 0;
- };
-
- /**
- * Create an RLE encoder.
- * @param output the output stream to write to
- * @param isSigned true if the number sequence is signed
- * @param version version of RLE decoding to do
- * @param pool memory pool to use for allocation
- */
- std::unique_ptr<RleEncoder> createRleEncoder
- (std::unique_ptr<BufferedOutputStream> output,
- bool isSigned,
- RleVersion version,
- MemoryPool& pool,
- bool alignedBitpacking);
-
- /**
- * Create an RLE decoder.
- * @param input the input stream to read from
- * @param isSigned true if the number sequence is signed
- * @param version version of RLE decoding to do
- * @param pool memory pool to use for allocation
- */
- std::unique_ptr<RleDecoder> createRleDecoder
- (std::unique_ptr<SeekableInputStream> input,
- bool isSigned,
- RleVersion version,
- MemoryPool& pool);
-
-} // namespace orc
-
-#endif // ORC_RLE_HH
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_RLE_HH
+#define ORC_RLE_HH
+
+#include "io/InputStream.hh"
+#include "io/OutputStream.hh"
+
+#include <memory>
+
+namespace orc {
+
+ inline int64_t zigZag(int64_t value) {
+ return (value << 1) ^ (value >> 63);
+ }
+
+ inline int64_t unZigZag(uint64_t value) {
+ return value >> 1 ^ -(value & 1);
+ }
+
+ class RleEncoder {
+ public:
+ // must be non-inline!
+ virtual ~RleEncoder();
+
+ RleEncoder(
+ std::unique_ptr<BufferedOutputStream> outStream,
+ bool hasSigned):
+ outputStream(std::move(outStream)),
+ bufferPosition(0),
+ bufferLength(0),
+ numLiterals(0),
+ isSigned(hasSigned),
+ buffer(nullptr){
+ //pass
+ }
+
+ /**
+ * Encode the next batch of values.
+ * @param data the array to read from
+ * @param numValues the number of values to write
+ * @param notNull If the pointer is null, all values are read. If the
+ * pointer is not null, positions that are false are skipped.
+ */
+ virtual void add(const int64_t* data, uint64_t numValues,
+ const char* notNull);
+
+ /**
+ * Get size of buffer used so far.
+ */
+ uint64_t getBufferSize() const {
+ return outputStream->getSize();
+ }
+
+ /**
+ * Flushing underlying BufferedOutputStream
+ */
+ virtual uint64_t flush() = 0;
+
+ /**
+ * record current position
+ * @param recorder use the recorder to record current positions
+ */
+ virtual void recordPosition(PositionRecorder* recorder) const;
+
+ virtual void write(int64_t val) = 0;
+
+ protected:
+ std::unique_ptr<BufferedOutputStream> outputStream;
+ size_t bufferPosition;
+ size_t bufferLength;
+ size_t numLiterals;
+ int64_t* literals;
+ bool isSigned;
+ char* buffer;
+
+ virtual void writeByte(char c);
+
+ virtual void writeVulong(int64_t val);
+
+ virtual void writeVslong(int64_t val);
+ };
+
+ class RleDecoder {
+ public:
+ // must be non-inline!
+ virtual ~RleDecoder();
+
+ /**
+ * Seek to a particular spot.
+ */
+ virtual void seek(PositionProvider&) = 0;
+
+ /**
+ * Seek over a given number of values.
+ */
+ virtual void skip(uint64_t numValues) = 0;
+
+ /**
+ * Read a number of values into the batch.
+ * @param data the array to read into
+ * @param numValues the number of values to read
+ * @param notNull If the pointer is null, all values are read. If the
+ * pointer is not null, positions that are false are skipped.
+ */
+ virtual void next(int64_t* data, uint64_t numValues,
+ const char* notNull) = 0;
+ };
+
+ /**
+ * Create an RLE encoder.
+ * @param output the output stream to write to
+ * @param isSigned true if the number sequence is signed
+ * @param version version of RLE decoding to do
+ * @param pool memory pool to use for allocation
+ */
+ std::unique_ptr<RleEncoder> createRleEncoder
+ (std::unique_ptr<BufferedOutputStream> output,
+ bool isSigned,
+ RleVersion version,
+ MemoryPool& pool,
+ bool alignedBitpacking);
+
+ /**
+ * Create an RLE decoder.
+ * @param input the input stream to read from
+ * @param isSigned true if the number sequence is signed
+ * @param version version of RLE decoding to do
+ * @param pool memory pool to use for allocation
+ */
+ std::unique_ptr<RleDecoder> createRleDecoder
+ (std::unique_ptr<SeekableInputStream> input,
+ bool isSigned,
+ RleVersion version,
+ MemoryPool& pool);
+
+} // namespace orc
+
+#endif // ORC_RLE_HH
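
createRleEncoder/createRleDecoder are the entry points the column writers and readers use; the version argument selects the RLEv1 or RLEv2 wire format. A hedged decoding sketch, assuming the internal SeekableArrayInputStream constructor that wraps an in-memory byte array; the three bytes follow the RLEv1 run layout (control byte = run length - 3, then a delta byte, then the base value as a varint):

#include "RLE.hh"
#include "orc/MemoryPool.hh"

#include <memory>

void decodeHundredSevens() {
  // 0x61 -> run of 100 values (0x61 + 3), delta 0, base value 7.
  const unsigned char data[] = {0x61, 0x00, 0x07};
  std::unique_ptr<orc::SeekableInputStream> stream(
      new orc::SeekableArrayInputStream(data, sizeof(data)));
  std::unique_ptr<orc::RleDecoder> decoder = orc::createRleDecoder(
      std::move(stream), /*isSigned=*/false, orc::RleVersion_1, *orc::getDefaultPool());
  int64_t values[100];
  decoder->next(values, 100, nullptr);  // values[0..99] are all 7
}
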
diff --git a/contrib/libs/apache/orc/c++/src/RLEV2Util.cc b/contrib/libs/apache/orc/c++/src/RLEV2Util.cc
index 12e2d057cd..20fc0931ef 100644
--- a/contrib/libs/apache/orc/c++/src/RLEV2Util.cc
+++ b/contrib/libs/apache/orc/c++/src/RLEV2Util.cc
@@ -1,70 +1,70 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "RLEV2Util.hh"
-
-namespace orc {
-
- // Map FBS enum to bit width value.
- const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE] = {
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
- 26, 28, 30, 32, 40, 48, 56, 64
- };
-
- // Map bit length i to closest fixed bit width that can contain i bits.
- const uint8_t ClosestFixedBitsMap[65] = {
- 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
- 26, 26, 28, 28, 30, 30, 32, 32,
- 40, 40, 40, 40, 40, 40, 40, 40,
- 48, 48, 48, 48, 48, 48, 48, 48,
- 56, 56, 56, 56, 56, 56, 56, 56,
- 64, 64, 64, 64, 64, 64, 64, 64
- };
-
- // Map bit length i to closest aligned fixed bit width that can contain i bits.
- const uint8_t ClosestAlignedFixedBitsMap[65] = {
- 1, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16, 24, 24, 24, 24, 24, 24, 24, 24,
- 32, 32, 32, 32, 32, 32, 32, 32,
- 40, 40, 40, 40, 40, 40, 40, 40,
- 48, 48, 48, 48, 48, 48, 48, 48,
- 56, 56, 56, 56, 56, 56, 56, 56,
- 64, 64, 64, 64, 64, 64, 64, 64
- };
-
- // Map bit width to FBS enum.
- const uint8_t BitWidthToFBSMap[65] = {
- FixedBitSizes::ONE, FixedBitSizes::ONE, FixedBitSizes::TWO, FixedBitSizes::THREE, FixedBitSizes::FOUR,
- FixedBitSizes::FIVE, FixedBitSizes::SIX, FixedBitSizes::SEVEN, FixedBitSizes::EIGHT,
- FixedBitSizes::NINE, FixedBitSizes::TEN, FixedBitSizes::ELEVEN, FixedBitSizes::TWELVE,
- FixedBitSizes::THIRTEEN, FixedBitSizes::FOURTEEN, FixedBitSizes::FIFTEEN, FixedBitSizes::SIXTEEN,
- FixedBitSizes::SEVENTEEN, FixedBitSizes::EIGHTEEN, FixedBitSizes::NINETEEN, FixedBitSizes::TWENTY,
- FixedBitSizes::TWENTYONE, FixedBitSizes::TWENTYTWO, FixedBitSizes::TWENTYTHREE, FixedBitSizes::TWENTYFOUR,
- FixedBitSizes::TWENTYSIX, FixedBitSizes::TWENTYSIX,
- FixedBitSizes::TWENTYEIGHT, FixedBitSizes::TWENTYEIGHT,
- FixedBitSizes::THIRTY, FixedBitSizes::THIRTY,
- FixedBitSizes::THIRTYTWO, FixedBitSizes::THIRTYTWO,
- FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY,
- FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY,
- FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT,
- FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT,
- FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX,
- FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX,
- FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR,
- FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR
- };
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "RLEV2Util.hh"
+
+namespace orc {
+
+ // Map FBS enum to bit width value.
+ const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE] = {
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+ 26, 28, 30, 32, 40, 48, 56, 64
+ };
+
+ // Map bit length i to closest fixed bit width that can contain i bits.
+ const uint8_t ClosestFixedBitsMap[65] = {
+ 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+ 26, 26, 28, 28, 30, 30, 32, 32,
+ 40, 40, 40, 40, 40, 40, 40, 40,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 64, 64, 64, 64, 64, 64, 64, 64
+ };
+
+ // Map bit length i to closest aligned fixed bit width that can contain i bits.
+ const uint8_t ClosestAlignedFixedBitsMap[65] = {
+ 1, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16, 24, 24, 24, 24, 24, 24, 24, 24,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ 40, 40, 40, 40, 40, 40, 40, 40,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 64, 64, 64, 64, 64, 64, 64, 64
+ };
+
+ // Map bit width to FBS enum.
+ const uint8_t BitWidthToFBSMap[65] = {
+ FixedBitSizes::ONE, FixedBitSizes::ONE, FixedBitSizes::TWO, FixedBitSizes::THREE, FixedBitSizes::FOUR,
+ FixedBitSizes::FIVE, FixedBitSizes::SIX, FixedBitSizes::SEVEN, FixedBitSizes::EIGHT,
+ FixedBitSizes::NINE, FixedBitSizes::TEN, FixedBitSizes::ELEVEN, FixedBitSizes::TWELVE,
+ FixedBitSizes::THIRTEEN, FixedBitSizes::FOURTEEN, FixedBitSizes::FIFTEEN, FixedBitSizes::SIXTEEN,
+ FixedBitSizes::SEVENTEEN, FixedBitSizes::EIGHTEEN, FixedBitSizes::NINETEEN, FixedBitSizes::TWENTY,
+ FixedBitSizes::TWENTYONE, FixedBitSizes::TWENTYTWO, FixedBitSizes::TWENTYTHREE, FixedBitSizes::TWENTYFOUR,
+ FixedBitSizes::TWENTYSIX, FixedBitSizes::TWENTYSIX,
+ FixedBitSizes::TWENTYEIGHT, FixedBitSizes::TWENTYEIGHT,
+ FixedBitSizes::THIRTY, FixedBitSizes::THIRTY,
+ FixedBitSizes::THIRTYTWO, FixedBitSizes::THIRTYTWO,
+ FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY,
+ FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY,
+ FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT,
+ FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT,
+ FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX,
+ FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX,
+ FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR,
+ FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR
+ };
+}
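Taken together, these tables encode a rounding rule: a bit count from 1 to 24 is kept as-is, larger counts round up to 26, 28, 30, 32 and then to multiples of eight, and the aligned variant rounds straight to one of {1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64}. A standalone sketch of the aligned rule, written only to make the table contents readable (it is not ORC code):

#include <cstdint>
#include <iostream>

// Round a bit count up to the next aligned width, mirroring ClosestAlignedFixedBitsMap.
uint32_t closestAligned(uint32_t n) {
  const uint32_t widths[] = {1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64};
  for (uint32_t w : widths) {
    if (n <= w) return w;
  }
  return 64;
}

int main() {
  for (uint32_t n : {0u, 3u, 5u, 9u, 17u, 25u, 33u}) {
    std::cout << n << " -> " << closestAligned(n) << "\n";  // 1, 4, 8, 16, 24, 32, 40
  }
}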
diff --git a/contrib/libs/apache/orc/c++/src/RLEV2Util.hh b/contrib/libs/apache/orc/c++/src/RLEV2Util.hh
index 95a6826eaa..67a94c7c48 100644
--- a/contrib/libs/apache/orc/c++/src/RLEV2Util.hh
+++ b/contrib/libs/apache/orc/c++/src/RLEV2Util.hh
@@ -1,81 +1,81 @@
-/**
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#ifndef ORC_RLEV2UTIL_HH
-#define ORC_RLEV2UTIL_HH
-
-#include "RLEv2.hh"
-
-namespace orc {
- extern const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE];
- extern const uint8_t ClosestFixedBitsMap[65];
- extern const uint8_t ClosestAlignedFixedBitsMap[65];
- extern const uint8_t BitWidthToFBSMap[65];
-
- // The input n must be less than FixedBitSizes::SIZE.
- inline uint32_t decodeBitWidth(uint32_t n) {
- return FBSToBitWidthMap[n];
- }
-
- inline uint32_t getClosestFixedBits(uint32_t n) {
- if (n <= 64) {
- return ClosestFixedBitsMap[n];
- } else {
- return 64;
- }
- }
-
- inline uint32_t getClosestAlignedFixedBits(uint32_t n) {
- if (n <= 64) {
- return ClosestAlignedFixedBitsMap[n];
- } else {
- return 64;
- }
- }
-
- inline uint32_t encodeBitWidth(uint32_t n) {
- if (n <= 64) {
- return BitWidthToFBSMap[n];
- } else {
- return FixedBitSizes::SIXTYFOUR;
- }
- }
-
- inline uint32_t findClosestNumBits(int64_t value) {
- if (value < 0) {
- return getClosestFixedBits(64);
- }
-
- uint32_t count = 0;
- while (value != 0) {
- count++;
- value = value >> 1;
- }
- return getClosestFixedBits(count);
- }
-
- inline bool isSafeSubtract(int64_t left, int64_t right) {
- return ((left ^ right) >= 0) || ((left ^ (left - right)) >= 0);
- }
-
- inline uint32_t RleEncoderV2::getOpCode(EncodingType encoding) {
- return static_cast<uint32_t >(encoding << 6);
- }
-}
-
-#endif //ORC_RLEV2UTIL_HH
+/**
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef ORC_RLEV2UTIL_HH
+#define ORC_RLEV2UTIL_HH
+
+#include "RLEv2.hh"
+
+namespace orc {
+ extern const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE];
+ extern const uint8_t ClosestFixedBitsMap[65];
+ extern const uint8_t ClosestAlignedFixedBitsMap[65];
+ extern const uint8_t BitWidthToFBSMap[65];
+
+ // The input n must be less than FixedBitSizes::SIZE.
+ inline uint32_t decodeBitWidth(uint32_t n) {
+ return FBSToBitWidthMap[n];
+ }
+
+ inline uint32_t getClosestFixedBits(uint32_t n) {
+ if (n <= 64) {
+ return ClosestFixedBitsMap[n];
+ } else {
+ return 64;
+ }
+ }
+
+ inline uint32_t getClosestAlignedFixedBits(uint32_t n) {
+ if (n <= 64) {
+ return ClosestAlignedFixedBitsMap[n];
+ } else {
+ return 64;
+ }
+ }
+
+ inline uint32_t encodeBitWidth(uint32_t n) {
+ if (n <= 64) {
+ return BitWidthToFBSMap[n];
+ } else {
+ return FixedBitSizes::SIXTYFOUR;
+ }
+ }
+
+ inline uint32_t findClosestNumBits(int64_t value) {
+ if (value < 0) {
+ return getClosestFixedBits(64);
+ }
+
+ uint32_t count = 0;
+ while (value != 0) {
+ count++;
+ value = value >> 1;
+ }
+ return getClosestFixedBits(count);
+ }
+
+ inline bool isSafeSubtract(int64_t left, int64_t right) {
+ return ((left ^ right) >= 0) || ((left ^ (left - right)) >= 0);
+ }
+
+ inline uint32_t RleEncoderV2::getOpCode(EncodingType encoding) {
+ return static_cast<uint32_t >(encoding << 6);
+ }
+}
+
+#endif //ORC_RLEV2UTIL_HH
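findClosestNumBits above counts the significant bits of a non-negative value and then rounds that count through ClosestFixedBitsMap, while isSafeSubtract reports whether left - right stays inside int64_t (true whenever the operands share a sign or the difference keeps the sign of left). A standalone bit count for two sample values, illustration only: 100 (0b1100100) needs 7 bits and 1000 (0b1111101000) needs 10, and both widths are exact in the map.

#include <cstdint>
#include <iostream>

// Count the significant bits of a non-negative value, as findClosestNumBits does before rounding.
uint32_t significantBits(int64_t value) {
  uint32_t count = 0;
  while (value != 0) {
    ++count;
    value >>= 1;
  }
  return count;
}

int main() {
  std::cout << significantBits(100) << " " << significantBits(1000) << "\n";  // prints: 7 10
}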
diff --git a/contrib/libs/apache/orc/c++/src/RLEv1.cc b/contrib/libs/apache/orc/c++/src/RLEv1.cc
index fe333978db..aae9726bf6 100644
--- a/contrib/libs/apache/orc/c++/src/RLEv1.cc
+++ b/contrib/libs/apache/orc/c++/src/RLEv1.cc
@@ -1,302 +1,302 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Adaptor.hh"
-#include "Compression.hh"
-#include "orc/Exceptions.hh"
-#include "RLEv1.hh"
-
-#include <algorithm>
-
-namespace orc {
-
-const uint64_t MINIMUM_REPEAT = 3;
-const uint64_t MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT;
-
-const int64_t BASE_128_MASK = 0x7f;
-
-const int64_t MAX_DELTA = 127;
-const int64_t MIN_DELTA = -128;
-const uint64_t MAX_LITERAL_SIZE = 128;
-
-RleEncoderV1::RleEncoderV1(
- std::unique_ptr<BufferedOutputStream> outStream,
- bool hasSigned):
- RleEncoder(std::move(outStream), hasSigned) {
- literals = new int64_t[MAX_LITERAL_SIZE];
- delta = 0;
- repeat = false;
- tailRunLength = 0;
-}
-
-RleEncoderV1::~RleEncoderV1() {
- delete [] literals;
-}
-
-void RleEncoderV1::writeValues() {
- if (numLiterals != 0) {
- if (repeat) {
- writeByte(static_cast<char>
- (static_cast<uint64_t>(numLiterals) - MINIMUM_REPEAT));
- writeByte(static_cast<char>(delta));
- if (isSigned) {
- writeVslong(literals[0]);
- } else {
- writeVulong(literals[0]);
- }
- } else {
- writeByte(static_cast<char>(-numLiterals));
- for(size_t i=0; i < numLiterals; ++i) {
- if (isSigned) {
- writeVslong(literals[i]);
- } else {
- writeVulong(literals[i]);
- }
- }
- }
- repeat = false;
- numLiterals = 0;
- tailRunLength = 0;
- }
-}
-
-uint64_t RleEncoderV1::flush() {
- writeValues();
- outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition));
- uint64_t dataSize = outputStream->flush();
- bufferLength = bufferPosition = 0;
- return dataSize;
-}
-
-void RleEncoderV1::write(int64_t value) {
- if (numLiterals == 0) {
- literals[numLiterals++] = value;
- tailRunLength = 1;
- } else if (repeat) {
- if (value == literals[0] + delta * static_cast<int64_t>(numLiterals)) {
- numLiterals += 1;
- if (numLiterals == MAXIMUM_REPEAT) {
- writeValues();
- }
- } else {
- writeValues();
- literals[numLiterals++] = value;
- tailRunLength = 1;
- }
- } else {
- if (tailRunLength == 1) {
- delta = value - literals[numLiterals - 1];
- if (delta < MIN_DELTA || delta > MAX_DELTA) {
- tailRunLength = 1;
- } else {
- tailRunLength = 2;
- }
- } else if (value == literals[numLiterals - 1] + delta) {
- tailRunLength += 1;
- } else {
- delta = value - literals[numLiterals - 1];
- if (delta < MIN_DELTA || delta > MAX_DELTA) {
- tailRunLength = 1;
- } else {
- tailRunLength = 2;
- }
- }
- if (tailRunLength == MINIMUM_REPEAT) {
- if (numLiterals + 1 == MINIMUM_REPEAT) {
- repeat = true;
- numLiterals += 1;
- } else {
- numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1);
- int64_t base = literals[numLiterals];
- writeValues();
- literals[0] = base;
- repeat = true;
- numLiterals = MINIMUM_REPEAT;
- }
- } else {
- literals[numLiterals++] = value;
- if (numLiterals == MAX_LITERAL_SIZE) {
- writeValues();
- }
- }
- }
-}
-
-signed char RleDecoderV1::readByte() {
- if (bufferStart == bufferEnd) {
- int bufferLength;
- const void* bufferPointer;
- if (!inputStream->Next(&bufferPointer, &bufferLength)) {
- throw ParseError("bad read in readByte");
- }
- bufferStart = static_cast<const char*>(bufferPointer);
- bufferEnd = bufferStart + bufferLength;
- }
- return *(bufferStart++);
-}
-
-uint64_t RleDecoderV1::readLong() {
- uint64_t result = 0;
- int64_t offset = 0;
- signed char ch = readByte();
- if (ch >= 0) {
- result = static_cast<uint64_t>(ch);
- } else {
- result = static_cast<uint64_t>(ch) & BASE_128_MASK;
- while ((ch = readByte()) < 0) {
- offset += 7;
- result |= (static_cast<uint64_t>(ch) & BASE_128_MASK) << offset;
- }
- result |= static_cast<uint64_t>(ch) << (offset + 7);
- }
- return result;
-}
-
-void RleDecoderV1::skipLongs(uint64_t numValues) {
- while (numValues > 0) {
- if (readByte() >= 0) {
- --numValues;
- }
- }
-}
-
-void RleDecoderV1::readHeader() {
- signed char ch = readByte();
- if (ch < 0) {
- remainingValues = static_cast<uint64_t>(-ch);
- repeating = false;
- } else {
- remainingValues = static_cast<uint64_t>(ch) + MINIMUM_REPEAT;
- repeating = true;
- delta = readByte();
- value = isSigned
- ? unZigZag(readLong())
- : static_cast<int64_t>(readLong());
- }
-}
-
-RleDecoderV1::RleDecoderV1(std::unique_ptr<SeekableInputStream> input,
- bool hasSigned)
- : inputStream(std::move(input)),
- isSigned(hasSigned),
- remainingValues(0),
- value(0),
- bufferStart(nullptr),
- bufferEnd(bufferStart),
- delta(0),
- repeating(false) {
-}
-
-void RleDecoderV1::seek(PositionProvider& location) {
- // move the input stream
- inputStream->seek(location);
- // force a re-read from the stream
- bufferEnd = bufferStart;
- // read a new header
- readHeader();
- // skip ahead the given number of records
- skip(location.next());
-}
-
-void RleDecoderV1::skip(uint64_t numValues) {
- while (numValues > 0) {
- if (remainingValues == 0) {
- readHeader();
- }
- uint64_t count = std::min(numValues, remainingValues);
- remainingValues -= count;
- numValues -= count;
- if (repeating) {
- value += delta * static_cast<int64_t>(count);
- } else {
- skipLongs(count);
- }
- }
-}
-
-void RleDecoderV1::next(int64_t* const data,
- const uint64_t numValues,
- const char* const notNull) {
- uint64_t position = 0;
- // skipNulls()
- if (notNull) {
- // Skip over null values.
- while (position < numValues && !notNull[position]) {
- ++position;
- }
- }
- while (position < numValues) {
- // If we are out of values, read more.
- if (remainingValues == 0) {
- readHeader();
- }
- // How many do we read out of this block?
- uint64_t count = std::min(numValues - position, remainingValues);
- uint64_t consumed = 0;
- if (repeating) {
- if (notNull) {
- for (uint64_t i = 0; i < count; ++i) {
- if (notNull[position + i]) {
- data[position + i] = value + static_cast<int64_t>(consumed) * delta;
- consumed += 1;
- }
- }
- } else {
- for (uint64_t i = 0; i < count; ++i) {
- data[position + i] = value + static_cast<int64_t>(i) * delta;
- }
- consumed = count;
- }
- value += static_cast<int64_t>(consumed) * delta;
- } else {
- if (notNull) {
- for (uint64_t i = 0 ; i < count; ++i) {
- if (notNull[position + i]) {
- data[position + i] = isSigned
- ? unZigZag(readLong())
- : static_cast<int64_t>(readLong());
- ++consumed;
- }
- }
- } else {
- if (isSigned) {
- for (uint64_t i = 0; i < count; ++i) {
- data[position + i] = unZigZag(readLong());
- }
- } else {
- for (uint64_t i = 0; i < count; ++i) {
- data[position + i] = static_cast<int64_t>(readLong());
- }
- }
- consumed = count;
- }
- }
- remainingValues -= consumed;
- position += count;
-
- // skipNulls()
- if (notNull) {
- // Skip over null values.
- while (position < numValues && !notNull[position]) {
- ++position;
- }
- }
- }
-}
-
-} // namespace orc
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Adaptor.hh"
+#include "Compression.hh"
+#include "orc/Exceptions.hh"
+#include "RLEv1.hh"
+
+#include <algorithm>
+
+namespace orc {
+
+const uint64_t MINIMUM_REPEAT = 3;
+const uint64_t MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT;
+
+const int64_t BASE_128_MASK = 0x7f;
+
+const int64_t MAX_DELTA = 127;
+const int64_t MIN_DELTA = -128;
+const uint64_t MAX_LITERAL_SIZE = 128;
+
+RleEncoderV1::RleEncoderV1(
+ std::unique_ptr<BufferedOutputStream> outStream,
+ bool hasSigned):
+ RleEncoder(std::move(outStream), hasSigned) {
+ literals = new int64_t[MAX_LITERAL_SIZE];
+ delta = 0;
+ repeat = false;
+ tailRunLength = 0;
+}
+
+RleEncoderV1::~RleEncoderV1() {
+ delete [] literals;
+}
+
+void RleEncoderV1::writeValues() {
+ if (numLiterals != 0) {
+ if (repeat) {
+ writeByte(static_cast<char>
+ (static_cast<uint64_t>(numLiterals) - MINIMUM_REPEAT));
+ writeByte(static_cast<char>(delta));
+ if (isSigned) {
+ writeVslong(literals[0]);
+ } else {
+ writeVulong(literals[0]);
+ }
+ } else {
+ writeByte(static_cast<char>(-numLiterals));
+ for(size_t i=0; i < numLiterals; ++i) {
+ if (isSigned) {
+ writeVslong(literals[i]);
+ } else {
+ writeVulong(literals[i]);
+ }
+ }
+ }
+ repeat = false;
+ numLiterals = 0;
+ tailRunLength = 0;
+ }
+}
+
+uint64_t RleEncoderV1::flush() {
+ writeValues();
+ outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition));
+ uint64_t dataSize = outputStream->flush();
+ bufferLength = bufferPosition = 0;
+ return dataSize;
+}
+
+void RleEncoderV1::write(int64_t value) {
+ if (numLiterals == 0) {
+ literals[numLiterals++] = value;
+ tailRunLength = 1;
+ } else if (repeat) {
+ if (value == literals[0] + delta * static_cast<int64_t>(numLiterals)) {
+ numLiterals += 1;
+ if (numLiterals == MAXIMUM_REPEAT) {
+ writeValues();
+ }
+ } else {
+ writeValues();
+ literals[numLiterals++] = value;
+ tailRunLength = 1;
+ }
+ } else {
+ if (tailRunLength == 1) {
+ delta = value - literals[numLiterals - 1];
+ if (delta < MIN_DELTA || delta > MAX_DELTA) {
+ tailRunLength = 1;
+ } else {
+ tailRunLength = 2;
+ }
+ } else if (value == literals[numLiterals - 1] + delta) {
+ tailRunLength += 1;
+ } else {
+ delta = value - literals[numLiterals - 1];
+ if (delta < MIN_DELTA || delta > MAX_DELTA) {
+ tailRunLength = 1;
+ } else {
+ tailRunLength = 2;
+ }
+ }
+ if (tailRunLength == MINIMUM_REPEAT) {
+ if (numLiterals + 1 == MINIMUM_REPEAT) {
+ repeat = true;
+ numLiterals += 1;
+ } else {
+ numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1);
+ int64_t base = literals[numLiterals];
+ writeValues();
+ literals[0] = base;
+ repeat = true;
+ numLiterals = MINIMUM_REPEAT;
+ }
+ } else {
+ literals[numLiterals++] = value;
+ if (numLiterals == MAX_LITERAL_SIZE) {
+ writeValues();
+ }
+ }
+ }
+}
+
+signed char RleDecoderV1::readByte() {
+ if (bufferStart == bufferEnd) {
+ int bufferLength;
+ const void* bufferPointer;
+ if (!inputStream->Next(&bufferPointer, &bufferLength)) {
+ throw ParseError("bad read in readByte");
+ }
+ bufferStart = static_cast<const char*>(bufferPointer);
+ bufferEnd = bufferStart + bufferLength;
+ }
+ return *(bufferStart++);
+}
+
+uint64_t RleDecoderV1::readLong() {
+ uint64_t result = 0;
+ int64_t offset = 0;
+ signed char ch = readByte();
+ if (ch >= 0) {
+ result = static_cast<uint64_t>(ch);
+ } else {
+ result = static_cast<uint64_t>(ch) & BASE_128_MASK;
+ while ((ch = readByte()) < 0) {
+ offset += 7;
+ result |= (static_cast<uint64_t>(ch) & BASE_128_MASK) << offset;
+ }
+ result |= static_cast<uint64_t>(ch) << (offset + 7);
+ }
+ return result;
+}
+
+void RleDecoderV1::skipLongs(uint64_t numValues) {
+ while (numValues > 0) {
+ if (readByte() >= 0) {
+ --numValues;
+ }
+ }
+}
+
+void RleDecoderV1::readHeader() {
+ signed char ch = readByte();
+ if (ch < 0) {
+ remainingValues = static_cast<uint64_t>(-ch);
+ repeating = false;
+ } else {
+ remainingValues = static_cast<uint64_t>(ch) + MINIMUM_REPEAT;
+ repeating = true;
+ delta = readByte();
+ value = isSigned
+ ? unZigZag(readLong())
+ : static_cast<int64_t>(readLong());
+ }
+}
+
+RleDecoderV1::RleDecoderV1(std::unique_ptr<SeekableInputStream> input,
+ bool hasSigned)
+ : inputStream(std::move(input)),
+ isSigned(hasSigned),
+ remainingValues(0),
+ value(0),
+ bufferStart(nullptr),
+ bufferEnd(bufferStart),
+ delta(0),
+ repeating(false) {
+}
+
+void RleDecoderV1::seek(PositionProvider& location) {
+ // move the input stream
+ inputStream->seek(location);
+ // force a re-read from the stream
+ bufferEnd = bufferStart;
+ // read a new header
+ readHeader();
+ // skip ahead the given number of records
+ skip(location.next());
+}
+
+void RleDecoderV1::skip(uint64_t numValues) {
+ while (numValues > 0) {
+ if (remainingValues == 0) {
+ readHeader();
+ }
+ uint64_t count = std::min(numValues, remainingValues);
+ remainingValues -= count;
+ numValues -= count;
+ if (repeating) {
+ value += delta * static_cast<int64_t>(count);
+ } else {
+ skipLongs(count);
+ }
+ }
+}
+
+void RleDecoderV1::next(int64_t* const data,
+ const uint64_t numValues,
+ const char* const notNull) {
+ uint64_t position = 0;
+ // skipNulls()
+ if (notNull) {
+ // Skip over null values.
+ while (position < numValues && !notNull[position]) {
+ ++position;
+ }
+ }
+ while (position < numValues) {
+ // If we are out of values, read more.
+ if (remainingValues == 0) {
+ readHeader();
+ }
+ // How many do we read out of this block?
+ uint64_t count = std::min(numValues - position, remainingValues);
+ uint64_t consumed = 0;
+ if (repeating) {
+ if (notNull) {
+ for (uint64_t i = 0; i < count; ++i) {
+ if (notNull[position + i]) {
+ data[position + i] = value + static_cast<int64_t>(consumed) * delta;
+ consumed += 1;
+ }
+ }
+ } else {
+ for (uint64_t i = 0; i < count; ++i) {
+ data[position + i] = value + static_cast<int64_t>(i) * delta;
+ }
+ consumed = count;
+ }
+ value += static_cast<int64_t>(consumed) * delta;
+ } else {
+ if (notNull) {
+ for (uint64_t i = 0 ; i < count; ++i) {
+ if (notNull[position + i]) {
+ data[position + i] = isSigned
+ ? unZigZag(readLong())
+ : static_cast<int64_t>(readLong());
+ ++consumed;
+ }
+ }
+ } else {
+ if (isSigned) {
+ for (uint64_t i = 0; i < count; ++i) {
+ data[position + i] = unZigZag(readLong());
+ }
+ } else {
+ for (uint64_t i = 0; i < count; ++i) {
+ data[position + i] = static_cast<int64_t>(readLong());
+ }
+ }
+ consumed = count;
+ }
+ }
+ remainingValues -= consumed;
+ position += count;
+
+ // skipNulls()
+ if (notNull) {
+ // Skip over null values.
+ while (position < numValues && !notNull[position]) {
+ ++position;
+ }
+ }
+ }
+}
+
+} // namespace orc
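In the v1 format written and read above, a run is a non-negative header byte holding count - MINIMUM_REPEAT, a signed delta byte, and the base value as a base-128 varint, while a literal sequence is a negative header byte holding -count followed by the varint-coded values. Signed streams additionally zig-zag the values so that small negative numbers stay short on the wire. A standalone sketch of the zig-zag transform (illustration only, not lifted from the ORC sources):

#include <cstdint>
#include <iostream>

// Map signed values to unsigned so small magnitudes become small codes: 0,-1,1,-2,2 -> 0,1,2,3,4.
uint64_t zigZag(int64_t v) {
  return (static_cast<uint64_t>(v) << 1) ^ static_cast<uint64_t>(v >> 63);
}

int64_t unZigZag(uint64_t v) {
  return static_cast<int64_t>(v >> 1) ^ -static_cast<int64_t>(v & 1);
}

int main() {
  const int64_t samples[] = {0, -1, 1, -3, 100};
  for (int64_t v : samples) {
    uint64_t z = zigZag(v);
    std::cout << v << " -> " << z << " -> " << unZigZag(z) << "\n";  // every value round-trips
  }
}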
diff --git a/contrib/libs/apache/orc/c++/src/RLEv1.hh b/contrib/libs/apache/orc/c++/src/RLEv1.hh
index 8e31d70873..eb0cf1d8c2 100644
--- a/contrib/libs/apache/orc/c++/src/RLEv1.hh
+++ b/contrib/libs/apache/orc/c++/src/RLEv1.hh
@@ -1,91 +1,91 @@
-/**
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#ifndef ORC_RLEV1_HH
-#define ORC_RLEV1_HH
-
-#include "Adaptor.hh"
-#include "RLE.hh"
-
-#include <memory>
-
-namespace orc {
-
-class RleEncoderV1 : public RleEncoder {
-public:
- RleEncoderV1(std::unique_ptr<BufferedOutputStream> outStream,
- bool hasSigned);
- ~RleEncoderV1() override ;
-
- /**
- * Flushing underlying BufferedOutputStream
- */
- uint64_t flush() override;
-
- void write(int64_t val) override;
-
-private:
- int64_t delta;
- bool repeat;
- uint64_t tailRunLength;
-
- void writeValues();
-};
-
-class RleDecoderV1 : public RleDecoder {
-public:
- RleDecoderV1(std::unique_ptr<SeekableInputStream> input,
- bool isSigned);
-
- /**
- * Seek to a particular spot.
- */
- void seek(PositionProvider&) override;
-
- /**
- * Seek over a given number of values.
- */
- void skip(uint64_t numValues) override;
-
- /**
- * Read a number of values into the batch.
- */
- void next(int64_t* data, uint64_t numValues,
- const char* notNull) override;
-
-private:
- inline signed char readByte();
-
- inline void readHeader();
-
- inline uint64_t readLong();
-
- inline void skipLongs(uint64_t numValues);
-
- const std::unique_ptr<SeekableInputStream> inputStream;
- const bool isSigned;
- uint64_t remainingValues;
- int64_t value;
- const char *bufferStart;
- const char *bufferEnd;
- int64_t delta;
- bool repeating;
-};
-} // namespace orc
-
-#endif // ORC_RLEV1_HH
+/**
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef ORC_RLEV1_HH
+#define ORC_RLEV1_HH
+
+#include "Adaptor.hh"
+#include "RLE.hh"
+
+#include <memory>
+
+namespace orc {
+
+class RleEncoderV1 : public RleEncoder {
+public:
+ RleEncoderV1(std::unique_ptr<BufferedOutputStream> outStream,
+ bool hasSigned);
+ ~RleEncoderV1() override ;
+
+ /**
+ * Flushing underlying BufferedOutputStream
+ */
+ uint64_t flush() override;
+
+ void write(int64_t val) override;
+
+private:
+ int64_t delta;
+ bool repeat;
+ uint64_t tailRunLength;
+
+ void writeValues();
+};
+
+class RleDecoderV1 : public RleDecoder {
+public:
+ RleDecoderV1(std::unique_ptr<SeekableInputStream> input,
+ bool isSigned);
+
+ /**
+ * Seek to a particular spot.
+ */
+ void seek(PositionProvider&) override;
+
+ /**
+ * Seek over a given number of values.
+ */
+ void skip(uint64_t numValues) override;
+
+ /**
+ * Read a number of values into the batch.
+ */
+ void next(int64_t* data, uint64_t numValues,
+ const char* notNull) override;
+
+private:
+ inline signed char readByte();
+
+ inline void readHeader();
+
+ inline uint64_t readLong();
+
+ inline void skipLongs(uint64_t numValues);
+
+ const std::unique_ptr<SeekableInputStream> inputStream;
+ const bool isSigned;
+ uint64_t remainingValues;
+ int64_t value;
+ const char *bufferStart;
+ const char *bufferEnd;
+ int64_t delta;
+ bool repeating;
+};
+} // namespace orc
+
+#endif // ORC_RLEV1_HH
diff --git a/contrib/libs/apache/orc/c++/src/RLEv2.hh b/contrib/libs/apache/orc/c++/src/RLEv2.hh
index f85dabd9e6..5c740dfd27 100644
--- a/contrib/libs/apache/orc/c++/src/RLEv2.hh
+++ b/contrib/libs/apache/orc/c++/src/RLEv2.hh
@@ -1,251 +1,251 @@
-/**
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#ifndef ORC_RLEV2_HH
-#define ORC_RLEV2_HH
-
-#include "Adaptor.hh"
-#include "orc/Exceptions.hh"
-#include "RLE.hh"
-
-#include <vector>
-
-#define MIN_REPEAT 3
-#define HIST_LEN 32
-namespace orc {
-
-struct FixedBitSizes {
- enum FBS {
- ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE,
- THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN,
- TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX,
- TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR, SIZE
- };
-};
-
-enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 };
-
-struct EncodingOption {
- EncodingType encoding;
- int64_t fixedDelta;
- int64_t gapVsPatchListCount;
- int64_t zigzagLiteralsCount;
- int64_t baseRedLiteralsCount;
- int64_t adjDeltasCount;
- uint32_t zzBits90p;
- uint32_t zzBits100p;
- uint32_t brBits95p;
- uint32_t brBits100p;
- uint32_t bitsDeltaMax;
- uint32_t patchWidth;
- uint32_t patchGapWidth;
- uint32_t patchLength;
- int64_t min;
- bool isFixedDelta;
-};
-
-class RleEncoderV2 : public RleEncoder {
-public:
- RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, bool alignBitPacking = true);
-
- ~RleEncoderV2() override {
- delete [] literals;
- delete [] gapVsPatchList;
- delete [] zigzagLiterals;
- delete [] baseRedLiterals;
- delete [] adjDeltas;
- }
- /**
- * Flushing underlying BufferedOutputStream
- */
- uint64_t flush() override;
-
- void write(int64_t val) override;
-
-private:
-
- const bool alignedBitPacking;
- uint32_t fixedRunLength;
- uint32_t variableRunLength;
- int64_t prevDelta;
- int32_t histgram[HIST_LEN];
-
-  // The four lists below should actually belong to EncodingOption, since they only hold temporary values used in write(int64_t val);
-  // they are moved here for performance reasons.
- int64_t* gapVsPatchList;
- int64_t* zigzagLiterals;
- int64_t* baseRedLiterals;
- int64_t* adjDeltas;
-
- uint32_t getOpCode(EncodingType encoding);
- void determineEncoding(EncodingOption& option);
- void computeZigZagLiterals(EncodingOption& option);
- void preparePatchedBlob(EncodingOption& option);
-
- void writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize);
- void initializeLiterals(int64_t val);
- void writeValues(EncodingOption& option);
- void writeShortRepeatValues(EncodingOption& option);
- void writeDirectValues(EncodingOption& option);
- void writePatchedBasedValues(EncodingOption& option);
- void writeDeltaValues(EncodingOption& option);
- uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist = false);
-};
-
-class RleDecoderV2 : public RleDecoder {
-public:
- RleDecoderV2(std::unique_ptr<SeekableInputStream> input,
- bool isSigned, MemoryPool& pool);
-
- /**
- * Seek to a particular spot.
- */
- void seek(PositionProvider&) override;
-
- /**
- * Seek over a given number of values.
- */
- void skip(uint64_t numValues) override;
-
- /**
- * Read a number of values into the batch.
- */
- void next(int64_t* data, uint64_t numValues,
- const char* notNull) override;
-
-private:
-
- // Used by PATCHED_BASE
- void adjustGapAndPatch() {
- curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >>
- patchBitSize;
- curPatch = unpackedPatch[patchIdx] & patchMask;
- actualGap = 0;
-
- // special case: gap is >255 then patch value will be 0.
- // if gap is <=255 then patch value cannot be 0
- while (curGap == 255 && curPatch == 0) {
- actualGap += 255;
- ++patchIdx;
- curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >>
- patchBitSize;
- curPatch = unpackedPatch[patchIdx] & patchMask;
- }
- // add the left over gap
- actualGap += curGap;
- }
-
- void resetReadLongs() {
- bitsLeft = 0;
- curByte = 0;
- }
-
- void resetRun() {
- resetReadLongs();
- bitSize = 0;
- }
-
- unsigned char readByte() {
- if (bufferStart == bufferEnd) {
- int bufferLength;
- const void* bufferPointer;
- if (!inputStream->Next(&bufferPointer, &bufferLength)) {
- throw ParseError("bad read in RleDecoderV2::readByte");
- }
- bufferStart = static_cast<const char*>(bufferPointer);
- bufferEnd = bufferStart + bufferLength;
- }
-
- unsigned char result = static_cast<unsigned char>(*bufferStart++);
- return result;
-}
-
- int64_t readLongBE(uint64_t bsz);
- int64_t readVslong();
- uint64_t readVulong();
- uint64_t readLongs(int64_t *data, uint64_t offset, uint64_t len,
- uint64_t fb, const char* notNull = nullptr) {
- uint64_t ret = 0;
-
- // TODO: unroll to improve performance
- for(uint64_t i = offset; i < (offset + len); i++) {
- // skip null positions
- if (notNull && !notNull[i]) {
- continue;
- }
- uint64_t result = 0;
- uint64_t bitsLeftToRead = fb;
- while (bitsLeftToRead > bitsLeft) {
- result <<= bitsLeft;
- result |= curByte & ((1 << bitsLeft) - 1);
- bitsLeftToRead -= bitsLeft;
- curByte = readByte();
- bitsLeft = 8;
- }
-
- // handle the left over bits
- if (bitsLeftToRead > 0) {
- result <<= bitsLeftToRead;
- bitsLeft -= static_cast<uint32_t>(bitsLeftToRead);
- result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1);
- }
- data[i] = static_cast<int64_t>(result);
- ++ret;
- }
-
- return ret;
-}
-
- uint64_t nextShortRepeats(int64_t* data, uint64_t offset, uint64_t numValues,
- const char* notNull);
- uint64_t nextDirect(int64_t* data, uint64_t offset, uint64_t numValues,
- const char* notNull);
- uint64_t nextPatched(int64_t* data, uint64_t offset, uint64_t numValues,
- const char* notNull);
- uint64_t nextDelta(int64_t* data, uint64_t offset, uint64_t numValues,
- const char* notNull);
-
- const std::unique_ptr<SeekableInputStream> inputStream;
- const bool isSigned;
-
- unsigned char firstByte;
- uint64_t runLength;
- uint64_t runRead;
- const char *bufferStart;
- const char *bufferEnd;
- int64_t deltaBase; // Used by DELTA
- uint64_t byteSize; // Used by SHORT_REPEAT and PATCHED_BASE
- int64_t firstValue; // Used by SHORT_REPEAT and DELTA
- int64_t prevValue; // Used by DELTA
- uint32_t bitSize; // Used by DIRECT, PATCHED_BASE and DELTA
- uint32_t bitsLeft; // Used by anything that uses readLongs
- uint32_t curByte; // Used by anything that uses readLongs
- uint32_t patchBitSize; // Used by PATCHED_BASE
- uint64_t unpackedIdx; // Used by PATCHED_BASE
- uint64_t patchIdx; // Used by PATCHED_BASE
- int64_t base; // Used by PATCHED_BASE
- uint64_t curGap; // Used by PATCHED_BASE
- int64_t curPatch; // Used by PATCHED_BASE
- int64_t patchMask; // Used by PATCHED_BASE
- int64_t actualGap; // Used by PATCHED_BASE
- DataBuffer<int64_t> unpacked; // Used by PATCHED_BASE
- DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE
-};
-} // namespace orc
-
-#endif // ORC_RLEV2_HH
+/**
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef ORC_RLEV2_HH
+#define ORC_RLEV2_HH
+
+#include "Adaptor.hh"
+#include "orc/Exceptions.hh"
+#include "RLE.hh"
+
+#include <vector>
+
+#define MIN_REPEAT 3
+#define HIST_LEN 32
+namespace orc {
+
+struct FixedBitSizes {
+ enum FBS {
+ ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE,
+ THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN,
+ TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX,
+ TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR, SIZE
+ };
+};
+
+enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 };
+
+struct EncodingOption {
+ EncodingType encoding;
+ int64_t fixedDelta;
+ int64_t gapVsPatchListCount;
+ int64_t zigzagLiteralsCount;
+ int64_t baseRedLiteralsCount;
+ int64_t adjDeltasCount;
+ uint32_t zzBits90p;
+ uint32_t zzBits100p;
+ uint32_t brBits95p;
+ uint32_t brBits100p;
+ uint32_t bitsDeltaMax;
+ uint32_t patchWidth;
+ uint32_t patchGapWidth;
+ uint32_t patchLength;
+ int64_t min;
+ bool isFixedDelta;
+};
+
+class RleEncoderV2 : public RleEncoder {
+public:
+ RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, bool alignBitPacking = true);
+
+ ~RleEncoderV2() override {
+ delete [] literals;
+ delete [] gapVsPatchList;
+ delete [] zigzagLiterals;
+ delete [] baseRedLiterals;
+ delete [] adjDeltas;
+ }
+ /**
+ * Flushing underlying BufferedOutputStream
+ */
+ uint64_t flush() override;
+
+ void write(int64_t val) override;
+
+private:
+
+ const bool alignedBitPacking;
+ uint32_t fixedRunLength;
+ uint32_t variableRunLength;
+ int64_t prevDelta;
+ int32_t histgram[HIST_LEN];
+
+  // The four lists below should actually belong to EncodingOption, since they only hold temporary values used in write(int64_t val);
+  // they are moved here for performance reasons.
+ int64_t* gapVsPatchList;
+ int64_t* zigzagLiterals;
+ int64_t* baseRedLiterals;
+ int64_t* adjDeltas;
+
+ uint32_t getOpCode(EncodingType encoding);
+ void determineEncoding(EncodingOption& option);
+ void computeZigZagLiterals(EncodingOption& option);
+ void preparePatchedBlob(EncodingOption& option);
+
+ void writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize);
+ void initializeLiterals(int64_t val);
+ void writeValues(EncodingOption& option);
+ void writeShortRepeatValues(EncodingOption& option);
+ void writeDirectValues(EncodingOption& option);
+ void writePatchedBasedValues(EncodingOption& option);
+ void writeDeltaValues(EncodingOption& option);
+ uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist = false);
+};
+
+class RleDecoderV2 : public RleDecoder {
+public:
+ RleDecoderV2(std::unique_ptr<SeekableInputStream> input,
+ bool isSigned, MemoryPool& pool);
+
+ /**
+ * Seek to a particular spot.
+ */
+ void seek(PositionProvider&) override;
+
+ /**
+ * Seek over a given number of values.
+ */
+ void skip(uint64_t numValues) override;
+
+ /**
+ * Read a number of values into the batch.
+ */
+ void next(int64_t* data, uint64_t numValues,
+ const char* notNull) override;
+
+private:
+
+ // Used by PATCHED_BASE
+ void adjustGapAndPatch() {
+ curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >>
+ patchBitSize;
+ curPatch = unpackedPatch[patchIdx] & patchMask;
+ actualGap = 0;
+
+ // special case: gap is >255 then patch value will be 0.
+ // if gap is <=255 then patch value cannot be 0
+ while (curGap == 255 && curPatch == 0) {
+ actualGap += 255;
+ ++patchIdx;
+ curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >>
+ patchBitSize;
+ curPatch = unpackedPatch[patchIdx] & patchMask;
+ }
+ // add the left over gap
+ actualGap += curGap;
+ }
+
+ void resetReadLongs() {
+ bitsLeft = 0;
+ curByte = 0;
+ }
+
+ void resetRun() {
+ resetReadLongs();
+ bitSize = 0;
+ }
+
+ unsigned char readByte() {
+ if (bufferStart == bufferEnd) {
+ int bufferLength;
+ const void* bufferPointer;
+ if (!inputStream->Next(&bufferPointer, &bufferLength)) {
+ throw ParseError("bad read in RleDecoderV2::readByte");
+ }
+ bufferStart = static_cast<const char*>(bufferPointer);
+ bufferEnd = bufferStart + bufferLength;
+ }
+
+ unsigned char result = static_cast<unsigned char>(*bufferStart++);
+ return result;
+}
+
+ int64_t readLongBE(uint64_t bsz);
+ int64_t readVslong();
+ uint64_t readVulong();
+ uint64_t readLongs(int64_t *data, uint64_t offset, uint64_t len,
+ uint64_t fb, const char* notNull = nullptr) {
+ uint64_t ret = 0;
+
+ // TODO: unroll to improve performance
+ for(uint64_t i = offset; i < (offset + len); i++) {
+ // skip null positions
+ if (notNull && !notNull[i]) {
+ continue;
+ }
+ uint64_t result = 0;
+ uint64_t bitsLeftToRead = fb;
+ while (bitsLeftToRead > bitsLeft) {
+ result <<= bitsLeft;
+ result |= curByte & ((1 << bitsLeft) - 1);
+ bitsLeftToRead -= bitsLeft;
+ curByte = readByte();
+ bitsLeft = 8;
+ }
+
+ // handle the left over bits
+ if (bitsLeftToRead > 0) {
+ result <<= bitsLeftToRead;
+ bitsLeft -= static_cast<uint32_t>(bitsLeftToRead);
+ result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1);
+ }
+ data[i] = static_cast<int64_t>(result);
+ ++ret;
+ }
+
+ return ret;
+}
+
+ uint64_t nextShortRepeats(int64_t* data, uint64_t offset, uint64_t numValues,
+ const char* notNull);
+ uint64_t nextDirect(int64_t* data, uint64_t offset, uint64_t numValues,
+ const char* notNull);
+ uint64_t nextPatched(int64_t* data, uint64_t offset, uint64_t numValues,
+ const char* notNull);
+ uint64_t nextDelta(int64_t* data, uint64_t offset, uint64_t numValues,
+ const char* notNull);
+
+ const std::unique_ptr<SeekableInputStream> inputStream;
+ const bool isSigned;
+
+ unsigned char firstByte;
+ uint64_t runLength;
+ uint64_t runRead;
+ const char *bufferStart;
+ const char *bufferEnd;
+ int64_t deltaBase; // Used by DELTA
+ uint64_t byteSize; // Used by SHORT_REPEAT and PATCHED_BASE
+ int64_t firstValue; // Used by SHORT_REPEAT and DELTA
+ int64_t prevValue; // Used by DELTA
+ uint32_t bitSize; // Used by DIRECT, PATCHED_BASE and DELTA
+ uint32_t bitsLeft; // Used by anything that uses readLongs
+ uint32_t curByte; // Used by anything that uses readLongs
+ uint32_t patchBitSize; // Used by PATCHED_BASE
+ uint64_t unpackedIdx; // Used by PATCHED_BASE
+ uint64_t patchIdx; // Used by PATCHED_BASE
+ int64_t base; // Used by PATCHED_BASE
+ uint64_t curGap; // Used by PATCHED_BASE
+ int64_t curPatch; // Used by PATCHED_BASE
+ int64_t patchMask; // Used by PATCHED_BASE
+ int64_t actualGap; // Used by PATCHED_BASE
+ DataBuffer<int64_t> unpacked; // Used by PATCHED_BASE
+ DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE
+};
+} // namespace orc
+
+#endif // ORC_RLEV2_HH
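readLongs above unpacks fixed-width, MSB-first bit fields that can straddle byte boundaries, carrying the partially consumed byte in curByte/bitsLeft. A standalone trace of the same loop over a two-byte buffer holding three 3-bit values (illustration only, with the stream read replaced by an array index):

#include <cstddef>
#include <cstdint>
#include <iostream>

int main() {
  // Bit patterns 001 010 011 packed MSB-first: 00101001 1....... -> 0x29, 0x80.
  const unsigned char packed[] = {0x29, 0x80};
  const uint32_t fb = 3;                 // fixed bit width of each value
  uint32_t bitsLeft = 0, curByte = 0;
  size_t pos = 0;
  for (int n = 0; n < 3; ++n) {
    uint64_t result = 0;
    uint32_t bitsLeftToRead = fb;
    while (bitsLeftToRead > bitsLeft) {  // consume whole bytes while more bits are needed
      result <<= bitsLeft;
      result |= curByte & ((1u << bitsLeft) - 1);
      bitsLeftToRead -= bitsLeft;
      curByte = packed[pos++];
      bitsLeft = 8;
    }
    if (bitsLeftToRead > 0) {            // take the remaining bits from the current byte
      result <<= bitsLeftToRead;
      bitsLeft -= bitsLeftToRead;
      result |= (curByte >> bitsLeft) & ((1u << bitsLeftToRead) - 1);
    }
    std::cout << result << " ";          // prints 1 2 3
  }
  std::cout << "\n";
}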
diff --git a/contrib/libs/apache/orc/c++/src/Reader.cc b/contrib/libs/apache/orc/c++/src/Reader.cc
index f35106ee44..a633567a9c 100644
--- a/contrib/libs/apache/orc/c++/src/Reader.cc
+++ b/contrib/libs/apache/orc/c++/src/Reader.cc
@@ -1,513 +1,513 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Adaptor.hh"
-#include "BloomFilter.hh"
-#include "Options.hh"
-#include "Reader.hh"
-#include "Statistics.hh"
-#include "StripeStream.hh"
-
-#include "wrap/coded-stream-wrapper.h"
-
-#include <algorithm>
-#include <iostream>
-#include <memory>
-#include <sstream>
-#include <string>
-#include <vector>
-#include <iterator>
-#include <set>
-
-namespace orc {
-
- const WriterVersionImpl &WriterVersionImpl::VERSION_HIVE_8732() {
- static const WriterVersionImpl version(WriterVersion_HIVE_8732);
- return version;
- }
-
- uint64_t getCompressionBlockSize(const proto::PostScript& ps) {
- if (ps.has_compressionblocksize()) {
- return ps.compressionblocksize();
- } else {
- return 256 * 1024;
- }
- }
-
- CompressionKind convertCompressionKind(const proto::PostScript& ps) {
- if (ps.has_compression()) {
- return static_cast<CompressionKind>(ps.compression());
- } else {
- throw ParseError("Unknown compression type");
- }
- }
-
- std::string ColumnSelector::toDotColumnPath() {
- if (columns.empty()) {
- return std::string();
- }
- std::ostringstream columnStream;
- std::copy(columns.begin(), columns.end(),
- std::ostream_iterator<std::string>(columnStream, "."));
- std::string columnPath = columnStream.str();
- return columnPath.substr(0, columnPath.length() - 1);
- }
-
-
- void ColumnSelector::selectChildren(std::vector<bool>& selectedColumns, const Type& type) {
- size_t id = static_cast<size_t>(type.getColumnId());
- if (!selectedColumns[id]) {
- selectedColumns[id] = true;
- for(size_t c = id; c <= type.getMaximumColumnId(); ++c){
- selectedColumns[c] = true;
- }
- }
- }
-
- /**
- * Recurses over a type tree and selects the parents of every selected type.
- * @return true if any child was selected.
- */
- bool ColumnSelector::selectParents(std::vector<bool>& selectedColumns, const Type& type) {
- size_t id = static_cast<size_t>(type.getColumnId());
- bool result = selectedColumns[id];
- for(uint64_t c=0; c < type.getSubtypeCount(); ++c) {
- result |= selectParents(selectedColumns, *type.getSubtype(c));
- }
- selectedColumns[id] = result;
- return result;
- }
-
- /**
- * Recurses over a type tree and build two maps
- * map<TypeName, TypeId>, map<TypeId, Type>
- */
- void ColumnSelector::buildTypeNameIdMap(const Type* type) {
- // map<type_id, Type*>
- idTypeMap[type->getColumnId()] = type;
-
- if (STRUCT == type->getKind()) {
- for (size_t i = 0; i < type->getSubtypeCount(); ++i) {
- const std::string& fieldName = type->getFieldName(i);
- columns.push_back(fieldName);
- nameIdMap[toDotColumnPath()] = type->getSubtype(i)->getColumnId();
- buildTypeNameIdMap(type->getSubtype(i));
- columns.pop_back();
- }
- } else {
- // other non-primitive type
- for (size_t j = 0; j < type->getSubtypeCount(); ++j) {
- buildTypeNameIdMap(type->getSubtype(j));
- }
- }
- }
-
- void ColumnSelector::updateSelected(std::vector<bool>& selectedColumns,
- const RowReaderOptions& options) {
- selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
- if (contents->schema->getKind() == STRUCT && options.getIndexesSet()) {
- for(std::list<uint64_t>::const_iterator field = options.getInclude().begin();
- field != options.getInclude().end(); ++field) {
- updateSelectedByFieldId(selectedColumns, *field);
- }
- } else if (contents->schema->getKind() == STRUCT && options.getNamesSet()) {
- for(std::list<std::string>::const_iterator field = options.getIncludeNames().begin();
- field != options.getIncludeNames().end(); ++field) {
- updateSelectedByName(selectedColumns, *field);
- }
- } else if (options.getTypeIdsSet()) {
- for(std::list<uint64_t>::const_iterator typeId = options.getInclude().begin();
- typeId != options.getInclude().end(); ++typeId) {
- updateSelectedByTypeId(selectedColumns, *typeId);
- }
- } else {
- // default is to select all columns
- std::fill(selectedColumns.begin(), selectedColumns.end(), true);
- }
- selectParents(selectedColumns, *contents->schema.get());
- selectedColumns[0] = true; // column 0 is selected by default
- }
-
- void ColumnSelector::updateSelectedByFieldId(std::vector<bool>& selectedColumns,
- uint64_t fieldId) {
- if (fieldId < contents->schema->getSubtypeCount()) {
- selectChildren(selectedColumns, *contents->schema->getSubtype(fieldId));
- } else {
- std::stringstream buffer;
- buffer << "Invalid column selected " << fieldId << " out of "
- << contents->schema->getSubtypeCount();
- throw ParseError(buffer.str());
- }
- }
-
- void ColumnSelector::updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId) {
- if (typeId < selectedColumns.size()) {
- const Type& type = *idTypeMap[typeId];
- selectChildren(selectedColumns, type);
- } else {
- std::stringstream buffer;
- buffer << "Invalid type id selected " << typeId << " out of "
- << selectedColumns.size();
- throw ParseError(buffer.str());
- }
- }
-
- void ColumnSelector::updateSelectedByName(std::vector<bool>& selectedColumns,
- const std::string& fieldName) {
- std::map<std::string, uint64_t>::const_iterator ite = nameIdMap.find(fieldName);
- if (ite != nameIdMap.end()) {
- updateSelectedByTypeId(selectedColumns, ite->second);
- } else {
- throw ParseError("Invalid column selected " + fieldName);
- }
- }
-
- ColumnSelector::ColumnSelector(const FileContents* _contents): contents(_contents) {
- buildTypeNameIdMap(contents->schema.get());
- }
-
- RowReaderImpl::RowReaderImpl(std::shared_ptr<FileContents> _contents,
- const RowReaderOptions& opts
- ): localTimezone(getLocalTimezone()),
- contents(_contents),
- throwOnHive11DecimalOverflow(opts.getThrowOnHive11DecimalOverflow()),
- forcedScaleOnHive11Decimal(opts.getForcedScaleOnHive11Decimal()),
- footer(contents->footer.get()),
- firstRowOfStripe(*contents->pool, 0),
- enableEncodedBlock(opts.getEnableLazyDecoding()) {
- uint64_t numberOfStripes;
- numberOfStripes = static_cast<uint64_t>(footer->stripes_size());
- currentStripe = numberOfStripes;
- lastStripe = 0;
- currentRowInStripe = 0;
- rowsInCurrentStripe = 0;
- uint64_t rowTotal = 0;
-
- firstRowOfStripe.resize(numberOfStripes);
- for(size_t i=0; i < numberOfStripes; ++i) {
- firstRowOfStripe[i] = rowTotal;
- proto::StripeInformation stripeInfo =
- footer->stripes(static_cast<int>(i));
- rowTotal += stripeInfo.numberofrows();
- bool isStripeInRange = stripeInfo.offset() >= opts.getOffset() &&
- stripeInfo.offset() < opts.getOffset() + opts.getLength();
- if (isStripeInRange) {
- if (i < currentStripe) {
- currentStripe = i;
- }
- if (i >= lastStripe) {
- lastStripe = i + 1;
- }
- }
- }
- firstStripe = currentStripe;
-
- if (currentStripe == 0) {
- previousRow = (std::numeric_limits<uint64_t>::max)();
- } else if (currentStripe == numberOfStripes) {
- previousRow = footer->numberofrows();
- } else {
- previousRow = firstRowOfStripe[firstStripe]-1;
- }
-
- ColumnSelector column_selector(contents.get());
- column_selector.updateSelected(selectedColumns, opts);
- }
-
- CompressionKind RowReaderImpl::getCompression() const {
- return contents->compression;
- }
-
- uint64_t RowReaderImpl::getCompressionSize() const {
- return contents->blockSize;
- }
-
- const std::vector<bool> RowReaderImpl::getSelectedColumns() const {
- return selectedColumns;
- }
-
- const Type& RowReaderImpl::getSelectedType() const {
- if (selectedSchema.get() == nullptr) {
- selectedSchema = buildSelectedType(contents->schema.get(),
- selectedColumns);
- }
- return *(selectedSchema.get());
- }
-
- uint64_t RowReaderImpl::getRowNumber() const {
- return previousRow;
- }
-
- void RowReaderImpl::seekToRow(uint64_t rowNumber) {
- // Empty file
- if (lastStripe == 0) {
- return;
- }
-
- // If we are reading only a portion of the file
- // (bounded by firstStripe and lastStripe),
- // seeking before or after the portion of interest should return no data.
- // Implement this by setting previousRow to the number of rows in the file.
-
- // seeking past lastStripe
- uint64_t num_stripes = static_cast<uint64_t>(footer->stripes_size());
- if ( (lastStripe == num_stripes
- && rowNumber >= footer->numberofrows()) ||
- (lastStripe < num_stripes
- && rowNumber >= firstRowOfStripe[lastStripe]) ) {
- currentStripe = num_stripes;
- previousRow = footer->numberofrows();
- return;
- }
-
- uint64_t seekToStripe = 0;
- while (seekToStripe+1 < lastStripe &&
- firstRowOfStripe[seekToStripe+1] <= rowNumber) {
- seekToStripe++;
- }
-
- // seeking before the first stripe
- if (seekToStripe < firstStripe) {
- currentStripe = num_stripes;
- previousRow = footer->numberofrows();
- return;
- }
-
- currentStripe = seekToStripe;
- currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe];
- previousRow = rowNumber;
- startNextStripe();
-
- uint64_t rowsToSkip = currentRowInStripe;
-
- if (footer->rowindexstride() > 0 &&
- currentStripeInfo.indexlength() > 0) {
- uint32_t rowGroupId =
- static_cast<uint32_t>(currentRowInStripe / footer->rowindexstride());
- rowsToSkip -= rowGroupId * footer->rowindexstride();
-
- if (rowGroupId != 0) {
- seekToRowGroup(rowGroupId);
- }
- }
-
- reader->skip(rowsToSkip);
- }
-
- void RowReaderImpl::seekToRowGroup(uint32_t rowGroupEntryId) {
- // reset all previous row indexes
- rowIndexes.clear();
-
- // obtain row indexes for selected columns
- uint64_t offset = currentStripeInfo.offset();
- for (int i = 0; i < currentStripeFooter.streams_size(); ++i) {
- const proto::Stream& pbStream = currentStripeFooter.streams(i);
- uint64_t colId = pbStream.column();
- if (selectedColumns[colId] && pbStream.has_kind()
- && pbStream.kind() == proto::Stream_Kind_ROW_INDEX) {
- std::unique_ptr<SeekableInputStream> inStream =
- createDecompressor(getCompression(),
- std::unique_ptr<SeekableInputStream>
- (new SeekableFileInputStream
- (contents->stream.get(),
- offset,
- pbStream.length(),
- *contents->pool)),
- getCompressionSize(),
- *contents->pool);
-
- proto::RowIndex rowIndex;
- if (!rowIndex.ParseFromZeroCopyStream(inStream.get())) {
- throw ParseError("Failed to parse the row index");
- }
-
- rowIndexes[colId] = rowIndex;
- }
- offset += pbStream.length();
- }
-
- // store positions for selected columns
- std::vector<std::list<uint64_t>> positions;
-    // store position providers for selected columns
- std::unordered_map<uint64_t, PositionProvider> positionProviders;
-
- for (auto rowIndex = rowIndexes.cbegin();
- rowIndex != rowIndexes.cend(); ++rowIndex) {
- uint64_t colId = rowIndex->first;
- const proto::RowIndexEntry& entry =
- rowIndex->second.entry(static_cast<int32_t>(rowGroupEntryId));
-
- // copy index positions for a specific column
- positions.push_back({});
- auto& position = positions.back();
- for (int pos = 0; pos != entry.positions_size(); ++pos) {
- position.push_back(entry.positions(pos));
- }
- positionProviders.insert(std::make_pair(colId, PositionProvider(position)));
- }
-
- reader->seekToRowGroup(positionProviders);
- }
-
- const FileContents& RowReaderImpl::getFileContents() const {
- return *contents;
- }
-
- bool RowReaderImpl::getThrowOnHive11DecimalOverflow() const {
- return throwOnHive11DecimalOverflow;
- }
-
- int32_t RowReaderImpl::getForcedScaleOnHive11Decimal() const {
- return forcedScaleOnHive11Decimal;
- }
-
- proto::StripeFooter getStripeFooter(const proto::StripeInformation& info,
- const FileContents& contents) {
- uint64_t stripeFooterStart = info.offset() + info.indexlength() +
- info.datalength();
- uint64_t stripeFooterLength = info.footerlength();
- std::unique_ptr<SeekableInputStream> pbStream =
- createDecompressor(contents.compression,
- std::unique_ptr<SeekableInputStream>
- (new SeekableFileInputStream(contents.stream.get(),
- stripeFooterStart,
- stripeFooterLength,
- *contents.pool)),
- contents.blockSize,
- *contents.pool);
- proto::StripeFooter result;
- if (!result.ParseFromZeroCopyStream(pbStream.get())) {
- throw ParseError(std::string("bad StripeFooter from ") +
- pbStream->getName());
- }
- return result;
- }
-
- ReaderImpl::ReaderImpl(std::shared_ptr<FileContents> _contents,
- const ReaderOptions& opts,
- uint64_t _fileLength,
- uint64_t _postscriptLength
- ): contents(std::move(_contents)),
- options(opts),
- fileLength(_fileLength),
- postscriptLength(_postscriptLength),
- footer(contents->footer.get()) {
- isMetadataLoaded = false;
- checkOrcVersion();
- numberOfStripes = static_cast<uint64_t>(footer->stripes_size());
- contents->schema = REDUNDANT_MOVE(convertType(footer->types(0), *footer));
- contents->blockSize = getCompressionBlockSize(*contents->postscript);
- contents->compression= convertCompressionKind(*contents->postscript);
- }
-
- std::string ReaderImpl::getSerializedFileTail() const {
- proto::FileTail tail;
- proto::PostScript *mutable_ps = tail.mutable_postscript();
- mutable_ps->CopyFrom(*contents->postscript);
- proto::Footer *mutableFooter = tail.mutable_footer();
- mutableFooter->CopyFrom(*footer);
- tail.set_filelength(fileLength);
- tail.set_postscriptlength(postscriptLength);
- TString result;
- if (!tail.SerializeToString(&result)) {
- throw ParseError("Failed to serialize file tail");
- }
- return result;
- }
-
- const ReaderOptions& ReaderImpl::getReaderOptions() const {
- return options;
- }
-
- CompressionKind ReaderImpl::getCompression() const {
- return contents->compression;
- }
-
- uint64_t ReaderImpl::getCompressionSize() const {
- return contents->blockSize;
- }
-
- uint64_t ReaderImpl::getNumberOfStripes() const {
- return numberOfStripes;
- }
-
- uint64_t ReaderImpl::getNumberOfStripeStatistics() const {
- if (!isMetadataLoaded) {
- readMetadata();
- }
- return metadata.get() == nullptr ? 0 :
- static_cast<uint64_t>(metadata->stripestats_size());
- }
-
- std::unique_ptr<StripeInformation>
- ReaderImpl::getStripe(uint64_t stripeIndex) const {
- if (stripeIndex > getNumberOfStripes()) {
- throw std::logic_error("stripe index out of range");
- }
- proto::StripeInformation stripeInfo =
- footer->stripes(static_cast<int>(stripeIndex));
-
- return std::unique_ptr<StripeInformation>
- (new StripeInformationImpl
- (stripeInfo.offset(),
- stripeInfo.indexlength(),
- stripeInfo.datalength(),
- stripeInfo.footerlength(),
- stripeInfo.numberofrows(),
- contents->stream.get(),
- *contents->pool,
- contents->compression,
- contents->blockSize));
- }
-
- FileVersion ReaderImpl::getFormatVersion() const {
- if (contents->postscript->version_size() != 2) {
- return FileVersion::v_0_11();
- }
- return FileVersion(
- contents->postscript->version(0),
- contents->postscript->version(1));
- }
-
- uint64_t ReaderImpl::getNumberOfRows() const {
- return footer->numberofrows();
- }
-
- WriterId ReaderImpl::getWriterId() const {
- if (footer->has_writer()) {
- uint32_t id = footer->writer();
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Adaptor.hh"
+#include "BloomFilter.hh"
+#include "Options.hh"
+#include "Reader.hh"
+#include "Statistics.hh"
+#include "StripeStream.hh"
+
+#include "wrap/coded-stream-wrapper.h"
+
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <iterator>
+#include <set>
+
+namespace orc {
+
+ const WriterVersionImpl &WriterVersionImpl::VERSION_HIVE_8732() {
+ static const WriterVersionImpl version(WriterVersion_HIVE_8732);
+ return version;
+ }
+
+ uint64_t getCompressionBlockSize(const proto::PostScript& ps) {
+ if (ps.has_compressionblocksize()) {
+ return ps.compressionblocksize();
+ } else {
+ return 256 * 1024;
+ }
+ }
+
+ CompressionKind convertCompressionKind(const proto::PostScript& ps) {
+ if (ps.has_compression()) {
+ return static_cast<CompressionKind>(ps.compression());
+ } else {
+ throw ParseError("Unknown compression type");
+ }
+ }
+
+ std::string ColumnSelector::toDotColumnPath() {
+ if (columns.empty()) {
+ return std::string();
+ }
+ std::ostringstream columnStream;
+ std::copy(columns.begin(), columns.end(),
+ std::ostream_iterator<std::string>(columnStream, "."));
+ std::string columnPath = columnStream.str();
+ return columnPath.substr(0, columnPath.length() - 1);
+ }
+
+
+ void ColumnSelector::selectChildren(std::vector<bool>& selectedColumns, const Type& type) {
+ size_t id = static_cast<size_t>(type.getColumnId());
+ if (!selectedColumns[id]) {
+ selectedColumns[id] = true;
+ for(size_t c = id; c <= type.getMaximumColumnId(); ++c){
+ selectedColumns[c] = true;
+ }
+ }
+ }
+
+ /**
+ * Recurses over a type tree and selects the parents of every selected type.
+ * @return true if any child was selected.
+ */
+ bool ColumnSelector::selectParents(std::vector<bool>& selectedColumns, const Type& type) {
+ size_t id = static_cast<size_t>(type.getColumnId());
+ bool result = selectedColumns[id];
+ for(uint64_t c=0; c < type.getSubtypeCount(); ++c) {
+ result |= selectParents(selectedColumns, *type.getSubtype(c));
+ }
+ selectedColumns[id] = result;
+ return result;
+ }
+
+ /**
+   * Recurses over a type tree and builds two maps:
+ * map<TypeName, TypeId>, map<TypeId, Type>
+ */
+ void ColumnSelector::buildTypeNameIdMap(const Type* type) {
+ // map<type_id, Type*>
+ idTypeMap[type->getColumnId()] = type;
+
+ if (STRUCT == type->getKind()) {
+ for (size_t i = 0; i < type->getSubtypeCount(); ++i) {
+ const std::string& fieldName = type->getFieldName(i);
+ columns.push_back(fieldName);
+ nameIdMap[toDotColumnPath()] = type->getSubtype(i)->getColumnId();
+ buildTypeNameIdMap(type->getSubtype(i));
+ columns.pop_back();
+ }
+ } else {
+ // other non-primitive type
+ for (size_t j = 0; j < type->getSubtypeCount(); ++j) {
+ buildTypeNameIdMap(type->getSubtype(j));
+ }
+ }
+ }
+
+ void ColumnSelector::updateSelected(std::vector<bool>& selectedColumns,
+ const RowReaderOptions& options) {
+ selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
+ if (contents->schema->getKind() == STRUCT && options.getIndexesSet()) {
+ for(std::list<uint64_t>::const_iterator field = options.getInclude().begin();
+ field != options.getInclude().end(); ++field) {
+ updateSelectedByFieldId(selectedColumns, *field);
+ }
+ } else if (contents->schema->getKind() == STRUCT && options.getNamesSet()) {
+ for(std::list<std::string>::const_iterator field = options.getIncludeNames().begin();
+ field != options.getIncludeNames().end(); ++field) {
+ updateSelectedByName(selectedColumns, *field);
+ }
+ } else if (options.getTypeIdsSet()) {
+ for(std::list<uint64_t>::const_iterator typeId = options.getInclude().begin();
+ typeId != options.getInclude().end(); ++typeId) {
+ updateSelectedByTypeId(selectedColumns, *typeId);
+ }
+ } else {
+ // default is to select all columns
+ std::fill(selectedColumns.begin(), selectedColumns.end(), true);
+ }
+ selectParents(selectedColumns, *contents->schema.get());
+ selectedColumns[0] = true; // column 0 is selected by default
+ }
+
+ void ColumnSelector::updateSelectedByFieldId(std::vector<bool>& selectedColumns,
+ uint64_t fieldId) {
+ if (fieldId < contents->schema->getSubtypeCount()) {
+ selectChildren(selectedColumns, *contents->schema->getSubtype(fieldId));
+ } else {
+ std::stringstream buffer;
+ buffer << "Invalid column selected " << fieldId << " out of "
+ << contents->schema->getSubtypeCount();
+ throw ParseError(buffer.str());
+ }
+ }
+
+ void ColumnSelector::updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId) {
+ if (typeId < selectedColumns.size()) {
+ const Type& type = *idTypeMap[typeId];
+ selectChildren(selectedColumns, type);
+ } else {
+ std::stringstream buffer;
+ buffer << "Invalid type id selected " << typeId << " out of "
+ << selectedColumns.size();
+ throw ParseError(buffer.str());
+ }
+ }
+
+ void ColumnSelector::updateSelectedByName(std::vector<bool>& selectedColumns,
+ const std::string& fieldName) {
+ std::map<std::string, uint64_t>::const_iterator ite = nameIdMap.find(fieldName);
+ if (ite != nameIdMap.end()) {
+ updateSelectedByTypeId(selectedColumns, ite->second);
+ } else {
+ throw ParseError("Invalid column selected " + fieldName);
+ }
+ }
+
+ ColumnSelector::ColumnSelector(const FileContents* _contents): contents(_contents) {
+ buildTypeNameIdMap(contents->schema.get());
+ }
+
+ RowReaderImpl::RowReaderImpl(std::shared_ptr<FileContents> _contents,
+ const RowReaderOptions& opts
+ ): localTimezone(getLocalTimezone()),
+ contents(_contents),
+ throwOnHive11DecimalOverflow(opts.getThrowOnHive11DecimalOverflow()),
+ forcedScaleOnHive11Decimal(opts.getForcedScaleOnHive11Decimal()),
+ footer(contents->footer.get()),
+ firstRowOfStripe(*contents->pool, 0),
+ enableEncodedBlock(opts.getEnableLazyDecoding()) {
+ uint64_t numberOfStripes;
+ numberOfStripes = static_cast<uint64_t>(footer->stripes_size());
+ currentStripe = numberOfStripes;
+ lastStripe = 0;
+ currentRowInStripe = 0;
+ rowsInCurrentStripe = 0;
+ uint64_t rowTotal = 0;
+
+ firstRowOfStripe.resize(numberOfStripes);
+ for(size_t i=0; i < numberOfStripes; ++i) {
+ firstRowOfStripe[i] = rowTotal;
+ proto::StripeInformation stripeInfo =
+ footer->stripes(static_cast<int>(i));
+ rowTotal += stripeInfo.numberofrows();
+ bool isStripeInRange = stripeInfo.offset() >= opts.getOffset() &&
+ stripeInfo.offset() < opts.getOffset() + opts.getLength();
+ if (isStripeInRange) {
+ if (i < currentStripe) {
+ currentStripe = i;
+ }
+ if (i >= lastStripe) {
+ lastStripe = i + 1;
+ }
+ }
+ }
+ firstStripe = currentStripe;
+
+ if (currentStripe == 0) {
+ previousRow = (std::numeric_limits<uint64_t>::max)();
+ } else if (currentStripe == numberOfStripes) {
+ previousRow = footer->numberofrows();
+ } else {
+ previousRow = firstRowOfStripe[firstStripe]-1;
+ }
+
+ ColumnSelector column_selector(contents.get());
+ column_selector.updateSelected(selectedColumns, opts);
+ }
+
+ CompressionKind RowReaderImpl::getCompression() const {
+ return contents->compression;
+ }
+
+ uint64_t RowReaderImpl::getCompressionSize() const {
+ return contents->blockSize;
+ }
+
+ const std::vector<bool> RowReaderImpl::getSelectedColumns() const {
+ return selectedColumns;
+ }
+
+ const Type& RowReaderImpl::getSelectedType() const {
+ if (selectedSchema.get() == nullptr) {
+ selectedSchema = buildSelectedType(contents->schema.get(),
+ selectedColumns);
+ }
+ return *(selectedSchema.get());
+ }
+
+ uint64_t RowReaderImpl::getRowNumber() const {
+ return previousRow;
+ }
+
+ void RowReaderImpl::seekToRow(uint64_t rowNumber) {
+ // Empty file
+ if (lastStripe == 0) {
+ return;
+ }
+
+ // If we are reading only a portion of the file
+ // (bounded by firstStripe and lastStripe),
+ // seeking before or after the portion of interest should return no data.
+ // Implement this by setting previousRow to the number of rows in the file.
+
+ // seeking past lastStripe
+ uint64_t num_stripes = static_cast<uint64_t>(footer->stripes_size());
+ if ( (lastStripe == num_stripes
+ && rowNumber >= footer->numberofrows()) ||
+ (lastStripe < num_stripes
+ && rowNumber >= firstRowOfStripe[lastStripe]) ) {
+ currentStripe = num_stripes;
+ previousRow = footer->numberofrows();
+ return;
+ }
+
+ uint64_t seekToStripe = 0;
+ while (seekToStripe+1 < lastStripe &&
+ firstRowOfStripe[seekToStripe+1] <= rowNumber) {
+ seekToStripe++;
+ }
+
+ // seeking before the first stripe
+ if (seekToStripe < firstStripe) {
+ currentStripe = num_stripes;
+ previousRow = footer->numberofrows();
+ return;
+ }
+
+ currentStripe = seekToStripe;
+ currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe];
+ previousRow = rowNumber;
+ startNextStripe();
+
+ uint64_t rowsToSkip = currentRowInStripe;
+
+ if (footer->rowindexstride() > 0 &&
+ currentStripeInfo.indexlength() > 0) {
+ uint32_t rowGroupId =
+ static_cast<uint32_t>(currentRowInStripe / footer->rowindexstride());
+ rowsToSkip -= rowGroupId * footer->rowindexstride();
+
+ if (rowGroupId != 0) {
+ seekToRowGroup(rowGroupId);
+ }
+ }
+
+ reader->skip(rowsToSkip);
+ }
+
+ void RowReaderImpl::seekToRowGroup(uint32_t rowGroupEntryId) {
+ // reset all previous row indexes
+ rowIndexes.clear();
+
+ // obtain row indexes for selected columns
+ uint64_t offset = currentStripeInfo.offset();
+ for (int i = 0; i < currentStripeFooter.streams_size(); ++i) {
+ const proto::Stream& pbStream = currentStripeFooter.streams(i);
+ uint64_t colId = pbStream.column();
+ if (selectedColumns[colId] && pbStream.has_kind()
+ && pbStream.kind() == proto::Stream_Kind_ROW_INDEX) {
+ std::unique_ptr<SeekableInputStream> inStream =
+ createDecompressor(getCompression(),
+ std::unique_ptr<SeekableInputStream>
+ (new SeekableFileInputStream
+ (contents->stream.get(),
+ offset,
+ pbStream.length(),
+ *contents->pool)),
+ getCompressionSize(),
+ *contents->pool);
+
+ proto::RowIndex rowIndex;
+ if (!rowIndex.ParseFromZeroCopyStream(inStream.get())) {
+ throw ParseError("Failed to parse the row index");
+ }
+
+ rowIndexes[colId] = rowIndex;
+ }
+ offset += pbStream.length();
+ }
+
+ // store positions for selected columns
+ std::vector<std::list<uint64_t>> positions;
+    // store position providers for selected columns
+ std::unordered_map<uint64_t, PositionProvider> positionProviders;
+
+ for (auto rowIndex = rowIndexes.cbegin();
+ rowIndex != rowIndexes.cend(); ++rowIndex) {
+ uint64_t colId = rowIndex->first;
+ const proto::RowIndexEntry& entry =
+ rowIndex->second.entry(static_cast<int32_t>(rowGroupEntryId));
+
+ // copy index positions for a specific column
+ positions.push_back({});
+ auto& position = positions.back();
+ for (int pos = 0; pos != entry.positions_size(); ++pos) {
+ position.push_back(entry.positions(pos));
+ }
+ positionProviders.insert(std::make_pair(colId, PositionProvider(position)));
+ }
+
+ reader->seekToRowGroup(positionProviders);
+ }
+
+ const FileContents& RowReaderImpl::getFileContents() const {
+ return *contents;
+ }
+
+ bool RowReaderImpl::getThrowOnHive11DecimalOverflow() const {
+ return throwOnHive11DecimalOverflow;
+ }
+
+ int32_t RowReaderImpl::getForcedScaleOnHive11Decimal() const {
+ return forcedScaleOnHive11Decimal;
+ }
+
+ proto::StripeFooter getStripeFooter(const proto::StripeInformation& info,
+ const FileContents& contents) {
+ uint64_t stripeFooterStart = info.offset() + info.indexlength() +
+ info.datalength();
+ uint64_t stripeFooterLength = info.footerlength();
+ std::unique_ptr<SeekableInputStream> pbStream =
+ createDecompressor(contents.compression,
+ std::unique_ptr<SeekableInputStream>
+ (new SeekableFileInputStream(contents.stream.get(),
+ stripeFooterStart,
+ stripeFooterLength,
+ *contents.pool)),
+ contents.blockSize,
+ *contents.pool);
+ proto::StripeFooter result;
+ if (!result.ParseFromZeroCopyStream(pbStream.get())) {
+ throw ParseError(std::string("bad StripeFooter from ") +
+ pbStream->getName());
+ }
+ return result;
+ }
+
+ ReaderImpl::ReaderImpl(std::shared_ptr<FileContents> _contents,
+ const ReaderOptions& opts,
+ uint64_t _fileLength,
+ uint64_t _postscriptLength
+ ): contents(std::move(_contents)),
+ options(opts),
+ fileLength(_fileLength),
+ postscriptLength(_postscriptLength),
+ footer(contents->footer.get()) {
+ isMetadataLoaded = false;
+ checkOrcVersion();
+ numberOfStripes = static_cast<uint64_t>(footer->stripes_size());
+ contents->schema = REDUNDANT_MOVE(convertType(footer->types(0), *footer));
+ contents->blockSize = getCompressionBlockSize(*contents->postscript);
+ contents->compression= convertCompressionKind(*contents->postscript);
+ }
+
+ std::string ReaderImpl::getSerializedFileTail() const {
+ proto::FileTail tail;
+ proto::PostScript *mutable_ps = tail.mutable_postscript();
+ mutable_ps->CopyFrom(*contents->postscript);
+ proto::Footer *mutableFooter = tail.mutable_footer();
+ mutableFooter->CopyFrom(*footer);
+ tail.set_filelength(fileLength);
+ tail.set_postscriptlength(postscriptLength);
+ TString result;
+ if (!tail.SerializeToString(&result)) {
+ throw ParseError("Failed to serialize file tail");
+ }
+ return result;
+ }
+
+ const ReaderOptions& ReaderImpl::getReaderOptions() const {
+ return options;
+ }
+
+ CompressionKind ReaderImpl::getCompression() const {
+ return contents->compression;
+ }
+
+ uint64_t ReaderImpl::getCompressionSize() const {
+ return contents->blockSize;
+ }
+
+ uint64_t ReaderImpl::getNumberOfStripes() const {
+ return numberOfStripes;
+ }
+
+ uint64_t ReaderImpl::getNumberOfStripeStatistics() const {
+ if (!isMetadataLoaded) {
+ readMetadata();
+ }
+ return metadata.get() == nullptr ? 0 :
+ static_cast<uint64_t>(metadata->stripestats_size());
+ }
+
+ std::unique_ptr<StripeInformation>
+ ReaderImpl::getStripe(uint64_t stripeIndex) const {
+ if (stripeIndex > getNumberOfStripes()) {
+ throw std::logic_error("stripe index out of range");
+ }
+ proto::StripeInformation stripeInfo =
+ footer->stripes(static_cast<int>(stripeIndex));
+
+ return std::unique_ptr<StripeInformation>
+ (new StripeInformationImpl
+ (stripeInfo.offset(),
+ stripeInfo.indexlength(),
+ stripeInfo.datalength(),
+ stripeInfo.footerlength(),
+ stripeInfo.numberofrows(),
+ contents->stream.get(),
+ *contents->pool,
+ contents->compression,
+ contents->blockSize));
+ }
+
+ FileVersion ReaderImpl::getFormatVersion() const {
+ if (contents->postscript->version_size() != 2) {
+ return FileVersion::v_0_11();
+ }
+ return FileVersion(
+ contents->postscript->version(0),
+ contents->postscript->version(1));
+ }
+
+ uint64_t ReaderImpl::getNumberOfRows() const {
+ return footer->numberofrows();
+ }
+
+ WriterId ReaderImpl::getWriterId() const {
+ if (footer->has_writer()) {
+ uint32_t id = footer->writer();
if (id > WriterId::TRINO_WRITER) {
- return WriterId::UNKNOWN_WRITER;
- } else {
- return static_cast<WriterId>(id);
- }
- }
- return WriterId::ORC_JAVA_WRITER;
- }
-
- uint32_t ReaderImpl::getWriterIdValue() const {
- if (footer->has_writer()) {
- return footer->writer();
- } else {
- return WriterId::ORC_JAVA_WRITER;
- }
- }
-
+ return WriterId::UNKNOWN_WRITER;
+ } else {
+ return static_cast<WriterId>(id);
+ }
+ }
+ return WriterId::ORC_JAVA_WRITER;
+ }
+
+ uint32_t ReaderImpl::getWriterIdValue() const {
+ if (footer->has_writer()) {
+ return footer->writer();
+ } else {
+ return WriterId::ORC_JAVA_WRITER;
+ }
+ }
+
std::string ReaderImpl::getSoftwareVersion() const {
std::ostringstream buffer;
buffer << writerIdToString(getWriterIdValue());
@@ -517,704 +517,704 @@ namespace orc {
return buffer.str();
}
- WriterVersion ReaderImpl::getWriterVersion() const {
- if (!contents->postscript->has_writerversion()) {
- return WriterVersion_ORIGINAL;
- }
- return static_cast<WriterVersion>(contents->postscript->writerversion());
- }
-
- uint64_t ReaderImpl::getContentLength() const {
- return footer->contentlength();
- }
-
- uint64_t ReaderImpl::getStripeStatisticsLength() const {
- return contents->postscript->metadatalength();
- }
-
- uint64_t ReaderImpl::getFileFooterLength() const {
- return contents->postscript->footerlength();
- }
-
- uint64_t ReaderImpl::getFilePostscriptLength() const {
- return postscriptLength;
- }
-
- uint64_t ReaderImpl::getFileLength() const {
- return fileLength;
- }
-
- uint64_t ReaderImpl::getRowIndexStride() const {
- return footer->rowindexstride();
- }
-
- const std::string& ReaderImpl::getStreamName() const {
- return contents->stream->getName();
- }
-
- std::list<std::string> ReaderImpl::getMetadataKeys() const {
- std::list<std::string> result;
- for(int i=0; i < footer->metadata_size(); ++i) {
- result.push_back(footer->metadata(i).name());
- }
- return result;
- }
-
- std::string ReaderImpl::getMetadataValue(const std::string& key) const {
- for(int i=0; i < footer->metadata_size(); ++i) {
- if (footer->metadata(i).name() == TString(key)) {
- return footer->metadata(i).value();
- }
- }
- throw std::range_error("key not found");
- }
-
- void ReaderImpl::getRowIndexStatistics(const proto::StripeInformation& stripeInfo,
- uint64_t stripeIndex, const proto::StripeFooter& currentStripeFooter,
- std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const {
- int num_streams = currentStripeFooter.streams_size();
- uint64_t offset = stripeInfo.offset();
- uint64_t indexEnd = stripeInfo.offset() + stripeInfo.indexlength();
- for (int i = 0; i < num_streams; i++) {
- const proto::Stream& stream = currentStripeFooter.streams(i);
- StreamKind streamKind = static_cast<StreamKind>(stream.kind());
- uint64_t length = static_cast<uint64_t>(stream.length());
- if (streamKind == StreamKind::StreamKind_ROW_INDEX) {
- if (offset + length > indexEnd) {
- std::stringstream msg;
- msg << "Malformed RowIndex stream meta in stripe " << stripeIndex
- << ": streamOffset=" << offset << ", streamLength=" << length
- << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength="
- << stripeInfo.indexlength();
- throw ParseError(msg.str());
- }
- std::unique_ptr<SeekableInputStream> pbStream =
- createDecompressor(contents->compression,
- std::unique_ptr<SeekableInputStream>
- (new SeekableFileInputStream(contents->stream.get(),
- offset,
- length,
- *contents->pool)),
- contents->blockSize,
- *(contents->pool));
-
- proto::RowIndex rowIndex;
- if (!rowIndex.ParseFromZeroCopyStream(pbStream.get())) {
- throw ParseError("Failed to parse RowIndex from stripe footer");
- }
- int num_entries = rowIndex.entry_size();
- size_t column = static_cast<size_t>(stream.column());
- for (int j = 0; j < num_entries; j++) {
- const proto::RowIndexEntry& entry = rowIndex.entry(j);
- (*indexStats)[column].push_back(entry.statistics());
- }
- }
- offset += length;
- }
- }
-
- bool ReaderImpl::hasMetadataValue(const std::string& key) const {
- for(int i=0; i < footer->metadata_size(); ++i) {
- if (footer->metadata(i).name() == TString(key)) {
- return true;
- }
- }
- return false;
- }
-
- const Type& ReaderImpl::getType() const {
- return *(contents->schema.get());
- }
-
- std::unique_ptr<StripeStatistics>
- ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const {
- if (!isMetadataLoaded) {
- readMetadata();
- }
- if (metadata.get() == nullptr) {
- throw std::logic_error("No stripe statistics in file");
- }
- size_t num_cols = static_cast<size_t>(
- metadata->stripestats(
- static_cast<int>(stripeIndex)).colstats_size());
- std::vector<std::vector<proto::ColumnStatistics> > indexStats(num_cols);
-
- proto::StripeInformation currentStripeInfo =
- footer->stripes(static_cast<int>(stripeIndex));
- proto::StripeFooter currentStripeFooter =
- getStripeFooter(currentStripeInfo, *contents.get());
-
- getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats);
-
- const Timezone& writerTZ =
- currentStripeFooter.has_writertimezone() ?
- getTimezoneByName(currentStripeFooter.writertimezone()) :
- getLocalTimezone();
- StatContext statContext(hasCorrectStatistics(), &writerTZ);
- return std::unique_ptr<StripeStatistics>
- (new StripeStatisticsImpl(metadata->stripestats(static_cast<int>(stripeIndex)),
- indexStats, statContext));
- }
-
- std::unique_ptr<Statistics> ReaderImpl::getStatistics() const {
- StatContext statContext(hasCorrectStatistics());
- return std::unique_ptr<Statistics>
- (new StatisticsImpl(*footer, statContext));
- }
-
- std::unique_ptr<ColumnStatistics>
- ReaderImpl::getColumnStatistics(uint32_t index) const {
- if (index >= static_cast<uint64_t>(footer->statistics_size())) {
- throw std::logic_error("column index out of range");
- }
- proto::ColumnStatistics col =
- footer->statistics(static_cast<int32_t>(index));
-
- StatContext statContext(hasCorrectStatistics());
- return std::unique_ptr<ColumnStatistics> (convertColumnStatistics(col, statContext));
- }
-
- void ReaderImpl::readMetadata() const {
- uint64_t metadataSize = contents->postscript->metadatalength();
- uint64_t footerLength = contents->postscript->footerlength();
- if (fileLength < metadataSize + footerLength + postscriptLength + 1) {
- std::stringstream msg;
- msg << "Invalid Metadata length: fileLength=" << fileLength
- << ", metadataLength=" << metadataSize << ", footerLength=" << footerLength
- << ", postscriptLength=" << postscriptLength;
- throw ParseError(msg.str());
- }
- uint64_t metadataStart = fileLength - metadataSize - footerLength - postscriptLength - 1;
- if (metadataSize != 0) {
- std::unique_ptr<SeekableInputStream> pbStream =
- createDecompressor(contents->compression,
- std::unique_ptr<SeekableInputStream>
- (new SeekableFileInputStream(contents->stream.get(),
- metadataStart,
- metadataSize,
- *contents->pool)),
- contents->blockSize,
- *contents->pool);
- metadata.reset(new proto::Metadata());
- if (!metadata->ParseFromZeroCopyStream(pbStream.get())) {
- throw ParseError("Failed to parse the metadata");
- }
- }
- isMetadataLoaded = true;
- }
-
- bool ReaderImpl::hasCorrectStatistics() const {
- return !WriterVersionImpl::VERSION_HIVE_8732().compareGT(getWriterVersion());
- }
-
- void ReaderImpl::checkOrcVersion() {
- FileVersion version = getFormatVersion();
- if (version != FileVersion(0, 11) && version != FileVersion(0, 12)) {
- *(options.getErrorStream())
- << "Warning: ORC file " << contents->stream->getName()
- << " was written in an unknown format version "
- << version.toString() << "\n";
- }
- }
-
- std::unique_ptr<RowReader> ReaderImpl::createRowReader() const {
- RowReaderOptions defaultOpts;
- return createRowReader(defaultOpts);
- }
-
- std::unique_ptr<RowReader> ReaderImpl::createRowReader(
- const RowReaderOptions& opts) const {
- return std::unique_ptr<RowReader>(new RowReaderImpl(contents, opts));
- }
-
- uint64_t maxStreamsForType(const proto::Type& type) {
- switch (static_cast<int64_t>(type.kind())) {
- case proto::Type_Kind_STRUCT:
- return 1;
- case proto::Type_Kind_INT:
- case proto::Type_Kind_LONG:
- case proto::Type_Kind_SHORT:
- case proto::Type_Kind_FLOAT:
- case proto::Type_Kind_DOUBLE:
- case proto::Type_Kind_BOOLEAN:
- case proto::Type_Kind_BYTE:
- case proto::Type_Kind_DATE:
- case proto::Type_Kind_LIST:
- case proto::Type_Kind_MAP:
- case proto::Type_Kind_UNION:
- return 2;
- case proto::Type_Kind_BINARY:
- case proto::Type_Kind_DECIMAL:
- case proto::Type_Kind_TIMESTAMP:
- return 3;
- case proto::Type_Kind_CHAR:
- case proto::Type_Kind_STRING:
- case proto::Type_Kind_VARCHAR:
- return 4;
- default:
- return 0;
- }
- }
-
- uint64_t ReaderImpl::getMemoryUse(int stripeIx) {
- std::vector<bool> selectedColumns;
- selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), true);
- return getMemoryUse(stripeIx, selectedColumns);
- }
-
- uint64_t ReaderImpl::getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx) {
- std::vector<bool> selectedColumns;
- selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
- ColumnSelector column_selector(contents.get());
- if (contents->schema->getKind() == STRUCT && include.begin() != include.end()) {
- for(std::list<uint64_t>::const_iterator field = include.begin();
- field != include.end(); ++field) {
- column_selector.updateSelectedByFieldId(selectedColumns, *field);
- }
- } else {
- // default is to select all columns
- std::fill(selectedColumns.begin(), selectedColumns.end(), true);
- }
- column_selector.selectParents(selectedColumns, *contents->schema.get());
- selectedColumns[0] = true; // column 0 is selected by default
- return getMemoryUse(stripeIx, selectedColumns);
- }
-
- uint64_t ReaderImpl::getMemoryUseByName(const std::list<std::string>& names, int stripeIx) {
- std::vector<bool> selectedColumns;
- selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
- ColumnSelector column_selector(contents.get());
- if (contents->schema->getKind() == STRUCT && names.begin() != names.end()) {
- for(std::list<std::string>::const_iterator field = names.begin();
- field != names.end(); ++field) {
- column_selector.updateSelectedByName(selectedColumns, *field);
- }
- } else {
- // default is to select all columns
- std::fill(selectedColumns.begin(), selectedColumns.end(), true);
- }
- column_selector.selectParents(selectedColumns, *contents->schema.get());
- selectedColumns[0] = true; // column 0 is selected by default
- return getMemoryUse(stripeIx, selectedColumns);
- }
-
- uint64_t ReaderImpl::getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx) {
- std::vector<bool> selectedColumns;
- selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
- ColumnSelector column_selector(contents.get());
- if (include.begin() != include.end()) {
- for(std::list<uint64_t>::const_iterator field = include.begin();
- field != include.end(); ++field) {
- column_selector.updateSelectedByTypeId(selectedColumns, *field);
- }
- } else {
- // default is to select all columns
- std::fill(selectedColumns.begin(), selectedColumns.end(), true);
- }
- column_selector.selectParents(selectedColumns, *contents->schema.get());
- selectedColumns[0] = true; // column 0 is selected by default
- return getMemoryUse(stripeIx, selectedColumns);
- }
-
- uint64_t ReaderImpl::getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns) {
- uint64_t maxDataLength = 0;
-
- if (stripeIx >= 0 && stripeIx < footer->stripes_size()) {
- uint64_t stripe = footer->stripes(stripeIx).datalength();
- if (maxDataLength < stripe) {
- maxDataLength = stripe;
- }
- } else {
- for (int i=0; i < footer->stripes_size(); i++) {
- uint64_t stripe = footer->stripes(i).datalength();
- if (maxDataLength < stripe) {
- maxDataLength = stripe;
- }
- }
- }
-
- bool hasStringColumn = false;
- uint64_t nSelectedStreams = 0;
- for (int i=0; !hasStringColumn && i < footer->types_size(); i++) {
- if (selectedColumns[static_cast<size_t>(i)]) {
- const proto::Type& type = footer->types(i);
- nSelectedStreams += maxStreamsForType(type) ;
- switch (static_cast<int64_t>(type.kind())) {
- case proto::Type_Kind_CHAR:
- case proto::Type_Kind_STRING:
- case proto::Type_Kind_VARCHAR:
- case proto::Type_Kind_BINARY: {
- hasStringColumn = true;
- break;
- }
- default: {
- break;
- }
- }
- }
- }
-
- /* If a string column is read, use stripe datalength as a memory estimate
- * because we don't know the dictionary size. Multiply by 2 because
- * a string column requires two buffers:
- * in the input stream and in the seekable input stream.
- * If no string column is read, estimate from the number of streams.
- */
- uint64_t memory = hasStringColumn ? 2 * maxDataLength :
- std::min(uint64_t(maxDataLength),
- nSelectedStreams * contents->stream->getNaturalReadSize());
-
- // Do we need even more memory to read the footer or the metadata?
- if (memory < contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS) {
- memory = contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS;
- }
- if (memory < contents->postscript->metadatalength()) {
- memory = contents->postscript->metadatalength();
- }
-
- // Account for firstRowOfStripe.
- memory += static_cast<uint64_t>(footer->stripes_size()) * sizeof(uint64_t);
-
- // Decompressors need buffers for each stream
- uint64_t decompressorMemory = 0;
- if (contents->compression != CompressionKind_NONE) {
- for (int i=0; i < footer->types_size(); i++) {
- if (selectedColumns[static_cast<size_t>(i)]) {
- const proto::Type& type = footer->types(i);
- decompressorMemory += maxStreamsForType(type) * contents->blockSize;
- }
- }
- if (contents->compression == CompressionKind_SNAPPY) {
- decompressorMemory *= 2; // Snappy decompressor uses a second buffer
- }
- }
-
- return memory + decompressorMemory ;
- }
-
- void RowReaderImpl::startNextStripe() {
- reader.reset(); // ColumnReaders use lots of memory; free old memory first
- currentStripeInfo = footer->stripes(static_cast<int>(currentStripe));
- uint64_t fileLength = contents->stream->getLength();
- if (currentStripeInfo.offset() + currentStripeInfo.indexlength() +
- currentStripeInfo.datalength() + currentStripeInfo.footerlength() >= fileLength) {
- std::stringstream msg;
- msg << "Malformed StripeInformation at stripe index " << currentStripe << ": fileLength="
- << fileLength << ", StripeInfo=(offset=" << currentStripeInfo.offset() << ", indexLength="
- << currentStripeInfo.indexlength() << ", dataLength=" << currentStripeInfo.datalength()
- << ", footerLength=" << currentStripeInfo.footerlength() << ")";
- throw ParseError(msg.str());
- }
- currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get());
- rowsInCurrentStripe = currentStripeInfo.numberofrows();
- const Timezone& writerTimezone =
- currentStripeFooter.has_writertimezone() ?
- getTimezoneByName(currentStripeFooter.writertimezone()) :
- localTimezone;
- StripeStreamsImpl stripeStreams(*this, currentStripe, currentStripeInfo,
- currentStripeFooter,
- currentStripeInfo.offset(),
- *(contents->stream.get()),
- writerTimezone);
- reader = buildReader(*contents->schema.get(), stripeStreams);
- }
-
- bool RowReaderImpl::next(ColumnVectorBatch& data) {
- if (currentStripe >= lastStripe) {
- data.numElements = 0;
- if (lastStripe > 0) {
- previousRow = firstRowOfStripe[lastStripe - 1] +
- footer->stripes(static_cast<int>(lastStripe - 1)).numberofrows();
- } else {
- previousRow = 0;
- }
- return false;
- }
- if (currentRowInStripe == 0) {
- startNextStripe();
- }
- uint64_t rowsToRead =
- std::min(static_cast<uint64_t>(data.capacity),
- rowsInCurrentStripe - currentRowInStripe);
- data.numElements = rowsToRead;
- if (enableEncodedBlock) {
- reader->nextEncoded(data, rowsToRead, nullptr);
- }
- else {
- reader->next(data, rowsToRead, nullptr);
- }
- // update row number
- previousRow = firstRowOfStripe[currentStripe] + currentRowInStripe;
- currentRowInStripe += rowsToRead;
- if (currentRowInStripe >= rowsInCurrentStripe) {
- currentStripe += 1;
- currentRowInStripe = 0;
- }
- return rowsToRead != 0;
- }
-
- std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch
- (uint64_t capacity) const {
- return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock);
- }
-
- void ensureOrcFooter(InputStream* stream,
- DataBuffer<char> *buffer,
- uint64_t postscriptLength) {
-
- const std::string MAGIC("ORC");
- const uint64_t magicLength = MAGIC.length();
- const char * const bufferStart = buffer->data();
- const uint64_t bufferLength = buffer->size();
-
- if (postscriptLength < magicLength || bufferLength < magicLength) {
- throw ParseError("Invalid ORC postscript length");
- }
- const char* magicStart = bufferStart + bufferLength - 1 - magicLength;
-
- // Look for the magic string at the end of the postscript.
- if (memcmp(magicStart, MAGIC.c_str(), magicLength) != 0) {
- // If there is no magic string at the end, check the beginning.
- // Only files written by Hive 0.11.0 don't have the tail ORC string.
- std::unique_ptr<char[]> frontBuffer( new char[magicLength] );
- stream->read(frontBuffer.get(), magicLength, 0);
- bool foundMatch = memcmp(frontBuffer.get(), MAGIC.c_str(), magicLength) == 0;
-
- if (!foundMatch) {
- throw ParseError("Not an ORC file");
- }
- }
- }
-
- /**
- * Read the file's postscript from the given buffer.
- * @param stream the file stream
- * @param buffer the buffer with the tail of the file.
- * @param postscriptSize the length of postscript in bytes
- */
- std::unique_ptr<proto::PostScript> readPostscript(InputStream *stream,
- DataBuffer<char> *buffer,
- uint64_t postscriptSize) {
- char *ptr = buffer->data();
- uint64_t readSize = buffer->size();
-
- ensureOrcFooter(stream, buffer, postscriptSize);
-
- std::unique_ptr<proto::PostScript> postscript =
- std::unique_ptr<proto::PostScript>(new proto::PostScript());
- if (readSize < 1 + postscriptSize) {
- std::stringstream msg;
- msg << "Invalid ORC postscript length: " << postscriptSize << ", file length = "
- << stream->getLength();
- throw ParseError(msg.str());
- }
- if (!postscript->ParseFromArray(ptr + readSize - 1 - postscriptSize,
- static_cast<int>(postscriptSize))) {
- throw ParseError("Failed to parse the postscript from " +
- stream->getName());
- }
- return REDUNDANT_MOVE(postscript);
- }
-
- /**
- * Check that indices in the type tree are valid, so we won't crash
- * when we convert the proto::Types to TypeImpls.
- */
- void checkProtoTypeIds(const proto::Footer &footer) {
- std::stringstream msg;
- int maxId = footer.types_size();
- if (maxId <= 0) {
- throw ParseError("Footer is corrupt: no types found");
- }
- for (int i = 0; i < maxId; ++i) {
- const proto::Type& type = footer.types(i);
- for (int j = 0; j < type.subtypes_size(); ++j) {
- int subTypeId = static_cast<int>(type.subtypes(j));
- if (subTypeId <= i) {
- msg << "Footer is corrupt: malformed link from type " << i << " to "
- << subTypeId;
- throw ParseError(msg.str());
- }
- if (subTypeId >= maxId) {
- msg << "Footer is corrupt: types(" << subTypeId << ") not exists";
- throw ParseError(msg.str());
- }
- if (j > 0 && static_cast<int>(type.subtypes(j - 1)) >= subTypeId) {
- msg << "Footer is corrupt: subType(" << (j-1) << ") >= subType(" << j
- << ") in types(" << i << "). (" << type.subtypes(j - 1) << " >= "
- << subTypeId << ")";
- throw ParseError(msg.str());
- }
- }
- }
- }
-
- /**
- * Parse the footer from the given buffer.
- * @param stream the file's stream
- * @param buffer the buffer to parse the footer from
- * @param footerOffset the offset within the buffer that contains the footer
- * @param ps the file's postscript
- * @param memoryPool the memory pool to use
- */
- std::unique_ptr<proto::Footer> readFooter(InputStream* stream,
- const DataBuffer<char> *buffer,
- uint64_t footerOffset,
- const proto::PostScript& ps,
- MemoryPool& memoryPool) {
- const char *footerPtr = buffer->data() + footerOffset;
-
- std::unique_ptr<SeekableInputStream> pbStream =
- createDecompressor(convertCompressionKind(ps),
- std::unique_ptr<SeekableInputStream>
- (new SeekableArrayInputStream(footerPtr,
- ps.footerlength())),
- getCompressionBlockSize(ps),
- memoryPool);
-
- std::unique_ptr<proto::Footer> footer =
- std::unique_ptr<proto::Footer>(new proto::Footer());
- if (!footer->ParseFromZeroCopyStream(pbStream.get())) {
- throw ParseError("Failed to parse the footer from " +
- stream->getName());
- }
-
- checkProtoTypeIds(*footer);
- return REDUNDANT_MOVE(footer);
- }
-
- std::unique_ptr<Reader> createReader(std::unique_ptr<InputStream> stream,
- const ReaderOptions& options) {
- std::shared_ptr<FileContents> contents = std::shared_ptr<FileContents>(new FileContents());
- contents->pool = options.getMemoryPool();
- contents->errorStream = options.getErrorStream();
- std::string serializedFooter = options.getSerializedFileTail();
- uint64_t fileLength;
- uint64_t postscriptLength;
- if (serializedFooter.length() != 0) {
- // Parse the file tail from the serialized one.
- proto::FileTail tail;
- if (!tail.ParseFromString(TString(serializedFooter))) {
- throw ParseError("Failed to parse the file tail from string");
- }
- contents->postscript.reset(new proto::PostScript(tail.postscript()));
- contents->footer.reset(new proto::Footer(tail.footer()));
- fileLength = tail.filelength();
- postscriptLength = tail.postscriptlength();
- } else {
- // figure out the size of the file using the option or filesystem
- fileLength = std::min(options.getTailLocation(),
- static_cast<uint64_t>(stream->getLength()));
-
- //read last bytes into buffer to get PostScript
- uint64_t readSize = std::min(fileLength, DIRECTORY_SIZE_GUESS);
- if (readSize < 4) {
- throw ParseError("File size too small");
- }
- std::unique_ptr<DataBuffer<char>> buffer( new DataBuffer<char>(*contents->pool, readSize) );
- stream->read(buffer->data(), readSize, fileLength - readSize);
-
- postscriptLength = buffer->data()[readSize - 1] & 0xff;
- contents->postscript = REDUNDANT_MOVE(readPostscript(stream.get(),
- buffer.get(), postscriptLength));
- uint64_t footerSize = contents->postscript->footerlength();
- uint64_t tailSize = 1 + postscriptLength + footerSize;
- if (tailSize >= fileLength) {
- std::stringstream msg;
- msg << "Invalid ORC tailSize=" << tailSize << ", fileLength=" << fileLength;
- throw ParseError(msg.str());
- }
- uint64_t footerOffset;
-
- if (tailSize > readSize) {
- buffer->resize(footerSize);
- stream->read(buffer->data(), footerSize, fileLength - tailSize);
- footerOffset = 0;
- } else {
- footerOffset = readSize - tailSize;
- }
-
- contents->footer = REDUNDANT_MOVE(readFooter(stream.get(), buffer.get(),
- footerOffset, *contents->postscript, *contents->pool));
- }
- contents->stream = std::move(stream);
- return std::unique_ptr<Reader>(new ReaderImpl(std::move(contents),
- options,
- fileLength,
- postscriptLength));
- }
-
- std::map<uint32_t, BloomFilterIndex>
- ReaderImpl::getBloomFilters(uint32_t stripeIndex,
- const std::set<uint32_t>& included) const {
- std::map<uint32_t, BloomFilterIndex> ret;
-
- // find stripe info
- if (stripeIndex >= static_cast<uint32_t>(footer->stripes_size())) {
- throw std::logic_error("Illegal stripe index: " + to_string(static_cast<int64_t>(stripeIndex)));
- }
- const proto::StripeInformation currentStripeInfo =
- footer->stripes(static_cast<int>(stripeIndex));
- const proto::StripeFooter currentStripeFooter =
- getStripeFooter(currentStripeInfo, *contents);
-
- // iterate stripe footer to get stream of bloomfilter
- uint64_t offset = static_cast<uint64_t>(currentStripeInfo.offset());
- for (int i = 0; i < currentStripeFooter.streams_size(); i++) {
- const proto::Stream& stream = currentStripeFooter.streams(i);
- uint32_t column = static_cast<uint32_t>(stream.column());
- uint64_t length = static_cast<uint64_t>(stream.length());
-
- // a bloom filter stream from a selected column is found
- if (stream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8 &&
- (included.empty() || included.find(column) != included.end())) {
-
- std::unique_ptr<SeekableInputStream> pbStream =
- createDecompressor(contents->compression,
- std::unique_ptr<SeekableInputStream>
- (new SeekableFileInputStream(contents->stream.get(),
- offset,
- length,
- *contents->pool)),
- contents->blockSize,
- *(contents->pool));
-
- proto::BloomFilterIndex pbBFIndex;
- if (!pbBFIndex.ParseFromZeroCopyStream(pbStream.get())) {
- throw ParseError("Failed to parse BloomFilterIndex");
- }
-
- BloomFilterIndex bfIndex;
- for (int j = 0; j < pbBFIndex.bloomfilter_size(); j++) {
- std::unique_ptr<BloomFilter> entry = BloomFilterUTF8Utils::deserialize(
- stream.kind(),
- currentStripeFooter.columns(static_cast<int>(stream.column())),
- pbBFIndex.bloomfilter(j));
- bfIndex.entries.push_back(std::shared_ptr<BloomFilter>(std::move(entry)));
- }
-
- // add bloom filters to result for one column
- ret[column] = bfIndex;
- }
-
- offset += length;
- }
-
- return ret;
- }
-
- RowReader::~RowReader() {
- // PASS
- }
-
- Reader::~Reader() {
- // PASS
- }
-
- InputStream::~InputStream() {
- // PASS
- };
-
-
-
-}// namespace
+ WriterVersion ReaderImpl::getWriterVersion() const {
+ if (!contents->postscript->has_writerversion()) {
+ return WriterVersion_ORIGINAL;
+ }
+ return static_cast<WriterVersion>(contents->postscript->writerversion());
+ }
+
+ uint64_t ReaderImpl::getContentLength() const {
+ return footer->contentlength();
+ }
+
+ uint64_t ReaderImpl::getStripeStatisticsLength() const {
+ return contents->postscript->metadatalength();
+ }
+
+ uint64_t ReaderImpl::getFileFooterLength() const {
+ return contents->postscript->footerlength();
+ }
+
+ uint64_t ReaderImpl::getFilePostscriptLength() const {
+ return postscriptLength;
+ }
+
+ uint64_t ReaderImpl::getFileLength() const {
+ return fileLength;
+ }
+
+ uint64_t ReaderImpl::getRowIndexStride() const {
+ return footer->rowindexstride();
+ }
+
+ const std::string& ReaderImpl::getStreamName() const {
+ return contents->stream->getName();
+ }
+
+ std::list<std::string> ReaderImpl::getMetadataKeys() const {
+ std::list<std::string> result;
+ for(int i=0; i < footer->metadata_size(); ++i) {
+ result.push_back(footer->metadata(i).name());
+ }
+ return result;
+ }
+
+ std::string ReaderImpl::getMetadataValue(const std::string& key) const {
+ for(int i=0; i < footer->metadata_size(); ++i) {
+ if (footer->metadata(i).name() == TString(key)) {
+ return footer->metadata(i).value();
+ }
+ }
+ throw std::range_error("key not found");
+ }
+
+ void ReaderImpl::getRowIndexStatistics(const proto::StripeInformation& stripeInfo,
+ uint64_t stripeIndex, const proto::StripeFooter& currentStripeFooter,
+ std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const {
+ int num_streams = currentStripeFooter.streams_size();
+ uint64_t offset = stripeInfo.offset();
+ uint64_t indexEnd = stripeInfo.offset() + stripeInfo.indexlength();
+ for (int i = 0; i < num_streams; i++) {
+ const proto::Stream& stream = currentStripeFooter.streams(i);
+ StreamKind streamKind = static_cast<StreamKind>(stream.kind());
+ uint64_t length = static_cast<uint64_t>(stream.length());
+ if (streamKind == StreamKind::StreamKind_ROW_INDEX) {
+ if (offset + length > indexEnd) {
+ std::stringstream msg;
+ msg << "Malformed RowIndex stream meta in stripe " << stripeIndex
+ << ": streamOffset=" << offset << ", streamLength=" << length
+ << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength="
+ << stripeInfo.indexlength();
+ throw ParseError(msg.str());
+ }
+ std::unique_ptr<SeekableInputStream> pbStream =
+ createDecompressor(contents->compression,
+ std::unique_ptr<SeekableInputStream>
+ (new SeekableFileInputStream(contents->stream.get(),
+ offset,
+ length,
+ *contents->pool)),
+ contents->blockSize,
+ *(contents->pool));
+
+ proto::RowIndex rowIndex;
+ if (!rowIndex.ParseFromZeroCopyStream(pbStream.get())) {
+ throw ParseError("Failed to parse RowIndex from stripe footer");
+ }
+ int num_entries = rowIndex.entry_size();
+ size_t column = static_cast<size_t>(stream.column());
+ for (int j = 0; j < num_entries; j++) {
+ const proto::RowIndexEntry& entry = rowIndex.entry(j);
+ (*indexStats)[column].push_back(entry.statistics());
+ }
+ }
+ offset += length;
+ }
+ }
+
+ bool ReaderImpl::hasMetadataValue(const std::string& key) const {
+ for(int i=0; i < footer->metadata_size(); ++i) {
+ if (footer->metadata(i).name() == TString(key)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ const Type& ReaderImpl::getType() const {
+ return *(contents->schema.get());
+ }
+
+ std::unique_ptr<StripeStatistics>
+ ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const {
+ if (!isMetadataLoaded) {
+ readMetadata();
+ }
+ if (metadata.get() == nullptr) {
+ throw std::logic_error("No stripe statistics in file");
+ }
+ size_t num_cols = static_cast<size_t>(
+ metadata->stripestats(
+ static_cast<int>(stripeIndex)).colstats_size());
+ std::vector<std::vector<proto::ColumnStatistics> > indexStats(num_cols);
+
+ proto::StripeInformation currentStripeInfo =
+ footer->stripes(static_cast<int>(stripeIndex));
+ proto::StripeFooter currentStripeFooter =
+ getStripeFooter(currentStripeInfo, *contents.get());
+
+ getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats);
+
+ const Timezone& writerTZ =
+ currentStripeFooter.has_writertimezone() ?
+ getTimezoneByName(currentStripeFooter.writertimezone()) :
+ getLocalTimezone();
+ StatContext statContext(hasCorrectStatistics(), &writerTZ);
+ return std::unique_ptr<StripeStatistics>
+ (new StripeStatisticsImpl(metadata->stripestats(static_cast<int>(stripeIndex)),
+ indexStats, statContext));
+ }
+
+ std::unique_ptr<Statistics> ReaderImpl::getStatistics() const {
+ StatContext statContext(hasCorrectStatistics());
+ return std::unique_ptr<Statistics>
+ (new StatisticsImpl(*footer, statContext));
+ }
+
+ std::unique_ptr<ColumnStatistics>
+ ReaderImpl::getColumnStatistics(uint32_t index) const {
+ if (index >= static_cast<uint64_t>(footer->statistics_size())) {
+ throw std::logic_error("column index out of range");
+ }
+ proto::ColumnStatistics col =
+ footer->statistics(static_cast<int32_t>(index));
+
+ StatContext statContext(hasCorrectStatistics());
+ return std::unique_ptr<ColumnStatistics> (convertColumnStatistics(col, statContext));
+ }
+
+ void ReaderImpl::readMetadata() const {
+ uint64_t metadataSize = contents->postscript->metadatalength();
+ uint64_t footerLength = contents->postscript->footerlength();
+ if (fileLength < metadataSize + footerLength + postscriptLength + 1) {
+ std::stringstream msg;
+ msg << "Invalid Metadata length: fileLength=" << fileLength
+ << ", metadataLength=" << metadataSize << ", footerLength=" << footerLength
+ << ", postscriptLength=" << postscriptLength;
+ throw ParseError(msg.str());
+ }
+ uint64_t metadataStart = fileLength - metadataSize - footerLength - postscriptLength - 1;
+ if (metadataSize != 0) {
+ std::unique_ptr<SeekableInputStream> pbStream =
+ createDecompressor(contents->compression,
+ std::unique_ptr<SeekableInputStream>
+ (new SeekableFileInputStream(contents->stream.get(),
+ metadataStart,
+ metadataSize,
+ *contents->pool)),
+ contents->blockSize,
+ *contents->pool);
+ metadata.reset(new proto::Metadata());
+ if (!metadata->ParseFromZeroCopyStream(pbStream.get())) {
+ throw ParseError("Failed to parse the metadata");
+ }
+ }
+ isMetadataLoaded = true;
+ }
+
+ bool ReaderImpl::hasCorrectStatistics() const {
+ return !WriterVersionImpl::VERSION_HIVE_8732().compareGT(getWriterVersion());
+ }
+
+ void ReaderImpl::checkOrcVersion() {
+ FileVersion version = getFormatVersion();
+ if (version != FileVersion(0, 11) && version != FileVersion(0, 12)) {
+ *(options.getErrorStream())
+ << "Warning: ORC file " << contents->stream->getName()
+ << " was written in an unknown format version "
+ << version.toString() << "\n";
+ }
+ }
+
+ std::unique_ptr<RowReader> ReaderImpl::createRowReader() const {
+ RowReaderOptions defaultOpts;
+ return createRowReader(defaultOpts);
+ }
+
+ std::unique_ptr<RowReader> ReaderImpl::createRowReader(
+ const RowReaderOptions& opts) const {
+ return std::unique_ptr<RowReader>(new RowReaderImpl(contents, opts));
+ }
+
+ uint64_t maxStreamsForType(const proto::Type& type) {
+ switch (static_cast<int64_t>(type.kind())) {
+ case proto::Type_Kind_STRUCT:
+ return 1;
+ case proto::Type_Kind_INT:
+ case proto::Type_Kind_LONG:
+ case proto::Type_Kind_SHORT:
+ case proto::Type_Kind_FLOAT:
+ case proto::Type_Kind_DOUBLE:
+ case proto::Type_Kind_BOOLEAN:
+ case proto::Type_Kind_BYTE:
+ case proto::Type_Kind_DATE:
+ case proto::Type_Kind_LIST:
+ case proto::Type_Kind_MAP:
+ case proto::Type_Kind_UNION:
+ return 2;
+ case proto::Type_Kind_BINARY:
+ case proto::Type_Kind_DECIMAL:
+ case proto::Type_Kind_TIMESTAMP:
+ return 3;
+ case proto::Type_Kind_CHAR:
+ case proto::Type_Kind_STRING:
+ case proto::Type_Kind_VARCHAR:
+ return 4;
+ default:
+ return 0;
+ }
+ }
+
+ uint64_t ReaderImpl::getMemoryUse(int stripeIx) {
+ std::vector<bool> selectedColumns;
+ selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), true);
+ return getMemoryUse(stripeIx, selectedColumns);
+ }
+
+ uint64_t ReaderImpl::getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx) {
+ std::vector<bool> selectedColumns;
+ selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
+ ColumnSelector column_selector(contents.get());
+ if (contents->schema->getKind() == STRUCT && include.begin() != include.end()) {
+ for(std::list<uint64_t>::const_iterator field = include.begin();
+ field != include.end(); ++field) {
+ column_selector.updateSelectedByFieldId(selectedColumns, *field);
+ }
+ } else {
+ // default is to select all columns
+ std::fill(selectedColumns.begin(), selectedColumns.end(), true);
+ }
+ column_selector.selectParents(selectedColumns, *contents->schema.get());
+ selectedColumns[0] = true; // column 0 is selected by default
+ return getMemoryUse(stripeIx, selectedColumns);
+ }
+
+ uint64_t ReaderImpl::getMemoryUseByName(const std::list<std::string>& names, int stripeIx) {
+ std::vector<bool> selectedColumns;
+ selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
+ ColumnSelector column_selector(contents.get());
+ if (contents->schema->getKind() == STRUCT && names.begin() != names.end()) {
+ for(std::list<std::string>::const_iterator field = names.begin();
+ field != names.end(); ++field) {
+ column_selector.updateSelectedByName(selectedColumns, *field);
+ }
+ } else {
+ // default is to select all columns
+ std::fill(selectedColumns.begin(), selectedColumns.end(), true);
+ }
+ column_selector.selectParents(selectedColumns, *contents->schema.get());
+ selectedColumns[0] = true; // column 0 is selected by default
+ return getMemoryUse(stripeIx, selectedColumns);
+ }
+
+ uint64_t ReaderImpl::getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx) {
+ std::vector<bool> selectedColumns;
+ selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
+ ColumnSelector column_selector(contents.get());
+ if (include.begin() != include.end()) {
+ for(std::list<uint64_t>::const_iterator field = include.begin();
+ field != include.end(); ++field) {
+ column_selector.updateSelectedByTypeId(selectedColumns, *field);
+ }
+ } else {
+ // default is to select all columns
+ std::fill(selectedColumns.begin(), selectedColumns.end(), true);
+ }
+ column_selector.selectParents(selectedColumns, *contents->schema.get());
+ selectedColumns[0] = true; // column 0 is selected by default
+ return getMemoryUse(stripeIx, selectedColumns);
+ }
+
+ uint64_t ReaderImpl::getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns) {
+ uint64_t maxDataLength = 0;
+
+ if (stripeIx >= 0 && stripeIx < footer->stripes_size()) {
+ uint64_t stripe = footer->stripes(stripeIx).datalength();
+ if (maxDataLength < stripe) {
+ maxDataLength = stripe;
+ }
+ } else {
+ for (int i=0; i < footer->stripes_size(); i++) {
+ uint64_t stripe = footer->stripes(i).datalength();
+ if (maxDataLength < stripe) {
+ maxDataLength = stripe;
+ }
+ }
+ }
+
+ bool hasStringColumn = false;
+ uint64_t nSelectedStreams = 0;
+ for (int i=0; !hasStringColumn && i < footer->types_size(); i++) {
+ if (selectedColumns[static_cast<size_t>(i)]) {
+ const proto::Type& type = footer->types(i);
+ nSelectedStreams += maxStreamsForType(type) ;
+ switch (static_cast<int64_t>(type.kind())) {
+ case proto::Type_Kind_CHAR:
+ case proto::Type_Kind_STRING:
+ case proto::Type_Kind_VARCHAR:
+ case proto::Type_Kind_BINARY: {
+ hasStringColumn = true;
+ break;
+ }
+ default: {
+ break;
+ }
+ }
+ }
+ }
+
+ /* If a string column is read, use stripe datalength as a memory estimate
+ * because we don't know the dictionary size. Multiply by 2 because
+ * a string column requires two buffers:
+ * in the input stream and in the seekable input stream.
+ * If no string column is read, estimate from the number of streams.
+ */
+ uint64_t memory = hasStringColumn ? 2 * maxDataLength :
+ std::min(uint64_t(maxDataLength),
+ nSelectedStreams * contents->stream->getNaturalReadSize());
+
+ // Do we need even more memory to read the footer or the metadata?
+ if (memory < contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS) {
+ memory = contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS;
+ }
+ if (memory < contents->postscript->metadatalength()) {
+ memory = contents->postscript->metadatalength();
+ }
+
+ // Account for firstRowOfStripe.
+ memory += static_cast<uint64_t>(footer->stripes_size()) * sizeof(uint64_t);
+
+ // Decompressors need buffers for each stream
+ uint64_t decompressorMemory = 0;
+ if (contents->compression != CompressionKind_NONE) {
+ for (int i=0; i < footer->types_size(); i++) {
+ if (selectedColumns[static_cast<size_t>(i)]) {
+ const proto::Type& type = footer->types(i);
+ decompressorMemory += maxStreamsForType(type) * contents->blockSize;
+ }
+ }
+ if (contents->compression == CompressionKind_SNAPPY) {
+ decompressorMemory *= 2; // Snappy decompressor uses a second buffer
+ }
+ }
+
+ return memory + decompressorMemory ;
+ }
+
+ void RowReaderImpl::startNextStripe() {
+ reader.reset(); // ColumnReaders use lots of memory; free old memory first
+ currentStripeInfo = footer->stripes(static_cast<int>(currentStripe));
+ uint64_t fileLength = contents->stream->getLength();
+ if (currentStripeInfo.offset() + currentStripeInfo.indexlength() +
+ currentStripeInfo.datalength() + currentStripeInfo.footerlength() >= fileLength) {
+ std::stringstream msg;
+ msg << "Malformed StripeInformation at stripe index " << currentStripe << ": fileLength="
+ << fileLength << ", StripeInfo=(offset=" << currentStripeInfo.offset() << ", indexLength="
+ << currentStripeInfo.indexlength() << ", dataLength=" << currentStripeInfo.datalength()
+ << ", footerLength=" << currentStripeInfo.footerlength() << ")";
+ throw ParseError(msg.str());
+ }
+ currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get());
+ rowsInCurrentStripe = currentStripeInfo.numberofrows();
+ const Timezone& writerTimezone =
+ currentStripeFooter.has_writertimezone() ?
+ getTimezoneByName(currentStripeFooter.writertimezone()) :
+ localTimezone;
+ StripeStreamsImpl stripeStreams(*this, currentStripe, currentStripeInfo,
+ currentStripeFooter,
+ currentStripeInfo.offset(),
+ *(contents->stream.get()),
+ writerTimezone);
+ reader = buildReader(*contents->schema.get(), stripeStreams);
+ }
+
+ bool RowReaderImpl::next(ColumnVectorBatch& data) {
+ if (currentStripe >= lastStripe) {
+ data.numElements = 0;
+ if (lastStripe > 0) {
+ previousRow = firstRowOfStripe[lastStripe - 1] +
+ footer->stripes(static_cast<int>(lastStripe - 1)).numberofrows();
+ } else {
+ previousRow = 0;
+ }
+ return false;
+ }
+ if (currentRowInStripe == 0) {
+ startNextStripe();
+ }
+ uint64_t rowsToRead =
+ std::min(static_cast<uint64_t>(data.capacity),
+ rowsInCurrentStripe - currentRowInStripe);
+ data.numElements = rowsToRead;
+ if (enableEncodedBlock) {
+ reader->nextEncoded(data, rowsToRead, nullptr);
+ }
+ else {
+ reader->next(data, rowsToRead, nullptr);
+ }
+ // update row number
+ previousRow = firstRowOfStripe[currentStripe] + currentRowInStripe;
+ currentRowInStripe += rowsToRead;
+ if (currentRowInStripe >= rowsInCurrentStripe) {
+ currentStripe += 1;
+ currentRowInStripe = 0;
+ }
+ return rowsToRead != 0;
+ }
+
+ std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch
+ (uint64_t capacity) const {
+ return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock);
+ }
+
+ void ensureOrcFooter(InputStream* stream,
+ DataBuffer<char> *buffer,
+ uint64_t postscriptLength) {
+
+ const std::string MAGIC("ORC");
+ const uint64_t magicLength = MAGIC.length();
+ const char * const bufferStart = buffer->data();
+ const uint64_t bufferLength = buffer->size();
+
+ if (postscriptLength < magicLength || bufferLength < magicLength) {
+ throw ParseError("Invalid ORC postscript length");
+ }
+ const char* magicStart = bufferStart + bufferLength - 1 - magicLength;
+
+ // Look for the magic string at the end of the postscript.
+ if (memcmp(magicStart, MAGIC.c_str(), magicLength) != 0) {
+ // If there is no magic string at the end, check the beginning.
+ // Only files written by Hive 0.11.0 don't have the tail ORC string.
+ std::unique_ptr<char[]> frontBuffer( new char[magicLength] );
+ stream->read(frontBuffer.get(), magicLength, 0);
+ bool foundMatch = memcmp(frontBuffer.get(), MAGIC.c_str(), magicLength) == 0;
+
+ if (!foundMatch) {
+ throw ParseError("Not an ORC file");
+ }
+ }
+ }
+
+ /**
+ * Read the file's postscript from the given buffer.
+ * @param stream the file stream
+ * @param buffer the buffer with the tail of the file.
+ * @param postscriptSize the length of postscript in bytes
+ */
+ std::unique_ptr<proto::PostScript> readPostscript(InputStream *stream,
+ DataBuffer<char> *buffer,
+ uint64_t postscriptSize) {
+ char *ptr = buffer->data();
+ uint64_t readSize = buffer->size();
+
+ ensureOrcFooter(stream, buffer, postscriptSize);
+
+ std::unique_ptr<proto::PostScript> postscript =
+ std::unique_ptr<proto::PostScript>(new proto::PostScript());
+ if (readSize < 1 + postscriptSize) {
+ std::stringstream msg;
+ msg << "Invalid ORC postscript length: " << postscriptSize << ", file length = "
+ << stream->getLength();
+ throw ParseError(msg.str());
+ }
+ if (!postscript->ParseFromArray(ptr + readSize - 1 - postscriptSize,
+ static_cast<int>(postscriptSize))) {
+ throw ParseError("Failed to parse the postscript from " +
+ stream->getName());
+ }
+ return REDUNDANT_MOVE(postscript);
+ }
+
+ /**
+ * Check that indices in the type tree are valid, so we won't crash
+ * when we convert the proto::Types to TypeImpls.
+ */
+ void checkProtoTypeIds(const proto::Footer &footer) {
+ std::stringstream msg;
+ int maxId = footer.types_size();
+ if (maxId <= 0) {
+ throw ParseError("Footer is corrupt: no types found");
+ }
+ for (int i = 0; i < maxId; ++i) {
+ const proto::Type& type = footer.types(i);
+ for (int j = 0; j < type.subtypes_size(); ++j) {
+ int subTypeId = static_cast<int>(type.subtypes(j));
+ if (subTypeId <= i) {
+ msg << "Footer is corrupt: malformed link from type " << i << " to "
+ << subTypeId;
+ throw ParseError(msg.str());
+ }
+ if (subTypeId >= maxId) {
+          msg << "Footer is corrupt: types(" << subTypeId << ") does not exist";
+ throw ParseError(msg.str());
+ }
+ if (j > 0 && static_cast<int>(type.subtypes(j - 1)) >= subTypeId) {
+ msg << "Footer is corrupt: subType(" << (j-1) << ") >= subType(" << j
+ << ") in types(" << i << "). (" << type.subtypes(j - 1) << " >= "
+ << subTypeId << ")";
+ throw ParseError(msg.str());
+ }
+ }
+ }
+ }
+
+ /**
+ * Parse the footer from the given buffer.
+ * @param stream the file's stream
+ * @param buffer the buffer to parse the footer from
+ * @param footerOffset the offset within the buffer that contains the footer
+ * @param ps the file's postscript
+ * @param memoryPool the memory pool to use
+ */
+ std::unique_ptr<proto::Footer> readFooter(InputStream* stream,
+ const DataBuffer<char> *buffer,
+ uint64_t footerOffset,
+ const proto::PostScript& ps,
+ MemoryPool& memoryPool) {
+ const char *footerPtr = buffer->data() + footerOffset;
+
+ std::unique_ptr<SeekableInputStream> pbStream =
+ createDecompressor(convertCompressionKind(ps),
+ std::unique_ptr<SeekableInputStream>
+ (new SeekableArrayInputStream(footerPtr,
+ ps.footerlength())),
+ getCompressionBlockSize(ps),
+ memoryPool);
+
+ std::unique_ptr<proto::Footer> footer =
+ std::unique_ptr<proto::Footer>(new proto::Footer());
+ if (!footer->ParseFromZeroCopyStream(pbStream.get())) {
+ throw ParseError("Failed to parse the footer from " +
+ stream->getName());
+ }
+
+ checkProtoTypeIds(*footer);
+ return REDUNDANT_MOVE(footer);
+ }
+
+ std::unique_ptr<Reader> createReader(std::unique_ptr<InputStream> stream,
+ const ReaderOptions& options) {
+ std::shared_ptr<FileContents> contents = std::shared_ptr<FileContents>(new FileContents());
+ contents->pool = options.getMemoryPool();
+ contents->errorStream = options.getErrorStream();
+ std::string serializedFooter = options.getSerializedFileTail();
+ uint64_t fileLength;
+ uint64_t postscriptLength;
+ if (serializedFooter.length() != 0) {
+ // Parse the file tail from the serialized one.
+ proto::FileTail tail;
+ if (!tail.ParseFromString(TString(serializedFooter))) {
+ throw ParseError("Failed to parse the file tail from string");
+ }
+ contents->postscript.reset(new proto::PostScript(tail.postscript()));
+ contents->footer.reset(new proto::Footer(tail.footer()));
+ fileLength = tail.filelength();
+ postscriptLength = tail.postscriptlength();
+ } else {
+ // figure out the size of the file using the option or filesystem
+ fileLength = std::min(options.getTailLocation(),
+ static_cast<uint64_t>(stream->getLength()));
+
+ //read last bytes into buffer to get PostScript
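+      // The tail of the file is laid out as: ... | footer | postscript | 1-byte postscript length,
+      // so the last byte read below gives the postscript length, and the footer sits just before the postscript.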
+ uint64_t readSize = std::min(fileLength, DIRECTORY_SIZE_GUESS);
+ if (readSize < 4) {
+ throw ParseError("File size too small");
+ }
+ std::unique_ptr<DataBuffer<char>> buffer( new DataBuffer<char>(*contents->pool, readSize) );
+ stream->read(buffer->data(), readSize, fileLength - readSize);
+
+ postscriptLength = buffer->data()[readSize - 1] & 0xff;
+ contents->postscript = REDUNDANT_MOVE(readPostscript(stream.get(),
+ buffer.get(), postscriptLength));
+ uint64_t footerSize = contents->postscript->footerlength();
+ uint64_t tailSize = 1 + postscriptLength + footerSize;
+ if (tailSize >= fileLength) {
+ std::stringstream msg;
+ msg << "Invalid ORC tailSize=" << tailSize << ", fileLength=" << fileLength;
+ throw ParseError(msg.str());
+ }
+ uint64_t footerOffset;
+
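+      // If the tail did not fit in the initial read, fetch the footer again with an exact-size read.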
+ if (tailSize > readSize) {
+ buffer->resize(footerSize);
+ stream->read(buffer->data(), footerSize, fileLength - tailSize);
+ footerOffset = 0;
+ } else {
+ footerOffset = readSize - tailSize;
+ }
+
+ contents->footer = REDUNDANT_MOVE(readFooter(stream.get(), buffer.get(),
+ footerOffset, *contents->postscript, *contents->pool));
+ }
+ contents->stream = std::move(stream);
+ return std::unique_ptr<Reader>(new ReaderImpl(std::move(contents),
+ options,
+ fileLength,
+ postscriptLength));
+ }
+
+ std::map<uint32_t, BloomFilterIndex>
+ ReaderImpl::getBloomFilters(uint32_t stripeIndex,
+ const std::set<uint32_t>& included) const {
+ std::map<uint32_t, BloomFilterIndex> ret;
+
+ // find stripe info
+ if (stripeIndex >= static_cast<uint32_t>(footer->stripes_size())) {
+ throw std::logic_error("Illegal stripe index: " + to_string(static_cast<int64_t>(stripeIndex)));
+ }
+ const proto::StripeInformation currentStripeInfo =
+ footer->stripes(static_cast<int>(stripeIndex));
+ const proto::StripeFooter currentStripeFooter =
+ getStripeFooter(currentStripeInfo, *contents);
+
+    // iterate over the streams in the stripe footer to find bloom filter streams
+ uint64_t offset = static_cast<uint64_t>(currentStripeInfo.offset());
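+    // offset tracks the file position of each stream; it advances by the stream length after every iteration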
+ for (int i = 0; i < currentStripeFooter.streams_size(); i++) {
+ const proto::Stream& stream = currentStripeFooter.streams(i);
+ uint32_t column = static_cast<uint32_t>(stream.column());
+ uint64_t length = static_cast<uint64_t>(stream.length());
+
+ // a bloom filter stream from a selected column is found
+ if (stream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8 &&
+ (included.empty() || included.find(column) != included.end())) {
+
+ std::unique_ptr<SeekableInputStream> pbStream =
+ createDecompressor(contents->compression,
+ std::unique_ptr<SeekableInputStream>
+ (new SeekableFileInputStream(contents->stream.get(),
+ offset,
+ length,
+ *contents->pool)),
+ contents->blockSize,
+ *(contents->pool));
+
+ proto::BloomFilterIndex pbBFIndex;
+ if (!pbBFIndex.ParseFromZeroCopyStream(pbStream.get())) {
+ throw ParseError("Failed to parse BloomFilterIndex");
+ }
+
+ BloomFilterIndex bfIndex;
+ for (int j = 0; j < pbBFIndex.bloomfilter_size(); j++) {
+ std::unique_ptr<BloomFilter> entry = BloomFilterUTF8Utils::deserialize(
+ stream.kind(),
+ currentStripeFooter.columns(static_cast<int>(stream.column())),
+ pbBFIndex.bloomfilter(j));
+ bfIndex.entries.push_back(std::shared_ptr<BloomFilter>(std::move(entry)));
+ }
+
+ // add bloom filters to result for one column
+ ret[column] = bfIndex;
+ }
+
+ offset += length;
+ }
+
+ return ret;
+ }
+
+ RowReader::~RowReader() {
+ // PASS
+ }
+
+ Reader::~Reader() {
+ // PASS
+ }
+
+ InputStream::~InputStream() {
+ // PASS
+  }
+
+
+
+}// namespace
diff --git a/contrib/libs/apache/orc/c++/src/Reader.hh b/contrib/libs/apache/orc/c++/src/Reader.hh
index 49e9d033d9..b4ce7f6529 100644
--- a/contrib/libs/apache/orc/c++/src/Reader.hh
+++ b/contrib/libs/apache/orc/c++/src/Reader.hh
@@ -1,155 +1,155 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ORC_READER_IMPL_HH
-#define ORC_READER_IMPL_HH
-
-#include "orc/Int128.hh"
-#include "orc/OrcFile.hh"
-#include "orc/Reader.hh"
-
-#include "ColumnReader.hh"
-#include "orc/Exceptions.hh"
-#include "RLE.hh"
-#include "TypeImpl.hh"
-
-namespace orc {
-
- static const uint64_t DIRECTORY_SIZE_GUESS = 16 * 1024;
-
- /**
- * WriterVersion Implementation
- */
- class WriterVersionImpl {
- private:
- WriterVersion version;
- public:
- // Known Versions with issues resolved
- // The static method below is to fix global constructors Clang warning
- static const WriterVersionImpl& VERSION_HIVE_8732();
-
- WriterVersionImpl(WriterVersion ver) : version(ver) {}
-
- bool compareGT(const WriterVersion other) const {
- return version > other;
- }
- };
-
- /**
- * State shared between Reader and Row Reader
- */
- struct FileContents {
- std::unique_ptr<InputStream> stream;
- std::unique_ptr<proto::PostScript> postscript;
- std::unique_ptr<proto::Footer> footer;
- std::unique_ptr<Type> schema;
- uint64_t blockSize;
- CompressionKind compression;
- MemoryPool *pool;
- std::ostream *errorStream;
- };
-
- proto::StripeFooter getStripeFooter(const proto::StripeInformation& info,
- const FileContents& contents);
-
- class ReaderImpl;
-
- class ColumnSelector {
- private:
- std::map<std::string, uint64_t> nameIdMap;
- std::map<uint64_t, const Type*> idTypeMap;
- const FileContents* contents;
- std::vector<std::string> columns;
-
-    // build maps from type name to id and from id to Type
- void buildTypeNameIdMap(const Type* type);
- std::string toDotColumnPath();
-
- public:
- // Select a field by name
- void updateSelectedByName(std::vector<bool>& selectedColumns, const std::string& name);
- // Select a field by id
- void updateSelectedByFieldId(std::vector<bool>& selectedColumns, uint64_t fieldId);
- // Select a type by id
- void updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId);
-
- // Select all of the recursive children of the given type.
- void selectChildren(std::vector<bool>& selectedColumns, const Type& type);
-
- // For each child of type, select it if one of its children
- // is selected.
- bool selectParents(std::vector<bool>& selectedColumns, const Type& type);
- /**
- * Constructor that selects columns.
- * @param contents of the file
- */
- ColumnSelector(const FileContents* contents);
-
-    // Select the columns from the RowReaderOptions object
- void updateSelected(std::vector<bool>& selectedColumns, const RowReaderOptions& options);
-
-    // Select the columns from the ReaderOptions object
- void updateSelected(std::vector<bool>& selectedColumns, const ReaderOptions& options);
- };
-
-
- class RowReaderImpl : public RowReader {
- private:
- const Timezone& localTimezone;
-
- // contents
- std::shared_ptr<FileContents> contents;
- const bool throwOnHive11DecimalOverflow;
- const int32_t forcedScaleOnHive11Decimal;
-
- // inputs
- std::vector<bool> selectedColumns;
-
- // footer
- proto::Footer* footer;
- DataBuffer<uint64_t> firstRowOfStripe;
- mutable std::unique_ptr<Type> selectedSchema;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_READER_IMPL_HH
+#define ORC_READER_IMPL_HH
+
+#include "orc/Int128.hh"
+#include "orc/OrcFile.hh"
+#include "orc/Reader.hh"
+
+#include "ColumnReader.hh"
+#include "orc/Exceptions.hh"
+#include "RLE.hh"
+#include "TypeImpl.hh"
+
+namespace orc {
+
+ static const uint64_t DIRECTORY_SIZE_GUESS = 16 * 1024;
+
+ /**
+ * WriterVersion Implementation
+ */
+ class WriterVersionImpl {
+ private:
+ WriterVersion version;
+ public:
+ // Known Versions with issues resolved
+ // The static method below is to fix global constructors Clang warning
+ static const WriterVersionImpl& VERSION_HIVE_8732();
+
+ WriterVersionImpl(WriterVersion ver) : version(ver) {}
+
+ bool compareGT(const WriterVersion other) const {
+ return version > other;
+ }
+ };
+
+ /**
+ * State shared between Reader and Row Reader
+ */
+ struct FileContents {
+ std::unique_ptr<InputStream> stream;
+ std::unique_ptr<proto::PostScript> postscript;
+ std::unique_ptr<proto::Footer> footer;
+ std::unique_ptr<Type> schema;
+ uint64_t blockSize;
+ CompressionKind compression;
+ MemoryPool *pool;
+ std::ostream *errorStream;
+ };
+
+ proto::StripeFooter getStripeFooter(const proto::StripeInformation& info,
+ const FileContents& contents);
+
+ class ReaderImpl;
+
+ class ColumnSelector {
+ private:
+ std::map<std::string, uint64_t> nameIdMap;
+ std::map<uint64_t, const Type*> idTypeMap;
+ const FileContents* contents;
+ std::vector<std::string> columns;
+
+    // build maps from type name to id and from id to Type
+ void buildTypeNameIdMap(const Type* type);
+ std::string toDotColumnPath();
+
+ public:
+ // Select a field by name
+ void updateSelectedByName(std::vector<bool>& selectedColumns, const std::string& name);
+ // Select a field by id
+ void updateSelectedByFieldId(std::vector<bool>& selectedColumns, uint64_t fieldId);
+ // Select a type by id
+ void updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId);
+
+ // Select all of the recursive children of the given type.
+ void selectChildren(std::vector<bool>& selectedColumns, const Type& type);
+
+ // For each child of type, select it if one of its children
+ // is selected.
+ bool selectParents(std::vector<bool>& selectedColumns, const Type& type);
+ /**
+ * Constructor that selects columns.
+ * @param contents of the file
+ */
+ ColumnSelector(const FileContents* contents);
+
+    // Select the columns from the RowReaderOptions object
+ void updateSelected(std::vector<bool>& selectedColumns, const RowReaderOptions& options);
+
+    // Select the columns from the ReaderOptions object
+ void updateSelected(std::vector<bool>& selectedColumns, const ReaderOptions& options);
+ };
+
+
+ class RowReaderImpl : public RowReader {
+ private:
+ const Timezone& localTimezone;
+
+ // contents
+ std::shared_ptr<FileContents> contents;
+ const bool throwOnHive11DecimalOverflow;
+ const int32_t forcedScaleOnHive11Decimal;
+
+ // inputs
+ std::vector<bool> selectedColumns;
+
+ // footer
+ proto::Footer* footer;
+ DataBuffer<uint64_t> firstRowOfStripe;
+ mutable std::unique_ptr<Type> selectedSchema;
bool skipBloomFilters;
-
- // reading state
- uint64_t previousRow;
- uint64_t firstStripe;
- uint64_t currentStripe;
- uint64_t lastStripe; // the stripe AFTER the last one
- uint64_t currentRowInStripe;
- uint64_t rowsInCurrentStripe;
- proto::StripeInformation currentStripeInfo;
- proto::StripeFooter currentStripeFooter;
- std::unique_ptr<ColumnReader> reader;
-
- bool enableEncodedBlock;
- // internal methods
- void startNextStripe();
-
- // row index of current stripe with column id as the key
- std::unordered_map<uint64_t, proto::RowIndex> rowIndexes;
-
- /**
- * Seek to the start of a row group in the current stripe
- * @param rowGroupEntryId the row group id to seek to
- */
- void seekToRowGroup(uint32_t rowGroupEntryId);
-
+
+ // reading state
+ uint64_t previousRow;
+ uint64_t firstStripe;
+ uint64_t currentStripe;
+ uint64_t lastStripe; // the stripe AFTER the last one
+ uint64_t currentRowInStripe;
+ uint64_t rowsInCurrentStripe;
+ proto::StripeInformation currentStripeInfo;
+ proto::StripeFooter currentStripeFooter;
+ std::unique_ptr<ColumnReader> reader;
+
+ bool enableEncodedBlock;
+ // internal methods
+ void startNextStripe();
+
+ // row index of current stripe with column id as the key
+ std::unordered_map<uint64_t, proto::RowIndex> rowIndexes;
+
+ /**
+ * Seek to the start of a row group in the current stripe
+ * @param rowGroupEntryId the row group id to seek to
+ */
+ void seekToRowGroup(uint32_t rowGroupEntryId);
+
/**
* Check if the file has bad bloom filters. We will skip using them in the
* following reads.
@@ -157,159 +157,159 @@ namespace orc {
*/
bool hasBadBloomFilters();
- public:
- /**
- * Constructor that lets the user specify additional options.
- * @param contents of the file
- * @param options options for reading
- */
- RowReaderImpl(std::shared_ptr<FileContents> contents,
- const RowReaderOptions& options);
-
- // Select the columns from the options object
- void updateSelected();
- const std::vector<bool> getSelectedColumns() const override;
-
- const Type& getSelectedType() const override;
-
- std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size
- ) const override;
-
- bool next(ColumnVectorBatch& data) override;
-
- CompressionKind getCompression() const;
-
- uint64_t getCompressionSize() const;
-
- uint64_t getRowNumber() const override;
-
- void seekToRow(uint64_t rowNumber) override;
-
- const FileContents& getFileContents() const;
- bool getThrowOnHive11DecimalOverflow() const;
- int32_t getForcedScaleOnHive11Decimal() const;
- };
-
- class ReaderImpl : public Reader {
- private:
- // FileContents
- std::shared_ptr<FileContents> contents;
-
- // inputs
- const ReaderOptions options;
- const uint64_t fileLength;
- const uint64_t postscriptLength;
-
- // footer
- proto::Footer* footer;
- uint64_t numberOfStripes;
- uint64_t getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns);
-
- // internal methods
- void readMetadata() const;
- void checkOrcVersion();
- void getRowIndexStatistics(const proto::StripeInformation& stripeInfo, uint64_t stripeIndex,
- const proto::StripeFooter& currentStripeFooter,
- std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const;
-
- // metadata
- mutable std::unique_ptr<proto::Metadata> metadata;
- mutable bool isMetadataLoaded;
- public:
- /**
- * Constructor that lets the user specify additional options.
- * @param contents of the file
- * @param options options for reading
- * @param fileLength the length of the file in bytes
- * @param postscriptLength the length of the postscript in bytes
- */
- ReaderImpl(std::shared_ptr<FileContents> contents,
- const ReaderOptions& options,
- uint64_t fileLength,
- uint64_t postscriptLength);
-
- const ReaderOptions& getReaderOptions() const;
-
- CompressionKind getCompression() const override;
-
- FileVersion getFormatVersion() const override;
-
- WriterId getWriterId() const override;
-
- uint32_t getWriterIdValue() const override;
-
+ public:
+ /**
+ * Constructor that lets the user specify additional options.
+ * @param contents of the file
+ * @param options options for reading
+ */
+ RowReaderImpl(std::shared_ptr<FileContents> contents,
+ const RowReaderOptions& options);
+
+ // Select the columns from the options object
+ void updateSelected();
+ const std::vector<bool> getSelectedColumns() const override;
+
+ const Type& getSelectedType() const override;
+
+ std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size
+ ) const override;
+
+ bool next(ColumnVectorBatch& data) override;
+
+ CompressionKind getCompression() const;
+
+ uint64_t getCompressionSize() const;
+
+ uint64_t getRowNumber() const override;
+
+ void seekToRow(uint64_t rowNumber) override;
+
+ const FileContents& getFileContents() const;
+ bool getThrowOnHive11DecimalOverflow() const;
+ int32_t getForcedScaleOnHive11Decimal() const;
+ };
+
+ class ReaderImpl : public Reader {
+ private:
+ // FileContents
+ std::shared_ptr<FileContents> contents;
+
+ // inputs
+ const ReaderOptions options;
+ const uint64_t fileLength;
+ const uint64_t postscriptLength;
+
+ // footer
+ proto::Footer* footer;
+ uint64_t numberOfStripes;
+ uint64_t getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns);
+
+ // internal methods
+ void readMetadata() const;
+ void checkOrcVersion();
+ void getRowIndexStatistics(const proto::StripeInformation& stripeInfo, uint64_t stripeIndex,
+ const proto::StripeFooter& currentStripeFooter,
+ std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const;
+
+ // metadata
+ mutable std::unique_ptr<proto::Metadata> metadata;
+ mutable bool isMetadataLoaded;
+ public:
+ /**
+ * Constructor that lets the user specify additional options.
+ * @param contents of the file
+ * @param options options for reading
+ * @param fileLength the length of the file in bytes
+ * @param postscriptLength the length of the postscript in bytes
+ */
+ ReaderImpl(std::shared_ptr<FileContents> contents,
+ const ReaderOptions& options,
+ uint64_t fileLength,
+ uint64_t postscriptLength);
+
+ const ReaderOptions& getReaderOptions() const;
+
+ CompressionKind getCompression() const override;
+
+ FileVersion getFormatVersion() const override;
+
+ WriterId getWriterId() const override;
+
+ uint32_t getWriterIdValue() const override;
+
std::string getSoftwareVersion() const override;
- WriterVersion getWriterVersion() const override;
-
- uint64_t getNumberOfRows() const override;
-
- uint64_t getRowIndexStride() const override;
-
- std::list<std::string> getMetadataKeys() const override;
-
- std::string getMetadataValue(const std::string& key) const override;
-
- bool hasMetadataValue(const std::string& key) const override;
-
- uint64_t getCompressionSize() const override;
-
- uint64_t getNumberOfStripes() const override;
-
- std::unique_ptr<StripeInformation> getStripe(uint64_t
- ) const override;
-
- uint64_t getNumberOfStripeStatistics() const override;
-
- const std::string& getStreamName() const override;
-
- std::unique_ptr<StripeStatistics>
- getStripeStatistics(uint64_t stripeIndex) const override;
-
- std::unique_ptr<RowReader> createRowReader() const override;
-
- std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options
- ) const override;
-
- uint64_t getContentLength() const override;
- uint64_t getStripeStatisticsLength() const override;
- uint64_t getFileFooterLength() const override;
- uint64_t getFilePostscriptLength() const override;
- uint64_t getFileLength() const override;
-
- std::unique_ptr<Statistics> getStatistics() const override;
-
- std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId
- ) const override;
-
- std::string getSerializedFileTail() const override;
-
- const Type& getType() const override;
-
- bool hasCorrectStatistics() const override;
-
- const proto::PostScript* getPostscript() const {return contents->postscript.get();}
-
- uint64_t getBlockSize() const {return contents->blockSize;}
-
- const proto::Footer* getFooter() const {return contents->footer.get();}
-
- const Type* getSchema() const {return contents->schema.get();}
-
- InputStream* getStream() const {return contents->stream.get();}
-
- uint64_t getMemoryUse(int stripeIx = -1) override;
-
- uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) override;
-
- uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) override;
-
- uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) override;
-
- std::map<uint32_t, BloomFilterIndex>
- getBloomFilters(uint32_t stripeIndex, const std::set<uint32_t>& included) const override;
- };
-
-}// namespace
-
-#endif
+ WriterVersion getWriterVersion() const override;
+
+ uint64_t getNumberOfRows() const override;
+
+ uint64_t getRowIndexStride() const override;
+
+ std::list<std::string> getMetadataKeys() const override;
+
+ std::string getMetadataValue(const std::string& key) const override;
+
+ bool hasMetadataValue(const std::string& key) const override;
+
+ uint64_t getCompressionSize() const override;
+
+ uint64_t getNumberOfStripes() const override;
+
+ std::unique_ptr<StripeInformation> getStripe(uint64_t
+ ) const override;
+
+ uint64_t getNumberOfStripeStatistics() const override;
+
+ const std::string& getStreamName() const override;
+
+ std::unique_ptr<StripeStatistics>
+ getStripeStatistics(uint64_t stripeIndex) const override;
+
+ std::unique_ptr<RowReader> createRowReader() const override;
+
+ std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options
+ ) const override;
+
+ uint64_t getContentLength() const override;
+ uint64_t getStripeStatisticsLength() const override;
+ uint64_t getFileFooterLength() const override;
+ uint64_t getFilePostscriptLength() const override;
+ uint64_t getFileLength() const override;
+
+ std::unique_ptr<Statistics> getStatistics() const override;
+
+ std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId
+ ) const override;
+
+ std::string getSerializedFileTail() const override;
+
+ const Type& getType() const override;
+
+ bool hasCorrectStatistics() const override;
+
+ const proto::PostScript* getPostscript() const {return contents->postscript.get();}
+
+ uint64_t getBlockSize() const {return contents->blockSize;}
+
+ const proto::Footer* getFooter() const {return contents->footer.get();}
+
+ const Type* getSchema() const {return contents->schema.get();}
+
+ InputStream* getStream() const {return contents->stream.get();}
+
+ uint64_t getMemoryUse(int stripeIx = -1) override;
+
+ uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) override;
+
+ uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) override;
+
+ uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) override;
+
+ std::map<uint32_t, BloomFilterIndex>
+ getBloomFilters(uint32_t stripeIndex, const std::set<uint32_t>& included) const override;
+ };
+
+}// namespace
+
+#endif
diff --git a/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc b/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc
index c5c6f6a801..2b7acb0bd5 100644
--- a/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc
+++ b/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc
@@ -1,426 +1,426 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Adaptor.hh"
-#include "Compression.hh"
-#include "RLEv2.hh"
-#include "RLEV2Util.hh"
-
-namespace orc {
-
-int64_t RleDecoderV2::readLongBE(uint64_t bsz) {
- int64_t ret = 0, val;
- uint64_t n = bsz;
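-  // assemble bsz bytes in big-endian order (most significant byte first)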
- while (n > 0) {
- n--;
- val = readByte();
- ret |= (val << (n * 8));
- }
- return ret;
-}
-
-inline int64_t RleDecoderV2::readVslong() {
- return unZigZag(readVulong());
-}
-
-uint64_t RleDecoderV2::readVulong() {
- uint64_t ret = 0, b;
- uint64_t offset = 0;
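-  // varint decoding: 7 payload bits per byte, least significant group first; the high bit marks continuation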
- do {
- b = readByte();
- ret |= (0x7f & b) << offset;
- offset += 7;
- } while (b >= 0x80);
- return ret;
-}
-
-RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input,
- bool _isSigned, MemoryPool& pool
- ): inputStream(std::move(input)),
- isSigned(_isSigned),
- firstByte(0),
- runLength(0),
- runRead(0),
- bufferStart(nullptr),
- bufferEnd(bufferStart),
- deltaBase(0),
- byteSize(0),
- firstValue(0),
- prevValue(0),
- bitSize(0),
- bitsLeft(0),
- curByte(0),
- patchBitSize(0),
- unpackedIdx(0),
- patchIdx(0),
- base(0),
- curGap(0),
- curPatch(0),
- patchMask(0),
- actualGap(0),
- unpacked(pool, 0),
- unpackedPatch(pool, 0) {
- // PASS
-}
-
-void RleDecoderV2::seek(PositionProvider& location) {
- // move the input stream
- inputStream->seek(location);
- // clear state
- bufferEnd = bufferStart = nullptr;
- runRead = runLength = 0;
- // skip ahead the given number of records
- skip(location.next());
-}
-
-void RleDecoderV2::skip(uint64_t numValues) {
- // simple for now, until perf tests indicate something encoding specific is
- // needed
- const uint64_t N = 64;
- int64_t dummy[N];
-
- while (numValues) {
- uint64_t nRead = std::min(N, numValues);
- next(dummy, nRead, nullptr);
- numValues -= nRead;
- }
-}
-
-void RleDecoderV2::next(int64_t* const data,
- const uint64_t numValues,
- const char* const notNull) {
- uint64_t nRead = 0;
-
- while (nRead < numValues) {
- // Skip any nulls before attempting to read first byte.
- while (notNull && !notNull[nRead]) {
- if (++nRead == numValues) {
- return; // ended with null values
- }
- }
-
- if (runRead == runLength) {
- resetRun();
- firstByte = readByte();
- }
-
- uint64_t offset = nRead, length = numValues - nRead;
-
- EncodingType enc = static_cast<EncodingType>
- ((firstByte >> 6) & 0x03);
- switch(static_cast<int64_t>(enc)) {
- case SHORT_REPEAT:
- nRead += nextShortRepeats(data, offset, length, notNull);
- break;
- case DIRECT:
- nRead += nextDirect(data, offset, length, notNull);
- break;
- case PATCHED_BASE:
- nRead += nextPatched(data, offset, length, notNull);
- break;
- case DELTA:
- nRead += nextDelta(data, offset, length, notNull);
- break;
- default:
- throw ParseError("unknown encoding");
- }
- }
-}
-
-uint64_t RleDecoderV2::nextShortRepeats(int64_t* const data,
- uint64_t offset,
- uint64_t numValues,
- const char* const notNull) {
- if (runRead == runLength) {
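-    // SHORT_REPEAT header byte: bits 7-6 encoding, bits 5-3 value width - 1,
-    // bits 2-0 repeat count - MIN_REPEAT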
- // extract the number of fixed bytes
- byteSize = (firstByte >> 3) & 0x07;
- byteSize += 1;
-
- runLength = firstByte & 0x07;
-    // run length values are stored only after the MIN_REPEAT value is met
- runLength += MIN_REPEAT;
- runRead = 0;
-
-    // read the repeated value which is stored using fixed bytes
- firstValue = readLongBE(byteSize);
-
- if (isSigned) {
- firstValue = unZigZag(static_cast<uint64_t>(firstValue));
- }
- }
-
- uint64_t nRead = std::min(runLength - runRead, numValues);
-
- if (notNull) {
- for(uint64_t pos = offset; pos < offset + nRead; ++pos) {
- if (notNull[pos]) {
- data[pos] = firstValue;
- ++runRead;
- }
- }
- } else {
- for(uint64_t pos = offset; pos < offset + nRead; ++pos) {
- data[pos] = firstValue;
- ++runRead;
- }
- }
-
- return nRead;
-}
-
-uint64_t RleDecoderV2::nextDirect(int64_t* const data,
- uint64_t offset,
- uint64_t numValues,
- const char* const notNull) {
- if (runRead == runLength) {
- // extract the number of fixed bits
- unsigned char fbo = (firstByte >> 1) & 0x1f;
- bitSize = decodeBitWidth(fbo);
-
- // extract the run length
- runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
- runLength |= readByte();
- // runs are one off
- runLength += 1;
- runRead = 0;
- }
-
- uint64_t nRead = std::min(runLength - runRead, numValues);
-
- runRead += readLongs(data, offset, nRead, bitSize, notNull);
-
- if (isSigned) {
- if (notNull) {
- for (uint64_t pos = offset; pos < offset + nRead; ++pos) {
- if (notNull[pos]) {
- data[pos] = unZigZag(static_cast<uint64_t>(data[pos]));
- }
- }
- } else {
- for (uint64_t pos = offset; pos < offset + nRead; ++pos) {
- data[pos] = unZigZag(static_cast<uint64_t>(data[pos]));
- }
- }
- }
-
- return nRead;
-}
-
-uint64_t RleDecoderV2::nextPatched(int64_t* const data,
- uint64_t offset,
- uint64_t numValues,
- const char* const notNull) {
- if (runRead == runLength) {
- // extract the number of fixed bits
- unsigned char fbo = (firstByte >> 1) & 0x1f;
- bitSize = decodeBitWidth(fbo);
-
- // extract the run length
- runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
- runLength |= readByte();
- // runs are one off
- runLength += 1;
- runRead = 0;
-
- // extract the number of bytes occupied by base
- uint64_t thirdByte = readByte();
- byteSize = (thirdByte >> 5) & 0x07;
- // base width is one off
- byteSize += 1;
-
- // extract patch width
- uint32_t pwo = thirdByte & 0x1f;
- patchBitSize = decodeBitWidth(pwo);
-
- // read fourth byte and extract patch gap width
- uint64_t fourthByte = readByte();
- uint32_t pgw = (fourthByte >> 5) & 0x07;
- // patch gap width is one off
- pgw += 1;
-
- // extract the length of the patch list
- size_t pl = fourthByte & 0x1f;
- if (pl == 0) {
- throw ParseError("Corrupt PATCHED_BASE encoded data (pl==0)!");
- }
-
- // read the next base width number of bytes to extract base value
- base = readLongBE(byteSize);
- int64_t mask = (static_cast<int64_t>(1) << ((byteSize * 8) - 1));
-    // if the sign bit of the base value is set, the base is negative; otherwise it is positive
- if ((base & mask) != 0) {
- base = base & ~mask;
- base = -base;
- }
-
- // TODO: something more efficient than resize
- unpacked.resize(runLength);
- unpackedIdx = 0;
- readLongs(unpacked.data(), 0, runLength, bitSize);
- // any remaining bits are thrown out
- resetReadLongs();
-
- // TODO: something more efficient than resize
- unpackedPatch.resize(pl);
- patchIdx = 0;
- // TODO: Skip corrupt?
- // if ((patchBitSize + pgw) > 64 && !skipCorrupt) {
- if ((patchBitSize + pgw) > 64) {
- throw ParseError("Corrupt PATCHED_BASE encoded data "
- "(patchBitSize + pgw > 64)!");
- }
- uint32_t cfb = getClosestFixedBits(patchBitSize + pgw);
- readLongs(unpackedPatch.data(), 0, pl, cfb);
- // any remaining bits are thrown out
- resetReadLongs();
-
- // apply the patch directly when decoding the packed data
- patchMask = ((static_cast<int64_t>(1) << patchBitSize) - 1);
-
- adjustGapAndPatch();
- }
-
- uint64_t nRead = std::min(runLength - runRead, numValues);
-
- for(uint64_t pos = offset; pos < offset + nRead; ++pos) {
- // skip null positions
- if (notNull && !notNull[pos]) {
- continue;
- }
- if (static_cast<int64_t>(unpackedIdx) != actualGap) {
- // no patching required. add base to unpacked value to get final value
- data[pos] = base + unpacked[unpackedIdx];
- } else {
- // extract the patch value
- int64_t patchedVal = unpacked[unpackedIdx] | (curPatch << bitSize);
-
- // add base to patched value
- data[pos] = base + patchedVal;
-
- // increment the patch to point to next entry in patch list
- ++patchIdx;
-
- if (patchIdx < unpackedPatch.size()) {
- adjustGapAndPatch();
-
- // next gap is relative to the current gap
- actualGap += unpackedIdx;
- }
- }
-
- ++runRead;
- ++unpackedIdx;
- }
-
- return nRead;
-}
-
-uint64_t RleDecoderV2::nextDelta(int64_t* const data,
- uint64_t offset,
- uint64_t numValues,
- const char* const notNull) {
- if (runRead == runLength) {
- // extract the number of fixed bits
- unsigned char fbo = (firstByte >> 1) & 0x1f;
- if (fbo != 0) {
- bitSize = decodeBitWidth(fbo);
- } else {
- bitSize = 0;
- }
-
- // extract the run length
- runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
- runLength |= readByte();
- ++runLength; // account for first value
- runRead = deltaBase = 0;
-
- // read the first value stored as vint
- if (isSigned) {
- firstValue = static_cast<int64_t>(readVslong());
- } else {
- firstValue = static_cast<int64_t>(readVulong());
- }
-
- prevValue = firstValue;
-
- // read the fixed delta value stored as vint (deltas can be negative even
-  // if all numbers are positive)
- deltaBase = static_cast<int64_t>(readVslong());
- }
-
- uint64_t nRead = std::min(runLength - runRead, numValues);
-
- uint64_t pos = offset;
- for ( ; pos < offset + nRead; ++pos) {
- // skip null positions
- if (!notNull || notNull[pos]) break;
- }
- if (runRead == 0 && pos < offset + nRead) {
- data[pos++] = firstValue;
- ++runRead;
- }
-
- if (bitSize == 0) {
- // add fixed deltas to adjacent values
- for ( ; pos < offset + nRead; ++pos) {
- // skip null positions
- if (notNull && !notNull[pos]) {
- continue;
- }
- prevValue = data[pos] = prevValue + deltaBase;
- ++runRead;
- }
- } else {
- for ( ; pos < offset + nRead; ++pos) {
- // skip null positions
- if (!notNull || notNull[pos]) break;
- }
- if (runRead < 2 && pos < offset + nRead) {
- // add delta base and first value
- prevValue = data[pos++] = firstValue + deltaBase;
- ++runRead;
- }
-
-  // unpack the values, add each one to the previous value and store the
-  // final value in the result buffer. if the delta base value is negative
-  // the sequence is decreasing, otherwise it is increasing
- uint64_t remaining = (offset + nRead) - pos;
- runRead += readLongs(data, pos, remaining, bitSize, notNull);
-
- if (deltaBase < 0) {
- for ( ; pos < offset + nRead; ++pos) {
- // skip null positions
- if (notNull && !notNull[pos]) {
- continue;
- }
- prevValue = data[pos] = prevValue - data[pos];
- }
- } else {
- for ( ; pos < offset + nRead; ++pos) {
- // skip null positions
- if (notNull && !notNull[pos]) {
- continue;
- }
- prevValue = data[pos] = prevValue + data[pos];
- }
- }
- }
- return nRead;
-}
-
-} // namespace orc
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Adaptor.hh"
+#include "Compression.hh"
+#include "RLEv2.hh"
+#include "RLEV2Util.hh"
+
+namespace orc {
+
+int64_t RleDecoderV2::readLongBE(uint64_t bsz) {
+ int64_t ret = 0, val;
+ uint64_t n = bsz;
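+  // assemble bsz bytes in big-endian order (most significant byte first)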
+ while (n > 0) {
+ n--;
+ val = readByte();
+ ret |= (val << (n * 8));
+ }
+ return ret;
+}
+
+inline int64_t RleDecoderV2::readVslong() {
+ return unZigZag(readVulong());
+}
+
+uint64_t RleDecoderV2::readVulong() {
+ uint64_t ret = 0, b;
+ uint64_t offset = 0;
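+  // varint decoding: 7 payload bits per byte, least significant group first; the high bit marks continuation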
+ do {
+ b = readByte();
+ ret |= (0x7f & b) << offset;
+ offset += 7;
+ } while (b >= 0x80);
+ return ret;
+}
+
+RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input,
+ bool _isSigned, MemoryPool& pool
+ ): inputStream(std::move(input)),
+ isSigned(_isSigned),
+ firstByte(0),
+ runLength(0),
+ runRead(0),
+ bufferStart(nullptr),
+ bufferEnd(bufferStart),
+ deltaBase(0),
+ byteSize(0),
+ firstValue(0),
+ prevValue(0),
+ bitSize(0),
+ bitsLeft(0),
+ curByte(0),
+ patchBitSize(0),
+ unpackedIdx(0),
+ patchIdx(0),
+ base(0),
+ curGap(0),
+ curPatch(0),
+ patchMask(0),
+ actualGap(0),
+ unpacked(pool, 0),
+ unpackedPatch(pool, 0) {
+ // PASS
+}
+
+void RleDecoderV2::seek(PositionProvider& location) {
+ // move the input stream
+ inputStream->seek(location);
+ // clear state
+ bufferEnd = bufferStart = nullptr;
+ runRead = runLength = 0;
+ // skip ahead the given number of records
+ skip(location.next());
+}
+
+void RleDecoderV2::skip(uint64_t numValues) {
+ // simple for now, until perf tests indicate something encoding specific is
+ // needed
+ const uint64_t N = 64;
+ int64_t dummy[N];
+
+ while (numValues) {
+ uint64_t nRead = std::min(N, numValues);
+ next(dummy, nRead, nullptr);
+ numValues -= nRead;
+ }
+}
+
+void RleDecoderV2::next(int64_t* const data,
+ const uint64_t numValues,
+ const char* const notNull) {
+ uint64_t nRead = 0;
+
+ while (nRead < numValues) {
+ // Skip any nulls before attempting to read first byte.
+ while (notNull && !notNull[nRead]) {
+ if (++nRead == numValues) {
+ return; // ended with null values
+ }
+ }
+
+ if (runRead == runLength) {
+ resetRun();
+ firstByte = readByte();
+ }
+
+ uint64_t offset = nRead, length = numValues - nRead;
+
+ EncodingType enc = static_cast<EncodingType>
+ ((firstByte >> 6) & 0x03);
+ switch(static_cast<int64_t>(enc)) {
+ case SHORT_REPEAT:
+ nRead += nextShortRepeats(data, offset, length, notNull);
+ break;
+ case DIRECT:
+ nRead += nextDirect(data, offset, length, notNull);
+ break;
+ case PATCHED_BASE:
+ nRead += nextPatched(data, offset, length, notNull);
+ break;
+ case DELTA:
+ nRead += nextDelta(data, offset, length, notNull);
+ break;
+ default:
+ throw ParseError("unknown encoding");
+ }
+ }
+}
+
+uint64_t RleDecoderV2::nextShortRepeats(int64_t* const data,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* const notNull) {
+ if (runRead == runLength) {
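+    // SHORT_REPEAT header byte: bits 7-6 encoding, bits 5-3 value width - 1,
+    // bits 2-0 repeat count - MIN_REPEAT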
+ // extract the number of fixed bytes
+ byteSize = (firstByte >> 3) & 0x07;
+ byteSize += 1;
+
+ runLength = firstByte & 0x07;
+    // run length values are stored only after the MIN_REPEAT value is met
+ runLength += MIN_REPEAT;
+ runRead = 0;
+
+    // read the repeated value which is stored using fixed bytes
+ firstValue = readLongBE(byteSize);
+
+ if (isSigned) {
+ firstValue = unZigZag(static_cast<uint64_t>(firstValue));
+ }
+ }
+
+ uint64_t nRead = std::min(runLength - runRead, numValues);
+
+ if (notNull) {
+ for(uint64_t pos = offset; pos < offset + nRead; ++pos) {
+ if (notNull[pos]) {
+ data[pos] = firstValue;
+ ++runRead;
+ }
+ }
+ } else {
+ for(uint64_t pos = offset; pos < offset + nRead; ++pos) {
+ data[pos] = firstValue;
+ ++runRead;
+ }
+ }
+
+ return nRead;
+}
+
+uint64_t RleDecoderV2::nextDirect(int64_t* const data,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* const notNull) {
+ if (runRead == runLength) {
+ // extract the number of fixed bits
+ unsigned char fbo = (firstByte >> 1) & 0x1f;
+ bitSize = decodeBitWidth(fbo);
+
+ // extract the run length
+ runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
+ runLength |= readByte();
+ // runs are one off
+ runLength += 1;
+ runRead = 0;
+ }
+
+ uint64_t nRead = std::min(runLength - runRead, numValues);
+
+ runRead += readLongs(data, offset, nRead, bitSize, notNull);
+
+ if (isSigned) {
+ if (notNull) {
+ for (uint64_t pos = offset; pos < offset + nRead; ++pos) {
+ if (notNull[pos]) {
+ data[pos] = unZigZag(static_cast<uint64_t>(data[pos]));
+ }
+ }
+ } else {
+ for (uint64_t pos = offset; pos < offset + nRead; ++pos) {
+ data[pos] = unZigZag(static_cast<uint64_t>(data[pos]));
+ }
+ }
+ }
+
+ return nRead;
+}
+
+uint64_t RleDecoderV2::nextPatched(int64_t* const data,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* const notNull) {
+ if (runRead == runLength) {
+ // extract the number of fixed bits
+ unsigned char fbo = (firstByte >> 1) & 0x1f;
+ bitSize = decodeBitWidth(fbo);
+
+ // extract the run length
+ runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
+ runLength |= readByte();
+ // runs are one off
+ runLength += 1;
+ runRead = 0;
+
+ // extract the number of bytes occupied by base
+ uint64_t thirdByte = readByte();
+ byteSize = (thirdByte >> 5) & 0x07;
+ // base width is one off
+ byteSize += 1;
+
+ // extract patch width
+ uint32_t pwo = thirdByte & 0x1f;
+ patchBitSize = decodeBitWidth(pwo);
+
+ // read fourth byte and extract patch gap width
+ uint64_t fourthByte = readByte();
+ uint32_t pgw = (fourthByte >> 5) & 0x07;
+ // patch gap width is one off
+ pgw += 1;
+
+ // extract the length of the patch list
+ size_t pl = fourthByte & 0x1f;
+ if (pl == 0) {
+ throw ParseError("Corrupt PATCHED_BASE encoded data (pl==0)!");
+ }
+
+ // read the next base width number of bytes to extract base value
+ base = readLongBE(byteSize);
+ int64_t mask = (static_cast<int64_t>(1) << ((byteSize * 8) - 1));
+    // if the sign bit of the base value is set, the base is negative; otherwise it is positive
+ if ((base & mask) != 0) {
+ base = base & ~mask;
+ base = -base;
+ }
+
+ // TODO: something more efficient than resize
+ unpacked.resize(runLength);
+ unpackedIdx = 0;
+ readLongs(unpacked.data(), 0, runLength, bitSize);
+ // any remaining bits are thrown out
+ resetReadLongs();
+
+ // TODO: something more efficient than resize
+ unpackedPatch.resize(pl);
+ patchIdx = 0;
+ // TODO: Skip corrupt?
+ // if ((patchBitSize + pgw) > 64 && !skipCorrupt) {
+ if ((patchBitSize + pgw) > 64) {
+ throw ParseError("Corrupt PATCHED_BASE encoded data "
+ "(patchBitSize + pgw > 64)!");
+ }
+ uint32_t cfb = getClosestFixedBits(patchBitSize + pgw);
+ readLongs(unpackedPatch.data(), 0, pl, cfb);
+ // any remaining bits are thrown out
+ resetReadLongs();
+
+ // apply the patch directly when decoding the packed data
+ patchMask = ((static_cast<int64_t>(1) << patchBitSize) - 1);
+
+ adjustGapAndPatch();
+ }
+
+ uint64_t nRead = std::min(runLength - runRead, numValues);
+
+ for(uint64_t pos = offset; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (notNull && !notNull[pos]) {
+ continue;
+ }
+ if (static_cast<int64_t>(unpackedIdx) != actualGap) {
+ // no patching required. add base to unpacked value to get final value
+ data[pos] = base + unpacked[unpackedIdx];
+ } else {
+ // extract the patch value
+ int64_t patchedVal = unpacked[unpackedIdx] | (curPatch << bitSize);
+
+ // add base to patched value
+ data[pos] = base + patchedVal;
+
+ // increment the patch to point to next entry in patch list
+ ++patchIdx;
+
+ if (patchIdx < unpackedPatch.size()) {
+ adjustGapAndPatch();
+
+ // next gap is relative to the current gap
+ actualGap += unpackedIdx;
+ }
+ }
+
+ ++runRead;
+ ++unpackedIdx;
+ }
+
+ return nRead;
+}
+
+uint64_t RleDecoderV2::nextDelta(int64_t* const data,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* const notNull) {
+ if (runRead == runLength) {
+ // extract the number of fixed bits
+ unsigned char fbo = (firstByte >> 1) & 0x1f;
+ if (fbo != 0) {
+ bitSize = decodeBitWidth(fbo);
+ } else {
+ bitSize = 0;
+ }
+
+ // extract the run length
+ runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
+ runLength |= readByte();
+ ++runLength; // account for first value
+ runRead = deltaBase = 0;
+
+ // read the first value stored as vint
+ if (isSigned) {
+ firstValue = static_cast<int64_t>(readVslong());
+ } else {
+ firstValue = static_cast<int64_t>(readVulong());
+ }
+
+ prevValue = firstValue;
+
+ // read the fixed delta value stored as vint (deltas can be negative even
+  // if all numbers are positive)
+ deltaBase = static_cast<int64_t>(readVslong());
+ }
+
+ uint64_t nRead = std::min(runLength - runRead, numValues);
+
+ uint64_t pos = offset;
+ for ( ; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (!notNull || notNull[pos]) break;
+ }
+ if (runRead == 0 && pos < offset + nRead) {
+ data[pos++] = firstValue;
+ ++runRead;
+ }
+
+ if (bitSize == 0) {
+ // add fixed deltas to adjacent values
+ for ( ; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (notNull && !notNull[pos]) {
+ continue;
+ }
+ prevValue = data[pos] = prevValue + deltaBase;
+ ++runRead;
+ }
+ } else {
+ for ( ; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (!notNull || notNull[pos]) break;
+ }
+ if (runRead < 2 && pos < offset + nRead) {
+ // add delta base and first value
+ prevValue = data[pos++] = firstValue + deltaBase;
+ ++runRead;
+ }
+
+  // unpack the values, add each one to the previous value and store the
+  // final value in the result buffer. if the delta base value is negative
+  // the sequence is decreasing, otherwise it is increasing
+ uint64_t remaining = (offset + nRead) - pos;
+ runRead += readLongs(data, pos, remaining, bitSize, notNull);
+
+ if (deltaBase < 0) {
+ for ( ; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (notNull && !notNull[pos]) {
+ continue;
+ }
+ prevValue = data[pos] = prevValue - data[pos];
+ }
+ } else {
+ for ( ; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (notNull && !notNull[pos]) {
+ continue;
+ }
+ prevValue = data[pos] = prevValue + data[pos];
+ }
+ }
+ }
+ return nRead;
+}
+
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc
index 44e2761b74..f77838a4dd 100644
--- a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc
+++ b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc
@@ -1,773 +1,773 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Adaptor.hh"
-#include "Compression.hh"
-#include "RLEv2.hh"
-#include "RLEV2Util.hh"
-
-#define MAX_LITERAL_SIZE 512
-#define MAX_SHORT_REPEAT_LENGTH 10
-
-namespace orc {
-
-/**
- * Compute the bits required to represent pth percentile value
- * @param data - array
- * @param p - percentile value (>0.0 and <=1.0)
- * @return pth percentile bits
- */
-uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) {
- if ((p > 1.0) || (p <= 0.0)) {
- throw InvalidArgument("Invalid p value: " + to_string(p));
- }
-
- if (!reuseHist) {
-    // histogram that stores the encoded bit requirement for each value.
-    // the maximum number of bits that can be encoded is 32 (refer FixedBitSizes)
- memset(histgram, 0, FixedBitSizes::SIZE * sizeof(int32_t));
- // compute the histogram
- for(size_t i = offset; i < (offset + length); i++) {
- uint32_t idx = encodeBitWidth(findClosestNumBits(data[i]));
- histgram[idx] += 1;
- }
- }
-
- int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * (1.0 - p));
-
- // return the bits required by pth percentile length
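-  // i.e. scan from the widest bit width down until (1 - p) * length values have been accounted for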
- for(int32_t i = HIST_LEN - 1; i >= 0; i--) {
- perLen -= histgram[i];
- if (perLen < 0) {
- return decodeBitWidth(static_cast<uint32_t>(i));
- }
- }
- return 0;
-}
-
-RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream,
- bool hasSigned, bool alignBitPacking) :
- RleEncoder(std::move(outStream), hasSigned),
- alignedBitPacking(alignBitPacking),
- prevDelta(0){
- literals = new int64_t[MAX_LITERAL_SIZE];
- gapVsPatchList = new int64_t[MAX_LITERAL_SIZE];
- zigzagLiterals = new int64_t[MAX_LITERAL_SIZE];
- baseRedLiterals = new int64_t[MAX_LITERAL_SIZE];
- adjDeltas = new int64_t[MAX_LITERAL_SIZE];
-}
-
-void RleEncoderV2::write(int64_t val) {
- if(numLiterals == 0) {
- initializeLiterals(val);
- return;
- }
-
- if(numLiterals == 1) {
- prevDelta = val - literals[0];
- literals[numLiterals++] = val;
-
- if(val == literals[0]) {
- fixedRunLength = 2;
- variableRunLength = 0;
- } else {
- fixedRunLength = 0;
- variableRunLength = 2;
- }
- return;
- }
-
- int64_t currentDelta = val - literals[numLiterals - 1];
- EncodingOption option = {};
- if (prevDelta == 0 && currentDelta == 0) {
- // case 1: fixed delta run
- literals[numLiterals++] = val;
-
- if (variableRunLength > 0) {
- // if variable run is non-zero then we are seeing repeating
- // values at the end of variable run in which case fixed Run
- // length is 2
- fixedRunLength = 2;
- }
- fixedRunLength++;
-
- // if fixed run met the minimum condition and if variable
- // run is non-zero then flush the variable run and shift the
- // tail fixed runs to start of the buffer
- if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
- numLiterals -= MIN_REPEAT;
- variableRunLength -= (MIN_REPEAT - 1);
-
- determineEncoding(option);
- writeValues(option);
-
- // shift tail fixed runs to beginning of the buffer
- for (size_t i = 0; i < MIN_REPEAT; ++i) {
- literals[i] = val;
- }
- numLiterals = MIN_REPEAT;
- }
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Adaptor.hh"
+#include "Compression.hh"
+#include "RLEv2.hh"
+#include "RLEV2Util.hh"
+
+#define MAX_LITERAL_SIZE 512
+#define MAX_SHORT_REPEAT_LENGTH 10
+
+namespace orc {
+
+/**
+ * Compute the bits required to represent pth percentile value
+ * @param data - array
+ * @param p - percentile value (0.0 < p <= 1.0)
+ * @return pth percentile bits
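+ *
+ * e.g. p = 0.9 yields the smallest fixed bit width that is wide enough to
+ * encode at least 90% of the values in data[offset, offset + length)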
+ */
+uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) {
+ if ((p > 1.0) || (p <= 0.0)) {
+ throw InvalidArgument("Invalid p value: " + to_string(p));
+ }
+
+ if (!reuseHist) {
+ // histogram that stores the encoded bit requirement for each value.
+ // maximum number of bits that can be encoded is 32 (refer to FixedBitSizes)
+ memset(histgram, 0, FixedBitSizes::SIZE * sizeof(int32_t));
+ // compute the histogram
+ for(size_t i = offset; i < (offset + length); i++) {
+ uint32_t idx = encodeBitWidth(findClosestNumBits(data[i]));
+ histgram[idx] += 1;
+ }
+ }
+
+ int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * (1.0 - p));
+
+ // return the number of bits required at the pth percentile
+ for(int32_t i = HIST_LEN - 1; i >= 0; i--) {
+ perLen -= histgram[i];
+ if (perLen < 0) {
+ return decodeBitWidth(static_cast<uint32_t>(i));
+ }
+ }
+ return 0;
+}
+
+RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream,
+ bool hasSigned, bool alignBitPacking) :
+ RleEncoder(std::move(outStream), hasSigned),
+ alignedBitPacking(alignBitPacking),
+ prevDelta(0){
+ literals = new int64_t[MAX_LITERAL_SIZE];
+ gapVsPatchList = new int64_t[MAX_LITERAL_SIZE];
+ zigzagLiterals = new int64_t[MAX_LITERAL_SIZE];
+ baseRedLiterals = new int64_t[MAX_LITERAL_SIZE];
+ adjDeltas = new int64_t[MAX_LITERAL_SIZE];
+}
+
+void RleEncoderV2::write(int64_t val) {
+ if(numLiterals == 0) {
+ initializeLiterals(val);
+ return;
+ }
+
+ if(numLiterals == 1) {
+ prevDelta = val - literals[0];
+ literals[numLiterals++] = val;
+
+ if(val == literals[0]) {
+ fixedRunLength = 2;
+ variableRunLength = 0;
+ } else {
+ fixedRunLength = 0;
+ variableRunLength = 2;
+ }
+ return;
+ }
+
+ int64_t currentDelta = val - literals[numLiterals - 1];
+ EncodingOption option = {};
+ if (prevDelta == 0 && currentDelta == 0) {
+ // case 1: fixed delta run
+ literals[numLiterals++] = val;
+
+ if (variableRunLength > 0) {
+ // if variable run is non-zero then we are seeing repeating
+ // values at the end of the variable run, in which case the fixed run
+ // length is 2
+ fixedRunLength = 2;
+ }
+ fixedRunLength++;
+
+ // if fixed run met the minimum condition and if variable
+ // run is non-zero then flush the variable run and shift the
+ // tail fixed runs to start of the buffer
+ if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
+ numLiterals -= MIN_REPEAT;
+ variableRunLength -= (MIN_REPEAT - 1);
+
+ determineEncoding(option);
+ writeValues(option);
+
+ // shift tail fixed runs to beginning of the buffer
+ for (size_t i = 0; i < MIN_REPEAT; ++i) {
+ literals[i] = val;
+ }
+ numLiterals = MIN_REPEAT;
+ }
+
if (fixedRunLength == MAX_LITERAL_SIZE) {
- determineEncoding(option);
- writeValues(option);
- }
- return;
- }
-
- // case 2: variable delta run
-
- // if fixed run length is non-zero and if it satisfies the
- // short repeat conditions then write the values as short repeats
- // else use delta encoding
- if (fixedRunLength >= MIN_REPEAT) {
- if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
- option.encoding = SHORT_REPEAT;
- } else {
- option.encoding = DELTA;
- option.isFixedDelta = true;
- }
- writeValues(option);
- }
-
- // if fixed run length is <MIN_REPEAT and current value is
- // different from previous then treat it as variable run
- if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) {
- variableRunLength = fixedRunLength;
- fixedRunLength = 0;
- }
-
- // after writing values re-initialize the variables
- if (numLiterals == 0) {
- initializeLiterals(val);
- } else {
- prevDelta = val - literals[numLiterals - 1];
- literals[numLiterals++] = val;
- variableRunLength++;
-
- if (variableRunLength == MAX_LITERAL_SIZE) {
- determineEncoding(option);
- writeValues(option);
- }
- }
-}
-
-void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) {
- int64_t zzEncVal = 0;
- for (size_t i = 0; i < numLiterals; i++) {
- if (isSigned) {
- zzEncVal = zigZag(literals[i]);
- } else {
- zzEncVal = literals[i];
- }
- zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal;
- }
-}
-
-void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
- // mask will be max value beyond which patch will be generated
- int64_t mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1;
-
- // since we are considering only the 95th percentile, the gap and
- // patch arrays can contain at most 5% of the values
- option.patchLength = static_cast<uint32_t>(std::ceil((numLiterals / 20)));
-
- // #bit for patch
- option.patchWidth = option.brBits100p - option.brBits95p;
- option.patchWidth = getClosestFixedBits(option.patchWidth);
-
- // if the patch bit requirement is 64 then it will not be possible to pack
- // gap and patch together in a long. To make sure gap and patch can be
- // packed together, adjust the patch width
- if (option.patchWidth == 64) {
- option.patchWidth = 56;
- option.brBits95p = 8;
- mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1;
- }
-
- uint32_t gapIdx = 0;
- uint32_t patchIdx = 0;
- size_t prev = 0;
- size_t maxGap = 0;
-
- std::vector<int64_t> gapList;
- std::vector<int64_t> patchList;
-
- for(size_t i = 0; i < numLiterals; i++) {
- // if value is above mask then create the patch and record the gap
- if (baseRedLiterals[i] > mask) {
- size_t gap = i - prev;
- if (gap > maxGap) {
- maxGap = gap;
- }
-
- // gaps are relative, so store the previous patched value index
- prev = i;
- gapList.push_back(static_cast<int64_t>(gap));
- gapIdx++;
-
- // extract the most significant bits that are over mask bits
- int64_t patch = baseRedLiterals[i] >> option.brBits95p;
- patchList.push_back(patch);
- patchIdx++;
-
- // strip off the MSB to enable safe bit packing
- baseRedLiterals[i] &= mask;
- }
- }
-
- // adjust the patch length to number of entries in gap list
- option.patchLength = gapIdx;
-
- // if the element to be patched is the first and only element then
- // max gap will be 0, but to store the gap as 0 we need at least 1 bit
- if (maxGap == 0 && option.patchLength != 0) {
- option.patchGapWidth = 1;
- } else {
- option.patchGapWidth = findClosestNumBits(static_cast<int64_t>(maxGap));
- }
-
- // special case: if a patch gap is greater than 255 then we would need more
- // than 8 bits to encode it, but we only have 3 bits in the header to record
- // the gap width. To deal with this case, we save two entries in the patch
- // list in the following way
- // 255 gap => 0 for patch value
- // actual gap - 255 => actual patch value
- // We do the same when the gap is 511. If the element to be patched is
- // the last element in the scope then the gap will be 511. In this case we
- // will have 3 entries in the patch list in the following way
- // 255 gap => 0 for patch value
- // 255 gap => 0 for patch value
- // 1 gap => actual patch value
- if (option.patchGapWidth > 8) {
- option.patchGapWidth = 8;
- // for gap = 511, we need two additional entries in patch list
- if (maxGap == 511) {
- option.patchLength += 2;
- } else {
- option.patchLength += 1;
- }
- }
-
- // create gap vs patch list
- gapIdx = 0;
- patchIdx = 0;
- for(size_t i = 0; i < option.patchLength; i++) {
- int64_t g = gapList[gapIdx++];
- int64_t p = patchList[patchIdx++];
- while (g > 255) {
- gapVsPatchList[option.gapVsPatchListCount++] = (255L << option.patchWidth);
- i++;
- g -= 255;
- }
-
- // store patch value in LSBs and gap in MSBs
- gapVsPatchList[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p);
- }
-}
-
-void RleEncoderV2::determineEncoding(EncodingOption& option) {
- // We need to compute zigzag values for DIRECT and PATCHED_BASE encodings,
- // but not for SHORT_REPEAT or DELTA. So we only perform the zigzag
- // computation when it's determined to be necessary.
-
- // not a big win for shorter runs to determine encoding
- if (numLiterals <= MIN_REPEAT) {
- // we need to compute zigzag values for DIRECT encoding if we decide to
- // break early for delta overflows or for shorter runs
- computeZigZagLiterals(option);
- option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
- option.encoding = DIRECT;
- return;
- }
-
- // DELTA encoding check
-
- // for identifying monotonic sequences
- bool isIncreasing = true;
- bool isDecreasing = true;
- option.isFixedDelta = true;
-
- option.min = literals[0];
- int64_t max = literals[0];
- int64_t initialDelta = literals[1] - literals[0];
- int64_t currDelta = 0;
- int64_t deltaMax = 0;
- adjDeltas[option.adjDeltasCount++] = initialDelta;
-
- for (size_t i = 1; i < numLiterals; i++) {
- const int64_t l1 = literals[i];
- const int64_t l0 = literals[i - 1];
- currDelta = l1 - l0;
- option.min = std::min(option.min, l1);
- max = std::max(max, l1);
-
- isIncreasing &= (l0 <= l1);
- isDecreasing &= (l0 >= l1);
-
- option.isFixedDelta &= (currDelta == initialDelta);
- if (i > 1) {
- adjDeltas[option.adjDeltasCount++] = std::abs(currDelta);
- deltaMax = std::max(deltaMax, adjDeltas[i - 1]);
- }
- }
-
- // it's faster to exit under delta overflow condition without checking for
- // PATCHED_BASE condition as encoding using DIRECT is faster and has less
- // overhead than PATCHED_BASE
- if (!isSafeSubtract(max, option.min)) {
- computeZigZagLiterals(option);
- option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
- option.encoding = DIRECT;
- return;
- }
-
- // invariant - subtracting any number from any other in the literals after
- // this point won't overflow
-
- // if min is equal to max then the delta is 0; this condition happens for
- // a run of more than 10 identical values, which cannot be encoded with SHORT_REPEAT
- if (option.min == max) {
- if (!option.isFixedDelta) {
- throw InvalidArgument(to_string(option.min) + "==" +
- to_string(max) + ", isFixedDelta cannot be false");
- }
-
- if(currDelta != 0) {
- throw InvalidArgument(to_string(option.min) + "==" +
- to_string(max) + ", currDelta should be zero");
- }
- option.fixedDelta = 0;
- option.encoding = DELTA;
- return;
- }
-
- if (option.isFixedDelta) {
- if (currDelta != initialDelta) {
- throw InvalidArgument("currDelta should be equal to initialDelta for fixed delta encoding");
- }
-
- option.encoding = DELTA;
- option.fixedDelta = currDelta;
- return;
- }
-
- // if initialDelta is 0 then we cannot delta encode as we cannot identify
- // the sign of deltas (increasing or decreasing)
- if (initialDelta != 0) {
- // stores the number of bits required for packing delta blob in
- // delta encoding
- option.bitsDeltaMax = findClosestNumBits(deltaMax);
-
- // monotonic condition
- if (isIncreasing || isDecreasing) {
- option.encoding = DELTA;
- return;
- }
- }
-
- // PATCHED_BASE encoding check
-
- // percentile values are computed for the zigzag encoded values. if the
- // number of bit requirement between 90th and 100th percentile varies
- // beyond a threshold then we need to patch the values. if the variation
- // is not significant then we can use direct encoding
-
- computeZigZagLiterals(option);
- option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
- option.zzBits90p = percentileBits(zigzagLiterals, 0, numLiterals, 0.9, true);
- uint32_t diffBitsLH = option.zzBits100p - option.zzBits90p;
-
- // if the difference between the 90th percentile and 100th percentile fixed
- // bits is > 1 then we need to patch the values
- if (diffBitsLH > 1) {
-
- // patching is done only on base reduced values.
- // remove base from literals
- for (size_t i = 0; i < numLiterals; i++) {
- baseRedLiterals[option.baseRedLiteralsCount++] = (literals[i] - option.min);
- }
-
- // 95th percentile width is used to determine max allowed value
- // after which patching will be done
- option.brBits95p = percentileBits(baseRedLiterals, 0, numLiterals, 0.95);
-
- // 100th percentile is used to compute the max patch width
- option.brBits100p = percentileBits(baseRedLiterals, 0, numLiterals, 1.0, true);
-
- // after base reducing the values, if the difference in bits between
- // 95th percentile and 100th percentile value is zero then there
- // is no point in patching the values, in which case we will
- // fall back to DIRECT encoding.
- // The decision to use patched base was based on zigzag values, but the
- // actual patching is done on base reduced literals.
- if ((option.brBits100p - option.brBits95p) != 0) {
- option.encoding = PATCHED_BASE;
- preparePatchedBlob(option);
- return;
- } else {
- option.encoding = DIRECT;
- return;
- }
- } else {
- // if the difference in bits between the 90th and 100th percentile is at
- // most 1, patching brings no benefit. Hence we fall back to DIRECT
- option.encoding = DIRECT;
- return;
- }
-}
-
-uint64_t RleEncoderV2::flush() {
- if (numLiterals != 0) {
- EncodingOption option = {};
- if (variableRunLength != 0) {
- determineEncoding(option);
- writeValues(option);
- } else if (fixedRunLength != 0) {
- if (fixedRunLength < MIN_REPEAT) {
- variableRunLength = fixedRunLength;
- fixedRunLength = 0;
- determineEncoding(option);
- writeValues(option);
- } else if (fixedRunLength >= MIN_REPEAT
- && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
- option.encoding = SHORT_REPEAT;
- writeValues(option);
- } else {
- option.encoding = DELTA;
- option.isFixedDelta = true;
- writeValues(option);
- }
- }
- }
-
- outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition));
- uint64_t dataSize = outputStream->flush();
- bufferLength = bufferPosition = 0;
- return dataSize;
-}
-
-void RleEncoderV2::writeValues(EncodingOption& option) {
- if (numLiterals != 0) {
- switch (option.encoding) {
- case SHORT_REPEAT:
- writeShortRepeatValues(option);
- break;
- case DIRECT:
- writeDirectValues(option);
- break;
- case PATCHED_BASE:
- writePatchedBasedValues(option);
- break;
- case DELTA:
- writeDeltaValues(option);
- break;
- default:
- throw NotImplementedYet("Not implemented yet");
- }
-
- numLiterals = 0;
- prevDelta = 0;
- }
-}
-
-void RleEncoderV2::writeShortRepeatValues(EncodingOption&) {
- int64_t repeatVal;
- if (isSigned) {
- repeatVal = zigZag(literals[0]);
- } else {
- repeatVal = literals[0];
- }
-
- const uint32_t numBitsRepeatVal = findClosestNumBits(repeatVal);
- const uint32_t numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? (numBitsRepeatVal >> 3) : ((numBitsRepeatVal >> 3) + 1);
-
- uint32_t header = getOpCode(SHORT_REPEAT);
-
- fixedRunLength -= MIN_REPEAT;
- header |= fixedRunLength;
- header |= ((numBytesRepeatVal - 1) << 3);
-
- writeByte(static_cast<char>(header));
-
- for(int32_t i = static_cast<int32_t>(numBytesRepeatVal - 1); i >= 0; i--) {
- int64_t b = ((repeatVal >> (i * 8)) & 0xff);
- writeByte(static_cast<char>(b));
- }
-
- fixedRunLength = 0;
-}
-
-void RleEncoderV2::writeDirectValues(EncodingOption& option) {
- // write the number of fixed bits required in next 5 bits
- uint32_t fb = option.zzBits100p;
- if (alignedBitPacking) {
- fb = getClosestAlignedFixedBits(fb);
- }
-
- const uint32_t efb = encodeBitWidth(fb) << 1;
-
- // adjust variable run length
- variableRunLength -= 1;
-
- // extract the 9th bit of run length
- const uint32_t tailBits = (variableRunLength & 0x100) >> 8;
-
- // create first byte of the header
- const char headerFirstByte = static_cast<char>(getOpCode(DIRECT) | efb | tailBits);
-
- // second byte of the header stores the remaining 8 bits of runlength
- const char headerSecondByte = static_cast<char>(variableRunLength & 0xff);
-
- // write header
- writeByte(headerFirstByte);
- writeByte(headerSecondByte);
-
- // bit packing the zigzag encoded literals
- writeInts(zigzagLiterals, 0, numLiterals, fb);
-
- // reset run length
- variableRunLength = 0;
-}
-
-void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) {
- // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding
- // because patch is applied to MSB bits. For example: If fixed bit width of
- // base value is 7 bits and if patch is 3 bits, the actual value is
- // constructed by shifting the patch to left by 7 positions.
- // actual_value = patch << 7 | base_value
- // So, if we align base_value then actual_value can not be reconstructed.
-
- // write the number of fixed bits required in next 5 bits
- const uint32_t efb = encodeBitWidth(option.brBits95p) << 1;
-
- // adjust variable run length; the header stores (run length - 1)
- variableRunLength -= 1;
-
- // extract the 9th bit of run length
- const uint32_t tailBits = (variableRunLength & 0x100) >> 8;
-
- // create first byte of the header
- const char headerFirstByte = static_cast<char>(getOpCode(PATCHED_BASE) | efb | tailBits);
-
- // second byte of the header stores the remaining 8 bits of runlength
- const char headerSecondByte = static_cast<char>(variableRunLength & 0xff);
-
- // if the min value is negative toggle the sign
- const bool isNegative = (option.min < 0);
- if (isNegative) {
- option.min = -option.min;
- }
-
- // find the number of bytes required for base and shift it by 5 bits
- // to accommodate patch width. The additional bit is used to store the sign
- // of the base value.
- const uint32_t baseWidth = findClosestNumBits(option.min) + 1;
- const uint32_t baseBytes = baseWidth % 8 == 0 ? baseWidth / 8 : (baseWidth / 8) + 1;
- const uint32_t bb = (baseBytes - 1) << 5;
-
- // if the base value is negative then set MSB to 1
- if (isNegative) {
- option.min |= (1LL << ((baseBytes * 8) - 1));
- }
-
- // third byte contains 3 bits for number of bytes occupied by base
- // and 5 bits for patchWidth
- const char headerThirdByte = static_cast<char>(bb | encodeBitWidth(option.patchWidth));
-
- // fourth byte contains 3 bits for patch gap width and 5 bits for
- // patch length
- const char headerFourthByte = static_cast<char>((option.patchGapWidth - 1) << 5 | option.patchLength);
-
- // write header
- writeByte(headerFirstByte);
- writeByte(headerSecondByte);
- writeByte(headerThirdByte);
- writeByte(headerFourthByte);
-
- // write the base value using fixed bytes in big endian order
- for(int32_t i = static_cast<int32_t>(baseBytes - 1); i >= 0; i--) {
- char b = static_cast<char>(((option.min >> (i * 8)) & 0xff));
- writeByte(b);
- }
-
- // base reduced literals are bit packed
- uint32_t closestFixedBits = getClosestFixedBits(option.brBits95p);
-
- writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits);
-
- // write patch list
- closestFixedBits = getClosestFixedBits(option.patchGapWidth + option.patchWidth);
-
- writeInts(gapVsPatchList, 0, option.patchLength, closestFixedBits);
-
- // reset run length
- variableRunLength = 0;
-}
-
-void RleEncoderV2::writeDeltaValues(EncodingOption& option) {
- uint32_t len = 0;
- uint32_t fb = option.bitsDeltaMax;
- uint32_t efb = 0;
-
- if (alignedBitPacking) {
- fb = getClosestAlignedFixedBits(fb);
- }
-
- if (option.isFixedDelta) {
- // if fixed run length is greater than threshold then it will be fixed
- // delta sequence with delta value 0 else fixed delta sequence with
- // non-zero delta value
- if (fixedRunLength > MIN_REPEAT) {
- // ex. sequence: 2 2 2 2 2 2 2 2
- len = fixedRunLength - 1;
- fixedRunLength = 0;
- } else {
- // ex. sequence: 4 6 8 10 12 14 16
- len = variableRunLength - 1;
- variableRunLength = 0;
- }
- } else {
- // fixed width 0 is used for long repeating values.
- // sequences that require only 1 bit to encode will have an additional bit
- if (fb == 1) {
- fb = 2;
- }
- efb = encodeBitWidth(fb) << 1;
- len = variableRunLength - 1;
- variableRunLength = 0;
- }
-
- // extract the 9th bit of run length
- const uint32_t tailBits = (len & 0x100) >> 8;
-
- // create first byte of the header
- const char headerFirstByte = static_cast<char>(getOpCode(DELTA) | efb | tailBits);
-
- // second byte of the header stores the remaining 8 bits of runlength
- const char headerSecondByte = static_cast<char>(len & 0xff);
-
- // write header
- writeByte(headerFirstByte);
- writeByte(headerSecondByte);
-
- // write the first literal as a varint (zigzag encoded when signed)
- if (isSigned) {
- writeVslong(literals[0]);
- } else {
- writeVulong(literals[0]);
- }
-
- if (option.isFixedDelta) {
- // if delta is fixed then we don't need to store delta blob
- writeVslong(option.fixedDelta);
- } else {
- // store the first delta value using zigzag (signed varint) encoding
- writeVslong(adjDeltas[0]);
-
- // adjacent delta values are bit packed. The length of adjDeltas array is
- // always one less than the number of literals (delta difference for n
- // elements is n-1). We have already written one element, write the
- // remaining numLiterals - 2 elements here
- writeInts(adjDeltas, 1, numLiterals - 2, fb);
- }
-}
-
-void RleEncoderV2::writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize) {
- if(input == nullptr || len < 1 || bitSize < 1) {
- return;
- }
-
- if (getClosestAlignedFixedBits(bitSize) == bitSize) {
- uint32_t numBytes;
- uint32_t endOffSet = static_cast<uint32_t>(offset + len);
+ determineEncoding(option);
+ writeValues(option);
+ }
+ return;
+ }
+
+ // case 2: variable delta run
+
+ // if fixed run length is non-zero and if it satisfies the
+ // short repeat conditions then write the values as short repeats
+ // else use delta encoding
+ if (fixedRunLength >= MIN_REPEAT) {
+ if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
+ option.encoding = SHORT_REPEAT;
+ } else {
+ option.encoding = DELTA;
+ option.isFixedDelta = true;
+ }
+ writeValues(option);
+ }
+
+ // if fixed run length is <MIN_REPEAT and current value is
+ // different from previous then treat it as variable run
+ if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) {
+ variableRunLength = fixedRunLength;
+ fixedRunLength = 0;
+ }
+
+ // after writing values re-initialize the variables
+ if (numLiterals == 0) {
+ initializeLiterals(val);
+ } else {
+ prevDelta = val - literals[numLiterals - 1];
+ literals[numLiterals++] = val;
+ variableRunLength++;
+
+ if (variableRunLength == MAX_LITERAL_SIZE) {
+ determineEncoding(option);
+ writeValues(option);
+ }
+ }
+}
+
+void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) {
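+ // zigzag encoding interleaves negative and positive values so that small
+ // magnitudes need few bits: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...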
+ int64_t zzEncVal = 0;
+ for (size_t i = 0; i < numLiterals; i++) {
+ if (isSigned) {
+ zzEncVal = zigZag(literals[i]);
+ } else {
+ zzEncVal = literals[i];
+ }
+ zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal;
+ }
+}
+
+void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
+ // mask will be max value beyond which patch will be generated
+ int64_t mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1;
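+
+ // e.g. if brBits95p is 7 the mask is 0x7f: values that fit in 7 bits stay
+ // untouched, while larger values keep only their low 7 bits here and emit
+ // their overflow bits separately as patch entries below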
+
+ // since we are considering only the 95th percentile, the gap and
+ // patch arrays can contain at most 5% of the values
+ option.patchLength = static_cast<uint32_t>(std::ceil((numLiterals / 20)));
+
+ // #bit for patch
+ option.patchWidth = option.brBits100p - option.brBits95p;
+ option.patchWidth = getClosestFixedBits(option.patchWidth);
+
+ // if the patch bit requirement is 64 then it will not be possible to pack
+ // gap and patch together in a long. To make sure gap and patch can be
+ // packed together, adjust the patch width
+ if (option.patchWidth == 64) {
+ option.patchWidth = 56;
+ option.brBits95p = 8;
+ mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1;
+ }
+
+ uint32_t gapIdx = 0;
+ uint32_t patchIdx = 0;
+ size_t prev = 0;
+ size_t maxGap = 0;
+
+ std::vector<int64_t> gapList;
+ std::vector<int64_t> patchList;
+
+ for(size_t i = 0; i < numLiterals; i++) {
+ // if value is above mask then create the patch and record the gap
+ if (baseRedLiterals[i] > mask) {
+ size_t gap = i - prev;
+ if (gap > maxGap) {
+ maxGap = gap;
+ }
+
+ // gaps are relative, so store the previous patched value index
+ prev = i;
+ gapList.push_back(static_cast<int64_t>(gap));
+ gapIdx++;
+
+ // extract the most significant bits that are over mask bits
+ int64_t patch = baseRedLiterals[i] >> option.brBits95p;
+ patchList.push_back(patch);
+ patchIdx++;
+
+ // strip off the MSB to enable safe bit packing
+ baseRedLiterals[i] &= mask;
+ }
+ }
+
+ // adjust the patch length to number of entries in gap list
+ option.patchLength = gapIdx;
+
+ // if the element to be patched is the first and only element then
+ // max gap will be 0, but to store the gap as 0 we need at least 1 bit
+ if (maxGap == 0 && option.patchLength != 0) {
+ option.patchGapWidth = 1;
+ } else {
+ option.patchGapWidth = findClosestNumBits(static_cast<int64_t>(maxGap));
+ }
+
+ // special case: if a patch gap is greater than 255 then we would need more
+ // than 8 bits to encode it, but we only have 3 bits in the header to record
+ // the gap width. To deal with this case, we save two entries in the patch
+ // list in the following way
+ // 255 gap => 0 for patch value
+ // actual gap - 255 => actual patch value
+ // We do the same when the gap is 511. If the element to be patched is
+ // the last element in the scope then the gap will be 511. In this case we
+ // will have 3 entries in the patch list in the following way
+ // 255 gap => 0 for patch value
+ // 255 gap => 0 for patch value
+ // 1 gap => actual patch value
+ if (option.patchGapWidth > 8) {
+ option.patchGapWidth = 8;
+ // for gap = 511, we need two additional entries in patch list
+ if (maxGap == 511) {
+ option.patchLength += 2;
+ } else {
+ option.patchLength += 1;
+ }
+ }
+
+ // create gap vs patch list
+ gapIdx = 0;
+ patchIdx = 0;
+ for(size_t i = 0; i < option.patchLength; i++) {
+ int64_t g = gapList[gapIdx++];
+ int64_t p = patchList[patchIdx++];
+ while (g > 255) {
+ gapVsPatchList[option.gapVsPatchListCount++] = (255L << option.patchWidth);
+ i++;
+ g -= 255;
+ }
+
+ // store patch value in LSBs and gap in MSBs
+ gapVsPatchList[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p);
+ }
+}
+
+void RleEncoderV2::determineEncoding(EncodingOption& option) {
+ // We need to compute zigzag values for DIRECT and PATCHED_BASE encodings,
+ // but not for SHORT_REPEAT or DELTA. So we only perform the zigzag
+ // computation when it's determined to be necessary.
+
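+ // this function picks DIRECT, DELTA or PATCHED_BASE for the buffered run;
+ // SHORT_REPEAT (3 to 10 repetitions of a single value) is chosen directly
+ // in write() and flush() without calling this function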
+ // not a big win for shorter runs to determine encoding
+ if (numLiterals <= MIN_REPEAT) {
+ // we need to compute zigzag values for DIRECT encoding if we decide to
+ // break early for delta overflows or for shorter runs
+ computeZigZagLiterals(option);
+ option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
+ option.encoding = DIRECT;
+ return;
+ }
+
+ // DELTA encoding check
+
+ // for identifying monotonic sequences
+ bool isIncreasing = true;
+ bool isDecreasing = true;
+ option.isFixedDelta = true;
+
+ option.min = literals[0];
+ int64_t max = literals[0];
+ int64_t initialDelta = literals[1] - literals[0];
+ int64_t currDelta = 0;
+ int64_t deltaMax = 0;
+ adjDeltas[option.adjDeltasCount++] = initialDelta;
+
+ for (size_t i = 1; i < numLiterals; i++) {
+ const int64_t l1 = literals[i];
+ const int64_t l0 = literals[i - 1];
+ currDelta = l1 - l0;
+ option.min = std::min(option.min, l1);
+ max = std::max(max, l1);
+
+ isIncreasing &= (l0 <= l1);
+ isDecreasing &= (l0 >= l1);
+
+ option.isFixedDelta &= (currDelta == initialDelta);
+ if (i > 1) {
+ adjDeltas[option.adjDeltasCount++] = std::abs(currDelta);
+ deltaMax = std::max(deltaMax, adjDeltas[i - 1]);
+ }
+ }
+
+ // it's faster to exit under delta overflow condition without checking for
+ // PATCHED_BASE condition as encoding using DIRECT is faster and has less
+ // overhead than PATCHED_BASE
+ if (!isSafeSubtract(max, option.min)) {
+ computeZigZagLiterals(option);
+ option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
+ option.encoding = DIRECT;
+ return;
+ }
+
+ // invariant - subtracting any number from any other in the literals after
+ // this point won't overflow
+
+ // if min is equal to max then the delta is 0; this condition happens for
+ // a run of more than 10 identical values, which cannot be encoded with SHORT_REPEAT
+ if (option.min == max) {
+ if (!option.isFixedDelta) {
+ throw InvalidArgument(to_string(option.min) + "==" +
+ to_string(max) + ", isFixedDelta cannot be false");
+ }
+
+ if(currDelta != 0) {
+ throw InvalidArgument(to_string(option.min) + "==" +
+ to_string(max) + ", currDelta should be zero");
+ }
+ option.fixedDelta = 0;
+ option.encoding = DELTA;
+ return;
+ }
+
+ if (option.isFixedDelta) {
+ if (currDelta != initialDelta) {
+ throw InvalidArgument("currDelta should be equal to initialDelta for fixed delta encoding");
+ }
+
+ option.encoding = DELTA;
+ option.fixedDelta = currDelta;
+ return;
+ }
+
+ // if initialDelta is 0 then we cannot delta encode as we cannot identify
+ // the sign of deltas (increasing or decreasing)
+ if (initialDelta != 0) {
+ // stores the number of bits required for packing delta blob in
+ // delta encoding
+ option.bitsDeltaMax = findClosestNumBits(deltaMax);
+
+ // monotonic condition
+ if (isIncreasing || isDecreasing) {
+ option.encoding = DELTA;
+ return;
+ }
+ }
+
+ // PATCHED_BASE encoding check
+
+ // percentile values are computed for the zigzag encoded values. if the
+ // number of bit requirement between 90th and 100th percentile varies
+ // beyond a threshold then we need to patch the values. if the variation
+ // is not significant then we can use direct encoding
+
+ computeZigZagLiterals(option);
+ option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
+ option.zzBits90p = percentileBits(zigzagLiterals, 0, numLiterals, 0.9, true);
+ uint32_t diffBitsLH = option.zzBits100p - option.zzBits90p;
+
+ // if the difference between the 90th percentile and 100th percentile fixed
+ // bits is > 1 then we need to patch the values
+ if (diffBitsLH > 1) {
+
+ // patching is done only on base reduced values.
+ // remove base from literals
+ for (size_t i = 0; i < numLiterals; i++) {
+ baseRedLiterals[option.baseRedLiteralsCount++] = (literals[i] - option.min);
+ }
+
+ // 95th percentile width is used to determine max allowed value
+ // after which patching will be done
+ option.brBits95p = percentileBits(baseRedLiterals, 0, numLiterals, 0.95);
+
+ // 100th percentile is used to compute the max patch width
+ option.brBits100p = percentileBits(baseRedLiterals, 0, numLiterals, 1.0, true);
+
+ // after base reducing the values, if the difference in bits between
+ // 95th percentile and 100th percentile value is zero then there
+ // is no point in patching the values, in which case we will
+ // fall back to DIRECT encoding.
+ // The decision to use patched base was based on zigzag values, but the
+ // actual patching is done on base reduced literals.
+ if ((option.brBits100p - option.brBits95p) != 0) {
+ option.encoding = PATCHED_BASE;
+ preparePatchedBlob(option);
+ return;
+ } else {
+ option.encoding = DIRECT;
+ return;
+ }
+ } else {
+ // if the difference in bits between the 90th and 100th percentile is at
+ // most 1, patching brings no benefit. Hence we fall back to DIRECT
+ option.encoding = DIRECT;
+ return;
+ }
+}
+
+uint64_t RleEncoderV2::flush() {
+ if (numLiterals != 0) {
+ EncodingOption option = {};
+ if (variableRunLength != 0) {
+ determineEncoding(option);
+ writeValues(option);
+ } else if (fixedRunLength != 0) {
+ if (fixedRunLength < MIN_REPEAT) {
+ variableRunLength = fixedRunLength;
+ fixedRunLength = 0;
+ determineEncoding(option);
+ writeValues(option);
+ } else if (fixedRunLength >= MIN_REPEAT
+ && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
+ option.encoding = SHORT_REPEAT;
+ writeValues(option);
+ } else {
+ option.encoding = DELTA;
+ option.isFixedDelta = true;
+ writeValues(option);
+ }
+ }
+ }
+
+ outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition));
+ uint64_t dataSize = outputStream->flush();
+ bufferLength = bufferPosition = 0;
+ return dataSize;
+}
+
+void RleEncoderV2::writeValues(EncodingOption& option) {
+ if (numLiterals != 0) {
+ switch (option.encoding) {
+ case SHORT_REPEAT:
+ writeShortRepeatValues(option);
+ break;
+ case DIRECT:
+ writeDirectValues(option);
+ break;
+ case PATCHED_BASE:
+ writePatchedBasedValues(option);
+ break;
+ case DELTA:
+ writeDeltaValues(option);
+ break;
+ default:
+ throw NotImplementedYet("Not implemented yet");
+ }
+
+ numLiterals = 0;
+ prevDelta = 0;
+ }
+}
+
+void RleEncoderV2::writeShortRepeatValues(EncodingOption&) {
+ int64_t repeatVal;
+ if (isSigned) {
+ repeatVal = zigZag(literals[0]);
+ } else {
+ repeatVal = literals[0];
+ }
+
+ const uint32_t numBitsRepeatVal = findClosestNumBits(repeatVal);
+ const uint32_t numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? (numBitsRepeatVal >> 3) : ((numBitsRepeatVal >> 3) + 1);
+
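+ // the SHORT_REPEAT header is a single byte: 2 bits for the encoding (00),
+ // 3 bits for (value width in bytes - 1) and 3 bits for
+ // (repeat count - MIN_REPEAT), assembled below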
+ uint32_t header = getOpCode(SHORT_REPEAT);
+
+ fixedRunLength -= MIN_REPEAT;
+ header |= fixedRunLength;
+ header |= ((numBytesRepeatVal - 1) << 3);
+
+ writeByte(static_cast<char>(header));
+
+ for(int32_t i = static_cast<int32_t>(numBytesRepeatVal - 1); i >= 0; i--) {
+ int64_t b = ((repeatVal >> (i * 8)) & 0xff);
+ writeByte(static_cast<char>(b));
+ }
+
+ fixedRunLength = 0;
+}
+
+void RleEncoderV2::writeDirectValues(EncodingOption& option) {
+ // write the number of fixed bits required in next 5 bits
+ uint32_t fb = option.zzBits100p;
+ if (alignedBitPacking) {
+ fb = getClosestAlignedFixedBits(fb);
+ }
+
+ const uint32_t efb = encodeBitWidth(fb) << 1;
+
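+ // the DIRECT header is two bytes: 2 bits for the encoding (01), 5 bits for
+ // the encoded fixed bit width and 9 bits for (run length - 1), with the
+ // 9th length bit carried in the first byte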
+ // adjust variable run length
+ variableRunLength -= 1;
+
+ // extract the 9th bit of run length
+ const uint32_t tailBits = (variableRunLength & 0x100) >> 8;
+
+ // create first byte of the header
+ const char headerFirstByte = static_cast<char>(getOpCode(DIRECT) | efb | tailBits);
+
+ // second byte of the header stores the remaining 8 bits of runlength
+ const char headerSecondByte = static_cast<char>(variableRunLength & 0xff);
+
+ // write header
+ writeByte(headerFirstByte);
+ writeByte(headerSecondByte);
+
+ // bit packing the zigzag encoded literals
+ writeInts(zigzagLiterals, 0, numLiterals, fb);
+
+ // reset run length
+ variableRunLength = 0;
+}
+
+void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) {
+ // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding
+ // because patch is applied to MSB bits. For example: If fixed bit width of
+ // base value is 7 bits and if patch is 3 bits, the actual value is
+ // constructed by shifting the patch to left by 7 positions.
+ // actual_value = patch << 7 | base_value
+ // So, if we align base_value then actual_value can not be reconstructed.
+
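+ // the PATCHED_BASE header is four bytes:
+ // byte 1: 2 bits encoding (10), 5 bits base-reduced bit width, 1 bit of length
+ // byte 2: remaining 8 bits of (run length - 1)
+ // byte 3: 3 bits for (base value bytes - 1), 5 bits for the patch width
+ // byte 4: 3 bits for (patch gap width - 1), 5 bits for the patch list length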
+ // write the number of fixed bits required in next 5 bits
+ const uint32_t efb = encodeBitWidth(option.brBits95p) << 1;
+
+ // adjust variable run length; the header stores (run length - 1)
+ variableRunLength -= 1;
+
+ // extract the 9th bit of run length
+ const uint32_t tailBits = (variableRunLength & 0x100) >> 8;
+
+ // create first byte of the header
+ const char headerFirstByte = static_cast<char>(getOpCode(PATCHED_BASE) | efb | tailBits);
+
+ // second byte of the header stores the remaining 8 bits of runlength
+ const char headerSecondByte = static_cast<char>(variableRunLength & 0xff);
+
+ // if the min value is negative toggle the sign
+ const bool isNegative = (option.min < 0);
+ if (isNegative) {
+ option.min = -option.min;
+ }
+
+ // find the number of bytes required for base and shift it by 5 bits
+ // to accommodate patch width. The additional bit is used to store the sign
+ // of the base value.
+ const uint32_t baseWidth = findClosestNumBits(option.min) + 1;
+ const uint32_t baseBytes = baseWidth % 8 == 0 ? baseWidth / 8 : (baseWidth / 8) + 1;
+ const uint32_t bb = (baseBytes - 1) << 5;
+
+ // if the base value is negative then set MSB to 1
+ if (isNegative) {
+ option.min |= (1LL << ((baseBytes * 8) - 1));
+ }
+
+ // third byte contains 3 bits for number of bytes occupied by base
+ // and 5 bits for patchWidth
+ const char headerThirdByte = static_cast<char>(bb | encodeBitWidth(option.patchWidth));
+
+ // fourth byte contains 3 bits for patch gap width and 5 bits for
+ // patch length
+ const char headerFourthByte = static_cast<char>((option.patchGapWidth - 1) << 5 | option.patchLength);
+
+ // write header
+ writeByte(headerFirstByte);
+ writeByte(headerSecondByte);
+ writeByte(headerThirdByte);
+ writeByte(headerFourthByte);
+
+ // write the base value using fixed bytes in big endian order
+ for(int32_t i = static_cast<int32_t>(baseBytes - 1); i >= 0; i--) {
+ char b = static_cast<char>(((option.min >> (i * 8)) & 0xff));
+ writeByte(b);
+ }
+
+ // base reduced literals are bit packed
+ uint32_t closestFixedBits = getClosestFixedBits(option.brBits95p);
+
+ writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits);
+
+ // write patch list
+ closestFixedBits = getClosestFixedBits(option.patchGapWidth + option.patchWidth);
+
+ writeInts(gapVsPatchList, 0, option.patchLength, closestFixedBits);
+
+ // reset run length
+ variableRunLength = 0;
+}
+
+void RleEncoderV2::writeDeltaValues(EncodingOption& option) {
+ uint32_t len = 0;
+ uint32_t fb = option.bitsDeltaMax;
+ uint32_t efb = 0;
+
+ if (alignedBitPacking) {
+ fb = getClosestAlignedFixedBits(fb);
+ }
+
+ if (option.isFixedDelta) {
+ // if fixed run length is greater than threshold then it will be fixed
+ // delta sequence with delta value 0 else fixed delta sequence with
+ // non-zero delta value
+ if (fixedRunLength > MIN_REPEAT) {
+ // ex. sequence: 2 2 2 2 2 2 2 2
+ len = fixedRunLength - 1;
+ fixedRunLength = 0;
+ } else {
+ // ex. sequence: 4 6 8 10 12 14 16
+ len = variableRunLength - 1;
+ variableRunLength = 0;
+ }
+ } else {
+ // fixed width 0 is used for long repeating values.
+ // sequences that require only 1 bit to encode will have an additional bit
+ if (fb == 1) {
+ fb = 2;
+ }
+ efb = encodeBitWidth(fb) << 1;
+ len = variableRunLength - 1;
+ variableRunLength = 0;
+ }
+
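+ // the DELTA header is two bytes: 2 bits for the encoding (11), 5 bits for
+ // the delta bit width (0 when the delta is fixed) and 9 bits holding len
+ // (the run length - 1); it is followed by the varint base value and then
+ // the delta data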
+ // extract the 9th bit of run length
+ const uint32_t tailBits = (len & 0x100) >> 8;
+
+ // create first byte of the header
+ const char headerFirstByte = static_cast<char>(getOpCode(DELTA) | efb | tailBits);
+
+ // second byte of the header stores the remaining 8 bits of runlength
+ const char headerSecondByte = static_cast<char>(len & 0xff);
+
+ // write header
+ writeByte(headerFirstByte);
+ writeByte(headerSecondByte);
+
+ // write the first literal as a varint (zigzag encoded when signed)
+ if (isSigned) {
+ writeVslong(literals[0]);
+ } else {
+ writeVulong(literals[0]);
+ }
+
+ if (option.isFixedDelta) {
+ // if delta is fixed then we don't need to store delta blob
+ writeVslong(option.fixedDelta);
+ } else {
+ // store the first delta value using zigzag (signed varint) encoding
+ writeVslong(adjDeltas[0]);
+
+ // adjacent delta values are bit packed. The length of adjDeltas array is
+ // always one less than the number of literals (delta difference for n
+ // elements is n-1). We have already written one element, write the
+ // remaining numLiterals - 2 elements here
+ writeInts(adjDeltas, 1, numLiterals - 2, fb);
+ }
+}
+
+void RleEncoderV2::writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize) {
+ if(input == nullptr || len < 1 || bitSize < 1) {
+ return;
+ }
+
+ if (getClosestAlignedFixedBits(bitSize) == bitSize) {
+ uint32_t numBytes;
+ uint32_t endOffSet = static_cast<uint32_t>(offset + len);
if (bitSize < 8 ) {
- char bitMask = static_cast<char>((1 << bitSize) - 1);
- uint32_t numHops = 8 / bitSize;
- uint32_t remainder = static_cast<uint32_t>(len % numHops);
- uint32_t endUnroll = endOffSet - remainder;
- for (uint32_t i = offset; i < endUnroll; i+=numHops) {
- char toWrite = 0;
- for (uint32_t j = 0; j < numHops; ++j) {
- toWrite |= static_cast<char>((input[i+j] & bitMask) << (8 - (j + 1) * bitSize));
- }
- writeByte(toWrite);
- }
-
- if (remainder > 0) {
- uint32_t startShift = 8 - bitSize;
- char toWrite = 0;
- for (uint32_t i = endUnroll; i < endOffSet; ++i) {
- toWrite |= static_cast<char>((input[i] & bitMask) << startShift);
- startShift -= bitSize;
- }
- writeByte(toWrite);
- }
-
- } else {
- numBytes = bitSize / 8;
-
- for (uint32_t i = offset; i < endOffSet; ++i) {
- for (uint32_t j = 0; j < numBytes; ++j) {
- char toWrite = static_cast<char>((input[i] >> (8 * (numBytes - j - 1))) & 255);
- writeByte(toWrite);
- }
- }
- }
-
- return;
- }
-
- // write for unaligned bit size
- uint32_t bitsLeft = 8;
- char current = 0;
- for(uint32_t i = offset; i < (offset + len); i++) {
- int64_t value = input[i];
- uint32_t bitsToWrite = bitSize;
- while (bitsToWrite > bitsLeft) {
- // add the bits to the bottom of the current word
- current |= static_cast<char>(value >> (bitsToWrite - bitsLeft));
- // subtract out the bits we just added
- bitsToWrite -= bitsLeft;
- // zero out the bits above bitsToWrite
- value &= (static_cast<uint64_t>(1) << bitsToWrite) - 1;
- writeByte(current);
- current = 0;
- bitsLeft = 8;
- }
- bitsLeft -= bitsToWrite;
- current |= static_cast<char>(value << bitsLeft);
- if (bitsLeft == 0) {
- writeByte(current);
- current = 0;
- bitsLeft = 8;
- }
- }
-
- // flush
- if (bitsLeft != 8) {
- writeByte(current);
- }
-}
-
-void RleEncoderV2::initializeLiterals(int64_t val) {
- literals[numLiterals++] = val;
- fixedRunLength = 1;
- variableRunLength = 1;
-}
-}
+ char bitMask = static_cast<char>((1 << bitSize) - 1);
+ uint32_t numHops = 8 / bitSize;
+ uint32_t remainder = static_cast<uint32_t>(len % numHops);
+ uint32_t endUnroll = endOffSet - remainder;
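+ // values are packed MSB-first within each byte; e.g. with bitSize 2 the
+ // four values {1, 2, 3, 0} become the single byte 0b01101100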
+ for (uint32_t i = offset; i < endUnroll; i+=numHops) {
+ char toWrite = 0;
+ for (uint32_t j = 0; j < numHops; ++j) {
+ toWrite |= static_cast<char>((input[i+j] & bitMask) << (8 - (j + 1) * bitSize));
+ }
+ writeByte(toWrite);
+ }
+
+ if (remainder > 0) {
+ uint32_t startShift = 8 - bitSize;
+ char toWrite = 0;
+ for (uint32_t i = endUnroll; i < endOffSet; ++i) {
+ toWrite |= static_cast<char>((input[i] & bitMask) << startShift);
+ startShift -= bitSize;
+ }
+ writeByte(toWrite);
+ }
+
+ } else {
+ numBytes = bitSize / 8;
+
+ for (uint32_t i = offset; i < endOffSet; ++i) {
+ for (uint32_t j = 0; j < numBytes; ++j) {
+ char toWrite = static_cast<char>((input[i] >> (8 * (numBytes - j - 1))) & 255);
+ writeByte(toWrite);
+ }
+ }
+ }
+
+ return;
+ }
+
+ // write for unaligned bit size
+ uint32_t bitsLeft = 8;
+ char current = 0;
+ for(uint32_t i = offset; i < (offset + len); i++) {
+ int64_t value = input[i];
+ uint32_t bitsToWrite = bitSize;
+ while (bitsToWrite > bitsLeft) {
+ // add the bits to the bottom of the current word
+ current |= static_cast<char>(value >> (bitsToWrite - bitsLeft));
+ // subtract out the bits we just added
+ bitsToWrite -= bitsLeft;
+ // zero out the bits above bitsToWrite
+ value &= (static_cast<uint64_t>(1) << bitsToWrite) - 1;
+ writeByte(current);
+ current = 0;
+ bitsLeft = 8;
+ }
+ bitsLeft -= bitsToWrite;
+ current |= static_cast<char>(value << bitsLeft);
+ if (bitsLeft == 0) {
+ writeByte(current);
+ current = 0;
+ bitsLeft = 8;
+ }
+ }
+
+ // flush
+ if (bitsLeft != 8) {
+ writeByte(current);
+ }
+}
+
+void RleEncoderV2::initializeLiterals(int64_t val) {
+ literals[numLiterals++] = val;
+ fixedRunLength = 1;
+ variableRunLength = 1;
+}
+}
diff --git a/contrib/libs/apache/orc/c++/src/Statistics.cc b/contrib/libs/apache/orc/c++/src/Statistics.cc
index 2401f5e0cb..f13381b5b0 100644
--- a/contrib/libs/apache/orc/c++/src/Statistics.cc
+++ b/contrib/libs/apache/orc/c++/src/Statistics.cc
@@ -1,408 +1,408 @@
- /**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "orc/Exceptions.hh"
-#include "RLE.hh"
-#include "Statistics.hh"
-
-#include "wrap/coded-stream-wrapper.h"
-
-namespace orc {
-
- ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
- const StatContext& statContext) {
- if (s.has_intstatistics()) {
- return new IntegerColumnStatisticsImpl(s);
- } else if (s.has_doublestatistics()) {
- return new DoubleColumnStatisticsImpl(s);
- } else if (s.has_stringstatistics()) {
- return new StringColumnStatisticsImpl(s, statContext);
- } else if (s.has_bucketstatistics()) {
- return new BooleanColumnStatisticsImpl(s, statContext);
- } else if (s.has_decimalstatistics()) {
- return new DecimalColumnStatisticsImpl(s, statContext);
- } else if (s.has_timestampstatistics()) {
- return new TimestampColumnStatisticsImpl(s, statContext);
- } else if (s.has_datestatistics()) {
- return new DateColumnStatisticsImpl(s, statContext);
- } else if (s.has_binarystatistics()) {
- return new BinaryColumnStatisticsImpl(s, statContext);
- } else {
- return new ColumnStatisticsImpl(s);
- }
- }
-
- StatisticsImpl::StatisticsImpl(const proto::StripeStatistics& stripeStats,
- const StatContext& statContext) {
- for(int i = 0; i < stripeStats.colstats_size(); i++) {
- colStats.push_back(
- convertColumnStatistics(stripeStats.colstats(i), statContext));
- }
- }
-
- StatisticsImpl::StatisticsImpl(const proto::Footer& footer,
- const StatContext& statContext) {
- for(int i = 0; i < footer.statistics_size(); i++) {
- colStats.push_back(
- convertColumnStatistics(footer.statistics(i), statContext));
- }
- }
-
- StatisticsImpl::~StatisticsImpl() {
- for(std::vector<ColumnStatistics*>::iterator ptr = colStats.begin();
- ptr != colStats.end();
- ++ptr) {
- delete *ptr;
- }
- }
-
- Statistics::~Statistics() {
- // PASS
- }
-
- StripeStatistics::~StripeStatistics() {
- // PASS
- }
-
- StripeStatisticsImpl::~StripeStatisticsImpl() {
- // PASS
- }
-
- StripeStatisticsImpl::StripeStatisticsImpl(
- const proto::StripeStatistics& stripeStats,
- std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
- const StatContext& statContext) {
- columnStats.reset(new StatisticsImpl(stripeStats, statContext));
- rowIndexStats.resize(indexStats.size());
- for(size_t i = 0; i < rowIndexStats.size(); i++) {
- for(size_t j = 0; j < indexStats[i].size(); j++) {
- rowIndexStats[i].push_back(
- std::shared_ptr<const ColumnStatistics>(
- convertColumnStatistics(indexStats[i][j], statContext)));
- }
- }
- }
-
-
- ColumnStatistics::~ColumnStatistics() {
- // PASS
- }
-
- BinaryColumnStatistics::~BinaryColumnStatistics() {
- // PASS
- }
-
- BooleanColumnStatistics::~BooleanColumnStatistics() {
- // PASS
- }
-
- DateColumnStatistics::~DateColumnStatistics() {
- // PASS
- }
-
- DecimalColumnStatistics::~DecimalColumnStatistics() {
- // PASS
- }
-
- DoubleColumnStatistics::~DoubleColumnStatistics() {
- // PASS
- }
-
- IntegerColumnStatistics::~IntegerColumnStatistics() {
- // PASS
- }
-
- StringColumnStatistics::~StringColumnStatistics() {
- // PASS
- }
-
- TimestampColumnStatistics::~TimestampColumnStatistics() {
- // PASS
- }
-
- MutableColumnStatistics::~MutableColumnStatistics() {
- // PASS
- }
-
- ColumnStatisticsImpl::~ColumnStatisticsImpl() {
- // PASS
- }
-
- BinaryColumnStatisticsImpl::~BinaryColumnStatisticsImpl() {
- // PASS
- }
-
- BooleanColumnStatisticsImpl::~BooleanColumnStatisticsImpl() {
- // PASS
- }
-
- DateColumnStatisticsImpl::~DateColumnStatisticsImpl() {
- // PASS
- }
-
- DecimalColumnStatisticsImpl::~DecimalColumnStatisticsImpl() {
- // PASS
- }
-
- DoubleColumnStatisticsImpl::~DoubleColumnStatisticsImpl() {
- // PASS
- }
-
- IntegerColumnStatisticsImpl::~IntegerColumnStatisticsImpl() {
- // PASS
- }
-
- StringColumnStatisticsImpl::~StringColumnStatisticsImpl() {
- // PASS
- }
-
- TimestampColumnStatisticsImpl::~TimestampColumnStatisticsImpl() {
- // PASS
- }
-
- ColumnStatisticsImpl::ColumnStatisticsImpl
- (const proto::ColumnStatistics& pb) {
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- }
-
- BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl
- (const proto::ColumnStatistics& pb, const StatContext& statContext){
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (pb.has_binarystatistics() && statContext.correctStats) {
- _stats.setHasTotalLength(pb.binarystatistics().has_sum());
- _stats.setTotalLength(
- static_cast<uint64_t>(pb.binarystatistics().sum()));
- }
- }
-
- BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl
- (const proto::ColumnStatistics& pb, const StatContext& statContext){
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (pb.has_bucketstatistics() && statContext.correctStats) {
- _hasCount = true;
- _trueCount = pb.bucketstatistics().count(0);
- } else {
- _hasCount = false;
- _trueCount = 0;
- }
- }
-
- DateColumnStatisticsImpl::DateColumnStatisticsImpl
- (const proto::ColumnStatistics& pb, const StatContext& statContext){
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (!pb.has_datestatistics() || !statContext.correctStats) {
- // hasMinimum_ is false by default;
- // hasMaximum_ is false by default;
- _stats.setMinimum(0);
- _stats.setMaximum(0);
- } else {
- _stats.setHasMinimum(pb.datestatistics().has_minimum());
- _stats.setHasMaximum(pb.datestatistics().has_maximum());
- _stats.setMinimum(pb.datestatistics().minimum());
- _stats.setMaximum(pb.datestatistics().maximum());
- }
- }
-
- DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl
- (const proto::ColumnStatistics& pb, const StatContext& statContext){
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (pb.has_decimalstatistics() && statContext.correctStats) {
- const proto::DecimalStatistics& stats = pb.decimalstatistics();
- _stats.setHasMinimum(stats.has_minimum());
- _stats.setHasMaximum(stats.has_maximum());
- _stats.setHasSum(stats.has_sum());
-
- _stats.setMinimum(Decimal(stats.minimum()));
- _stats.setMaximum(Decimal(stats.maximum()));
- _stats.setSum(Decimal(stats.sum()));
- }
- }
-
- DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl
- (const proto::ColumnStatistics& pb){
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (!pb.has_doublestatistics()) {
- _stats.setMinimum(0);
- _stats.setMaximum(0);
- _stats.setSum(0);
- }else{
- const proto::DoubleStatistics& stats = pb.doublestatistics();
- _stats.setHasMinimum(stats.has_minimum());
- _stats.setHasMaximum(stats.has_maximum());
- _stats.setHasSum(stats.has_sum());
-
- _stats.setMinimum(stats.minimum());
- _stats.setMaximum(stats.maximum());
- _stats.setSum(stats.sum());
- }
- }
-
- IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl
- (const proto::ColumnStatistics& pb){
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (!pb.has_intstatistics()) {
- _stats.setMinimum(0);
- _stats.setMaximum(0);
- _stats.setSum(0);
- }else{
- const proto::IntegerStatistics& stats = pb.intstatistics();
- _stats.setHasMinimum(stats.has_minimum());
- _stats.setHasMaximum(stats.has_maximum());
- _stats.setHasSum(stats.has_sum());
-
- _stats.setMinimum(stats.minimum());
- _stats.setMaximum(stats.maximum());
- _stats.setSum(stats.sum());
- }
- }
-
- StringColumnStatisticsImpl::StringColumnStatisticsImpl
- (const proto::ColumnStatistics& pb, const StatContext& statContext){
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (!pb.has_stringstatistics() || !statContext.correctStats) {
- _stats.setTotalLength(0);
- }else{
- const proto::StringStatistics& stats = pb.stringstatistics();
- _stats.setHasMinimum(stats.has_minimum());
- _stats.setHasMaximum(stats.has_maximum());
- _stats.setHasTotalLength(stats.has_sum());
-
- _stats.setMinimum(stats.minimum());
- _stats.setMaximum(stats.maximum());
- _stats.setTotalLength(static_cast<uint64_t>(stats.sum()));
- }
- }
-
- TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl
- (const proto::ColumnStatistics& pb, const StatContext& statContext) {
- _stats.setNumberOfValues(pb.numberofvalues());
- _stats.setHasNull(pb.hasnull());
- if (!pb.has_timestampstatistics() || !statContext.correctStats) {
- _stats.setMinimum(0);
- _stats.setMaximum(0);
- _lowerBound = 0;
- _upperBound = 0;
- }else{
- const proto::TimestampStatistics& stats = pb.timestampstatistics();
- _stats.setHasMinimum(
- stats.has_minimumutc() ||
- (stats.has_minimum() && (statContext.writerTimezone != nullptr)));
- _stats.setHasMaximum(
- stats.has_maximumutc() ||
- (stats.has_maximum() && (statContext.writerTimezone != nullptr)));
- _hasLowerBound = stats.has_minimumutc() || stats.has_minimum();
- _hasUpperBound = stats.has_maximumutc() || stats.has_maximum();
-
- // Timestamp stats are stored in milliseconds
- if (stats.has_minimumutc()) {
- int64_t minimum = stats.minimumutc();
- _stats.setMinimum(minimum);
- _lowerBound = minimum;
- } else if (statContext.writerTimezone) {
- int64_t writerTimeSec = stats.minimum() / 1000;
- // multiply the offset by 1000 to convert to millisecond
- int64_t minimum =
- stats.minimum() +
- (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset)
- * 1000;
- _stats.setMinimum(minimum);
- _lowerBound = minimum;
- } else {
- _stats.setMinimum(0);
- // subtract 1 day 1 hour (25 hours) in milliseconds to handle unknown
- // TZ and daylight savings
- _lowerBound = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000);
- }
-
- // Timestamp stats are stored in milliseconds
- if (stats.has_maximumutc()) {
- int64_t maximum = stats.maximumutc();
- _stats.setMaximum(maximum);
- _upperBound = maximum;
- } else if (statContext.writerTimezone) {
- int64_t writerTimeSec = stats.maximum() / 1000;
- // multiply the offset by 1000 to convert to millisecond
- int64_t maximum = stats.maximum() +
- (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset)
- * 1000;
- _stats.setMaximum(maximum);
- _upperBound = maximum;
- } else {
- _stats.setMaximum(0);
- // add 1 day 1 hour (25 hours) in milliseconds to handle unknown
- // TZ and daylight savings
- _upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000);
- }
- // Add 1 millisecond to account for microsecond precision of values
- _upperBound += 1;
- }
- }
-
- std::unique_ptr<MutableColumnStatistics> createColumnStatistics(
- const Type& type) {
- switch (static_cast<int64_t>(type.getKind())) {
- case BOOLEAN:
- return std::unique_ptr<MutableColumnStatistics>(
- new BooleanColumnStatisticsImpl());
- case BYTE:
- case INT:
- case LONG:
- case SHORT:
- return std::unique_ptr<MutableColumnStatistics>(
- new IntegerColumnStatisticsImpl());
- case STRUCT:
- case MAP:
- case LIST:
- case UNION:
- return std::unique_ptr<MutableColumnStatistics>(
- new ColumnStatisticsImpl());
- case FLOAT:
- case DOUBLE:
- return std::unique_ptr<MutableColumnStatistics>(
- new DoubleColumnStatisticsImpl());
- case BINARY:
- return std::unique_ptr<MutableColumnStatistics>(
- new BinaryColumnStatisticsImpl());
- case STRING:
- case CHAR:
- case VARCHAR:
- return std::unique_ptr<MutableColumnStatistics>(
- new StringColumnStatisticsImpl());
- case DATE:
- return std::unique_ptr<MutableColumnStatistics>(
- new DateColumnStatisticsImpl());
- case TIMESTAMP:
- return std::unique_ptr<MutableColumnStatistics>(
- new TimestampColumnStatisticsImpl());
- case DECIMAL:
- return std::unique_ptr<MutableColumnStatistics>(
- new DecimalColumnStatisticsImpl());
- default:
- throw NotImplementedYet("Not supported type: " + type.toString());
- }
- }
-
-}// namespace
+ /**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "RLE.hh"
+#include "Statistics.hh"
+
+#include "wrap/coded-stream-wrapper.h"
+
+namespace orc {
+
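+ // pick the concrete statistics implementation based on which type-specific
+ // sub-message is present in the protobuf ColumnStatistics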
+ ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
+ const StatContext& statContext) {
+ if (s.has_intstatistics()) {
+ return new IntegerColumnStatisticsImpl(s);
+ } else if (s.has_doublestatistics()) {
+ return new DoubleColumnStatisticsImpl(s);
+ } else if (s.has_stringstatistics()) {
+ return new StringColumnStatisticsImpl(s, statContext);
+ } else if (s.has_bucketstatistics()) {
+ return new BooleanColumnStatisticsImpl(s, statContext);
+ } else if (s.has_decimalstatistics()) {
+ return new DecimalColumnStatisticsImpl(s, statContext);
+ } else if (s.has_timestampstatistics()) {
+ return new TimestampColumnStatisticsImpl(s, statContext);
+ } else if (s.has_datestatistics()) {
+ return new DateColumnStatisticsImpl(s, statContext);
+ } else if (s.has_binarystatistics()) {
+ return new BinaryColumnStatisticsImpl(s, statContext);
+ } else {
+ return new ColumnStatisticsImpl(s);
+ }
+ }
+
+ StatisticsImpl::StatisticsImpl(const proto::StripeStatistics& stripeStats,
+ const StatContext& statContext) {
+ for(int i = 0; i < stripeStats.colstats_size(); i++) {
+ colStats.push_back(
+ convertColumnStatistics(stripeStats.colstats(i), statContext));
+ }
+ }
+
+ StatisticsImpl::StatisticsImpl(const proto::Footer& footer,
+ const StatContext& statContext) {
+ for(int i = 0; i < footer.statistics_size(); i++) {
+ colStats.push_back(
+ convertColumnStatistics(footer.statistics(i), statContext));
+ }
+ }
+
+ StatisticsImpl::~StatisticsImpl() {
+ for(std::vector<ColumnStatistics*>::iterator ptr = colStats.begin();
+ ptr != colStats.end();
+ ++ptr) {
+ delete *ptr;
+ }
+ }
+
+ Statistics::~Statistics() {
+ // PASS
+ }
+
+ StripeStatistics::~StripeStatistics() {
+ // PASS
+ }
+
+ StripeStatisticsImpl::~StripeStatisticsImpl() {
+ // PASS
+ }
+
+ StripeStatisticsImpl::StripeStatisticsImpl(
+ const proto::StripeStatistics& stripeStats,
+ std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
+ const StatContext& statContext) {
+ columnStats.reset(new StatisticsImpl(stripeStats, statContext));
+ rowIndexStats.resize(indexStats.size());
+ for(size_t i = 0; i < rowIndexStats.size(); i++) {
+ for(size_t j = 0; j < indexStats[i].size(); j++) {
+ rowIndexStats[i].push_back(
+ std::shared_ptr<const ColumnStatistics>(
+ convertColumnStatistics(indexStats[i][j], statContext)));
+ }
+ }
+ }
+
+
+ ColumnStatistics::~ColumnStatistics() {
+ // PASS
+ }
+
+ BinaryColumnStatistics::~BinaryColumnStatistics() {
+ // PASS
+ }
+
+ BooleanColumnStatistics::~BooleanColumnStatistics() {
+ // PASS
+ }
+
+ DateColumnStatistics::~DateColumnStatistics() {
+ // PASS
+ }
+
+ DecimalColumnStatistics::~DecimalColumnStatistics() {
+ // PASS
+ }
+
+ DoubleColumnStatistics::~DoubleColumnStatistics() {
+ // PASS
+ }
+
+ IntegerColumnStatistics::~IntegerColumnStatistics() {
+ // PASS
+ }
+
+ StringColumnStatistics::~StringColumnStatistics() {
+ // PASS
+ }
+
+ TimestampColumnStatistics::~TimestampColumnStatistics() {
+ // PASS
+ }
+
+ MutableColumnStatistics::~MutableColumnStatistics() {
+ // PASS
+ }
+
+ ColumnStatisticsImpl::~ColumnStatisticsImpl() {
+ // PASS
+ }
+
+ BinaryColumnStatisticsImpl::~BinaryColumnStatisticsImpl() {
+ // PASS
+ }
+
+ BooleanColumnStatisticsImpl::~BooleanColumnStatisticsImpl() {
+ // PASS
+ }
+
+ DateColumnStatisticsImpl::~DateColumnStatisticsImpl() {
+ // PASS
+ }
+
+ DecimalColumnStatisticsImpl::~DecimalColumnStatisticsImpl() {
+ // PASS
+ }
+
+ DoubleColumnStatisticsImpl::~DoubleColumnStatisticsImpl() {
+ // PASS
+ }
+
+ IntegerColumnStatisticsImpl::~IntegerColumnStatisticsImpl() {
+ // PASS
+ }
+
+ StringColumnStatisticsImpl::~StringColumnStatisticsImpl() {
+ // PASS
+ }
+
+ TimestampColumnStatisticsImpl::~TimestampColumnStatisticsImpl() {
+ // PASS
+ }
+
+ ColumnStatisticsImpl::ColumnStatisticsImpl
+ (const proto::ColumnStatistics& pb) {
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
+ }
+
+ BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl
+ (const proto::ColumnStatistics& pb, const StatContext& statContext){
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
+ if (pb.has_binarystatistics() && statContext.correctStats) {
+ _stats.setHasTotalLength(pb.binarystatistics().has_sum());
+ _stats.setTotalLength(
+ static_cast<uint64_t>(pb.binarystatistics().sum()));
+ }
+ }
+
+ BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl
+ (const proto::ColumnStatistics& pb, const StatContext& statContext){
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
+ if (pb.has_bucketstatistics() && statContext.correctStats) {
+ _hasCount = true;
+ _trueCount = pb.bucketstatistics().count(0);
+ } else {
+ _hasCount = false;
+ _trueCount = 0;
+ }
+ }
+
+ DateColumnStatisticsImpl::DateColumnStatisticsImpl
+ (const proto::ColumnStatistics& pb, const StatContext& statContext){
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
+ if (!pb.has_datestatistics() || !statContext.correctStats) {
+ // hasMinimum_ is false by default;
+ // hasMaximum_ is false by default;
+ _stats.setMinimum(0);
+ _stats.setMaximum(0);
+ } else {
+ _stats.setHasMinimum(pb.datestatistics().has_minimum());
+ _stats.setHasMaximum(pb.datestatistics().has_maximum());
+ _stats.setMinimum(pb.datestatistics().minimum());
+ _stats.setMaximum(pb.datestatistics().maximum());
+ }
+ }
+
+ DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl
+ (const proto::ColumnStatistics& pb, const StatContext& statContext){
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
+ if (pb.has_decimalstatistics() && statContext.correctStats) {
+ const proto::DecimalStatistics& stats = pb.decimalstatistics();
+ _stats.setHasMinimum(stats.has_minimum());
+ _stats.setHasMaximum(stats.has_maximum());
+ _stats.setHasSum(stats.has_sum());
+
+ _stats.setMinimum(Decimal(stats.minimum()));
+ _stats.setMaximum(Decimal(stats.maximum()));
+ _stats.setSum(Decimal(stats.sum()));
+ }
+ }
+
+ DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl
+ (const proto::ColumnStatistics& pb){
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
+ if (!pb.has_doublestatistics()) {
+ _stats.setMinimum(0);
+ _stats.setMaximum(0);
+ _stats.setSum(0);
+ }else{
+ const proto::DoubleStatistics& stats = pb.doublestatistics();
+ _stats.setHasMinimum(stats.has_minimum());
+ _stats.setHasMaximum(stats.has_maximum());
+ _stats.setHasSum(stats.has_sum());
+
+ _stats.setMinimum(stats.minimum());
+ _stats.setMaximum(stats.maximum());
+ _stats.setSum(stats.sum());
+ }
+ }
+
+ IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl
+ (const proto::ColumnStatistics& pb){
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
+ if (!pb.has_intstatistics()) {
+ _stats.setMinimum(0);
+ _stats.setMaximum(0);
+ _stats.setSum(0);
+ }else{
+ const proto::IntegerStatistics& stats = pb.intstatistics();
+ _stats.setHasMinimum(stats.has_minimum());
+ _stats.setHasMaximum(stats.has_maximum());
+ _stats.setHasSum(stats.has_sum());
+
+ _stats.setMinimum(stats.minimum());
+ _stats.setMaximum(stats.maximum());
+ _stats.setSum(stats.sum());
+ }
+ }
+
+ StringColumnStatisticsImpl::StringColumnStatisticsImpl
+ (const proto::ColumnStatistics& pb, const StatContext& statContext){
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
+ if (!pb.has_stringstatistics() || !statContext.correctStats) {
+ _stats.setTotalLength(0);
+ }else{
+ const proto::StringStatistics& stats = pb.stringstatistics();
+ _stats.setHasMinimum(stats.has_minimum());
+ _stats.setHasMaximum(stats.has_maximum());
+ _stats.setHasTotalLength(stats.has_sum());
+
+ _stats.setMinimum(stats.minimum());
+ _stats.setMaximum(stats.maximum());
+ _stats.setTotalLength(static_cast<uint64_t>(stats.sum()));
+ }
+ }
+
+ TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl
+ (const proto::ColumnStatistics& pb, const StatContext& statContext) {
+ _stats.setNumberOfValues(pb.numberofvalues());
+ _stats.setHasNull(pb.hasnull());
+ if (!pb.has_timestampstatistics() || !statContext.correctStats) {
+ _stats.setMinimum(0);
+ _stats.setMaximum(0);
+ _lowerBound = 0;
+ _upperBound = 0;
+ }else{
+ const proto::TimestampStatistics& stats = pb.timestampstatistics();
+ _stats.setHasMinimum(
+ stats.has_minimumutc() ||
+ (stats.has_minimum() && (statContext.writerTimezone != nullptr)));
+ _stats.setHasMaximum(
+ stats.has_maximumutc() ||
+ (stats.has_maximum() && (statContext.writerTimezone != nullptr)));
+ _hasLowerBound = stats.has_minimumutc() || stats.has_minimum();
+ _hasUpperBound = stats.has_maximumutc() || stats.has_maximum();
+
+ // Timestamp stats are stored in milliseconds
+ if (stats.has_minimumutc()) {
+ int64_t minimum = stats.minimumutc();
+ _stats.setMinimum(minimum);
+ _lowerBound = minimum;
+ } else if (statContext.writerTimezone) {
+ int64_t writerTimeSec = stats.minimum() / 1000;
+ // multiply the offset by 1000 to convert to millisecond
+ int64_t minimum =
+ stats.minimum() +
+ (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset)
+ * 1000;
+ _stats.setMinimum(minimum);
+ _lowerBound = minimum;
+ } else {
+ _stats.setMinimum(0);
+ // subtract 1 day 1 hour (25 hours) in milliseconds to handle unknown
+ // TZ and daylight savings
+ _lowerBound = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000);
+ }
+
+ // Timestamp stats are stored in milliseconds
+ if (stats.has_maximumutc()) {
+ int64_t maximum = stats.maximumutc();
+ _stats.setMaximum(maximum);
+ _upperBound = maximum;
+ } else if (statContext.writerTimezone) {
+ int64_t writerTimeSec = stats.maximum() / 1000;
+ // multiply the offset by 1000 to convert to millisecond
+ int64_t maximum = stats.maximum() +
+ (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset)
+ * 1000;
+ _stats.setMaximum(maximum);
+ _upperBound = maximum;
+ } else {
+ _stats.setMaximum(0);
+ // add 1 day 1 hour (25 hours) in milliseconds to handle unknown
+ // TZ and daylight savings
+ _upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000);
+ }
+ // Add 1 millisecond to account for microsecond precision of values
+ _upperBound += 1;
+ }
+ }
+
+ std::unique_ptr<MutableColumnStatistics> createColumnStatistics(
+ const Type& type) {
+ switch (static_cast<int64_t>(type.getKind())) {
+ case BOOLEAN:
+ return std::unique_ptr<MutableColumnStatistics>(
+ new BooleanColumnStatisticsImpl());
+ case BYTE:
+ case INT:
+ case LONG:
+ case SHORT:
+ return std::unique_ptr<MutableColumnStatistics>(
+ new IntegerColumnStatisticsImpl());
+ case STRUCT:
+ case MAP:
+ case LIST:
+ case UNION:
+ return std::unique_ptr<MutableColumnStatistics>(
+ new ColumnStatisticsImpl());
+ case FLOAT:
+ case DOUBLE:
+ return std::unique_ptr<MutableColumnStatistics>(
+ new DoubleColumnStatisticsImpl());
+ case BINARY:
+ return std::unique_ptr<MutableColumnStatistics>(
+ new BinaryColumnStatisticsImpl());
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ return std::unique_ptr<MutableColumnStatistics>(
+ new StringColumnStatisticsImpl());
+ case DATE:
+ return std::unique_ptr<MutableColumnStatistics>(
+ new DateColumnStatisticsImpl());
+ case TIMESTAMP:
+ return std::unique_ptr<MutableColumnStatistics>(
+ new TimestampColumnStatisticsImpl());
+ case DECIMAL:
+ return std::unique_ptr<MutableColumnStatistics>(
+ new DecimalColumnStatisticsImpl());
+ default:
+ throw NotImplementedYet("Not supported type: " + type.toString());
+ }
+ }
+
+}// namespace
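
Editor's note (not part of the diff): the Statistics.cc hunk above adds two factory routines — convertColumnStatistics(), which inspects which typed sub-message (intstatistics, doublestatistics, stringstatistics, ...) is present on a proto::ColumnStatistics and builds the matching *ColumnStatisticsImpl, and createColumnStatistics(), which picks a MutableColumnStatistics implementation from the ORC type kind for the writer path. The following is a minimal, hedged sketch of how those reader-side objects surface through the public API declared in orc/OrcFile.hh and orc/Statistics.hh; the exact signatures are assumed from the upstream headers and "example.orc" is a hypothetical file path, so treat this as an illustration rather than part of the commit.

    // Sketch: print the file-level column statistics that
    // convertColumnStatistics() materializes from the footer.
    #include <iostream>
    #include <memory>
    #include "orc/OrcFile.hh"

    int main() {
      orc::ReaderOptions options;
      // readLocalFile/createReader are the public entry points assumed here.
      std::unique_ptr<orc::Reader> reader =
          orc::createReader(orc::readLocalFile("example.orc"), options);

      // Each entry was built by convertColumnStatistics() from a
      // proto::ColumnStatistics message in the file footer.
      std::unique_ptr<orc::Statistics> stats = reader->getStatistics();
      for (uint32_t i = 0; i < stats->getNumberOfColumns(); ++i) {
        const orc::ColumnStatistics* col = stats->getColumnStatistics(i);
        std::cout << "column " << i << ": " << col->toString();
      }
      return 0;
    }
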
diff --git a/contrib/libs/apache/orc/c++/src/Statistics.hh b/contrib/libs/apache/orc/c++/src/Statistics.hh
index ee9db23f86..849019d8d7 100644
--- a/contrib/libs/apache/orc/c++/src/Statistics.hh
+++ b/contrib/libs/apache/orc/c++/src/Statistics.hh
@@ -1,971 +1,971 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ORC_STATISTICS_IMPL_HH
-#define ORC_STATISTICS_IMPL_HH
-
-#include "orc/Common.hh"
-#include "orc/Int128.hh"
-#include "orc/OrcFile.hh"
-#include "orc/Reader.hh"
-
-#include "Timezone.hh"
-#include "TypeImpl.hh"
-
-namespace orc {
-
-/**
- * StatContext contains fields required to compute statistics
- */
-
- struct StatContext {
- const bool correctStats;
- const Timezone* const writerTimezone;
- StatContext() : correctStats(false), writerTimezone(nullptr) {}
- StatContext(bool cStat, const Timezone* const timezone = nullptr) :
- correctStats(cStat), writerTimezone(timezone) {}
- };
-
-/**
- * Internal Statistics Implementation
- */
-
- template <typename T>
- class InternalStatisticsImpl {
- private:
- bool _hasNull;
- bool _hasMinimum;
- bool _hasMaximum;
- bool _hasSum;
- bool _hasTotalLength;
- uint64_t _totalLength;
- uint64_t _valueCount;
- T _minimum;
- T _maximum;
- T _sum;
- public:
- InternalStatisticsImpl() {
- _hasNull = false;
- _hasMinimum = false;
- _hasMaximum = false;
- _hasSum = false;
- _hasTotalLength = false;
- _totalLength = 0;
- _valueCount = 0;
- }
-
- ~InternalStatisticsImpl() {}
-
- // GET / SET _totalLength
- bool hasTotalLength() const { return _hasTotalLength; }
-
- void setHasTotalLength(bool hasTotalLength) {
- _hasTotalLength = hasTotalLength;
- }
-
- uint64_t getTotalLength() const { return _totalLength; }
-
- void setTotalLength(uint64_t totalLength) { _totalLength = totalLength; }
-
- // GET / SET _sum
- bool hasSum() const { return _hasSum; }
-
- void setHasSum(bool hasSum) { _hasSum = hasSum; }
-
- T getSum() const { return _sum; }
-
- void setSum(T sum) { _sum = sum; }
-
- // GET / SET _maximum
- bool hasMaximum() const { return _hasMaximum; }
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_STATISTICS_IMPL_HH
+#define ORC_STATISTICS_IMPL_HH
+
+#include "orc/Common.hh"
+#include "orc/Int128.hh"
+#include "orc/OrcFile.hh"
+#include "orc/Reader.hh"
+
+#include "Timezone.hh"
+#include "TypeImpl.hh"
+
+namespace orc {
+
+/**
+ * StatContext contains fields required to compute statistics
+ */
+
+ struct StatContext {
+ const bool correctStats;
+ const Timezone* const writerTimezone;
+ StatContext() : correctStats(false), writerTimezone(nullptr) {}
+ StatContext(bool cStat, const Timezone* const timezone = nullptr) :
+ correctStats(cStat), writerTimezone(timezone) {}
+ };
+
+/**
+ * Internal Statistics Implementation
+ */
+
+ template <typename T>
+ class InternalStatisticsImpl {
+ private:
+ bool _hasNull;
+ bool _hasMinimum;
+ bool _hasMaximum;
+ bool _hasSum;
+ bool _hasTotalLength;
+ uint64_t _totalLength;
+ uint64_t _valueCount;
+ T _minimum;
+ T _maximum;
+ T _sum;
+ public:
+ InternalStatisticsImpl() {
+ _hasNull = false;
+ _hasMinimum = false;
+ _hasMaximum = false;
+ _hasSum = false;
+ _hasTotalLength = false;
+ _totalLength = 0;
+ _valueCount = 0;
+ }
+
+ ~InternalStatisticsImpl() {}
+
+ // GET / SET _totalLength
+ bool hasTotalLength() const { return _hasTotalLength; }
+
+ void setHasTotalLength(bool hasTotalLength) {
+ _hasTotalLength = hasTotalLength;
+ }
+
+ uint64_t getTotalLength() const { return _totalLength; }
+
+ void setTotalLength(uint64_t totalLength) { _totalLength = totalLength; }
+
+ // GET / SET _sum
+ bool hasSum() const { return _hasSum; }
+
+ void setHasSum(bool hasSum) { _hasSum = hasSum; }
+
+ T getSum() const { return _sum; }
+
+ void setSum(T sum) { _sum = sum; }
+
+ // GET / SET _maximum
+ bool hasMaximum() const { return _hasMaximum; }
+
const T & getMaximum() const { return _maximum; }
-
- void setHasMaximum(bool hasMax) { _hasMaximum = hasMax; }
-
- void setMaximum(T max) { _maximum = max; }
-
- // GET / SET _minimum
- bool hasMinimum() const { return _hasMinimum; }
-
- void setHasMinimum(bool hasMin) { _hasMinimum = hasMin; }
-
+
+ void setHasMaximum(bool hasMax) { _hasMaximum = hasMax; }
+
+ void setMaximum(T max) { _maximum = max; }
+
+ // GET / SET _minimum
+ bool hasMinimum() const { return _hasMinimum; }
+
+ void setHasMinimum(bool hasMin) { _hasMinimum = hasMin; }
+
const T & getMinimum() const { return _minimum; }
-
- void setMinimum(T min) { _minimum = min; }
-
- // GET / SET _valueCount
- uint64_t getNumberOfValues() const { return _valueCount; }
-
- void setNumberOfValues(uint64_t numValues) { _valueCount = numValues; }
-
- // GET / SET _hasNullValue
- bool hasNull() const { return _hasNull; }
-
- void setHasNull(bool hasNull) { _hasNull = hasNull; }
-
- void reset() {
- _hasNull = false;
- _hasMinimum = false;
- _hasMaximum = false;
- _hasSum = false;
- _hasTotalLength = false;
- _totalLength = 0;
- _valueCount = 0;
- }
-
- void updateMinMax(T value) {
- if (!_hasMinimum) {
- _hasMinimum = _hasMaximum = true;
- _minimum = _maximum = value;
- } else if (compare(value, _minimum)) {
- _minimum = value;
- } else if (compare(_maximum, value)) {
- _maximum = value;
- }
- }
-
- // sum is not merged here as we need to check overflow
- void merge(const InternalStatisticsImpl& other) {
- _hasNull = _hasNull || other._hasNull;
- _valueCount += other._valueCount;
-
- if (other._hasMinimum) {
- if (!_hasMinimum) {
- _hasMinimum = _hasMaximum = true;
- _minimum = other._minimum;
- _maximum = other._maximum;
- } else {
- // all template types should support operator<
- if (compare(_maximum, other._maximum)) {
- _maximum = other._maximum;
- }
- if (compare(other._minimum, _minimum)) {
- _minimum = other._minimum;
- }
- }
- }
-
- _hasTotalLength = _hasTotalLength && other._hasTotalLength;
- _totalLength += other._totalLength;
- }
- };
-
- typedef InternalStatisticsImpl<char> InternalCharStatistics;
- typedef InternalStatisticsImpl<char> InternalBooleanStatistics;
- typedef InternalStatisticsImpl<int64_t> InternalIntegerStatistics;
- typedef InternalStatisticsImpl<int32_t> InternalDateStatistics;
- typedef InternalStatisticsImpl<double> InternalDoubleStatistics;
- typedef InternalStatisticsImpl<Decimal> InternalDecimalStatistics;
- typedef InternalStatisticsImpl<std::string> InternalStringStatistics;
-
- /**
- * Mutable column statistics for use by the writer.
- */
- class MutableColumnStatistics {
- public:
- virtual ~MutableColumnStatistics();
-
- virtual void increase(uint64_t count) = 0;
-
- virtual void setNumberOfValues(uint64_t value) = 0;
-
- virtual void setHasNull(bool hasNull) = 0;
-
- virtual void merge(const MutableColumnStatistics& other) = 0;
-
- virtual void reset() = 0;
-
- virtual void toProtoBuf(proto::ColumnStatistics& pbStats) const = 0;
- };
-
-/**
- * ColumnStatistics Implementation
- */
-
- class ColumnStatisticsImpl: public ColumnStatistics,
- public MutableColumnStatistics {
- private:
- InternalCharStatistics _stats;
- public:
- ColumnStatisticsImpl() { reset(); }
- ColumnStatisticsImpl(const proto::ColumnStatistics& stats);
- virtual ~ColumnStatisticsImpl() override;
-
- uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
- }
-
- void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
- }
-
- void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
- }
-
- bool hasNull() const override {
- return _stats.hasNull();
- }
-
- void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
- }
-
- void merge(const MutableColumnStatistics& other) override {
- _stats.merge(dynamic_cast<const ColumnStatisticsImpl&>(other)._stats);
- }
-
- void reset() override {
- _stats.reset();
- }
-
- void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
- }
-
- std::string toString() const override {
- std::ostringstream buffer;
- buffer << "Column has " << getNumberOfValues() << " values"
- << " and has null value: " << (hasNull() ? "yes" : "no")
- << std::endl;
- return buffer.str();
- }
- };
-
- class BinaryColumnStatisticsImpl: public BinaryColumnStatistics,
- public MutableColumnStatistics {
- private:
- InternalCharStatistics _stats;
- public:
- BinaryColumnStatisticsImpl() { reset(); }
- BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats,
- const StatContext& statContext);
- virtual ~BinaryColumnStatisticsImpl() override;
-
- uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
- }
-
- void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
- }
-
- void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
- }
-
- bool hasNull() const override {
- return _stats.hasNull();
- }
-
- void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
- }
-
- bool hasTotalLength() const override {
- return _stats.hasTotalLength();
- }
-
- uint64_t getTotalLength() const override {
- if(hasTotalLength()){
- return _stats.getTotalLength();
- }else{
- throw ParseError("Total length is not defined.");
- }
- }
-
- void setTotalLength(uint64_t length) {
- _stats.setHasTotalLength(true);
- _stats.setTotalLength(length);
- }
-
- void update(size_t length) {
- _stats.setTotalLength(_stats.getTotalLength() + length);
- }
-
- void merge(const MutableColumnStatistics& other) override {
- const BinaryColumnStatisticsImpl& binStats =
- dynamic_cast<const BinaryColumnStatisticsImpl&>(other);
- _stats.merge(binStats._stats);
- }
-
- void reset() override {
- _stats.reset();
- setTotalLength(0);
- }
-
- void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
-
- proto::BinaryStatistics* binStats = pbStats.mutable_binarystatistics();
- binStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
- }
-
- std::string toString() const override {
- std::ostringstream buffer;
- buffer << "Data type: Binary" << std::endl
- << "Values: " << getNumberOfValues() << std::endl
- << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasTotalLength()){
- buffer << "Total length: " << getTotalLength() << std::endl;
- }else{
- buffer << "Total length: not defined" << std::endl;
- }
- return buffer.str();
- }
- };
-
- class BooleanColumnStatisticsImpl: public BooleanColumnStatistics,
- public MutableColumnStatistics {
- private:
- InternalBooleanStatistics _stats;
- bool _hasCount;
- uint64_t _trueCount;
-
- public:
- BooleanColumnStatisticsImpl() { reset(); }
- BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats,
- const StatContext& statContext);
- virtual ~BooleanColumnStatisticsImpl() override;
-
- bool hasCount() const override {
- return _hasCount;
- }
-
- void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
- _hasCount = true;
- }
-
- uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
- }
-
- void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
- }
-
- bool hasNull() const override {
- return _stats.hasNull();
- }
-
- void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
- }
-
- uint64_t getFalseCount() const override {
- if(hasCount()){
- return getNumberOfValues() - _trueCount;
- }else{
- throw ParseError("False count is not defined.");
- }
- }
-
- uint64_t getTrueCount() const override {
- if(hasCount()){
- return _trueCount;
- }else{
- throw ParseError("True count is not defined.");
- }
- }
-
- void setTrueCount(uint64_t trueCount) {
- _hasCount = true;
- _trueCount = trueCount;
- }
-
- void update(bool value, size_t repetitions) {
- if (value) {
- _trueCount += repetitions;
- }
- }
-
- void merge(const MutableColumnStatistics& other) override {
- const BooleanColumnStatisticsImpl& boolStats =
- dynamic_cast<const BooleanColumnStatisticsImpl&>(other);
- _stats.merge(boolStats._stats);
- _hasCount = _hasCount && boolStats._hasCount;
- _trueCount += boolStats._trueCount;
- }
-
- void reset() override {
- _stats.reset();
- setTrueCount(0);
- }
-
- void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
-
- proto::BucketStatistics* bucketStats = pbStats.mutable_bucketstatistics();
- if (_hasCount) {
- bucketStats->add_count(_trueCount);
- } else {
- bucketStats->clear_count();
- }
- }
-
- std::string toString() const override {
- std::ostringstream buffer;
- buffer << "Data type: Boolean" << std::endl
- << "Values: " << getNumberOfValues() << std::endl
- << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasCount()){
- buffer << "(true: " << getTrueCount() << "; false: "
- << getFalseCount() << ")" << std::endl;
- } else {
- buffer << "(true: not defined; false: not defined)" << std::endl;
- buffer << "True and false counts are not defined" << std::endl;
- }
- return buffer.str();
- }
- };
-
- class DateColumnStatisticsImpl: public DateColumnStatistics,
- public MutableColumnStatistics{
- private:
- InternalDateStatistics _stats;
- public:
- DateColumnStatisticsImpl() { reset(); }
- DateColumnStatisticsImpl(const proto::ColumnStatistics& stats,
- const StatContext& statContext);
- virtual ~DateColumnStatisticsImpl() override;
-
- bool hasMinimum() const override {
- return _stats.hasMinimum();
- }
-
- bool hasMaximum() const override {
- return _stats.hasMaximum();
- }
-
- void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
- }
-
- uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
- }
-
- void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
- }
-
- bool hasNull() const override {
- return _stats.hasNull();
- }
-
- void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
- }
-
- int32_t getMinimum() const override {
- if(hasMinimum()){
- return _stats.getMinimum();
- }else{
- throw ParseError("Minimum is not defined.");
- }
- }
-
- int32_t getMaximum() const override {
- if(hasMaximum()){
- return _stats.getMaximum();
- }else{
- throw ParseError("Maximum is not defined.");
- }
- }
-
- void setMinimum(int32_t minimum) {
- _stats.setHasMinimum(true);
- _stats.setMinimum(minimum);
- }
-
- void setMaximum(int32_t maximum) {
- _stats.setHasMaximum(true);
- _stats.setMaximum(maximum);
- }
-
- void update(int32_t value) {
- _stats.updateMinMax(value);
- }
-
- void merge(const MutableColumnStatistics& other) override {
- const DateColumnStatisticsImpl& dateStats =
- dynamic_cast<const DateColumnStatisticsImpl&>(other);
- _stats.merge(dateStats._stats);
- }
-
- void reset() override {
- _stats.reset();
- }
-
- void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
-
- proto::DateStatistics* dateStatistics =
- pbStats.mutable_datestatistics();
- if (_stats.hasMinimum()) {
- dateStatistics->set_maximum(_stats.getMaximum());
- dateStatistics->set_minimum(_stats.getMinimum());
- } else {
- dateStatistics->clear_minimum();
- dateStatistics->clear_maximum();
- }
- }
-
- std::string toString() const override {
- std::ostringstream buffer;
- buffer << "Data type: Date" << std::endl
- << "Values: " << getNumberOfValues() << std::endl
- << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasMinimum()){
- buffer << "Minimum: " << getMinimum() << std::endl;
- }else{
- buffer << "Minimum: not defined" << std::endl;
- }
-
- if(hasMaximum()){
- buffer << "Maximum: " << getMaximum() << std::endl;
- }else{
- buffer << "Maximum: not defined" << std::endl;
- }
- return buffer.str();
- }
- };
-
- class DecimalColumnStatisticsImpl: public DecimalColumnStatistics,
- public MutableColumnStatistics {
- private:
- InternalDecimalStatistics _stats;
-
- public:
- DecimalColumnStatisticsImpl() { reset(); }
- DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats,
- const StatContext& statContext);
- virtual ~DecimalColumnStatisticsImpl() override;
-
- bool hasMinimum() const override {
- return _stats.hasMinimum();
- }
-
- bool hasMaximum() const override {
- return _stats.hasMaximum();
- }
-
- bool hasSum() const override {
- return _stats.hasSum();
- }
-
- void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
- }
-
- uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
- }
-
- void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
- }
-
- bool hasNull() const override {
- return _stats.hasNull();
- }
-
- void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
- }
-
- Decimal getMinimum() const override {
- if(hasMinimum()){
- return _stats.getMinimum();
- }else{
- throw ParseError("Minimum is not defined.");
- }
- }
-
- Decimal getMaximum() const override {
- if(hasMaximum()){
- return _stats.getMaximum();
- }else{
- throw ParseError("Maximum is not defined.");
- }
- }
-
- void setMinimum(Decimal minimum) {
- _stats.setHasMinimum(true);
- _stats.setMinimum(minimum);
- }
-
- void setMaximum(Decimal maximum) {
- _stats.setHasMaximum(true);
- _stats.setMaximum(maximum);
- }
-
- Decimal getSum() const override {
- if(hasSum()){
- return _stats.getSum();
- }else{
- throw ParseError("Sum is not defined.");
- }
- }
-
- void setSum(Decimal sum) {
- _stats.setHasSum(true);
- _stats.setSum(sum);
- }
-
- void update(const Decimal& value) {
- _stats.updateMinMax(value);
-
- if (_stats.hasSum()) {
- updateSum(value);
- }
- }
-
- void merge(const MutableColumnStatistics& other) override {
- const DecimalColumnStatisticsImpl& decStats =
- dynamic_cast<const DecimalColumnStatisticsImpl&>(other);
-
- _stats.merge(decStats._stats);
-
- _stats.setHasSum(_stats.hasSum() && decStats.hasSum());
- if (_stats.hasSum()) {
- updateSum(decStats.getSum());
- }
- }
-
- void reset() override {
- _stats.reset();
- setSum(Decimal());
- }
-
- void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
-
- proto::DecimalStatistics* decStats = pbStats.mutable_decimalstatistics();
- if (_stats.hasMinimum()) {
- decStats->set_minimum(TString(_stats.getMinimum().toString()));
- decStats->set_maximum(TString(_stats.getMaximum().toString()));
- } else {
- decStats->clear_minimum();
- decStats->clear_maximum();
- }
- if (_stats.hasSum()) {
- decStats->set_sum(TString(_stats.getSum().toString()));
- } else {
- decStats->clear_sum();
- }
- }
-
- std::string toString() const override {
- std::ostringstream buffer;
- buffer << "Data type: Decimal" << std::endl
- << "Values: " << getNumberOfValues() << std::endl
- << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasMinimum()){
- buffer << "Minimum: " << getMinimum().toString() << std::endl;
- }else{
- buffer << "Minimum: not defined" << std::endl;
- }
-
- if(hasMaximum()){
- buffer << "Maximum: " << getMaximum().toString() << std::endl;
- }else{
- buffer << "Maximum: not defined" << std::endl;
- }
-
- if(hasSum()){
- buffer << "Sum: " << getSum().toString() << std::endl;
- }else{
- buffer << "Sum: not defined" << std::endl;
- }
-
- return buffer.str();
- }
-
- private:
- void updateSum(Decimal value) {
- if (_stats.hasSum()) {
- bool overflow = false;
- Decimal sum = _stats.getSum();
- if (sum.scale > value.scale) {
- value.value = scaleUpInt128ByPowerOfTen(value.value,
- sum.scale - value.scale,
- overflow);
- } else if (sum.scale < value.scale) {
- sum.value = scaleUpInt128ByPowerOfTen(sum.value,
- value.scale - sum.scale,
- overflow);
- sum.scale = value.scale;
- }
-
- if (!overflow) {
- bool wasPositive = sum.value >= 0;
- sum.value += value.value;
- if ((value.value >= 0) == wasPositive) {
- _stats.setHasSum((sum.value >= 0) == wasPositive);
- }
- } else {
- _stats.setHasSum(false);
- }
-
- if (_stats.hasSum()) {
- _stats.setSum(sum);
- }
- }
- }
- };
-
- class DoubleColumnStatisticsImpl: public DoubleColumnStatistics,
- public MutableColumnStatistics {
- private:
- InternalDoubleStatistics _stats;
- public:
- DoubleColumnStatisticsImpl() { reset(); }
- DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats);
- virtual ~DoubleColumnStatisticsImpl() override;
-
- bool hasMinimum() const override {
- return _stats.hasMinimum();
- }
-
- bool hasMaximum() const override {
- return _stats.hasMaximum();
- }
-
- bool hasSum() const override {
- return _stats.hasSum();
- }
-
- void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
- }
-
- uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
- }
-
- void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
- }
-
- bool hasNull() const override {
- return _stats.hasNull();
- }
-
- void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
- }
-
- double getMinimum() const override {
- if(hasMinimum()){
- return _stats.getMinimum();
- }else{
- throw ParseError("Minimum is not defined.");
- }
- }
-
- double getMaximum() const override {
- if(hasMaximum()){
- return _stats.getMaximum();
- }else{
- throw ParseError("Maximum is not defined.");
- }
- }
-
- void setMinimum(double minimum) {
- _stats.setHasMinimum(true);
- _stats.setMinimum(minimum);
- }
-
- void setMaximum(double maximum) {
- _stats.setHasMaximum(true);
- _stats.setMaximum(maximum);
- }
-
- double getSum() const override {
- if(hasSum()){
- return _stats.getSum();
- }else{
- throw ParseError("Sum is not defined.");
- }
- }
-
- void setSum(double sum) {
- _stats.setHasSum(true);
- _stats.setSum(sum);
- }
-
- void update(double value) {
- _stats.updateMinMax(value);
- _stats.setSum(_stats.getSum() + value);
- }
-
- void merge(const MutableColumnStatistics& other) override {
- const DoubleColumnStatisticsImpl& doubleStats =
- dynamic_cast<const DoubleColumnStatisticsImpl&>(other);
- _stats.merge(doubleStats._stats);
-
- _stats.setHasSum(_stats.hasSum() && doubleStats.hasSum());
- if (_stats.hasSum()) {
- _stats.setSum(_stats.getSum() + doubleStats.getSum());
- }
- }
-
- void reset() override {
- _stats.reset();
- setSum(0.0);
- }
-
- void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
-
- proto::DoubleStatistics* doubleStats = pbStats.mutable_doublestatistics();
- if (_stats.hasMinimum()) {
- doubleStats->set_minimum(_stats.getMinimum());
- doubleStats->set_maximum(_stats.getMaximum());
- } else {
- doubleStats->clear_minimum();
- doubleStats->clear_maximum();
- }
- if (_stats.hasSum()) {
- doubleStats->set_sum(_stats.getSum());
- } else {
- doubleStats->clear_sum();
- }
- }
-
- std::string toString() const override {
- std::ostringstream buffer;
- buffer << "Data type: Double" << std::endl
- << "Values: " << getNumberOfValues() << std::endl
- << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasMinimum()){
- buffer << "Minimum: " << getMinimum() << std::endl;
- }else{
- buffer << "Minimum: not defined" << std::endl;
- }
-
- if(hasMaximum()){
- buffer << "Maximum: " << getMaximum() << std::endl;
- }else{
- buffer << "Maximum: not defined" << std::endl;
- }
-
- if(hasSum()){
- buffer << "Sum: " << getSum() << std::endl;
- }else{
- buffer << "Sum: not defined" << std::endl;
- }
- return buffer.str();
- }
- };
-
- class IntegerColumnStatisticsImpl: public IntegerColumnStatistics,
- public MutableColumnStatistics {
- private:
- InternalIntegerStatistics _stats;
- public:
- IntegerColumnStatisticsImpl() { reset(); }
- IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats);
- virtual ~IntegerColumnStatisticsImpl() override;
-
- bool hasMinimum() const override {
- return _stats.hasMinimum();
- }
-
- bool hasMaximum() const override {
- return _stats.hasMaximum();
- }
-
- bool hasSum() const override {
- return _stats.hasSum();
- }
-
- void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
- }
-
- uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
- }
-
- void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
- }
-
- bool hasNull() const override {
- return _stats.hasNull();
- }
-
- void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
- }
-
- int64_t getMinimum() const override {
- if(hasMinimum()){
- return _stats.getMinimum();
- }else{
- throw ParseError("Minimum is not defined.");
- }
- }
-
- int64_t getMaximum() const override {
- if(hasMaximum()){
- return _stats.getMaximum();
- }else{
- throw ParseError("Maximum is not defined.");
- }
- }
-
- void setMinimum(int64_t minimum) {
- _stats.setHasMinimum(true);
- _stats.setMinimum(minimum);
- }
-
- void setMaximum(int64_t maximum) {
- _stats.setHasMaximum(true);
- _stats.setMaximum(maximum);
- }
-
- int64_t getSum() const override {
- if(hasSum()){
- return _stats.getSum();
- }else{
- throw ParseError("Sum is not defined.");
- }
- }
-
- void setSum(int64_t sum) {
- _stats.setHasSum(true);
- _stats.setSum(sum);
- }
-
+
+ void setMinimum(T min) { _minimum = min; }
+
+ // GET / SET _valueCount
+ uint64_t getNumberOfValues() const { return _valueCount; }
+
+ void setNumberOfValues(uint64_t numValues) { _valueCount = numValues; }
+
+ // GET / SET _hasNullValue
+ bool hasNull() const { return _hasNull; }
+
+ void setHasNull(bool hasNull) { _hasNull = hasNull; }
+
+ void reset() {
+ _hasNull = false;
+ _hasMinimum = false;
+ _hasMaximum = false;
+ _hasSum = false;
+ _hasTotalLength = false;
+ _totalLength = 0;
+ _valueCount = 0;
+ }
+
+ void updateMinMax(T value) {
+ if (!_hasMinimum) {
+ _hasMinimum = _hasMaximum = true;
+ _minimum = _maximum = value;
+ } else if (compare(value, _minimum)) {
+ _minimum = value;
+ } else if (compare(_maximum, value)) {
+ _maximum = value;
+ }
+ }
+
+ // sum is not merged here as we need to check overflow
+ void merge(const InternalStatisticsImpl& other) {
+ _hasNull = _hasNull || other._hasNull;
+ _valueCount += other._valueCount;
+
+ if (other._hasMinimum) {
+ if (!_hasMinimum) {
+ _hasMinimum = _hasMaximum = true;
+ _minimum = other._minimum;
+ _maximum = other._maximum;
+ } else {
+ // all template types should support operator<
+ if (compare(_maximum, other._maximum)) {
+ _maximum = other._maximum;
+ }
+ if (compare(other._minimum, _minimum)) {
+ _minimum = other._minimum;
+ }
+ }
+ }
+
+ _hasTotalLength = _hasTotalLength && other._hasTotalLength;
+ _totalLength += other._totalLength;
+ }
+ };
+
+ typedef InternalStatisticsImpl<char> InternalCharStatistics;
+ typedef InternalStatisticsImpl<char> InternalBooleanStatistics;
+ typedef InternalStatisticsImpl<int64_t> InternalIntegerStatistics;
+ typedef InternalStatisticsImpl<int32_t> InternalDateStatistics;
+ typedef InternalStatisticsImpl<double> InternalDoubleStatistics;
+ typedef InternalStatisticsImpl<Decimal> InternalDecimalStatistics;
+ typedef InternalStatisticsImpl<std::string> InternalStringStatistics;
+
+ /**
+ * Mutable column statistics for use by the writer.
+ */
+ class MutableColumnStatistics {
+ public:
+ virtual ~MutableColumnStatistics();
+
+ virtual void increase(uint64_t count) = 0;
+
+ virtual void setNumberOfValues(uint64_t value) = 0;
+
+ virtual void setHasNull(bool hasNull) = 0;
+
+ virtual void merge(const MutableColumnStatistics& other) = 0;
+
+ virtual void reset() = 0;
+
+ virtual void toProtoBuf(proto::ColumnStatistics& pbStats) const = 0;
+ };
+
+/**
+ * ColumnStatistics Implementation
+ */
+
+ class ColumnStatisticsImpl: public ColumnStatistics,
+ public MutableColumnStatistics {
+ private:
+ InternalCharStatistics _stats;
+ public:
+ ColumnStatisticsImpl() { reset(); }
+ ColumnStatisticsImpl(const proto::ColumnStatistics& stats);
+ virtual ~ColumnStatisticsImpl() override;
+
+ uint64_t getNumberOfValues() const override {
+ return _stats.getNumberOfValues();
+ }
+
+ void setNumberOfValues(uint64_t value) override {
+ _stats.setNumberOfValues(value);
+ }
+
+ void increase(uint64_t count) override {
+ _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
+ }
+
+ void setHasNull(bool hasNull) override {
+ _stats.setHasNull(hasNull);
+ }
+
+ void merge(const MutableColumnStatistics& other) override {
+ _stats.merge(dynamic_cast<const ColumnStatisticsImpl&>(other)._stats);
+ }
+
+ void reset() override {
+ _stats.reset();
+ }
+
+ void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
+ pbStats.set_hasnull(_stats.hasNull());
+ pbStats.set_numberofvalues(_stats.getNumberOfValues());
+ }
+
+ std::string toString() const override {
+ std::ostringstream buffer;
+ buffer << "Column has " << getNumberOfValues() << " values"
+ << " and has null value: " << (hasNull() ? "yes" : "no")
+ << std::endl;
+ return buffer.str();
+ }
+ };
+
+ class BinaryColumnStatisticsImpl: public BinaryColumnStatistics,
+ public MutableColumnStatistics {
+ private:
+ InternalCharStatistics _stats;
+ public:
+ BinaryColumnStatisticsImpl() { reset(); }
+ BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats,
+ const StatContext& statContext);
+ virtual ~BinaryColumnStatisticsImpl() override;
+
+ uint64_t getNumberOfValues() const override {
+ return _stats.getNumberOfValues();
+ }
+
+ void setNumberOfValues(uint64_t value) override {
+ _stats.setNumberOfValues(value);
+ }
+
+ void increase(uint64_t count) override {
+ _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
+ }
+
+ void setHasNull(bool hasNull) override {
+ _stats.setHasNull(hasNull);
+ }
+
+ bool hasTotalLength() const override {
+ return _stats.hasTotalLength();
+ }
+
+ uint64_t getTotalLength() const override {
+ if(hasTotalLength()){
+ return _stats.getTotalLength();
+ }else{
+ throw ParseError("Total length is not defined.");
+ }
+ }
+
+ void setTotalLength(uint64_t length) {
+ _stats.setHasTotalLength(true);
+ _stats.setTotalLength(length);
+ }
+
+ void update(size_t length) {
+ _stats.setTotalLength(_stats.getTotalLength() + length);
+ }
+
+ void merge(const MutableColumnStatistics& other) override {
+ const BinaryColumnStatisticsImpl& binStats =
+ dynamic_cast<const BinaryColumnStatisticsImpl&>(other);
+ _stats.merge(binStats._stats);
+ }
+
+ void reset() override {
+ _stats.reset();
+ setTotalLength(0);
+ }
+
+ void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
+ pbStats.set_hasnull(_stats.hasNull());
+ pbStats.set_numberofvalues(_stats.getNumberOfValues());
+
+ proto::BinaryStatistics* binStats = pbStats.mutable_binarystatistics();
+ binStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
+ }
+
+ std::string toString() const override {
+ std::ostringstream buffer;
+ buffer << "Data type: Binary" << std::endl
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasTotalLength()){
+ buffer << "Total length: " << getTotalLength() << std::endl;
+ }else{
+ buffer << "Total length: not defined" << std::endl;
+ }
+ return buffer.str();
+ }
+ };
+
+ class BooleanColumnStatisticsImpl: public BooleanColumnStatistics,
+ public MutableColumnStatistics {
+ private:
+ InternalBooleanStatistics _stats;
+ bool _hasCount;
+ uint64_t _trueCount;
+
+ public:
+ BooleanColumnStatisticsImpl() { reset(); }
+ BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats,
+ const StatContext& statContext);
+ virtual ~BooleanColumnStatisticsImpl() override;
+
+ bool hasCount() const override {
+ return _hasCount;
+ }
+
+ void increase(uint64_t count) override {
+ _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ _hasCount = true;
+ }
+
+ uint64_t getNumberOfValues() const override {
+ return _stats.getNumberOfValues();
+ }
+
+ void setNumberOfValues(uint64_t value) override {
+ _stats.setNumberOfValues(value);
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
+ }
+
+ void setHasNull(bool hasNull) override {
+ _stats.setHasNull(hasNull);
+ }
+
+ uint64_t getFalseCount() const override {
+ if(hasCount()){
+ return getNumberOfValues() - _trueCount;
+ }else{
+ throw ParseError("False count is not defined.");
+ }
+ }
+
+ uint64_t getTrueCount() const override {
+ if(hasCount()){
+ return _trueCount;
+ }else{
+ throw ParseError("True count is not defined.");
+ }
+ }
+
+ void setTrueCount(uint64_t trueCount) {
+ _hasCount = true;
+ _trueCount = trueCount;
+ }
+
+ void update(bool value, size_t repetitions) {
+ if (value) {
+ _trueCount += repetitions;
+ }
+ }
+
+ void merge(const MutableColumnStatistics& other) override {
+ const BooleanColumnStatisticsImpl& boolStats =
+ dynamic_cast<const BooleanColumnStatisticsImpl&>(other);
+ _stats.merge(boolStats._stats);
+ _hasCount = _hasCount && boolStats._hasCount;
+ _trueCount += boolStats._trueCount;
+ }
+
+ void reset() override {
+ _stats.reset();
+ setTrueCount(0);
+ }
+
+ void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
+ pbStats.set_hasnull(_stats.hasNull());
+ pbStats.set_numberofvalues(_stats.getNumberOfValues());
+
+ proto::BucketStatistics* bucketStats = pbStats.mutable_bucketstatistics();
+ if (_hasCount) {
+ bucketStats->add_count(_trueCount);
+ } else {
+ bucketStats->clear_count();
+ }
+ }
+
+ std::string toString() const override {
+ std::ostringstream buffer;
+ buffer << "Data type: Boolean" << std::endl
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasCount()){
+ buffer << "(true: " << getTrueCount() << "; false: "
+ << getFalseCount() << ")" << std::endl;
+ } else {
+ buffer << "(true: not defined; false: not defined)" << std::endl;
+ buffer << "True and false counts are not defined" << std::endl;
+ }
+ return buffer.str();
+ }
+ };
+
+ class DateColumnStatisticsImpl: public DateColumnStatistics,
+ public MutableColumnStatistics{
+ private:
+ InternalDateStatistics _stats;
+ public:
+ DateColumnStatisticsImpl() { reset(); }
+ DateColumnStatisticsImpl(const proto::ColumnStatistics& stats,
+ const StatContext& statContext);
+ virtual ~DateColumnStatisticsImpl() override;
+
+ bool hasMinimum() const override {
+ return _stats.hasMinimum();
+ }
+
+ bool hasMaximum() const override {
+ return _stats.hasMaximum();
+ }
+
+ void increase(uint64_t count) override {
+ _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ }
+
+ uint64_t getNumberOfValues() const override {
+ return _stats.getNumberOfValues();
+ }
+
+ void setNumberOfValues(uint64_t value) override {
+ _stats.setNumberOfValues(value);
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
+ }
+
+ void setHasNull(bool hasNull) override {
+ _stats.setHasNull(hasNull);
+ }
+
+ int32_t getMinimum() const override {
+ if(hasMinimum()){
+ return _stats.getMinimum();
+ }else{
+ throw ParseError("Minimum is not defined.");
+ }
+ }
+
+ int32_t getMaximum() const override {
+ if(hasMaximum()){
+ return _stats.getMaximum();
+ }else{
+ throw ParseError("Maximum is not defined.");
+ }
+ }
+
+ void setMinimum(int32_t minimum) {
+ _stats.setHasMinimum(true);
+ _stats.setMinimum(minimum);
+ }
+
+ void setMaximum(int32_t maximum) {
+ _stats.setHasMaximum(true);
+ _stats.setMaximum(maximum);
+ }
+
+ void update(int32_t value) {
+ _stats.updateMinMax(value);
+ }
+
+ void merge(const MutableColumnStatistics& other) override {
+ const DateColumnStatisticsImpl& dateStats =
+ dynamic_cast<const DateColumnStatisticsImpl&>(other);
+ _stats.merge(dateStats._stats);
+ }
+
+ void reset() override {
+ _stats.reset();
+ }
+
+ void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
+ pbStats.set_hasnull(_stats.hasNull());
+ pbStats.set_numberofvalues(_stats.getNumberOfValues());
+
+ proto::DateStatistics* dateStatistics =
+ pbStats.mutable_datestatistics();
+ if (_stats.hasMinimum()) {
+ dateStatistics->set_maximum(_stats.getMaximum());
+ dateStatistics->set_minimum(_stats.getMinimum());
+ } else {
+ dateStatistics->clear_minimum();
+ dateStatistics->clear_maximum();
+ }
+ }
+
+ std::string toString() const override {
+ std::ostringstream buffer;
+ buffer << "Data type: Date" << std::endl
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasMinimum()){
+ buffer << "Minimum: " << getMinimum() << std::endl;
+ }else{
+ buffer << "Minimum: not defined" << std::endl;
+ }
+
+ if(hasMaximum()){
+ buffer << "Maximum: " << getMaximum() << std::endl;
+ }else{
+ buffer << "Maximum: not defined" << std::endl;
+ }
+ return buffer.str();
+ }
+ };
+
+ class DecimalColumnStatisticsImpl: public DecimalColumnStatistics,
+ public MutableColumnStatistics {
+ private:
+ InternalDecimalStatistics _stats;
+
+ public:
+ DecimalColumnStatisticsImpl() { reset(); }
+ DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats,
+ const StatContext& statContext);
+ virtual ~DecimalColumnStatisticsImpl() override;
+
+ bool hasMinimum() const override {
+ return _stats.hasMinimum();
+ }
+
+ bool hasMaximum() const override {
+ return _stats.hasMaximum();
+ }
+
+ bool hasSum() const override {
+ return _stats.hasSum();
+ }
+
+ void increase(uint64_t count) override {
+ _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ }
+
+ uint64_t getNumberOfValues() const override {
+ return _stats.getNumberOfValues();
+ }
+
+ void setNumberOfValues(uint64_t value) override {
+ _stats.setNumberOfValues(value);
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
+ }
+
+ void setHasNull(bool hasNull) override {
+ _stats.setHasNull(hasNull);
+ }
+
+ Decimal getMinimum() const override {
+ if(hasMinimum()){
+ return _stats.getMinimum();
+ }else{
+ throw ParseError("Minimum is not defined.");
+ }
+ }
+
+ Decimal getMaximum() const override {
+ if(hasMaximum()){
+ return _stats.getMaximum();
+ }else{
+ throw ParseError("Maximum is not defined.");
+ }
+ }
+
+ void setMinimum(Decimal minimum) {
+ _stats.setHasMinimum(true);
+ _stats.setMinimum(minimum);
+ }
+
+ void setMaximum(Decimal maximum) {
+ _stats.setHasMaximum(true);
+ _stats.setMaximum(maximum);
+ }
+
+ Decimal getSum() const override {
+ if(hasSum()){
+ return _stats.getSum();
+ }else{
+ throw ParseError("Sum is not defined.");
+ }
+ }
+
+ void setSum(Decimal sum) {
+ _stats.setHasSum(true);
+ _stats.setSum(sum);
+ }
+
+ void update(const Decimal& value) {
+ _stats.updateMinMax(value);
+
+ if (_stats.hasSum()) {
+ updateSum(value);
+ }
+ }
+
+ void merge(const MutableColumnStatistics& other) override {
+ const DecimalColumnStatisticsImpl& decStats =
+ dynamic_cast<const DecimalColumnStatisticsImpl&>(other);
+
+ _stats.merge(decStats._stats);
+
+ _stats.setHasSum(_stats.hasSum() && decStats.hasSum());
+ if (_stats.hasSum()) {
+ updateSum(decStats.getSum());
+ }
+ }
+
+ void reset() override {
+ _stats.reset();
+ setSum(Decimal());
+ }
+
+ void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
+ pbStats.set_hasnull(_stats.hasNull());
+ pbStats.set_numberofvalues(_stats.getNumberOfValues());
+
+ proto::DecimalStatistics* decStats = pbStats.mutable_decimalstatistics();
+ if (_stats.hasMinimum()) {
+ decStats->set_minimum(TString(_stats.getMinimum().toString()));
+ decStats->set_maximum(TString(_stats.getMaximum().toString()));
+ } else {
+ decStats->clear_minimum();
+ decStats->clear_maximum();
+ }
+ if (_stats.hasSum()) {
+ decStats->set_sum(TString(_stats.getSum().toString()));
+ } else {
+ decStats->clear_sum();
+ }
+ }
+
+ std::string toString() const override {
+ std::ostringstream buffer;
+ buffer << "Data type: Decimal" << std::endl
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasMinimum()){
+ buffer << "Minimum: " << getMinimum().toString() << std::endl;
+ }else{
+ buffer << "Minimum: not defined" << std::endl;
+ }
+
+ if(hasMaximum()){
+ buffer << "Maximum: " << getMaximum().toString() << std::endl;
+ }else{
+ buffer << "Maximum: not defined" << std::endl;
+ }
+
+ if(hasSum()){
+ buffer << "Sum: " << getSum().toString() << std::endl;
+ }else{
+ buffer << "Sum: not defined" << std::endl;
+ }
+
+ return buffer.str();
+ }
+
+ private:
+ void updateSum(Decimal value) {
+ if (_stats.hasSum()) {
+ bool overflow = false;
+ Decimal sum = _stats.getSum();
+ if (sum.scale > value.scale) {
+ value.value = scaleUpInt128ByPowerOfTen(value.value,
+ sum.scale - value.scale,
+ overflow);
+ } else if (sum.scale < value.scale) {
+ sum.value = scaleUpInt128ByPowerOfTen(sum.value,
+ value.scale - sum.scale,
+ overflow);
+ sum.scale = value.scale;
+ }
+
+ if (!overflow) {
+ bool wasPositive = sum.value >= 0;
+ sum.value += value.value;
+ if ((value.value >= 0) == wasPositive) {
+ _stats.setHasSum((sum.value >= 0) == wasPositive);
+ }
+ } else {
+ _stats.setHasSum(false);
+ }
+
+ if (_stats.hasSum()) {
+ _stats.setSum(sum);
+ }
+ }
+ }
+ };
+
+ class DoubleColumnStatisticsImpl: public DoubleColumnStatistics,
+ public MutableColumnStatistics {
+ private:
+ InternalDoubleStatistics _stats;
+ public:
+ DoubleColumnStatisticsImpl() { reset(); }
+ DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats);
+ virtual ~DoubleColumnStatisticsImpl() override;
+
+ bool hasMinimum() const override {
+ return _stats.hasMinimum();
+ }
+
+ bool hasMaximum() const override {
+ return _stats.hasMaximum();
+ }
+
+ bool hasSum() const override {
+ return _stats.hasSum();
+ }
+
+ void increase(uint64_t count) override {
+ _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ }
+
+ uint64_t getNumberOfValues() const override {
+ return _stats.getNumberOfValues();
+ }
+
+ void setNumberOfValues(uint64_t value) override {
+ _stats.setNumberOfValues(value);
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
+ }
+
+ void setHasNull(bool hasNull) override {
+ _stats.setHasNull(hasNull);
+ }
+
+ double getMinimum() const override {
+ if(hasMinimum()){
+ return _stats.getMinimum();
+ }else{
+ throw ParseError("Minimum is not defined.");
+ }
+ }
+
+ double getMaximum() const override {
+ if(hasMaximum()){
+ return _stats.getMaximum();
+ }else{
+ throw ParseError("Maximum is not defined.");
+ }
+ }
+
+ void setMinimum(double minimum) {
+ _stats.setHasMinimum(true);
+ _stats.setMinimum(minimum);
+ }
+
+ void setMaximum(double maximum) {
+ _stats.setHasMaximum(true);
+ _stats.setMaximum(maximum);
+ }
+
+ double getSum() const override {
+ if(hasSum()){
+ return _stats.getSum();
+ }else{
+ throw ParseError("Sum is not defined.");
+ }
+ }
+
+ void setSum(double sum) {
+ _stats.setHasSum(true);
+ _stats.setSum(sum);
+ }
+
+ void update(double value) {
+ _stats.updateMinMax(value);
+ _stats.setSum(_stats.getSum() + value);
+ }
+
+ void merge(const MutableColumnStatistics& other) override {
+ const DoubleColumnStatisticsImpl& doubleStats =
+ dynamic_cast<const DoubleColumnStatisticsImpl&>(other);
+ _stats.merge(doubleStats._stats);
+
+ _stats.setHasSum(_stats.hasSum() && doubleStats.hasSum());
+ if (_stats.hasSum()) {
+ _stats.setSum(_stats.getSum() + doubleStats.getSum());
+ }
+ }
+
+ void reset() override {
+ _stats.reset();
+ setSum(0.0);
+ }
+
+ void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
+ pbStats.set_hasnull(_stats.hasNull());
+ pbStats.set_numberofvalues(_stats.getNumberOfValues());
+
+ proto::DoubleStatistics* doubleStats = pbStats.mutable_doublestatistics();
+ if (_stats.hasMinimum()) {
+ doubleStats->set_minimum(_stats.getMinimum());
+ doubleStats->set_maximum(_stats.getMaximum());
+ } else {
+ doubleStats->clear_minimum();
+ doubleStats->clear_maximum();
+ }
+ if (_stats.hasSum()) {
+ doubleStats->set_sum(_stats.getSum());
+ } else {
+ doubleStats->clear_sum();
+ }
+ }
+
+ std::string toString() const override {
+ std::ostringstream buffer;
+ buffer << "Data type: Double" << std::endl
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasMinimum()){
+ buffer << "Minimum: " << getMinimum() << std::endl;
+ }else{
+ buffer << "Minimum: not defined" << std::endl;
+ }
+
+ if(hasMaximum()){
+ buffer << "Maximum: " << getMaximum() << std::endl;
+ }else{
+ buffer << "Maximum: not defined" << std::endl;
+ }
+
+ if(hasSum()){
+ buffer << "Sum: " << getSum() << std::endl;
+ }else{
+ buffer << "Sum: not defined" << std::endl;
+ }
+ return buffer.str();
+ }
+ };
+
+ class IntegerColumnStatisticsImpl: public IntegerColumnStatistics,
+ public MutableColumnStatistics {
+ private:
+ InternalIntegerStatistics _stats;
+ public:
+ IntegerColumnStatisticsImpl() { reset(); }
+ IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats);
+ virtual ~IntegerColumnStatisticsImpl() override;
+
+ bool hasMinimum() const override {
+ return _stats.hasMinimum();
+ }
+
+ bool hasMaximum() const override {
+ return _stats.hasMaximum();
+ }
+
+ bool hasSum() const override {
+ return _stats.hasSum();
+ }
+
+ void increase(uint64_t count) override {
+ _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ }
+
+ uint64_t getNumberOfValues() const override {
+ return _stats.getNumberOfValues();
+ }
+
+ void setNumberOfValues(uint64_t value) override {
+ _stats.setNumberOfValues(value);
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
+ }
+
+ void setHasNull(bool hasNull) override {
+ _stats.setHasNull(hasNull);
+ }
+
+ int64_t getMinimum() const override {
+ if(hasMinimum()){
+ return _stats.getMinimum();
+ }else{
+ throw ParseError("Minimum is not defined.");
+ }
+ }
+
+ int64_t getMaximum() const override {
+ if(hasMaximum()){
+ return _stats.getMaximum();
+ }else{
+ throw ParseError("Maximum is not defined.");
+ }
+ }
+
+ void setMinimum(int64_t minimum) {
+ _stats.setHasMinimum(true);
+ _stats.setMinimum(minimum);
+ }
+
+ void setMaximum(int64_t maximum) {
+ _stats.setHasMaximum(true);
+ _stats.setMaximum(maximum);
+ }
+
+ int64_t getSum() const override {
+ if(hasSum()){
+ return _stats.getSum();
+ }else{
+ throw ParseError("Sum is not defined.");
+ }
+ }
+
+ void setSum(int64_t sum) {
+ _stats.setHasSum(true);
+ _stats.setSum(sum);
+ }
+
void update(int64_t value, int repetitions) {
_stats.updateMinMax(value);
-
+
if (_stats.hasSum()) {
if (repetitions > 1) {
_stats.setHasSum(multiplyExact(value, repetitions, &value));
@@ -981,498 +981,498 @@ namespace orc {
}
}
- void merge(const MutableColumnStatistics& other) override {
- const IntegerColumnStatisticsImpl& intStats =
- dynamic_cast<const IntegerColumnStatisticsImpl&>(other);
-
- _stats.merge(intStats._stats);
-
- // update sum and check overflow
- _stats.setHasSum(_stats.hasSum() && intStats.hasSum());
- if (_stats.hasSum()) {
+ void merge(const MutableColumnStatistics& other) override {
+ const IntegerColumnStatisticsImpl& intStats =
+ dynamic_cast<const IntegerColumnStatisticsImpl&>(other);
+
+ _stats.merge(intStats._stats);
+
+ // update sum and check overflow
+ _stats.setHasSum(_stats.hasSum() && intStats.hasSum());
+ if (_stats.hasSum()) {
int64_t value;
_stats.setHasSum(addExact(_stats.getSum(), intStats.getSum(), &value));
if (_stats.hasSum()) {
_stats.setSum(value);
- }
- }
- }
-
- void reset() override {
- _stats.reset();
- setSum(0);
- }
-
- void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
-
- proto::IntegerStatistics* intStats = pbStats.mutable_intstatistics();
- if (_stats.hasMinimum()) {
- intStats->set_minimum(_stats.getMinimum());
- intStats->set_maximum(_stats.getMaximum());
- } else {
- intStats->clear_minimum();
- intStats->clear_maximum();
- }
- if (_stats.hasSum()) {
- intStats->set_sum(_stats.getSum());
- } else {
- intStats->clear_sum();
- }
- }
-
- std::string toString() const override {
- std::ostringstream buffer;
- buffer << "Data type: Integer" << std::endl
- << "Values: " << getNumberOfValues() << std::endl
- << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasMinimum()){
- buffer << "Minimum: " << getMinimum() << std::endl;
- }else{
- buffer << "Minimum: not defined" << std::endl;
- }
-
- if(hasMaximum()){
- buffer << "Maximum: " << getMaximum() << std::endl;
- }else{
- buffer << "Maximum: not defined" << std::endl;
- }
-
- if(hasSum()){
- buffer << "Sum: " << getSum() << std::endl;
- }else{
- buffer << "Sum: not defined" << std::endl;
- }
- return buffer.str();
- }
- };
-
- class StringColumnStatisticsImpl: public StringColumnStatistics,
- public MutableColumnStatistics{
- private:
- InternalStringStatistics _stats;
-
- public:
- StringColumnStatisticsImpl() {
- reset();
- }
- StringColumnStatisticsImpl(const proto::ColumnStatistics& stats,
- const StatContext& statContext);
- virtual ~StringColumnStatisticsImpl() override;
-
- bool hasMinimum() const override {
- return _stats.hasMinimum();
- }
-
- bool hasMaximum() const override {
- return _stats.hasMaximum();
- }
-
- bool hasTotalLength() const override {
- return _stats.hasTotalLength();
- }
-
- void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
- }
-
- uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
- }
-
- void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
- }
-
- bool hasNull() const override {
- return _stats.hasNull();
- }
-
- void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
- }
-
+ }
+ }
+ }
+
+ void reset() override {
+ _stats.reset();
+ setSum(0);
+ }
+
+ void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
+ pbStats.set_hasnull(_stats.hasNull());
+ pbStats.set_numberofvalues(_stats.getNumberOfValues());
+
+ proto::IntegerStatistics* intStats = pbStats.mutable_intstatistics();
+ if (_stats.hasMinimum()) {
+ intStats->set_minimum(_stats.getMinimum());
+ intStats->set_maximum(_stats.getMaximum());
+ } else {
+ intStats->clear_minimum();
+ intStats->clear_maximum();
+ }
+ if (_stats.hasSum()) {
+ intStats->set_sum(_stats.getSum());
+ } else {
+ intStats->clear_sum();
+ }
+ }
+
+ std::string toString() const override {
+ std::ostringstream buffer;
+ buffer << "Data type: Integer" << std::endl
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasMinimum()){
+ buffer << "Minimum: " << getMinimum() << std::endl;
+ }else{
+ buffer << "Minimum: not defined" << std::endl;
+ }
+
+ if(hasMaximum()){
+ buffer << "Maximum: " << getMaximum() << std::endl;
+ }else{
+ buffer << "Maximum: not defined" << std::endl;
+ }
+
+ if(hasSum()){
+ buffer << "Sum: " << getSum() << std::endl;
+ }else{
+ buffer << "Sum: not defined" << std::endl;
+ }
+ return buffer.str();
+ }
+ };
+
+ class StringColumnStatisticsImpl: public StringColumnStatistics,
+ public MutableColumnStatistics{
+ private:
+ InternalStringStatistics _stats;
+
+ public:
+ StringColumnStatisticsImpl() {
+ reset();
+ }
+ StringColumnStatisticsImpl(const proto::ColumnStatistics& stats,
+ const StatContext& statContext);
+ virtual ~StringColumnStatisticsImpl() override;
+
+ bool hasMinimum() const override {
+ return _stats.hasMinimum();
+ }
+
+ bool hasMaximum() const override {
+ return _stats.hasMaximum();
+ }
+
+ bool hasTotalLength() const override {
+ return _stats.hasTotalLength();
+ }
+
+ void increase(uint64_t count) override {
+ _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ }
+
+ uint64_t getNumberOfValues() const override {
+ return _stats.getNumberOfValues();
+ }
+
+ void setNumberOfValues(uint64_t value) override {
+ _stats.setNumberOfValues(value);
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
+ }
+
+ void setHasNull(bool hasNull) override {
+ _stats.setHasNull(hasNull);
+ }
+
const std::string & getMinimum() const override {
- if(hasMinimum()){
- return _stats.getMinimum();
- }else{
- throw ParseError("Minimum is not defined.");
- }
- }
-
+ if(hasMinimum()){
+ return _stats.getMinimum();
+ }else{
+ throw ParseError("Minimum is not defined.");
+ }
+ }
+
const std::string & getMaximum() const override {
- if(hasMaximum()){
- return _stats.getMaximum();
- }else{
- throw ParseError("Maximum is not defined.");
- }
- }
-
- void setMinimum(std::string minimum) {
- _stats.setHasMinimum(true);
- _stats.setMinimum(minimum);
- }
-
- void setMaximum(std::string maximum) {
- _stats.setHasMaximum(true);
- _stats.setMaximum(maximum);
- }
-
- uint64_t getTotalLength() const override {
- if(hasTotalLength()){
- return _stats.getTotalLength();
- }else{
- throw ParseError("Total length is not defined.");
- }
- }
-
- void setTotalLength(uint64_t length) {
- _stats.setHasTotalLength(true);
- _stats.setTotalLength(length);
- }
-
- void update(const char* value, size_t length) {
- if (value != nullptr) {
- if (!_stats.hasMinimum()) {
- std::string tempStr(value, value + length);
- setMinimum(tempStr);
- setMaximum(tempStr);
- } else {
- // update min
- int minCmp = strncmp(_stats.getMinimum().c_str(),
- value,
- std::min(_stats.getMinimum().length(), length));
- if (minCmp > 0 ||
- (minCmp == 0 && length < _stats.getMinimum().length())) {
- setMinimum(std::string(value, value + length));
- }
-
- // update max
- int maxCmp = strncmp(_stats.getMaximum().c_str(),
- value,
- std::min(_stats.getMaximum().length(), length));
- if (maxCmp < 0 ||
- (maxCmp == 0 && length > _stats.getMaximum().length())) {
- setMaximum(std::string(value, value + length));
- }
- }
- }
-
- _stats.setTotalLength(_stats.getTotalLength() + length);
- }
-
- void update(std::string value) {
- update(value.c_str(), value.length());
- }
-
- void merge(const MutableColumnStatistics& other) override {
- const StringColumnStatisticsImpl& strStats =
- dynamic_cast<const StringColumnStatisticsImpl&>(other);
- _stats.merge(strStats._stats);
- }
-
- void reset() override {
- _stats.reset();
- setTotalLength(0);
- }
-
- void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
-
- proto::StringStatistics* strStats = pbStats.mutable_stringstatistics();
- if (_stats.hasMinimum()) {
- strStats->set_minimum(TString(_stats.getMinimum()));
- strStats->set_maximum(TString(_stats.getMaximum()));
- } else {
- strStats->clear_minimum();
- strStats->clear_maximum();
- }
- if (_stats.hasTotalLength()) {
- strStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
- } else {
- strStats->clear_sum();
- }
- }
-
- std::string toString() const override {
- std::ostringstream buffer;
- buffer << "Data type: String" << std::endl
- << "Values: " << getNumberOfValues() << std::endl
- << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasMinimum()){
- buffer << "Minimum: " << getMinimum() << std::endl;
- }else{
- buffer << "Minimum is not defined" << std::endl;
- }
-
- if(hasMaximum()){
- buffer << "Maximum: " << getMaximum() << std::endl;
- }else{
- buffer << "Maximum is not defined" << std::endl;
- }
-
- if(hasTotalLength()){
- buffer << "Total length: " << getTotalLength() << std::endl;
- }else{
- buffer << "Total length is not defined" << std::endl;
- }
- return buffer.str();
- }
- };
-
- class TimestampColumnStatisticsImpl: public TimestampColumnStatistics,
- public MutableColumnStatistics {
- private:
- InternalIntegerStatistics _stats;
- bool _hasLowerBound;
- bool _hasUpperBound;
- int64_t _lowerBound;
- int64_t _upperBound;
-
- public:
- TimestampColumnStatisticsImpl() { reset(); }
- TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats,
- const StatContext& statContext);
- virtual ~TimestampColumnStatisticsImpl() override;
-
- bool hasMinimum() const override {
- return _stats.hasMinimum();
- }
-
- bool hasMaximum() const override {
- return _stats.hasMaximum();
- }
-
- uint64_t getNumberOfValues() const override {
- return _stats.getNumberOfValues();
- }
-
- void setNumberOfValues(uint64_t value) override {
- _stats.setNumberOfValues(value);
- }
-
- void increase(uint64_t count) override {
- _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
- }
-
- bool hasNull() const override {
- return _stats.hasNull();
- }
-
- void setHasNull(bool hasNull) override {
- _stats.setHasNull(hasNull);
- }
-
- int64_t getMinimum() const override {
- if(hasMinimum()){
- return _stats.getMinimum();
- }else{
- throw ParseError("Minimum is not defined.");
- }
- }
-
- int64_t getMaximum() const override {
- if(hasMaximum()){
- return _stats.getMaximum();
- }else{
- throw ParseError("Maximum is not defined.");
- }
- }
-
- void setMinimum(int64_t minimum) {
- _stats.setHasMinimum(true);
- _stats.setMinimum(minimum);
- }
-
- void setMaximum(int64_t maximum) {
- _stats.setHasMaximum(true);
- _stats.setMaximum(maximum);
- }
-
- void update(int64_t value) {
- _stats.updateMinMax(value);
- }
-
- void merge(const MutableColumnStatistics& other) override {
- const TimestampColumnStatisticsImpl& tsStats =
- dynamic_cast<const TimestampColumnStatisticsImpl&>(other);
- _stats.merge(tsStats._stats);
- }
-
- void reset() override {
- _stats.reset();
- }
-
- void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
- pbStats.set_hasnull(_stats.hasNull());
- pbStats.set_numberofvalues(_stats.getNumberOfValues());
-
- proto::TimestampStatistics* tsStats =
- pbStats.mutable_timestampstatistics();
- if (_stats.hasMinimum()) {
- tsStats->set_minimumutc(_stats.getMinimum());
- tsStats->set_maximumutc(_stats.getMaximum());
- } else {
- tsStats->clear_minimumutc();
- tsStats->clear_maximumutc();
- }
- }
-
- std::string toString() const override {
- std::ostringstream buffer;
- struct tm tmValue;
- char timeBuffer[20];
- time_t secs = 0;
-
- buffer << "Data type: Timestamp" << std::endl
- << "Values: " << getNumberOfValues() << std::endl
- << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
- if(hasMinimum()){
- secs = static_cast<time_t>(getMinimum() / 1000);
- gmtime_r(&secs, &tmValue);
- strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
- buffer << "Minimum: " << timeBuffer << "."
- << (getMinimum() % 1000) << std::endl;
- }else{
- buffer << "Minimum is not defined" << std::endl;
- }
-
- if(hasLowerBound()){
- secs = static_cast<time_t>(getLowerBound() / 1000);
- gmtime_r(&secs, &tmValue);
- strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
- buffer << "LowerBound: " << timeBuffer << "."
- << (getLowerBound() % 1000) << std::endl;
- }else{
- buffer << "LowerBound is not defined" << std::endl;
- }
-
- if(hasMaximum()){
- secs = static_cast<time_t>(getMaximum()/1000);
- gmtime_r(&secs, &tmValue);
- strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
- buffer << "Maximum: " << timeBuffer << "."
- << (getMaximum() % 1000) << std::endl;
- }else{
- buffer << "Maximum is not defined" << std::endl;
- }
-
- if(hasUpperBound()){
- secs = static_cast<time_t>(getUpperBound() / 1000);
- gmtime_r(&secs, &tmValue);
- strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
- buffer << "UpperBound: " << timeBuffer << "."
- << (getUpperBound() % 1000) << std::endl;
- }else{
- buffer << "UpperBound is not defined" << std::endl;
- }
-
- return buffer.str();
- }
-
- bool hasLowerBound() const override {
- return _hasLowerBound;
- }
-
- bool hasUpperBound() const override {
- return _hasUpperBound;
- }
-
- int64_t getLowerBound() const override {
- if(hasLowerBound()){
- return _lowerBound;
- }else{
- throw ParseError("LowerBound is not defined.");
- }
- }
-
- int64_t getUpperBound() const override {
- if(hasUpperBound()){
- return _upperBound;
- }else{
- throw ParseError("UpperBound is not defined.");
- }
- }
- };
-
- ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
- const StatContext& statContext);
-
- class StatisticsImpl: public Statistics {
- private:
- std::vector<ColumnStatistics*> colStats;
-
- // DELIBERATELY NOT IMPLEMENTED
- StatisticsImpl(const StatisticsImpl&);
- StatisticsImpl& operator=(const StatisticsImpl&);
-
- public:
- StatisticsImpl(const proto::StripeStatistics& stripeStats,
- const StatContext& statContext);
-
- StatisticsImpl(const proto::Footer& footer, const StatContext& statContext);
-
- virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId
- ) const override {
- return colStats[columnId];
- }
-
- virtual ~StatisticsImpl() override;
-
- uint32_t getNumberOfColumns() const override {
- return static_cast<uint32_t>(colStats.size());
- }
- };
-
- class StripeStatisticsImpl: public StripeStatistics {
- private:
- std::unique_ptr<StatisticsImpl> columnStats;
- std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > >
- rowIndexStats;
-
- // DELIBERATELY NOT IMPLEMENTED
- StripeStatisticsImpl(const StripeStatisticsImpl&);
- StripeStatisticsImpl& operator=(const StripeStatisticsImpl&);
-
- public:
- StripeStatisticsImpl(
- const proto::StripeStatistics& stripeStats,
- std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
- const StatContext& statContext);
-
- virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId
- ) const override {
- return columnStats->getColumnStatistics(columnId);
- }
-
- uint32_t getNumberOfColumns() const override {
- return columnStats->getNumberOfColumns();
- }
-
- virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId,
- uint32_t rowIndex
- ) const override {
-      // check if indices are valid
- return rowIndexStats[columnId][rowIndex].get();
- }
-
- virtual ~StripeStatisticsImpl() override;
-
- uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override {
- return static_cast<uint32_t>(rowIndexStats[columnId].size());
- }
- };
-
- /**
- * Create ColumnStatistics for writers
- * @param type of column
- * @return MutableColumnStatistics instances
- */
- std::unique_ptr<MutableColumnStatistics> createColumnStatistics(
- const Type& type);
-
-}// namespace
-
-#endif
+ if(hasMaximum()){
+ return _stats.getMaximum();
+ }else{
+ throw ParseError("Maximum is not defined.");
+ }
+ }
+
+ void setMinimum(std::string minimum) {
+ _stats.setHasMinimum(true);
+ _stats.setMinimum(minimum);
+ }
+
+ void setMaximum(std::string maximum) {
+ _stats.setHasMaximum(true);
+ _stats.setMaximum(maximum);
+ }
+
+ uint64_t getTotalLength() const override {
+ if(hasTotalLength()){
+ return _stats.getTotalLength();
+ }else{
+ throw ParseError("Total length is not defined.");
+ }
+ }
+
+ void setTotalLength(uint64_t length) {
+ _stats.setHasTotalLength(true);
+ _stats.setTotalLength(length);
+ }
+
+ void update(const char* value, size_t length) {
+ if (value != nullptr) {
+ if (!_stats.hasMinimum()) {
+ std::string tempStr(value, value + length);
+ setMinimum(tempStr);
+ setMaximum(tempStr);
+ } else {
+ // update min
+ int minCmp = strncmp(_stats.getMinimum().c_str(),
+ value,
+ std::min(_stats.getMinimum().length(), length));
+ if (minCmp > 0 ||
+ (minCmp == 0 && length < _stats.getMinimum().length())) {
+ setMinimum(std::string(value, value + length));
+ }
+
+ // update max
+ int maxCmp = strncmp(_stats.getMaximum().c_str(),
+ value,
+ std::min(_stats.getMaximum().length(), length));
+ if (maxCmp < 0 ||
+ (maxCmp == 0 && length > _stats.getMaximum().length())) {
+ setMaximum(std::string(value, value + length));
+ }
+ }
+ }
+
+ _stats.setTotalLength(_stats.getTotalLength() + length);
+ }
+
+ void update(std::string value) {
+ update(value.c_str(), value.length());
+ }
+
+ void merge(const MutableColumnStatistics& other) override {
+ const StringColumnStatisticsImpl& strStats =
+ dynamic_cast<const StringColumnStatisticsImpl&>(other);
+ _stats.merge(strStats._stats);
+ }
+
+ void reset() override {
+ _stats.reset();
+ setTotalLength(0);
+ }
+
+ void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
+ pbStats.set_hasnull(_stats.hasNull());
+ pbStats.set_numberofvalues(_stats.getNumberOfValues());
+
+ proto::StringStatistics* strStats = pbStats.mutable_stringstatistics();
+ if (_stats.hasMinimum()) {
+ strStats->set_minimum(TString(_stats.getMinimum()));
+ strStats->set_maximum(TString(_stats.getMaximum()));
+ } else {
+ strStats->clear_minimum();
+ strStats->clear_maximum();
+ }
+ if (_stats.hasTotalLength()) {
+ strStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
+ } else {
+ strStats->clear_sum();
+ }
+ }
+
+ std::string toString() const override {
+ std::ostringstream buffer;
+ buffer << "Data type: String" << std::endl
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasMinimum()){
+ buffer << "Minimum: " << getMinimum() << std::endl;
+ }else{
+ buffer << "Minimum is not defined" << std::endl;
+ }
+
+ if(hasMaximum()){
+ buffer << "Maximum: " << getMaximum() << std::endl;
+ }else{
+ buffer << "Maximum is not defined" << std::endl;
+ }
+
+ if(hasTotalLength()){
+ buffer << "Total length: " << getTotalLength() << std::endl;
+ }else{
+ buffer << "Total length is not defined" << std::endl;
+ }
+ return buffer.str();
+ }
+ };
+
+ class TimestampColumnStatisticsImpl: public TimestampColumnStatistics,
+ public MutableColumnStatistics {
+ private:
+ InternalIntegerStatistics _stats;
+ bool _hasLowerBound;
+ bool _hasUpperBound;
+ int64_t _lowerBound;
+ int64_t _upperBound;
+
+ public:
+ TimestampColumnStatisticsImpl() { reset(); }
+ TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats,
+ const StatContext& statContext);
+ virtual ~TimestampColumnStatisticsImpl() override;
+
+ bool hasMinimum() const override {
+ return _stats.hasMinimum();
+ }
+
+ bool hasMaximum() const override {
+ return _stats.hasMaximum();
+ }
+
+ uint64_t getNumberOfValues() const override {
+ return _stats.getNumberOfValues();
+ }
+
+ void setNumberOfValues(uint64_t value) override {
+ _stats.setNumberOfValues(value);
+ }
+
+ void increase(uint64_t count) override {
+ _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+ }
+
+ bool hasNull() const override {
+ return _stats.hasNull();
+ }
+
+ void setHasNull(bool hasNull) override {
+ _stats.setHasNull(hasNull);
+ }
+
+ int64_t getMinimum() const override {
+ if(hasMinimum()){
+ return _stats.getMinimum();
+ }else{
+ throw ParseError("Minimum is not defined.");
+ }
+ }
+
+ int64_t getMaximum() const override {
+ if(hasMaximum()){
+ return _stats.getMaximum();
+ }else{
+ throw ParseError("Maximum is not defined.");
+ }
+ }
+
+ void setMinimum(int64_t minimum) {
+ _stats.setHasMinimum(true);
+ _stats.setMinimum(minimum);
+ }
+
+ void setMaximum(int64_t maximum) {
+ _stats.setHasMaximum(true);
+ _stats.setMaximum(maximum);
+ }
+
+ void update(int64_t value) {
+ _stats.updateMinMax(value);
+ }
+
+ void merge(const MutableColumnStatistics& other) override {
+ const TimestampColumnStatisticsImpl& tsStats =
+ dynamic_cast<const TimestampColumnStatisticsImpl&>(other);
+ _stats.merge(tsStats._stats);
+ }
+
+ void reset() override {
+ _stats.reset();
+ }
+
+ void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
+ pbStats.set_hasnull(_stats.hasNull());
+ pbStats.set_numberofvalues(_stats.getNumberOfValues());
+
+ proto::TimestampStatistics* tsStats =
+ pbStats.mutable_timestampstatistics();
+ if (_stats.hasMinimum()) {
+ tsStats->set_minimumutc(_stats.getMinimum());
+ tsStats->set_maximumutc(_stats.getMaximum());
+ } else {
+ tsStats->clear_minimumutc();
+ tsStats->clear_maximumutc();
+ }
+ }
+
+ std::string toString() const override {
+ std::ostringstream buffer;
+ struct tm tmValue;
+ char timeBuffer[20];
+ time_t secs = 0;
+
+ buffer << "Data type: Timestamp" << std::endl
+ << "Values: " << getNumberOfValues() << std::endl
+ << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+ if(hasMinimum()){
+ secs = static_cast<time_t>(getMinimum() / 1000);
+ gmtime_r(&secs, &tmValue);
+ strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
+ buffer << "Minimum: " << timeBuffer << "."
+ << (getMinimum() % 1000) << std::endl;
+ }else{
+ buffer << "Minimum is not defined" << std::endl;
+ }
+
+ if(hasLowerBound()){
+ secs = static_cast<time_t>(getLowerBound() / 1000);
+ gmtime_r(&secs, &tmValue);
+ strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
+ buffer << "LowerBound: " << timeBuffer << "."
+ << (getLowerBound() % 1000) << std::endl;
+ }else{
+ buffer << "LowerBound is not defined" << std::endl;
+ }
+
+ if(hasMaximum()){
+ secs = static_cast<time_t>(getMaximum()/1000);
+ gmtime_r(&secs, &tmValue);
+ strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
+ buffer << "Maximum: " << timeBuffer << "."
+ << (getMaximum() % 1000) << std::endl;
+ }else{
+ buffer << "Maximum is not defined" << std::endl;
+ }
+
+ if(hasUpperBound()){
+ secs = static_cast<time_t>(getUpperBound() / 1000);
+ gmtime_r(&secs, &tmValue);
+ strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
+ buffer << "UpperBound: " << timeBuffer << "."
+ << (getUpperBound() % 1000) << std::endl;
+ }else{
+ buffer << "UpperBound is not defined" << std::endl;
+ }
+
+ return buffer.str();
+ }
+
+ bool hasLowerBound() const override {
+ return _hasLowerBound;
+ }
+
+ bool hasUpperBound() const override {
+ return _hasUpperBound;
+ }
+
+ int64_t getLowerBound() const override {
+ if(hasLowerBound()){
+ return _lowerBound;
+ }else{
+ throw ParseError("LowerBound is not defined.");
+ }
+ }
+
+ int64_t getUpperBound() const override {
+ if(hasUpperBound()){
+ return _upperBound;
+ }else{
+ throw ParseError("UpperBound is not defined.");
+ }
+ }
+ };
+
+ ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
+ const StatContext& statContext);
+
+ class StatisticsImpl: public Statistics {
+ private:
+ std::vector<ColumnStatistics*> colStats;
+
+ // DELIBERATELY NOT IMPLEMENTED
+ StatisticsImpl(const StatisticsImpl&);
+ StatisticsImpl& operator=(const StatisticsImpl&);
+
+ public:
+ StatisticsImpl(const proto::StripeStatistics& stripeStats,
+ const StatContext& statContext);
+
+ StatisticsImpl(const proto::Footer& footer, const StatContext& statContext);
+
+ virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId
+ ) const override {
+ return colStats[columnId];
+ }
+
+ virtual ~StatisticsImpl() override;
+
+ uint32_t getNumberOfColumns() const override {
+ return static_cast<uint32_t>(colStats.size());
+ }
+ };
+
+ class StripeStatisticsImpl: public StripeStatistics {
+ private:
+ std::unique_ptr<StatisticsImpl> columnStats;
+ std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > >
+ rowIndexStats;
+
+ // DELIBERATELY NOT IMPLEMENTED
+ StripeStatisticsImpl(const StripeStatisticsImpl&);
+ StripeStatisticsImpl& operator=(const StripeStatisticsImpl&);
+
+ public:
+ StripeStatisticsImpl(
+ const proto::StripeStatistics& stripeStats,
+ std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
+ const StatContext& statContext);
+
+ virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId
+ ) const override {
+ return columnStats->getColumnStatistics(columnId);
+ }
+
+ uint32_t getNumberOfColumns() const override {
+ return columnStats->getNumberOfColumns();
+ }
+
+ virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId,
+ uint32_t rowIndex
+ ) const override {
+      // check if indices are valid
+ return rowIndexStats[columnId][rowIndex].get();
+ }
+
+ virtual ~StripeStatisticsImpl() override;
+
+ uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override {
+ return static_cast<uint32_t>(rowIndexStats[columnId].size());
+ }
+ };
+
+ /**
+ * Create ColumnStatistics for writers
+ * @param type of column
+ * @return MutableColumnStatistics instances
+ */
+ std::unique_ptr<MutableColumnStatistics> createColumnStatistics(
+ const Type& type);
+
+}// namespace
+
+#endif
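
For reference, the overflow handling in IntegerColumnStatisticsImpl::merge above depends on an addExact helper (declared elsewhere in this header) that reports whether a signed 64-bit addition fits. Below is a minimal sketch of that contract, assuming the GCC/Clang __builtin_add_overflow builtin and using hypothetical names (addExactSketch, mergeSumSketch) that are not taken from the ORC sources:

    #include <cstdint>

    // Same contract as the addExact used above: returns true when a + b fits
    // in int64_t and writes the result, false on overflow.
    static bool addExactSketch(int64_t a, int64_t b, int64_t* result) {
      return !__builtin_add_overflow(a, b, result);   // GCC/Clang builtin
    }

    // Mirrors the shape of IntegerColumnStatisticsImpl::merge: the merged sum
    // is defined only if both inputs carried a sum and the addition fit.
    void mergeSumSketch(bool& hasSum, int64_t& sum,
                        bool otherHasSum, int64_t otherSum) {
      hasSum = hasSum && otherHasSum;
      if (hasSum) {
        int64_t value;
        if (addExactSketch(sum, otherSum, &value)) {
          sum = value;
        } else {
          hasSum = false;                             // overflow: drop the sum
        }
      }
    }
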
diff --git a/contrib/libs/apache/orc/c++/src/StripeStream.cc b/contrib/libs/apache/orc/c++/src/StripeStream.cc
index b63f19d28e..f9d82f30e0 100644
--- a/contrib/libs/apache/orc/c++/src/StripeStream.cc
+++ b/contrib/libs/apache/orc/c++/src/StripeStream.cc
@@ -1,161 +1,161 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "orc/Exceptions.hh"
-#include "RLE.hh"
-#include "Reader.hh"
-#include "StripeStream.hh"
-
-#include "wrap/coded-stream-wrapper.h"
-
-namespace orc {
-
- StripeStreamsImpl::StripeStreamsImpl(const RowReaderImpl& _reader, uint64_t _index,
- const proto::StripeInformation& _stripeInfo,
- const proto::StripeFooter& _footer,
- uint64_t _stripeStart,
- InputStream& _input,
- const Timezone& _writerTimezone
- ): reader(_reader),
- stripeInfo(_stripeInfo),
- footer(_footer),
- stripeIndex(_index),
- stripeStart(_stripeStart),
- input(_input),
- writerTimezone(_writerTimezone) {
- // PASS
- }
-
- StripeStreamsImpl::~StripeStreamsImpl() {
- // PASS
- }
-
- StreamInformation::~StreamInformation() {
- // PASS
- }
-
- StripeInformation::~StripeInformation() {
- // PASS
- }
-
-
- StreamInformationImpl::~StreamInformationImpl() {
- // PASS
- }
-
- const std::vector<bool> StripeStreamsImpl::getSelectedColumns() const {
- return reader.getSelectedColumns();
- }
-
- proto::ColumnEncoding StripeStreamsImpl::getEncoding(uint64_t columnId
- ) const {
- return footer.columns(static_cast<int>(columnId));
- }
-
- const Timezone& StripeStreamsImpl::getWriterTimezone() const {
- return writerTimezone;
- }
-
- std::ostream* StripeStreamsImpl::getErrorStream() const {
- return reader.getFileContents().errorStream;
- }
-
- std::unique_ptr<SeekableInputStream>
- StripeStreamsImpl::getStream(uint64_t columnId,
- proto::Stream_Kind kind,
- bool shouldStream) const {
- uint64_t offset = stripeStart;
- uint64_t dataEnd = stripeInfo.offset() + stripeInfo.indexlength() + stripeInfo.datalength();
- MemoryPool *pool = reader.getFileContents().pool;
- for(int i = 0; i < footer.streams_size(); ++i) {
- const proto::Stream& stream = footer.streams(i);
- if (stream.has_kind() &&
- stream.kind() == kind &&
- stream.column() == static_cast<uint64_t>(columnId)) {
- uint64_t streamLength = stream.length();
- uint64_t myBlock = shouldStream ? input.getNaturalReadSize(): streamLength;
- if (offset + streamLength > dataEnd) {
- std::stringstream msg;
- msg << "Malformed stream meta at stream index " << i << " in stripe " << stripeIndex
- << ": streamOffset=" << offset << ", streamLength=" << streamLength
- << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength="
- << stripeInfo.indexlength() << ", stripeDataLength=" << stripeInfo.datalength();
- throw ParseError(msg.str());
- }
- return createDecompressor(reader.getCompression(),
- std::unique_ptr<SeekableInputStream>
- (new SeekableFileInputStream
- (&input,
- offset,
- stream.length(),
- *pool,
- myBlock)),
- reader.getCompressionSize(),
- *pool);
- }
- offset += stream.length();
- }
- return std::unique_ptr<SeekableInputStream>();
- }
-
- MemoryPool& StripeStreamsImpl::getMemoryPool() const {
- return *reader.getFileContents().pool;
- }
-
- bool StripeStreamsImpl::getThrowOnHive11DecimalOverflow() const {
- return reader.getThrowOnHive11DecimalOverflow();
- }
-
- int32_t StripeStreamsImpl::getForcedScaleOnHive11Decimal() const {
- return reader.getForcedScaleOnHive11Decimal();
- }
-
- void StripeInformationImpl::ensureStripeFooterLoaded() const {
- if (stripeFooter.get() == nullptr) {
- std::unique_ptr<SeekableInputStream> pbStream =
- createDecompressor(compression,
- std::unique_ptr<SeekableInputStream>
- (new SeekableFileInputStream(stream,
- offset +
- indexLength +
- dataLength,
- footerLength,
- memory)),
- blockSize,
- memory);
- stripeFooter.reset(new proto::StripeFooter());
- if (!stripeFooter->ParseFromZeroCopyStream(pbStream.get())) {
- throw ParseError("Failed to parse the stripe footer");
- }
- }
- }
-
- std::unique_ptr<StreamInformation>
- StripeInformationImpl::getStreamInformation(uint64_t streamId) const {
- ensureStripeFooterLoaded();
- uint64_t streamOffset = offset;
- for(uint64_t s=0; s < streamId; ++s) {
- streamOffset += stripeFooter->streams(static_cast<int>(s)).length();
- }
- return ORC_UNIQUE_PTR<StreamInformation>
- (new StreamInformationImpl(streamOffset,
- stripeFooter->
- streams(static_cast<int>(streamId))));
- }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "RLE.hh"
+#include "Reader.hh"
+#include "StripeStream.hh"
+
+#include "wrap/coded-stream-wrapper.h"
+
+namespace orc {
+
+ StripeStreamsImpl::StripeStreamsImpl(const RowReaderImpl& _reader, uint64_t _index,
+ const proto::StripeInformation& _stripeInfo,
+ const proto::StripeFooter& _footer,
+ uint64_t _stripeStart,
+ InputStream& _input,
+ const Timezone& _writerTimezone
+ ): reader(_reader),
+ stripeInfo(_stripeInfo),
+ footer(_footer),
+ stripeIndex(_index),
+ stripeStart(_stripeStart),
+ input(_input),
+ writerTimezone(_writerTimezone) {
+ // PASS
+ }
+
+ StripeStreamsImpl::~StripeStreamsImpl() {
+ // PASS
+ }
+
+ StreamInformation::~StreamInformation() {
+ // PASS
+ }
+
+ StripeInformation::~StripeInformation() {
+ // PASS
+ }
+
+
+ StreamInformationImpl::~StreamInformationImpl() {
+ // PASS
+ }
+
+ const std::vector<bool> StripeStreamsImpl::getSelectedColumns() const {
+ return reader.getSelectedColumns();
+ }
+
+ proto::ColumnEncoding StripeStreamsImpl::getEncoding(uint64_t columnId
+ ) const {
+ return footer.columns(static_cast<int>(columnId));
+ }
+
+ const Timezone& StripeStreamsImpl::getWriterTimezone() const {
+ return writerTimezone;
+ }
+
+ std::ostream* StripeStreamsImpl::getErrorStream() const {
+ return reader.getFileContents().errorStream;
+ }
+
+ std::unique_ptr<SeekableInputStream>
+ StripeStreamsImpl::getStream(uint64_t columnId,
+ proto::Stream_Kind kind,
+ bool shouldStream) const {
+ uint64_t offset = stripeStart;
+ uint64_t dataEnd = stripeInfo.offset() + stripeInfo.indexlength() + stripeInfo.datalength();
+ MemoryPool *pool = reader.getFileContents().pool;
+ for(int i = 0; i < footer.streams_size(); ++i) {
+ const proto::Stream& stream = footer.streams(i);
+ if (stream.has_kind() &&
+ stream.kind() == kind &&
+ stream.column() == static_cast<uint64_t>(columnId)) {
+ uint64_t streamLength = stream.length();
+ uint64_t myBlock = shouldStream ? input.getNaturalReadSize(): streamLength;
+ if (offset + streamLength > dataEnd) {
+ std::stringstream msg;
+ msg << "Malformed stream meta at stream index " << i << " in stripe " << stripeIndex
+ << ": streamOffset=" << offset << ", streamLength=" << streamLength
+ << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength="
+ << stripeInfo.indexlength() << ", stripeDataLength=" << stripeInfo.datalength();
+ throw ParseError(msg.str());
+ }
+ return createDecompressor(reader.getCompression(),
+ std::unique_ptr<SeekableInputStream>
+ (new SeekableFileInputStream
+ (&input,
+ offset,
+ stream.length(),
+ *pool,
+ myBlock)),
+ reader.getCompressionSize(),
+ *pool);
+ }
+ offset += stream.length();
+ }
+ return std::unique_ptr<SeekableInputStream>();
+ }
+
+ MemoryPool& StripeStreamsImpl::getMemoryPool() const {
+ return *reader.getFileContents().pool;
+ }
+
+ bool StripeStreamsImpl::getThrowOnHive11DecimalOverflow() const {
+ return reader.getThrowOnHive11DecimalOverflow();
+ }
+
+ int32_t StripeStreamsImpl::getForcedScaleOnHive11Decimal() const {
+ return reader.getForcedScaleOnHive11Decimal();
+ }
+
+ void StripeInformationImpl::ensureStripeFooterLoaded() const {
+ if (stripeFooter.get() == nullptr) {
+ std::unique_ptr<SeekableInputStream> pbStream =
+ createDecompressor(compression,
+ std::unique_ptr<SeekableInputStream>
+ (new SeekableFileInputStream(stream,
+ offset +
+ indexLength +
+ dataLength,
+ footerLength,
+ memory)),
+ blockSize,
+ memory);
+ stripeFooter.reset(new proto::StripeFooter());
+ if (!stripeFooter->ParseFromZeroCopyStream(pbStream.get())) {
+ throw ParseError("Failed to parse the stripe footer");
+ }
+ }
+ }
+
+ std::unique_ptr<StreamInformation>
+ StripeInformationImpl::getStreamInformation(uint64_t streamId) const {
+ ensureStripeFooterLoaded();
+ uint64_t streamOffset = offset;
+ for(uint64_t s=0; s < streamId; ++s) {
+ streamOffset += stripeFooter->streams(static_cast<int>(s)).length();
+ }
+ return ORC_UNIQUE_PTR<StreamInformation>
+ (new StreamInformationImpl(streamOffset,
+ stripeFooter->
+ streams(static_cast<int>(streamId))));
+ }
+
+}
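
StripeStreamsImpl::getStream above walks the stripe footer's stream list in file order, accumulating the lengths of the streams that precede the requested (column, kind) pair to find its byte offset before wrapping it in a decompressor. A minimal sketch of just the offset search, using a simplified StreamMeta record as a hypothetical stand-in for proto::Stream:

    #include <cstdint>
    #include <vector>

    // Simplified stand-in for proto::Stream: column id, stream kind, length.
    struct StreamMeta { uint64_t column; int kind; uint64_t length; };

    // Returns the absolute file offset of the first stream matching
    // (column, kind), or UINT64_MAX if there is none (the real getStream
    // returns an empty unique_ptr in that case).
    uint64_t findStreamOffset(uint64_t stripeStart,
                              const std::vector<StreamMeta>& streams,
                              uint64_t column, int kind) {
      uint64_t offset = stripeStart;            // streams are laid out back to back
      for (const StreamMeta& s : streams) {
        if (s.column == column && s.kind == kind) {
          return offset;
        }
        offset += s.length;                     // skip past this stream
      }
      return UINT64_MAX;
    }
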
diff --git a/contrib/libs/apache/orc/c++/src/StripeStream.hh b/contrib/libs/apache/orc/c++/src/StripeStream.hh
index 5cbaf60a69..da5cb16f37 100644
--- a/contrib/libs/apache/orc/c++/src/StripeStream.hh
+++ b/contrib/libs/apache/orc/c++/src/StripeStream.hh
@@ -1,213 +1,213 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ORC_STRIPE_STREAM_HH
-#define ORC_STRIPE_STREAM_HH
-
-#include "orc/Int128.hh"
-#include "orc/OrcFile.hh"
-#include "orc/Reader.hh"
-
-#include "Timezone.hh"
-#include "TypeImpl.hh"
-
-namespace orc {
-
- class RowReaderImpl;
-
- /**
- * StripeStream Implementation
- */
-
- class StripeStreamsImpl: public StripeStreams {
- private:
- const RowReaderImpl& reader;
- const proto::StripeInformation& stripeInfo;
- const proto::StripeFooter& footer;
- const uint64_t stripeIndex;
- const uint64_t stripeStart;
- InputStream& input;
- const Timezone& writerTimezone;
-
- public:
- StripeStreamsImpl(const RowReaderImpl& reader, uint64_t index,
- const proto::StripeInformation& stripeInfo,
- const proto::StripeFooter& footer,
- uint64_t stripeStart,
- InputStream& input,
- const Timezone& writerTimezone);
-
- virtual ~StripeStreamsImpl() override;
-
- virtual const std::vector<bool> getSelectedColumns() const override;
-
- virtual proto::ColumnEncoding getEncoding(uint64_t columnId
- ) const override;
-
- virtual std::unique_ptr<SeekableInputStream>
- getStream(uint64_t columnId,
- proto::Stream_Kind kind,
- bool shouldStream) const override;
-
- MemoryPool& getMemoryPool() const override;
-
- const Timezone& getWriterTimezone() const override;
-
- std::ostream* getErrorStream() const override;
-
- bool getThrowOnHive11DecimalOverflow() const override;
-
- int32_t getForcedScaleOnHive11Decimal() const override;
- };
-
- /**
- * StreamInformation Implementation
- */
-
- class StreamInformationImpl: public StreamInformation {
- private:
- StreamKind kind;
- uint64_t column;
- uint64_t offset;
- uint64_t length;
- public:
- StreamInformationImpl(uint64_t _offset,
- const proto::Stream& stream
- ): kind(static_cast<StreamKind>(stream.kind())),
- column(stream.column()),
- offset(_offset),
- length(stream.length()) {
- // PASS
- }
-
- ~StreamInformationImpl() override;
-
- StreamKind getKind() const override {
- return kind;
- }
-
- uint64_t getColumnId() const override {
- return column;
- }
-
- uint64_t getOffset() const override {
- return offset;
- }
-
- uint64_t getLength() const override {
- return length;
- }
- };
-
- /**
- * StripeInformation Implementation
- */
-
- class StripeInformationImpl : public StripeInformation {
- uint64_t offset;
- uint64_t indexLength;
- uint64_t dataLength;
- uint64_t footerLength;
- uint64_t numRows;
- InputStream* stream;
- MemoryPool& memory;
- CompressionKind compression;
- uint64_t blockSize;
- mutable std::unique_ptr<proto::StripeFooter> stripeFooter;
- void ensureStripeFooterLoaded() const;
- public:
-
- StripeInformationImpl(uint64_t _offset,
- uint64_t _indexLength,
- uint64_t _dataLength,
- uint64_t _footerLength,
- uint64_t _numRows,
- InputStream* _stream,
- MemoryPool& _memory,
- CompressionKind _compression,
- uint64_t _blockSize
- ) : offset(_offset),
- indexLength(_indexLength),
- dataLength(_dataLength),
- footerLength(_footerLength),
- numRows(_numRows),
- stream(_stream),
- memory(_memory),
- compression(_compression),
- blockSize(_blockSize) {
- // PASS
- }
-
- virtual ~StripeInformationImpl() override {
- // PASS
- }
-
- uint64_t getOffset() const override {
- return offset;
- }
-
- uint64_t getLength() const override {
- return indexLength + dataLength + footerLength;
- }
- uint64_t getIndexLength() const override {
- return indexLength;
- }
-
- uint64_t getDataLength()const override {
- return dataLength;
- }
-
- uint64_t getFooterLength() const override {
- return footerLength;
- }
-
- uint64_t getNumberOfRows() const override {
- return numRows;
- }
-
- uint64_t getNumberOfStreams() const override {
- ensureStripeFooterLoaded();
- return static_cast<uint64_t>(stripeFooter->streams_size());
- }
-
- std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId
- ) const override;
-
- ColumnEncodingKind getColumnEncoding(uint64_t colId) const override {
- ensureStripeFooterLoaded();
- return static_cast<ColumnEncodingKind>(stripeFooter->
- columns(static_cast<int>(colId))
- .kind());
- }
-
- uint64_t getDictionarySize(uint64_t colId) const override {
- ensureStripeFooterLoaded();
-      return static_cast<uint64_t>(stripeFooter->
- columns(static_cast<int>(colId))
- .dictionarysize());
- }
-
- const std::string& getWriterTimezone() const override {
- ensureStripeFooterLoaded();
- return stripeFooter->writertimezone();
- }
- };
-
-}
-
-#endif
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_STRIPE_STREAM_HH
+#define ORC_STRIPE_STREAM_HH
+
+#include "orc/Int128.hh"
+#include "orc/OrcFile.hh"
+#include "orc/Reader.hh"
+
+#include "Timezone.hh"
+#include "TypeImpl.hh"
+
+namespace orc {
+
+ class RowReaderImpl;
+
+ /**
+ * StripeStream Implementation
+ */
+
+ class StripeStreamsImpl: public StripeStreams {
+ private:
+ const RowReaderImpl& reader;
+ const proto::StripeInformation& stripeInfo;
+ const proto::StripeFooter& footer;
+ const uint64_t stripeIndex;
+ const uint64_t stripeStart;
+ InputStream& input;
+ const Timezone& writerTimezone;
+
+ public:
+ StripeStreamsImpl(const RowReaderImpl& reader, uint64_t index,
+ const proto::StripeInformation& stripeInfo,
+ const proto::StripeFooter& footer,
+ uint64_t stripeStart,
+ InputStream& input,
+ const Timezone& writerTimezone);
+
+ virtual ~StripeStreamsImpl() override;
+
+ virtual const std::vector<bool> getSelectedColumns() const override;
+
+ virtual proto::ColumnEncoding getEncoding(uint64_t columnId
+ ) const override;
+
+ virtual std::unique_ptr<SeekableInputStream>
+ getStream(uint64_t columnId,
+ proto::Stream_Kind kind,
+ bool shouldStream) const override;
+
+ MemoryPool& getMemoryPool() const override;
+
+ const Timezone& getWriterTimezone() const override;
+
+ std::ostream* getErrorStream() const override;
+
+ bool getThrowOnHive11DecimalOverflow() const override;
+
+ int32_t getForcedScaleOnHive11Decimal() const override;
+ };
+
+ /**
+ * StreamInformation Implementation
+ */
+
+ class StreamInformationImpl: public StreamInformation {
+ private:
+ StreamKind kind;
+ uint64_t column;
+ uint64_t offset;
+ uint64_t length;
+ public:
+ StreamInformationImpl(uint64_t _offset,
+ const proto::Stream& stream
+ ): kind(static_cast<StreamKind>(stream.kind())),
+ column(stream.column()),
+ offset(_offset),
+ length(stream.length()) {
+ // PASS
+ }
+
+ ~StreamInformationImpl() override;
+
+ StreamKind getKind() const override {
+ return kind;
+ }
+
+ uint64_t getColumnId() const override {
+ return column;
+ }
+
+ uint64_t getOffset() const override {
+ return offset;
+ }
+
+ uint64_t getLength() const override {
+ return length;
+ }
+ };
+
+ /**
+ * StripeInformation Implementation
+ */
+
+ class StripeInformationImpl : public StripeInformation {
+ uint64_t offset;
+ uint64_t indexLength;
+ uint64_t dataLength;
+ uint64_t footerLength;
+ uint64_t numRows;
+ InputStream* stream;
+ MemoryPool& memory;
+ CompressionKind compression;
+ uint64_t blockSize;
+ mutable std::unique_ptr<proto::StripeFooter> stripeFooter;
+ void ensureStripeFooterLoaded() const;
+ public:
+
+ StripeInformationImpl(uint64_t _offset,
+ uint64_t _indexLength,
+ uint64_t _dataLength,
+ uint64_t _footerLength,
+ uint64_t _numRows,
+ InputStream* _stream,
+ MemoryPool& _memory,
+ CompressionKind _compression,
+ uint64_t _blockSize
+ ) : offset(_offset),
+ indexLength(_indexLength),
+ dataLength(_dataLength),
+ footerLength(_footerLength),
+ numRows(_numRows),
+ stream(_stream),
+ memory(_memory),
+ compression(_compression),
+ blockSize(_blockSize) {
+ // PASS
+ }
+
+ virtual ~StripeInformationImpl() override {
+ // PASS
+ }
+
+ uint64_t getOffset() const override {
+ return offset;
+ }
+
+ uint64_t getLength() const override {
+ return indexLength + dataLength + footerLength;
+ }
+ uint64_t getIndexLength() const override {
+ return indexLength;
+ }
+
+ uint64_t getDataLength()const override {
+ return dataLength;
+ }
+
+ uint64_t getFooterLength() const override {
+ return footerLength;
+ }
+
+ uint64_t getNumberOfRows() const override {
+ return numRows;
+ }
+
+ uint64_t getNumberOfStreams() const override {
+ ensureStripeFooterLoaded();
+ return static_cast<uint64_t>(stripeFooter->streams_size());
+ }
+
+ std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId
+ ) const override;
+
+ ColumnEncodingKind getColumnEncoding(uint64_t colId) const override {
+ ensureStripeFooterLoaded();
+ return static_cast<ColumnEncodingKind>(stripeFooter->
+ columns(static_cast<int>(colId))
+ .kind());
+ }
+
+ uint64_t getDictionarySize(uint64_t colId) const override {
+ ensureStripeFooterLoaded();
+ return static_cast<ColumnEncodingKind>(stripeFooter->
+ columns(static_cast<int>(colId))
+ .dictionarysize());
+ }
+
+ const std::string& getWriterTimezone() const override {
+ ensureStripeFooterLoaded();
+ return stripeFooter->writertimezone();
+ }
+ };
+
+}
+
+#endif
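
StripeInformationImpl above parses its stripe footer lazily: the stripeFooter member is mutable so that const accessors such as getNumberOfStreams() can call ensureStripeFooterLoaded() and deserialize the footer on first use. A minimal sketch of that pattern, with a std::string standing in for proto::StripeFooter and a hypothetical load() helper:

    #include <memory>
    #include <string>

    // Lazy-loading pattern used by StripeInformationImpl: the cached footer is
    // 'mutable' so const accessors can trigger the first (and only) parse.
    class LazyFooterSketch {
      mutable std::unique_ptr<std::string> footer;        // stands in for proto::StripeFooter
      std::string load() const { return "parsed footer"; } // hypothetical parse step
    public:
      const std::string& get() const {
        if (footer == nullptr) {                           // parse exactly once, on demand
          footer.reset(new std::string(load()));
        }
        return *footer;
      }
    };
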
diff --git a/contrib/libs/apache/orc/c++/src/Timezone.cc b/contrib/libs/apache/orc/c++/src/Timezone.cc
index 318e5bcc12..0aa66ef71c 100644
--- a/contrib/libs/apache/orc/c++/src/Timezone.cc
+++ b/contrib/libs/apache/orc/c++/src/Timezone.cc
@@ -1,936 +1,936 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "orc/OrcFile.hh"
-#include "Timezone.hh"
-
-#include <errno.h>
-#include <map>
-#include <sstream>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-
-namespace orc {
-
- // default location of the timezone files
- static const char DEFAULT_TZDIR[] = "/usr/share/zoneinfo";
-
- // location of a symlink to the local timezone
- static const char LOCAL_TIMEZONE[] = "/etc/localtime";
-
- enum TransitionKind {
- TRANSITION_JULIAN,
- TRANSITION_DAY,
- TRANSITION_MONTH
- };
-
- static const int64_t MONTHS_PER_YEAR = 12;
- /**
- * The number of days in each month in non-leap and leap years.
- */
- static const int64_t DAYS_PER_MONTH[2][MONTHS_PER_YEAR] =
- {{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
- {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};
- static const int64_t DAYS_PER_WEEK = 7;
-
- // Leap years and day of the week repeat every 400 years, which makes it
- // a good cycle length.
- static const int64_t SECONDS_PER_400_YEARS =
- SECONDS_PER_DAY * (365 * (300 + 3) + 366 * (100 - 3));
-
- /**
- * Is the given year a leap year?
- */
- bool isLeap(int64_t year) {
- return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0));
- }
-
- /**
- * Find the position that is the closest and less than or equal to the
- * target.
- * @return -1 if the target < array[0] or array is empty or
- * i if array[i] <= target and (i == n or array[i] < array[i+1])
- */
- int64_t binarySearch(const std::vector<int64_t> &array, int64_t target) {
- uint64_t size = array.size();
- if (size == 0) {
- return -1;
- }
- uint64_t min = 0;
- uint64_t max = size - 1;
- uint64_t mid = (min + max) / 2;
- while ((array[mid] != target) && (min < max)) {
- if (array[mid] < target) {
- min = mid + 1;
- } else if (mid == 0) {
- max = 0;
- } else {
- max = mid - 1;
- }
- mid = (min + max) / 2;
- }
- if (target < array[mid]) {
- return static_cast<int64_t>(mid) - 1;
- } else {
- return static_cast<int64_t>(mid);
- }
- }
-
- struct Transition {
- TransitionKind kind;
- int64_t day;
- int64_t week;
- int64_t month;
- int64_t time;
-
- std::string toString() const {
- std::stringstream buffer;
- switch (kind) {
- case TRANSITION_JULIAN:
- buffer << "julian " << day;
- break;
- case TRANSITION_DAY:
- buffer << "day " << day;
- break;
- case TRANSITION_MONTH:
- buffer << "month " << month << " week " << week << " day " << day;
- break;
- }
- buffer << " at " << (time / (60 * 60)) << ":" << ((time / 60) % 60)
- << ":" << (time % 60);
- return buffer.str();
- }
-
- /**
- * Get the transition time for the given year.
- * @param year the year
- * @return the number of seconds past local Jan 1 00:00:00 that the
- * transition happens.
- */
- int64_t getTime(int64_t year) const {
- int64_t result = time;
- switch (kind) {
- case TRANSITION_JULIAN:
- result += SECONDS_PER_DAY * day;
- if (day > 60 && isLeap(year)) {
- result += SECONDS_PER_DAY;
- }
- break;
- case TRANSITION_DAY:
- result += SECONDS_PER_DAY * day;
- break;
- case TRANSITION_MONTH: {
- bool inLeap = isLeap(year);
- int64_t adjustedMonth = (month + 9) % 12 + 1;
- int64_t adjustedYear = (month <= 2) ? (year - 1) : year;
- int64_t adjustedCentury = adjustedYear / 100;
- int64_t adjustedRemainder = adjustedYear % 100;
-
- // day of the week of the first day of month
- int64_t dayOfWeek = ((26 * adjustedMonth - 2) / 10 +
- 1 + adjustedRemainder + adjustedRemainder / 4 +
- adjustedCentury / 4 - 2 * adjustedCentury) % 7;
- if (dayOfWeek < 0) {
- dayOfWeek += DAYS_PER_WEEK;
- }
-
- int64_t d = day - dayOfWeek;
- if (d < 0) {
- d += DAYS_PER_WEEK;
- }
- for (int w = 1; w < week; ++w) {
- if (d + DAYS_PER_WEEK >= DAYS_PER_MONTH[inLeap][month - 1]) {
- break;
- }
- d += DAYS_PER_WEEK;
- }
- result += d * SECONDS_PER_DAY;
-
- // Add in the time for the month
- for(int m=0; m < month - 1; ++m) {
- result += DAYS_PER_MONTH[inLeap][m] * SECONDS_PER_DAY;
- }
- break;
- }
- }
- return result;
- }
- };
-
- /**
- * The current rule for finding timezone variants arbitrarily far in
- * the future. They are based on a string representation that
- * specifies the standard name and offset. For timezones with
- * daylight savings, the string specifies the daylight variant name
- * and offset and the rules for switching between them.
- *
- * rule = <standard name><standard offset><daylight>?
- * name = string with no numbers or '+', '-', or ','
- * offset = [-+]?hh(:mm(:ss)?)?
- * daylight = <name><offset>,<start day>(/<offset>)?,<end day>(/<offset>)?
- * day = J<day without 2/29>|<day with 2/29>|M<month>.<week>.<day of week>
- */
- class FutureRuleImpl: public FutureRule {
- std::string ruleString;
- TimezoneVariant standard;
- bool hasDst;
- TimezoneVariant dst;
- Transition start;
- Transition end;
-
- // expanded time_t offsets of transitions
- std::vector<int64_t> offsets;
-
- // Is the epoch (1 Jan 1970 00:00) in standard time?
- // This code assumes that the transition dates fall in the same order
- // each year. Hopefully no timezone regions decide to move across the
- // equator, which is about what it would take.
- bool startInStd;
-
- void computeOffsets() {
- if (!hasDst) {
- startInStd = true;
- offsets.resize(1);
- } else {
-        // Insert a transition for the epoch and two per year for the next
-        // 400 years. We assume that all even positions are in standard
- // time if and only if startInStd and the odd ones are the reverse.
- offsets.resize(400 * 2 + 1);
- startInStd = start.getTime(1970) < end.getTime(1970);
- int64_t base = 0;
- for(int64_t year = 1970; year < 1970 + 400; ++year) {
- if (startInStd) {
- offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] =
- base + start.getTime(year) - standard.gmtOffset;
- offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] =
- base + end.getTime(year) - dst.gmtOffset;
- } else {
- offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] =
- base + end.getTime(year) - dst.gmtOffset;
- offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] =
- base + start.getTime(year) - standard.gmtOffset;
- }
- base += (isLeap(year) ? 366 : 365) * SECONDS_PER_DAY;
- }
- }
- offsets[0] = 0;
- }
-
- public:
- virtual ~FutureRuleImpl() override;
- bool isDefined() const override;
- const TimezoneVariant& getVariant(int64_t clk) const override;
- void print(std::ostream& out) const override;
-
- friend class FutureRuleParser;
- };
-
- FutureRule::~FutureRule() {
- // PASS
- }
-
- FutureRuleImpl::~FutureRuleImpl() {
- // PASS
- }
-
- bool FutureRuleImpl::isDefined() const {
- return ruleString.size() > 0;
- }
-
- const TimezoneVariant& FutureRuleImpl::getVariant(int64_t clk) const {
- if (!hasDst) {
- return standard;
- } else {
- int64_t adjusted = clk % SECONDS_PER_400_YEARS;
- if (adjusted < 0) {
- adjusted += SECONDS_PER_400_YEARS;
- }
- int64_t idx = binarySearch(offsets, adjusted);
- if (startInStd == (idx % 2 == 0)) {
- return standard;
- } else {
- return dst;
- }
- }
- }
-
- void FutureRuleImpl::print(std::ostream& out) const {
- if (isDefined()) {
- out << " Future rule: " << ruleString << "\n";
- out << " standard " << standard.toString() << "\n";
- if (hasDst) {
- out << " dst " << dst.toString() << "\n";
- out << " start " << start.toString() << "\n";
- out << " end " << end.toString() << "\n";
- }
- }
- }
-
- /**
- * A parser for the future rule strings.
- */
- class FutureRuleParser {
- public:
- FutureRuleParser(const std::string& str,
- FutureRuleImpl* rule
- ): ruleString(str),
- length(str.size()),
- position(0),
- output(*rule) {
- output.ruleString = str;
- if (position != length) {
- parseName(output.standard.name);
- output.standard.gmtOffset = -parseOffset();
- output.standard.isDst = false;
- output.hasDst = position < length;
- if (output.hasDst) {
- parseName(output.dst.name);
- output.dst.isDst = true;
- if (ruleString[position] != ',') {
- output.dst.gmtOffset = -parseOffset();
- } else {
- output.dst.gmtOffset = output.standard.gmtOffset + 60 * 60;
- }
- parseTransition(output.start);
- parseTransition(output.end);
- }
- if (position != length) {
- throwError("Extra text");
- }
- output.computeOffsets();
- }
- }
-
- private:
-
- const std::string& ruleString;
- size_t length;
- size_t position;
- FutureRuleImpl &output;
-
- void throwError(const char *msg) {
- std::stringstream buffer;
- buffer << msg << " at " << position << " in '" << ruleString << "'";
- throw TimezoneError(buffer.str());
- }
-
- /**
- * Parse the names of the form:
- * ([^-+0-9,]+|<[^>]+>)
- * and set the output string.
- */
- void parseName(std::string& result) {
- if (position == length) {
- throwError("name required");
- }
- size_t start = position;
- if (ruleString[position] == '<') {
- while (position < length && ruleString[position] != '>') {
- position += 1;
- }
- if (position == length) {
- throwError("missing close '>'");
- }
-        position += 1;
- } else {
- while (position < length) {
- char ch = ruleString[position];
- if (isdigit(ch) || ch == '-' || ch == '+' || ch == ',') {
- break;
- }
- position += 1;
- }
- }
- if (position == start) {
- throwError("empty string not allowed");
- }
- result = ruleString.substr(start, position - start);
- }
-
- /**
- * Parse an integer of the form [0-9]+ and return it.
- */
- int64_t parseNumber() {
- if (position >= length) {
- throwError("missing number");
- }
- int64_t result = 0;
- while (position < length) {
- char ch = ruleString[position];
- if (isdigit(ch)) {
- result = result * 10 + (ch - '0');
- position += 1;
- } else {
- break;
- }
- }
- return result;
- }
-
- /**
- * Parse the offsets of the form:
- * [-+]?[0-9]+(:[0-9]+(:[0-9]+)?)?
- * and convert it into a number of seconds.
- */
- int64_t parseOffset() {
- int64_t scale = 3600;
- bool isNegative = false;
- if (position < length) {
- char ch = ruleString[position];
- isNegative = ch == '-';
- if (ch == '-' || ch == '+') {
- position += 1;
- }
- }
- int64_t result = parseNumber() * scale;
- while (position < length && scale > 1 && ruleString[position] == ':') {
- scale /= 60;
- position += 1;
- result += parseNumber() * scale;
- }
- if (isNegative) {
- result = -result;
- }
- return result;
- }
-
- /**
- * Parse a transition of the following form:
- * ,(J<number>|<number>|M<number>.<number>.<number>)(/<offset>)?
- */
- void parseTransition(Transition& transition) {
- if (length - position < 2 || ruleString[position] != ',') {
- throwError("missing transition");
- }
- position += 1;
- char ch = ruleString[position];
- if (ch == 'J') {
- transition.kind = TRANSITION_JULIAN;
- position += 1;
- transition.day = parseNumber();
- } else if (ch == 'M') {
- transition.kind = TRANSITION_MONTH;
- position += 1;
- transition.month = parseNumber();
- if (position == length || ruleString[position] != '.') {
- throwError("missing first .");
- }
- position += 1;
- transition.week = parseNumber();
- if (position == length || ruleString[position] != '.') {
- throwError("missing second .");
- }
- position += 1;
- transition.day = parseNumber();
- } else {
- transition.kind = TRANSITION_DAY;
- transition.day = parseNumber();
- }
- if (position < length && ruleString[position] == '/') {
- position += 1;
- transition.time = parseOffset();
- } else {
- transition.time = 2 * 60 * 60;
- }
- }
- };
-
- /**
- * Parse the POSIX TZ string.
- */
- std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString) {
- std::shared_ptr<FutureRule> result(new FutureRuleImpl());
- FutureRuleParser parser(ruleString,
- dynamic_cast<FutureRuleImpl*>(result.get()));
- return result;
- }
-
- std::string TimezoneVariant::toString() const {
- std::stringstream buffer;
- buffer << name << " " << gmtOffset;
- if (isDst) {
- buffer << " (dst)";
- }
- return buffer.str();
- }
-
- /**
- * An abstraction of the differences between versions.
- */
- class VersionParser {
- public:
- virtual ~VersionParser();
-
- /**
- * Get the version number.
- */
- virtual uint64_t getVersion() const = 0;
-
- /**
-     * Get the number of bytes used to store each transition time.
- */
- virtual uint64_t getTimeSize() const = 0;
-
- /**
- * Parse the time at the given location.
- */
- virtual int64_t parseTime(const unsigned char* ptr) const = 0;
-
- /**
-     * Parse the trailing POSIX TZ (future rule) string, if any.
- */
- virtual std::string parseFutureString(const unsigned char *ptr,
- uint64_t offset,
- uint64_t length) const = 0;
- };
-
- VersionParser::~VersionParser() {
- // PASS
- }
-
- static uint32_t decode32(const unsigned char* ptr) {
- return static_cast<uint32_t>(ptr[0] << 24) |
- static_cast<uint32_t>(ptr[1] << 16) |
- static_cast<uint32_t>(ptr[2] << 8) |
- static_cast<uint32_t>(ptr[3]);
- }
-
- class Version1Parser: public VersionParser {
- public:
- virtual ~Version1Parser() override;
-
- virtual uint64_t getVersion() const override {
- return 1;
- }
-
- /**
-     * Get the number of bytes used to store each transition time.
- */
- virtual uint64_t getTimeSize() const override {
- return 4;
- }
-
- /**
- * Parse the time at the given location.
- */
- virtual int64_t parseTime(const unsigned char* ptr) const override {
- // sign extend from 32 bits
- return static_cast<int32_t>(decode32(ptr));
- }
-
- virtual std::string parseFutureString(const unsigned char *,
- uint64_t,
- uint64_t) const override {
- return "";
- }
- };
-
- Version1Parser::~Version1Parser() {
- // PASS
- }
-
- class Version2Parser: public VersionParser {
- public:
- virtual ~Version2Parser() override;
-
- virtual uint64_t getVersion() const override {
- return 2;
- }
-
- /**
-     * Get the number of bytes used to store each transition time.
- */
- virtual uint64_t getTimeSize() const override {
- return 8;
- }
-
- /**
- * Parse the time at the given location.
- */
- virtual int64_t parseTime(const unsigned char* ptr) const override {
- return static_cast<int64_t>(decode32(ptr)) << 32 | decode32(ptr + 4);
- }
-
- virtual std::string parseFutureString(const unsigned char *ptr,
- uint64_t offset,
- uint64_t length) const override {
- return std::string(reinterpret_cast<const char*>(ptr) + offset + 1,
- length - 2);
- }
- };
-
- Version2Parser::~Version2Parser() {
- // PASS
- }
-
- class TimezoneImpl: public Timezone {
- public:
- TimezoneImpl(const std::string& name,
- const std::vector<unsigned char> bytes);
- virtual ~TimezoneImpl() override;
-
- /**
- * Get the variant for the given time (time_t).
- */
- const TimezoneVariant& getVariant(int64_t clk) const override;
-
- void print(std::ostream&) const override;
-
- uint64_t getVersion() const override {
- return version;
- }
-
- int64_t getEpoch() const override {
- return epoch;
- }
-
- int64_t convertToUTC(int64_t clk) const override {
- return clk + getVariant(clk).gmtOffset;
- }
-
- private:
- void parseTimeVariants(const unsigned char* ptr,
- uint64_t variantOffset,
- uint64_t variantCount,
- uint64_t nameOffset,
- uint64_t nameCount);
- void parseZoneFile(const unsigned char* ptr,
- uint64_t sectionOffset,
- uint64_t fileLength,
- const VersionParser& version);
- // filename
- std::string filename;
-
- // the version of the file
- uint64_t version;
-
- // the list of variants for this timezone
- std::vector<TimezoneVariant> variants;
-
- // the list of the times where the local rules change
- std::vector<int64_t> transitions;
-
- // the variant that starts at this transition.
- std::vector<uint64_t> currentVariant;
-
- // the variant before the first transition
- uint64_t ancientVariant;
-
- // the rule for future times
- std::shared_ptr<FutureRule> futureRule;
-
- // the last explicit transition after which we use the future rule
- int64_t lastTransition;
-
- // The ORC epoch time in this timezone.
- int64_t epoch;
- };
-
- DIAGNOSTIC_PUSH
- #ifdef __clang__
- DIAGNOSTIC_IGNORE("-Wglobal-constructors")
- DIAGNOSTIC_IGNORE("-Wexit-time-destructors")
- #endif
- static std::mutex timezone_mutex;
- static std::map<std::string, std::shared_ptr<Timezone> > timezoneCache;
- DIAGNOSTIC_POP
-
- Timezone::~Timezone() {
- // PASS
- }
-
- TimezoneImpl::TimezoneImpl(const std::string& _filename,
- const std::vector<unsigned char> buffer
- ): filename(_filename) {
- parseZoneFile(&buffer[0], 0, buffer.size(), Version1Parser());
- // Build the literal for the ORC epoch
- // 2015 Jan 1 00:00:00
- tm epochStruct;
- epochStruct.tm_sec = 0;
- epochStruct.tm_min = 0;
- epochStruct.tm_hour = 0;
- epochStruct.tm_mday = 1;
- epochStruct.tm_mon = 0;
- epochStruct.tm_year = 2015 - 1900;
- epochStruct.tm_isdst = 0;
- time_t utcEpoch = timegm(&epochStruct);
- epoch = utcEpoch - getVariant(utcEpoch).gmtOffset;
- }
-
- const char* getTimezoneDirectory() {
- const char *dir = getenv("TZDIR");
- if (!dir) {
- dir = DEFAULT_TZDIR;
- }
- return dir;
- }
-
- /**
- * Get a timezone by absolute filename.
- * Results are cached.
- */
- const Timezone& getTimezoneByFilename(const std::string& filename) {
- // ORC-110
- std::lock_guard<std::mutex> timezone_lock(timezone_mutex);
- std::map<std::string, std::shared_ptr<Timezone> >::iterator itr =
- timezoneCache.find(filename);
- if (itr != timezoneCache.end()) {
- return *(itr->second).get();
- }
- try {
- ORC_UNIQUE_PTR<InputStream> file = readFile(filename);
- size_t size = static_cast<size_t>(file->getLength());
- std::vector<unsigned char> buffer(size);
- file->read(&buffer[0], size, 0);
- timezoneCache[filename] = std::shared_ptr<Timezone>(new TimezoneImpl(filename, buffer));
- } catch(ParseError& err) {
- throw TimezoneError(err.what());
- }
- return *timezoneCache[filename].get();
- }
-
- /**
- * Get the local timezone.
- */
- const Timezone& getLocalTimezone() {
-#ifdef _MSC_VER
- return getTimezoneByName("UTC");
-#else
- return getTimezoneByFilename(LOCAL_TIMEZONE);
-#endif
- }
-
- /**
- * Get a timezone by name (eg. America/Los_Angeles).
- * Results are cached.
- */
- const Timezone& getTimezoneByName(const std::string& zone) {
- std::string filename(getTimezoneDirectory());
- filename += "/";
- filename += zone;
- return getTimezoneByFilename(filename);
- }
-
- /**
- * Parse a set of bytes as a timezone file as if they came from filename.
- */
- std::unique_ptr<Timezone> getTimezone(const std::string& filename,
- const std::vector<unsigned char>& b){
- return std::unique_ptr<Timezone>(new TimezoneImpl(filename, b));
- }
-
- TimezoneImpl::~TimezoneImpl() {
- // PASS
- }
-
- void TimezoneImpl::parseTimeVariants(const unsigned char* ptr,
- uint64_t variantOffset,
- uint64_t variantCount,
- uint64_t nameOffset,
- uint64_t nameCount) {
- for(uint64_t variant=0; variant < variantCount; ++variant) {
- variants[variant].gmtOffset =
- static_cast<int32_t>(decode32(ptr + variantOffset + 6 * variant));
- variants[variant].isDst = ptr[variantOffset + 6 * variant + 4] != 0;
- uint64_t nameStart = ptr[variantOffset + 6 * variant + 5];
- if (nameStart >= nameCount) {
- std::stringstream buffer;
- buffer << "name out of range in variant " << variant
- << " - " << nameStart << " >= " << nameCount;
- throw TimezoneError(buffer.str());
- }
- variants[variant].name = std::string(reinterpret_cast<const char*>(ptr)
- + nameOffset + nameStart);
- }
- }
-
- /**
- * Parse the zone file to get the bits we need.
- * There are two versions of the timezone file:
- *
- * Version 1(version = 0x00):
- * Magic(version)
- * Header
- * TransitionTimes(4 byte)
- * TransitionRules
- * Rules
- * LeapSeconds(4 byte)
- * IsStd
- * IsGmt
- *
- * Version2:
- * Version1(0x32) = a version 1 copy of the data for old clients
- * Magic(0x32)
- * Header
- * TransitionTimes(8 byte)
- * TransitionRules
- * Rules
- * LeapSeconds(8 byte)
- * IsStd
- * IsGmt
- * FutureString
- */
- void TimezoneImpl::parseZoneFile(const unsigned char *ptr,
- uint64_t sectionOffset,
- uint64_t fileLength,
- const VersionParser& versionParser) {
- const uint64_t magicOffset = sectionOffset + 0;
- const uint64_t headerOffset = magicOffset + 20;
-
- // check for validity before we start parsing
- if (fileLength < headerOffset + 6 * 4 ||
- strncmp(reinterpret_cast<const char*>(ptr) + magicOffset, "TZif", 4)
- != 0) {
- std::stringstream buffer;
- buffer << "non-tzfile " << filename;
- throw TimezoneError(buffer.str());
- }
-
- const uint64_t isGmtCount = decode32(ptr + headerOffset + 0);
- const uint64_t isStdCount = decode32(ptr + headerOffset + 4);
- const uint64_t leapCount = decode32(ptr + headerOffset + 8);
- const uint64_t timeCount = decode32(ptr + headerOffset + 12);
- const uint64_t variantCount = decode32(ptr + headerOffset + 16);
- const uint64_t nameCount = decode32(ptr + headerOffset + 20);
-
- const uint64_t timeOffset = headerOffset + 24;
- const uint64_t timeVariantOffset =
- timeOffset + versionParser.getTimeSize() * timeCount;
- const uint64_t variantOffset = timeVariantOffset + timeCount;
- const uint64_t nameOffset = variantOffset + variantCount * 6;
- const uint64_t sectionLength = nameOffset + nameCount
- + (versionParser.getTimeSize() + 4) * leapCount
- + isGmtCount + isStdCount;
-
- if (sectionLength > fileLength) {
- std::stringstream buffer;
- buffer << "tzfile too short " << filename
- << " needs " << sectionLength << " and has " << fileLength;
- throw TimezoneError(buffer.str());
- }
-
- // if it is version 2, skip over the old layout and read the new one.
- if (sectionOffset == 0 && ptr[magicOffset + 4] != 0) {
- parseZoneFile(ptr, sectionLength, fileLength, Version2Parser());
- return;
- }
- version = versionParser.getVersion();
- variants.resize(variantCount);
- transitions.resize(timeCount);
- currentVariant.resize(timeCount);
- parseTimeVariants(ptr, variantOffset, variantCount, nameOffset,
- nameCount);
- bool foundAncient = false;
- for(uint64_t t=0; t < timeCount; ++t) {
- transitions[t] =
- versionParser.parseTime(ptr + timeOffset +
- t * versionParser.getTimeSize());
- currentVariant[t] = ptr[timeVariantOffset + t];
- if (currentVariant[t] >= variantCount) {
- std::stringstream buffer;
- buffer << "tzfile rule out of range " << filename
- << " references rule " << currentVariant[t]
- << " of " << variantCount;
- throw TimezoneError(buffer.str());
- }
- // find the oldest standard time and use that as the ancient value
- if (!foundAncient &&
- !variants[currentVariant[t]].isDst) {
- foundAncient = true;
- ancientVariant = currentVariant[t];
- }
- }
- if (!foundAncient) {
- ancientVariant = 0;
- }
- futureRule = parseFutureRule(versionParser.parseFutureString
- (ptr, sectionLength,
- fileLength - sectionLength));
-
- // find the lower bound for applying the future rule
- if (futureRule->isDefined()) {
- if (timeCount > 0) {
- lastTransition = transitions[timeCount - 1];
- } else {
- lastTransition = INT64_MIN;
- }
- } else {
- lastTransition = INT64_MAX;
- }
- }
-
- const TimezoneVariant& TimezoneImpl::getVariant(int64_t clk) const {
- // if it is after the last explicit entry in the table,
- // use the future rule to get an answer
- if (clk > lastTransition) {
- return futureRule->getVariant(clk);
- } else {
- int64_t transition = binarySearch(transitions, clk);
- uint64_t idx;
- if (transition < 0) {
- idx = ancientVariant;
- } else {
- idx = currentVariant[static_cast<size_t>(transition)];
- }
- return variants[idx];
- }
- }
-
- void TimezoneImpl::print(std::ostream& out) const {
- out << "Timezone file: " << filename << "\n";
- out << " Version: " << version << "\n";
- futureRule->print(out);
- for(uint64_t r=0; r < variants.size(); ++r) {
- out << " Variant " << r << ": "
- << variants[r].toString() << "\n";
- }
- for(uint64_t t=0; t < transitions.size(); ++t) {
- tm timeStruct;
- tm* result = nullptr;
- char buffer[25];
- if (sizeof(time_t) >= 8) {
- time_t val = transitions[t];
- result = gmtime_r(&val, &timeStruct);
- if (result) {
- strftime(buffer, sizeof(buffer), "%F %H:%M:%S", &timeStruct);
- }
- }
-      out << "  Transition: " << (result == nullptr ? "null" : buffer)
-          << " (" << transitions[t] << ") -> "
-          << variants[currentVariant[t]].name
-          << "\n";
- }
- }
-
- TimezoneError::TimezoneError(const std::string& what
- ): std::runtime_error(what) {
- // PASS
- }
-
- TimezoneError::TimezoneError(const TimezoneError& other
- ): std::runtime_error(other) {
- // PASS
- }
-
- TimezoneError::~TimezoneError() ORC_NOEXCEPT {
- // PASS
- }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/OrcFile.hh"
+#include "Timezone.hh"
+
+#include <errno.h>
+#include <map>
+#include <sstream>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+namespace orc {
+
+ // default location of the timezone files
+ static const char DEFAULT_TZDIR[] = "/usr/share/zoneinfo";
+
+ // location of a symlink to the local timezone
+ static const char LOCAL_TIMEZONE[] = "/etc/localtime";
+
+ enum TransitionKind {
+ TRANSITION_JULIAN,
+ TRANSITION_DAY,
+ TRANSITION_MONTH
+ };
+
+ static const int64_t MONTHS_PER_YEAR = 12;
+ /**
+ * The number of days in each month in non-leap and leap years.
+ */
+ static const int64_t DAYS_PER_MONTH[2][MONTHS_PER_YEAR] =
+ {{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
+ {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};
+ static const int64_t DAYS_PER_WEEK = 7;
+
+ // Leap years and day of the week repeat every 400 years, which makes it
+ // a good cycle length.
+ static const int64_t SECONDS_PER_400_YEARS =
+ SECONDS_PER_DAY * (365 * (300 + 3) + 366 * (100 - 3));
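+  // For example: 400 Gregorian years contain 97 leap years, so the cycle is
+  // 303 * 365 + 97 * 366 = 146097 days, i.e. 12,622,780,800 seconds.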
+
+ /**
+ * Is the given year a leap year?
+ */
+ bool isLeap(int64_t year) {
+ return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0));
+ }
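+  // For example, isLeap(2016) and isLeap(2000) are true, while isLeap(1900)
+  // and isLeap(2015) are false.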
+
+ /**
+ * Find the position that is the closest and less than or equal to the
+ * target.
+ * @return -1 if the target < array[0] or array is empty or
+ * i if array[i] <= target and (i == n or array[i] < array[i+1])
+ */
+ int64_t binarySearch(const std::vector<int64_t> &array, int64_t target) {
+ uint64_t size = array.size();
+ if (size == 0) {
+ return -1;
+ }
+ uint64_t min = 0;
+ uint64_t max = size - 1;
+ uint64_t mid = (min + max) / 2;
+ while ((array[mid] != target) && (min < max)) {
+ if (array[mid] < target) {
+ min = mid + 1;
+ } else if (mid == 0) {
+ max = 0;
+ } else {
+ max = mid - 1;
+ }
+ mid = (min + max) / 2;
+ }
+ if (target < array[mid]) {
+ return static_cast<int64_t>(mid) - 1;
+ } else {
+ return static_cast<int64_t>(mid);
+ }
+ }
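+  // Illustrative behaviour: for array = {10, 20, 30}, binarySearch returns
+  // -1 for target 5, 0 for target 10, 1 for target 25, and 2 for target 35.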
+
+ struct Transition {
+ TransitionKind kind;
+ int64_t day;
+ int64_t week;
+ int64_t month;
+ int64_t time;
+
+ std::string toString() const {
+ std::stringstream buffer;
+ switch (kind) {
+ case TRANSITION_JULIAN:
+ buffer << "julian " << day;
+ break;
+ case TRANSITION_DAY:
+ buffer << "day " << day;
+ break;
+ case TRANSITION_MONTH:
+ buffer << "month " << month << " week " << week << " day " << day;
+ break;
+ }
+ buffer << " at " << (time / (60 * 60)) << ":" << ((time / 60) % 60)
+ << ":" << (time % 60);
+ return buffer.str();
+ }
+
+ /**
+ * Get the transition time for the given year.
+ * @param year the year
+ * @return the number of seconds past local Jan 1 00:00:00 that the
+ * transition happens.
+ */
+ int64_t getTime(int64_t year) const {
+ int64_t result = time;
+ switch (kind) {
+ case TRANSITION_JULIAN:
+ result += SECONDS_PER_DAY * day;
+ if (day > 60 && isLeap(year)) {
+ result += SECONDS_PER_DAY;
+ }
+ break;
+ case TRANSITION_DAY:
+ result += SECONDS_PER_DAY * day;
+ break;
+ case TRANSITION_MONTH: {
+ bool inLeap = isLeap(year);
+ int64_t adjustedMonth = (month + 9) % 12 + 1;
+ int64_t adjustedYear = (month <= 2) ? (year - 1) : year;
+ int64_t adjustedCentury = adjustedYear / 100;
+ int64_t adjustedRemainder = adjustedYear % 100;
+
+ // day of the week of the first day of month
+ int64_t dayOfWeek = ((26 * adjustedMonth - 2) / 10 +
+ 1 + adjustedRemainder + adjustedRemainder / 4 +
+ adjustedCentury / 4 - 2 * adjustedCentury) % 7;
+ if (dayOfWeek < 0) {
+ dayOfWeek += DAYS_PER_WEEK;
+ }
+
+ int64_t d = day - dayOfWeek;
+ if (d < 0) {
+ d += DAYS_PER_WEEK;
+ }
+ for (int w = 1; w < week; ++w) {
+ if (d + DAYS_PER_WEEK >= DAYS_PER_MONTH[inLeap][month - 1]) {
+ break;
+ }
+ d += DAYS_PER_WEEK;
+ }
+ result += d * SECONDS_PER_DAY;
+
+ // Add in the time for the month
+ for(int m=0; m < month - 1; ++m) {
+ result += DAYS_PER_MONTH[inLeap][m] * SECONDS_PER_DAY;
+ }
+ break;
+ }
+ }
+ return result;
+ }
+ };
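+  // Worked example: for the transition "M3.2.0/2" (second Sunday of March at
+  // 02:00), getTime(2015) is 59 days for January and February, plus 7 days to
+  // reach 8 March 2015, plus 2 hours: 66 * 86400 + 7200 = 5709600 seconds
+  // past local 1 Jan 2015 00:00.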
+
+ /**
+ * The current rule for finding timezone variants arbitrarily far in
+ * the future. They are based on a string representation that
+ * specifies the standard name and offset. For timezones with
+ * daylight savings, the string specifies the daylight variant name
+ * and offset and the rules for switching between them.
+ *
+ * rule = <standard name><standard offset><daylight>?
+ * name = string with no numbers or '+', '-', or ','
+ * offset = [-+]?hh(:mm(:ss)?)?
+ * daylight = <name><offset>,<start day>(/<offset>)?,<end day>(/<offset>)?
+ * day = J<day without 2/29>|<day with 2/29>|M<month>.<week>.<day of week>
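+   *
+   * For example, the rule "PST8PDT,M3.2.0,M11.1.0" describes US Pacific
+   * time: standard name PST at offset 8 (UTC-8), daylight name PDT whose
+   * offset defaults to one hour ahead of standard, with daylight time
+   * starting the second Sunday of March and ending the first Sunday of
+   * November, both at the default 02:00 local time.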
+ */
+ class FutureRuleImpl: public FutureRule {
+ std::string ruleString;
+ TimezoneVariant standard;
+ bool hasDst;
+ TimezoneVariant dst;
+ Transition start;
+ Transition end;
+
+ // expanded time_t offsets of transitions
+ std::vector<int64_t> offsets;
+
+ // Is the epoch (1 Jan 1970 00:00) in standard time?
+ // This code assumes that the transition dates fall in the same order
+ // each year. Hopefully no timezone regions decide to move across the
+ // equator, which is about what it would take.
+ bool startInStd;
+
+ void computeOffsets() {
+ if (!hasDst) {
+ startInStd = true;
+ offsets.resize(1);
+ } else {
+        // Insert a transition for the epoch and two per year for the next
+        // 400 years. We assume that all even positions are in standard
+        // time if and only if startInStd and the odd ones are the reverse.
+ offsets.resize(400 * 2 + 1);
+ startInStd = start.getTime(1970) < end.getTime(1970);
+ int64_t base = 0;
+ for(int64_t year = 1970; year < 1970 + 400; ++year) {
+ if (startInStd) {
+ offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] =
+ base + start.getTime(year) - standard.gmtOffset;
+ offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] =
+ base + end.getTime(year) - dst.gmtOffset;
+ } else {
+ offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] =
+ base + end.getTime(year) - dst.gmtOffset;
+ offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] =
+ base + start.getTime(year) - standard.gmtOffset;
+ }
+ base += (isLeap(year) ? 366 : 365) * SECONDS_PER_DAY;
+ }
+ }
+ offsets[0] = 0;
+ }
+
+ public:
+ virtual ~FutureRuleImpl() override;
+ bool isDefined() const override;
+ const TimezoneVariant& getVariant(int64_t clk) const override;
+ void print(std::ostream& out) const override;
+
+ friend class FutureRuleParser;
+ };
+
+ FutureRule::~FutureRule() {
+ // PASS
+ }
+
+ FutureRuleImpl::~FutureRuleImpl() {
+ // PASS
+ }
+
+ bool FutureRuleImpl::isDefined() const {
+ return ruleString.size() > 0;
+ }
+
+ const TimezoneVariant& FutureRuleImpl::getVariant(int64_t clk) const {
+ if (!hasDst) {
+ return standard;
+ } else {
+ int64_t adjusted = clk % SECONDS_PER_400_YEARS;
+ if (adjusted < 0) {
+ adjusted += SECONDS_PER_400_YEARS;
+ }
+ int64_t idx = binarySearch(offsets, adjusted);
+ if (startInStd == (idx % 2 == 0)) {
+ return standard;
+ } else {
+ return dst;
+ }
+ }
+ }
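+  // Note on the lookup above: offsets[] holds the switch instants of one
+  // 400-year cycle, measured in seconds from 1 Jan 1970 00:00 UTC, so clk is
+  // folded into that cycle and the parity of the matching index (together
+  // with startInStd) tells whether standard or daylight time applies.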
+
+ void FutureRuleImpl::print(std::ostream& out) const {
+ if (isDefined()) {
+ out << " Future rule: " << ruleString << "\n";
+ out << " standard " << standard.toString() << "\n";
+ if (hasDst) {
+ out << " dst " << dst.toString() << "\n";
+ out << " start " << start.toString() << "\n";
+ out << " end " << end.toString() << "\n";
+ }
+ }
+ }
+
+ /**
+ * A parser for the future rule strings.
+ */
+ class FutureRuleParser {
+ public:
+ FutureRuleParser(const std::string& str,
+ FutureRuleImpl* rule
+ ): ruleString(str),
+ length(str.size()),
+ position(0),
+ output(*rule) {
+ output.ruleString = str;
+ if (position != length) {
+ parseName(output.standard.name);
+ output.standard.gmtOffset = -parseOffset();
+ output.standard.isDst = false;
+ output.hasDst = position < length;
+ if (output.hasDst) {
+ parseName(output.dst.name);
+ output.dst.isDst = true;
+ if (ruleString[position] != ',') {
+ output.dst.gmtOffset = -parseOffset();
+ } else {
+ output.dst.gmtOffset = output.standard.gmtOffset + 60 * 60;
+ }
+ parseTransition(output.start);
+ parseTransition(output.end);
+ }
+ if (position != length) {
+ throwError("Extra text");
+ }
+ output.computeOffsets();
+ }
+ }
+
+ private:
+
+ const std::string& ruleString;
+ size_t length;
+ size_t position;
+ FutureRuleImpl &output;
+
+ void throwError(const char *msg) {
+ std::stringstream buffer;
+ buffer << msg << " at " << position << " in '" << ruleString << "'";
+ throw TimezoneError(buffer.str());
+ }
+
+ /**
+ * Parse the names of the form:
+ * ([^-+0-9,]+|<[^>]+>)
+ * and set the output string.
+ */
+ void parseName(std::string& result) {
+ if (position == length) {
+ throwError("name required");
+ }
+ size_t start = position;
+ if (ruleString[position] == '<') {
+ while (position < length && ruleString[position] != '>') {
+ position += 1;
+ }
+ if (position == length) {
+ throwError("missing close '>'");
+ }
+        position += 1;
+ } else {
+ while (position < length) {
+ char ch = ruleString[position];
+ if (isdigit(ch) || ch == '-' || ch == '+' || ch == ',') {
+ break;
+ }
+ position += 1;
+ }
+ }
+ if (position == start) {
+ throwError("empty string not allowed");
+ }
+ result = ruleString.substr(start, position - start);
+ }
+
+ /**
+ * Parse an integer of the form [0-9]+ and return it.
+ */
+ int64_t parseNumber() {
+ if (position >= length) {
+ throwError("missing number");
+ }
+ int64_t result = 0;
+ while (position < length) {
+ char ch = ruleString[position];
+ if (isdigit(ch)) {
+ result = result * 10 + (ch - '0');
+ position += 1;
+ } else {
+ break;
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Parse the offsets of the form:
+ * [-+]?[0-9]+(:[0-9]+(:[0-9]+)?)?
+ * and convert it into a number of seconds.
+ */
+ int64_t parseOffset() {
+ int64_t scale = 3600;
+ bool isNegative = false;
+ if (position < length) {
+ char ch = ruleString[position];
+ isNegative = ch == '-';
+ if (ch == '-' || ch == '+') {
+ position += 1;
+ }
+ }
+ int64_t result = parseNumber() * scale;
+ while (position < length && scale > 1 && ruleString[position] == ':') {
+ scale /= 60;
+ position += 1;
+ result += parseNumber() * scale;
+ }
+ if (isNegative) {
+ result = -result;
+ }
+ return result;
+ }
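+    // For example, parseOffset() turns "-3:30" into -(3 * 3600 + 30 * 60) =
+    // -12600 seconds; the caller negates the result for standard offsets,
+    // so a rule offset of "8" becomes a gmtOffset of -28800.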
+
+ /**
+ * Parse a transition of the following form:
+ * ,(J<number>|<number>|M<number>.<number>.<number>)(/<offset>)?
+ */
+ void parseTransition(Transition& transition) {
+ if (length - position < 2 || ruleString[position] != ',') {
+ throwError("missing transition");
+ }
+ position += 1;
+ char ch = ruleString[position];
+ if (ch == 'J') {
+ transition.kind = TRANSITION_JULIAN;
+ position += 1;
+ transition.day = parseNumber();
+ } else if (ch == 'M') {
+ transition.kind = TRANSITION_MONTH;
+ position += 1;
+ transition.month = parseNumber();
+ if (position == length || ruleString[position] != '.') {
+ throwError("missing first .");
+ }
+ position += 1;
+ transition.week = parseNumber();
+ if (position == length || ruleString[position] != '.') {
+ throwError("missing second .");
+ }
+ position += 1;
+ transition.day = parseNumber();
+ } else {
+ transition.kind = TRANSITION_DAY;
+ transition.day = parseNumber();
+ }
+ if (position < length && ruleString[position] == '/') {
+ position += 1;
+ transition.time = parseOffset();
+ } else {
+ transition.time = 2 * 60 * 60;
+ }
+ }
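+    // For example, ",M3.2.0/2" yields kind TRANSITION_MONTH with month 3,
+    // week 2, day 0 (Sunday) and time 7200, while ",J60" yields
+    // TRANSITION_JULIAN with day 60 and the default time of 02:00.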
+ };
+
+ /**
+ * Parse the POSIX TZ string.
+ */
+ std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString) {
+ std::shared_ptr<FutureRule> result(new FutureRuleImpl());
+ FutureRuleParser parser(ruleString,
+ dynamic_cast<FutureRuleImpl*>(result.get()));
+ return result;
+ }
+
+ std::string TimezoneVariant::toString() const {
+ std::stringstream buffer;
+ buffer << name << " " << gmtOffset;
+ if (isDst) {
+ buffer << " (dst)";
+ }
+ return buffer.str();
+ }
+
+ /**
+ * An abstraction of the differences between versions.
+ */
+ class VersionParser {
+ public:
+ virtual ~VersionParser();
+
+ /**
+ * Get the version number.
+ */
+ virtual uint64_t getVersion() const = 0;
+
+ /**
+     * Get the number of bytes used to store each transition time.
+ */
+ virtual uint64_t getTimeSize() const = 0;
+
+ /**
+ * Parse the time at the given location.
+ */
+ virtual int64_t parseTime(const unsigned char* ptr) const = 0;
+
+ /**
+     * Parse the trailing POSIX TZ (future rule) string, if any.
+ */
+ virtual std::string parseFutureString(const unsigned char *ptr,
+ uint64_t offset,
+ uint64_t length) const = 0;
+ };
+
+ VersionParser::~VersionParser() {
+ // PASS
+ }
+
+ static uint32_t decode32(const unsigned char* ptr) {
+ return static_cast<uint32_t>(ptr[0] << 24) |
+ static_cast<uint32_t>(ptr[1] << 16) |
+ static_cast<uint32_t>(ptr[2] << 8) |
+ static_cast<uint32_t>(ptr[3]);
+ }
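+  // decode32 reads a big-endian 32-bit value, e.g. the bytes
+  // {0x00, 0x01, 0x02, 0x03} decode to 0x00010203 (66051).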
+
+ class Version1Parser: public VersionParser {
+ public:
+ virtual ~Version1Parser() override;
+
+ virtual uint64_t getVersion() const override {
+ return 1;
+ }
+
+ /**
+     * Get the number of bytes used to store each transition time.
+ */
+ virtual uint64_t getTimeSize() const override {
+ return 4;
+ }
+
+ /**
+ * Parse the time at the given location.
+ */
+ virtual int64_t parseTime(const unsigned char* ptr) const override {
+ // sign extend from 32 bits
+ return static_cast<int32_t>(decode32(ptr));
+ }
+
+ virtual std::string parseFutureString(const unsigned char *,
+ uint64_t,
+ uint64_t) const override {
+ return "";
+ }
+ };
+
+ Version1Parser::~Version1Parser() {
+ // PASS
+ }
+
+ class Version2Parser: public VersionParser {
+ public:
+ virtual ~Version2Parser() override;
+
+ virtual uint64_t getVersion() const override {
+ return 2;
+ }
+
+ /**
+     * Get the number of bytes used to store each transition time.
+ */
+ virtual uint64_t getTimeSize() const override {
+ return 8;
+ }
+
+ /**
+ * Parse the time at the given location.
+ */
+ virtual int64_t parseTime(const unsigned char* ptr) const override {
+ return static_cast<int64_t>(decode32(ptr)) << 32 | decode32(ptr + 4);
+ }
+
+ virtual std::string parseFutureString(const unsigned char *ptr,
+ uint64_t offset,
+ uint64_t length) const override {
+ return std::string(reinterpret_cast<const char*>(ptr) + offset + 1,
+ length - 2);
+ }
+ };
+
+ Version2Parser::~Version2Parser() {
+ // PASS
+ }
+
+ class TimezoneImpl: public Timezone {
+ public:
+ TimezoneImpl(const std::string& name,
+ const std::vector<unsigned char> bytes);
+ virtual ~TimezoneImpl() override;
+
+ /**
+ * Get the variant for the given time (time_t).
+ */
+ const TimezoneVariant& getVariant(int64_t clk) const override;
+
+ void print(std::ostream&) const override;
+
+ uint64_t getVersion() const override {
+ return version;
+ }
+
+ int64_t getEpoch() const override {
+ return epoch;
+ }
+
+ int64_t convertToUTC(int64_t clk) const override {
+ return clk + getVariant(clk).gmtOffset;
+ }
+
+ private:
+ void parseTimeVariants(const unsigned char* ptr,
+ uint64_t variantOffset,
+ uint64_t variantCount,
+ uint64_t nameOffset,
+ uint64_t nameCount);
+ void parseZoneFile(const unsigned char* ptr,
+ uint64_t sectionOffset,
+ uint64_t fileLength,
+ const VersionParser& version);
+ // filename
+ std::string filename;
+
+ // the version of the file
+ uint64_t version;
+
+ // the list of variants for this timezone
+ std::vector<TimezoneVariant> variants;
+
+ // the list of the times where the local rules change
+ std::vector<int64_t> transitions;
+
+ // the variant that starts at this transition.
+ std::vector<uint64_t> currentVariant;
+
+ // the variant before the first transition
+ uint64_t ancientVariant;
+
+ // the rule for future times
+ std::shared_ptr<FutureRule> futureRule;
+
+ // the last explicit transition after which we use the future rule
+ int64_t lastTransition;
+
+ // The ORC epoch time in this timezone.
+ int64_t epoch;
+ };
+
+ DIAGNOSTIC_PUSH
+ #ifdef __clang__
+ DIAGNOSTIC_IGNORE("-Wglobal-constructors")
+ DIAGNOSTIC_IGNORE("-Wexit-time-destructors")
+ #endif
+ static std::mutex timezone_mutex;
+ static std::map<std::string, std::shared_ptr<Timezone> > timezoneCache;
+ DIAGNOSTIC_POP
+
+ Timezone::~Timezone() {
+ // PASS
+ }
+
+ TimezoneImpl::TimezoneImpl(const std::string& _filename,
+ const std::vector<unsigned char> buffer
+ ): filename(_filename) {
+ parseZoneFile(&buffer[0], 0, buffer.size(), Version1Parser());
+ // Build the literal for the ORC epoch
+ // 2015 Jan 1 00:00:00
+ tm epochStruct;
+ epochStruct.tm_sec = 0;
+ epochStruct.tm_min = 0;
+ epochStruct.tm_hour = 0;
+ epochStruct.tm_mday = 1;
+ epochStruct.tm_mon = 0;
+ epochStruct.tm_year = 2015 - 1900;
+ epochStruct.tm_isdst = 0;
+ time_t utcEpoch = timegm(&epochStruct);
+ epoch = utcEpoch - getVariant(utcEpoch).gmtOffset;
+ }
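+  // For example, 1 Jan 2015 00:00:00 UTC is 1420070400 seconds past the Unix
+  // epoch, so for the UTC zone epoch == 1420070400, while for a zone that is
+  // 8 hours behind UTC at that instant epoch == 1420070400 + 28800.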
+
+ const char* getTimezoneDirectory() {
+ const char *dir = getenv("TZDIR");
+ if (!dir) {
+ dir = DEFAULT_TZDIR;
+ }
+ return dir;
+ }
+
+ /**
+ * Get a timezone by absolute filename.
+ * Results are cached.
+ */
+ const Timezone& getTimezoneByFilename(const std::string& filename) {
+ // ORC-110
+ std::lock_guard<std::mutex> timezone_lock(timezone_mutex);
+ std::map<std::string, std::shared_ptr<Timezone> >::iterator itr =
+ timezoneCache.find(filename);
+ if (itr != timezoneCache.end()) {
+ return *(itr->second).get();
+ }
+ try {
+ ORC_UNIQUE_PTR<InputStream> file = readFile(filename);
+ size_t size = static_cast<size_t>(file->getLength());
+ std::vector<unsigned char> buffer(size);
+ file->read(&buffer[0], size, 0);
+ timezoneCache[filename] = std::shared_ptr<Timezone>(new TimezoneImpl(filename, buffer));
+ } catch(ParseError& err) {
+ throw TimezoneError(err.what());
+ }
+ return *timezoneCache[filename].get();
+ }
+
+ /**
+ * Get the local timezone.
+ */
+ const Timezone& getLocalTimezone() {
+#ifdef _MSC_VER
+ return getTimezoneByName("UTC");
+#else
+ return getTimezoneByFilename(LOCAL_TIMEZONE);
+#endif
+ }
+
+ /**
+ * Get a timezone by name (eg. America/Los_Angeles).
+ * Results are cached.
+ */
+ const Timezone& getTimezoneByName(const std::string& zone) {
+ std::string filename(getTimezoneDirectory());
+ filename += "/";
+ filename += zone;
+ return getTimezoneByFilename(filename);
+ }
+
+ /**
+ * Parse a set of bytes as a timezone file as if they came from filename.
+ */
+ std::unique_ptr<Timezone> getTimezone(const std::string& filename,
+ const std::vector<unsigned char>& b){
+ return std::unique_ptr<Timezone>(new TimezoneImpl(filename, b));
+ }
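+  // Usage sketch (the zone name and variable names below are illustrative):
+  //   const Timezone& tz = getTimezoneByName("America/Los_Angeles");
+  //   const TimezoneVariant& v = tz.getVariant(someTimeT);  // e.g. PST or PDT
+  //   int64_t orcEpoch = tz.getEpoch();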
+
+ TimezoneImpl::~TimezoneImpl() {
+ // PASS
+ }
+
+ void TimezoneImpl::parseTimeVariants(const unsigned char* ptr,
+ uint64_t variantOffset,
+ uint64_t variantCount,
+ uint64_t nameOffset,
+ uint64_t nameCount) {
+ for(uint64_t variant=0; variant < variantCount; ++variant) {
+ variants[variant].gmtOffset =
+ static_cast<int32_t>(decode32(ptr + variantOffset + 6 * variant));
+ variants[variant].isDst = ptr[variantOffset + 6 * variant + 4] != 0;
+ uint64_t nameStart = ptr[variantOffset + 6 * variant + 5];
+ if (nameStart >= nameCount) {
+ std::stringstream buffer;
+ buffer << "name out of range in variant " << variant
+ << " - " << nameStart << " >= " << nameCount;
+ throw TimezoneError(buffer.str());
+ }
+ variants[variant].name = std::string(reinterpret_cast<const char*>(ptr)
+ + nameOffset + nameStart);
+ }
+ }
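+  // Each variant record read above is 6 bytes: a big-endian 32-bit gmtOffset,
+  // a one-byte isDst flag, and a one-byte index into the abbreviation table
+  // that starts at nameOffset.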
+
+ /**
+ * Parse the zone file to get the bits we need.
+ * There are two versions of the timezone file:
+ *
+ * Version 1(version = 0x00):
+ * Magic(version)
+ * Header
+ * TransitionTimes(4 byte)
+ * TransitionRules
+ * Rules
+ * LeapSeconds(4 byte)
+ * IsStd
+ * IsGmt
+ *
+ * Version2:
+ * Version1(0x32) = a version 1 copy of the data for old clients
+ * Magic(0x32)
+ * Header
+ * TransitionTimes(8 byte)
+ * TransitionRules
+ * Rules
+ * LeapSeconds(8 byte)
+ * IsStd
+ * IsGmt
+ * FutureString
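+   *
+   * For a version 2 file the first pass below only measures the length of
+   * the leading version 1 section; parseZoneFile then recurses at that
+   * offset with Version2Parser to read the 64-bit section, and the trailing
+   * newline-delimited FutureString is handed to parseFutureRule.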
+ */
+ void TimezoneImpl::parseZoneFile(const unsigned char *ptr,
+ uint64_t sectionOffset,
+ uint64_t fileLength,
+ const VersionParser& versionParser) {
+ const uint64_t magicOffset = sectionOffset + 0;
+ const uint64_t headerOffset = magicOffset + 20;
+
+ // check for validity before we start parsing
+ if (fileLength < headerOffset + 6 * 4 ||
+ strncmp(reinterpret_cast<const char*>(ptr) + magicOffset, "TZif", 4)
+ != 0) {
+ std::stringstream buffer;
+ buffer << "non-tzfile " << filename;
+ throw TimezoneError(buffer.str());
+ }
+
+ const uint64_t isGmtCount = decode32(ptr + headerOffset + 0);
+ const uint64_t isStdCount = decode32(ptr + headerOffset + 4);
+ const uint64_t leapCount = decode32(ptr + headerOffset + 8);
+ const uint64_t timeCount = decode32(ptr + headerOffset + 12);
+ const uint64_t variantCount = decode32(ptr + headerOffset + 16);
+ const uint64_t nameCount = decode32(ptr + headerOffset + 20);
+
+ const uint64_t timeOffset = headerOffset + 24;
+ const uint64_t timeVariantOffset =
+ timeOffset + versionParser.getTimeSize() * timeCount;
+ const uint64_t variantOffset = timeVariantOffset + timeCount;
+ const uint64_t nameOffset = variantOffset + variantCount * 6;
+ const uint64_t sectionLength = nameOffset + nameCount
+ + (versionParser.getTimeSize() + 4) * leapCount
+ + isGmtCount + isStdCount;
+
+ if (sectionLength > fileLength) {
+ std::stringstream buffer;
+ buffer << "tzfile too short " << filename
+ << " needs " << sectionLength << " and has " << fileLength;
+ throw TimezoneError(buffer.str());
+ }
+
+ // if it is version 2, skip over the old layout and read the new one.
+ if (sectionOffset == 0 && ptr[magicOffset + 4] != 0) {
+ parseZoneFile(ptr, sectionLength, fileLength, Version2Parser());
+ return;
+ }
+ version = versionParser.getVersion();
+ variants.resize(variantCount);
+ transitions.resize(timeCount);
+ currentVariant.resize(timeCount);
+ parseTimeVariants(ptr, variantOffset, variantCount, nameOffset,
+ nameCount);
+ bool foundAncient = false;
+ for(uint64_t t=0; t < timeCount; ++t) {
+ transitions[t] =
+ versionParser.parseTime(ptr + timeOffset +
+ t * versionParser.getTimeSize());
+ currentVariant[t] = ptr[timeVariantOffset + t];
+ if (currentVariant[t] >= variantCount) {
+ std::stringstream buffer;
+ buffer << "tzfile rule out of range " << filename
+ << " references rule " << currentVariant[t]
+ << " of " << variantCount;
+ throw TimezoneError(buffer.str());
+ }
+ // find the oldest standard time and use that as the ancient value
+ if (!foundAncient &&
+ !variants[currentVariant[t]].isDst) {
+ foundAncient = true;
+ ancientVariant = currentVariant[t];
+ }
+ }
+ if (!foundAncient) {
+ ancientVariant = 0;
+ }
+ futureRule = parseFutureRule(versionParser.parseFutureString
+ (ptr, sectionLength,
+ fileLength - sectionLength));
+
+ // find the lower bound for applying the future rule
+ if (futureRule->isDefined()) {
+ if (timeCount > 0) {
+ lastTransition = transitions[timeCount - 1];
+ } else {
+ lastTransition = INT64_MIN;
+ }
+ } else {
+ lastTransition = INT64_MAX;
+ }
+ }
+
+ const TimezoneVariant& TimezoneImpl::getVariant(int64_t clk) const {
+ // if it is after the last explicit entry in the table,
+ // use the future rule to get an answer
+ if (clk > lastTransition) {
+ return futureRule->getVariant(clk);
+ } else {
+ int64_t transition = binarySearch(transitions, clk);
+ uint64_t idx;
+ if (transition < 0) {
+ idx = ancientVariant;
+ } else {
+ idx = currentVariant[static_cast<size_t>(transition)];
+ }
+ return variants[idx];
+ }
+ }
+
+ void TimezoneImpl::print(std::ostream& out) const {
+ out << "Timezone file: " << filename << "\n";
+ out << " Version: " << version << "\n";
+ futureRule->print(out);
+ for(uint64_t r=0; r < variants.size(); ++r) {
+ out << " Variant " << r << ": "
+ << variants[r].toString() << "\n";
+ }
+ for(uint64_t t=0; t < transitions.size(); ++t) {
+ tm timeStruct;
+ tm* result = nullptr;
+ char buffer[25];
+ if (sizeof(time_t) >= 8) {
+ time_t val = transitions[t];
+ result = gmtime_r(&val, &timeStruct);
+ if (result) {
+ strftime(buffer, sizeof(buffer), "%F %H:%M:%S", &timeStruct);
+ }
+ }
+      out << "  Transition: " << (result == nullptr ? "null" : buffer)
+          << " (" << transitions[t] << ") -> "
+          << variants[currentVariant[t]].name
+          << "\n";
+ }
+ }
+
+ TimezoneError::TimezoneError(const std::string& what
+ ): std::runtime_error(what) {
+ // PASS
+ }
+
+ TimezoneError::TimezoneError(const TimezoneError& other
+ ): std::runtime_error(other) {
+ // PASS
+ }
+
+ TimezoneError::~TimezoneError() ORC_NOEXCEPT {
+ // PASS
+ }
+
+}
diff --git a/contrib/libs/apache/orc/c++/src/Timezone.hh b/contrib/libs/apache/orc/c++/src/Timezone.hh
index 136b7a18b7..6bcb6586d0 100644
--- a/contrib/libs/apache/orc/c++/src/Timezone.hh
+++ b/contrib/libs/apache/orc/c++/src/Timezone.hh
@@ -1,130 +1,130 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef TIMEZONE_HH
-#define TIMEZONE_HH
-
-// This file is for timezone routines.
-
-#include "Adaptor.hh"
-
-#include <memory>
-#include <stdexcept>
-#include <stdint.h>
-#include <string>
-#include <vector>
-
-namespace orc {
-
- static const int64_t SECONDS_PER_HOUR = 60 * 60;
- static const int64_t SECONDS_PER_DAY = SECONDS_PER_HOUR * 24;
-
- /**
- * A variant (eg. PST or PDT) of a timezone (eg. America/Los_Angeles).
- */
- struct TimezoneVariant {
- int64_t gmtOffset;
- bool isDst;
- std::string name;
-
- std::string toString() const;
- };
-
- /**
- * A region that shares the same legal rules for wall clock time and
-   * daylight saving transitions. They are typically named for the largest
- * city in the region (eg. America/Los_Angeles or America/Mexico_City).
- */
- class Timezone {
- public:
- virtual ~Timezone();
-
- /**
- * Get the variant for the given time (time_t).
- */
- virtual const TimezoneVariant& getVariant(int64_t clk) const = 0;
-
- /**
- * Get the number of seconds between the ORC epoch in this timezone
- * and Unix epoch.
- * ORC epoch is 1 Jan 2015 00:00:00 local.
- * Unix epoch is 1 Jan 1970 00:00:00 UTC.
- */
- virtual int64_t getEpoch() const = 0;
-
- /**
- * Print the timezone to the stream.
- */
- virtual void print(std::ostream&) const = 0;
-
- /**
- * Get the version of the zone file.
- */
-    virtual uint64_t getVersion() const = 0;
-
- /**
- * Convert wall clock time of current timezone to UTC timezone
- */
- virtual int64_t convertToUTC(int64_t clk) const = 0;
- };
-
- /**
- * Get the local timezone.
- * Results are cached.
- */
- const Timezone& getLocalTimezone();
-
- /**
- * Get a timezone by name (eg. America/Los_Angeles).
- * Results are cached.
- */
- const Timezone& getTimezoneByName(const std::string& zone);
-
- /**
- * Parse a set of bytes as a timezone file as if they came from filename.
- */
- std::unique_ptr<Timezone> getTimezone(const std::string& filename,
- const std::vector<unsigned char>& b);
-
- class TimezoneError: public std::runtime_error {
- public:
- TimezoneError(const std::string& what);
- TimezoneError(const TimezoneError&);
- virtual ~TimezoneError() ORC_NOEXCEPT;
- };
-
- /**
- * Represents the parsed POSIX timezone rule strings that are used to
- * describe the future transitions, because they can go arbitrarily far into
- * the future.
- */
- class FutureRule {
- public:
- virtual ~FutureRule();
- virtual bool isDefined() const = 0;
- virtual const TimezoneVariant& getVariant(int64_t clk) const = 0;
- virtual void print(std::ostream& out) const = 0;
- };
-
- /**
- * Parse the POSIX TZ string.
- */
- std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString);
-}
-
-#endif
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TIMEZONE_HH
+#define TIMEZONE_HH
+
+// This file is for timezone routines.
+
+#include "Adaptor.hh"
+
+#include <memory>
+#include <stdexcept>
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+namespace orc {
+
+ static const int64_t SECONDS_PER_HOUR = 60 * 60;
+ static const int64_t SECONDS_PER_DAY = SECONDS_PER_HOUR * 24;
+
+ /**
+ * A variant (eg. PST or PDT) of a timezone (eg. America/Los_Angeles).
+ */
+ struct TimezoneVariant {
+ int64_t gmtOffset;
+ bool isDst;
+ std::string name;
+
+ std::string toString() const;
+ };
+
+ /**
+ * A region that shares the same legal rules for wall clock time and
+   * daylight saving transitions. They are typically named for the largest
+ * city in the region (eg. America/Los_Angeles or America/Mexico_City).
+ */
+ class Timezone {
+ public:
+ virtual ~Timezone();
+
+ /**
+ * Get the variant for the given time (time_t).
+ */
+ virtual const TimezoneVariant& getVariant(int64_t clk) const = 0;
+
+ /**
+ * Get the number of seconds between the ORC epoch in this timezone
+ * and Unix epoch.
+ * ORC epoch is 1 Jan 2015 00:00:00 local.
+ * Unix epoch is 1 Jan 1970 00:00:00 UTC.
+ */
+ virtual int64_t getEpoch() const = 0;
+
+ /**
+ * Print the timezone to the stream.
+ */
+ virtual void print(std::ostream&) const = 0;
+
+ /**
+ * Get the version of the zone file.
+ */
+    virtual uint64_t getVersion() const = 0;
+
+ /**
+ * Convert wall clock time of current timezone to UTC timezone
+ */
+ virtual int64_t convertToUTC(int64_t clk) const = 0;
+ };
+
+ /**
+ * Get the local timezone.
+ * Results are cached.
+ */
+ const Timezone& getLocalTimezone();
+
+ /**
+ * Get a timezone by name (eg. America/Los_Angeles).
+ * Results are cached.
+ */
+ const Timezone& getTimezoneByName(const std::string& zone);
+
+ /**
+ * Parse a set of bytes as a timezone file as if they came from filename.
+ */
+ std::unique_ptr<Timezone> getTimezone(const std::string& filename,
+ const std::vector<unsigned char>& b);
+
+ class TimezoneError: public std::runtime_error {
+ public:
+ TimezoneError(const std::string& what);
+ TimezoneError(const TimezoneError&);
+ virtual ~TimezoneError() ORC_NOEXCEPT;
+ };
+
+ /**
+ * Represents the parsed POSIX timezone rule strings that are used to
+ * describe the future transitions, because they can go arbitrarily far into
+ * the future.
+ */
+ class FutureRule {
+ public:
+ virtual ~FutureRule();
+ virtual bool isDefined() const = 0;
+ virtual const TimezoneVariant& getVariant(int64_t clk) const = 0;
+ virtual void print(std::ostream& out) const = 0;
+ };
+
+ /**
+ * Parse the POSIX TZ string.
+ */
+ std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString);
+}
+
+#endif
diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.cc b/contrib/libs/apache/orc/c++/src/TypeImpl.cc
index c154f2af04..78a0e00686 100644
--- a/contrib/libs/apache/orc/c++/src/TypeImpl.cc
+++ b/contrib/libs/apache/orc/c++/src/TypeImpl.cc
@@ -1,707 +1,707 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Adaptor.hh"
-#include "orc/Exceptions.hh"
-#include "TypeImpl.hh"
-
-#include <iostream>
-#include <sstream>
-
-namespace orc {
-
- Type::~Type() {
- // PASS
- }
-
- TypeImpl::TypeImpl(TypeKind _kind) {
- parent = nullptr;
- columnId = -1;
- maximumColumnId = -1;
- kind = _kind;
- maxLength = 0;
- precision = 0;
- scale = 0;
- subtypeCount = 0;
- }
-
- TypeImpl::TypeImpl(TypeKind _kind, uint64_t _maxLength) {
- parent = nullptr;
- columnId = -1;
- maximumColumnId = -1;
- kind = _kind;
- maxLength = _maxLength;
- precision = 0;
- scale = 0;
- subtypeCount = 0;
- }
-
- TypeImpl::TypeImpl(TypeKind _kind, uint64_t _precision,
- uint64_t _scale) {
- parent = nullptr;
- columnId = -1;
- maximumColumnId = -1;
- kind = _kind;
- maxLength = 0;
- precision = _precision;
- scale = _scale;
- subtypeCount = 0;
- }
-
- uint64_t TypeImpl::assignIds(uint64_t root) const {
- columnId = static_cast<int64_t>(root);
- uint64_t current = root + 1;
- for(uint64_t i=0; i < subtypeCount; ++i) {
- current = dynamic_cast<TypeImpl*>(subTypes[i])->assignIds(current);
- }
- maximumColumnId = static_cast<int64_t>(current) - 1;
- return current;
- }
-
- TypeImpl::~TypeImpl() {
- for (std::vector<Type*>::iterator it = subTypes.begin();
- it != subTypes.end(); it++) {
- delete (*it) ;
- }
- }
-
- void TypeImpl::ensureIdAssigned() const {
- if (columnId == -1) {
- const TypeImpl* root = this;
- while (root->parent != nullptr) {
- root = root->parent;
- }
- root->assignIds(0);
- }
- }
-
- uint64_t TypeImpl::getColumnId() const {
- ensureIdAssigned();
- return static_cast<uint64_t>(columnId);
- }
-
- uint64_t TypeImpl::getMaximumColumnId() const {
- ensureIdAssigned();
- return static_cast<uint64_t>(maximumColumnId);
- }
-
- TypeKind TypeImpl::getKind() const {
- return kind;
- }
-
- uint64_t TypeImpl::getSubtypeCount() const {
- return subtypeCount;
- }
-
- const Type* TypeImpl::getSubtype(uint64_t i) const {
- return subTypes[i];
- }
-
- const std::string& TypeImpl::getFieldName(uint64_t i) const {
- return fieldNames[i];
- }
-
- uint64_t TypeImpl::getMaximumLength() const {
- return maxLength;
- }
-
- uint64_t TypeImpl::getPrecision() const {
- return precision;
- }
-
- uint64_t TypeImpl::getScale() const {
- return scale;
- }
-
- void TypeImpl::setIds(uint64_t _columnId, uint64_t _maxColumnId) {
- columnId = static_cast<int64_t>(_columnId);
- maximumColumnId = static_cast<int64_t>(_maxColumnId);
- }
-
- void TypeImpl::addChildType(std::unique_ptr<Type> childType) {
- TypeImpl* child = dynamic_cast<TypeImpl*>(childType.release());
- subTypes.push_back(child);
- if (child != nullptr) {
- child->parent = this;
- }
- subtypeCount += 1;
- }
-
- Type* TypeImpl::addStructField(const std::string& fieldName,
- std::unique_ptr<Type> fieldType) {
- addChildType(std::move(fieldType));
- fieldNames.push_back(fieldName);
- return this;
- }
-
- Type* TypeImpl::addUnionChild(std::unique_ptr<Type> fieldType) {
- addChildType(std::move(fieldType));
- return this;
- }
-
- std::string TypeImpl::toString() const {
- switch (static_cast<int64_t>(kind)) {
- case BOOLEAN:
- return "boolean";
- case BYTE:
- return "tinyint";
- case SHORT:
- return "smallint";
- case INT:
- return "int";
- case LONG:
- return "bigint";
- case FLOAT:
- return "float";
- case DOUBLE:
- return "double";
- case STRING:
- return "string";
- case BINARY:
- return "binary";
- case TIMESTAMP:
- return "timestamp";
- case LIST:
- return "array<" + (subTypes[0] ? subTypes[0]->toString() : "void") + ">";
- case MAP:
- return "map<" + (subTypes[0] ? subTypes[0]->toString() : "void") + "," +
- (subTypes[1] ? subTypes[1]->toString() : "void") + ">";
- case STRUCT: {
- std::string result = "struct<";
- for(size_t i=0; i < subTypes.size(); ++i) {
- if (i != 0) {
- result += ",";
- }
- result += fieldNames[i];
- result += ":";
- result += subTypes[i]->toString();
- }
- result += ">";
- return result;
- }
- case UNION: {
- std::string result = "uniontype<";
- for(size_t i=0; i < subTypes.size(); ++i) {
- if (i != 0) {
- result += ",";
- }
- result += subTypes[i]->toString();
- }
- result += ">";
- return result;
- }
- case DECIMAL: {
- std::stringstream result;
- result << "decimal(" << precision << "," << scale << ")";
- return result.str();
- }
- case DATE:
- return "date";
- case VARCHAR: {
- std::stringstream result;
- result << "varchar(" << maxLength << ")";
- return result.str();
- }
- case CHAR: {
- std::stringstream result;
- result << "char(" << maxLength << ")";
- return result.str();
- }
- default:
- throw NotImplementedYet("Unknown type");
- }
- }
-
- std::unique_ptr<ColumnVectorBatch>
- TypeImpl::createRowBatch(uint64_t capacity,
- MemoryPool& memoryPool,
- bool encoded) const {
- switch (static_cast<int64_t>(kind)) {
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- case DATE:
- return std::unique_ptr<ColumnVectorBatch>
- (new LongVectorBatch(capacity, memoryPool));
-
- case FLOAT:
- case DOUBLE:
- return std::unique_ptr<ColumnVectorBatch>
- (new DoubleVectorBatch(capacity, memoryPool));
-
- case STRING:
- case BINARY:
- case CHAR:
- case VARCHAR:
- return encoded ?
- std::unique_ptr<ColumnVectorBatch>
- (new EncodedStringVectorBatch(capacity, memoryPool))
- : std::unique_ptr<ColumnVectorBatch>
- (new StringVectorBatch(capacity, memoryPool));
-
- case TIMESTAMP:
- return std::unique_ptr<ColumnVectorBatch>
- (new TimestampVectorBatch(capacity, memoryPool));
-
- case STRUCT: {
- StructVectorBatch *result = new StructVectorBatch(capacity, memoryPool);
- std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result);
- for(uint64_t i=0; i < getSubtypeCount(); ++i) {
- result->fields.push_back(getSubtype(i)->
- createRowBatch(capacity,
- memoryPool, encoded).release());
- }
- return return_value;
- }
-
- case LIST: {
- ListVectorBatch* result = new ListVectorBatch(capacity, memoryPool);
- std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result);
- if (getSubtype(0) != nullptr) {
- result->elements = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded);
- }
- return return_value;
- }
-
- case MAP: {
- MapVectorBatch* result = new MapVectorBatch(capacity, memoryPool);
- std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result);
- if (getSubtype(0) != nullptr) {
- result->keys = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded);
- }
- if (getSubtype(1) != nullptr) {
- result->elements = getSubtype(1)->createRowBatch(capacity, memoryPool, encoded);
- }
- return return_value;
- }
-
- case DECIMAL: {
- if (getPrecision() == 0 || getPrecision() > 18) {
- return std::unique_ptr<ColumnVectorBatch>
- (new Decimal128VectorBatch(capacity, memoryPool));
- } else {
- return std::unique_ptr<ColumnVectorBatch>
- (new Decimal64VectorBatch(capacity, memoryPool));
- }
- }
-
- case UNION: {
- UnionVectorBatch *result = new UnionVectorBatch(capacity, memoryPool);
- std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result);
- for(uint64_t i=0; i < getSubtypeCount(); ++i) {
- result->children.push_back(getSubtype(i)->createRowBatch(capacity,
- memoryPool, encoded)
- .release());
- }
- return return_value;
- }
-
- default:
- throw NotImplementedYet("not supported yet");
- }
- }
-
- std::unique_ptr<Type> createPrimitiveType(TypeKind kind) {
- return std::unique_ptr<Type>(new TypeImpl(kind));
- }
-
- std::unique_ptr<Type> createCharType(TypeKind kind,
- uint64_t maxLength) {
- return std::unique_ptr<Type>(new TypeImpl(kind, maxLength));
- }
-
- std::unique_ptr<Type> createDecimalType(uint64_t precision,
- uint64_t scale) {
- return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale));
- }
-
- std::unique_ptr<Type> createStructType() {
- return std::unique_ptr<Type>(new TypeImpl(STRUCT));
- }
-
- std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements) {
- TypeImpl* result = new TypeImpl(LIST);
- std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result);
- result->addChildType(std::move(elements));
- return return_value;
- }
-
- std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key,
- std::unique_ptr<Type> value) {
- TypeImpl* result = new TypeImpl(MAP);
- std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result);
- result->addChildType(std::move(key));
- result->addChildType(std::move(value));
- return return_value;
- }
-
- std::unique_ptr<Type> createUnionType() {
- return std::unique_ptr<Type>(new TypeImpl(UNION));
- }
-
- std::string printProtobufMessage(const google::protobuf::Message& message);
- std::unique_ptr<Type> convertType(const proto::Type& type,
- const proto::Footer& footer) {
- switch (static_cast<int64_t>(type.kind())) {
-
- case proto::Type_Kind_BOOLEAN:
- case proto::Type_Kind_BYTE:
- case proto::Type_Kind_SHORT:
- case proto::Type_Kind_INT:
- case proto::Type_Kind_LONG:
- case proto::Type_Kind_FLOAT:
- case proto::Type_Kind_DOUBLE:
- case proto::Type_Kind_STRING:
- case proto::Type_Kind_BINARY:
- case proto::Type_Kind_TIMESTAMP:
- case proto::Type_Kind_DATE:
- return std::unique_ptr<Type>
- (new TypeImpl(static_cast<TypeKind>(type.kind())));
-
- case proto::Type_Kind_CHAR:
- case proto::Type_Kind_VARCHAR:
- return std::unique_ptr<Type>
- (new TypeImpl(static_cast<TypeKind>(type.kind()),
- type.maximumlength()));
-
- case proto::Type_Kind_DECIMAL:
- return std::unique_ptr<Type>
- (new TypeImpl(DECIMAL, type.precision(), type.scale()));
-
- case proto::Type_Kind_LIST:
- case proto::Type_Kind_MAP:
- case proto::Type_Kind_UNION: {
- TypeImpl* result = new TypeImpl(static_cast<TypeKind>(type.kind()));
- std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result);
- if (type.kind() == proto::Type_Kind_LIST && type.subtypes_size() != 1)
- throw ParseError("Illegal LIST type that doesn't contain one subtype");
- if (type.kind() == proto::Type_Kind_MAP && type.subtypes_size() != 2)
- throw ParseError("Illegal MAP type that doesn't contain two subtypes");
- if (type.kind() == proto::Type_Kind_UNION && type.subtypes_size() == 0)
- throw ParseError("Illegal UNION type that doesn't contain any subtypes");
- for(int i=0; i < type.subtypes_size(); ++i) {
- result->addUnionChild(convertType(footer.types(static_cast<int>
- (type.subtypes(i))),
- footer));
- }
- return return_value;
- }
-
- case proto::Type_Kind_STRUCT: {
- TypeImpl* result = new TypeImpl(STRUCT);
- std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result);
- for(int i=0; i < type.subtypes_size(); ++i) {
- result->addStructField(type.fieldnames(i),
- convertType(footer.types(static_cast<int>
- (type.subtypes(i))),
- footer));
- }
- return return_value;
- }
- default:
- throw NotImplementedYet("Unknown type kind");
- }
- }
-
- /**
- * Build a clone of the file type, projecting columns from the selected
- * vector. This routine assumes that the parent of any selected column
- * is also selected. The column ids are copied from the fileType.
- * @param fileType the type in the file
- * @param selected is each column by id selected
- * @return a clone of the fileType filtered by the selection array
- */
- std::unique_ptr<Type> buildSelectedType(const Type *fileType,
- const std::vector<bool>& selected) {
- if (fileType == nullptr || !selected[fileType->getColumnId()]) {
- return std::unique_ptr<Type>();
- }
-
- TypeImpl* result;
- switch (static_cast<int>(fileType->getKind())) {
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- case FLOAT:
- case DOUBLE:
- case STRING:
- case BINARY:
- case TIMESTAMP:
- case DATE:
- result = new TypeImpl(fileType->getKind());
- break;
-
- case DECIMAL:
- result= new TypeImpl(fileType->getKind(),
- fileType->getPrecision(), fileType->getScale());
- break;
-
- case VARCHAR:
- case CHAR:
- result = new TypeImpl(fileType->getKind(), fileType->getMaximumLength());
- break;
-
- case LIST:
- result = new TypeImpl(fileType->getKind());
- result->addChildType(buildSelectedType(fileType->getSubtype(0),
- selected));
- break;
-
- case MAP:
- result = new TypeImpl(fileType->getKind());
- result->addChildType(buildSelectedType(fileType->getSubtype(0),
- selected));
- result->addChildType(buildSelectedType(fileType->getSubtype(1),
- selected));
- break;
-
- case STRUCT: {
- result = new TypeImpl(fileType->getKind());
- for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) {
- std::unique_ptr<Type> childType =
- buildSelectedType(fileType->getSubtype(child), selected);
- if (childType.get() != nullptr) {
- result->addStructField(fileType->getFieldName(child),
- std::move(childType));
- }
- }
- break;
- }
-
- case UNION: {
- result = new TypeImpl(fileType->getKind());
- for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) {
- std::unique_ptr<Type> childType =
- buildSelectedType(fileType->getSubtype(child), selected);
- if (childType.get() != nullptr) {
- result->addUnionChild(std::move(childType));
- }
- }
- break;
- }
-
- default:
- throw NotImplementedYet("Unknown type kind");
- }
- result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId());
- return std::unique_ptr<Type>(result);
- }
-
- ORC_UNIQUE_PTR<Type> Type::buildTypeFromString(const std::string& input) {
- std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > res =
- TypeImpl::parseType(input, 0, input.size());
- if (res.size() != 1) {
- throw std::logic_error("Invalid type string.");
- }
- return std::move(res[0].second);
- }
-
- std::unique_ptr<Type> TypeImpl::parseArrayType(const std::string &input,
- size_t start,
- size_t end) {
- TypeImpl* arrayType = new TypeImpl(LIST);
- std::unique_ptr<Type> return_value = std::unique_ptr<Type>(arrayType);
- std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v =
- TypeImpl::parseType(input, start, end);
- if (v.size() != 1) {
- throw std::logic_error("Array type must contain exactly one sub type.");
- }
- arrayType->addChildType(std::move(v[0].second));
- return return_value;
- }
-
- std::unique_ptr<Type> TypeImpl::parseMapType(const std::string &input,
- size_t start,
- size_t end) {
- TypeImpl * mapType = new TypeImpl(MAP);
- std::unique_ptr<Type> return_value = std::unique_ptr<Type>(mapType);
- std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v =
- TypeImpl::parseType(input, start, end);
- if (v.size() != 2) {
- throw std::logic_error(
- "Map type must contain exactly two sub types.");
- }
- mapType->addChildType(std::move(v[0].second));
- mapType->addChildType(std::move(v[1].second));
- return return_value;
- }
-
- std::unique_ptr<Type> TypeImpl::parseStructType(const std::string &input,
- size_t start,
- size_t end) {
- TypeImpl* structType = new TypeImpl(STRUCT);
- std::unique_ptr<Type> return_value = std::unique_ptr<Type>(structType);
- std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type>> > v =
- TypeImpl::parseType(input, start, end);
- if (v.size() == 0) {
- throw std::logic_error(
- "Struct type must contain at least one sub type.");
- }
- for (size_t i = 0; i < v.size(); ++i) {
- structType->addStructField(v[i].first, std::move(v[i].second));
- }
- return return_value;
- }
-
- std::unique_ptr<Type> TypeImpl::parseUnionType(const std::string &input,
- size_t start,
- size_t end) {
- TypeImpl* unionType = new TypeImpl(UNION);
- std::unique_ptr<Type> return_value = std::unique_ptr<Type>(unionType);
- std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v =
- TypeImpl::parseType(input, start, end);
- if (v.size() == 0) {
- throw std::logic_error("Union type must contain at least one sub type.");
- }
- for (size_t i = 0; i < v.size(); ++i) {
- unionType->addChildType(std::move(v[i].second));
- }
- return return_value;
- }
-
- std::unique_ptr<Type> TypeImpl::parseDecimalType(const std::string &input,
- size_t start,
- size_t end) {
- size_t sep = input.find(',', start);
- if (sep + 1 >= end || sep == std::string::npos) {
- throw std::logic_error("Decimal type must specify precision and scale.");
- }
- uint64_t precision =
- static_cast<uint64_t>(atoi(input.substr(start, sep - start).c_str()));
- uint64_t scale =
- static_cast<uint64_t>(atoi(input.substr(sep + 1, end - sep - 1).c_str()));
- return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale));
- }
-
- std::unique_ptr<Type> TypeImpl::parseCategory(std::string category,
- const std::string &input,
- size_t start,
- size_t end) {
- if (category == "boolean") {
- return std::unique_ptr<Type>(new TypeImpl(BOOLEAN));
- } else if (category == "tinyint") {
- return std::unique_ptr<Type>(new TypeImpl(BYTE));
- } else if (category == "smallint") {
- return std::unique_ptr<Type>(new TypeImpl(SHORT));
- } else if (category == "int") {
- return std::unique_ptr<Type>(new TypeImpl(INT));
- } else if (category == "bigint") {
- return std::unique_ptr<Type>(new TypeImpl(LONG));
- } else if (category == "float") {
- return std::unique_ptr<Type>(new TypeImpl(FLOAT));
- } else if (category == "double") {
- return std::unique_ptr<Type>(new TypeImpl(DOUBLE));
- } else if (category == "string") {
- return std::unique_ptr<Type>(new TypeImpl(STRING));
- } else if (category == "binary") {
- return std::unique_ptr<Type>(new TypeImpl(BINARY));
- } else if (category == "timestamp") {
- return std::unique_ptr<Type>(new TypeImpl(TIMESTAMP));
- } else if (category == "array") {
- return parseArrayType(input, start, end);
- } else if (category == "map") {
- return parseMapType(input, start, end);
- } else if (category == "struct") {
- return parseStructType(input, start, end);
- } else if (category == "uniontype") {
- return parseUnionType(input, start, end);
- } else if (category == "decimal") {
- return parseDecimalType(input, start, end);
- } else if (category == "date") {
- return std::unique_ptr<Type>(new TypeImpl(DATE));
- } else if (category == "varchar") {
- uint64_t maxLength = static_cast<uint64_t>(
- atoi(input.substr(start, end - start).c_str()));
- return std::unique_ptr<Type>(new TypeImpl(VARCHAR, maxLength));
- } else if (category == "char") {
- uint64_t maxLength = static_cast<uint64_t>(
- atoi(input.substr(start, end - start).c_str()));
- return std::unique_ptr<Type>(new TypeImpl(CHAR, maxLength));
- } else {
- throw std::logic_error("Unknown type " + category);
- }
- }
-
- std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > TypeImpl::parseType(
- const std::string &input,
- size_t start,
- size_t end) {
- std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > res;
- size_t pos = start;
-
- while (pos < end) {
- size_t endPos = pos;
- while (endPos < end && (isalnum(input[endPos]) || input[endPos] == '_')) {
- ++endPos;
- }
-
- std::string fieldName;
- if (input[endPos] == ':') {
- fieldName = input.substr(pos, endPos - pos);
- pos = ++endPos;
- while (endPos < end && isalpha(input[endPos])) {
- ++endPos;
- }
- }
-
- size_t nextPos = endPos + 1;
- if (input[endPos] == '<') {
- int count = 1;
- while (nextPos < end) {
- if (input[nextPos] == '<') {
- ++count;
- } else if (input[nextPos] == '>') {
- --count;
- }
- if (count == 0) {
- break;
- }
- ++nextPos;
- }
- if (nextPos == end) {
- throw std::logic_error("Invalid type string. Cannot find closing >");
- }
- } else if (input[endPos] == '(') {
- while (nextPos < end && input[nextPos] != ')') {
- ++nextPos;
- }
- if (nextPos == end) {
- throw std::logic_error("Invalid type string. Cannot find closing )");
- }
- } else if (input[endPos] != ',' && endPos != end) {
- throw std::logic_error("Unrecognized character.");
- }
-
- std::string category = input.substr(pos, endPos - pos);
- res.push_back(std::make_pair(fieldName, parseCategory(category, input, endPos + 1, nextPos)));
-
- if (nextPos < end && (input[nextPos] == ')' || input[nextPos] == '>')) {
- pos = nextPos + 2;
- } else {
- pos = nextPos;
- }
- }
-
- return res;
- }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Adaptor.hh"
+#include "orc/Exceptions.hh"
+#include "TypeImpl.hh"
+
+#include <iostream>
+#include <sstream>
+
+namespace orc {
+
+ Type::~Type() {
+ // PASS
+ }
+
+ TypeImpl::TypeImpl(TypeKind _kind) {
+ parent = nullptr;
+ columnId = -1;
+ maximumColumnId = -1;
+ kind = _kind;
+ maxLength = 0;
+ precision = 0;
+ scale = 0;
+ subtypeCount = 0;
+ }
+
+ TypeImpl::TypeImpl(TypeKind _kind, uint64_t _maxLength) {
+ parent = nullptr;
+ columnId = -1;
+ maximumColumnId = -1;
+ kind = _kind;
+ maxLength = _maxLength;
+ precision = 0;
+ scale = 0;
+ subtypeCount = 0;
+ }
+
+ TypeImpl::TypeImpl(TypeKind _kind, uint64_t _precision,
+ uint64_t _scale) {
+ parent = nullptr;
+ columnId = -1;
+ maximumColumnId = -1;
+ kind = _kind;
+ maxLength = 0;
+ precision = _precision;
+ scale = _scale;
+ subtypeCount = 0;
+ }
+
+ uint64_t TypeImpl::assignIds(uint64_t root) const {
+ columnId = static_cast<int64_t>(root);
+ uint64_t current = root + 1;
+ for(uint64_t i=0; i < subtypeCount; ++i) {
+ current = dynamic_cast<TypeImpl*>(subTypes[i])->assignIds(current);
+ }
+ maximumColumnId = static_cast<int64_t>(current) - 1;
+ return current;
+ }
+
+ TypeImpl::~TypeImpl() {
+ for (std::vector<Type*>::iterator it = subTypes.begin();
+ it != subTypes.end(); it++) {
+ delete (*it) ;
+ }
+ }
+
+ void TypeImpl::ensureIdAssigned() const {
+ if (columnId == -1) {
+ const TypeImpl* root = this;
+ while (root->parent != nullptr) {
+ root = root->parent;
+ }
+ root->assignIds(0);
+ }
+ }
+
+ uint64_t TypeImpl::getColumnId() const {
+ ensureIdAssigned();
+ return static_cast<uint64_t>(columnId);
+ }
+
+ uint64_t TypeImpl::getMaximumColumnId() const {
+ ensureIdAssigned();
+ return static_cast<uint64_t>(maximumColumnId);
+ }
+
+ TypeKind TypeImpl::getKind() const {
+ return kind;
+ }
+
+ uint64_t TypeImpl::getSubtypeCount() const {
+ return subtypeCount;
+ }
+
+ const Type* TypeImpl::getSubtype(uint64_t i) const {
+ return subTypes[i];
+ }
+
+ const std::string& TypeImpl::getFieldName(uint64_t i) const {
+ return fieldNames[i];
+ }
+
+ uint64_t TypeImpl::getMaximumLength() const {
+ return maxLength;
+ }
+
+ uint64_t TypeImpl::getPrecision() const {
+ return precision;
+ }
+
+ uint64_t TypeImpl::getScale() const {
+ return scale;
+ }
+
+ void TypeImpl::setIds(uint64_t _columnId, uint64_t _maxColumnId) {
+ columnId = static_cast<int64_t>(_columnId);
+ maximumColumnId = static_cast<int64_t>(_maxColumnId);
+ }
+
+ void TypeImpl::addChildType(std::unique_ptr<Type> childType) {
+ TypeImpl* child = dynamic_cast<TypeImpl*>(childType.release());
+ subTypes.push_back(child);
+ if (child != nullptr) {
+ child->parent = this;
+ }
+ subtypeCount += 1;
+ }
+
+ Type* TypeImpl::addStructField(const std::string& fieldName,
+ std::unique_ptr<Type> fieldType) {
+ addChildType(std::move(fieldType));
+ fieldNames.push_back(fieldName);
+ return this;
+ }
+
+ Type* TypeImpl::addUnionChild(std::unique_ptr<Type> fieldType) {
+ addChildType(std::move(fieldType));
+ return this;
+ }
+
+ std::string TypeImpl::toString() const {
+ switch (static_cast<int64_t>(kind)) {
+ case BOOLEAN:
+ return "boolean";
+ case BYTE:
+ return "tinyint";
+ case SHORT:
+ return "smallint";
+ case INT:
+ return "int";
+ case LONG:
+ return "bigint";
+ case FLOAT:
+ return "float";
+ case DOUBLE:
+ return "double";
+ case STRING:
+ return "string";
+ case BINARY:
+ return "binary";
+ case TIMESTAMP:
+ return "timestamp";
+ case LIST:
+ return "array<" + (subTypes[0] ? subTypes[0]->toString() : "void") + ">";
+ case MAP:
+ return "map<" + (subTypes[0] ? subTypes[0]->toString() : "void") + "," +
+ (subTypes[1] ? subTypes[1]->toString() : "void") + ">";
+ case STRUCT: {
+ std::string result = "struct<";
+ for(size_t i=0; i < subTypes.size(); ++i) {
+ if (i != 0) {
+ result += ",";
+ }
+ result += fieldNames[i];
+ result += ":";
+ result += subTypes[i]->toString();
+ }
+ result += ">";
+ return result;
+ }
+ case UNION: {
+ std::string result = "uniontype<";
+ for(size_t i=0; i < subTypes.size(); ++i) {
+ if (i != 0) {
+ result += ",";
+ }
+ result += subTypes[i]->toString();
+ }
+ result += ">";
+ return result;
+ }
+ case DECIMAL: {
+ std::stringstream result;
+ result << "decimal(" << precision << "," << scale << ")";
+ return result.str();
+ }
+ case DATE:
+ return "date";
+ case VARCHAR: {
+ std::stringstream result;
+ result << "varchar(" << maxLength << ")";
+ return result.str();
+ }
+ case CHAR: {
+ std::stringstream result;
+ result << "char(" << maxLength << ")";
+ return result.str();
+ }
+ default:
+ throw NotImplementedYet("Unknown type");
+ }
+ }
+
+ std::unique_ptr<ColumnVectorBatch>
+ TypeImpl::createRowBatch(uint64_t capacity,
+ MemoryPool& memoryPool,
+ bool encoded) const {
+ switch (static_cast<int64_t>(kind)) {
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case DATE:
+ return std::unique_ptr<ColumnVectorBatch>
+ (new LongVectorBatch(capacity, memoryPool));
+
+ case FLOAT:
+ case DOUBLE:
+ return std::unique_ptr<ColumnVectorBatch>
+ (new DoubleVectorBatch(capacity, memoryPool));
+
+ case STRING:
+ case BINARY:
+ case CHAR:
+ case VARCHAR:
+ return encoded ?
+ std::unique_ptr<ColumnVectorBatch>
+ (new EncodedStringVectorBatch(capacity, memoryPool))
+ : std::unique_ptr<ColumnVectorBatch>
+ (new StringVectorBatch(capacity, memoryPool));
+
+ case TIMESTAMP:
+ return std::unique_ptr<ColumnVectorBatch>
+ (new TimestampVectorBatch(capacity, memoryPool));
+
+ case STRUCT: {
+ StructVectorBatch *result = new StructVectorBatch(capacity, memoryPool);
+ std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result);
+ for(uint64_t i=0; i < getSubtypeCount(); ++i) {
+ result->fields.push_back(getSubtype(i)->
+ createRowBatch(capacity,
+ memoryPool, encoded).release());
+ }
+ return return_value;
+ }
+
+ case LIST: {
+ ListVectorBatch* result = new ListVectorBatch(capacity, memoryPool);
+ std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result);
+ if (getSubtype(0) != nullptr) {
+ result->elements = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded);
+ }
+ return return_value;
+ }
+
+ case MAP: {
+ MapVectorBatch* result = new MapVectorBatch(capacity, memoryPool);
+ std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result);
+ if (getSubtype(0) != nullptr) {
+ result->keys = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded);
+ }
+ if (getSubtype(1) != nullptr) {
+ result->elements = getSubtype(1)->createRowBatch(capacity, memoryPool, encoded);
+ }
+ return return_value;
+ }
+
+ case DECIMAL: {
+ if (getPrecision() == 0 || getPrecision() > 18) {
+ return std::unique_ptr<ColumnVectorBatch>
+ (new Decimal128VectorBatch(capacity, memoryPool));
+ } else {
+ return std::unique_ptr<ColumnVectorBatch>
+ (new Decimal64VectorBatch(capacity, memoryPool));
+ }
+ }
+
+ case UNION: {
+ UnionVectorBatch *result = new UnionVectorBatch(capacity, memoryPool);
+ std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result);
+ for(uint64_t i=0; i < getSubtypeCount(); ++i) {
+ result->children.push_back(getSubtype(i)->createRowBatch(capacity,
+ memoryPool, encoded)
+ .release());
+ }
+ return return_value;
+ }
+
+ default:
+ throw NotImplementedYet("not supported yet");
+ }
+ }
+
+ std::unique_ptr<Type> createPrimitiveType(TypeKind kind) {
+ return std::unique_ptr<Type>(new TypeImpl(kind));
+ }
+
+ std::unique_ptr<Type> createCharType(TypeKind kind,
+ uint64_t maxLength) {
+ return std::unique_ptr<Type>(new TypeImpl(kind, maxLength));
+ }
+
+ std::unique_ptr<Type> createDecimalType(uint64_t precision,
+ uint64_t scale) {
+ return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale));
+ }
+
+ std::unique_ptr<Type> createStructType() {
+ return std::unique_ptr<Type>(new TypeImpl(STRUCT));
+ }
+
+ std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements) {
+ TypeImpl* result = new TypeImpl(LIST);
+ std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result);
+ result->addChildType(std::move(elements));
+ return return_value;
+ }
+
+ std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key,
+ std::unique_ptr<Type> value) {
+ TypeImpl* result = new TypeImpl(MAP);
+ std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result);
+ result->addChildType(std::move(key));
+ result->addChildType(std::move(value));
+ return return_value;
+ }
+
+ std::unique_ptr<Type> createUnionType() {
+ return std::unique_ptr<Type>(new TypeImpl(UNION));
+ }
+
+ std::string printProtobufMessage(const google::protobuf::Message& message);
+ std::unique_ptr<Type> convertType(const proto::Type& type,
+ const proto::Footer& footer) {
+ switch (static_cast<int64_t>(type.kind())) {
+
+ case proto::Type_Kind_BOOLEAN:
+ case proto::Type_Kind_BYTE:
+ case proto::Type_Kind_SHORT:
+ case proto::Type_Kind_INT:
+ case proto::Type_Kind_LONG:
+ case proto::Type_Kind_FLOAT:
+ case proto::Type_Kind_DOUBLE:
+ case proto::Type_Kind_STRING:
+ case proto::Type_Kind_BINARY:
+ case proto::Type_Kind_TIMESTAMP:
+ case proto::Type_Kind_DATE:
+ return std::unique_ptr<Type>
+ (new TypeImpl(static_cast<TypeKind>(type.kind())));
+
+ case proto::Type_Kind_CHAR:
+ case proto::Type_Kind_VARCHAR:
+ return std::unique_ptr<Type>
+ (new TypeImpl(static_cast<TypeKind>(type.kind()),
+ type.maximumlength()));
+
+ case proto::Type_Kind_DECIMAL:
+ return std::unique_ptr<Type>
+ (new TypeImpl(DECIMAL, type.precision(), type.scale()));
+
+ case proto::Type_Kind_LIST:
+ case proto::Type_Kind_MAP:
+ case proto::Type_Kind_UNION: {
+ TypeImpl* result = new TypeImpl(static_cast<TypeKind>(type.kind()));
+ std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result);
+ if (type.kind() == proto::Type_Kind_LIST && type.subtypes_size() != 1)
+ throw ParseError("Illegal LIST type that doesn't contain one subtype");
+ if (type.kind() == proto::Type_Kind_MAP && type.subtypes_size() != 2)
+ throw ParseError("Illegal MAP type that doesn't contain two subtypes");
+ if (type.kind() == proto::Type_Kind_UNION && type.subtypes_size() == 0)
+ throw ParseError("Illegal UNION type that doesn't contain any subtypes");
+ for(int i=0; i < type.subtypes_size(); ++i) {
+ result->addUnionChild(convertType(footer.types(static_cast<int>
+ (type.subtypes(i))),
+ footer));
+ }
+ return return_value;
+ }
+
+ case proto::Type_Kind_STRUCT: {
+ TypeImpl* result = new TypeImpl(STRUCT);
+ std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result);
+ for(int i=0; i < type.subtypes_size(); ++i) {
+ result->addStructField(type.fieldnames(i),
+ convertType(footer.types(static_cast<int>
+ (type.subtypes(i))),
+ footer));
+ }
+ return return_value;
+ }
+ default:
+ throw NotImplementedYet("Unknown type kind");
+ }
+ }
+
+ /**
+ * Build a clone of the file type, projecting columns from the selected
+ * vector. This routine assumes that the parent of any selected column
+ * is also selected. The column ids are copied from the fileType.
+ * @param fileType the type in the file
+ * @param selected is each column by id selected
+ * @return a clone of the fileType filtered by the selection array
+ */
+ std::unique_ptr<Type> buildSelectedType(const Type *fileType,
+ const std::vector<bool>& selected) {
+ if (fileType == nullptr || !selected[fileType->getColumnId()]) {
+ return std::unique_ptr<Type>();
+ }
+
+ TypeImpl* result;
+ switch (static_cast<int>(fileType->getKind())) {
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case FLOAT:
+ case DOUBLE:
+ case STRING:
+ case BINARY:
+ case TIMESTAMP:
+ case DATE:
+ result = new TypeImpl(fileType->getKind());
+ break;
+
+ case DECIMAL:
+ result= new TypeImpl(fileType->getKind(),
+ fileType->getPrecision(), fileType->getScale());
+ break;
+
+ case VARCHAR:
+ case CHAR:
+ result = new TypeImpl(fileType->getKind(), fileType->getMaximumLength());
+ break;
+
+ case LIST:
+ result = new TypeImpl(fileType->getKind());
+ result->addChildType(buildSelectedType(fileType->getSubtype(0),
+ selected));
+ break;
+
+ case MAP:
+ result = new TypeImpl(fileType->getKind());
+ result->addChildType(buildSelectedType(fileType->getSubtype(0),
+ selected));
+ result->addChildType(buildSelectedType(fileType->getSubtype(1),
+ selected));
+ break;
+
+ case STRUCT: {
+ result = new TypeImpl(fileType->getKind());
+ for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) {
+ std::unique_ptr<Type> childType =
+ buildSelectedType(fileType->getSubtype(child), selected);
+ if (childType.get() != nullptr) {
+ result->addStructField(fileType->getFieldName(child),
+ std::move(childType));
+ }
+ }
+ break;
+ }
+
+ case UNION: {
+ result = new TypeImpl(fileType->getKind());
+ for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) {
+ std::unique_ptr<Type> childType =
+ buildSelectedType(fileType->getSubtype(child), selected);
+ if (childType.get() != nullptr) {
+ result->addUnionChild(std::move(childType));
+ }
+ }
+ break;
+ }
+
+ default:
+ throw NotImplementedYet("Unknown type kind");
+ }
+ result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId());
+ return std::unique_ptr<Type>(result);
+ }
+
+ ORC_UNIQUE_PTR<Type> Type::buildTypeFromString(const std::string& input) {
+ std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > res =
+ TypeImpl::parseType(input, 0, input.size());
+ if (res.size() != 1) {
+ throw std::logic_error("Invalid type string.");
+ }
+ return std::move(res[0].second);
+ }
+
+ std::unique_ptr<Type> TypeImpl::parseArrayType(const std::string &input,
+ size_t start,
+ size_t end) {
+ TypeImpl* arrayType = new TypeImpl(LIST);
+ std::unique_ptr<Type> return_value = std::unique_ptr<Type>(arrayType);
+ std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v =
+ TypeImpl::parseType(input, start, end);
+ if (v.size() != 1) {
+ throw std::logic_error("Array type must contain exactly one sub type.");
+ }
+ arrayType->addChildType(std::move(v[0].second));
+ return return_value;
+ }
+
+ std::unique_ptr<Type> TypeImpl::parseMapType(const std::string &input,
+ size_t start,
+ size_t end) {
+ TypeImpl * mapType = new TypeImpl(MAP);
+ std::unique_ptr<Type> return_value = std::unique_ptr<Type>(mapType);
+ std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v =
+ TypeImpl::parseType(input, start, end);
+ if (v.size() != 2) {
+ throw std::logic_error(
+ "Map type must contain exactly two sub types.");
+ }
+ mapType->addChildType(std::move(v[0].second));
+ mapType->addChildType(std::move(v[1].second));
+ return return_value;
+ }
+
+ std::unique_ptr<Type> TypeImpl::parseStructType(const std::string &input,
+ size_t start,
+ size_t end) {
+ TypeImpl* structType = new TypeImpl(STRUCT);
+ std::unique_ptr<Type> return_value = std::unique_ptr<Type>(structType);
+ std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type>> > v =
+ TypeImpl::parseType(input, start, end);
+ if (v.size() == 0) {
+ throw std::logic_error(
+ "Struct type must contain at least one sub type.");
+ }
+ for (size_t i = 0; i < v.size(); ++i) {
+ structType->addStructField(v[i].first, std::move(v[i].second));
+ }
+ return return_value;
+ }
+
+ std::unique_ptr<Type> TypeImpl::parseUnionType(const std::string &input,
+ size_t start,
+ size_t end) {
+ TypeImpl* unionType = new TypeImpl(UNION);
+ std::unique_ptr<Type> return_value = std::unique_ptr<Type>(unionType);
+ std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v =
+ TypeImpl::parseType(input, start, end);
+ if (v.size() == 0) {
+ throw std::logic_error("Union type must contain at least one sub type.");
+ }
+ for (size_t i = 0; i < v.size(); ++i) {
+ unionType->addChildType(std::move(v[i].second));
+ }
+ return return_value;
+ }
+
+ std::unique_ptr<Type> TypeImpl::parseDecimalType(const std::string &input,
+ size_t start,
+ size_t end) {
+ size_t sep = input.find(',', start);
+ if (sep + 1 >= end || sep == std::string::npos) {
+ throw std::logic_error("Decimal type must specify precision and scale.");
+ }
+ uint64_t precision =
+ static_cast<uint64_t>(atoi(input.substr(start, sep - start).c_str()));
+ uint64_t scale =
+ static_cast<uint64_t>(atoi(input.substr(sep + 1, end - sep - 1).c_str()));
+ return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale));
+ }
+
+ std::unique_ptr<Type> TypeImpl::parseCategory(std::string category,
+ const std::string &input,
+ size_t start,
+ size_t end) {
+ if (category == "boolean") {
+ return std::unique_ptr<Type>(new TypeImpl(BOOLEAN));
+ } else if (category == "tinyint") {
+ return std::unique_ptr<Type>(new TypeImpl(BYTE));
+ } else if (category == "smallint") {
+ return std::unique_ptr<Type>(new TypeImpl(SHORT));
+ } else if (category == "int") {
+ return std::unique_ptr<Type>(new TypeImpl(INT));
+ } else if (category == "bigint") {
+ return std::unique_ptr<Type>(new TypeImpl(LONG));
+ } else if (category == "float") {
+ return std::unique_ptr<Type>(new TypeImpl(FLOAT));
+ } else if (category == "double") {
+ return std::unique_ptr<Type>(new TypeImpl(DOUBLE));
+ } else if (category == "string") {
+ return std::unique_ptr<Type>(new TypeImpl(STRING));
+ } else if (category == "binary") {
+ return std::unique_ptr<Type>(new TypeImpl(BINARY));
+ } else if (category == "timestamp") {
+ return std::unique_ptr<Type>(new TypeImpl(TIMESTAMP));
+ } else if (category == "array") {
+ return parseArrayType(input, start, end);
+ } else if (category == "map") {
+ return parseMapType(input, start, end);
+ } else if (category == "struct") {
+ return parseStructType(input, start, end);
+ } else if (category == "uniontype") {
+ return parseUnionType(input, start, end);
+ } else if (category == "decimal") {
+ return parseDecimalType(input, start, end);
+ } else if (category == "date") {
+ return std::unique_ptr<Type>(new TypeImpl(DATE));
+ } else if (category == "varchar") {
+ uint64_t maxLength = static_cast<uint64_t>(
+ atoi(input.substr(start, end - start).c_str()));
+ return std::unique_ptr<Type>(new TypeImpl(VARCHAR, maxLength));
+ } else if (category == "char") {
+ uint64_t maxLength = static_cast<uint64_t>(
+ atoi(input.substr(start, end - start).c_str()));
+ return std::unique_ptr<Type>(new TypeImpl(CHAR, maxLength));
+ } else {
+ throw std::logic_error("Unknown type " + category);
+ }
+ }
+
+ std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > TypeImpl::parseType(
+ const std::string &input,
+ size_t start,
+ size_t end) {
+ std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > res;
+ size_t pos = start;
+
+ while (pos < end) {
+ size_t endPos = pos;
+ while (endPos < end && (isalnum(input[endPos]) || input[endPos] == '_')) {
+ ++endPos;
+ }
+
+ std::string fieldName;
+ if (input[endPos] == ':') {
+ fieldName = input.substr(pos, endPos - pos);
+ pos = ++endPos;
+ while (endPos < end && isalpha(input[endPos])) {
+ ++endPos;
+ }
+ }
+
+ size_t nextPos = endPos + 1;
+ if (input[endPos] == '<') {
+ int count = 1;
+ while (nextPos < end) {
+ if (input[nextPos] == '<') {
+ ++count;
+ } else if (input[nextPos] == '>') {
+ --count;
+ }
+ if (count == 0) {
+ break;
+ }
+ ++nextPos;
+ }
+ if (nextPos == end) {
+ throw std::logic_error("Invalid type string. Cannot find closing >");
+ }
+ } else if (input[endPos] == '(') {
+ while (nextPos < end && input[nextPos] != ')') {
+ ++nextPos;
+ }
+ if (nextPos == end) {
+ throw std::logic_error("Invalid type string. Cannot find closing )");
+ }
+ } else if (input[endPos] != ',' && endPos != end) {
+ throw std::logic_error("Unrecognized character.");
+ }
+
+ std::string category = input.substr(pos, endPos - pos);
+ res.push_back(std::make_pair(fieldName, parseCategory(category, input, endPos + 1, nextPos)));
+
+ if (nextPos < end && (input[nextPos] == ')' || input[nextPos] == '>')) {
+ pos = nextPos + 2;
+ } else {
+ pos = nextPos;
+ }
+ }
+
+ return res;
+ }
+
+}
diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.hh b/contrib/libs/apache/orc/c++/src/TypeImpl.hh
index 054ceab5dc..cee52006b7 100644
--- a/contrib/libs/apache/orc/c++/src/TypeImpl.hh
+++ b/contrib/libs/apache/orc/c++/src/TypeImpl.hh
@@ -1,198 +1,198 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef TYPE_IMPL_HH
-#define TYPE_IMPL_HH
-
-#include "orc/Type.hh"
-
-#include "Adaptor.hh"
-#include "wrap/orc-proto-wrapper.hh"
-
-#include <vector>
-
-namespace orc {
-
- class TypeImpl: public Type {
- private:
- TypeImpl* parent;
- mutable int64_t columnId;
- mutable int64_t maximumColumnId;
- TypeKind kind;
- std::vector<Type*> subTypes;
- std::vector<std::string> fieldNames;
- uint64_t subtypeCount;
- uint64_t maxLength;
- uint64_t precision;
- uint64_t scale;
-
- public:
- /**
- * Create most of the primitive types.
- */
- TypeImpl(TypeKind kind);
-
- /**
- * Create char and varchar type.
- */
- TypeImpl(TypeKind kind, uint64_t maxLength);
-
- /**
- * Create decimal type.
- */
- TypeImpl(TypeKind kind, uint64_t precision,
- uint64_t scale);
-
- virtual ~TypeImpl() override;
-
- uint64_t getColumnId() const override;
-
- uint64_t getMaximumColumnId() const override;
-
- TypeKind getKind() const override;
-
- uint64_t getSubtypeCount() const override;
-
- const Type* getSubtype(uint64_t i) const override;
-
- const std::string& getFieldName(uint64_t i) const override;
-
- uint64_t getMaximumLength() const override;
-
- uint64_t getPrecision() const override;
-
- uint64_t getScale() const override;
-
- std::string toString() const override;
-
- Type* addStructField(const std::string& fieldName,
- std::unique_ptr<Type> fieldType) override;
- Type* addUnionChild(std::unique_ptr<Type> fieldType) override;
-
- std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size,
- MemoryPool& memoryPool,
- bool encoded = false
- ) const override;
-
- /**
- * Explicitly set the column ids. Only for internal usage.
- */
- void setIds(uint64_t columnId, uint64_t maxColumnId);
-
- /**
- * Add a child type.
- */
- void addChildType(std::unique_ptr<Type> childType);
-
- static std::vector<std::pair<std::string, std::unique_ptr<Type> > > parseType(
- const std::string &input,
- size_t start,
- size_t end);
-
- private:
- /**
- * Assign ids to this node and its children giving this
- * node rootId.
- * @param rootId the column id that should be assigned to this node.
- */
- uint64_t assignIds(uint64_t rootId) const;
-
- /**
- * Ensure that ids are assigned to all of the nodes.
- */
- void ensureIdAssigned() const;
-
- /**
- * Parse array type from string
- * @param input the input string of an array type
- * @param start start position of the input string
- * @param end end position of the input string
- */
- static std::unique_ptr<Type> parseArrayType(const std::string &input,
- size_t start,
- size_t end);
-
- /**
- * Parse map type from string
- * @param input the input string of a map type
- * @param start start position of the input string
- * @param end end position of the input string
- */
- static std::unique_ptr<Type> parseMapType(const std::string &input,
- size_t start,
- size_t end);
-
- /**
- * Parse struct type from string
- * @param input the input string of a struct type
- * @param start start position of the input string
- * @param end end position of the input string
- */
- static std::unique_ptr<Type> parseStructType(const std::string &input,
- size_t start,
- size_t end);
-
- /**
- * Parse union type from string
- * @param input the input string of an union type
- * @param start start position of the input string
- * @param end end position of the input string
- */
- static std::unique_ptr<Type> parseUnionType(const std::string &input,
- size_t start,
- size_t end);
-
- /**
- * Parse decimal type from string
- * @param input the input string of a decimal type
- * @param start start position of the input string
- * @param end end position of the input string
- */
- static std::unique_ptr<Type> parseDecimalType(const std::string &input,
- size_t start,
- size_t end);
-
- /**
- * Parse type for a category
- * @param category type name
- * @param input the input string of the category
- * @param start start position of the input string
- * @param end end position of the input string
- */
- static std::unique_ptr<Type> parseCategory(std::string category,
- const std::string &input,
- size_t start,
- size_t end);
- };
-
- std::unique_ptr<Type> convertType(const proto::Type& type,
- const proto::Footer& footer);
-
- /**
- * Build a clone of the file type, projecting columns from the selected
- * vector. This routine assumes that the parent of any selected column
- * is also selected.
- * @param fileType the type in the file
- * @param selected is each column by id selected
- * @return a clone of the fileType filtered by the selection array
- */
- std::unique_ptr<Type> buildSelectedType(const Type *fileType,
- const std::vector<bool>& selected);
-}
-
-#endif
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TYPE_IMPL_HH
+#define TYPE_IMPL_HH
+
+#include "orc/Type.hh"
+
+#include "Adaptor.hh"
+#include "wrap/orc-proto-wrapper.hh"
+
+#include <vector>
+
+namespace orc {
+
+ class TypeImpl: public Type {
+ private:
+ TypeImpl* parent;
+ mutable int64_t columnId;
+ mutable int64_t maximumColumnId;
+ TypeKind kind;
+ std::vector<Type*> subTypes;
+ std::vector<std::string> fieldNames;
+ uint64_t subtypeCount;
+ uint64_t maxLength;
+ uint64_t precision;
+ uint64_t scale;
+
+ public:
+ /**
+ * Create most of the primitive types.
+ */
+ TypeImpl(TypeKind kind);
+
+ /**
+ * Create char and varchar type.
+ */
+ TypeImpl(TypeKind kind, uint64_t maxLength);
+
+ /**
+ * Create decimal type.
+ */
+ TypeImpl(TypeKind kind, uint64_t precision,
+ uint64_t scale);
+
+ virtual ~TypeImpl() override;
+
+ uint64_t getColumnId() const override;
+
+ uint64_t getMaximumColumnId() const override;
+
+ TypeKind getKind() const override;
+
+ uint64_t getSubtypeCount() const override;
+
+ const Type* getSubtype(uint64_t i) const override;
+
+ const std::string& getFieldName(uint64_t i) const override;
+
+ uint64_t getMaximumLength() const override;
+
+ uint64_t getPrecision() const override;
+
+ uint64_t getScale() const override;
+
+ std::string toString() const override;
+
+ Type* addStructField(const std::string& fieldName,
+ std::unique_ptr<Type> fieldType) override;
+ Type* addUnionChild(std::unique_ptr<Type> fieldType) override;
+
+ std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size,
+ MemoryPool& memoryPool,
+ bool encoded = false
+ ) const override;
+
+ /**
+ * Explicitly set the column ids. Only for internal usage.
+ */
+ void setIds(uint64_t columnId, uint64_t maxColumnId);
+
+ /**
+ * Add a child type.
+ */
+ void addChildType(std::unique_ptr<Type> childType);
+
+ static std::vector<std::pair<std::string, std::unique_ptr<Type> > > parseType(
+ const std::string &input,
+ size_t start,
+ size_t end);
+
+ private:
+ /**
+ * Assign ids to this node and its children giving this
+ * node rootId.
+ * @param rootId the column id that should be assigned to this node.
+ */
+ uint64_t assignIds(uint64_t rootId) const;
+
+ /**
+ * Ensure that ids are assigned to all of the nodes.
+ */
+ void ensureIdAssigned() const;
+
+ /**
+ * Parse array type from string
+ * @param input the input string of an array type
+ * @param start start position of the input string
+ * @param end end position of the input string
+ */
+ static std::unique_ptr<Type> parseArrayType(const std::string &input,
+ size_t start,
+ size_t end);
+
+ /**
+ * Parse map type from string
+ * @param input the input string of a map type
+ * @param start start position of the input string
+ * @param end end position of the input string
+ */
+ static std::unique_ptr<Type> parseMapType(const std::string &input,
+ size_t start,
+ size_t end);
+
+ /**
+ * Parse struct type from string
+ * @param input the input string of a struct type
+ * @param start start position of the input string
+ * @param end end position of the input string
+ */
+ static std::unique_ptr<Type> parseStructType(const std::string &input,
+ size_t start,
+ size_t end);
+
+ /**
+ * Parse union type from string
+ * @param input the input string of an union type
+ * @param start start position of the input string
+ * @param end end position of the input string
+ */
+ static std::unique_ptr<Type> parseUnionType(const std::string &input,
+ size_t start,
+ size_t end);
+
+ /**
+ * Parse decimal type from string
+ * @param input the input string of a decimal type
+ * @param start start position of the input string
+ * @param end end position of the input string
+ */
+ static std::unique_ptr<Type> parseDecimalType(const std::string &input,
+ size_t start,
+ size_t end);
+
+ /**
+ * Parse type for a category
+ * @param category type name
+ * @param input the input string of the category
+ * @param start start position of the input string
+ * @param end end position of the input string
+ */
+ static std::unique_ptr<Type> parseCategory(std::string category,
+ const std::string &input,
+ size_t start,
+ size_t end);
+ };
+
+ std::unique_ptr<Type> convertType(const proto::Type& type,
+ const proto::Footer& footer);
+
+ /**
+ * Build a clone of the file type, projecting columns from the selected
+ * vector. This routine assumes that the parent of any selected column
+ * is also selected.
+ * @param fileType the type in the file
+ * @param selected is each column by id selected
+ * @return a clone of the fileType filtered by the selection array
+ */
+ std::unique_ptr<Type> buildSelectedType(const Type *fileType,
+ const std::vector<bool>& selected);
+}
+
+#endif
diff --git a/contrib/libs/apache/orc/c++/src/Vector.cc b/contrib/libs/apache/orc/c++/src/Vector.cc
index 14c0ded030..6ba2f8ae7d 100644
--- a/contrib/libs/apache/orc/c++/src/Vector.cc
+++ b/contrib/libs/apache/orc/c++/src/Vector.cc
@@ -1,518 +1,518 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "orc/Vector.hh"
-
-#include "Adaptor.hh"
-#include "orc/Exceptions.hh"
-
-#include <iostream>
-#include <sstream>
-#include <cstdlib>
-
-namespace orc {
-
- ColumnVectorBatch::ColumnVectorBatch(uint64_t cap,
- MemoryPool& pool
- ): capacity(cap),
- numElements(0),
- notNull(pool, cap),
- hasNulls(false),
- isEncoded(false),
- memoryPool(pool) {
- std::memset(notNull.data(), 1, capacity);
- }
-
- ColumnVectorBatch::~ColumnVectorBatch() {
- // PASS
- }
-
- void ColumnVectorBatch::resize(uint64_t cap) {
- if (capacity < cap) {
- capacity = cap;
- notNull.resize(cap);
- }
- }
-
- void ColumnVectorBatch::clear() {
- numElements = 0;
- }
-
- uint64_t ColumnVectorBatch::getMemoryUsage() {
- return static_cast<uint64_t>(notNull.capacity() * sizeof(char));
- }
-
- bool ColumnVectorBatch::hasVariableLength() {
- return false;
- }
-
- LongVectorBatch::LongVectorBatch(uint64_t _capacity, MemoryPool& pool
- ): ColumnVectorBatch(_capacity, pool),
- data(pool, _capacity) {
- // PASS
- }
-
- LongVectorBatch::~LongVectorBatch() {
- // PASS
- }
-
- std::string LongVectorBatch::toString() const {
- std::ostringstream buffer;
- buffer << "Long vector <" << numElements << " of " << capacity << ">";
- return buffer.str();
- }
-
- void LongVectorBatch::resize(uint64_t cap) {
- if (capacity < cap) {
- ColumnVectorBatch::resize(cap);
- data.resize(cap);
- }
- }
-
- void LongVectorBatch::clear() {
- numElements = 0;
- }
-
- uint64_t LongVectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage() +
- static_cast<uint64_t>(data.capacity() * sizeof(int64_t));
- }
-
- DoubleVectorBatch::DoubleVectorBatch(uint64_t _capacity, MemoryPool& pool
- ): ColumnVectorBatch(_capacity, pool),
- data(pool, _capacity) {
- // PASS
- }
-
- DoubleVectorBatch::~DoubleVectorBatch() {
- // PASS
- }
-
- std::string DoubleVectorBatch::toString() const {
- std::ostringstream buffer;
- buffer << "Double vector <" << numElements << " of " << capacity << ">";
- return buffer.str();
- }
-
- void DoubleVectorBatch::resize(uint64_t cap) {
- if (capacity < cap) {
- ColumnVectorBatch::resize(cap);
- data.resize(cap);
- }
- }
-
- void DoubleVectorBatch::clear() {
- numElements = 0;
- }
-
- uint64_t DoubleVectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(data.capacity() * sizeof(double));
- }
-
- StringDictionary::StringDictionary(MemoryPool& pool)
- : dictionaryBlob(pool),
- dictionaryOffset(pool) {
- // PASS
- }
-
- EncodedStringVectorBatch::EncodedStringVectorBatch(uint64_t _capacity,
- MemoryPool& pool)
- : StringVectorBatch(_capacity, pool),
- dictionary(),
- index(pool, _capacity) {
- // PASS
- }
-
- EncodedStringVectorBatch::~EncodedStringVectorBatch() {
- // PASS
- }
-
- std::string EncodedStringVectorBatch::toString() const {
- std::ostringstream buffer;
- buffer << "Encoded string vector <" << numElements << " of " << capacity << ">";
- return buffer.str();
- }
-
- StringVectorBatch::StringVectorBatch(uint64_t _capacity, MemoryPool& pool
- ): ColumnVectorBatch(_capacity, pool),
- data(pool, _capacity),
- length(pool, _capacity),
- blob(pool) {
- // PASS
- }
-
- StringVectorBatch::~StringVectorBatch() {
- // PASS
- }
-
- std::string StringVectorBatch::toString() const {
- std::ostringstream buffer;
- buffer << "Byte vector <" << numElements << " of " << capacity << ">";
- return buffer.str();
- }
-
- void StringVectorBatch::resize(uint64_t cap) {
- if (capacity < cap) {
- ColumnVectorBatch::resize(cap);
- data.resize(cap);
- length.resize(cap);
- }
- }
-
- void StringVectorBatch::clear() {
- numElements = 0;
- }
-
- uint64_t StringVectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(data.capacity() * sizeof(char*)
- + length.capacity() * sizeof(int64_t));
- }
-
- StructVectorBatch::StructVectorBatch(uint64_t cap, MemoryPool& pool
- ): ColumnVectorBatch(cap, pool) {
- // PASS
- }
-
- StructVectorBatch::~StructVectorBatch() {
- for (uint64_t i=0; i<this->fields.size(); i++) {
- delete this->fields[i];
- }
- }
-
- std::string StructVectorBatch::toString() const {
- std::ostringstream buffer;
- buffer << "Struct vector <" << numElements << " of " << capacity
- << "; ";
- for(std::vector<ColumnVectorBatch*>::const_iterator ptr=fields.begin();
- ptr != fields.end(); ++ptr) {
- buffer << (*ptr)->toString() << "; ";
- }
- buffer << ">";
- return buffer.str();
- }
-
- void StructVectorBatch::resize(uint64_t cap) {
- ColumnVectorBatch::resize(cap);
- }
-
- void StructVectorBatch::clear() {
- for(size_t i=0; i < fields.size(); i++) {
- fields[i]->clear();
- }
- numElements = 0;
- }
-
- uint64_t StructVectorBatch::getMemoryUsage() {
- uint64_t memory = ColumnVectorBatch::getMemoryUsage();
- for (unsigned int i=0; i < fields.size(); i++) {
- memory += fields[i]->getMemoryUsage();
- }
- return memory;
- }
-
- bool StructVectorBatch::hasVariableLength() {
- for (unsigned int i=0; i < fields.size(); i++) {
- if (fields[i]->hasVariableLength()) {
- return true;
- }
- }
- return false;
- }
-
- ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool
- ): ColumnVectorBatch(cap, pool),
- offsets(pool, cap+1) {
- // PASS
- }
-
- ListVectorBatch::~ListVectorBatch() {
- // PASS
- }
-
- std::string ListVectorBatch::toString() const {
- std::ostringstream buffer;
- buffer << "List vector <" << elements->toString() << " with "
- << numElements << " of " << capacity << ">";
- return buffer.str();
- }
-
- void ListVectorBatch::resize(uint64_t cap) {
- if (capacity < cap) {
- ColumnVectorBatch::resize(cap);
- offsets.resize(cap + 1);
- }
- }
-
- void ListVectorBatch::clear() {
- numElements = 0;
- elements->clear();
- }
-
- uint64_t ListVectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t))
- + elements->getMemoryUsage();
- }
-
- bool ListVectorBatch::hasVariableLength() {
- return true;
- }
-
- MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool
- ): ColumnVectorBatch(cap, pool),
- offsets(pool, cap+1) {
- // PASS
- }
-
- MapVectorBatch::~MapVectorBatch() {
- // PASS
- }
-
- std::string MapVectorBatch::toString() const {
- std::ostringstream buffer;
- buffer << "Map vector <" << keys->toString() << ", "
- << elements->toString() << " with "
- << numElements << " of " << capacity << ">";
- return buffer.str();
- }
-
- void MapVectorBatch::resize(uint64_t cap) {
- if (capacity < cap) {
- ColumnVectorBatch::resize(cap);
- offsets.resize(cap + 1);
- }
- }
-
- void MapVectorBatch::clear() {
- keys->clear();
- elements->clear();
- numElements = 0;
- }
-
- uint64_t MapVectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t))
- + keys->getMemoryUsage()
- + elements->getMemoryUsage();
- }
-
- bool MapVectorBatch::hasVariableLength() {
- return true;
- }
-
- UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool
- ): ColumnVectorBatch(cap, pool),
- tags(pool, cap),
- offsets(pool, cap) {
- // PASS
- }
-
- UnionVectorBatch::~UnionVectorBatch() {
- for (uint64_t i=0; i < children.size(); i++) {
- delete children[i];
- }
- }
-
- std::string UnionVectorBatch::toString() const {
- std::ostringstream buffer;
- buffer << "Union vector <";
- for(size_t i=0; i < children.size(); ++i) {
- if (i != 0) {
- buffer << ", ";
- }
- buffer << children[i]->toString();
- }
- buffer << "; with " << numElements << " of " << capacity << ">";
- return buffer.str();
- }
-
- void UnionVectorBatch::resize(uint64_t cap) {
- if (capacity < cap) {
- ColumnVectorBatch::resize(cap);
- tags.resize(cap);
- offsets.resize(cap);
- }
- }
-
- void UnionVectorBatch::clear() {
- for(size_t i=0; i < children.size(); i++) {
- children[i]->clear();
- }
- numElements = 0;
- }
-
- uint64_t UnionVectorBatch::getMemoryUsage() {
- uint64_t memory = ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(tags.capacity() * sizeof(unsigned char)
- + offsets.capacity() * sizeof(uint64_t));
- for(size_t i=0; i < children.size(); ++i) {
- memory += children[i]->getMemoryUsage();
- }
- return memory;
- }
-
- bool UnionVectorBatch::hasVariableLength() {
- for(size_t i=0; i < children.size(); ++i) {
- if (children[i]->hasVariableLength()) {
- return true;
- }
- }
- return false;
- }
-
- Decimal64VectorBatch::Decimal64VectorBatch(uint64_t cap, MemoryPool& pool
- ): ColumnVectorBatch(cap, pool),
- precision(0),
- scale(0),
- values(pool, cap),
- readScales(pool, cap) {
- // PASS
- }
-
- Decimal64VectorBatch::~Decimal64VectorBatch() {
- // PASS
- }
-
- std::string Decimal64VectorBatch::toString() const {
- std::ostringstream buffer;
- buffer << "Decimal64 vector with "
- << numElements << " of " << capacity << ">";
- return buffer.str();
- }
-
- void Decimal64VectorBatch::resize(uint64_t cap) {
- if (capacity < cap) {
- ColumnVectorBatch::resize(cap);
- values.resize(cap);
- readScales.resize(cap);
- }
- }
-
- void Decimal64VectorBatch::clear() {
- numElements = 0;
- }
-
- uint64_t Decimal64VectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(
- (values.capacity() + readScales.capacity()) * sizeof(int64_t));
- }
-
- Decimal128VectorBatch::Decimal128VectorBatch(uint64_t cap, MemoryPool& pool
- ): ColumnVectorBatch(cap, pool),
- precision(0),
- scale(0),
- values(pool, cap),
- readScales(pool, cap) {
- // PASS
- }
-
- Decimal128VectorBatch::~Decimal128VectorBatch() {
- // PASS
- }
-
- std::string Decimal128VectorBatch::toString() const {
- std::ostringstream buffer;
- buffer << "Decimal128 vector with "
- << numElements << " of " << capacity << ">";
- return buffer.str();
- }
-
- void Decimal128VectorBatch::resize(uint64_t cap) {
- if (capacity < cap) {
- ColumnVectorBatch::resize(cap);
- values.resize(cap);
- readScales.resize(cap);
- }
- }
-
- void Decimal128VectorBatch::clear() {
- numElements = 0;
- }
-
- uint64_t Decimal128VectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(values.capacity() * sizeof(Int128)
- + readScales.capacity() * sizeof(int64_t));
- }
-
- Decimal::Decimal(const Int128& _value,
- int32_t _scale): value(_value), scale(_scale) {
- // PASS
- }
-
- Decimal::Decimal(const std::string& str) {
- std::size_t foundPoint = str.find(".");
- // no decimal point, it is int
- if(foundPoint == std::string::npos){
- value = Int128(str);
- scale = 0;
- }else{
- std::string copy(str);
- scale = static_cast<int32_t>(str.length() - foundPoint - 1);
- value = Int128(copy.replace(foundPoint, 1, ""));
- }
- }
-
- Decimal::Decimal() : value(0), scale(0) {
- // PASS
- }
-
- std::string Decimal::toString() const {
- return value.toDecimalString(scale);
- }
-
- TimestampVectorBatch::TimestampVectorBatch(uint64_t _capacity,
- MemoryPool& pool
- ): ColumnVectorBatch(_capacity,
- pool),
- data(pool, _capacity),
- nanoseconds(pool, _capacity) {
- // PASS
- }
-
- TimestampVectorBatch::~TimestampVectorBatch() {
- // PASS
- }
-
- std::string TimestampVectorBatch::toString() const {
- std::ostringstream buffer;
- buffer << "Timestamp vector <" << numElements << " of " << capacity << ">";
- return buffer.str();
- }
-
- void TimestampVectorBatch::resize(uint64_t cap) {
- if (capacity < cap) {
- ColumnVectorBatch::resize(cap);
- data.resize(cap);
- nanoseconds.resize(cap);
- }
- }
-
- void TimestampVectorBatch::clear() {
- numElements = 0;
- }
-
- uint64_t TimestampVectorBatch::getMemoryUsage() {
- return ColumnVectorBatch::getMemoryUsage()
- + static_cast<uint64_t>(
- (data.capacity() + nanoseconds.capacity()) * sizeof(int64_t));
- }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Vector.hh"
+
+#include "Adaptor.hh"
+#include "orc/Exceptions.hh"
+
+#include <iostream>
+#include <sstream>
+#include <cstdlib>
+
+namespace orc {
+
+ ColumnVectorBatch::ColumnVectorBatch(uint64_t cap,
+ MemoryPool& pool
+ ): capacity(cap),
+ numElements(0),
+ notNull(pool, cap),
+ hasNulls(false),
+ isEncoded(false),
+ memoryPool(pool) {
+ std::memset(notNull.data(), 1, capacity);
+ }
+
+ ColumnVectorBatch::~ColumnVectorBatch() {
+ // PASS
+ }
+
+ void ColumnVectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ capacity = cap;
+ notNull.resize(cap);
+ }
+ }
+
+ void ColumnVectorBatch::clear() {
+ numElements = 0;
+ }
+
+ uint64_t ColumnVectorBatch::getMemoryUsage() {
+ return static_cast<uint64_t>(notNull.capacity() * sizeof(char));
+ }
+
+ bool ColumnVectorBatch::hasVariableLength() {
+ return false;
+ }
+
+ LongVectorBatch::LongVectorBatch(uint64_t _capacity, MemoryPool& pool
+ ): ColumnVectorBatch(_capacity, pool),
+ data(pool, _capacity) {
+ // PASS
+ }
+
+ LongVectorBatch::~LongVectorBatch() {
+ // PASS
+ }
+
+ std::string LongVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Long vector <" << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ void LongVectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ data.resize(cap);
+ }
+ }
+
+ void LongVectorBatch::clear() {
+ numElements = 0;
+ }
+
+ uint64_t LongVectorBatch::getMemoryUsage() {
+ return ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>(data.capacity() * sizeof(int64_t));
+ }
+
+ DoubleVectorBatch::DoubleVectorBatch(uint64_t _capacity, MemoryPool& pool
+ ): ColumnVectorBatch(_capacity, pool),
+ data(pool, _capacity) {
+ // PASS
+ }
+
+ DoubleVectorBatch::~DoubleVectorBatch() {
+ // PASS
+ }
+
+ std::string DoubleVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Double vector <" << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ void DoubleVectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ data.resize(cap);
+ }
+ }
+
+ void DoubleVectorBatch::clear() {
+ numElements = 0;
+ }
+
+ uint64_t DoubleVectorBatch::getMemoryUsage() {
+ return ColumnVectorBatch::getMemoryUsage()
+ + static_cast<uint64_t>(data.capacity() * sizeof(double));
+ }
+
+ StringDictionary::StringDictionary(MemoryPool& pool)
+ : dictionaryBlob(pool),
+ dictionaryOffset(pool) {
+ // PASS
+ }
+
+ EncodedStringVectorBatch::EncodedStringVectorBatch(uint64_t _capacity,
+ MemoryPool& pool)
+ : StringVectorBatch(_capacity, pool),
+ dictionary(),
+ index(pool, _capacity) {
+ // PASS
+ }
+
+ EncodedStringVectorBatch::~EncodedStringVectorBatch() {
+ // PASS
+ }
+
+ std::string EncodedStringVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Encoded string vector <" << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ StringVectorBatch::StringVectorBatch(uint64_t _capacity, MemoryPool& pool
+ ): ColumnVectorBatch(_capacity, pool),
+ data(pool, _capacity),
+ length(pool, _capacity),
+ blob(pool) {
+ // PASS
+ }
+
+ StringVectorBatch::~StringVectorBatch() {
+ // PASS
+ }
+
+ std::string StringVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Byte vector <" << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ void StringVectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ data.resize(cap);
+ length.resize(cap);
+ }
+ }
+
+ void StringVectorBatch::clear() {
+ numElements = 0;
+ }
+
+ uint64_t StringVectorBatch::getMemoryUsage() {
+ return ColumnVectorBatch::getMemoryUsage()
+ + static_cast<uint64_t>(data.capacity() * sizeof(char*)
+ + length.capacity() * sizeof(int64_t));
+ }
+
+ StructVectorBatch::StructVectorBatch(uint64_t cap, MemoryPool& pool
+ ): ColumnVectorBatch(cap, pool) {
+ // PASS
+ }
+
+ StructVectorBatch::~StructVectorBatch() {
+ for (uint64_t i=0; i<this->fields.size(); i++) {
+ delete this->fields[i];
+ }
+ }
+
+ std::string StructVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Struct vector <" << numElements << " of " << capacity
+ << "; ";
+ for(std::vector<ColumnVectorBatch*>::const_iterator ptr=fields.begin();
+ ptr != fields.end(); ++ptr) {
+ buffer << (*ptr)->toString() << "; ";
+ }
+ buffer << ">";
+ return buffer.str();
+ }
+
+ void StructVectorBatch::resize(uint64_t cap) {
+ ColumnVectorBatch::resize(cap);
+ }
+
+ void StructVectorBatch::clear() {
+ for(size_t i=0; i < fields.size(); i++) {
+ fields[i]->clear();
+ }
+ numElements = 0;
+ }
+
+ uint64_t StructVectorBatch::getMemoryUsage() {
+ uint64_t memory = ColumnVectorBatch::getMemoryUsage();
+ for (unsigned int i=0; i < fields.size(); i++) {
+ memory += fields[i]->getMemoryUsage();
+ }
+ return memory;
+ }
+
+ bool StructVectorBatch::hasVariableLength() {
+ for (unsigned int i=0; i < fields.size(); i++) {
+ if (fields[i]->hasVariableLength()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool
+ ): ColumnVectorBatch(cap, pool),
+ offsets(pool, cap+1) {
+ // PASS
+ }
+
+ ListVectorBatch::~ListVectorBatch() {
+ // PASS
+ }
+
+ std::string ListVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "List vector <" << elements->toString() << " with "
+ << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ void ListVectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ offsets.resize(cap + 1);
+ }
+ }
+
+ void ListVectorBatch::clear() {
+ numElements = 0;
+ elements->clear();
+ }
+
+ uint64_t ListVectorBatch::getMemoryUsage() {
+ return ColumnVectorBatch::getMemoryUsage()
+ + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t))
+ + elements->getMemoryUsage();
+ }
+
+ bool ListVectorBatch::hasVariableLength() {
+ return true;
+ }
+
+ MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool
+ ): ColumnVectorBatch(cap, pool),
+ offsets(pool, cap+1) {
+ // PASS
+ }
+
+ MapVectorBatch::~MapVectorBatch() {
+ // PASS
+ }
+
+ std::string MapVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Map vector <" << keys->toString() << ", "
+ << elements->toString() << " with "
+ << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ void MapVectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ offsets.resize(cap + 1);
+ }
+ }
+
+ void MapVectorBatch::clear() {
+ keys->clear();
+ elements->clear();
+ numElements = 0;
+ }
+
+ uint64_t MapVectorBatch::getMemoryUsage() {
+ return ColumnVectorBatch::getMemoryUsage()
+ + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t))
+ + keys->getMemoryUsage()
+ + elements->getMemoryUsage();
+ }
+
+ bool MapVectorBatch::hasVariableLength() {
+ return true;
+ }
+
+ UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool
+ ): ColumnVectorBatch(cap, pool),
+ tags(pool, cap),
+ offsets(pool, cap) {
+ // PASS
+ }
+
+ UnionVectorBatch::~UnionVectorBatch() {
+ for (uint64_t i=0; i < children.size(); i++) {
+ delete children[i];
+ }
+ }
+
+ std::string UnionVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Union vector <";
+ for(size_t i=0; i < children.size(); ++i) {
+ if (i != 0) {
+ buffer << ", ";
+ }
+ buffer << children[i]->toString();
+ }
+ buffer << "; with " << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ void UnionVectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ tags.resize(cap);
+ offsets.resize(cap);
+ }
+ }
+
+ void UnionVectorBatch::clear() {
+ for(size_t i=0; i < children.size(); i++) {
+ children[i]->clear();
+ }
+ numElements = 0;
+ }
+
+ uint64_t UnionVectorBatch::getMemoryUsage() {
+ uint64_t memory = ColumnVectorBatch::getMemoryUsage()
+ + static_cast<uint64_t>(tags.capacity() * sizeof(unsigned char)
+ + offsets.capacity() * sizeof(uint64_t));
+ for(size_t i=0; i < children.size(); ++i) {
+ memory += children[i]->getMemoryUsage();
+ }
+ return memory;
+ }
+
+ bool UnionVectorBatch::hasVariableLength() {
+ for(size_t i=0; i < children.size(); ++i) {
+ if (children[i]->hasVariableLength()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ Decimal64VectorBatch::Decimal64VectorBatch(uint64_t cap, MemoryPool& pool
+ ): ColumnVectorBatch(cap, pool),
+ precision(0),
+ scale(0),
+ values(pool, cap),
+ readScales(pool, cap) {
+ // PASS
+ }
+
+ Decimal64VectorBatch::~Decimal64VectorBatch() {
+ // PASS
+ }
+
+ std::string Decimal64VectorBatch::toString() const {
+ std::ostringstream buffer;
+    buffer << "Decimal64 vector <"
+           << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ void Decimal64VectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ values.resize(cap);
+ readScales.resize(cap);
+ }
+ }
+
+ void Decimal64VectorBatch::clear() {
+ numElements = 0;
+ }
+
+ uint64_t Decimal64VectorBatch::getMemoryUsage() {
+ return ColumnVectorBatch::getMemoryUsage()
+ + static_cast<uint64_t>(
+ (values.capacity() + readScales.capacity()) * sizeof(int64_t));
+ }
+
+ Decimal128VectorBatch::Decimal128VectorBatch(uint64_t cap, MemoryPool& pool
+ ): ColumnVectorBatch(cap, pool),
+ precision(0),
+ scale(0),
+ values(pool, cap),
+ readScales(pool, cap) {
+ // PASS
+ }
+
+ Decimal128VectorBatch::~Decimal128VectorBatch() {
+ // PASS
+ }
+
+ std::string Decimal128VectorBatch::toString() const {
+ std::ostringstream buffer;
+    buffer << "Decimal128 vector <"
+           << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ void Decimal128VectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ values.resize(cap);
+ readScales.resize(cap);
+ }
+ }
+
+ void Decimal128VectorBatch::clear() {
+ numElements = 0;
+ }
+
+ uint64_t Decimal128VectorBatch::getMemoryUsage() {
+ return ColumnVectorBatch::getMemoryUsage()
+ + static_cast<uint64_t>(values.capacity() * sizeof(Int128)
+ + readScales.capacity() * sizeof(int64_t));
+ }
+
+ Decimal::Decimal(const Int128& _value,
+ int32_t _scale): value(_value), scale(_scale) {
+ // PASS
+ }
+
+ Decimal::Decimal(const std::string& str) {
+ std::size_t foundPoint = str.find(".");
+    // no decimal point, so the value is an integer
+ if(foundPoint == std::string::npos){
+ value = Int128(str);
+ scale = 0;
+ }else{
+ std::string copy(str);
+ scale = static_cast<int32_t>(str.length() - foundPoint - 1);
+ value = Int128(copy.replace(foundPoint, 1, ""));
+ }
+ }
+
+ Decimal::Decimal() : value(0), scale(0) {
+ // PASS
+ }
+
+ std::string Decimal::toString() const {
+ return value.toDecimalString(scale);
+ }
+
+ TimestampVectorBatch::TimestampVectorBatch(uint64_t _capacity,
+ MemoryPool& pool
+ ): ColumnVectorBatch(_capacity,
+ pool),
+ data(pool, _capacity),
+ nanoseconds(pool, _capacity) {
+ // PASS
+ }
+
+ TimestampVectorBatch::~TimestampVectorBatch() {
+ // PASS
+ }
+
+ std::string TimestampVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Timestamp vector <" << numElements << " of " << capacity << ">";
+ return buffer.str();
+ }
+
+ void TimestampVectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ data.resize(cap);
+ nanoseconds.resize(cap);
+ }
+ }
+
+ void TimestampVectorBatch::clear() {
+ numElements = 0;
+ }
+
+ uint64_t TimestampVectorBatch::getMemoryUsage() {
+ return ColumnVectorBatch::getMemoryUsage()
+ + static_cast<uint64_t>(
+ (data.capacity() + nanoseconds.capacity()) * sizeof(int64_t));
+ }
+}
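
For orientation, a minimal sketch (not part of this diff) of how the vector batch classes implemented above are typically populated by a caller. The orc/MemoryPool.hh header name and the getDefaultPool() helper are assumed from the library's public API; the column layout (a struct with a single long column) is purely illustrative.

#include "orc/MemoryPool.hh"   // assumed public header declaring orc::getDefaultPool()
#include "orc/Vector.hh"

#include <cstdint>
#include <iostream>

int main() {
  orc::MemoryPool* pool = orc::getDefaultPool();

  // A struct<long> batch: StructVectorBatch owns its children and
  // deletes them in its destructor (see ~StructVectorBatch above).
  orc::StructVectorBatch* root = new orc::StructVectorBatch(1024, *pool);
  orc::LongVectorBatch* longs = new orc::LongVectorBatch(1024, *pool);
  root->fields.push_back(longs);

  for (uint64_t i = 0; i < 10; ++i) {
    longs->data[i] = static_cast<int64_t>(i * i);
  }
  longs->numElements = 10;
  root->numElements = 10;

  std::cout << root->toString() << "\n";
  std::cout << "approx. memory: " << root->getMemoryUsage() << " bytes\n";

  delete root;   // also frees the child LongVectorBatch
  return 0;
}
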
diff --git a/contrib/libs/apache/orc/c++/src/Writer.cc b/contrib/libs/apache/orc/c++/src/Writer.cc
index b5bd19b304..8b13750865 100644
--- a/contrib/libs/apache/orc/c++/src/Writer.cc
+++ b/contrib/libs/apache/orc/c++/src/Writer.cc
@@ -1,641 +1,641 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "orc/Common.hh"
-#include "orc/OrcFile.hh"
-
-#include "ColumnWriter.hh"
-#include "Timezone.hh"
-
-#include <memory>
-
-namespace orc {
-
- struct WriterOptionsPrivate {
- uint64_t stripeSize;
- uint64_t compressionBlockSize;
- uint64_t rowIndexStride;
- CompressionKind compression;
- CompressionStrategy compressionStrategy;
- MemoryPool* memoryPool;
- double paddingTolerance;
- std::ostream* errorStream;
- FileVersion fileVersion;
- double dictionaryKeySizeThreshold;
- bool enableIndex;
- std::set<uint64_t> columnsUseBloomFilter;
- double bloomFilterFalsePositiveProb;
- BloomFilterVersion bloomFilterVersion;
-
- WriterOptionsPrivate() :
- fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12
- stripeSize = 64 * 1024 * 1024; // 64M
- compressionBlockSize = 64 * 1024; // 64K
- rowIndexStride = 10000;
- compression = CompressionKind_ZLIB;
- compressionStrategy = CompressionStrategy_SPEED;
- memoryPool = getDefaultPool();
- paddingTolerance = 0.0;
- errorStream = &std::cerr;
- dictionaryKeySizeThreshold = 0.0;
- enableIndex = true;
- bloomFilterFalsePositiveProb = 0.05;
- bloomFilterVersion = UTF8;
- }
- };
-
- WriterOptions::WriterOptions():
- privateBits(std::unique_ptr<WriterOptionsPrivate>
- (new WriterOptionsPrivate())) {
- // PASS
- }
-
- WriterOptions::WriterOptions(const WriterOptions& rhs):
- privateBits(std::unique_ptr<WriterOptionsPrivate>
- (new WriterOptionsPrivate(*(rhs.privateBits.get())))) {
- // PASS
- }
-
- WriterOptions::WriterOptions(WriterOptions& rhs) {
- // swap privateBits with rhs
- WriterOptionsPrivate* l = privateBits.release();
- privateBits.reset(rhs.privateBits.release());
- rhs.privateBits.reset(l);
- }
-
- WriterOptions& WriterOptions::operator=(const WriterOptions& rhs) {
- if (this != &rhs) {
- privateBits.reset(new WriterOptionsPrivate(*(rhs.privateBits.get())));
- }
- return *this;
- }
-
- WriterOptions::~WriterOptions() {
- // PASS
- }
- RleVersion WriterOptions::getRleVersion() const {
- if(privateBits->fileVersion == FileVersion::v_0_11())
- {
- return RleVersion_1;
- }
-
- return RleVersion_2;
- }
-
- WriterOptions& WriterOptions::setStripeSize(uint64_t size) {
- privateBits->stripeSize = size;
- return *this;
- }
-
- uint64_t WriterOptions::getStripeSize() const {
- return privateBits->stripeSize;
- }
-
- WriterOptions& WriterOptions::setCompressionBlockSize(uint64_t size) {
- privateBits->compressionBlockSize = size;
- return *this;
- }
-
- uint64_t WriterOptions::getCompressionBlockSize() const {
- return privateBits->compressionBlockSize;
- }
-
- WriterOptions& WriterOptions::setRowIndexStride(uint64_t stride) {
- privateBits->rowIndexStride = stride;
- privateBits->enableIndex = (stride != 0);
- return *this;
- }
-
- uint64_t WriterOptions::getRowIndexStride() const {
- return privateBits->rowIndexStride;
- }
-
- WriterOptions& WriterOptions::setDictionaryKeySizeThreshold(double val) {
- privateBits->dictionaryKeySizeThreshold = val;
- return *this;
- }
-
- double WriterOptions::getDictionaryKeySizeThreshold() const {
- return privateBits->dictionaryKeySizeThreshold;
- }
-
- WriterOptions& WriterOptions::setFileVersion(const FileVersion& version) {
-    // Only Hive_0_11 and Hive_0_12 versions are supported currently
- if (version.getMajor() == 0 && (version.getMinor() == 11 || version.getMinor() == 12)) {
- privateBits->fileVersion = version;
- return *this;
- }
- throw std::logic_error("Unsupported file version specified.");
- }
-
- FileVersion WriterOptions::getFileVersion() const {
- return privateBits->fileVersion;
- }
-
- WriterOptions& WriterOptions::setCompression(CompressionKind comp) {
- privateBits->compression = comp;
- return *this;
- }
-
- CompressionKind WriterOptions::getCompression() const {
- return privateBits->compression;
- }
-
- WriterOptions& WriterOptions::setCompressionStrategy(
- CompressionStrategy strategy) {
- privateBits->compressionStrategy = strategy;
- return *this;
- }
-
- CompressionStrategy WriterOptions::getCompressionStrategy() const {
- return privateBits->compressionStrategy;
- }
-
- bool WriterOptions::getAlignedBitpacking() const {
-    return privateBits->compressionStrategy == CompressionStrategy::CompressionStrategy_SPEED;
- }
-
- WriterOptions& WriterOptions::setPaddingTolerance(double tolerance) {
- privateBits->paddingTolerance = tolerance;
- return *this;
- }
-
- double WriterOptions::getPaddingTolerance() const {
- return privateBits->paddingTolerance;
- }
-
- WriterOptions& WriterOptions::setMemoryPool(MemoryPool* memoryPool) {
- privateBits->memoryPool = memoryPool;
- return *this;
- }
-
- MemoryPool* WriterOptions::getMemoryPool() const {
- return privateBits->memoryPool;
- }
-
- WriterOptions& WriterOptions::setErrorStream(std::ostream& errStream) {
- privateBits->errorStream = &errStream;
- return *this;
- }
-
- std::ostream* WriterOptions::getErrorStream() const {
- return privateBits->errorStream;
- }
-
- bool WriterOptions::getEnableIndex() const {
- return privateBits->enableIndex;
- }
-
- bool WriterOptions::getEnableDictionary() const {
- return privateBits->dictionaryKeySizeThreshold > 0.0;
- }
-
- WriterOptions& WriterOptions::setColumnsUseBloomFilter(
- const std::set<uint64_t>& columns) {
- privateBits->columnsUseBloomFilter = columns;
- return *this;
- }
-
- bool WriterOptions::isColumnUseBloomFilter(uint64_t column) const {
- return privateBits->columnsUseBloomFilter.find(column) !=
- privateBits->columnsUseBloomFilter.end();
- }
-
- WriterOptions& WriterOptions::setBloomFilterFPP(double fpp) {
- privateBits->bloomFilterFalsePositiveProb = fpp;
- return *this;
- }
-
- double WriterOptions::getBloomFilterFPP() const {
- return privateBits->bloomFilterFalsePositiveProb;
- }
-
-  // deliberately do not provide a setter for the bloom filter version
-  // because we only support UTF8 for now.
- BloomFilterVersion WriterOptions::getBloomFilterVersion() const {
- return privateBits->bloomFilterVersion;
- }
-
- Writer::~Writer() {
- // PASS
- }
-
- class WriterImpl : public Writer {
- private:
- std::unique_ptr<ColumnWriter> columnWriter;
- std::unique_ptr<BufferedOutputStream> compressionStream;
- std::unique_ptr<BufferedOutputStream> bufferedStream;
- std::unique_ptr<StreamsFactory> streamsFactory;
- OutputStream* outStream;
- WriterOptions options;
- const Type& type;
- uint64_t stripeRows, totalRows, indexRows;
- uint64_t currentOffset;
- proto::Footer fileFooter;
- proto::PostScript postScript;
- proto::StripeInformation stripeInfo;
- proto::Metadata metadata;
-
- static const char* magicId;
- static const WriterId writerId;
-
- public:
- WriterImpl(
- const Type& type,
- OutputStream* stream,
- const WriterOptions& options);
-
- std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size)
- const override;
-
- void add(ColumnVectorBatch& rowsToAdd) override;
-
- void close() override;
-
- void addUserMetadata(const std::string name, const std::string value) override;
-
- private:
- void init();
- void initStripe();
- void writeStripe();
- void writeMetadata();
- void writeFileFooter();
- void writePostscript();
- void buildFooterType(const Type& t, proto::Footer& footer, uint32_t& index);
- static proto::CompressionKind convertCompressionKind(
- const CompressionKind& kind);
- };
-
- const char * WriterImpl::magicId = "ORC";
-
- const WriterId WriterImpl::writerId = WriterId::ORC_CPP_WRITER;
-
- WriterImpl::WriterImpl(
- const Type& t,
- OutputStream* stream,
- const WriterOptions& opts) :
- outStream(stream),
- options(opts),
- type(t) {
- streamsFactory = createStreamsFactory(options, outStream);
- columnWriter = buildWriter(type, *streamsFactory, options);
- stripeRows = totalRows = indexRows = 0;
- currentOffset = 0;
-
- // compression stream for stripe footer, file footer and metadata
- compressionStream = createCompressor(
- options.getCompression(),
- outStream,
- options.getCompressionStrategy(),
- 1 * 1024 * 1024, // buffer capacity: 1M
- options.getCompressionBlockSize(),
- *options.getMemoryPool());
-
- // uncompressed stream for post script
- bufferedStream.reset(new BufferedOutputStream(
- *options.getMemoryPool(),
- outStream,
- 1024, // buffer capacity: 1024 bytes
- options.getCompressionBlockSize()));
-
- init();
- }
-
- std::unique_ptr<ColumnVectorBatch> WriterImpl::createRowBatch(uint64_t size)
- const {
- return type.createRowBatch(size, *options.getMemoryPool());
- }
-
- void WriterImpl::add(ColumnVectorBatch& rowsToAdd) {
- if (options.getEnableIndex()) {
- uint64_t pos = 0;
- uint64_t chunkSize = 0;
- uint64_t rowIndexStride = options.getRowIndexStride();
- while (pos < rowsToAdd.numElements) {
- chunkSize = std::min(rowsToAdd.numElements - pos,
- rowIndexStride - indexRows);
- columnWriter->add(rowsToAdd, pos, chunkSize, nullptr);
-
- pos += chunkSize;
- indexRows += chunkSize;
- stripeRows += chunkSize;
-
- if (indexRows >= rowIndexStride) {
- columnWriter->createRowIndexEntry();
- indexRows = 0;
- }
- }
- } else {
- stripeRows += rowsToAdd.numElements;
- columnWriter->add(rowsToAdd, 0, rowsToAdd.numElements, nullptr);
- }
-
- if (columnWriter->getEstimatedSize() >= options.getStripeSize()) {
- writeStripe();
- }
- }
-
- void WriterImpl::close() {
- if (stripeRows > 0) {
- writeStripe();
- }
- writeMetadata();
- writeFileFooter();
- writePostscript();
- outStream->close();
- }
-
- void WriterImpl::addUserMetadata(const std::string name, const std::string value){
- proto::UserMetadataItem* userMetadataItem = fileFooter.add_metadata();
- userMetadataItem->set_name(TString(name));
- userMetadataItem->set_value(TString(value));
- }
-
- void WriterImpl::init() {
- // Write file header
- const static size_t magicIdLength = strlen(WriterImpl::magicId);
- outStream->write(WriterImpl::magicId, magicIdLength);
- currentOffset += magicIdLength;
-
- // Initialize file footer
- fileFooter.set_headerlength(currentOffset);
- fileFooter.set_contentlength(0);
- fileFooter.set_numberofrows(0);
- fileFooter.set_rowindexstride(
- static_cast<uint32_t>(options.getRowIndexStride()));
- fileFooter.set_writer(writerId);
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Common.hh"
+#include "orc/OrcFile.hh"
+
+#include "ColumnWriter.hh"
+#include "Timezone.hh"
+
+#include <memory>
+
+namespace orc {
+
+ struct WriterOptionsPrivate {
+ uint64_t stripeSize;
+ uint64_t compressionBlockSize;
+ uint64_t rowIndexStride;
+ CompressionKind compression;
+ CompressionStrategy compressionStrategy;
+ MemoryPool* memoryPool;
+ double paddingTolerance;
+ std::ostream* errorStream;
+ FileVersion fileVersion;
+ double dictionaryKeySizeThreshold;
+ bool enableIndex;
+ std::set<uint64_t> columnsUseBloomFilter;
+ double bloomFilterFalsePositiveProb;
+ BloomFilterVersion bloomFilterVersion;
+
+ WriterOptionsPrivate() :
+ fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12
+ stripeSize = 64 * 1024 * 1024; // 64M
+ compressionBlockSize = 64 * 1024; // 64K
+ rowIndexStride = 10000;
+ compression = CompressionKind_ZLIB;
+ compressionStrategy = CompressionStrategy_SPEED;
+ memoryPool = getDefaultPool();
+ paddingTolerance = 0.0;
+ errorStream = &std::cerr;
+ dictionaryKeySizeThreshold = 0.0;
+ enableIndex = true;
+ bloomFilterFalsePositiveProb = 0.05;
+ bloomFilterVersion = UTF8;
+ }
+ };
+
+ WriterOptions::WriterOptions():
+ privateBits(std::unique_ptr<WriterOptionsPrivate>
+ (new WriterOptionsPrivate())) {
+ // PASS
+ }
+
+ WriterOptions::WriterOptions(const WriterOptions& rhs):
+ privateBits(std::unique_ptr<WriterOptionsPrivate>
+ (new WriterOptionsPrivate(*(rhs.privateBits.get())))) {
+ // PASS
+ }
+
+ WriterOptions::WriterOptions(WriterOptions& rhs) {
+ // swap privateBits with rhs
+ WriterOptionsPrivate* l = privateBits.release();
+ privateBits.reset(rhs.privateBits.release());
+ rhs.privateBits.reset(l);
+ }
+
+ WriterOptions& WriterOptions::operator=(const WriterOptions& rhs) {
+ if (this != &rhs) {
+ privateBits.reset(new WriterOptionsPrivate(*(rhs.privateBits.get())));
+ }
+ return *this;
+ }
+
+ WriterOptions::~WriterOptions() {
+ // PASS
+ }
+ RleVersion WriterOptions::getRleVersion() const {
+ if(privateBits->fileVersion == FileVersion::v_0_11())
+ {
+ return RleVersion_1;
+ }
+
+ return RleVersion_2;
+ }
+
+ WriterOptions& WriterOptions::setStripeSize(uint64_t size) {
+ privateBits->stripeSize = size;
+ return *this;
+ }
+
+ uint64_t WriterOptions::getStripeSize() const {
+ return privateBits->stripeSize;
+ }
+
+ WriterOptions& WriterOptions::setCompressionBlockSize(uint64_t size) {
+ privateBits->compressionBlockSize = size;
+ return *this;
+ }
+
+ uint64_t WriterOptions::getCompressionBlockSize() const {
+ return privateBits->compressionBlockSize;
+ }
+
+ WriterOptions& WriterOptions::setRowIndexStride(uint64_t stride) {
+ privateBits->rowIndexStride = stride;
+ privateBits->enableIndex = (stride != 0);
+ return *this;
+ }
+
+ uint64_t WriterOptions::getRowIndexStride() const {
+ return privateBits->rowIndexStride;
+ }
+
+ WriterOptions& WriterOptions::setDictionaryKeySizeThreshold(double val) {
+ privateBits->dictionaryKeySizeThreshold = val;
+ return *this;
+ }
+
+ double WriterOptions::getDictionaryKeySizeThreshold() const {
+ return privateBits->dictionaryKeySizeThreshold;
+ }
+
+ WriterOptions& WriterOptions::setFileVersion(const FileVersion& version) {
+    // Only Hive_0_11 and Hive_0_12 versions are supported currently
+ if (version.getMajor() == 0 && (version.getMinor() == 11 || version.getMinor() == 12)) {
+ privateBits->fileVersion = version;
+ return *this;
+ }
+ throw std::logic_error("Unsupported file version specified.");
+ }
+
+ FileVersion WriterOptions::getFileVersion() const {
+ return privateBits->fileVersion;
+ }
+
+ WriterOptions& WriterOptions::setCompression(CompressionKind comp) {
+ privateBits->compression = comp;
+ return *this;
+ }
+
+ CompressionKind WriterOptions::getCompression() const {
+ return privateBits->compression;
+ }
+
+ WriterOptions& WriterOptions::setCompressionStrategy(
+ CompressionStrategy strategy) {
+ privateBits->compressionStrategy = strategy;
+ return *this;
+ }
+
+ CompressionStrategy WriterOptions::getCompressionStrategy() const {
+ return privateBits->compressionStrategy;
+ }
+
+ bool WriterOptions::getAlignedBitpacking() const {
+    return privateBits->compressionStrategy == CompressionStrategy::CompressionStrategy_SPEED;
+ }
+
+ WriterOptions& WriterOptions::setPaddingTolerance(double tolerance) {
+ privateBits->paddingTolerance = tolerance;
+ return *this;
+ }
+
+ double WriterOptions::getPaddingTolerance() const {
+ return privateBits->paddingTolerance;
+ }
+
+ WriterOptions& WriterOptions::setMemoryPool(MemoryPool* memoryPool) {
+ privateBits->memoryPool = memoryPool;
+ return *this;
+ }
+
+ MemoryPool* WriterOptions::getMemoryPool() const {
+ return privateBits->memoryPool;
+ }
+
+ WriterOptions& WriterOptions::setErrorStream(std::ostream& errStream) {
+ privateBits->errorStream = &errStream;
+ return *this;
+ }
+
+ std::ostream* WriterOptions::getErrorStream() const {
+ return privateBits->errorStream;
+ }
+
+ bool WriterOptions::getEnableIndex() const {
+ return privateBits->enableIndex;
+ }
+
+ bool WriterOptions::getEnableDictionary() const {
+ return privateBits->dictionaryKeySizeThreshold > 0.0;
+ }
+
+ WriterOptions& WriterOptions::setColumnsUseBloomFilter(
+ const std::set<uint64_t>& columns) {
+ privateBits->columnsUseBloomFilter = columns;
+ return *this;
+ }
+
+ bool WriterOptions::isColumnUseBloomFilter(uint64_t column) const {
+ return privateBits->columnsUseBloomFilter.find(column) !=
+ privateBits->columnsUseBloomFilter.end();
+ }
+
+ WriterOptions& WriterOptions::setBloomFilterFPP(double fpp) {
+ privateBits->bloomFilterFalsePositiveProb = fpp;
+ return *this;
+ }
+
+ double WriterOptions::getBloomFilterFPP() const {
+ return privateBits->bloomFilterFalsePositiveProb;
+ }
+
+  // deliberately do not provide a setter for the bloom filter version
+  // because we only support UTF8 for now.
+ BloomFilterVersion WriterOptions::getBloomFilterVersion() const {
+ return privateBits->bloomFilterVersion;
+ }
+
+ Writer::~Writer() {
+ // PASS
+ }
+
+ class WriterImpl : public Writer {
+ private:
+ std::unique_ptr<ColumnWriter> columnWriter;
+ std::unique_ptr<BufferedOutputStream> compressionStream;
+ std::unique_ptr<BufferedOutputStream> bufferedStream;
+ std::unique_ptr<StreamsFactory> streamsFactory;
+ OutputStream* outStream;
+ WriterOptions options;
+ const Type& type;
+ uint64_t stripeRows, totalRows, indexRows;
+ uint64_t currentOffset;
+ proto::Footer fileFooter;
+ proto::PostScript postScript;
+ proto::StripeInformation stripeInfo;
+ proto::Metadata metadata;
+
+ static const char* magicId;
+ static const WriterId writerId;
+
+ public:
+ WriterImpl(
+ const Type& type,
+ OutputStream* stream,
+ const WriterOptions& options);
+
+ std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size)
+ const override;
+
+ void add(ColumnVectorBatch& rowsToAdd) override;
+
+ void close() override;
+
+ void addUserMetadata(const std::string name, const std::string value) override;
+
+ private:
+ void init();
+ void initStripe();
+ void writeStripe();
+ void writeMetadata();
+ void writeFileFooter();
+ void writePostscript();
+ void buildFooterType(const Type& t, proto::Footer& footer, uint32_t& index);
+ static proto::CompressionKind convertCompressionKind(
+ const CompressionKind& kind);
+ };
+
+ const char * WriterImpl::magicId = "ORC";
+
+ const WriterId WriterImpl::writerId = WriterId::ORC_CPP_WRITER;
+
+ WriterImpl::WriterImpl(
+ const Type& t,
+ OutputStream* stream,
+ const WriterOptions& opts) :
+ outStream(stream),
+ options(opts),
+ type(t) {
+ streamsFactory = createStreamsFactory(options, outStream);
+ columnWriter = buildWriter(type, *streamsFactory, options);
+ stripeRows = totalRows = indexRows = 0;
+ currentOffset = 0;
+
+ // compression stream for stripe footer, file footer and metadata
+ compressionStream = createCompressor(
+ options.getCompression(),
+ outStream,
+ options.getCompressionStrategy(),
+ 1 * 1024 * 1024, // buffer capacity: 1M
+ options.getCompressionBlockSize(),
+ *options.getMemoryPool());
+
+ // uncompressed stream for post script
+ bufferedStream.reset(new BufferedOutputStream(
+ *options.getMemoryPool(),
+ outStream,
+ 1024, // buffer capacity: 1024 bytes
+ options.getCompressionBlockSize()));
+
+ init();
+ }
+
+ std::unique_ptr<ColumnVectorBatch> WriterImpl::createRowBatch(uint64_t size)
+ const {
+ return type.createRowBatch(size, *options.getMemoryPool());
+ }
+
+ void WriterImpl::add(ColumnVectorBatch& rowsToAdd) {
+ if (options.getEnableIndex()) {
+ uint64_t pos = 0;
+ uint64_t chunkSize = 0;
+ uint64_t rowIndexStride = options.getRowIndexStride();
+ while (pos < rowsToAdd.numElements) {
+ chunkSize = std::min(rowsToAdd.numElements - pos,
+ rowIndexStride - indexRows);
+ columnWriter->add(rowsToAdd, pos, chunkSize, nullptr);
+
+ pos += chunkSize;
+ indexRows += chunkSize;
+ stripeRows += chunkSize;
+
+ if (indexRows >= rowIndexStride) {
+ columnWriter->createRowIndexEntry();
+ indexRows = 0;
+ }
+ }
+ } else {
+ stripeRows += rowsToAdd.numElements;
+ columnWriter->add(rowsToAdd, 0, rowsToAdd.numElements, nullptr);
+ }
+
+ if (columnWriter->getEstimatedSize() >= options.getStripeSize()) {
+ writeStripe();
+ }
+ }
+
+ void WriterImpl::close() {
+ if (stripeRows > 0) {
+ writeStripe();
+ }
+ writeMetadata();
+ writeFileFooter();
+ writePostscript();
+ outStream->close();
+ }
+
+ void WriterImpl::addUserMetadata(const std::string name, const std::string value){
+ proto::UserMetadataItem* userMetadataItem = fileFooter.add_metadata();
+ userMetadataItem->set_name(TString(name));
+ userMetadataItem->set_value(TString(value));
+ }
+
+ void WriterImpl::init() {
+ // Write file header
+ const static size_t magicIdLength = strlen(WriterImpl::magicId);
+ outStream->write(WriterImpl::magicId, magicIdLength);
+ currentOffset += magicIdLength;
+
+ // Initialize file footer
+ fileFooter.set_headerlength(currentOffset);
+ fileFooter.set_contentlength(0);
+ fileFooter.set_numberofrows(0);
+ fileFooter.set_rowindexstride(
+ static_cast<uint32_t>(options.getRowIndexStride()));
+ fileFooter.set_writer(writerId);
fileFooter.set_softwareversion(ORC_VERSION);
-
- uint32_t index = 0;
- buildFooterType(type, fileFooter, index);
-
- // Initialize post script
- postScript.set_footerlength(0);
- postScript.set_compression(
- WriterImpl::convertCompressionKind(options.getCompression()));
- postScript.set_compressionblocksize(options.getCompressionBlockSize());
-
- postScript.add_version(options.getFileVersion().getMajor());
- postScript.add_version(options.getFileVersion().getMinor());
-
- postScript.set_writerversion(WriterVersion_ORC_135);
- postScript.set_magic("ORC");
-
- // Initialize first stripe
- initStripe();
- }
-
- void WriterImpl::initStripe() {
- stripeInfo.set_offset(currentOffset);
- stripeInfo.set_indexlength(0);
- stripeInfo.set_datalength(0);
- stripeInfo.set_footerlength(0);
- stripeInfo.set_numberofrows(0);
-
- stripeRows = indexRows = 0;
- }
-
- void WriterImpl::writeStripe() {
- if (options.getEnableIndex() && indexRows != 0) {
- columnWriter->createRowIndexEntry();
- indexRows = 0;
- } else {
- columnWriter->mergeRowGroupStatsIntoStripeStats();
- }
-
- // dictionary should be written before any stream is flushed
- columnWriter->writeDictionary();
-
- std::vector<proto::Stream> streams;
- // write ROW_INDEX streams
- if (options.getEnableIndex()) {
- columnWriter->writeIndex(streams);
- }
- // write streams like PRESENT, DATA, etc.
- columnWriter->flush(streams);
-
- // generate and write stripe footer
- proto::StripeFooter stripeFooter;
- for (uint32_t i = 0; i < streams.size(); ++i) {
- *stripeFooter.add_streams() = streams[i];
- }
-
- std::vector<proto::ColumnEncoding> encodings;
- columnWriter->getColumnEncoding(encodings);
-
- for (uint32_t i = 0; i < encodings.size(); ++i) {
- *stripeFooter.add_columns() = encodings[i];
- }
-
- // use GMT to guarantee TimestampVectorBatch from reader can write
- // same wall clock time
- stripeFooter.set_writertimezone("GMT");
-
- // add stripe statistics to metadata
- proto::StripeStatistics* stripeStats = metadata.add_stripestats();
- std::vector<proto::ColumnStatistics> colStats;
- columnWriter->getStripeStatistics(colStats);
- for (uint32_t i = 0; i != colStats.size(); ++i) {
- *stripeStats->add_colstats() = colStats[i];
- }
- // merge stripe stats into file stats and clear stripe stats
- columnWriter->mergeStripeStatsIntoFileStats();
-
- if (!stripeFooter.SerializeToZeroCopyStream(compressionStream.get())) {
- throw std::logic_error("Failed to write stripe footer.");
- }
- uint64_t footerLength = compressionStream->flush();
-
- // calculate data length and index length
- uint64_t dataLength = 0;
- uint64_t indexLength = 0;
- for (uint32_t i = 0; i < streams.size(); ++i) {
- if (streams[i].kind() == proto::Stream_Kind_ROW_INDEX ||
- streams[i].kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8) {
- indexLength += streams[i].length();
- } else {
- dataLength += streams[i].length();
- }
- }
-
- // update stripe info
- stripeInfo.set_indexlength(indexLength);
- stripeInfo.set_datalength(dataLength);
- stripeInfo.set_footerlength(footerLength);
- stripeInfo.set_numberofrows(stripeRows);
-
- *fileFooter.add_stripes() = stripeInfo;
-
- currentOffset = currentOffset + indexLength + dataLength + footerLength;
- totalRows += stripeRows;
-
- columnWriter->reset();
-
- initStripe();
- }
-
- void WriterImpl::writeMetadata() {
- if (!metadata.SerializeToZeroCopyStream(compressionStream.get())) {
- throw std::logic_error("Failed to write metadata.");
- }
- postScript.set_metadatalength(compressionStream.get()->flush());
- }
-
- void WriterImpl::writeFileFooter() {
- fileFooter.set_contentlength(currentOffset - fileFooter.headerlength());
- fileFooter.set_numberofrows(totalRows);
-
- // update file statistics
- std::vector<proto::ColumnStatistics> colStats;
- columnWriter->getFileStatistics(colStats);
- for (uint32_t i = 0; i != colStats.size(); ++i) {
- *fileFooter.add_statistics() = colStats[i];
- }
-
- if (!fileFooter.SerializeToZeroCopyStream(compressionStream.get())) {
- throw std::logic_error("Failed to write file footer.");
- }
- postScript.set_footerlength(compressionStream->flush());
- }
-
- void WriterImpl::writePostscript() {
- if (!postScript.SerializeToZeroCopyStream(bufferedStream.get())) {
- throw std::logic_error("Failed to write post script.");
- }
- unsigned char psLength =
- static_cast<unsigned char>(bufferedStream->flush());
- outStream->write(&psLength, sizeof(unsigned char));
- }
-
- void WriterImpl::buildFooterType(
- const Type& t,
- proto::Footer& footer,
- uint32_t & index) {
- proto::Type protoType;
- protoType.set_maximumlength(static_cast<uint32_t>(t.getMaximumLength()));
- protoType.set_precision(static_cast<uint32_t>(t.getPrecision()));
- protoType.set_scale(static_cast<uint32_t>(t.getScale()));
-
- switch (t.getKind()) {
- case BOOLEAN: {
- protoType.set_kind(proto::Type_Kind_BOOLEAN);
- break;
- }
- case BYTE: {
- protoType.set_kind(proto::Type_Kind_BYTE);
- break;
- }
- case SHORT: {
- protoType.set_kind(proto::Type_Kind_SHORT);
- break;
- }
- case INT: {
- protoType.set_kind(proto::Type_Kind_INT);
- break;
- }
- case LONG: {
- protoType.set_kind(proto::Type_Kind_LONG);
- break;
- }
- case FLOAT: {
- protoType.set_kind(proto::Type_Kind_FLOAT);
- break;
- }
- case DOUBLE: {
- protoType.set_kind(proto::Type_Kind_DOUBLE);
- break;
- }
- case STRING: {
- protoType.set_kind(proto::Type_Kind_STRING);
- break;
- }
- case BINARY: {
- protoType.set_kind(proto::Type_Kind_BINARY);
- break;
- }
- case TIMESTAMP: {
- protoType.set_kind(proto::Type_Kind_TIMESTAMP);
- break;
- }
- case LIST: {
- protoType.set_kind(proto::Type_Kind_LIST);
- break;
- }
- case MAP: {
- protoType.set_kind(proto::Type_Kind_MAP);
- break;
- }
- case STRUCT: {
- protoType.set_kind(proto::Type_Kind_STRUCT);
- break;
- }
- case UNION: {
- protoType.set_kind(proto::Type_Kind_UNION);
- break;
- }
- case DECIMAL: {
- protoType.set_kind(proto::Type_Kind_DECIMAL);
- break;
- }
- case DATE: {
- protoType.set_kind(proto::Type_Kind_DATE);
- break;
- }
- case VARCHAR: {
- protoType.set_kind(proto::Type_Kind_VARCHAR);
- break;
- }
- case CHAR: {
- protoType.set_kind(proto::Type_Kind_CHAR);
- break;
- }
- default:
- throw std::logic_error("Unknown type.");
- }
-
- int pos = static_cast<int>(index);
- *footer.add_types() = protoType;
-
- for (uint64_t i = 0; i < t.getSubtypeCount(); ++i) {
- // only add subtypes' field names if this type is STRUCT
- if (t.getKind() == STRUCT) {
- footer.mutable_types(pos)->add_fieldnames(TString(t.getFieldName(i)));
- }
- footer.mutable_types(pos)->add_subtypes(++index);
- buildFooterType(*t.getSubtype(i), footer, index);
- }
- }
-
- proto::CompressionKind WriterImpl::convertCompressionKind(
- const CompressionKind& kind) {
- return static_cast<proto::CompressionKind>(kind);
- }
-
- std::unique_ptr<Writer> createWriter(
- const Type& type,
- OutputStream* stream,
- const WriterOptions& options) {
- return std::unique_ptr<Writer>(
- new WriterImpl(
- type,
- stream,
- options));
- }
-
-}
-
+
+ uint32_t index = 0;
+ buildFooterType(type, fileFooter, index);
+
+ // Initialize post script
+ postScript.set_footerlength(0);
+ postScript.set_compression(
+ WriterImpl::convertCompressionKind(options.getCompression()));
+ postScript.set_compressionblocksize(options.getCompressionBlockSize());
+
+ postScript.add_version(options.getFileVersion().getMajor());
+ postScript.add_version(options.getFileVersion().getMinor());
+
+ postScript.set_writerversion(WriterVersion_ORC_135);
+ postScript.set_magic("ORC");
+
+ // Initialize first stripe
+ initStripe();
+ }
+
+ void WriterImpl::initStripe() {
+ stripeInfo.set_offset(currentOffset);
+ stripeInfo.set_indexlength(0);
+ stripeInfo.set_datalength(0);
+ stripeInfo.set_footerlength(0);
+ stripeInfo.set_numberofrows(0);
+
+ stripeRows = indexRows = 0;
+ }
+
+ void WriterImpl::writeStripe() {
+ if (options.getEnableIndex() && indexRows != 0) {
+ columnWriter->createRowIndexEntry();
+ indexRows = 0;
+ } else {
+ columnWriter->mergeRowGroupStatsIntoStripeStats();
+ }
+
+ // dictionary should be written before any stream is flushed
+ columnWriter->writeDictionary();
+
+ std::vector<proto::Stream> streams;
+ // write ROW_INDEX streams
+ if (options.getEnableIndex()) {
+ columnWriter->writeIndex(streams);
+ }
+ // write streams like PRESENT, DATA, etc.
+ columnWriter->flush(streams);
+
+ // generate and write stripe footer
+ proto::StripeFooter stripeFooter;
+ for (uint32_t i = 0; i < streams.size(); ++i) {
+ *stripeFooter.add_streams() = streams[i];
+ }
+
+ std::vector<proto::ColumnEncoding> encodings;
+ columnWriter->getColumnEncoding(encodings);
+
+ for (uint32_t i = 0; i < encodings.size(); ++i) {
+ *stripeFooter.add_columns() = encodings[i];
+ }
+
+ // use GMT to guarantee TimestampVectorBatch from reader can write
+ // same wall clock time
+ stripeFooter.set_writertimezone("GMT");
+
+ // add stripe statistics to metadata
+ proto::StripeStatistics* stripeStats = metadata.add_stripestats();
+ std::vector<proto::ColumnStatistics> colStats;
+ columnWriter->getStripeStatistics(colStats);
+ for (uint32_t i = 0; i != colStats.size(); ++i) {
+ *stripeStats->add_colstats() = colStats[i];
+ }
+ // merge stripe stats into file stats and clear stripe stats
+ columnWriter->mergeStripeStatsIntoFileStats();
+
+ if (!stripeFooter.SerializeToZeroCopyStream(compressionStream.get())) {
+ throw std::logic_error("Failed to write stripe footer.");
+ }
+ uint64_t footerLength = compressionStream->flush();
+
+ // calculate data length and index length
+ uint64_t dataLength = 0;
+ uint64_t indexLength = 0;
+ for (uint32_t i = 0; i < streams.size(); ++i) {
+ if (streams[i].kind() == proto::Stream_Kind_ROW_INDEX ||
+ streams[i].kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8) {
+ indexLength += streams[i].length();
+ } else {
+ dataLength += streams[i].length();
+ }
+ }
+
+ // update stripe info
+ stripeInfo.set_indexlength(indexLength);
+ stripeInfo.set_datalength(dataLength);
+ stripeInfo.set_footerlength(footerLength);
+ stripeInfo.set_numberofrows(stripeRows);
+
+ *fileFooter.add_stripes() = stripeInfo;
+
+ currentOffset = currentOffset + indexLength + dataLength + footerLength;
+ totalRows += stripeRows;
+
+ columnWriter->reset();
+
+ initStripe();
+ }
+
+ void WriterImpl::writeMetadata() {
+ if (!metadata.SerializeToZeroCopyStream(compressionStream.get())) {
+ throw std::logic_error("Failed to write metadata.");
+ }
+ postScript.set_metadatalength(compressionStream.get()->flush());
+ }
+
+ void WriterImpl::writeFileFooter() {
+ fileFooter.set_contentlength(currentOffset - fileFooter.headerlength());
+ fileFooter.set_numberofrows(totalRows);
+
+ // update file statistics
+ std::vector<proto::ColumnStatistics> colStats;
+ columnWriter->getFileStatistics(colStats);
+ for (uint32_t i = 0; i != colStats.size(); ++i) {
+ *fileFooter.add_statistics() = colStats[i];
+ }
+
+ if (!fileFooter.SerializeToZeroCopyStream(compressionStream.get())) {
+ throw std::logic_error("Failed to write file footer.");
+ }
+ postScript.set_footerlength(compressionStream->flush());
+ }
+
+ void WriterImpl::writePostscript() {
+ if (!postScript.SerializeToZeroCopyStream(bufferedStream.get())) {
+ throw std::logic_error("Failed to write post script.");
+ }
+ unsigned char psLength =
+ static_cast<unsigned char>(bufferedStream->flush());
+ outStream->write(&psLength, sizeof(unsigned char));
+ }
+
+ void WriterImpl::buildFooterType(
+ const Type& t,
+ proto::Footer& footer,
+ uint32_t & index) {
+ proto::Type protoType;
+ protoType.set_maximumlength(static_cast<uint32_t>(t.getMaximumLength()));
+ protoType.set_precision(static_cast<uint32_t>(t.getPrecision()));
+ protoType.set_scale(static_cast<uint32_t>(t.getScale()));
+
+ switch (t.getKind()) {
+ case BOOLEAN: {
+ protoType.set_kind(proto::Type_Kind_BOOLEAN);
+ break;
+ }
+ case BYTE: {
+ protoType.set_kind(proto::Type_Kind_BYTE);
+ break;
+ }
+ case SHORT: {
+ protoType.set_kind(proto::Type_Kind_SHORT);
+ break;
+ }
+ case INT: {
+ protoType.set_kind(proto::Type_Kind_INT);
+ break;
+ }
+ case LONG: {
+ protoType.set_kind(proto::Type_Kind_LONG);
+ break;
+ }
+ case FLOAT: {
+ protoType.set_kind(proto::Type_Kind_FLOAT);
+ break;
+ }
+ case DOUBLE: {
+ protoType.set_kind(proto::Type_Kind_DOUBLE);
+ break;
+ }
+ case STRING: {
+ protoType.set_kind(proto::Type_Kind_STRING);
+ break;
+ }
+ case BINARY: {
+ protoType.set_kind(proto::Type_Kind_BINARY);
+ break;
+ }
+ case TIMESTAMP: {
+ protoType.set_kind(proto::Type_Kind_TIMESTAMP);
+ break;
+ }
+ case LIST: {
+ protoType.set_kind(proto::Type_Kind_LIST);
+ break;
+ }
+ case MAP: {
+ protoType.set_kind(proto::Type_Kind_MAP);
+ break;
+ }
+ case STRUCT: {
+ protoType.set_kind(proto::Type_Kind_STRUCT);
+ break;
+ }
+ case UNION: {
+ protoType.set_kind(proto::Type_Kind_UNION);
+ break;
+ }
+ case DECIMAL: {
+ protoType.set_kind(proto::Type_Kind_DECIMAL);
+ break;
+ }
+ case DATE: {
+ protoType.set_kind(proto::Type_Kind_DATE);
+ break;
+ }
+ case VARCHAR: {
+ protoType.set_kind(proto::Type_Kind_VARCHAR);
+ break;
+ }
+ case CHAR: {
+ protoType.set_kind(proto::Type_Kind_CHAR);
+ break;
+ }
+ default:
+ throw std::logic_error("Unknown type.");
+ }
+
+ int pos = static_cast<int>(index);
+ *footer.add_types() = protoType;
+
+ for (uint64_t i = 0; i < t.getSubtypeCount(); ++i) {
+ // only add subtypes' field names if this type is STRUCT
+ if (t.getKind() == STRUCT) {
+ footer.mutable_types(pos)->add_fieldnames(TString(t.getFieldName(i)));
+ }
+ footer.mutable_types(pos)->add_subtypes(++index);
+ buildFooterType(*t.getSubtype(i), footer, index);
+ }
+ }
+
+ proto::CompressionKind WriterImpl::convertCompressionKind(
+ const CompressionKind& kind) {
+ return static_cast<proto::CompressionKind>(kind);
+ }
+
+ std::unique_ptr<Writer> createWriter(
+ const Type& type,
+ OutputStream* stream,
+ const WriterOptions& options) {
+ return std::unique_ptr<Writer>(
+ new WriterImpl(
+ type,
+ stream,
+ options));
+ }
+
+}
+
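
For orientation, a hypothetical end-to-end use of the writer implemented above (not part of this diff). createWriter, WriterOptions and createRowBatch appear in the change itself; the writeLocalFile() and Type::buildTypeFromString() helpers are assumed from the library's public OrcFile.hh and Type.hh headers, and the schema and option values are illustrative only.

#include "orc/OrcFile.hh"   // assumed to declare writeLocalFile() and pull in Type/Writer

#include <cstdint>
#include <memory>

int main() {
  std::unique_ptr<orc::OutputStream> out = orc::writeLocalFile("example.orc");
  std::unique_ptr<orc::Type> schema =
      orc::Type::buildTypeFromString("struct<x:bigint>");

  orc::WriterOptions options;
  options.setStripeSize(16 * 1024 * 1024)           // smaller stripes for the demo
         .setCompression(orc::CompressionKind_ZLIB)
         .setRowIndexStride(10000);

  std::unique_ptr<orc::Writer> writer =
      orc::createWriter(*schema, out.get(), options);

  std::unique_ptr<orc::ColumnVectorBatch> batch = writer->createRowBatch(1024);
  orc::StructVectorBatch& root =
      dynamic_cast<orc::StructVectorBatch&>(*batch);
  orc::LongVectorBatch& col =
      dynamic_cast<orc::LongVectorBatch&>(*root.fields[0]);

  for (uint64_t i = 0; i < 1024; ++i) {
    col.data[i] = static_cast<int64_t>(i);
  }
  col.numElements = 1024;
  root.numElements = 1024;

  writer->add(*batch);   // buffers rows; a full stripe is flushed automatically
  writer->close();       // flushes the last stripe, metadata, footer and postscript
  return 0;
}
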
diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.cc b/contrib/libs/apache/orc/c++/src/io/InputStream.cc
index 6e54b1412f..201f6f9c1d 100644
--- a/contrib/libs/apache/orc/c++/src/io/InputStream.cc
+++ b/contrib/libs/apache/orc/c++/src/io/InputStream.cc
@@ -1,222 +1,222 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "orc/Exceptions.hh"
-#include "InputStream.hh"
-
-#include <algorithm>
-#include <iomanip>
-
-namespace orc {
-
- void printBuffer(std::ostream& out,
- const char *buffer,
- uint64_t length) {
- const uint64_t width = 24;
- out << std::hex;
- for(uint64_t line = 0; line < (length + width - 1) / width; ++line) {
- out << std::setfill('0') << std::setw(7) << (line * width);
- for(uint64_t byte = 0;
- byte < width && line * width + byte < length; ++byte) {
- out << " " << std::setfill('0') << std::setw(2)
- << static_cast<uint64_t>(0xff & buffer[line * width +
- byte]);
- }
- out << "\n";
- }
- out << std::dec;
- }
-
- PositionProvider::PositionProvider(const std::list<uint64_t>& posns) {
- position = posns.begin();
- }
-
- uint64_t PositionProvider::next() {
- uint64_t result = *position;
- ++position;
- return result;
- }
-
- SeekableInputStream::~SeekableInputStream() {
- // PASS
- }
-
- SeekableArrayInputStream::~SeekableArrayInputStream() {
- // PASS
- }
-
- SeekableArrayInputStream::SeekableArrayInputStream
- (const unsigned char* values,
- uint64_t size,
- uint64_t blkSize
- ): data(reinterpret_cast<const char*>(values)) {
- length = size;
- position = 0;
- blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize);
- }
-
- SeekableArrayInputStream::SeekableArrayInputStream(const char* values,
- uint64_t size,
- uint64_t blkSize
- ): data(values) {
- length = size;
- position = 0;
- blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize);
- }
-
- bool SeekableArrayInputStream::Next(const void** buffer, int*size) {
- uint64_t currentSize = std::min(length - position, blockSize);
- if (currentSize > 0) {
- *buffer = data + position;
- *size = static_cast<int>(currentSize);
- position += currentSize;
- return true;
- }
- *size = 0;
- return false;
- }
-
- void SeekableArrayInputStream::BackUp(int count) {
- if (count >= 0) {
- uint64_t unsignedCount = static_cast<uint64_t>(count);
- if (unsignedCount <= blockSize && unsignedCount <= position) {
- position -= unsignedCount;
- } else {
- throw std::logic_error("Can't backup that much!");
- }
- }
- }
-
- bool SeekableArrayInputStream::Skip(int count) {
- if (count >= 0) {
- uint64_t unsignedCount = static_cast<uint64_t>(count);
- if (unsignedCount + position <= length) {
- position += unsignedCount;
- return true;
- } else {
- position = length;
- }
- }
- return false;
- }
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "InputStream.hh"
+
+#include <algorithm>
+#include <iomanip>
+
+namespace orc {
+
+ void printBuffer(std::ostream& out,
+ const char *buffer,
+ uint64_t length) {
+ const uint64_t width = 24;
+ out << std::hex;
+ for(uint64_t line = 0; line < (length + width - 1) / width; ++line) {
+ out << std::setfill('0') << std::setw(7) << (line * width);
+ for(uint64_t byte = 0;
+ byte < width && line * width + byte < length; ++byte) {
+ out << " " << std::setfill('0') << std::setw(2)
+ << static_cast<uint64_t>(0xff & buffer[line * width +
+ byte]);
+ }
+ out << "\n";
+ }
+ out << std::dec;
+ }
+
+ PositionProvider::PositionProvider(const std::list<uint64_t>& posns) {
+ position = posns.begin();
+ }
+
+ uint64_t PositionProvider::next() {
+ uint64_t result = *position;
+ ++position;
+ return result;
+ }
+
+ SeekableInputStream::~SeekableInputStream() {
+ // PASS
+ }
+
+ SeekableArrayInputStream::~SeekableArrayInputStream() {
+ // PASS
+ }
+
+ SeekableArrayInputStream::SeekableArrayInputStream
+ (const unsigned char* values,
+ uint64_t size,
+ uint64_t blkSize
+ ): data(reinterpret_cast<const char*>(values)) {
+ length = size;
+ position = 0;
+ blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize);
+ }
+
+ SeekableArrayInputStream::SeekableArrayInputStream(const char* values,
+ uint64_t size,
+ uint64_t blkSize
+ ): data(values) {
+ length = size;
+ position = 0;
+ blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize);
+ }
+
+ bool SeekableArrayInputStream::Next(const void** buffer, int*size) {
+ uint64_t currentSize = std::min(length - position, blockSize);
+ if (currentSize > 0) {
+ *buffer = data + position;
+ *size = static_cast<int>(currentSize);
+ position += currentSize;
+ return true;
+ }
+ *size = 0;
+ return false;
+ }
+
+ void SeekableArrayInputStream::BackUp(int count) {
+ if (count >= 0) {
+ uint64_t unsignedCount = static_cast<uint64_t>(count);
+ if (unsignedCount <= blockSize && unsignedCount <= position) {
+ position -= unsignedCount;
+ } else {
+ throw std::logic_error("Can't backup that much!");
+ }
+ }
+ }
+
+ bool SeekableArrayInputStream::Skip(int count) {
+ if (count >= 0) {
+ uint64_t unsignedCount = static_cast<uint64_t>(count);
+ if (unsignedCount + position <= length) {
+ position += unsignedCount;
+ return true;
+ } else {
+ position = length;
+ }
+ }
+ return false;
+ }
+
int64_t SeekableArrayInputStream::ByteCount() const {
- return static_cast<google::protobuf::int64>(position);
- }
-
- void SeekableArrayInputStream::seek(PositionProvider& seekPosition) {
- position = seekPosition.next();
- }
-
- std::string SeekableArrayInputStream::getName() const {
- std::ostringstream result;
- result << "SeekableArrayInputStream " << position << " of " << length;
- return result.str();
- }
-
- static uint64_t computeBlock(uint64_t request, uint64_t length) {
- return std::min(length, request == 0 ? 256 * 1024 : request);
- }
-
- SeekableFileInputStream::SeekableFileInputStream(InputStream* stream,
- uint64_t offset,
- uint64_t byteCount,
- MemoryPool& _pool,
- uint64_t _blockSize
- ):pool(_pool),
- input(stream),
- start(offset),
- length(byteCount),
- blockSize(computeBlock
- (_blockSize,
- length)) {
-
- position = 0;
- buffer.reset(new DataBuffer<char>(pool));
- pushBack = 0;
- }
-
- SeekableFileInputStream::~SeekableFileInputStream() {
- // PASS
- }
-
- bool SeekableFileInputStream::Next(const void** data, int*size) {
- uint64_t bytesRead;
- if (pushBack != 0) {
- *data = buffer->data() + (buffer->size() - pushBack);
- bytesRead = pushBack;
- } else {
- bytesRead = std::min(length - position, blockSize);
- buffer->resize(bytesRead);
- if (bytesRead > 0) {
- input->read(buffer->data(), bytesRead, start+position);
- *data = static_cast<void*>(buffer->data());
- }
- }
- position += bytesRead;
- pushBack = 0;
- *size = static_cast<int>(bytesRead);
- return bytesRead != 0;
- }
-
- void SeekableFileInputStream::BackUp(int signedCount) {
- if (signedCount < 0) {
- throw std::logic_error("can't backup negative distances");
- }
- uint64_t count = static_cast<uint64_t>(signedCount);
- if (pushBack > 0) {
- throw std::logic_error("can't backup unless we just called Next");
- }
- if (count > blockSize || count > position) {
- throw std::logic_error("can't backup that far");
- }
- pushBack = static_cast<uint64_t>(count);
- position -= pushBack;
- }
-
- bool SeekableFileInputStream::Skip(int signedCount) {
- if (signedCount < 0) {
- return false;
- }
- uint64_t count = static_cast<uint64_t>(signedCount);
- position = std::min(position + count, length);
- pushBack = 0;
- return position < length;
- }
-
- int64_t SeekableFileInputStream::ByteCount() const {
- return static_cast<int64_t>(position);
- }
-
- void SeekableFileInputStream::seek(PositionProvider& location) {
- position = location.next();
- if (position > length) {
- position = length;
- throw std::logic_error("seek too far");
- }
- pushBack = 0;
- }
-
- std::string SeekableFileInputStream::getName() const {
- std::ostringstream result;
- result << input->getName() << " from " << start << " for "
- << length;
- return result.str();
- }
-
-}
+ return static_cast<google::protobuf::int64>(position);
+ }
+
+ void SeekableArrayInputStream::seek(PositionProvider& seekPosition) {
+ position = seekPosition.next();
+ }
+
+ std::string SeekableArrayInputStream::getName() const {
+ std::ostringstream result;
+ result << "SeekableArrayInputStream " << position << " of " << length;
+ return result.str();
+ }
+
+ static uint64_t computeBlock(uint64_t request, uint64_t length) {
+ return std::min(length, request == 0 ? 256 * 1024 : request);
+ }
+
+ SeekableFileInputStream::SeekableFileInputStream(InputStream* stream,
+ uint64_t offset,
+ uint64_t byteCount,
+ MemoryPool& _pool,
+ uint64_t _blockSize
+ ):pool(_pool),
+ input(stream),
+ start(offset),
+ length(byteCount),
+ blockSize(computeBlock
+ (_blockSize,
+ length)) {
+
+ position = 0;
+ buffer.reset(new DataBuffer<char>(pool));
+ pushBack = 0;
+ }
+
+ SeekableFileInputStream::~SeekableFileInputStream() {
+ // PASS
+ }
+
+ bool SeekableFileInputStream::Next(const void** data, int*size) {
+ uint64_t bytesRead;
+ if (pushBack != 0) {
+ *data = buffer->data() + (buffer->size() - pushBack);
+ bytesRead = pushBack;
+ } else {
+ bytesRead = std::min(length - position, blockSize);
+ buffer->resize(bytesRead);
+ if (bytesRead > 0) {
+ input->read(buffer->data(), bytesRead, start+position);
+ *data = static_cast<void*>(buffer->data());
+ }
+ }
+ position += bytesRead;
+ pushBack = 0;
+ *size = static_cast<int>(bytesRead);
+ return bytesRead != 0;
+ }
+
+ void SeekableFileInputStream::BackUp(int signedCount) {
+ if (signedCount < 0) {
+ throw std::logic_error("can't backup negative distances");
+ }
+ uint64_t count = static_cast<uint64_t>(signedCount);
+ if (pushBack > 0) {
+ throw std::logic_error("can't backup unless we just called Next");
+ }
+ if (count > blockSize || count > position) {
+ throw std::logic_error("can't backup that far");
+ }
+ pushBack = static_cast<uint64_t>(count);
+ position -= pushBack;
+ }
+
+ bool SeekableFileInputStream::Skip(int signedCount) {
+ if (signedCount < 0) {
+ return false;
+ }
+ uint64_t count = static_cast<uint64_t>(signedCount);
+ position = std::min(position + count, length);
+ pushBack = 0;
+ return position < length;
+ }
+
+ int64_t SeekableFileInputStream::ByteCount() const {
+ return static_cast<int64_t>(position);
+ }
+
+ void SeekableFileInputStream::seek(PositionProvider& location) {
+ position = location.next();
+ if (position > length) {
+ position = length;
+ throw std::logic_error("seek too far");
+ }
+ pushBack = 0;
+ }
+
+ std::string SeekableFileInputStream::getName() const {
+ std::ostringstream result;
+ result << input->getName() << " from " << start << " for "
+ << length;
+ return result.str();
+ }
+
+}
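
For reference, a minimal sketch of driving the SeekableArrayInputStream restored above through its ZeroCopyInputStream and seek interfaces. The include path, the demo function name, the sample bytes and the block size are illustrative, not part of the library:

#include <cassert>
#include <cstdint>
#include <list>
#include "io/InputStream.hh"

void demoSeekableArray() {
  const char bytes[] = "0123456789";
  // Expose the ten data bytes in blocks of four.
  orc::SeekableArrayInputStream stream(bytes, 10, 4);

  const void* chunk;
  int size;
  // The first Next() hands out the first four-byte block.
  assert(stream.Next(&chunk, &size) && size == 4);

  // Give back the last two bytes of that block.
  stream.BackUp(2);
  assert(stream.ByteCount() == 2);

  // Rewind to an absolute offset taken from a PositionProvider.
  std::list<uint64_t> positions = {0};
  orc::PositionProvider provider(positions);
  stream.seek(provider);
  assert(stream.ByteCount() == 0);
}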
diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.hh b/contrib/libs/apache/orc/c++/src/io/InputStream.hh
index d8bd3d4d8c..797049a300 100644
--- a/contrib/libs/apache/orc/c++/src/io/InputStream.hh
+++ b/contrib/libs/apache/orc/c++/src/io/InputStream.hh
@@ -1,116 +1,116 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ORC_INPUTSTREAM_HH
-#define ORC_INPUTSTREAM_HH
-
-#include "Adaptor.hh"
-#include "orc/OrcFile.hh"
-#include "wrap/zero-copy-stream-wrapper.h"
-
-#include <list>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <vector>
-
-namespace orc {
-
- void printBuffer(std::ostream& out,
- const char *buffer,
- uint64_t length);
-
- class PositionProvider {
- private:
- std::list<uint64_t>::const_iterator position;
- public:
- PositionProvider(const std::list<uint64_t>& positions);
- uint64_t next();
- };
-
- /**
- * A subclass of Google's ZeroCopyInputStream that supports seek.
- * By extending Google's class, we get the ability to pass it directly
- * to the protobuf readers.
- */
- class SeekableInputStream: public google::protobuf::io::ZeroCopyInputStream {
- public:
- virtual ~SeekableInputStream();
- virtual void seek(PositionProvider& position) = 0;
- virtual std::string getName() const = 0;
- };
-
- /**
- * Create a seekable input stream based on a memory range.
- */
- class SeekableArrayInputStream: public SeekableInputStream {
- private:
- const char* data;
- uint64_t length;
- uint64_t position;
- uint64_t blockSize;
-
- public:
- SeekableArrayInputStream(const unsigned char* list,
- uint64_t length,
- uint64_t block_size = 0);
- SeekableArrayInputStream(const char* list,
- uint64_t length,
- uint64_t block_size = 0);
- virtual ~SeekableArrayInputStream() override;
- virtual bool Next(const void** data, int*size) override;
- virtual void BackUp(int count) override;
- virtual bool Skip(int count) override;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_INPUTSTREAM_HH
+#define ORC_INPUTSTREAM_HH
+
+#include "Adaptor.hh"
+#include "orc/OrcFile.hh"
+#include "wrap/zero-copy-stream-wrapper.h"
+
+#include <list>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+namespace orc {
+
+ void printBuffer(std::ostream& out,
+ const char *buffer,
+ uint64_t length);
+
+ class PositionProvider {
+ private:
+ std::list<uint64_t>::const_iterator position;
+ public:
+ PositionProvider(const std::list<uint64_t>& positions);
+ uint64_t next();
+ };
+
+ /**
+ * A subclass of Google's ZeroCopyInputStream that supports seek.
+ * By extending Google's class, we get the ability to pass it directly
+ * to the protobuf readers.
+ */
+ class SeekableInputStream: public google::protobuf::io::ZeroCopyInputStream {
+ public:
+ virtual ~SeekableInputStream();
+ virtual void seek(PositionProvider& position) = 0;
+ virtual std::string getName() const = 0;
+ };
+
+ /**
+ * Create a seekable input stream based on a memory range.
+ */
+ class SeekableArrayInputStream: public SeekableInputStream {
+ private:
+ const char* data;
+ uint64_t length;
+ uint64_t position;
+ uint64_t blockSize;
+
+ public:
+ SeekableArrayInputStream(const unsigned char* list,
+ uint64_t length,
+ uint64_t block_size = 0);
+ SeekableArrayInputStream(const char* list,
+ uint64_t length,
+ uint64_t block_size = 0);
+ virtual ~SeekableArrayInputStream() override;
+ virtual bool Next(const void** data, int*size) override;
+ virtual void BackUp(int count) override;
+ virtual bool Skip(int count) override;
virtual int64_t ByteCount() const override;
- virtual void seek(PositionProvider& position) override;
- virtual std::string getName() const override;
- };
-
- /**
- * Create a seekable input stream based on an input stream.
- */
- class SeekableFileInputStream: public SeekableInputStream {
- private:
- MemoryPool& pool;
- InputStream* const input;
- const uint64_t start;
- const uint64_t length;
- const uint64_t blockSize;
- std::unique_ptr<DataBuffer<char> > buffer;
- uint64_t position;
- uint64_t pushBack;
-
- public:
- SeekableFileInputStream(InputStream* input,
- uint64_t offset,
- uint64_t byteCount,
- MemoryPool& pool,
- uint64_t blockSize = 0);
- virtual ~SeekableFileInputStream() override;
-
- virtual bool Next(const void** data, int*size) override;
- virtual void BackUp(int count) override;
- virtual bool Skip(int count) override;
- virtual int64_t ByteCount() const override;
- virtual void seek(PositionProvider& position) override;
- virtual std::string getName() const override;
- };
-
-}
-
-#endif //ORC_INPUTSTREAM_HH
+ virtual void seek(PositionProvider& position) override;
+ virtual std::string getName() const override;
+ };
+
+ /**
+ * Create a seekable input stream based on an input stream.
+ */
+ class SeekableFileInputStream: public SeekableInputStream {
+ private:
+ MemoryPool& pool;
+ InputStream* const input;
+ const uint64_t start;
+ const uint64_t length;
+ const uint64_t blockSize;
+ std::unique_ptr<DataBuffer<char> > buffer;
+ uint64_t position;
+ uint64_t pushBack;
+
+ public:
+ SeekableFileInputStream(InputStream* input,
+ uint64_t offset,
+ uint64_t byteCount,
+ MemoryPool& pool,
+ uint64_t blockSize = 0);
+ virtual ~SeekableFileInputStream() override;
+
+ virtual bool Next(const void** data, int*size) override;
+ virtual void BackUp(int count) override;
+ virtual bool Skip(int count) override;
+ virtual int64_t ByteCount() const override;
+ virtual void seek(PositionProvider& position) override;
+ virtual std::string getName() const override;
+ };
+
+}
+
+#endif //ORC_INPUTSTREAM_HH
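
The header above only declares the interfaces; the usual way to consume any SeekableInputStream is through the ZeroCopyInputStream contract it inherits. A small sketch, assuming only that contract (the helper name readAll and the include path are illustrative):

#include <string>
#include "io/InputStream.hh"

// Copy whatever remains in the stream into a std::string.
std::string readAll(orc::SeekableInputStream& stream) {
  std::string result;
  const void* chunk;
  int length;
  while (stream.Next(&chunk, &length)) {
    result.append(static_cast<const char*>(chunk),
                  static_cast<size_t>(length));
  }
  return result;
}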
diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc
index 11a21c0bd3..dd9327adf9 100644
--- a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc
+++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc
@@ -1,147 +1,147 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "orc/Exceptions.hh"
-#include "OutputStream.hh"
-
-#include <sstream>
-
-namespace orc {
-
- PositionRecorder::~PositionRecorder() {
- // PASS
- }
-
- BufferedOutputStream::BufferedOutputStream(
- MemoryPool& pool,
- OutputStream * outStream,
- uint64_t capacity_,
- uint64_t blockSize_)
- : outputStream(outStream),
- blockSize(blockSize_) {
- dataBuffer.reset(new DataBuffer<char>(pool));
- dataBuffer->reserve(capacity_);
- }
-
- BufferedOutputStream::~BufferedOutputStream() {
- // PASS
- }
-
- bool BufferedOutputStream::Next(void** buffer, int* size) {
- *size = static_cast<int>(blockSize);
- uint64_t oldSize = dataBuffer->size();
- uint64_t newSize = oldSize + blockSize;
- uint64_t newCapacity = dataBuffer->capacity();
- while (newCapacity < newSize) {
- newCapacity += dataBuffer->capacity();
- }
- dataBuffer->reserve(newCapacity);
- dataBuffer->resize(newSize);
- *buffer = dataBuffer->data() + oldSize;
- return true;
- }
-
- void BufferedOutputStream::BackUp(int count) {
- if (count >= 0) {
- uint64_t unsignedCount = static_cast<uint64_t>(count);
- if (unsignedCount <= dataBuffer->size()) {
- dataBuffer->resize(dataBuffer->size() - unsignedCount);
- } else {
- throw std::logic_error("Can't backup that much!");
- }
- }
- }
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "OutputStream.hh"
+
+#include <sstream>
+
+namespace orc {
+
+ PositionRecorder::~PositionRecorder() {
+ // PASS
+ }
+
+ BufferedOutputStream::BufferedOutputStream(
+ MemoryPool& pool,
+ OutputStream * outStream,
+ uint64_t capacity_,
+ uint64_t blockSize_)
+ : outputStream(outStream),
+ blockSize(blockSize_) {
+ dataBuffer.reset(new DataBuffer<char>(pool));
+ dataBuffer->reserve(capacity_);
+ }
+
+ BufferedOutputStream::~BufferedOutputStream() {
+ // PASS
+ }
+
+ bool BufferedOutputStream::Next(void** buffer, int* size) {
+ *size = static_cast<int>(blockSize);
+ uint64_t oldSize = dataBuffer->size();
+ uint64_t newSize = oldSize + blockSize;
+ uint64_t newCapacity = dataBuffer->capacity();
+ while (newCapacity < newSize) {
+ newCapacity += dataBuffer->capacity();
+ }
+ dataBuffer->reserve(newCapacity);
+ dataBuffer->resize(newSize);
+ *buffer = dataBuffer->data() + oldSize;
+ return true;
+ }
+
+ void BufferedOutputStream::BackUp(int count) {
+ if (count >= 0) {
+ uint64_t unsignedCount = static_cast<uint64_t>(count);
+ if (unsignedCount <= dataBuffer->size()) {
+ dataBuffer->resize(dataBuffer->size() - unsignedCount);
+ } else {
+ throw std::logic_error("Can't backup that much!");
+ }
+ }
+ }
+
int64_t BufferedOutputStream::ByteCount() const {
- return static_cast<google::protobuf::int64>(dataBuffer->size());
- }
-
- bool BufferedOutputStream::WriteAliasedRaw(const void *, int) {
- throw NotImplementedYet("WriteAliasedRaw is not supported.");
- }
-
- bool BufferedOutputStream::AllowsAliasing() const {
- return false;
- }
-
- std::string BufferedOutputStream::getName() const {
- std::ostringstream result;
- result << "BufferedOutputStream " << dataBuffer->size() << " of "
- << dataBuffer->capacity();
- return result.str();
- }
-
- uint64_t BufferedOutputStream::getSize() const {
- return dataBuffer->size();
- }
-
- uint64_t BufferedOutputStream::flush() {
- uint64_t dataSize = dataBuffer->size();
- outputStream->write(dataBuffer->data(), dataSize);
- dataBuffer->resize(0);
- return dataSize;
- }
-
- void AppendOnlyBufferedStream::write(const char * data, size_t size) {
- size_t dataOffset = 0;
- while (size > 0) {
- if (bufferOffset == bufferLength) {
- if (!outStream->Next(
- reinterpret_cast<void **>(&buffer),
- &bufferLength)) {
- throw std::logic_error("Failed to allocate buffer.");
- }
- bufferOffset = 0;
- }
- size_t len = std::min(
- static_cast<size_t>(bufferLength - bufferOffset),
- size);
- memcpy(buffer + bufferOffset, data + dataOffset, len);
- bufferOffset += static_cast<int>(len);
- dataOffset += len;
- size -= len;
- }
- }
-
- uint64_t AppendOnlyBufferedStream::getSize() const {
- return outStream->getSize();
- }
-
- uint64_t AppendOnlyBufferedStream::flush() {
- outStream->BackUp(bufferLength - bufferOffset);
- bufferOffset = bufferLength = 0;
- buffer = nullptr;
- return outStream->flush();
- }
-
- void AppendOnlyBufferedStream::recordPosition(PositionRecorder* recorder) const {
- uint64_t flushedSize = outStream->getSize();
- uint64_t unflushedSize = static_cast<uint64_t>(bufferOffset);
- if (outStream->isCompressed()) {
- // start of the compression chunk in the stream
- recorder->add(flushedSize);
- // number of decompressed bytes that need to be consumed
- recorder->add(unflushedSize);
- } else {
- flushedSize -= static_cast<uint64_t>(bufferLength);
- // byte offset of the start location
- recorder->add(flushedSize + unflushedSize);
- }
- }
-
-}
+ return static_cast<google::protobuf::int64>(dataBuffer->size());
+ }
+
+ bool BufferedOutputStream::WriteAliasedRaw(const void *, int) {
+ throw NotImplementedYet("WriteAliasedRaw is not supported.");
+ }
+
+ bool BufferedOutputStream::AllowsAliasing() const {
+ return false;
+ }
+
+ std::string BufferedOutputStream::getName() const {
+ std::ostringstream result;
+ result << "BufferedOutputStream " << dataBuffer->size() << " of "
+ << dataBuffer->capacity();
+ return result.str();
+ }
+
+ uint64_t BufferedOutputStream::getSize() const {
+ return dataBuffer->size();
+ }
+
+ uint64_t BufferedOutputStream::flush() {
+ uint64_t dataSize = dataBuffer->size();
+ outputStream->write(dataBuffer->data(), dataSize);
+ dataBuffer->resize(0);
+ return dataSize;
+ }
+
+ void AppendOnlyBufferedStream::write(const char * data, size_t size) {
+ size_t dataOffset = 0;
+ while (size > 0) {
+ if (bufferOffset == bufferLength) {
+ if (!outStream->Next(
+ reinterpret_cast<void **>(&buffer),
+ &bufferLength)) {
+ throw std::logic_error("Failed to allocate buffer.");
+ }
+ bufferOffset = 0;
+ }
+ size_t len = std::min(
+ static_cast<size_t>(bufferLength - bufferOffset),
+ size);
+ memcpy(buffer + bufferOffset, data + dataOffset, len);
+ bufferOffset += static_cast<int>(len);
+ dataOffset += len;
+ size -= len;
+ }
+ }
+
+ uint64_t AppendOnlyBufferedStream::getSize() const {
+ return outStream->getSize();
+ }
+
+ uint64_t AppendOnlyBufferedStream::flush() {
+ outStream->BackUp(bufferLength - bufferOffset);
+ bufferOffset = bufferLength = 0;
+ buffer = nullptr;
+ return outStream->flush();
+ }
+
+ void AppendOnlyBufferedStream::recordPosition(PositionRecorder* recorder) const {
+ uint64_t flushedSize = outStream->getSize();
+ uint64_t unflushedSize = static_cast<uint64_t>(bufferOffset);
+ if (outStream->isCompressed()) {
+ // start of the compression chunk in the stream
+ recorder->add(flushedSize);
+ // number of decompressed bytes that need to be consumed
+ recorder->add(unflushedSize);
+ } else {
+ flushedSize -= static_cast<uint64_t>(bufferLength);
+ // byte offset of the start location
+ recorder->add(flushedSize + unflushedSize);
+ }
+ }
+
+}
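
To show how the Next/BackUp/flush trio above fits together, here is a hedged sketch of writing a few bytes through a BufferedOutputStream. The demo function, the capacity and block-size values, and the literal payload are illustrative; the MemoryPool and the target OutputStream are assumed to come from elsewhere:

#include <cstring>
#include "io/OutputStream.hh"

void demoBufferedOutput(orc::MemoryPool& pool, orc::OutputStream* file) {
  // 1 KB initial capacity, 256-byte protobuf blocks.
  orc::BufferedOutputStream buffered(pool, file, 1024, 256);

  void* chunk;
  int size;
  buffered.Next(&chunk, &size);   // hands out a 256-byte block
  std::memcpy(chunk, "orc", 3);
  buffered.BackUp(size - 3);      // return the unused tail of the block
  buffered.flush();               // pushes only the 3 written bytes to 'file'
}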
diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh
index 7ce9fafa24..e40263fdfb 100644
--- a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh
+++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh
@@ -1,96 +1,96 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ORC_OUTPUTSTREAM_HH
-#define ORC_OUTPUTSTREAM_HH
-
-#include "Adaptor.hh"
-#include "orc/OrcFile.hh"
-#include "wrap/zero-copy-stream-wrapper.h"
-
-namespace orc {
-
- /**
- * Record write position for creating index stream
- */
- class PositionRecorder {
- public:
- virtual ~PositionRecorder();
- virtual void add(uint64_t pos) = 0;
- };
-
- /**
- * A subclass of Google's ZeroCopyOutputStream that supports output to memory
- * buffer, and flushing to OutputStream.
- * By extending Google's class, we get the ability to pass it directly
- * to the protobuf writers.
- */
- class BufferedOutputStream: public google::protobuf::io::ZeroCopyOutputStream {
- private:
- OutputStream * outputStream;
- std::unique_ptr<DataBuffer<char> > dataBuffer;
- uint64_t blockSize;
-
- public:
- BufferedOutputStream(MemoryPool& pool,
- OutputStream * outStream,
- uint64_t capacity,
- uint64_t block_size);
- virtual ~BufferedOutputStream() override;
-
- virtual bool Next(void** data, int*size) override;
- virtual void BackUp(int count) override;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_OUTPUTSTREAM_HH
+#define ORC_OUTPUTSTREAM_HH
+
+#include "Adaptor.hh"
+#include "orc/OrcFile.hh"
+#include "wrap/zero-copy-stream-wrapper.h"
+
+namespace orc {
+
+ /**
+ * Record write position for creating index stream
+ */
+ class PositionRecorder {
+ public:
+ virtual ~PositionRecorder();
+ virtual void add(uint64_t pos) = 0;
+ };
+
+  /**
+   * A subclass of Google's ZeroCopyOutputStream that writes into an
+   * in-memory buffer and can flush it to an OutputStream.
+   * By extending Google's class, we get the ability to pass it directly
+   * to the protobuf writers.
+   */
+ class BufferedOutputStream: public google::protobuf::io::ZeroCopyOutputStream {
+ private:
+ OutputStream * outputStream;
+ std::unique_ptr<DataBuffer<char> > dataBuffer;
+ uint64_t blockSize;
+
+ public:
+ BufferedOutputStream(MemoryPool& pool,
+ OutputStream * outStream,
+ uint64_t capacity,
+ uint64_t block_size);
+ virtual ~BufferedOutputStream() override;
+
+ virtual bool Next(void** data, int*size) override;
+ virtual void BackUp(int count) override;
virtual int64_t ByteCount() const override;
- virtual bool WriteAliasedRaw(const void * data, int size) override;
- virtual bool AllowsAliasing() const override;
-
- virtual std::string getName() const;
- virtual uint64_t getSize() const;
- virtual uint64_t flush();
-
- virtual bool isCompressed() const { return false; }
- };
-
- /**
- * An append only buffered stream that allows
- * buffer, and flushing to OutputStream.
- * By extending Google's class, we get the ability to pass it directly
- * to the protobuf writers.
- */
- class AppendOnlyBufferedStream {
- private:
- std::unique_ptr<BufferedOutputStream> outStream;
- char * buffer;
- int bufferOffset, bufferLength;
-
- public:
- AppendOnlyBufferedStream(std::unique_ptr<BufferedOutputStream> _outStream) :
- outStream(std::move(_outStream)) {
- buffer = nullptr;
- bufferOffset = bufferLength = 0;
- }
-
- void write(const char * data, size_t size);
- uint64_t getSize() const;
- uint64_t flush();
-
- void recordPosition(PositionRecorder* recorder) const;
- };
-}
-
-#endif // ORC_OUTPUTSTREAM_HH
+ virtual bool WriteAliasedRaw(const void * data, int size) override;
+ virtual bool AllowsAliasing() const override;
+
+ virtual std::string getName() const;
+ virtual uint64_t getSize() const;
+ virtual uint64_t flush();
+
+ virtual bool isCompressed() const { return false; }
+ };
+
+  /**
+   * An append-only stream that buffers writes in memory and flushes
+   * them to an OutputStream. It does not extend Google's class itself;
+   * it wraps a BufferedOutputStream and exposes a simple write API
+   * plus position recording for building index streams.
+   */
+ class AppendOnlyBufferedStream {
+ private:
+ std::unique_ptr<BufferedOutputStream> outStream;
+ char * buffer;
+ int bufferOffset, bufferLength;
+
+ public:
+ AppendOnlyBufferedStream(std::unique_ptr<BufferedOutputStream> _outStream) :
+ outStream(std::move(_outStream)) {
+ buffer = nullptr;
+ bufferOffset = bufferLength = 0;
+ }
+
+ void write(const char * data, size_t size);
+ uint64_t getSize() const;
+ uint64_t flush();
+
+ void recordPosition(PositionRecorder* recorder) const;
+ };
+}
+
+#endif // ORC_OUTPUTSTREAM_HH
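
A sketch of pairing AppendOnlyBufferedStream with a PositionRecorder, as the writers do when building index streams. The VectorRecorder class and demoAppendOnly function are made up for illustration; only the interfaces declared above are relied on:

#include <memory>
#include <vector>
#include "io/OutputStream.hh"

// Illustrative recorder that simply collects the positions it is given.
class VectorRecorder : public orc::PositionRecorder {
 public:
  std::vector<uint64_t> positions;
  void add(uint64_t pos) override { positions.push_back(pos); }
};

void demoAppendOnly(std::unique_ptr<orc::BufferedOutputStream> out) {
  orc::AppendOnlyBufferedStream stream(std::move(out));
  stream.write("header", 6);

  VectorRecorder recorder;
  // For an uncompressed stream this records the byte offset of the next
  // write; for a compressed one it records the chunk start and the
  // decompressed offset within it.
  stream.recordPosition(&recorder);

  stream.flush();
}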
diff --git a/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h
index 605fbf826c..8d1eab50b4 100644
--- a/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h
+++ b/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h
@@ -1,35 +1,35 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef CODED_STREAM_WRAPPER_HH
-#define CODED_STREAM_WRAPPER_HH
-
-#include "Adaptor.hh"
-
-DIAGNOSTIC_PUSH
-
-#ifdef __clang__
- DIAGNOSTIC_IGNORE("-Wshorten-64-to-32")
- DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
-#endif
-
-#if defined(__GNUC__) || defined(__clang__)
- DIAGNOSTIC_IGNORE("-Wconversion")
-#endif
-
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CODED_STREAM_WRAPPER_HH
+#define CODED_STREAM_WRAPPER_HH
+
+#include "Adaptor.hh"
+
+DIAGNOSTIC_PUSH
+
+#ifdef __clang__
+ DIAGNOSTIC_IGNORE("-Wshorten-64-to-32")
+ DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+ DIAGNOSTIC_IGNORE("-Wconversion")
+#endif
+
#include <google/protobuf/io/coded_stream.h>
-
-DIAGNOSTIC_POP
-
-#endif
+
+DIAGNOSTIC_POP
+
+#endif
diff --git a/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh b/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh
index 5c161660cc..dc8e9de7f6 100644
--- a/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh
+++ b/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh
@@ -1,47 +1,47 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ORC_PROTO_WRAPPER_HH
-#define ORC_PROTO_WRAPPER_HH
-
-#include "Adaptor.hh"
-
-DIAGNOSTIC_PUSH
-
-#if defined(__GNUC__) || defined(__clang__)
- DIAGNOSTIC_IGNORE("-Wconversion")
- DIAGNOSTIC_IGNORE("-Wdeprecated")
- DIAGNOSTIC_IGNORE("-Wsign-conversion")
- DIAGNOSTIC_IGNORE("-Wunused-parameter")
-#endif
-
-#ifdef __clang__
- DIAGNOSTIC_IGNORE("-Wnested-anon-types")
- DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
- DIAGNOSTIC_IGNORE("-Wshorten-64-to-32")
- DIAGNOSTIC_IGNORE("-Wunknown-warning-option")
- DIAGNOSTIC_IGNORE("-Wweak-vtables")
- DIAGNOSTIC_IGNORE("-Wzero-as-null-pointer-constant")
-#endif
-
-#if defined(_MSC_VER)
- DIAGNOSTIC_IGNORE(4146) // unary minus operator applied to unsigned type, result still unsigned
- DIAGNOSTIC_IGNORE(4800) // forcing value to bool 'true' or 'false'
-#endif
-
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_PROTO_WRAPPER_HH
+#define ORC_PROTO_WRAPPER_HH
+
+#include "Adaptor.hh"
+
+DIAGNOSTIC_PUSH
+
+#if defined(__GNUC__) || defined(__clang__)
+ DIAGNOSTIC_IGNORE("-Wconversion")
+ DIAGNOSTIC_IGNORE("-Wdeprecated")
+ DIAGNOSTIC_IGNORE("-Wsign-conversion")
+ DIAGNOSTIC_IGNORE("-Wunused-parameter")
+#endif
+
+#ifdef __clang__
+ DIAGNOSTIC_IGNORE("-Wnested-anon-types")
+ DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
+ DIAGNOSTIC_IGNORE("-Wshorten-64-to-32")
+ DIAGNOSTIC_IGNORE("-Wunknown-warning-option")
+ DIAGNOSTIC_IGNORE("-Wweak-vtables")
+ DIAGNOSTIC_IGNORE("-Wzero-as-null-pointer-constant")
+#endif
+
+#if defined(_MSC_VER)
+ DIAGNOSTIC_IGNORE(4146) // unary minus operator applied to unsigned type, result still unsigned
+ DIAGNOSTIC_IGNORE(4800) // forcing value to bool 'true' or 'false'
+#endif
+
#include "contrib/libs/apache/orc/proto/orc_proto.pb.h"
-
-DIAGNOSTIC_POP
-
-#endif
+
+DIAGNOSTIC_POP
+
+#endif
diff --git a/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h
index aeab0f0033..497ae6f508 100644
--- a/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h
+++ b/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h
@@ -1,30 +1,30 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef SNAPPY_WRAPPER_HH
-#define SNAPPY_WRAPPER_HH
-
-#include "Adaptor.hh"
-
-DIAGNOSTIC_PUSH
-
-#ifdef __clang__
- DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
-#endif
-
-#include <snappy.h>
-
-DIAGNOSTIC_POP
-
-#endif
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SNAPPY_WRAPPER_HH
+#define SNAPPY_WRAPPER_HH
+
+#include "Adaptor.hh"
+
+DIAGNOSTIC_PUSH
+
+#ifdef __clang__
+ DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
+#endif
+
+#include <snappy.h>
+
+DIAGNOSTIC_POP
+
+#endif
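
Each wrapper above repeats the same DIAGNOSTIC_PUSH / DIAGNOSTIC_IGNORE / include / DIAGNOSTIC_POP pattern from Adaptor.hh around a noisy third-party header. A hypothetical wrapper for some other dependency would follow the same shape; the header guard, the included header and the suppressed warning below are purely illustrative:

#ifndef THIRD_PARTY_WRAPPER_HH
#define THIRD_PARTY_WRAPPER_HH

#include "Adaptor.hh"

DIAGNOSTIC_PUSH

#ifdef __clang__
  // Silence whichever warnings this particular header is known to trigger.
  DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
#endif

#include <third_party_header.h>

DIAGNOSTIC_POP

#endif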
diff --git a/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h
index 1af0bd002d..7cf1491d3d 100644
--- a/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h
+++ b/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h
@@ -1,36 +1,36 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ZERO_COPY_STREAM_WRAPPER_HH
-#define ZERO_COPY_STREAM_WRAPPER_HH
-
-#include "Adaptor.hh"
-
-DIAGNOSTIC_PUSH
-
-#if defined(__GNUC__) || defined(__clang__)
- DIAGNOSTIC_IGNORE("-Wdeprecated")
- DIAGNOSTIC_IGNORE("-Wpadded")
- DIAGNOSTIC_IGNORE("-Wunused-parameter")
-#endif
-
-#ifdef __clang__
- DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
-#endif
-
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ZERO_COPY_STREAM_WRAPPER_HH
+#define ZERO_COPY_STREAM_WRAPPER_HH
+
+#include "Adaptor.hh"
+
+DIAGNOSTIC_PUSH
+
+#if defined(__GNUC__) || defined(__clang__)
+ DIAGNOSTIC_IGNORE("-Wdeprecated")
+ DIAGNOSTIC_IGNORE("-Wpadded")
+ DIAGNOSTIC_IGNORE("-Wunused-parameter")
+#endif
+
+#ifdef __clang__
+ DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
+#endif
+
#include <google/protobuf/io/zero_copy_stream.h>
-
-DIAGNOSTIC_POP
-
-#endif
+
+DIAGNOSTIC_POP
+
+#endif