author    | iaz1607 <iaz1607@yandex-team.ru>             | 2022-02-10 16:45:37 +0300
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:37 +0300
commit    | e5437feb4ac2d2dc044e1090b9312dde5ef197e0 (patch)
tree      | f5a238c69dd20a1fa2092127a31b8aff25020f7d /contrib/libs/apache/orc/c++
parent    | f4945d0a44b8770f0801de3056aa41639b0b7bd2 (diff)
download  | ydb-e5437feb4ac2d2dc044e1090b9312dde5ef197e0.tar.gz
Restoring authorship annotation for <iaz1607@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/apache/orc/c++')
65 files changed, 23322 insertions, 23322 deletions
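The touched headers declare the public C++ API of the vendored ORC library (for example `orc::readLocalFile` and `orc::createReader` in `OrcFile.hh`, and `ReaderOptions`/`RowReaderOptions` in `Reader.hh`); since the patch only restores authorship annotations, the declarations are identical on both sides of the diff below. For orientation, here is a minimal sketch of how that API is typically driven. The file path is hypothetical, `ORC_UNIQUE_PTR` is assumed to resolve to `std::unique_ptr`, and the row-reading calls (`createRowReader`, `createRowBatch`, `next`, `numElements`) belong to parts of `Reader.hh` and `Vector.hh` that fall outside this excerpt.

```cpp
#include <iostream>
#include <list>
#include <memory>
#include <string>

#include "orc/OrcFile.hh"  // readLocalFile, createReader; pulls in Reader.hh and Writer.hh

int main() {
  const std::string path = "/tmp/example.orc";  // hypothetical input file

  // ReaderOptions uses the default MemoryPool unless setMemoryPool() is called.
  orc::ReaderOptions readerOpts;
  std::unique_ptr<orc::Reader> reader =
      orc::createReader(orc::readLocalFile(path), readerOpts);

  // Select only the first two top-level fields (field numbering starts at 0).
  orc::RowReaderOptions rowOpts;
  rowOpts.include(std::list<uint64_t>{0, 1});

  // createRowReader/createRowBatch/next are declared further down in Reader.hh.
  std::unique_ptr<orc::RowReader> rowReader = reader->createRowReader(rowOpts);
  std::unique_ptr<orc::ColumnVectorBatch> batch = rowReader->createRowBatch(1024);

  uint64_t rows = 0;
  while (rowReader->next(*batch)) {
    rows += batch->numElements;
  }
  std::cout << "read " << rows << " rows from " << path << "\n";
  return 0;
}
```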
diff --git a/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh b/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh index 86c1288b62..42f0476f03 100644 --- a/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh +++ b/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh @@ -1,45 +1,45 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_BLOOMFILTER_HH -#define ORC_BLOOMFILTER_HH - -#include "orc/orc-config.hh" - -#include <memory> -#include <vector> - -namespace orc { - - class BloomFilter { - public: - virtual ~BloomFilter(); - - // test if the element exists in BloomFilter - virtual bool testBytes(const char * data, int64_t length) const = 0; - virtual bool testLong(int64_t data) const = 0; - virtual bool testDouble(double data) const = 0; - }; - - struct BloomFilterIndex { - std::vector<std::shared_ptr<BloomFilter>> entries; - }; - -}; - -#endif //ORC_BLOOMFILTER_HH +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_BLOOMFILTER_HH +#define ORC_BLOOMFILTER_HH + +#include "orc/orc-config.hh" + +#include <memory> +#include <vector> + +namespace orc { + + class BloomFilter { + public: + virtual ~BloomFilter(); + + // test if the element exists in BloomFilter + virtual bool testBytes(const char * data, int64_t length) const = 0; + virtual bool testLong(int64_t data) const = 0; + virtual bool testDouble(double data) const = 0; + }; + + struct BloomFilterIndex { + std::vector<std::shared_ptr<BloomFilter>> entries; + }; + +}; + +#endif //ORC_BLOOMFILTER_HH diff --git a/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh b/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh index aa19214738..349cabe025 100644 --- a/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh +++ b/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh @@ -1,51 +1,51 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_COLUMN_PRINTER_HH -#define ORC_COLUMN_PRINTER_HH - -#include "orc/orc-config.hh" -#include "orc/OrcFile.hh" -#include "orc/Vector.hh" - -#include <stdio.h> -#include <string> -#include <memory> -#include <string> -#include <vector> - -namespace orc { - - class ColumnPrinter { - protected: - std::string &buffer; - bool hasNulls ; - const char* notNull; - - public: - ColumnPrinter(std::string&); - virtual ~ColumnPrinter(); - virtual void printRow(uint64_t rowId) = 0; - // should be called once at the start of each batch of rows - virtual void reset(const ColumnVectorBatch& batch); - }; - - ORC_UNIQUE_PTR<ColumnPrinter> createColumnPrinter(std::string&, - const Type* type); -} -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_COLUMN_PRINTER_HH +#define ORC_COLUMN_PRINTER_HH + +#include "orc/orc-config.hh" +#include "orc/OrcFile.hh" +#include "orc/Vector.hh" + +#include <stdio.h> +#include <string> +#include <memory> +#include <string> +#include <vector> + +namespace orc { + + class ColumnPrinter { + protected: + std::string &buffer; + bool hasNulls ; + const char* notNull; + + public: + ColumnPrinter(std::string&); + virtual ~ColumnPrinter(); + virtual void printRow(uint64_t rowId) = 0; + // should be called once at the start of each batch of rows + virtual void reset(const ColumnVectorBatch& batch); + }; + + ORC_UNIQUE_PTR<ColumnPrinter> createColumnPrinter(std::string&, + const Type* type); +} +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Common.hh b/contrib/libs/apache/orc/c++/include/orc/Common.hh index 4aa4a85118..34dc0a118f 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Common.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Common.hh @@ -1,286 +1,286 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_COMMON_HH -#define ORC_COMMON_HH - -#include "orc/Vector.hh" -#include "orc/Type.hh" -#include "orc/Exceptions.hh" - -#include <string> - -namespace orc { - - class FileVersion { - private: - uint32_t majorVersion; - uint32_t minorVersion; - public: - static const FileVersion& v_0_11(); - static const FileVersion& v_0_12(); - - FileVersion(uint32_t major, uint32_t minor) : - majorVersion(major), minorVersion(minor) { - } - - /** - * Get major version - */ - uint32_t getMajor() const { - return this->majorVersion; - } - - /** - * Get minor version - */ - uint32_t getMinor() const { - return this->minorVersion; - } - - bool operator == (const FileVersion & right) const { - return this->majorVersion == right.getMajor() && - this->minorVersion == right.getMinor(); - } - - bool operator != (const FileVersion & right) const { - return !(*this == right); - } - - std::string toString() const; - }; - - enum WriterId { - ORC_JAVA_WRITER = 0, - ORC_CPP_WRITER = 1, - PRESTO_WRITER = 2, +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_COMMON_HH +#define ORC_COMMON_HH + +#include "orc/Vector.hh" +#include "orc/Type.hh" +#include "orc/Exceptions.hh" + +#include <string> + +namespace orc { + + class FileVersion { + private: + uint32_t majorVersion; + uint32_t minorVersion; + public: + static const FileVersion& v_0_11(); + static const FileVersion& v_0_12(); + + FileVersion(uint32_t major, uint32_t minor) : + majorVersion(major), minorVersion(minor) { + } + + /** + * Get major version + */ + uint32_t getMajor() const { + return this->majorVersion; + } + + /** + * Get minor version + */ + uint32_t getMinor() const { + return this->minorVersion; + } + + bool operator == (const FileVersion & right) const { + return this->majorVersion == right.getMajor() && + this->minorVersion == right.getMinor(); + } + + bool operator != (const FileVersion & right) const { + return !(*this == right); + } + + std::string toString() const; + }; + + enum WriterId { + ORC_JAVA_WRITER = 0, + ORC_CPP_WRITER = 1, + PRESTO_WRITER = 2, SCRITCHLEY_GO = 3, TRINO_WRITER = 4, - UNKNOWN_WRITER = INT32_MAX - }; - + UNKNOWN_WRITER = INT32_MAX + }; + std::string writerIdToString(uint32_t id); - enum CompressionKind { - CompressionKind_NONE = 0, - CompressionKind_ZLIB = 1, - CompressionKind_SNAPPY = 2, - CompressionKind_LZO = 3, - CompressionKind_LZ4 = 4, - CompressionKind_ZSTD = 5, - CompressionKind_MAX = INT32_MAX - }; - - /** - * Get the name of the CompressionKind. - */ - std::string compressionKindToString(CompressionKind kind); - - enum WriterVersion { - WriterVersion_ORIGINAL = 0, - WriterVersion_HIVE_8732 = 1, - WriterVersion_HIVE_4243 = 2, - WriterVersion_HIVE_12055 = 3, - WriterVersion_HIVE_13083 = 4, - WriterVersion_ORC_101 = 5, - WriterVersion_ORC_135 = 6, + enum CompressionKind { + CompressionKind_NONE = 0, + CompressionKind_ZLIB = 1, + CompressionKind_SNAPPY = 2, + CompressionKind_LZO = 3, + CompressionKind_LZ4 = 4, + CompressionKind_ZSTD = 5, + CompressionKind_MAX = INT32_MAX + }; + + /** + * Get the name of the CompressionKind. + */ + std::string compressionKindToString(CompressionKind kind); + + enum WriterVersion { + WriterVersion_ORIGINAL = 0, + WriterVersion_HIVE_8732 = 1, + WriterVersion_HIVE_4243 = 2, + WriterVersion_HIVE_12055 = 3, + WriterVersion_HIVE_13083 = 4, + WriterVersion_ORC_101 = 5, + WriterVersion_ORC_135 = 6, WriterVersion_ORC_517 = 7, WriterVersion_ORC_203 = 8, WriterVersion_ORC_14 = 9, - WriterVersion_MAX = INT32_MAX - }; - - /** - * Get the name of the WriterVersion. - */ - std::string writerVersionToString(WriterVersion kind); - - enum StreamKind { - StreamKind_PRESENT = 0, - StreamKind_DATA = 1, - StreamKind_LENGTH = 2, - StreamKind_DICTIONARY_DATA = 3, - StreamKind_DICTIONARY_COUNT = 4, - StreamKind_SECONDARY = 5, - StreamKind_ROW_INDEX = 6, - StreamKind_BLOOM_FILTER = 7, - StreamKind_BLOOM_FILTER_UTF8 = 8 - }; - - /** - * Get the string representation of the StreamKind. 
- */ - std::string streamKindToString(StreamKind kind); - - class StreamInformation { - public: - virtual ~StreamInformation(); - - virtual StreamKind getKind() const = 0; - virtual uint64_t getColumnId() const = 0; - virtual uint64_t getOffset() const = 0; - virtual uint64_t getLength() const = 0; - }; - - enum ColumnEncodingKind { - ColumnEncodingKind_DIRECT = 0, - ColumnEncodingKind_DICTIONARY = 1, - ColumnEncodingKind_DIRECT_V2 = 2, - ColumnEncodingKind_DICTIONARY_V2 = 3 - }; - - std::string columnEncodingKindToString(ColumnEncodingKind kind); - - class StripeInformation { - public: - virtual ~StripeInformation(); - - /** - * Get the byte offset of the start of the stripe. - * @return the bytes from the start of the file - */ - virtual uint64_t getOffset() const = 0; - - /** - * Get the total length of the stripe in bytes. - * @return the number of bytes in the stripe - */ - virtual uint64_t getLength() const = 0; - - /** - * Get the length of the stripe's indexes. - * @return the number of bytes in the index - */ - virtual uint64_t getIndexLength() const = 0; - - /** - * Get the length of the stripe's data. - * @return the number of bytes in the stripe - */ - virtual uint64_t getDataLength()const = 0; - - /** - * Get the length of the stripe's tail section, which contains its index. - * @return the number of bytes in the tail - */ - virtual uint64_t getFooterLength() const = 0; - - /** - * Get the number of rows in the stripe. - * @return a count of the number of rows - */ - virtual uint64_t getNumberOfRows() const = 0; - - /** - * Get the number of streams in the stripe. - */ - virtual uint64_t getNumberOfStreams() const = 0; - - /** - * Get the StreamInformation for the given stream. - */ - virtual ORC_UNIQUE_PTR<StreamInformation> - getStreamInformation(uint64_t streamId) const = 0; - - /** - * Get the column encoding for the given column. - * @param colId the columnId - */ - virtual ColumnEncodingKind getColumnEncoding(uint64_t colId) const = 0; - - /** - * Get the dictionary size. - * @param colId the columnId - * @return the size of the dictionary or 0 if there isn't one - */ - virtual uint64_t getDictionarySize(uint64_t colId) const = 0; - - /** - * Get the writer timezone. - */ - virtual const std::string& getWriterTimezone() const = 0; - }; - - // Return true if val1 < val2; otherwise return false - template <typename T> - inline bool compare(T val1, T val2) { - return (val1 < val2); - } - - // Specialization for Decimal - template <> - inline bool compare(Decimal val1, Decimal val2) { - // compare integral parts - Int128 integral1 = scaleDownInt128ByPowerOfTen(val1.value, - val1.scale); - Int128 integral2 = scaleDownInt128ByPowerOfTen(val2.value, - val2.scale); - - if (integral1 < integral2) { - return true; - } else if (integral1 > integral2) { - return false; - } - - // integral parts are equal, continue comparing fractional parts - // unnecessary to check overflow here because the scaled number will not - // exceed original ones - bool overflow = false, positive = val1.value >= 0; - val1.value -= scaleUpInt128ByPowerOfTen(integral1, - val1.scale, - overflow); - val2.value -= scaleUpInt128ByPowerOfTen(integral2, - val2.scale, - overflow); - - int32_t diff = val1.scale - val2.scale; - if (diff > 0) { - val2.value = scaleUpInt128ByPowerOfTen(val2.value, - diff, - overflow); - if (overflow) { - return positive ? true : false; - } - } else { - val1.value = scaleUpInt128ByPowerOfTen(val1.value, - -diff, - overflow); - if (overflow) { - return positive ? 
false : true; - } - } - - if (val1.value < val2.value) { - return true; - } - return false; - } - - enum BloomFilterVersion { - // Include both the BLOOM_FILTER and BLOOM_FILTER_UTF8 streams to support - // both old and new readers. - ORIGINAL = 0, - // Only include the BLOOM_FILTER_UTF8 streams that consistently use UTF8. - // See ORC-101 - UTF8 = 1, - FUTURE = INT32_MAX - }; - -} - -#endif + WriterVersion_MAX = INT32_MAX + }; + + /** + * Get the name of the WriterVersion. + */ + std::string writerVersionToString(WriterVersion kind); + + enum StreamKind { + StreamKind_PRESENT = 0, + StreamKind_DATA = 1, + StreamKind_LENGTH = 2, + StreamKind_DICTIONARY_DATA = 3, + StreamKind_DICTIONARY_COUNT = 4, + StreamKind_SECONDARY = 5, + StreamKind_ROW_INDEX = 6, + StreamKind_BLOOM_FILTER = 7, + StreamKind_BLOOM_FILTER_UTF8 = 8 + }; + + /** + * Get the string representation of the StreamKind. + */ + std::string streamKindToString(StreamKind kind); + + class StreamInformation { + public: + virtual ~StreamInformation(); + + virtual StreamKind getKind() const = 0; + virtual uint64_t getColumnId() const = 0; + virtual uint64_t getOffset() const = 0; + virtual uint64_t getLength() const = 0; + }; + + enum ColumnEncodingKind { + ColumnEncodingKind_DIRECT = 0, + ColumnEncodingKind_DICTIONARY = 1, + ColumnEncodingKind_DIRECT_V2 = 2, + ColumnEncodingKind_DICTIONARY_V2 = 3 + }; + + std::string columnEncodingKindToString(ColumnEncodingKind kind); + + class StripeInformation { + public: + virtual ~StripeInformation(); + + /** + * Get the byte offset of the start of the stripe. + * @return the bytes from the start of the file + */ + virtual uint64_t getOffset() const = 0; + + /** + * Get the total length of the stripe in bytes. + * @return the number of bytes in the stripe + */ + virtual uint64_t getLength() const = 0; + + /** + * Get the length of the stripe's indexes. + * @return the number of bytes in the index + */ + virtual uint64_t getIndexLength() const = 0; + + /** + * Get the length of the stripe's data. + * @return the number of bytes in the stripe + */ + virtual uint64_t getDataLength()const = 0; + + /** + * Get the length of the stripe's tail section, which contains its index. + * @return the number of bytes in the tail + */ + virtual uint64_t getFooterLength() const = 0; + + /** + * Get the number of rows in the stripe. + * @return a count of the number of rows + */ + virtual uint64_t getNumberOfRows() const = 0; + + /** + * Get the number of streams in the stripe. + */ + virtual uint64_t getNumberOfStreams() const = 0; + + /** + * Get the StreamInformation for the given stream. + */ + virtual ORC_UNIQUE_PTR<StreamInformation> + getStreamInformation(uint64_t streamId) const = 0; + + /** + * Get the column encoding for the given column. + * @param colId the columnId + */ + virtual ColumnEncodingKind getColumnEncoding(uint64_t colId) const = 0; + + /** + * Get the dictionary size. + * @param colId the columnId + * @return the size of the dictionary or 0 if there isn't one + */ + virtual uint64_t getDictionarySize(uint64_t colId) const = 0; + + /** + * Get the writer timezone. 
+ */ + virtual const std::string& getWriterTimezone() const = 0; + }; + + // Return true if val1 < val2; otherwise return false + template <typename T> + inline bool compare(T val1, T val2) { + return (val1 < val2); + } + + // Specialization for Decimal + template <> + inline bool compare(Decimal val1, Decimal val2) { + // compare integral parts + Int128 integral1 = scaleDownInt128ByPowerOfTen(val1.value, + val1.scale); + Int128 integral2 = scaleDownInt128ByPowerOfTen(val2.value, + val2.scale); + + if (integral1 < integral2) { + return true; + } else if (integral1 > integral2) { + return false; + } + + // integral parts are equal, continue comparing fractional parts + // unnecessary to check overflow here because the scaled number will not + // exceed original ones + bool overflow = false, positive = val1.value >= 0; + val1.value -= scaleUpInt128ByPowerOfTen(integral1, + val1.scale, + overflow); + val2.value -= scaleUpInt128ByPowerOfTen(integral2, + val2.scale, + overflow); + + int32_t diff = val1.scale - val2.scale; + if (diff > 0) { + val2.value = scaleUpInt128ByPowerOfTen(val2.value, + diff, + overflow); + if (overflow) { + return positive ? true : false; + } + } else { + val1.value = scaleUpInt128ByPowerOfTen(val1.value, + -diff, + overflow); + if (overflow) { + return positive ? false : true; + } + } + + if (val1.value < val2.value) { + return true; + } + return false; + } + + enum BloomFilterVersion { + // Include both the BLOOM_FILTER and BLOOM_FILTER_UTF8 streams to support + // both old and new readers. + ORIGINAL = 0, + // Only include the BLOOM_FILTER_UTF8 streams that consistently use UTF8. + // See ORC-101 + UTF8 = 1, + FUTURE = INT32_MAX + }; + +} + +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh b/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh index 9765d4fd6b..e991f9eecd 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh @@ -1,60 +1,60 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_EXCEPTIONS_HH -#define ORC_EXCEPTIONS_HH - -#include "orc/orc-config.hh" - -#include <stdexcept> -#include <string> - -namespace orc { - - class NotImplementedYet: public std::logic_error { - public: - explicit NotImplementedYet(const std::string& what_arg); - explicit NotImplementedYet(const char* what_arg); - virtual ~NotImplementedYet() ORC_NOEXCEPT; - NotImplementedYet(const NotImplementedYet&); - private: - NotImplementedYet& operator=(const NotImplementedYet&); - }; - - class ParseError: public std::runtime_error { - public: - explicit ParseError(const std::string& what_arg); - explicit ParseError(const char* what_arg); - virtual ~ParseError() ORC_NOEXCEPT; - ParseError(const ParseError&); - private: - ParseError& operator=(const ParseError&); - }; - - class InvalidArgument: public std::runtime_error { - public: - explicit InvalidArgument(const std::string& what_arg); - explicit InvalidArgument(const char* what_arg); - virtual ~InvalidArgument() ORC_NOEXCEPT; - InvalidArgument(const InvalidArgument&); - private: - InvalidArgument& operator=(const InvalidArgument&); - }; -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_EXCEPTIONS_HH +#define ORC_EXCEPTIONS_HH + +#include "orc/orc-config.hh" + +#include <stdexcept> +#include <string> + +namespace orc { + + class NotImplementedYet: public std::logic_error { + public: + explicit NotImplementedYet(const std::string& what_arg); + explicit NotImplementedYet(const char* what_arg); + virtual ~NotImplementedYet() ORC_NOEXCEPT; + NotImplementedYet(const NotImplementedYet&); + private: + NotImplementedYet& operator=(const NotImplementedYet&); + }; + + class ParseError: public std::runtime_error { + public: + explicit ParseError(const std::string& what_arg); + explicit ParseError(const char* what_arg); + virtual ~ParseError() ORC_NOEXCEPT; + ParseError(const ParseError&); + private: + ParseError& operator=(const ParseError&); + }; + + class InvalidArgument: public std::runtime_error { + public: + explicit InvalidArgument(const std::string& what_arg); + explicit InvalidArgument(const char* what_arg); + virtual ~InvalidArgument() ORC_NOEXCEPT; + InvalidArgument(const InvalidArgument&); + private: + InvalidArgument& operator=(const InvalidArgument&); + }; +} + +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Int128.hh b/contrib/libs/apache/orc/c++/include/orc/Int128.hh index f86d8f08a6..63b84478c6 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Int128.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Int128.hh @@ -1,372 +1,372 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_INT_128 -#define ORC_INT_128 - -#include "orc/orc-config.hh" - -#include <stdexcept> -#include <string> - -namespace orc { - - /** - * Represents a signed 128-bit integer in two's complement. - * Calculations wrap around and overflow is ignored. - * - * For a discussion of the algorithms, look at Knuth's volume 2, - * Semi-numerical Algorithms section 4.3.1. - * - */ - class Int128 { - public: - Int128() { - highbits = 0; - lowbits = 0; - } - - /** - * Convert a signed 64 bit value into an Int128. - */ - Int128(int64_t right) { - if (right >= 0) { - highbits = 0; - lowbits = static_cast<uint64_t>(right); - } else { - highbits = -1; - lowbits = static_cast<uint64_t>(right); - } - } - - /** - * Create from the twos complement representation. - */ - Int128(int64_t high, uint64_t low) { - highbits = high; - lowbits = low; - } - - /** - * Parse the number from a base 10 string representation. - */ - explicit Int128(const std::string&); - - /** - * Maximum positive value allowed by the type. - */ - static Int128 maximumValue(); - - /** - * Minimum negative value allowed by the type. - */ - static Int128 minimumValue(); - - Int128& negate() { - lowbits = ~lowbits + 1; - highbits = ~highbits; - if (lowbits == 0) { - highbits += 1; - } - return *this; - } - - Int128& abs() { - if (highbits < 0) { - negate(); - } - return *this; - } - - Int128 abs() const { - Int128 value = *this; - value.abs(); - return value; - } - - Int128& invert() { - lowbits = ~lowbits; - highbits = ~highbits; - return *this; - } - - /** - * Add a number to this one. The result is truncated to 128 bits. - * @param right the number to add - * @return *this - */ - Int128& operator+=(const Int128 &right) { - uint64_t sum = lowbits + right.lowbits; - highbits += right.highbits; - if (sum < lowbits) { - highbits += 1; - } - lowbits = sum; - return *this; - } - - /** - * Subtract a number from this one. The result is truncated to 128 bits. - * @param right the number to subtract - * @return *this - */ - Int128& operator-=(const Int128 &right) { - uint64_t diff = lowbits - right.lowbits; - highbits -= right.highbits; - if (diff > lowbits) { - highbits -= 1; - } - lowbits = diff; - return *this; - } - - /** - * Multiply this number by a number. The result is truncated to 128 bits. - * @param right the number to multiply by - * @return *this - */ - Int128& operator*=(const Int128 &right); - - /** - * Divide this number by right and return the result. This operation is - * not destructive. - * - * The answer rounds to zero. Signs work like: - * 21 / 5 -> 4, 1 - * -21 / 5 -> -4, -1 - * 21 / -5 -> -4, 1 - * -21 / -5 -> 4, -1 - * @param right the number to divide by - * @param remainder the remainder after the division - */ - Int128 divide(const Int128 &right, Int128& remainder) const; - - /** - * Logical or between two Int128. 
- * @param right the number to or in - * @return *this - */ - Int128& operator|=(const Int128 &right) { - lowbits |= right.lowbits; - highbits |= right.highbits; - return *this; - } - - /** - * Logical and between two Int128. - * @param right the number to and in - * @return *this - */ - Int128& operator&=(const Int128 &right) { - lowbits &= right.lowbits; - highbits &= right.highbits; - return *this; - } - - /** - * Logical and between two Int128. - * @param right the number to and in - * @return logical and result - */ - Int128 operator&(const Int128 &right) { - Int128 value = *this; - value &= right; - return value; - } - - /** - * Shift left by the given number of bits. - * Values larger than 2**127 will shift into the sign bit. - */ - Int128& operator<<=(uint32_t bits) { - if (bits != 0) { - if (bits < 64) { - highbits <<= bits; - highbits |= (lowbits >> (64 - bits)); - lowbits <<= bits; - } else if (bits < 128) { - highbits = static_cast<int64_t>(lowbits) << (bits - 64); - lowbits = 0; - } else { - highbits = 0; - lowbits = 0; - } - } - return *this; - } - - /** - * Shift right by the given number of bits. Negative values will - * sign extend and fill with one bits. - */ - Int128& operator>>=(uint32_t bits) { - if (bits != 0) { - if (bits < 64) { - lowbits >>= bits; - lowbits |= static_cast<uint64_t>(highbits << (64 - bits)); - highbits = static_cast<int64_t> - (static_cast<uint64_t>(highbits) >> bits); - } else if (bits < 128) { - lowbits = static_cast<uint64_t>(highbits >> (bits - 64)); - highbits = highbits >= 0 ? 0 : -1l; - } else { - highbits = highbits >= 0 ? 0 : -1l; - lowbits = static_cast<uint64_t>(highbits); - } - } - return *this; - } - - bool operator==(const Int128& right) const { - return highbits == right.highbits && lowbits == right.lowbits; - } - - bool operator!=(const Int128& right) const { - return highbits != right.highbits || lowbits != right.lowbits; - } - - bool operator<(const Int128 &right) const { - if (highbits == right.highbits) { - return lowbits < right.lowbits; - } else { - return highbits < right.highbits; - } - } - - bool operator<=(const Int128 &right) const { - if (highbits == right.highbits) { - return lowbits <= right.lowbits; - } else { - return highbits <= right.highbits; - } - } - - bool operator>(const Int128 &right) const { - if (highbits == right.highbits) { - return lowbits > right.lowbits; - } else { - return highbits > right.highbits; - } - } - - bool operator>=(const Int128 &right) const { - if (highbits == right.highbits) { - return lowbits >= right.lowbits; - } else { - return highbits >= right.highbits; - } - } - - uint32_t hash() const { - return static_cast<uint32_t>(highbits >> 32) ^ - static_cast<uint32_t>(highbits) ^ - static_cast<uint32_t>(lowbits >> 32) ^ - static_cast<uint32_t>(lowbits); - } - - /** - * Does this value fit into a long? - */ - bool fitsInLong() const { - switch (highbits) { - case 0: - return 0 == (lowbits & LONG_SIGN_BIT); - case -1: - return 0 != (lowbits & LONG_SIGN_BIT); - default: - return false; - } - } - - /** - * Convert the value to a long and - */ - int64_t toLong() const { - if (fitsInLong()) { - return static_cast<int64_t>(lowbits); - } - throw std::range_error("Int128 too large to convert to long"); - } - - /** - * Return the base 10 string representation of the integer. - */ - std::string toString() const; - - /** - * Return the base 10 string representation with a decimal point, - * the given number of places after the decimal. 
- */ - std::string toDecimalString(int32_t scale=0) const; - - /** - * Return the base 16 string representation of the two's complement with - * a prefix of "0x". - * Int128(-1).toHexString() = "0xffffffffffffffffffffffffffffffff". - */ - std::string toHexString() const; - - /** - * Get the high bits of the twos complement representation of the number. - */ - int64_t getHighBits() { - return highbits; - } - - /** - * Get the low bits of the twos complement representation of the number. - */ - uint64_t getLowBits() { - return lowbits; - } - - /** - * Represent the absolute number as a list of uint32. - * Visible for testing only. - * @param array the array that is set to the value of the number - * @param wasNegative set to true if the original number was negative - * @return the number of elements that were set in the array (1 to 4) - */ - int64_t fillInArray(uint32_t* array, bool &wasNegative) const; - - private: - static const uint64_t LONG_SIGN_BIT = 0x8000000000000000u; - int64_t highbits; - uint64_t lowbits; - }; - - - /** - * Scales up an Int128 value - * @param value the Int128 value to scale - * @param power the scale offset. Result of a negative factor is undefined. - * @param overflow returns whether the result overflows or not - * @return the scaled value - */ - Int128 scaleUpInt128ByPowerOfTen(Int128 value, - int32_t power, - bool &overflow); - /** - * Scales down an Int128 value - * @param value the Int128 value to scale - * @param power the scale offset. Result of a negative factor is undefined. - * @return the scaled value - */ - Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power); -} -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_INT_128 +#define ORC_INT_128 + +#include "orc/orc-config.hh" + +#include <stdexcept> +#include <string> + +namespace orc { + + /** + * Represents a signed 128-bit integer in two's complement. + * Calculations wrap around and overflow is ignored. + * + * For a discussion of the algorithms, look at Knuth's volume 2, + * Semi-numerical Algorithms section 4.3.1. + * + */ + class Int128 { + public: + Int128() { + highbits = 0; + lowbits = 0; + } + + /** + * Convert a signed 64 bit value into an Int128. + */ + Int128(int64_t right) { + if (right >= 0) { + highbits = 0; + lowbits = static_cast<uint64_t>(right); + } else { + highbits = -1; + lowbits = static_cast<uint64_t>(right); + } + } + + /** + * Create from the twos complement representation. + */ + Int128(int64_t high, uint64_t low) { + highbits = high; + lowbits = low; + } + + /** + * Parse the number from a base 10 string representation. + */ + explicit Int128(const std::string&); + + /** + * Maximum positive value allowed by the type. 
+ */ + static Int128 maximumValue(); + + /** + * Minimum negative value allowed by the type. + */ + static Int128 minimumValue(); + + Int128& negate() { + lowbits = ~lowbits + 1; + highbits = ~highbits; + if (lowbits == 0) { + highbits += 1; + } + return *this; + } + + Int128& abs() { + if (highbits < 0) { + negate(); + } + return *this; + } + + Int128 abs() const { + Int128 value = *this; + value.abs(); + return value; + } + + Int128& invert() { + lowbits = ~lowbits; + highbits = ~highbits; + return *this; + } + + /** + * Add a number to this one. The result is truncated to 128 bits. + * @param right the number to add + * @return *this + */ + Int128& operator+=(const Int128 &right) { + uint64_t sum = lowbits + right.lowbits; + highbits += right.highbits; + if (sum < lowbits) { + highbits += 1; + } + lowbits = sum; + return *this; + } + + /** + * Subtract a number from this one. The result is truncated to 128 bits. + * @param right the number to subtract + * @return *this + */ + Int128& operator-=(const Int128 &right) { + uint64_t diff = lowbits - right.lowbits; + highbits -= right.highbits; + if (diff > lowbits) { + highbits -= 1; + } + lowbits = diff; + return *this; + } + + /** + * Multiply this number by a number. The result is truncated to 128 bits. + * @param right the number to multiply by + * @return *this + */ + Int128& operator*=(const Int128 &right); + + /** + * Divide this number by right and return the result. This operation is + * not destructive. + * + * The answer rounds to zero. Signs work like: + * 21 / 5 -> 4, 1 + * -21 / 5 -> -4, -1 + * 21 / -5 -> -4, 1 + * -21 / -5 -> 4, -1 + * @param right the number to divide by + * @param remainder the remainder after the division + */ + Int128 divide(const Int128 &right, Int128& remainder) const; + + /** + * Logical or between two Int128. + * @param right the number to or in + * @return *this + */ + Int128& operator|=(const Int128 &right) { + lowbits |= right.lowbits; + highbits |= right.highbits; + return *this; + } + + /** + * Logical and between two Int128. + * @param right the number to and in + * @return *this + */ + Int128& operator&=(const Int128 &right) { + lowbits &= right.lowbits; + highbits &= right.highbits; + return *this; + } + + /** + * Logical and between two Int128. + * @param right the number to and in + * @return logical and result + */ + Int128 operator&(const Int128 &right) { + Int128 value = *this; + value &= right; + return value; + } + + /** + * Shift left by the given number of bits. + * Values larger than 2**127 will shift into the sign bit. + */ + Int128& operator<<=(uint32_t bits) { + if (bits != 0) { + if (bits < 64) { + highbits <<= bits; + highbits |= (lowbits >> (64 - bits)); + lowbits <<= bits; + } else if (bits < 128) { + highbits = static_cast<int64_t>(lowbits) << (bits - 64); + lowbits = 0; + } else { + highbits = 0; + lowbits = 0; + } + } + return *this; + } + + /** + * Shift right by the given number of bits. Negative values will + * sign extend and fill with one bits. + */ + Int128& operator>>=(uint32_t bits) { + if (bits != 0) { + if (bits < 64) { + lowbits >>= bits; + lowbits |= static_cast<uint64_t>(highbits << (64 - bits)); + highbits = static_cast<int64_t> + (static_cast<uint64_t>(highbits) >> bits); + } else if (bits < 128) { + lowbits = static_cast<uint64_t>(highbits >> (bits - 64)); + highbits = highbits >= 0 ? 0 : -1l; + } else { + highbits = highbits >= 0 ? 
0 : -1l; + lowbits = static_cast<uint64_t>(highbits); + } + } + return *this; + } + + bool operator==(const Int128& right) const { + return highbits == right.highbits && lowbits == right.lowbits; + } + + bool operator!=(const Int128& right) const { + return highbits != right.highbits || lowbits != right.lowbits; + } + + bool operator<(const Int128 &right) const { + if (highbits == right.highbits) { + return lowbits < right.lowbits; + } else { + return highbits < right.highbits; + } + } + + bool operator<=(const Int128 &right) const { + if (highbits == right.highbits) { + return lowbits <= right.lowbits; + } else { + return highbits <= right.highbits; + } + } + + bool operator>(const Int128 &right) const { + if (highbits == right.highbits) { + return lowbits > right.lowbits; + } else { + return highbits > right.highbits; + } + } + + bool operator>=(const Int128 &right) const { + if (highbits == right.highbits) { + return lowbits >= right.lowbits; + } else { + return highbits >= right.highbits; + } + } + + uint32_t hash() const { + return static_cast<uint32_t>(highbits >> 32) ^ + static_cast<uint32_t>(highbits) ^ + static_cast<uint32_t>(lowbits >> 32) ^ + static_cast<uint32_t>(lowbits); + } + + /** + * Does this value fit into a long? + */ + bool fitsInLong() const { + switch (highbits) { + case 0: + return 0 == (lowbits & LONG_SIGN_BIT); + case -1: + return 0 != (lowbits & LONG_SIGN_BIT); + default: + return false; + } + } + + /** + * Convert the value to a long and + */ + int64_t toLong() const { + if (fitsInLong()) { + return static_cast<int64_t>(lowbits); + } + throw std::range_error("Int128 too large to convert to long"); + } + + /** + * Return the base 10 string representation of the integer. + */ + std::string toString() const; + + /** + * Return the base 10 string representation with a decimal point, + * the given number of places after the decimal. + */ + std::string toDecimalString(int32_t scale=0) const; + + /** + * Return the base 16 string representation of the two's complement with + * a prefix of "0x". + * Int128(-1).toHexString() = "0xffffffffffffffffffffffffffffffff". + */ + std::string toHexString() const; + + /** + * Get the high bits of the twos complement representation of the number. + */ + int64_t getHighBits() { + return highbits; + } + + /** + * Get the low bits of the twos complement representation of the number. + */ + uint64_t getLowBits() { + return lowbits; + } + + /** + * Represent the absolute number as a list of uint32. + * Visible for testing only. + * @param array the array that is set to the value of the number + * @param wasNegative set to true if the original number was negative + * @return the number of elements that were set in the array (1 to 4) + */ + int64_t fillInArray(uint32_t* array, bool &wasNegative) const; + + private: + static const uint64_t LONG_SIGN_BIT = 0x8000000000000000u; + int64_t highbits; + uint64_t lowbits; + }; + + + /** + * Scales up an Int128 value + * @param value the Int128 value to scale + * @param power the scale offset. Result of a negative factor is undefined. + * @param overflow returns whether the result overflows or not + * @return the scaled value + */ + Int128 scaleUpInt128ByPowerOfTen(Int128 value, + int32_t power, + bool &overflow); + /** + * Scales down an Int128 value + * @param value the Int128 value to scale + * @param power the scale offset. Result of a negative factor is undefined. 
+ * @return the scaled value + */ + Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power); +} +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh b/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh index 71d76c438a..a34651721f 100644 --- a/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh +++ b/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh @@ -1,150 +1,150 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MEMORYPOOL_HH_ -#define MEMORYPOOL_HH_ - -#include "orc/orc-config.hh" -#include "orc/Int128.hh" - -#include <memory> - -namespace orc { - - class MemoryPool { - public: - virtual ~MemoryPool(); - - virtual char* malloc(uint64_t size) = 0; - virtual void free(char* p) = 0; - }; - MemoryPool* getDefaultPool(); - - template <class T> - class DataBuffer { - private: - MemoryPool& memoryPool; - T* buf; - // current size - uint64_t currentSize; - // maximal capacity (actual allocated memory) - uint64_t currentCapacity; - - // not implemented - DataBuffer(DataBuffer& buffer); - DataBuffer& operator=(DataBuffer& buffer); - - public: - DataBuffer(MemoryPool& pool, uint64_t _size = 0); - - DataBuffer(DataBuffer<T>&& buffer) ORC_NOEXCEPT; - - virtual ~DataBuffer(); - - T* data() { - return buf; - } - - const T* data() const { - return buf; - } - - uint64_t size() { - return currentSize; - } - - uint64_t capacity() { - return currentCapacity; - } - - T& operator[](uint64_t i) { - return buf[i]; - } - - void reserve(uint64_t _size); - void resize(uint64_t _size); - }; - - // Specializations for char - - template <> - DataBuffer<char>::~DataBuffer(); - - template <> - void DataBuffer<char>::resize(uint64_t newSize); - - // Specializations for char* - - template <> - DataBuffer<char*>::~DataBuffer(); - - template <> - void DataBuffer<char*>::resize(uint64_t newSize); - - // Specializations for double - - template <> - DataBuffer<double>::~DataBuffer(); - - template <> - void DataBuffer<double>::resize(uint64_t newSize); - - // Specializations for int64_t - - template <> - DataBuffer<int64_t>::~DataBuffer(); - - template <> - void DataBuffer<int64_t>::resize(uint64_t newSize); - - // Specializations for uint64_t - - template <> - DataBuffer<uint64_t>::~DataBuffer(); - - template <> - void DataBuffer<uint64_t>::resize(uint64_t newSize); - - // Specializations for unsigned char - - template <> - DataBuffer<unsigned char>::~DataBuffer(); - - template <> - void DataBuffer<unsigned char>::resize(uint64_t newSize); - - #ifdef __clang__ - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wweak-template-vtables" - #endif - - extern template class DataBuffer<char>; - extern template class DataBuffer<char*>; - extern template class DataBuffer<double>; - extern template class 
DataBuffer<Int128>; - extern template class DataBuffer<int64_t>; - extern template class DataBuffer<uint64_t>; - extern template class DataBuffer<unsigned char>; - - #ifdef __clang__ - #pragma clang diagnostic pop - #endif -} // namespace orc - - -#endif /* MEMORYPOOL_HH_ */ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MEMORYPOOL_HH_ +#define MEMORYPOOL_HH_ + +#include "orc/orc-config.hh" +#include "orc/Int128.hh" + +#include <memory> + +namespace orc { + + class MemoryPool { + public: + virtual ~MemoryPool(); + + virtual char* malloc(uint64_t size) = 0; + virtual void free(char* p) = 0; + }; + MemoryPool* getDefaultPool(); + + template <class T> + class DataBuffer { + private: + MemoryPool& memoryPool; + T* buf; + // current size + uint64_t currentSize; + // maximal capacity (actual allocated memory) + uint64_t currentCapacity; + + // not implemented + DataBuffer(DataBuffer& buffer); + DataBuffer& operator=(DataBuffer& buffer); + + public: + DataBuffer(MemoryPool& pool, uint64_t _size = 0); + + DataBuffer(DataBuffer<T>&& buffer) ORC_NOEXCEPT; + + virtual ~DataBuffer(); + + T* data() { + return buf; + } + + const T* data() const { + return buf; + } + + uint64_t size() { + return currentSize; + } + + uint64_t capacity() { + return currentCapacity; + } + + T& operator[](uint64_t i) { + return buf[i]; + } + + void reserve(uint64_t _size); + void resize(uint64_t _size); + }; + + // Specializations for char + + template <> + DataBuffer<char>::~DataBuffer(); + + template <> + void DataBuffer<char>::resize(uint64_t newSize); + + // Specializations for char* + + template <> + DataBuffer<char*>::~DataBuffer(); + + template <> + void DataBuffer<char*>::resize(uint64_t newSize); + + // Specializations for double + + template <> + DataBuffer<double>::~DataBuffer(); + + template <> + void DataBuffer<double>::resize(uint64_t newSize); + + // Specializations for int64_t + + template <> + DataBuffer<int64_t>::~DataBuffer(); + + template <> + void DataBuffer<int64_t>::resize(uint64_t newSize); + + // Specializations for uint64_t + + template <> + DataBuffer<uint64_t>::~DataBuffer(); + + template <> + void DataBuffer<uint64_t>::resize(uint64_t newSize); + + // Specializations for unsigned char + + template <> + DataBuffer<unsigned char>::~DataBuffer(); + + template <> + void DataBuffer<unsigned char>::resize(uint64_t newSize); + + #ifdef __clang__ + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wweak-template-vtables" + #endif + + extern template class DataBuffer<char>; + extern template class DataBuffer<char*>; + extern template class DataBuffer<double>; + extern template class DataBuffer<Int128>; + extern template class DataBuffer<int64_t>; + extern template class DataBuffer<uint64_t>; + extern template class 
DataBuffer<unsigned char>; + + #ifdef __clang__ + #pragma clang diagnostic pop + #endif +} // namespace orc + + +#endif /* MEMORYPOOL_HH_ */ diff --git a/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh b/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh index c64853168a..541d725bfc 100644 --- a/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh +++ b/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh @@ -1,148 +1,148 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_FILE_HH -#define ORC_FILE_HH - -#include <string> - -#include "orc/orc-config.hh" -#include "orc/Reader.hh" -#include "orc/Writer.hh" - -/** /file orc/OrcFile.hh - @brief The top level interface to ORC. -*/ - -namespace orc { - - /** - * An abstract interface for providing ORC readers a stream of bytes. - */ - class InputStream { - public: - virtual ~InputStream(); - - /** - * Get the total length of the file in bytes. - */ - virtual uint64_t getLength() const = 0; - - /** - * Get the natural size for reads. - * @return the number of bytes that should be read at once - */ - virtual uint64_t getNaturalReadSize() const = 0; - - /** - * Read length bytes from the file starting at offset into - * the buffer starting at buf. - * @param buf the starting position of a buffer. - * @param length the number of bytes to read. - * @param offset the position in the stream to read from. - */ - virtual void read(void* buf, - uint64_t length, - uint64_t offset) = 0; - - /** - * Get the name of the stream for error messages. - */ - virtual const std::string& getName() const = 0; - }; - - /** - * An abstract interface for providing ORC writer a stream of bytes. - */ - class OutputStream { - public: - virtual ~OutputStream(); - - /** - * Get the total length of bytes written. - */ - virtual uint64_t getLength() const = 0; - - /** - * Get the natural size for reads. - * @return the number of bytes that should be written at once - */ - virtual uint64_t getNaturalWriteSize() const = 0; - - /** - * Write/Append length bytes pointed by buf to the file stream - * @param buf the starting position of a buffer. - * @param length the number of bytes to write. - */ - virtual void write(const void* buf, size_t length) = 0; - - /** - * Get the name of the stream for error messages. - */ - virtual const std::string& getName() const = 0; - - /** - * Close the stream and flush any pending data to the disk. - */ - virtual void close() = 0; - }; - - /** - * Create a stream to a local file or HDFS file if path begins with "hdfs://" - * @param path the name of the file in the local file system or HDFS - */ - ORC_UNIQUE_PTR<InputStream> readFile(const std::string& path); - - /** - * Create a stream to a local file. 
- * @param path the name of the file in the local file system - */ - ORC_UNIQUE_PTR<InputStream> readLocalFile(const std::string& path); - - /** - * Create a stream to an HDFS file. - * @param path the uri of the file in HDFS - */ - ORC_UNIQUE_PTR<InputStream> readHdfsFile(const std::string& path); - - /** - * Create a reader to read the ORC file. - * @param stream the stream to read - * @param options the options for reading the file - */ - ORC_UNIQUE_PTR<Reader> createReader(ORC_UNIQUE_PTR<InputStream> stream, - const ReaderOptions& options); - /** - * Create a stream to write to a local file. - * @param path the name of the file in the local file system - */ - ORC_UNIQUE_PTR<OutputStream> writeLocalFile(const std::string& path); - - /** - * Create a writer to write the ORC file. - * @param type the type of data to be written - * @param stream the stream to write to - * @param options the options for writing the file - */ - ORC_UNIQUE_PTR<Writer> createWriter( - const Type& type, - OutputStream* stream, - const WriterOptions& options); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_FILE_HH +#define ORC_FILE_HH + +#include <string> + +#include "orc/orc-config.hh" +#include "orc/Reader.hh" +#include "orc/Writer.hh" + +/** /file orc/OrcFile.hh + @brief The top level interface to ORC. +*/ + +namespace orc { + + /** + * An abstract interface for providing ORC readers a stream of bytes. + */ + class InputStream { + public: + virtual ~InputStream(); + + /** + * Get the total length of the file in bytes. + */ + virtual uint64_t getLength() const = 0; + + /** + * Get the natural size for reads. + * @return the number of bytes that should be read at once + */ + virtual uint64_t getNaturalReadSize() const = 0; + + /** + * Read length bytes from the file starting at offset into + * the buffer starting at buf. + * @param buf the starting position of a buffer. + * @param length the number of bytes to read. + * @param offset the position in the stream to read from. + */ + virtual void read(void* buf, + uint64_t length, + uint64_t offset) = 0; + + /** + * Get the name of the stream for error messages. + */ + virtual const std::string& getName() const = 0; + }; + + /** + * An abstract interface for providing ORC writer a stream of bytes. + */ + class OutputStream { + public: + virtual ~OutputStream(); + + /** + * Get the total length of bytes written. + */ + virtual uint64_t getLength() const = 0; + + /** + * Get the natural size for reads. + * @return the number of bytes that should be written at once + */ + virtual uint64_t getNaturalWriteSize() const = 0; + + /** + * Write/Append length bytes pointed by buf to the file stream + * @param buf the starting position of a buffer. 
+ * @param length the number of bytes to write. + */ + virtual void write(const void* buf, size_t length) = 0; + + /** + * Get the name of the stream for error messages. + */ + virtual const std::string& getName() const = 0; + + /** + * Close the stream and flush any pending data to the disk. + */ + virtual void close() = 0; + }; + + /** + * Create a stream to a local file or HDFS file if path begins with "hdfs://" + * @param path the name of the file in the local file system or HDFS + */ + ORC_UNIQUE_PTR<InputStream> readFile(const std::string& path); + + /** + * Create a stream to a local file. + * @param path the name of the file in the local file system + */ + ORC_UNIQUE_PTR<InputStream> readLocalFile(const std::string& path); + + /** + * Create a stream to an HDFS file. + * @param path the uri of the file in HDFS + */ + ORC_UNIQUE_PTR<InputStream> readHdfsFile(const std::string& path); + + /** + * Create a reader to read the ORC file. + * @param stream the stream to read + * @param options the options for reading the file + */ + ORC_UNIQUE_PTR<Reader> createReader(ORC_UNIQUE_PTR<InputStream> stream, + const ReaderOptions& options); + /** + * Create a stream to write to a local file. + * @param path the name of the file in the local file system + */ + ORC_UNIQUE_PTR<OutputStream> writeLocalFile(const std::string& path); + + /** + * Create a writer to write the ORC file. + * @param type the type of data to be written + * @param stream the stream to write to + * @param options the options for writing the file + */ + ORC_UNIQUE_PTR<Writer> createWriter( + const Type& type, + OutputStream* stream, + const WriterOptions& options); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Reader.hh b/contrib/libs/apache/orc/c++/include/orc/Reader.hh index 5d9a532c11..55c95557fc 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Reader.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Reader.hh @@ -1,550 +1,550 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_READER_HH -#define ORC_READER_HH - -#include "orc/BloomFilter.hh" -#include "orc/Common.hh" -#include "orc/orc-config.hh" -#include "orc/Statistics.hh" -#include "orc/Type.hh" -#include "orc/Vector.hh" - -#include <map> -#include <memory> -#include <set> -#include <string> -#include <vector> - -namespace orc { - - // classes that hold data members so we can maintain binary compatibility - struct ReaderOptionsPrivate; - struct RowReaderOptionsPrivate; - - /** - * Options for creating a Reader. 
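The factory functions declared in OrcFile.hh above compose directly. The following is a minimal sketch, not part of the diff: it assumes ORC_UNIQUE_PTR resolves to std::unique_ptr (the usual C++11 build) and that a local file named "example.orc" exists.

// Sketch: open a local ORC file with the OrcFile.hh factory functions.
// "example.orc" is a placeholder file name, not part of the API.
#include <orc/OrcFile.hh>

#include <iostream>
#include <memory>
#include <utility>

int main() {
  std::unique_ptr<orc::InputStream> in = orc::readLocalFile("example.orc");
  orc::ReaderOptions options;  // defaults: default MemoryPool, std::cerr for messages
  std::unique_ptr<orc::Reader> reader =
      orc::createReader(std::move(in), options);
  std::cout << "rows: " << reader->getNumberOfRows() << std::endl;
  return 0;
}

On the write side, writeLocalFile and createWriter compose the same way, taking an OutputStream* together with a WriterOptions object.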
- */ - class ReaderOptions { - private: - ORC_UNIQUE_PTR<ReaderOptionsPrivate> privateBits; - - public: - ReaderOptions(); - ReaderOptions(const ReaderOptions&); - ReaderOptions(ReaderOptions&); - ReaderOptions& operator=(const ReaderOptions&); - virtual ~ReaderOptions(); - - /** - * Set the stream to use for printing warning or error messages. - */ - ReaderOptions& setErrorStream(std::ostream& stream); - - /** - * Set a serialized copy of the file tail to be used when opening the file. - * - * When one process opens the file and other processes need to read - * the rows, we want to enable clients to just read the tail once. - * By passing the string returned by Reader.getSerializedFileTail(), to - * this function, the second reader will not need to read the file tail - * from disk. - * - * @param serialization the bytes of the serialized tail to use - */ - ReaderOptions& setSerializedFileTail(const std::string& serialization); - - /** - * Set the memory allocator. - */ - ReaderOptions& setMemoryPool(MemoryPool& pool); - - /** - * Set the location of the tail as defined by the logical length of the - * file. - */ - ReaderOptions& setTailLocation(uint64_t offset); - - /** - * Get the stream to write warnings or errors to. - */ - std::ostream* getErrorStream() const; - - /** - * Get the serialized file tail that the user passed in. - */ - std::string getSerializedFileTail() const; - - /** - * Get the desired tail location. - * @return if not set, return the maximum long. - */ - uint64_t getTailLocation() const; - - /** - * Get the memory allocator. - */ - MemoryPool* getMemoryPool() const; - }; - - /** - * Options for creating a RowReader. - */ - class RowReaderOptions { - private: - ORC_UNIQUE_PTR<RowReaderOptionsPrivate> privateBits; - - public: - RowReaderOptions(); - RowReaderOptions(const RowReaderOptions&); - RowReaderOptions(RowReaderOptions&); - RowReaderOptions& operator=(const RowReaderOptions&); - virtual ~RowReaderOptions(); - - /** - * For files that have structs as the top-level object, select the fields - * to read. The first field is 0, the second 1, and so on. By default, - * all columns are read. This option clears any previous setting of - * the selected columns. - * @param include a list of fields to read - * @return this - */ - RowReaderOptions& include(const std::list<uint64_t>& include); - - /** - * For files that have structs as the top-level object, select the fields - * to read by name. By default, all columns are read. This option clears - * any previous setting of the selected columns. - * @param include a list of fields to read - * @return this - */ - RowReaderOptions& include(const std::list<std::string>& include); - - /** - * Selects which type ids to read. The root type is always 0 and the - * rest of the types are labeled in a preorder traversal of the tree. - * The parent types are automatically selected, but the children are not. - * - * This option clears any previous setting of the selected columns or - * types. - * @param types a list of the type ids to read - * @return this - */ - RowReaderOptions& includeTypes(const std::list<uint64_t>& types); - - /** - * Set the section of the file to process. - * @param offset the starting byte offset - * @param length the number of bytes to read - * @return this - */ - RowReaderOptions& range(uint64_t offset, uint64_t length); - - /** - * For Hive 0.11 (and 0.12) decimals, the precision was unlimited - * and thus may overflow the 38 digits that is supported. 
If one - * of the Hive 0.11 decimals is too large, the reader may either convert - * the value to NULL or throw an exception. That choice is controlled - * by this setting. - * - * Defaults to true. - * - * @param shouldThrow should the reader throw a ParseError? - * @return returns *this - */ - RowReaderOptions& throwOnHive11DecimalOverflow(bool shouldThrow); - - /** - * For Hive 0.11 (and 0.12) written decimals, which have unlimited - * scale and precision, the reader forces the scale to a consistent - * number that is configured. This setting changes the scale that is - * forced upon these old decimals. See also throwOnHive11DecimalOverflow. - * - * Defaults to 6. - * - * @param forcedScale the scale that will be forced on Hive 0.11 decimals - * @return returns *this - */ - RowReaderOptions& forcedScaleOnHive11Decimal(int32_t forcedScale); - - /** - * Set enable encoding block mode. - * By enable encoding block mode, Row Reader will not decode - * dictionary encoded string vector, but instead return an index array with - * reference to corresponding dictionary. - */ - RowReaderOptions& setEnableLazyDecoding(bool enable); - - /** - * Should enable encoding block mode - */ - bool getEnableLazyDecoding() const; - - /** - * Were the field ids set? - */ - bool getIndexesSet() const; - - /** - * Were the type ids set? - */ - bool getTypeIdsSet() const; - - /** - * Get the list of selected field or type ids to read. - */ - const std::list<uint64_t>& getInclude() const; - - /** - * Were the include names set? - */ - bool getNamesSet() const; - - /** - * Get the list of selected columns to read. All children of the selected - * columns are also selected. - */ - const std::list<std::string>& getIncludeNames() const; - - /** - * Get the start of the range for the data being processed. - * @return if not set, return 0 - */ - uint64_t getOffset() const; - - /** - * Get the end of the range for the data being processed. - * @return if not set, return the maximum long - */ - uint64_t getLength() const; - - /** - * Should the reader throw a ParseError when a Hive 0.11 decimal is - * larger than the supported 38 digits of precision? Otherwise, the - * data item is replaced by a NULL. - */ - bool getThrowOnHive11DecimalOverflow() const; - - /** - * What scale should all Hive 0.11 decimals be normalized to? - */ - int32_t getForcedScaleOnHive11Decimal() const; - }; - - - class RowReader; - - /** - * The interface for reading ORC file meta-data and constructing RowReaders. - * This is an an abstract class that will be subclassed as necessary. - */ - class Reader { - public: - virtual ~Reader(); - - /** - * Get the format version of the file. Currently known values are: - * 0.11 and 0.12 - * @return the FileVersion object - */ - virtual FileVersion getFormatVersion() const = 0; - - /** - * Get the number of rows in the file. - * @return the number of rows - */ - virtual uint64_t getNumberOfRows() const = 0; - - /** +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_READER_HH +#define ORC_READER_HH + +#include "orc/BloomFilter.hh" +#include "orc/Common.hh" +#include "orc/orc-config.hh" +#include "orc/Statistics.hh" +#include "orc/Type.hh" +#include "orc/Vector.hh" + +#include <map> +#include <memory> +#include <set> +#include <string> +#include <vector> + +namespace orc { + + // classes that hold data members so we can maintain binary compatibility + struct ReaderOptionsPrivate; + struct RowReaderOptionsPrivate; + + /** + * Options for creating a Reader. + */ + class ReaderOptions { + private: + ORC_UNIQUE_PTR<ReaderOptionsPrivate> privateBits; + + public: + ReaderOptions(); + ReaderOptions(const ReaderOptions&); + ReaderOptions(ReaderOptions&); + ReaderOptions& operator=(const ReaderOptions&); + virtual ~ReaderOptions(); + + /** + * Set the stream to use for printing warning or error messages. + */ + ReaderOptions& setErrorStream(std::ostream& stream); + + /** + * Set a serialized copy of the file tail to be used when opening the file. + * + * When one process opens the file and other processes need to read + * the rows, we want to enable clients to just read the tail once. + * By passing the string returned by Reader.getSerializedFileTail(), to + * this function, the second reader will not need to read the file tail + * from disk. + * + * @param serialization the bytes of the serialized tail to use + */ + ReaderOptions& setSerializedFileTail(const std::string& serialization); + + /** + * Set the memory allocator. + */ + ReaderOptions& setMemoryPool(MemoryPool& pool); + + /** + * Set the location of the tail as defined by the logical length of the + * file. + */ + ReaderOptions& setTailLocation(uint64_t offset); + + /** + * Get the stream to write warnings or errors to. + */ + std::ostream* getErrorStream() const; + + /** + * Get the serialized file tail that the user passed in. + */ + std::string getSerializedFileTail() const; + + /** + * Get the desired tail location. + * @return if not set, return the maximum long. + */ + uint64_t getTailLocation() const; + + /** + * Get the memory allocator. + */ + MemoryPool* getMemoryPool() const; + }; + + /** + * Options for creating a RowReader. + */ + class RowReaderOptions { + private: + ORC_UNIQUE_PTR<RowReaderOptionsPrivate> privateBits; + + public: + RowReaderOptions(); + RowReaderOptions(const RowReaderOptions&); + RowReaderOptions(RowReaderOptions&); + RowReaderOptions& operator=(const RowReaderOptions&); + virtual ~RowReaderOptions(); + + /** + * For files that have structs as the top-level object, select the fields + * to read. The first field is 0, the second 1, and so on. By default, + * all columns are read. This option clears any previous setting of + * the selected columns. + * @param include a list of fields to read + * @return this + */ + RowReaderOptions& include(const std::list<uint64_t>& include); + + /** + * For files that have structs as the top-level object, select the fields + * to read by name. By default, all columns are read. This option clears + * any previous setting of the selected columns. 
+ * @param include a list of fields to read + * @return this + */ + RowReaderOptions& include(const std::list<std::string>& include); + + /** + * Selects which type ids to read. The root type is always 0 and the + * rest of the types are labeled in a preorder traversal of the tree. + * The parent types are automatically selected, but the children are not. + * + * This option clears any previous setting of the selected columns or + * types. + * @param types a list of the type ids to read + * @return this + */ + RowReaderOptions& includeTypes(const std::list<uint64_t>& types); + + /** + * Set the section of the file to process. + * @param offset the starting byte offset + * @param length the number of bytes to read + * @return this + */ + RowReaderOptions& range(uint64_t offset, uint64_t length); + + /** + * For Hive 0.11 (and 0.12) decimals, the precision was unlimited + * and thus may overflow the 38 digits that is supported. If one + * of the Hive 0.11 decimals is too large, the reader may either convert + * the value to NULL or throw an exception. That choice is controlled + * by this setting. + * + * Defaults to true. + * + * @param shouldThrow should the reader throw a ParseError? + * @return returns *this + */ + RowReaderOptions& throwOnHive11DecimalOverflow(bool shouldThrow); + + /** + * For Hive 0.11 (and 0.12) written decimals, which have unlimited + * scale and precision, the reader forces the scale to a consistent + * number that is configured. This setting changes the scale that is + * forced upon these old decimals. See also throwOnHive11DecimalOverflow. + * + * Defaults to 6. + * + * @param forcedScale the scale that will be forced on Hive 0.11 decimals + * @return returns *this + */ + RowReaderOptions& forcedScaleOnHive11Decimal(int32_t forcedScale); + + /** + * Set enable encoding block mode. + * By enable encoding block mode, Row Reader will not decode + * dictionary encoded string vector, but instead return an index array with + * reference to corresponding dictionary. + */ + RowReaderOptions& setEnableLazyDecoding(bool enable); + + /** + * Should enable encoding block mode + */ + bool getEnableLazyDecoding() const; + + /** + * Were the field ids set? + */ + bool getIndexesSet() const; + + /** + * Were the type ids set? + */ + bool getTypeIdsSet() const; + + /** + * Get the list of selected field or type ids to read. + */ + const std::list<uint64_t>& getInclude() const; + + /** + * Were the include names set? + */ + bool getNamesSet() const; + + /** + * Get the list of selected columns to read. All children of the selected + * columns are also selected. + */ + const std::list<std::string>& getIncludeNames() const; + + /** + * Get the start of the range for the data being processed. + * @return if not set, return 0 + */ + uint64_t getOffset() const; + + /** + * Get the end of the range for the data being processed. + * @return if not set, return the maximum long + */ + uint64_t getLength() const; + + /** + * Should the reader throw a ParseError when a Hive 0.11 decimal is + * larger than the supported 38 digits of precision? Otherwise, the + * data item is replaced by a NULL. + */ + bool getThrowOnHive11DecimalOverflow() const; + + /** + * What scale should all Hive 0.11 decimals be normalized to? + */ + int32_t getForcedScaleOnHive11Decimal() const; + }; + + + class RowReader; + + /** + * The interface for reading ORC file meta-data and constructing RowReaders. + * This is an an abstract class that will be subclassed as necessary. 
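The RowReaderOptions setters documented above chain naturally. A hedged sketch follows; the column names "col0"/"col2" and the 128 MiB range are illustrative only.

// Sketch: select columns by name, restrict the byte range, and keep
// dictionary-encoded strings as indices instead of decoded values.
#include <orc/Reader.hh>

#include <list>
#include <string>

orc::RowReaderOptions makeRowReaderOptions() {
  orc::RowReaderOptions opts;
  opts.include(std::list<std::string>{"col0", "col2"});  // fields by name
  opts.range(0, 128 * 1024 * 1024);  // only stripes starting in the first 128 MiB
  opts.setEnableLazyDecoding(true);  // return dictionary indices on read
  return opts;
}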
+ */ + class Reader { + public: + virtual ~Reader(); + + /** + * Get the format version of the file. Currently known values are: + * 0.11 and 0.12 + * @return the FileVersion object + */ + virtual FileVersion getFormatVersion() const = 0; + + /** + * Get the number of rows in the file. + * @return the number of rows + */ + virtual uint64_t getNumberOfRows() const = 0; + + /** * Get the software instance and version that wrote this file. * @return a user-facing string that specifies the software version */ virtual std::string getSoftwareVersion() const = 0; /** - * Get the user metadata keys. - * @return the set of user metadata keys - */ - virtual std::list<std::string> getMetadataKeys() const = 0; - - /** - * Get a user metadata value. - * @param key a key given by the user - * @return the bytes associated with the given key - */ - virtual std::string getMetadataValue(const std::string& key) const = 0; - - /** - * Did the user set the given metadata value. - * @param key the key to check - * @return true if the metadata value was set - */ - virtual bool hasMetadataValue(const std::string& key) const = 0; - - /** - * Get the compression kind. - * @return the kind of compression in the file - */ - virtual CompressionKind getCompression() const = 0; - - /** - * Get the buffer size for the compression. - * @return number of bytes to buffer for the compression codec. - */ - virtual uint64_t getCompressionSize() const = 0; - - /** - * Get ID of writer that generated the file. - * @return UNKNOWN_WRITER if the writer ID is undefined - */ - virtual WriterId getWriterId() const = 0; - - /** - * Get the writer id value when getWriterId() returns an unknown writer. - * @return the integer value of the writer ID. - */ - virtual uint32_t getWriterIdValue() const = 0; - - /** - * Get the version of the writer. - * @return the version of the writer. - */ - virtual WriterVersion getWriterVersion() const = 0; - - /** - * Get the number of rows per an entry in the row index. - * @return the number of rows per an entry in the row index or 0 if there - * is no row index. - */ - virtual uint64_t getRowIndexStride() const = 0; - - /** - * Get the number of stripes in the file. - * @return the number of stripes - */ - virtual uint64_t getNumberOfStripes() const = 0; - - /** - * Get the information about a stripe. - * @param stripeIndex the index of the stripe (0 to N-1) to get information about - * @return the information about that stripe - */ - virtual ORC_UNIQUE_PTR<StripeInformation> - getStripe(uint64_t stripeIndex) const = 0; - - /** - * Get the number of stripe statistics in the file. - * @return the number of stripe statistics - */ - virtual uint64_t getNumberOfStripeStatistics() const = 0; - - /** - * Get the statistics about a stripe. - * @param stripeIndex the index of the stripe (0 to N-1) to get statistics about - * @return the statistics about that stripe - */ - virtual ORC_UNIQUE_PTR<StripeStatistics> - getStripeStatistics(uint64_t stripeIndex) const = 0; - - /** - * Get the length of the data stripes in the file. - * @return the number of bytes in stripes - */ - virtual uint64_t getContentLength() const = 0; - - /** - * Get the length of the file stripe statistics. - * @return the number of compressed bytes in the file stripe statistics - */ - virtual uint64_t getStripeStatisticsLength() const = 0; - - /** - * Get the length of the file footer. 
- * @return the number of compressed bytes in the file footer - */ - virtual uint64_t getFileFooterLength() const = 0; - - /** - * Get the length of the file postscript. - * @return the number of bytes in the file postscript - */ - virtual uint64_t getFilePostscriptLength() const = 0; - - /** - * Get the total length of the file. - * @return the number of bytes in the file - */ - virtual uint64_t getFileLength() const = 0; - - /** - * Get the statistics about the columns in the file. - * @return the information about the column - */ - virtual ORC_UNIQUE_PTR<Statistics> getStatistics() const = 0; - - /** - * Get the statistics about a single column in the file. - * @param columnId id of the column - * @return the information about the column - */ - virtual ORC_UNIQUE_PTR<ColumnStatistics> - getColumnStatistics(uint32_t columnId) const = 0; - - /** - * Check if the file has correct column statistics. - */ - virtual bool hasCorrectStatistics() const = 0; - - /** - * Get the serialized file tail. - * Usefull if another reader of the same file wants to avoid re-reading - * the file tail. See ReaderOptions.setSerializedFileTail(). - * @return a string of bytes with the file tail - */ - virtual std::string getSerializedFileTail() const = 0; - - /** - * Get the type of the rows in the file. The top level is typically a - * struct. - * @return the root type - */ - virtual const Type& getType() const = 0; - - /** - * Create a RowReader based on this reader with the default options. - * @return a RowReader to read the rows - */ - virtual ORC_UNIQUE_PTR<RowReader> createRowReader() const = 0; - - /** - * Create a RowReader based on this reader. - * @param options RowReader Options - * @return a RowReader to read the rows - */ - virtual ORC_UNIQUE_PTR<RowReader> createRowReader(const RowReaderOptions& options) const = 0; - - /** - * Get the name of the input stream. - */ - virtual const std::string& getStreamName() const = 0; - - /** - * Estimate an upper bound on heap memory allocation by the Reader - * based on the information in the file footer. - * The bound is less tight if only few columns are read or compression is - * used. - */ - /** - * @param stripeIx index of the stripe to be read (if not specified, - * all stripes are considered). - * @return upper bound on memory use by all columns - */ - virtual uint64_t getMemoryUse(int stripeIx=-1) = 0; - - /** - * @param include Column Field Ids - * @param stripeIx index of the stripe to be read (if not specified, - * all stripes are considered). - * @return upper bound on memory use by selected columns - */ - virtual uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) = 0; - - /** - * @param names Column Names - * @param stripeIx index of the stripe to be read (if not specified, - * all stripes are considered). - * @return upper bound on memory use by selected columns - */ - virtual uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) = 0; - - /** - * @param include Column Type Ids - * @param stripeIx index of the stripe to be read (if not specified, - * all stripes are considered). - * @return upper bound on memory use by selected columns - */ - virtual uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) = 0; - - /** - * Get BloomFiters of all selected columns in the specified stripe - * @param stripeIndex index of the stripe to be read for bloom filters. 
- * @param included index of selected columns to return (if not specified, - * all columns that have bloom filters are considered). - * @return map of bloom filters with the key standing for the index of column. - */ - virtual std::map<uint32_t, BloomFilterIndex> - getBloomFilters(uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0; - }; - - /** - * The interface for reading rows in ORC files. - * This is an an abstract class that will be subclassed as necessary. - */ - class RowReader { - public: - virtual ~RowReader(); - /** - * Get the selected type of the rows in the file. The file's row type - * is projected down to just the selected columns. Thus, if the file's - * type is struct<col0:int,col1:double,col2:string> and the selected - * columns are "col0,col2" the selected type would be - * struct<col0:int,col2:string>. - * @return the root type - */ - virtual const Type& getSelectedType() const = 0; - - /** - * Get the selected columns of the file. - */ - virtual const std::vector<bool> getSelectedColumns() const = 0; - - /** - * Create a row batch for reading the selected columns of this file. - * @param size the number of rows to read - * @return a new ColumnVectorBatch to read into - */ - virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size - ) const = 0; - - /** - * Read the next row batch from the current position. - * Caller must look at numElements in the row batch to determine how - * many rows were read. - * @param data the row batch to read into. - * @return true if a non-zero number of rows were read or false if the - * end of the file was reached. - */ - virtual bool next(ColumnVectorBatch& data) = 0; - - /** - * Get the row number of the first row in the previously read batch. - * @return the row number of the previous batch. - */ - virtual uint64_t getRowNumber() const = 0; - - /** - * Seek to a given row. - * @param rowNumber the next row the reader should return - */ - virtual void seekToRow(uint64_t rowNumber) = 0; - - }; -} - -#endif + * Get the user metadata keys. + * @return the set of user metadata keys + */ + virtual std::list<std::string> getMetadataKeys() const = 0; + + /** + * Get a user metadata value. + * @param key a key given by the user + * @return the bytes associated with the given key + */ + virtual std::string getMetadataValue(const std::string& key) const = 0; + + /** + * Did the user set the given metadata value. + * @param key the key to check + * @return true if the metadata value was set + */ + virtual bool hasMetadataValue(const std::string& key) const = 0; + + /** + * Get the compression kind. + * @return the kind of compression in the file + */ + virtual CompressionKind getCompression() const = 0; + + /** + * Get the buffer size for the compression. + * @return number of bytes to buffer for the compression codec. + */ + virtual uint64_t getCompressionSize() const = 0; + + /** + * Get ID of writer that generated the file. + * @return UNKNOWN_WRITER if the writer ID is undefined + */ + virtual WriterId getWriterId() const = 0; + + /** + * Get the writer id value when getWriterId() returns an unknown writer. + * @return the integer value of the writer ID. + */ + virtual uint32_t getWriterIdValue() const = 0; + + /** + * Get the version of the writer. + * @return the version of the writer. + */ + virtual WriterVersion getWriterVersion() const = 0; + + /** + * Get the number of rows per an entry in the row index. + * @return the number of rows per an entry in the row index or 0 if there + * is no row index. 
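getBloomFilters, declared just above, returns one BloomFilterIndex per selected column, with one BloomFilter entry per row group. The sketch below is a hedged illustration: it assumes the entries vector and testLong method exposed by orc/BloomFilter.hh, and the columnId and value arguments are placeholders.

// Sketch: ask whether `value` can appear anywhere in a stripe's column.
// A negative answer is definite; a positive answer only means "maybe".
#include <orc/OrcFile.hh>
#include <orc/BloomFilter.hh>

#include <map>
#include <set>

bool stripeMightContain(const orc::Reader& reader, uint32_t stripeIndex,
                        uint32_t columnId, int64_t value) {
  std::map<uint32_t, orc::BloomFilterIndex> filters =
      reader.getBloomFilters(stripeIndex, std::set<uint32_t>{columnId});
  auto it = filters.find(columnId);
  if (it == filters.end()) {
    return true;  // no bloom filter written: cannot rule the value out
  }
  for (const auto& entry : it->second.entries) {
    if (entry->testLong(value)) {
      return true;  // at least one row group may contain the value
    }
  }
  return false;
}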
+ */ + virtual uint64_t getRowIndexStride() const = 0; + + /** + * Get the number of stripes in the file. + * @return the number of stripes + */ + virtual uint64_t getNumberOfStripes() const = 0; + + /** + * Get the information about a stripe. + * @param stripeIndex the index of the stripe (0 to N-1) to get information about + * @return the information about that stripe + */ + virtual ORC_UNIQUE_PTR<StripeInformation> + getStripe(uint64_t stripeIndex) const = 0; + + /** + * Get the number of stripe statistics in the file. + * @return the number of stripe statistics + */ + virtual uint64_t getNumberOfStripeStatistics() const = 0; + + /** + * Get the statistics about a stripe. + * @param stripeIndex the index of the stripe (0 to N-1) to get statistics about + * @return the statistics about that stripe + */ + virtual ORC_UNIQUE_PTR<StripeStatistics> + getStripeStatistics(uint64_t stripeIndex) const = 0; + + /** + * Get the length of the data stripes in the file. + * @return the number of bytes in stripes + */ + virtual uint64_t getContentLength() const = 0; + + /** + * Get the length of the file stripe statistics. + * @return the number of compressed bytes in the file stripe statistics + */ + virtual uint64_t getStripeStatisticsLength() const = 0; + + /** + * Get the length of the file footer. + * @return the number of compressed bytes in the file footer + */ + virtual uint64_t getFileFooterLength() const = 0; + + /** + * Get the length of the file postscript. + * @return the number of bytes in the file postscript + */ + virtual uint64_t getFilePostscriptLength() const = 0; + + /** + * Get the total length of the file. + * @return the number of bytes in the file + */ + virtual uint64_t getFileLength() const = 0; + + /** + * Get the statistics about the columns in the file. + * @return the information about the column + */ + virtual ORC_UNIQUE_PTR<Statistics> getStatistics() const = 0; + + /** + * Get the statistics about a single column in the file. + * @param columnId id of the column + * @return the information about the column + */ + virtual ORC_UNIQUE_PTR<ColumnStatistics> + getColumnStatistics(uint32_t columnId) const = 0; + + /** + * Check if the file has correct column statistics. + */ + virtual bool hasCorrectStatistics() const = 0; + + /** + * Get the serialized file tail. + * Usefull if another reader of the same file wants to avoid re-reading + * the file tail. See ReaderOptions.setSerializedFileTail(). + * @return a string of bytes with the file tail + */ + virtual std::string getSerializedFileTail() const = 0; + + /** + * Get the type of the rows in the file. The top level is typically a + * struct. + * @return the root type + */ + virtual const Type& getType() const = 0; + + /** + * Create a RowReader based on this reader with the default options. + * @return a RowReader to read the rows + */ + virtual ORC_UNIQUE_PTR<RowReader> createRowReader() const = 0; + + /** + * Create a RowReader based on this reader. + * @param options RowReader Options + * @return a RowReader to read the rows + */ + virtual ORC_UNIQUE_PTR<RowReader> createRowReader(const RowReaderOptions& options) const = 0; + + /** + * Get the name of the input stream. + */ + virtual const std::string& getStreamName() const = 0; + + /** + * Estimate an upper bound on heap memory allocation by the Reader + * based on the information in the file footer. + * The bound is less tight if only few columns are read or compression is + * used. 
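The memory-estimation calls sketched below mirror the getMemoryUse family declared in this header; the column names are placeholders.

// Sketch: upper bounds on reader heap usage, for all columns and for a
// by-name selection. "col0"/"col2" are placeholder column names.
#include <orc/OrcFile.hh>

#include <iostream>
#include <list>
#include <string>

void reportMemoryEstimates(orc::Reader& reader) {
  std::cout << "all columns: " << reader.getMemoryUse() << " bytes\n";
  std::cout << "col0, col2:  "
            << reader.getMemoryUseByName(std::list<std::string>{"col0", "col2"})
            << " bytes\n";
}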
+ */ + /** + * @param stripeIx index of the stripe to be read (if not specified, + * all stripes are considered). + * @return upper bound on memory use by all columns + */ + virtual uint64_t getMemoryUse(int stripeIx=-1) = 0; + + /** + * @param include Column Field Ids + * @param stripeIx index of the stripe to be read (if not specified, + * all stripes are considered). + * @return upper bound on memory use by selected columns + */ + virtual uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) = 0; + + /** + * @param names Column Names + * @param stripeIx index of the stripe to be read (if not specified, + * all stripes are considered). + * @return upper bound on memory use by selected columns + */ + virtual uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) = 0; + + /** + * @param include Column Type Ids + * @param stripeIx index of the stripe to be read (if not specified, + * all stripes are considered). + * @return upper bound on memory use by selected columns + */ + virtual uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) = 0; + + /** + * Get BloomFiters of all selected columns in the specified stripe + * @param stripeIndex index of the stripe to be read for bloom filters. + * @param included index of selected columns to return (if not specified, + * all columns that have bloom filters are considered). + * @return map of bloom filters with the key standing for the index of column. + */ + virtual std::map<uint32_t, BloomFilterIndex> + getBloomFilters(uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0; + }; + + /** + * The interface for reading rows in ORC files. + * This is an an abstract class that will be subclassed as necessary. + */ + class RowReader { + public: + virtual ~RowReader(); + /** + * Get the selected type of the rows in the file. The file's row type + * is projected down to just the selected columns. Thus, if the file's + * type is struct<col0:int,col1:double,col2:string> and the selected + * columns are "col0,col2" the selected type would be + * struct<col0:int,col2:string>. + * @return the root type + */ + virtual const Type& getSelectedType() const = 0; + + /** + * Get the selected columns of the file. + */ + virtual const std::vector<bool> getSelectedColumns() const = 0; + + /** + * Create a row batch for reading the selected columns of this file. + * @param size the number of rows to read + * @return a new ColumnVectorBatch to read into + */ + virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size + ) const = 0; + + /** + * Read the next row batch from the current position. + * Caller must look at numElements in the row batch to determine how + * many rows were read. + * @param data the row batch to read into. + * @return true if a non-zero number of rows were read or false if the + * end of the file was reached. + */ + virtual bool next(ColumnVectorBatch& data) = 0; + + /** + * Get the row number of the first row in the previously read batch. + * @return the row number of the previous batch. + */ + virtual uint64_t getRowNumber() const = 0; + + /** + * Seek to a given row. 
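Put together, Reader, RowReader and ColumnVectorBatch give the usual batch loop. A hedged sketch, assuming the reader was opened as in the earlier OrcFile.hh example and that the root struct's first field is a LONG column (an assumption for illustration, not something the header guarantees).

// Sketch: stream the first column of a struct-rooted file, 1024 rows at
// a time, honouring the per-column null flags.
#include <orc/OrcFile.hh>

#include <iostream>
#include <memory>

void dumpFirstColumn(orc::Reader& reader) {
  std::unique_ptr<orc::RowReader> rowReader = reader.createRowReader();
  std::unique_ptr<orc::ColumnVectorBatch> batch = rowReader->createRowBatch(1024);
  while (rowReader->next(*batch)) {  // false once the end of file is reached
    auto& root = dynamic_cast<orc::StructVectorBatch&>(*batch);
    auto& col0 = dynamic_cast<orc::LongVectorBatch&>(*root.fields[0]);
    for (uint64_t i = 0; i < batch->numElements; ++i) {
      if (!col0.hasNulls || col0.notNull[i]) {
        std::cout << col0.data[i] << "\n";
      } else {
        std::cout << "NULL\n";
      }
    }
  }
}

seekToRow(rowNumber) can reposition the same RowReader before the loop when only a suffix of the file is needed.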
+ * @param rowNumber the next row the reader should return + */ + virtual void seekToRow(uint64_t rowNumber) = 0; + + }; +} + +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Statistics.hh b/contrib/libs/apache/orc/c++/include/orc/Statistics.hh index 1d4b0b6558..c7da63a542 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Statistics.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Statistics.hh @@ -1,400 +1,400 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_STATISTICS_HH -#define ORC_STATISTICS_HH - -#include "orc/orc-config.hh" -#include "orc/Type.hh" -#include "orc/Vector.hh" - -namespace orc { - - /** - * Statistics that are available for all types of columns. - */ - class ColumnStatistics { - public: - virtual ~ColumnStatistics(); - - /** - * Get the number of values in this column. It will differ from the number - * of rows because of NULL values. - * @return the number of values - */ - virtual uint64_t getNumberOfValues() const = 0; - - /** - * Check whether column has null value. - * @return true if has null value - */ - virtual bool hasNull() const = 0; - - /** - * Print out statistics of column if any. - */ - virtual std::string toString() const = 0; - }; - - /** - * Statistics for binary columns. - */ - class BinaryColumnStatistics: public ColumnStatistics { - public: - virtual ~BinaryColumnStatistics(); - - /** - * Check whether column has total length. - * @return true if has total length - */ - virtual bool hasTotalLength() const = 0; - - virtual uint64_t getTotalLength() const = 0; - }; - - /** - * Statistics for boolean columns. - */ - class BooleanColumnStatistics: public ColumnStatistics { - public: - virtual ~BooleanColumnStatistics(); - - /** - * Check whether column has true/false count. - * @return true if has true/false count - */ - virtual bool hasCount() const = 0; - - virtual uint64_t getFalseCount() const = 0; - virtual uint64_t getTrueCount() const = 0; - }; - - /** - * Statistics for date columns. - */ - class DateColumnStatistics: public ColumnStatistics { - public: - virtual ~DateColumnStatistics(); - - /** - * Check whether column has minimum. - * @return true if has minimum - */ - virtual bool hasMinimum() const = 0; - - /** - * Check whether column has maximum. - * @return true if has maximum - */ - virtual bool hasMaximum() const = 0; - - /** - * Get the minimum value for the column. - * @return minimum value - */ - virtual int32_t getMinimum() const = 0; - - /** - * Get the maximum value for the column. - * @return maximum value - */ - virtual int32_t getMaximum() const = 0; - }; - - /** - * Statistics for decimal columns. 
- */ - class DecimalColumnStatistics: public ColumnStatistics { - public: - virtual ~DecimalColumnStatistics(); - - /** - * Check whether column has minimum. - * @return true if has minimum - */ - virtual bool hasMinimum() const = 0; - - /** - * Check whether column has maximum. - * @return true if has maximum - */ - virtual bool hasMaximum() const = 0; - - /** - * Check whether column has sum. - * @return true if has sum - */ - virtual bool hasSum() const = 0; - - /** - * Get the minimum value for the column. - * @return minimum value - */ - virtual Decimal getMinimum() const = 0; - - /** - * Get the maximum value for the column. - * @return maximum value - */ - virtual Decimal getMaximum() const = 0; - - /** - * Get the sum for the column. - * @return sum of all the values - */ - virtual Decimal getSum() const = 0; - }; - - /** - * Statistics for float and double columns. - */ - class DoubleColumnStatistics: public ColumnStatistics { - public: - virtual ~DoubleColumnStatistics(); - - /** - * Check whether column has minimum. - * @return true if has minimum - */ - virtual bool hasMinimum() const = 0; - - /** - * Check whether column has maximum. - * @return true if has maximum - */ - virtual bool hasMaximum() const = 0; - - /** - * Check whether column has sum. - * @return true if has sum - */ - virtual bool hasSum() const = 0; - - /** - * Get the smallest value in the column. Only defined if getNumberOfValues - * is non-zero. - * @return the minimum - */ - virtual double getMinimum() const = 0; - - /** - * Get the largest value in the column. Only defined if getNumberOfValues - * is non-zero. - * @return the maximum - */ - virtual double getMaximum() const = 0; - - /** - * Get the sum of the values in the column. - * @return the sum - */ - virtual double getSum() const = 0; - }; - - /** - * Statistics for all of the integer columns, such as byte, short, int, and - * long. - */ - class IntegerColumnStatistics: public ColumnStatistics { - public: - virtual ~IntegerColumnStatistics(); - - /** - * Check whether column has minimum. - * @return true if has minimum - */ - virtual bool hasMinimum() const = 0; - - /** - * Check whether column has maximum. - * @return true if has maximum - */ - virtual bool hasMaximum() const = 0; - - /** - * Check whether column has sum. - * @return true if has sum - */ - virtual bool hasSum() const = 0; - - /** - * Get the smallest value in the column. Only defined if getNumberOfValues - * is non-zero. - * @return the minimum - */ - virtual int64_t getMinimum() const = 0; - - /** - * Get the largest value in the column. Only defined if getNumberOfValues - * is non-zero. - * @return the maximum - */ - virtual int64_t getMaximum() const = 0; - - /** - * Get the sum of the column. Only valid if isSumDefined returns true. - * @return the sum of the column - */ - virtual int64_t getSum() const = 0; - }; - - /** - * Statistics for string columns. - */ - class StringColumnStatistics: public ColumnStatistics { - public: - virtual ~StringColumnStatistics(); - - /** - * Check whether column has minimum. - * @return true if has minimum - */ - virtual bool hasMinimum() const = 0; - - /** - * Check whether column has maximum. - * @return true if has maximum - */ - virtual bool hasMaximum() const = 0; - - /** - * Check whether column has total length. - * @return true if has total length - */ - virtual bool hasTotalLength() const = 0; - - /** - * Get the minimum value for the column. 
- * @return minimum value - */ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_STATISTICS_HH +#define ORC_STATISTICS_HH + +#include "orc/orc-config.hh" +#include "orc/Type.hh" +#include "orc/Vector.hh" + +namespace orc { + + /** + * Statistics that are available for all types of columns. + */ + class ColumnStatistics { + public: + virtual ~ColumnStatistics(); + + /** + * Get the number of values in this column. It will differ from the number + * of rows because of NULL values. + * @return the number of values + */ + virtual uint64_t getNumberOfValues() const = 0; + + /** + * Check whether column has null value. + * @return true if has null value + */ + virtual bool hasNull() const = 0; + + /** + * Print out statistics of column if any. + */ + virtual std::string toString() const = 0; + }; + + /** + * Statistics for binary columns. + */ + class BinaryColumnStatistics: public ColumnStatistics { + public: + virtual ~BinaryColumnStatistics(); + + /** + * Check whether column has total length. + * @return true if has total length + */ + virtual bool hasTotalLength() const = 0; + + virtual uint64_t getTotalLength() const = 0; + }; + + /** + * Statistics for boolean columns. + */ + class BooleanColumnStatistics: public ColumnStatistics { + public: + virtual ~BooleanColumnStatistics(); + + /** + * Check whether column has true/false count. + * @return true if has true/false count + */ + virtual bool hasCount() const = 0; + + virtual uint64_t getFalseCount() const = 0; + virtual uint64_t getTrueCount() const = 0; + }; + + /** + * Statistics for date columns. + */ + class DateColumnStatistics: public ColumnStatistics { + public: + virtual ~DateColumnStatistics(); + + /** + * Check whether column has minimum. + * @return true if has minimum + */ + virtual bool hasMinimum() const = 0; + + /** + * Check whether column has maximum. + * @return true if has maximum + */ + virtual bool hasMaximum() const = 0; + + /** + * Get the minimum value for the column. + * @return minimum value + */ + virtual int32_t getMinimum() const = 0; + + /** + * Get the maximum value for the column. + * @return maximum value + */ + virtual int32_t getMaximum() const = 0; + }; + + /** + * Statistics for decimal columns. + */ + class DecimalColumnStatistics: public ColumnStatistics { + public: + virtual ~DecimalColumnStatistics(); + + /** + * Check whether column has minimum. + * @return true if has minimum + */ + virtual bool hasMinimum() const = 0; + + /** + * Check whether column has maximum. + * @return true if has maximum + */ + virtual bool hasMaximum() const = 0; + + /** + * Check whether column has sum. + * @return true if has sum + */ + virtual bool hasSum() const = 0; + + /** + * Get the minimum value for the column. 
+ * @return minimum value + */ + virtual Decimal getMinimum() const = 0; + + /** + * Get the maximum value for the column. + * @return maximum value + */ + virtual Decimal getMaximum() const = 0; + + /** + * Get the sum for the column. + * @return sum of all the values + */ + virtual Decimal getSum() const = 0; + }; + + /** + * Statistics for float and double columns. + */ + class DoubleColumnStatistics: public ColumnStatistics { + public: + virtual ~DoubleColumnStatistics(); + + /** + * Check whether column has minimum. + * @return true if has minimum + */ + virtual bool hasMinimum() const = 0; + + /** + * Check whether column has maximum. + * @return true if has maximum + */ + virtual bool hasMaximum() const = 0; + + /** + * Check whether column has sum. + * @return true if has sum + */ + virtual bool hasSum() const = 0; + + /** + * Get the smallest value in the column. Only defined if getNumberOfValues + * is non-zero. + * @return the minimum + */ + virtual double getMinimum() const = 0; + + /** + * Get the largest value in the column. Only defined if getNumberOfValues + * is non-zero. + * @return the maximum + */ + virtual double getMaximum() const = 0; + + /** + * Get the sum of the values in the column. + * @return the sum + */ + virtual double getSum() const = 0; + }; + + /** + * Statistics for all of the integer columns, such as byte, short, int, and + * long. + */ + class IntegerColumnStatistics: public ColumnStatistics { + public: + virtual ~IntegerColumnStatistics(); + + /** + * Check whether column has minimum. + * @return true if has minimum + */ + virtual bool hasMinimum() const = 0; + + /** + * Check whether column has maximum. + * @return true if has maximum + */ + virtual bool hasMaximum() const = 0; + + /** + * Check whether column has sum. + * @return true if has sum + */ + virtual bool hasSum() const = 0; + + /** + * Get the smallest value in the column. Only defined if getNumberOfValues + * is non-zero. + * @return the minimum + */ + virtual int64_t getMinimum() const = 0; + + /** + * Get the largest value in the column. Only defined if getNumberOfValues + * is non-zero. + * @return the maximum + */ + virtual int64_t getMaximum() const = 0; + + /** + * Get the sum of the column. Only valid if isSumDefined returns true. + * @return the sum of the column + */ + virtual int64_t getSum() const = 0; + }; + + /** + * Statistics for string columns. + */ + class StringColumnStatistics: public ColumnStatistics { + public: + virtual ~StringColumnStatistics(); + + /** + * Check whether column has minimum. + * @return true if has minimum + */ + virtual bool hasMinimum() const = 0; + + /** + * Check whether column has maximum. + * @return true if has maximum + */ + virtual bool hasMaximum() const = 0; + + /** + * Check whether column has total length. + * @return true if has total length + */ + virtual bool hasTotalLength() const = 0; + + /** + * Get the minimum value for the column. + * @return minimum value + */ virtual const std::string & getMinimum() const = 0; - - /** - * Get the maximum value for the column. - * @return maximum value - */ + + /** + * Get the maximum value for the column. + * @return maximum value + */ virtual const std::string & getMaximum() const = 0; - - /** - * Get the total length of all values. - * @return total length of all the values - */ - virtual uint64_t getTotalLength() const = 0; - }; - - /** - * Statistics for timestamp columns. 
- */ - class TimestampColumnStatistics: public ColumnStatistics { - public: - virtual ~TimestampColumnStatistics(); - - /** - * Check whether column minimum. - * @return true if has minimum - */ - virtual bool hasMinimum() const = 0; - - /** - * Check whether column maximum. - * @return true if has maximum - */ - virtual bool hasMaximum() const = 0; - - /** - * Get the minimum value for the column. - * @return minimum value - */ - virtual int64_t getMinimum() const = 0; - - /** - * Get the maximum value for the column. - * @return maximum value - */ - virtual int64_t getMaximum() const = 0; - - /** - * Check whether column has a lowerBound. - * @return true if column has a lowerBound - */ - virtual bool hasLowerBound() const = 0; - - /** - * Check whether column has an upperBound. - * @return true if column has an upperBound - */ - virtual bool hasUpperBound() const = 0; - - /** - * Get the lowerBound value for the column. - * @return lowerBound value - */ - virtual int64_t getLowerBound() const = 0; - - /** - * Get the upperBound value for the column. - * @return upperBound value - */ - virtual int64_t getUpperBound() const = 0; - - - }; - - class Statistics { - public: - virtual ~Statistics(); - - /** - * Get the statistics of the given column. - * @param colId id of the column - * @return one column's statistics - */ - virtual const ColumnStatistics* getColumnStatistics(uint32_t colId - ) const = 0; - - /** - * Get the number of columns. - * @return the number of columns - */ - virtual uint32_t getNumberOfColumns() const = 0; - }; - - class StripeStatistics : public Statistics { - public: - virtual ~StripeStatistics(); - - /** - * Get the statistics of a given RowIndex entry in a given column. - * @param columnId id of the column - * @param rowIndexId RowIndex entry id - * @return statistics of the given RowIndex entry - */ - virtual const ColumnStatistics* - getRowIndexStatistics( - uint32_t columnId, uint32_t rowIndexId) const = 0; - - /** - * Get the number of RowIndex statistics in a given column. - * @param columnId id of the column - * @return the number of RowIndex statistics - */ - virtual uint32_t getNumberOfRowIndexStats(uint32_t columnId) const = 0; - }; -} - -#endif + + /** + * Get the total length of all values. + * @return total length of all the values + */ + virtual uint64_t getTotalLength() const = 0; + }; + + /** + * Statistics for timestamp columns. + */ + class TimestampColumnStatistics: public ColumnStatistics { + public: + virtual ~TimestampColumnStatistics(); + + /** + * Check whether column minimum. + * @return true if has minimum + */ + virtual bool hasMinimum() const = 0; + + /** + * Check whether column maximum. + * @return true if has maximum + */ + virtual bool hasMaximum() const = 0; + + /** + * Get the minimum value for the column. + * @return minimum value + */ + virtual int64_t getMinimum() const = 0; + + /** + * Get the maximum value for the column. + * @return maximum value + */ + virtual int64_t getMaximum() const = 0; + + /** + * Check whether column has a lowerBound. + * @return true if column has a lowerBound + */ + virtual bool hasLowerBound() const = 0; + + /** + * Check whether column has an upperBound. + * @return true if column has an upperBound + */ + virtual bool hasUpperBound() const = 0; + + /** + * Get the lowerBound value for the column. + * @return lowerBound value + */ + virtual int64_t getLowerBound() const = 0; + + /** + * Get the upperBound value for the column. 
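Reader::getColumnStatistics (declared earlier in Reader.hh) hands back one of the subclasses above, and a dynamic_cast recovers the typed view. A hedged sketch that assumes the given column is an integer column:

// Sketch: print min/max/sum for an integer column, guarding on the
// has*() flags as the interface above requires.
#include <orc/OrcFile.hh>
#include <orc/Statistics.hh>

#include <iostream>
#include <memory>

void printIntegerStats(const orc::Reader& reader, uint32_t columnId) {
  std::unique_ptr<orc::ColumnStatistics> stats = reader.getColumnStatistics(columnId);
  const auto* intStats = dynamic_cast<const orc::IntegerColumnStatistics*>(stats.get());
  if (intStats == nullptr) {
    return;  // not an integer column
  }
  std::cout << "values: " << intStats->getNumberOfValues() << "\n";
  if (intStats->hasMinimum() && intStats->hasMaximum()) {
    std::cout << "min: " << intStats->getMinimum()
              << ", max: " << intStats->getMaximum() << "\n";
  }
  if (intStats->hasSum()) {
    std::cout << "sum: " << intStats->getSum() << "\n";
  }
}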
+ * @return upperBound value + */ + virtual int64_t getUpperBound() const = 0; + + + }; + + class Statistics { + public: + virtual ~Statistics(); + + /** + * Get the statistics of the given column. + * @param colId id of the column + * @return one column's statistics + */ + virtual const ColumnStatistics* getColumnStatistics(uint32_t colId + ) const = 0; + + /** + * Get the number of columns. + * @return the number of columns + */ + virtual uint32_t getNumberOfColumns() const = 0; + }; + + class StripeStatistics : public Statistics { + public: + virtual ~StripeStatistics(); + + /** + * Get the statistics of a given RowIndex entry in a given column. + * @param columnId id of the column + * @param rowIndexId RowIndex entry id + * @return statistics of the given RowIndex entry + */ + virtual const ColumnStatistics* + getRowIndexStatistics( + uint32_t columnId, uint32_t rowIndexId) const = 0; + + /** + * Get the number of RowIndex statistics in a given column. + * @param columnId id of the column + * @return the number of RowIndex statistics + */ + virtual uint32_t getNumberOfRowIndexStats(uint32_t columnId) const = 0; + }; +} + +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Type.hh b/contrib/libs/apache/orc/c++/include/orc/Type.hh index c0cbf2d671..ba0f87e9b2 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Type.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Type.hh @@ -1,111 +1,111 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_TYPE_HH -#define ORC_TYPE_HH - -#include "orc/orc-config.hh" -#include "orc/Vector.hh" -#include "MemoryPool.hh" - -namespace orc { - - enum TypeKind { - BOOLEAN = 0, - BYTE = 1, - SHORT = 2, - INT = 3, - LONG = 4, - FLOAT = 5, - DOUBLE = 6, - STRING = 7, - BINARY = 8, - TIMESTAMP = 9, - LIST = 10, - MAP = 11, - STRUCT = 12, - UNION = 13, - DECIMAL = 14, - DATE = 15, - VARCHAR = 16, - CHAR = 17 - }; - - class Type { - public: - virtual ~Type(); - virtual uint64_t getColumnId() const = 0; - virtual uint64_t getMaximumColumnId() const = 0; - virtual TypeKind getKind() const = 0; - virtual uint64_t getSubtypeCount() const = 0; - virtual const Type* getSubtype(uint64_t childId) const = 0; - virtual const std::string& getFieldName(uint64_t childId) const = 0; - virtual uint64_t getMaximumLength() const = 0; - virtual uint64_t getPrecision() const = 0; - virtual uint64_t getScale() const = 0; - virtual std::string toString() const = 0; - - /** - * Create a row batch for this type. - */ - virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size, - MemoryPool& pool, - bool encoded = false - ) const = 0; - - /** - * Add a new field to a struct type. 
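StripeStatistics extends the same Statistics interface with per-RowIndex entries. A hedged sketch that walks every stripe's column statistics, assuming the file was written with statistics enabled:

// Sketch: per-stripe value counts for every column.
#include <orc/OrcFile.hh>
#include <orc/Statistics.hh>

#include <iostream>
#include <memory>

void dumpStripeValueCounts(const orc::Reader& reader) {
  for (uint64_t s = 0; s < reader.getNumberOfStripeStatistics(); ++s) {
    std::unique_ptr<orc::StripeStatistics> stripe = reader.getStripeStatistics(s);
    for (uint32_t col = 0; col < stripe->getNumberOfColumns(); ++col) {
      const orc::ColumnStatistics* colStats = stripe->getColumnStatistics(col);
      std::cout << "stripe " << s << ", column " << col << ": "
                << colStats->getNumberOfValues() << " values\n";
    }
  }
}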
- * @param fieldName the name of the new field - * @param fieldType the type of the new field - * @return a reference to the struct type - */ - virtual Type* addStructField(const std::string& fieldName, - ORC_UNIQUE_PTR<Type> fieldType) = 0; - - /** - * Add a new child to a union type. - * @param fieldType the type of the new field - * @return a reference to the union type - */ - virtual Type* addUnionChild(ORC_UNIQUE_PTR<Type> fieldType) = 0; - - /** - * Build a Type object from string text representation. - */ - static ORC_UNIQUE_PTR<Type> buildTypeFromString(const std::string& input); - }; - - const int64_t DEFAULT_DECIMAL_SCALE = 18; - const int64_t DEFAULT_DECIMAL_PRECISION = 38; - - ORC_UNIQUE_PTR<Type> createPrimitiveType(TypeKind kind); - ORC_UNIQUE_PTR<Type> createCharType(TypeKind kind, - uint64_t maxLength); - ORC_UNIQUE_PTR<Type> - createDecimalType(uint64_t precision= - DEFAULT_DECIMAL_PRECISION, - uint64_t scale=DEFAULT_DECIMAL_SCALE); - - ORC_UNIQUE_PTR<Type> createStructType(); - ORC_UNIQUE_PTR<Type> createListType(ORC_UNIQUE_PTR<Type> elements); - ORC_UNIQUE_PTR<Type> createMapType(ORC_UNIQUE_PTR<Type> key, - ORC_UNIQUE_PTR<Type> value); - ORC_UNIQUE_PTR<Type> createUnionType(); - -} -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_TYPE_HH +#define ORC_TYPE_HH + +#include "orc/orc-config.hh" +#include "orc/Vector.hh" +#include "MemoryPool.hh" + +namespace orc { + + enum TypeKind { + BOOLEAN = 0, + BYTE = 1, + SHORT = 2, + INT = 3, + LONG = 4, + FLOAT = 5, + DOUBLE = 6, + STRING = 7, + BINARY = 8, + TIMESTAMP = 9, + LIST = 10, + MAP = 11, + STRUCT = 12, + UNION = 13, + DECIMAL = 14, + DATE = 15, + VARCHAR = 16, + CHAR = 17 + }; + + class Type { + public: + virtual ~Type(); + virtual uint64_t getColumnId() const = 0; + virtual uint64_t getMaximumColumnId() const = 0; + virtual TypeKind getKind() const = 0; + virtual uint64_t getSubtypeCount() const = 0; + virtual const Type* getSubtype(uint64_t childId) const = 0; + virtual const std::string& getFieldName(uint64_t childId) const = 0; + virtual uint64_t getMaximumLength() const = 0; + virtual uint64_t getPrecision() const = 0; + virtual uint64_t getScale() const = 0; + virtual std::string toString() const = 0; + + /** + * Create a row batch for this type. + */ + virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size, + MemoryPool& pool, + bool encoded = false + ) const = 0; + + /** + * Add a new field to a struct type. + * @param fieldName the name of the new field + * @param fieldType the type of the new field + * @return a reference to the struct type + */ + virtual Type* addStructField(const std::string& fieldName, + ORC_UNIQUE_PTR<Type> fieldType) = 0; + + /** + * Add a new child to a union type. 
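The Type factory functions listed above compose into a schema tree. A hedged sketch; the field names are illustrative only.

// Sketch: build struct<id:bigint,name:string,price:decimal(10,2)> with
// the factory functions, then print its text form.
#include <orc/Type.hh>

#include <iostream>
#include <memory>

int main() {
  std::unique_ptr<orc::Type> schema = orc::createStructType();
  schema->addStructField("id", orc::createPrimitiveType(orc::LONG));
  schema->addStructField("name", orc::createPrimitiveType(orc::STRING));
  schema->addStructField("price", orc::createDecimalType(10, 2));
  std::cout << schema->toString() << std::endl;
  return 0;
}

Type::buildTypeFromString("struct<id:bigint,name:string,price:decimal(10,2)>") should yield an equivalent tree.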
+ * @param fieldType the type of the new field + * @return a reference to the union type + */ + virtual Type* addUnionChild(ORC_UNIQUE_PTR<Type> fieldType) = 0; + + /** + * Build a Type object from string text representation. + */ + static ORC_UNIQUE_PTR<Type> buildTypeFromString(const std::string& input); + }; + + const int64_t DEFAULT_DECIMAL_SCALE = 18; + const int64_t DEFAULT_DECIMAL_PRECISION = 38; + + ORC_UNIQUE_PTR<Type> createPrimitiveType(TypeKind kind); + ORC_UNIQUE_PTR<Type> createCharType(TypeKind kind, + uint64_t maxLength); + ORC_UNIQUE_PTR<Type> + createDecimalType(uint64_t precision= + DEFAULT_DECIMAL_PRECISION, + uint64_t scale=DEFAULT_DECIMAL_SCALE); + + ORC_UNIQUE_PTR<Type> createStructType(); + ORC_UNIQUE_PTR<Type> createListType(ORC_UNIQUE_PTR<Type> elements); + ORC_UNIQUE_PTR<Type> createMapType(ORC_UNIQUE_PTR<Type> key, + ORC_UNIQUE_PTR<Type> value); + ORC_UNIQUE_PTR<Type> createUnionType(); + +} +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Vector.hh b/contrib/libs/apache/orc/c++/include/orc/Vector.hh index 629c0b7f6b..97bba1ef83 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Vector.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Vector.hh @@ -1,326 +1,326 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_VECTOR_HH -#define ORC_VECTOR_HH - -#include "orc/orc-config.hh" -#include "MemoryPool.hh" -#include "Int128.hh" - -#include <list> -#include <memory> -#include <cstring> -#include <vector> -#include <stdexcept> -#include <cstdlib> -#include <iostream> - -namespace orc { - - /** - * The base class for each of the column vectors. This class handles - * the generic attributes such as number of elements, capacity, and - * notNull vector. - */ - struct ColumnVectorBatch { - ColumnVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~ColumnVectorBatch(); - - // the number of slots available - uint64_t capacity; - // the number of current occupied slots - uint64_t numElements; - // an array of capacity length marking non-null values - DataBuffer<char> notNull; - // whether there are any null values - bool hasNulls; - // whether the vector batch is encoded - bool isEncoded; - - // custom memory pool - MemoryPool& memoryPool; - - /** - * Generate a description of this vector as a string. - */ - virtual std::string toString() const = 0; - - /** - * Change the number of slots to at least the given capacity. - * This function is not recursive into subtypes. - */ - virtual void resize(uint64_t capacity); - - /** - * Empties the vector from all its elements, recursively. - * Do not alter the current capacity. - */ - virtual void clear(); - - /** - * Heap memory used by the batch. 
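The factory functions and Type::addStructField declared in the Type.hh hunk above compose directly into a schema, and createRowBatch then materializes a matching vector hierarchy. A minimal sketch, assuming the public orc headers are on the include path; getDefaultPool() comes from MemoryPool.hh, which is not part of this hunk:

    #include "orc/Type.hh"
    #include "orc/MemoryPool.hh"   // for getDefaultPool(); not shown in this diff
    #include <iostream>
    #include <memory>

    int main() {
      // Intended schema: struct<id:bigint,name:string,score:decimal(10,2)>
      std::unique_ptr<orc::Type> schema = orc::createStructType();
      schema->addStructField("id", orc::createPrimitiveType(orc::LONG));
      schema->addStructField("name", orc::createPrimitiveType(orc::STRING));
      schema->addStructField("score", orc::createDecimalType(10, 2));
      std::cout << schema->toString() << std::endl;

      // A batch with room for 1024 rows, allocated from the default pool.
      std::unique_ptr<orc::ColumnVectorBatch> batch =
          schema->createRowBatch(1024, *orc::getDefaultPool());
      std::cout << batch->toString() << std::endl;
      return 0;
    }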
- */ - virtual uint64_t getMemoryUsage(); - - /** - * Check whether the batch length varies depending on data. - */ - virtual bool hasVariableLength(); - - private: - ColumnVectorBatch(const ColumnVectorBatch&); - ColumnVectorBatch& operator=(const ColumnVectorBatch&); - }; - - struct LongVectorBatch: public ColumnVectorBatch { - LongVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~LongVectorBatch(); - - DataBuffer<int64_t> data; - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - }; - - struct DoubleVectorBatch: public ColumnVectorBatch { - DoubleVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~DoubleVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - - DataBuffer<double> data; - }; - - struct StringVectorBatch: public ColumnVectorBatch { - StringVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~StringVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - - // pointers to the start of each string - DataBuffer<char*> data; - // the length of each string - DataBuffer<int64_t> length; - // string blob - DataBuffer<char> blob; - }; - - struct StringDictionary { - StringDictionary(MemoryPool& pool); - DataBuffer<char> dictionaryBlob; - - // Offset for each dictionary key entry. - DataBuffer<int64_t> dictionaryOffset; - - void getValueByIndex(int64_t index, char*& valPtr, int64_t& length) { - if (index < 0 || static_cast<uint64_t>(index) >= dictionaryOffset.size()) { - throw std::out_of_range("index out of range."); - } - - int64_t* offsetPtr = dictionaryOffset.data(); - - valPtr = dictionaryBlob.data() + offsetPtr[index]; - length = offsetPtr[index + 1] - offsetPtr[index]; - } - }; - - /** - * Include a index array with reference to corresponding dictionary. - * User first obtain index from index array and retrieve string pointer - * and length by calling getValueByIndex() from dictionary. - */ - struct EncodedStringVectorBatch : public StringVectorBatch { - EncodedStringVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~EncodedStringVectorBatch(); - std::string toString() const; - std::shared_ptr<StringDictionary> dictionary; - - // index for dictionary entry - DataBuffer<int64_t> index; - }; - - struct StructVectorBatch: public ColumnVectorBatch { - StructVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~StructVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - bool hasVariableLength(); - - std::vector<ColumnVectorBatch*> fields; - }; - - struct ListVectorBatch: public ColumnVectorBatch { - ListVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~ListVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - bool hasVariableLength(); - - /** - * The offset of the first element of each list. - * The length of list i is offsets[i+1] - offsets[i]. 
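The offsets convention documented just above (the length of list i is offsets[i+1] - offsets[i]) is also how a consumer walks a decoded ListVectorBatch. A small sketch, assuming the batch was already populated by a reader:

    #include "orc/Vector.hh"
    #include <iostream>

    // Print the element count of every list row; nulls are reported separately.
    void printListLengths(orc::ListVectorBatch& batch) {
      const int64_t* offsets = batch.offsets.data();
      for (uint64_t i = 0; i < batch.numElements; ++i) {
        if (batch.hasNulls && !batch.notNull[i]) {
          std::cout << "row " << i << ": null" << std::endl;
        } else {
          std::cout << "row " << i << ": " << (offsets[i + 1] - offsets[i])
                    << " elements" << std::endl;
        }
      }
    }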
- */ - DataBuffer<int64_t> offsets; - - // the concatenated elements - ORC_UNIQUE_PTR<ColumnVectorBatch> elements; - }; - - struct MapVectorBatch: public ColumnVectorBatch { - MapVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~MapVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - bool hasVariableLength(); - - /** - * The offset of the first element of each map. - * The size of map i is offsets[i+1] - offsets[i]. - */ - DataBuffer<int64_t> offsets; - - // the concatenated keys - ORC_UNIQUE_PTR<ColumnVectorBatch> keys; - // the concatenated elements - ORC_UNIQUE_PTR<ColumnVectorBatch> elements; - }; - - struct UnionVectorBatch: public ColumnVectorBatch { - UnionVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~UnionVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - bool hasVariableLength(); - - /** - * For each value, which element of children has the value. - */ - DataBuffer<unsigned char> tags; - - /** - * For each value, the index inside of the child ColumnVectorBatch. - */ - DataBuffer<uint64_t> offsets; - - // the sub-columns - std::vector<ColumnVectorBatch*> children; - }; - - struct Decimal { - Decimal(const Int128& value, int32_t scale); - explicit Decimal(const std::string& value); - Decimal(); - - std::string toString() const; - Int128 value; - int32_t scale; - }; - - struct Decimal64VectorBatch: public ColumnVectorBatch { - Decimal64VectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~Decimal64VectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - - // total number of digits - int32_t precision; - // the number of places after the decimal - int32_t scale; - - // the numeric values - DataBuffer<int64_t> values; - - protected: - /** - * Contains the scales that were read from the file. Should NOT be - * used. - */ - DataBuffer<int64_t> readScales; - friend class Decimal64ColumnReader; - friend class Decimal64ColumnWriter; - }; - - struct Decimal128VectorBatch: public ColumnVectorBatch { - Decimal128VectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~Decimal128VectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - - // total number of digits - int32_t precision; - // the number of places after the decimal - int32_t scale; - - // the numeric values - DataBuffer<Int128> values; - - protected: - /** - * Contains the scales that were read from the file. Should NOT be - * used. - */ - DataBuffer<int64_t> readScales; - friend class Decimal128ColumnReader; - friend class DecimalHive11ColumnReader; - friend class Decimal128ColumnWriter; - }; - - /** - * A column vector batch for storing timestamp values. - * The timestamps are stored split into the time_t value (seconds since - * 1 Jan 1970 00:00:00) and the nanoseconds within the time_t value. - */ - struct TimestampVectorBatch: public ColumnVectorBatch { - TimestampVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~TimestampVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - - // the number of seconds past 1 Jan 1970 00:00 UTC (aka time_t) - // Note that we always assume data is in GMT timezone; therefore it is - // user's responsibility to convert wall clock time in local timezone - // to GMT. 
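TimestampVectorBatch, described at the end of this hunk, stores each value as UTC seconds plus nanoseconds. A hedged sketch of filling one slot; timegm() is a POSIX-style extension (the Adaptor.hh hunk later in this diff maps it to _mkgmtime on MSVC), and the batch is assumed to have been created with capacity for at least one row:

    #include "orc/Vector.hh"
    #include <ctime>

    // Store 2015-01-01 00:00:00.000000123 UTC into slot 0 of a timestamp batch.
    void fillOneTimestamp(orc::TimestampVectorBatch& tsBatch) {
      struct tm timeStruct = {};
      timeStruct.tm_year = 115;   // years since 1900 -> 2015
      timeStruct.tm_mon = 0;      // January
      timeStruct.tm_mday = 1;
      // timegm interprets the struct as UTC, matching the GMT note above.
      tsBatch.data[0] = static_cast<int64_t>(timegm(&timeStruct));
      tsBatch.nanoseconds[0] = 123;
      tsBatch.numElements = 1;
      tsBatch.hasNulls = false;
    }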
- DataBuffer<int64_t> data; - - // the nanoseconds of each value - DataBuffer<int64_t> nanoseconds; - }; - -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_VECTOR_HH +#define ORC_VECTOR_HH + +#include "orc/orc-config.hh" +#include "MemoryPool.hh" +#include "Int128.hh" + +#include <list> +#include <memory> +#include <cstring> +#include <vector> +#include <stdexcept> +#include <cstdlib> +#include <iostream> + +namespace orc { + + /** + * The base class for each of the column vectors. This class handles + * the generic attributes such as number of elements, capacity, and + * notNull vector. + */ + struct ColumnVectorBatch { + ColumnVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~ColumnVectorBatch(); + + // the number of slots available + uint64_t capacity; + // the number of current occupied slots + uint64_t numElements; + // an array of capacity length marking non-null values + DataBuffer<char> notNull; + // whether there are any null values + bool hasNulls; + // whether the vector batch is encoded + bool isEncoded; + + // custom memory pool + MemoryPool& memoryPool; + + /** + * Generate a description of this vector as a string. + */ + virtual std::string toString() const = 0; + + /** + * Change the number of slots to at least the given capacity. + * This function is not recursive into subtypes. + */ + virtual void resize(uint64_t capacity); + + /** + * Empties the vector from all its elements, recursively. + * Do not alter the current capacity. + */ + virtual void clear(); + + /** + * Heap memory used by the batch. + */ + virtual uint64_t getMemoryUsage(); + + /** + * Check whether the batch length varies depending on data. 
+ */ + virtual bool hasVariableLength(); + + private: + ColumnVectorBatch(const ColumnVectorBatch&); + ColumnVectorBatch& operator=(const ColumnVectorBatch&); + }; + + struct LongVectorBatch: public ColumnVectorBatch { + LongVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~LongVectorBatch(); + + DataBuffer<int64_t> data; + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + }; + + struct DoubleVectorBatch: public ColumnVectorBatch { + DoubleVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~DoubleVectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + + DataBuffer<double> data; + }; + + struct StringVectorBatch: public ColumnVectorBatch { + StringVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~StringVectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + + // pointers to the start of each string + DataBuffer<char*> data; + // the length of each string + DataBuffer<int64_t> length; + // string blob + DataBuffer<char> blob; + }; + + struct StringDictionary { + StringDictionary(MemoryPool& pool); + DataBuffer<char> dictionaryBlob; + + // Offset for each dictionary key entry. + DataBuffer<int64_t> dictionaryOffset; + + void getValueByIndex(int64_t index, char*& valPtr, int64_t& length) { + if (index < 0 || static_cast<uint64_t>(index) >= dictionaryOffset.size()) { + throw std::out_of_range("index out of range."); + } + + int64_t* offsetPtr = dictionaryOffset.data(); + + valPtr = dictionaryBlob.data() + offsetPtr[index]; + length = offsetPtr[index + 1] - offsetPtr[index]; + } + }; + + /** + * Include a index array with reference to corresponding dictionary. + * User first obtain index from index array and retrieve string pointer + * and length by calling getValueByIndex() from dictionary. + */ + struct EncodedStringVectorBatch : public StringVectorBatch { + EncodedStringVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~EncodedStringVectorBatch(); + std::string toString() const; + std::shared_ptr<StringDictionary> dictionary; + + // index for dictionary entry + DataBuffer<int64_t> index; + }; + + struct StructVectorBatch: public ColumnVectorBatch { + StructVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~StructVectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + bool hasVariableLength(); + + std::vector<ColumnVectorBatch*> fields; + }; + + struct ListVectorBatch: public ColumnVectorBatch { + ListVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~ListVectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + bool hasVariableLength(); + + /** + * The offset of the first element of each list. + * The length of list i is offsets[i+1] - offsets[i]. + */ + DataBuffer<int64_t> offsets; + + // the concatenated elements + ORC_UNIQUE_PTR<ColumnVectorBatch> elements; + }; + + struct MapVectorBatch: public ColumnVectorBatch { + MapVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~MapVectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + bool hasVariableLength(); + + /** + * The offset of the first element of each map. + * The size of map i is offsets[i+1] - offsets[i]. 
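EncodedStringVectorBatch, shown a little earlier in this hunk, keeps per-row dictionary indexes next to a shared StringDictionary; a row is materialized through getValueByIndex. A sketch, assuming the batch came from a reader that was asked for encoded (dictionary) batches:

    #include "orc/Vector.hh"
    #include <string>

    // Resolve row i of a dictionary-encoded string column to its bytes.
    std::string valueAt(orc::EncodedStringVectorBatch& batch, uint64_t i) {
      if (batch.hasNulls && !batch.notNull[i]) {
        return std::string();   // null; callers usually track this separately
      }
      char* valPtr = nullptr;
      int64_t length = 0;
      batch.dictionary->getValueByIndex(batch.index[i], valPtr, length);
      return std::string(valPtr, static_cast<size_t>(length));
    }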
+ */ + DataBuffer<int64_t> offsets; + + // the concatenated keys + ORC_UNIQUE_PTR<ColumnVectorBatch> keys; + // the concatenated elements + ORC_UNIQUE_PTR<ColumnVectorBatch> elements; + }; + + struct UnionVectorBatch: public ColumnVectorBatch { + UnionVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~UnionVectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + bool hasVariableLength(); + + /** + * For each value, which element of children has the value. + */ + DataBuffer<unsigned char> tags; + + /** + * For each value, the index inside of the child ColumnVectorBatch. + */ + DataBuffer<uint64_t> offsets; + + // the sub-columns + std::vector<ColumnVectorBatch*> children; + }; + + struct Decimal { + Decimal(const Int128& value, int32_t scale); + explicit Decimal(const std::string& value); + Decimal(); + + std::string toString() const; + Int128 value; + int32_t scale; + }; + + struct Decimal64VectorBatch: public ColumnVectorBatch { + Decimal64VectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~Decimal64VectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + + // total number of digits + int32_t precision; + // the number of places after the decimal + int32_t scale; + + // the numeric values + DataBuffer<int64_t> values; + + protected: + /** + * Contains the scales that were read from the file. Should NOT be + * used. + */ + DataBuffer<int64_t> readScales; + friend class Decimal64ColumnReader; + friend class Decimal64ColumnWriter; + }; + + struct Decimal128VectorBatch: public ColumnVectorBatch { + Decimal128VectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~Decimal128VectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + + // total number of digits + int32_t precision; + // the number of places after the decimal + int32_t scale; + + // the numeric values + DataBuffer<Int128> values; + + protected: + /** + * Contains the scales that were read from the file. Should NOT be + * used. + */ + DataBuffer<int64_t> readScales; + friend class Decimal128ColumnReader; + friend class DecimalHive11ColumnReader; + friend class Decimal128ColumnWriter; + }; + + /** + * A column vector batch for storing timestamp values. + * The timestamps are stored split into the time_t value (seconds since + * 1 Jan 1970 00:00:00) and the nanoseconds within the time_t value. + */ + struct TimestampVectorBatch: public ColumnVectorBatch { + TimestampVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~TimestampVectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + + // the number of seconds past 1 Jan 1970 00:00 UTC (aka time_t) + // Note that we always assume data is in GMT timezone; therefore it is + // user's responsibility to convert wall clock time in local timezone + // to GMT. + DataBuffer<int64_t> data; + + // the nanoseconds of each value + DataBuffer<int64_t> nanoseconds; + }; + +} + +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Writer.hh b/contrib/libs/apache/orc/c++/include/orc/Writer.hh index 5b333861b1..2588d62151 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Writer.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Writer.hh @@ -1,252 +1,252 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
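Before the Writer.hh hunk below, it is worth noting how the batch structs above are populated on the write path: createRowBatch() returns the root as a ColumnVectorBatch, and callers downcast to the concrete batch types. A minimal sketch for a struct<x:bigint> schema with at least two rows of capacity:

    #include "orc/Vector.hh"

    // Fill two rows of a struct<x:bigint> batch before handing it to a writer.
    void fillRows(orc::ColumnVectorBatch& rowBatch) {
      auto& structBatch = dynamic_cast<orc::StructVectorBatch&>(rowBatch);
      auto& xBatch = dynamic_cast<orc::LongVectorBatch&>(*structBatch.fields[0]);

      xBatch.data[0] = 42;
      xBatch.data[1] = 7;

      structBatch.numElements = 2;
      xBatch.numElements = 2;
      structBatch.hasNulls = false;
      xBatch.hasNulls = false;
    }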
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_WRITER_HH -#define ORC_WRITER_HH - -#include "orc/Common.hh" -#include "orc/orc-config.hh" -#include "orc/Type.hh" -#include "orc/Vector.hh" - -#include <memory> -#include <set> -#include <string> -#include <vector> - -namespace orc { - - // classes that hold data members so we can maintain binary compatibility - struct WriterOptionsPrivate; - - enum CompressionStrategy { - CompressionStrategy_SPEED = 0, - CompressionStrategy_COMPRESSION - }; - - enum RleVersion { - RleVersion_1 = 0, - RleVersion_2 = 1 - }; - - class Timezone; - - /** - * Options for creating a Writer. - */ - class WriterOptions { - private: - ORC_UNIQUE_PTR<WriterOptionsPrivate> privateBits; - - public: - WriterOptions(); - WriterOptions(const WriterOptions&); - WriterOptions(WriterOptions&); - WriterOptions& operator=(const WriterOptions&); - virtual ~WriterOptions(); - - /** - * Set the strip size. - */ - WriterOptions& setStripeSize(uint64_t size); - - /** - * Get the strip size. - * @return if not set, return default value. - */ - uint64_t getStripeSize() const; - - /** - * Set the data compression block size. - */ - WriterOptions& setCompressionBlockSize(uint64_t size); - - /** - * Get the data compression block size. - * @return if not set, return default value. - */ - uint64_t getCompressionBlockSize() const; - - /** - * Set row index stride (the number of rows per an entry in the row index). Use value 0 to disable row index. - */ - WriterOptions& setRowIndexStride(uint64_t stride); - - /** - * Get the row index stride (the number of rows per an entry in the row index). - * @return if not set, return default value. - */ - uint64_t getRowIndexStride() const; - - /** - * Set the dictionary key size threshold. - * 0 to disable dictionary encoding. - * 1 to always enable dictionary encoding. - */ - WriterOptions& setDictionaryKeySizeThreshold(double val); - - /** - * Get the dictionary key size threshold. - */ - double getDictionaryKeySizeThreshold() const; - - /** - * Set Orc file version - */ - WriterOptions& setFileVersion(const FileVersion& version); - - /** - * Get Orc file version - */ - FileVersion getFileVersion() const; - - /** - * Set compression kind. - */ - WriterOptions& setCompression(CompressionKind comp); - - /** - * Get the compression kind. - * @return if not set, return default value which is ZLIB. - */ - CompressionKind getCompression() const; - - /** - * Set the compression strategy. - */ - WriterOptions& setCompressionStrategy(CompressionStrategy strategy); - - /** - * Get the compression strategy. - * @return if not set, return default value which is speed. - */ - CompressionStrategy getCompressionStrategy() const; - - /** - * Get if the bitpacking should be aligned. 
- * @return true if should be aligned, return false otherwise - */ - bool getAlignedBitpacking() const; - - /** - * Set the padding tolerance. - */ - WriterOptions& setPaddingTolerance(double tolerance); - - /** - * Get the padding tolerance. - * @return if not set, return default value which is zero. - */ - double getPaddingTolerance() const; - - /** - * Set the memory pool. - */ - WriterOptions& setMemoryPool(MemoryPool * memoryPool); - - /** - * Get the memory pool. - * @return if not set, return default memory pool. - */ - MemoryPool * getMemoryPool() const; - - /** - * Set the error stream. - */ - WriterOptions& setErrorStream(std::ostream& errStream); - - /** - * Get the error stream. - * @return if not set, return std::err. - */ - std::ostream * getErrorStream() const; - - /** - * Get the RLE version. - */ - RleVersion getRleVersion() const; - - /** - * Get whether or not to write row group index - * @return if not set, the default is false - */ - bool getEnableIndex() const; - - /** - * Get whether or not to enable dictionary encoding - * @return if not set, the default is false - */ - bool getEnableDictionary() const; - - /** - * Set columns that use BloomFilter - */ - WriterOptions& setColumnsUseBloomFilter(const std::set<uint64_t>& columns); - - /** - * Get whether this column uses BloomFilter - */ - bool isColumnUseBloomFilter(uint64_t column) const; - - /** - * Set false positive probability of BloomFilter - */ - WriterOptions& setBloomFilterFPP(double fpp); - - /** - * Get false positive probability of BloomFilter - */ - double getBloomFilterFPP() const; - - /** - * Get version of BloomFilter - */ - BloomFilterVersion getBloomFilterVersion() const; - }; - - class Writer { - public: - virtual ~Writer(); - - /** - * Create a row batch for writing the columns into this file. - * @param size the number of rows to write. - * @return a new ColumnVectorBatch to write into. - */ - virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size - ) const = 0; - - /** - * Add a row batch into current writer. - * @param rowsToAdd the row batch data to write. - */ - virtual void add(ColumnVectorBatch& rowsToAdd) = 0; - - /** - * Close the writer and flush any pending data to the output stream. - */ - virtual void close() = 0; - - /** - * Add user metadata to the writer. - */ - virtual void addUserMetadata(const std::string name, const std::string value) = 0; - }; -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
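Each WriterOptions setter in the hunk above returns the options object, so calls chain. A small configuration sketch; the numeric values are illustrative rather than recommended defaults, and CompressionKind_ZLIB comes from Common.hh, which is not part of this section:

    #include "orc/Writer.hh"

    orc::WriterOptions makeOptions() {
      orc::WriterOptions options;
      options.setStripeSize(64 * 1024 * 1024)        // 64 MiB stripes
             .setCompressionBlockSize(64 * 1024)     // 64 KiB compression chunks
             .setCompression(orc::CompressionKind_ZLIB)
             .setCompressionStrategy(orc::CompressionStrategy_SPEED)
             .setRowIndexStride(10000)               // 0 would disable the row index
             .setDictionaryKeySizeThreshold(0.8)
             .setColumnsUseBloomFilter({1, 2})       // column ids, illustrative
             .setBloomFilterFPP(0.05);
      return options;
    }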
+ */ + +#ifndef ORC_WRITER_HH +#define ORC_WRITER_HH + +#include "orc/Common.hh" +#include "orc/orc-config.hh" +#include "orc/Type.hh" +#include "orc/Vector.hh" + +#include <memory> +#include <set> +#include <string> +#include <vector> + +namespace orc { + + // classes that hold data members so we can maintain binary compatibility + struct WriterOptionsPrivate; + + enum CompressionStrategy { + CompressionStrategy_SPEED = 0, + CompressionStrategy_COMPRESSION + }; + + enum RleVersion { + RleVersion_1 = 0, + RleVersion_2 = 1 + }; + + class Timezone; + + /** + * Options for creating a Writer. + */ + class WriterOptions { + private: + ORC_UNIQUE_PTR<WriterOptionsPrivate> privateBits; + + public: + WriterOptions(); + WriterOptions(const WriterOptions&); + WriterOptions(WriterOptions&); + WriterOptions& operator=(const WriterOptions&); + virtual ~WriterOptions(); + + /** + * Set the strip size. + */ + WriterOptions& setStripeSize(uint64_t size); + + /** + * Get the strip size. + * @return if not set, return default value. + */ + uint64_t getStripeSize() const; + + /** + * Set the data compression block size. + */ + WriterOptions& setCompressionBlockSize(uint64_t size); + + /** + * Get the data compression block size. + * @return if not set, return default value. + */ + uint64_t getCompressionBlockSize() const; + + /** + * Set row index stride (the number of rows per an entry in the row index). Use value 0 to disable row index. + */ + WriterOptions& setRowIndexStride(uint64_t stride); + + /** + * Get the row index stride (the number of rows per an entry in the row index). + * @return if not set, return default value. + */ + uint64_t getRowIndexStride() const; + + /** + * Set the dictionary key size threshold. + * 0 to disable dictionary encoding. + * 1 to always enable dictionary encoding. + */ + WriterOptions& setDictionaryKeySizeThreshold(double val); + + /** + * Get the dictionary key size threshold. + */ + double getDictionaryKeySizeThreshold() const; + + /** + * Set Orc file version + */ + WriterOptions& setFileVersion(const FileVersion& version); + + /** + * Get Orc file version + */ + FileVersion getFileVersion() const; + + /** + * Set compression kind. + */ + WriterOptions& setCompression(CompressionKind comp); + + /** + * Get the compression kind. + * @return if not set, return default value which is ZLIB. + */ + CompressionKind getCompression() const; + + /** + * Set the compression strategy. + */ + WriterOptions& setCompressionStrategy(CompressionStrategy strategy); + + /** + * Get the compression strategy. + * @return if not set, return default value which is speed. + */ + CompressionStrategy getCompressionStrategy() const; + + /** + * Get if the bitpacking should be aligned. + * @return true if should be aligned, return false otherwise + */ + bool getAlignedBitpacking() const; + + /** + * Set the padding tolerance. + */ + WriterOptions& setPaddingTolerance(double tolerance); + + /** + * Get the padding tolerance. + * @return if not set, return default value which is zero. + */ + double getPaddingTolerance() const; + + /** + * Set the memory pool. + */ + WriterOptions& setMemoryPool(MemoryPool * memoryPool); + + /** + * Get the memory pool. + * @return if not set, return default memory pool. + */ + MemoryPool * getMemoryPool() const; + + /** + * Set the error stream. + */ + WriterOptions& setErrorStream(std::ostream& errStream); + + /** + * Get the error stream. + * @return if not set, return std::err. + */ + std::ostream * getErrorStream() const; + + /** + * Get the RLE version. 
+ */ + RleVersion getRleVersion() const; + + /** + * Get whether or not to write row group index + * @return if not set, the default is false + */ + bool getEnableIndex() const; + + /** + * Get whether or not to enable dictionary encoding + * @return if not set, the default is false + */ + bool getEnableDictionary() const; + + /** + * Set columns that use BloomFilter + */ + WriterOptions& setColumnsUseBloomFilter(const std::set<uint64_t>& columns); + + /** + * Get whether this column uses BloomFilter + */ + bool isColumnUseBloomFilter(uint64_t column) const; + + /** + * Set false positive probability of BloomFilter + */ + WriterOptions& setBloomFilterFPP(double fpp); + + /** + * Get false positive probability of BloomFilter + */ + double getBloomFilterFPP() const; + + /** + * Get version of BloomFilter + */ + BloomFilterVersion getBloomFilterVersion() const; + }; + + class Writer { + public: + virtual ~Writer(); + + /** + * Create a row batch for writing the columns into this file. + * @param size the number of rows to write. + * @return a new ColumnVectorBatch to write into. + */ + virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size + ) const = 0; + + /** + * Add a row batch into current writer. + * @param rowsToAdd the row batch data to write. + */ + virtual void add(ColumnVectorBatch& rowsToAdd) = 0; + + /** + * Close the writer and flush any pending data to the output stream. + */ + virtual void close() = 0; + + /** + * Add user metadata to the writer. + */ + virtual void addUserMetadata(const std::string name, const std::string value) = 0; + }; +} + +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/orc-config.hh b/contrib/libs/apache/orc/c++/include/orc/orc-config.hh index 18bbbd78e1..d06d892b41 100644 --- a/contrib/libs/apache/orc/c++/include/orc/orc-config.hh +++ b/contrib/libs/apache/orc/c++/include/orc/orc-config.hh @@ -1,78 +1,78 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_CONFIG_HH -#define ORC_CONFIG_HH - +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
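Put together, the Writer interface above supports a simple write loop: create a batch, fill it, add() it as many times as needed, then close(). A sketch under the assumption that createWriter() and writeLocalFile() are available from OrcFile.hh, which is not part of this section:

    #include "orc/OrcFile.hh"   // assumed source of createWriter()/writeLocalFile()
    #include <memory>

    void writeSmallFile() {
      std::unique_ptr<orc::OutputStream> out = orc::writeLocalFile("/tmp/demo.orc");
      std::unique_ptr<orc::Type> schema =
          orc::Type::buildTypeFromString("struct<x:bigint>");

      orc::WriterOptions options;
      std::unique_ptr<orc::Writer> writer =
          orc::createWriter(*schema, out.get(), options);

      std::unique_ptr<orc::ColumnVectorBatch> batch = writer->createRowBatch(1024);
      auto& root = dynamic_cast<orc::StructVectorBatch&>(*batch);
      auto& x = dynamic_cast<orc::LongVectorBatch&>(*root.fields[0]);
      for (uint64_t i = 0; i < 1024; ++i) {
        x.data[i] = static_cast<int64_t>(i);
      }
      root.numElements = x.numElements = 1024;

      writer->add(*batch);                            // may be repeated per batch
      writer->addUserMetadata("writer.example", "sketch");
      writer->close();
    }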
+ */ + +#ifndef ORC_CONFIG_HH +#define ORC_CONFIG_HH + #define ORC_VERSION "1.6.12" - -#define ORC_CXX_HAS_CSTDINT -#define ORC_CXX_HAS_INITIALIZER_LIST -#define ORC_CXX_HAS_NOEXCEPT -#define ORC_CXX_HAS_NULLPTR -#define ORC_CXX_HAS_OVERRIDE -#define ORC_CXX_HAS_UNIQUE_PTR - -#ifdef ORC_CXX_HAS_CSTDINT - #include <cstdint> -#else - #include <stdint.h> -#endif - -#ifdef ORC_CXX_HAS_NOEXCEPT - #define ORC_NOEXCEPT noexcept -#else - #define ORC_NOEXCEPT throw () -#endif - -#ifdef ORC_CXX_HAS_NULLPTR - #define ORC_NULLPTR nullptr -#else - namespace orc { - class nullptr_t { - public: - template<class T> - operator T*() const { - return 0; - } - - template<class C, class T> - operator T C::*() const { - return 0; - } - private: - void operator&() const; // whose address can't be taken - }; - const nullptr_t nullptr = {}; - } - #define ORC_NULLPTR orc::nullptr -#endif - -#ifdef ORC_CXX_HAS_OVERRIDE - #define ORC_OVERRIDE override -#else - #define ORC_OVERRIDE -#endif - -#ifdef ORC_CXX_HAS_UNIQUE_PTR - #define ORC_UNIQUE_PTR std::unique_ptr -#else - #define ORC_UNIQUE_PTR std::auto_ptr - namespace std { - template<typename T> - inline T move(T& x) { return x; } - } -#endif - -#endif + +#define ORC_CXX_HAS_CSTDINT +#define ORC_CXX_HAS_INITIALIZER_LIST +#define ORC_CXX_HAS_NOEXCEPT +#define ORC_CXX_HAS_NULLPTR +#define ORC_CXX_HAS_OVERRIDE +#define ORC_CXX_HAS_UNIQUE_PTR + +#ifdef ORC_CXX_HAS_CSTDINT + #include <cstdint> +#else + #include <stdint.h> +#endif + +#ifdef ORC_CXX_HAS_NOEXCEPT + #define ORC_NOEXCEPT noexcept +#else + #define ORC_NOEXCEPT throw () +#endif + +#ifdef ORC_CXX_HAS_NULLPTR + #define ORC_NULLPTR nullptr +#else + namespace orc { + class nullptr_t { + public: + template<class T> + operator T*() const { + return 0; + } + + template<class C, class T> + operator T C::*() const { + return 0; + } + private: + void operator&() const; // whose address can't be taken + }; + const nullptr_t nullptr = {}; + } + #define ORC_NULLPTR orc::nullptr +#endif + +#ifdef ORC_CXX_HAS_OVERRIDE + #define ORC_OVERRIDE override +#else + #define ORC_OVERRIDE +#endif + +#ifdef ORC_CXX_HAS_UNIQUE_PTR + #define ORC_UNIQUE_PTR std::unique_ptr +#else + #define ORC_UNIQUE_PTR std::auto_ptr + namespace std { + template<typename T> + inline T move(T& x) { return x; } + } +#endif + +#endif diff --git a/contrib/libs/apache/orc/c++/src/Adaptor.cc b/contrib/libs/apache/orc/c++/src/Adaptor.cc index bf3a3e181b..f402d65adf 100644 --- a/contrib/libs/apache/orc/c++/src/Adaptor.cc +++ b/contrib/libs/apache/orc/c++/src/Adaptor.cc @@ -1,88 +1,88 @@ -/** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
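orc-config.hh above pins the library version string and the ORC_CXX_HAS_* feature macros that the portability shims expand from; client code can branch on them. A trivial sketch:

    #include "orc/orc-config.hh"
    #include <iostream>
    #include <memory>

    int main() {
      std::cout << "built against ORC " << ORC_VERSION << std::endl;
    #ifdef ORC_CXX_HAS_NOEXCEPT
      std::cout << "ORC_NOEXCEPT expands to noexcept on this toolchain" << std::endl;
    #endif
      // With ORC_CXX_HAS_UNIQUE_PTR defined, ORC_UNIQUE_PTR is std::unique_ptr.
      ORC_UNIQUE_PTR<int> p(new int(42));
      std::cout << *p << std::endl;
      return 0;
    }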
-*/ - -#include "Adaptor.hh" -#include <sstream> -#include <iomanip> - -#ifndef HAS_STOLL -namespace std { - int64_t std::stoll(std::string str) { - int64_t val = 0; - stringstream ss; - ss << str; - ss >> val; - return val; - } -} -#endif - -#ifndef HAS_STRPTIME -char* strptime(const char* s, const char* f, struct tm* tm) { - std::istringstream input(s); - input.imbue(std::locale(setlocale(LC_ALL, nullptr))); - input >> std::get_time(tm, f); - if (input.fail()) return nullptr; - return (char*)(s + input.tellg()); -} -#endif - -#ifndef HAS_PREAD - #ifdef _WIN32 -#include <Windows.h> -#include <io.h> -ssize_t pread(int fd, void* buf, size_t size, off_t offset) { - auto handle = reinterpret_cast<HANDLE>(_get_osfhandle(fd)); - - OVERLAPPED ol; - memset(&ol, 0, sizeof(OVERLAPPED)); - ol.Offset = offset; - - DWORD rt; - if (!ReadFile(handle, buf, static_cast<DWORD>(size), &rt, &ol)) { - errno = GetLastError(); - return -1; - } - return static_cast<ssize_t>(rt); -} - #else - #error("pread() undefined: unknown environment") - #endif -#endif - -namespace orc { -#ifdef HAS_DOUBLE_TO_STRING - std::string to_string(double val) { - return std::to_string(val); - } -#else - std::string to_string(double val) { - return std::to_string(static_cast<long double>(val)); - } -#endif - -#ifdef HAS_INT64_TO_STRING - std::string to_string(int64_t val) { - return std::to_string(val); - } -#else - std::string to_string(int64_t val) { - return std::to_string(static_cast<long long int>(val)); - } -#endif -} +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include "Adaptor.hh" +#include <sstream> +#include <iomanip> + +#ifndef HAS_STOLL +namespace std { + int64_t std::stoll(std::string str) { + int64_t val = 0; + stringstream ss; + ss << str; + ss >> val; + return val; + } +} +#endif + +#ifndef HAS_STRPTIME +char* strptime(const char* s, const char* f, struct tm* tm) { + std::istringstream input(s); + input.imbue(std::locale(setlocale(LC_ALL, nullptr))); + input >> std::get_time(tm, f); + if (input.fail()) return nullptr; + return (char*)(s + input.tellg()); +} +#endif + +#ifndef HAS_PREAD + #ifdef _WIN32 +#include <Windows.h> +#include <io.h> +ssize_t pread(int fd, void* buf, size_t size, off_t offset) { + auto handle = reinterpret_cast<HANDLE>(_get_osfhandle(fd)); + + OVERLAPPED ol; + memset(&ol, 0, sizeof(OVERLAPPED)); + ol.Offset = offset; + + DWORD rt; + if (!ReadFile(handle, buf, static_cast<DWORD>(size), &rt, &ol)) { + errno = GetLastError(); + return -1; + } + return static_cast<ssize_t>(rt); +} + #else + #error("pread() undefined: unknown environment") + #endif +#endif + +namespace orc { +#ifdef HAS_DOUBLE_TO_STRING + std::string to_string(double val) { + return std::to_string(val); + } +#else + std::string to_string(double val) { + return std::to_string(static_cast<long double>(val)); + } +#endif + +#ifdef HAS_INT64_TO_STRING + std::string to_string(int64_t val) { + return std::to_string(val); + } +#else + std::string to_string(int64_t val) { + return std::to_string(static_cast<long long int>(val)); + } +#endif +} diff --git a/contrib/libs/apache/orc/c++/src/Adaptor.hh b/contrib/libs/apache/orc/c++/src/Adaptor.hh index a91b9c894d..2d6be71faa 100644 --- a/contrib/libs/apache/orc/c++/src/Adaptor.hh +++ b/contrib/libs/apache/orc/c++/src/Adaptor.hh @@ -1,175 +1,175 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ADAPTER_HH -#define ADAPTER_HH - -/* #undef INT64_IS_LL */ -#define HAS_CONSTEXPR +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
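Adaptor.cc above provides portable fallbacks (std::stoll, strptime, pread, to_string); orc::to_string is the one most visible to the rest of the codebase and simply forwards to std::to_string when HAS_DOUBLE_TO_STRING and HAS_INT64_TO_STRING are defined. A sketch; Adaptor.hh is an internal header, so an in-tree build is assumed:

    #include "Adaptor.hh"   // internal src/ header; path assumes an in-tree build
    #include <iostream>

    int main() {
      std::cout << orc::to_string(static_cast<int64_t>(1234567890123LL)) << std::endl;
      std::cout << orc::to_string(3.14) << std::endl;
      return 0;
    }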
+ */ + +#ifndef ADAPTER_HH +#define ADAPTER_HH + +/* #undef INT64_IS_LL */ +#define HAS_CONSTEXPR #ifndef _MSC_VER -#define HAS_PREAD -#endif -#define HAS_STRPTIME -#define HAS_STOLL -#define HAS_DIAGNOSTIC_PUSH -#define HAS_DOUBLE_TO_STRING -#define HAS_INT64_TO_STRING -#define HAS_PRE_1970 +#define HAS_PREAD +#endif +#define HAS_STRPTIME +#define HAS_STOLL +#define HAS_DIAGNOSTIC_PUSH +#define HAS_DOUBLE_TO_STRING +#define HAS_INT64_TO_STRING +#define HAS_PRE_1970 #define HAS_POST_2038 -#define HAS_STD_ISNAN -#define HAS_STD_MUTEX +#define HAS_STD_ISNAN +#define HAS_STD_MUTEX #ifndef _MSC_VER #define HAS_BUILTIN_OVERFLOW_CHECK #endif -/* #undef NEEDS_REDUNDANT_MOVE */ -/* #undef NEEDS_Z_PREFIX */ - -#include "orc/orc-config.hh" -#include <string> - -#ifdef _MSC_VER -#include <BaseTsd.h> -typedef SSIZE_T ssize_t; -#define timegm(tm) _mkgmtime(tm) -#define gmtime_r(timep, result) (gmtime_s(result, timep) ? NULL : result) -#define asctime_r(tm, buf) (asctime_s(buf, 26, tm) ? NULL : buf) -#endif - -#ifndef HAS_STOLL - // A poor man's stoll that converts str to a long long int base 10 - namespace std { - int64_t stoll(std::string str); - } -#endif - -#ifndef HAS_STRPTIME - char* strptime(const char* buf, const char* format, struct tm* tm); -#endif - -#ifndef HAS_PREAD - ssize_t pread(int fd, void* buf, size_t count, off_t offset); -#endif - -#ifdef INT64_IS_LL - #define INT64_FORMAT_STRING "ll" -#else - #define INT64_FORMAT_STRING "l" -#endif - -#ifndef ORC_CXX_HAS_NOEXCEPT - #define noexcept ORC_NOEXCEPT -#endif - -#ifndef ORC_CXX_HAS_OVERRIDE - #define override ORC_OVERRIDE -#endif - -#ifdef HAS_DIAGNOSTIC_PUSH - #ifdef __clang__ - #define DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") - #define DIAGNOSTIC_POP _Pragma("clang diagnostic pop") - #elif defined(__GNUC__) - #define DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") - #define DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") - #elif defined(_MSC_VER) - #define DIAGNOSTIC_PUSH __pragma(warning(push)) - #define DIAGNOSTIC_POP __pragma(warning(pop)) - #else - #error("Unknown compiler") - #endif -#else - #define DIAGNOSTIC_PUSH - #define DIAGNOSTIC_POP -#endif - -#define PRAGMA(TXT) _Pragma(#TXT) - - #define DIAGNOSTIC_IGNORE(XXX) - -#ifndef ORC_CXX_HAS_UNIQUE_PTR - #define unique_ptr auto_ptr -#endif - -#ifndef UINT32_MAX - #define UINT32_MAX 0xffffffff -#endif - -#ifndef INT64_MAX - #define INT64_MAX 0x7fffffffffffffff -#endif - -#ifndef INT64_MIN - #define INT64_MIN (-0x7fffffffffffffff - 1) -#endif - -#define GTEST_LANG_CXX11 0 - -#ifdef NEEDS_REDUNDANT_MOVE - #define REDUNDANT_MOVE(XXX) std::move(XXX) -#else - #define REDUNDANT_MOVE(XXX) XXX -#endif - -#ifndef HAS_STD_ISNAN - #include <math.h> - #define std::isnan(XXX) isnan(XXX) -#else - #include <cmath> -#endif - -#ifndef HAS_STD_MUTEX - #include <pthread.h> - namespace orc { - /** - * Lock guard for pthread_mutex_t object using RAII - * The Lock is automatically release when exiting current scope. 
- */ - class LockORC { - public: - explicit LockORC(pthread_mutex_t& mutex) : mutex_ref_(mutex) { - pthread_mutex_lock(&mutex_ref_); - } - ~LockORC() { pthread_mutex_unlock(&mutex_ref_); } - private: - // no default constructor - LockORC(); - // prohibit copying - LockORC(const LockORC&); - LockORC& operator=(const LockORC&); - - pthread_mutex_t& mutex_ref_; - }; - } - #define std::mutex pthread_mutex_t - #define std::lock_guard<std::mutex> LockORC -#else - #include <mutex> -#endif - -#ifdef NEEDS_Z_PREFIX -#define Z_PREFIX 1 -#endif - -namespace orc { - std::string to_string(double val); - std::string to_string(int64_t val); -} - +/* #undef NEEDS_REDUNDANT_MOVE */ +/* #undef NEEDS_Z_PREFIX */ + +#include "orc/orc-config.hh" +#include <string> + +#ifdef _MSC_VER +#include <BaseTsd.h> +typedef SSIZE_T ssize_t; +#define timegm(tm) _mkgmtime(tm) +#define gmtime_r(timep, result) (gmtime_s(result, timep) ? NULL : result) +#define asctime_r(tm, buf) (asctime_s(buf, 26, tm) ? NULL : buf) +#endif + +#ifndef HAS_STOLL + // A poor man's stoll that converts str to a long long int base 10 + namespace std { + int64_t stoll(std::string str); + } +#endif + +#ifndef HAS_STRPTIME + char* strptime(const char* buf, const char* format, struct tm* tm); +#endif + +#ifndef HAS_PREAD + ssize_t pread(int fd, void* buf, size_t count, off_t offset); +#endif + +#ifdef INT64_IS_LL + #define INT64_FORMAT_STRING "ll" +#else + #define INT64_FORMAT_STRING "l" +#endif + +#ifndef ORC_CXX_HAS_NOEXCEPT + #define noexcept ORC_NOEXCEPT +#endif + +#ifndef ORC_CXX_HAS_OVERRIDE + #define override ORC_OVERRIDE +#endif + +#ifdef HAS_DIAGNOSTIC_PUSH + #ifdef __clang__ + #define DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") + #define DIAGNOSTIC_POP _Pragma("clang diagnostic pop") + #elif defined(__GNUC__) + #define DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") + #define DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") + #elif defined(_MSC_VER) + #define DIAGNOSTIC_PUSH __pragma(warning(push)) + #define DIAGNOSTIC_POP __pragma(warning(pop)) + #else + #error("Unknown compiler") + #endif +#else + #define DIAGNOSTIC_PUSH + #define DIAGNOSTIC_POP +#endif + +#define PRAGMA(TXT) _Pragma(#TXT) + + #define DIAGNOSTIC_IGNORE(XXX) + +#ifndef ORC_CXX_HAS_UNIQUE_PTR + #define unique_ptr auto_ptr +#endif + +#ifndef UINT32_MAX + #define UINT32_MAX 0xffffffff +#endif + +#ifndef INT64_MAX + #define INT64_MAX 0x7fffffffffffffff +#endif + +#ifndef INT64_MIN + #define INT64_MIN (-0x7fffffffffffffff - 1) +#endif + +#define GTEST_LANG_CXX11 0 + +#ifdef NEEDS_REDUNDANT_MOVE + #define REDUNDANT_MOVE(XXX) std::move(XXX) +#else + #define REDUNDANT_MOVE(XXX) XXX +#endif + +#ifndef HAS_STD_ISNAN + #include <math.h> + #define std::isnan(XXX) isnan(XXX) +#else + #include <cmath> +#endif + +#ifndef HAS_STD_MUTEX + #include <pthread.h> + namespace orc { + /** + * Lock guard for pthread_mutex_t object using RAII + * The Lock is automatically release when exiting current scope. 
+ */ + class LockORC { + public: + explicit LockORC(pthread_mutex_t& mutex) : mutex_ref_(mutex) { + pthread_mutex_lock(&mutex_ref_); + } + ~LockORC() { pthread_mutex_unlock(&mutex_ref_); } + private: + // no default constructor + LockORC(); + // prohibit copying + LockORC(const LockORC&); + LockORC& operator=(const LockORC&); + + pthread_mutex_t& mutex_ref_; + }; + } + #define std::mutex pthread_mutex_t + #define std::lock_guard<std::mutex> LockORC +#else + #include <mutex> +#endif + +#ifdef NEEDS_Z_PREFIX +#define Z_PREFIX 1 +#endif + +namespace orc { + std::string to_string(double val); + std::string to_string(int64_t val); +} + #ifdef HAS_BUILTIN_OVERFLOW_CHECK #define multiplyExact !__builtin_mul_overflow #define addExact !__builtin_add_overflow @@ -204,8 +204,8 @@ namespace orc { } #endif -#ifndef HAS_CONSTEXPR -#define constexpr const -#endif - -#endif /* ADAPTER_HH */ +#ifndef HAS_CONSTEXPR +#define constexpr const +#endif + +#endif /* ADAPTER_HH */ diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.cc b/contrib/libs/apache/orc/c++/src/BloomFilter.cc index 8a1f1880e7..8ec0acda8c 100644 --- a/contrib/libs/apache/orc/c++/src/BloomFilter.cc +++ b/contrib/libs/apache/orc/c++/src/BloomFilter.cc @@ -1,328 +1,328 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
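The multiplyExact/addExact macros kept as context above expand to !__builtin_mul_overflow and !__builtin_add_overflow, so they return true only when the operation fits in the result type. The sketch below assumes the GCC/Clang macro path (on MSVC the elided @@ -204 hunk provides fallbacks instead) and an in-tree build for the internal header:

    #include "Adaptor.hh"   // internal src/ header; in-tree build assumed
    #include <cstdint>
    #include <iostream>

    int main() {
      int64_t result = 0;
      if (multiplyExact(INT64_MAX, static_cast<int64_t>(2), &result)) {
        std::cout << "product: " << result << std::endl;
      } else {
        std::cout << "multiplication would overflow" << std::endl;  // taken here
      }
      return 0;
    }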
- */ - -#include "BloomFilter.hh" -#include "Murmur3.hh" - -namespace orc { - - constexpr uint64_t BITS_OF_LONG = 64; - constexpr uint8_t SHIFT_6_BITS = 6; - constexpr uint8_t SHIFT_3_BITS = 3; - - static bool isLittleEndian() { - static union { uint32_t i; char c[4]; } num = { 0x01020304 }; - return num.c[0] == 4; - } - - /** - * Implementation of BitSet - */ - BitSet::BitSet(uint64_t numBits) { - mData.resize(static_cast<size_t>(ceil( - static_cast<double>(numBits) / BITS_OF_LONG)), 0); - } - - BitSet::BitSet(const uint64_t * bits, uint64_t numBits) { - // caller should make sure numBits is multiple of 64 - mData.resize(numBits >> SHIFT_6_BITS, 0); - memcpy(mData.data(), bits, numBits >> SHIFT_3_BITS); - } - - void BitSet::set(uint64_t index) { - mData[index >> SHIFT_6_BITS] |= (1ULL << (index % BITS_OF_LONG)); - } - - bool BitSet::get(uint64_t index) { - return (mData[index >> SHIFT_6_BITS] & (1ULL << (index % BITS_OF_LONG))) != 0; - } - - uint64_t BitSet::bitSize() { - return mData.size() << SHIFT_6_BITS; - } - - void BitSet::merge(const BitSet& other) { - if (mData.size() != other.mData.size()) { - std::stringstream ss; - ss << "BitSet must be of equal length (" - << mData.size() << " != " << other.mData.size() << ")"; - throw std::logic_error(ss.str()); - } - - for (size_t i = 0; i != mData.size(); i++) { - mData[i] |= other.mData[i]; - } - } - - void BitSet::clear() { - memset(mData.data(), 0, sizeof(uint64_t) * mData.size()); - } - - const uint64_t * BitSet::getData() const { - return mData.data(); - } - - bool BitSet::operator==(const BitSet& other) const { - return mData == other.mData; - } - - /** - * Helper functions - */ - void checkArgument(bool expression, const std::string& message) { - if (!expression) { - throw std::logic_error(message); - } - } - - int32_t optimalNumOfHashFunctions(uint64_t expectedEntries, uint64_t numBits) { - double n = static_cast<double>(expectedEntries); - return std::max<int32_t>(1, static_cast<int32_t>( - std::round(static_cast<double>(numBits) / n * std::log(2.0)))); - } - - int32_t optimalNumOfBits(uint64_t expectedEntries, double fpp) { - double n = static_cast<double>(expectedEntries); - return static_cast<int32_t>(-n * std::log(fpp) / (std::log(2.0) * std::log(2.0))); - } - - // We use the trick mentioned in "Less Hashing, Same Performance: - // Building a Better Bloom Filter" by Kirsch et.al. 
From abstract - // 'only two hash functions are necessary to effectively implement - // a Bloom filter without any loss in the asymptotic false positive - // probability' - // Lets split up 64-bit hashcode into two 32-bit hash codes and employ - // the technique mentioned in the above paper - inline uint64_t getBytesHash(const char * data, int64_t length) { - if (data == nullptr) { - return Murmur3::NULL_HASHCODE; - } - - return Murmur3::hash64(reinterpret_cast<const uint8_t *>(data), - static_cast<uint32_t>(length)); - } - - /** - * Implementation of BloomFilter - */ - BloomFilterImpl::BloomFilterImpl(uint64_t expectedEntries, double fpp) { - checkArgument(expectedEntries > 0, - "expectedEntries should be > 0"); - checkArgument(fpp > 0.0 && fpp < 1.0, - "False positive probability should be > 0.0 & < 1.0"); - - uint64_t nb = static_cast<uint64_t>(optimalNumOfBits(expectedEntries, fpp)); - // make 'mNumBits' multiple of 64 - mNumBits = nb + (BITS_OF_LONG - (nb % BITS_OF_LONG)); - mNumHashFunctions = optimalNumOfHashFunctions(expectedEntries, mNumBits); - mBitSet.reset(new BitSet(mNumBits)); - } - - void BloomFilterImpl::addBytes(const char * data, int64_t length) { - uint64_t hash64 = getBytesHash(data, length); +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
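The sizing helpers above implement the standard Bloom-filter formulas: m = -n*ln(p)/(ln 2)^2 bits and k = round(m/n * ln 2) hash functions, and the constructor then pads m up to a multiple of 64. For n = 10000 entries at p = 0.05 that works out to roughly 62,400 bits and k = 4. Probing follows the Kirsch-Mitzenmacher scheme referenced in the comment: the 64-bit Murmur3 value is split into 32-bit halves h1 and h2, and probe i touches bit (h1 + i*h2) mod m. A usage sketch against BloomFilterImpl; it is an internal class from src/BloomFilter.hh, so an in-tree build is assumed:

    #include "BloomFilter.hh"   // internal src/ header; in-tree build assumed
    #include <iostream>

    int main() {
      orc::BloomFilterImpl filter(10000, 0.05);   // n = 10000, fpp = 5%
      std::cout << "bits: " << filter.getBitSize()
                << ", hash functions: " << filter.getNumHashFunctions() << std::endl;

      filter.addLong(42);
      std::cout << filter.testLong(42) << std::endl;  // 1: inserted values always hit
      std::cout << filter.testLong(43) << std::endl;  // usually 0; false positives possible
      return 0;
    }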
+ */ + +#include "BloomFilter.hh" +#include "Murmur3.hh" + +namespace orc { + + constexpr uint64_t BITS_OF_LONG = 64; + constexpr uint8_t SHIFT_6_BITS = 6; + constexpr uint8_t SHIFT_3_BITS = 3; + + static bool isLittleEndian() { + static union { uint32_t i; char c[4]; } num = { 0x01020304 }; + return num.c[0] == 4; + } + + /** + * Implementation of BitSet + */ + BitSet::BitSet(uint64_t numBits) { + mData.resize(static_cast<size_t>(ceil( + static_cast<double>(numBits) / BITS_OF_LONG)), 0); + } + + BitSet::BitSet(const uint64_t * bits, uint64_t numBits) { + // caller should make sure numBits is multiple of 64 + mData.resize(numBits >> SHIFT_6_BITS, 0); + memcpy(mData.data(), bits, numBits >> SHIFT_3_BITS); + } + + void BitSet::set(uint64_t index) { + mData[index >> SHIFT_6_BITS] |= (1ULL << (index % BITS_OF_LONG)); + } + + bool BitSet::get(uint64_t index) { + return (mData[index >> SHIFT_6_BITS] & (1ULL << (index % BITS_OF_LONG))) != 0; + } + + uint64_t BitSet::bitSize() { + return mData.size() << SHIFT_6_BITS; + } + + void BitSet::merge(const BitSet& other) { + if (mData.size() != other.mData.size()) { + std::stringstream ss; + ss << "BitSet must be of equal length (" + << mData.size() << " != " << other.mData.size() << ")"; + throw std::logic_error(ss.str()); + } + + for (size_t i = 0; i != mData.size(); i++) { + mData[i] |= other.mData[i]; + } + } + + void BitSet::clear() { + memset(mData.data(), 0, sizeof(uint64_t) * mData.size()); + } + + const uint64_t * BitSet::getData() const { + return mData.data(); + } + + bool BitSet::operator==(const BitSet& other) const { + return mData == other.mData; + } + + /** + * Helper functions + */ + void checkArgument(bool expression, const std::string& message) { + if (!expression) { + throw std::logic_error(message); + } + } + + int32_t optimalNumOfHashFunctions(uint64_t expectedEntries, uint64_t numBits) { + double n = static_cast<double>(expectedEntries); + return std::max<int32_t>(1, static_cast<int32_t>( + std::round(static_cast<double>(numBits) / n * std::log(2.0)))); + } + + int32_t optimalNumOfBits(uint64_t expectedEntries, double fpp) { + double n = static_cast<double>(expectedEntries); + return static_cast<int32_t>(-n * std::log(fpp) / (std::log(2.0) * std::log(2.0))); + } + + // We use the trick mentioned in "Less Hashing, Same Performance: + // Building a Better Bloom Filter" by Kirsch et.al. 
From abstract + // 'only two hash functions are necessary to effectively implement + // a Bloom filter without any loss in the asymptotic false positive + // probability' + // Lets split up 64-bit hashcode into two 32-bit hash codes and employ + // the technique mentioned in the above paper + inline uint64_t getBytesHash(const char * data, int64_t length) { + if (data == nullptr) { + return Murmur3::NULL_HASHCODE; + } + + return Murmur3::hash64(reinterpret_cast<const uint8_t *>(data), + static_cast<uint32_t>(length)); + } + + /** + * Implementation of BloomFilter + */ + BloomFilterImpl::BloomFilterImpl(uint64_t expectedEntries, double fpp) { + checkArgument(expectedEntries > 0, + "expectedEntries should be > 0"); + checkArgument(fpp > 0.0 && fpp < 1.0, + "False positive probability should be > 0.0 & < 1.0"); + + uint64_t nb = static_cast<uint64_t>(optimalNumOfBits(expectedEntries, fpp)); + // make 'mNumBits' multiple of 64 + mNumBits = nb + (BITS_OF_LONG - (nb % BITS_OF_LONG)); + mNumHashFunctions = optimalNumOfHashFunctions(expectedEntries, mNumBits); + mBitSet.reset(new BitSet(mNumBits)); + } + + void BloomFilterImpl::addBytes(const char * data, int64_t length) { + uint64_t hash64 = getBytesHash(data, length); addHash(static_cast<int64_t>(hash64)); - } - - void BloomFilterImpl::addLong(int64_t data) { + } + + void BloomFilterImpl::addLong(int64_t data) { addHash(getLongHash(data)); - } - - bool BloomFilterImpl::testBytes(const char * data, int64_t length) const { - uint64_t hash64 = getBytesHash(data, length); + } + + bool BloomFilterImpl::testBytes(const char * data, int64_t length) const { + uint64_t hash64 = getBytesHash(data, length); return testHash(static_cast<int64_t>(hash64)); - } - - bool BloomFilterImpl::testLong(int64_t data) const { + } + + bool BloomFilterImpl::testLong(int64_t data) const { return testHash(getLongHash(data)); - } - - uint64_t BloomFilterImpl::sizeInBytes() const { - return getBitSize() >> SHIFT_3_BITS; - } - - uint64_t BloomFilterImpl::getBitSize() const { - return mBitSet->bitSize(); - } - - int32_t BloomFilterImpl::getNumHashFunctions() const { - return mNumHashFunctions; - } - - DIAGNOSTIC_PUSH - -#if defined(__clang__) - DIAGNOSTIC_IGNORE("-Wundefined-reinterpret-cast") -#endif - -#if defined(__GNUC__) - DIAGNOSTIC_IGNORE("-Wstrict-aliasing") -#endif - - // caller should make sure input proto::BloomFilter is valid since - // no check will be performed in the following constructor - BloomFilterImpl::BloomFilterImpl(const proto::BloomFilter& bloomFilter) { - mNumHashFunctions = static_cast<int32_t>(bloomFilter.numhashfunctions()); - - const std::string& bitsetStr = bloomFilter.utf8bitset(); - mNumBits = bitsetStr.size() << SHIFT_3_BITS; - checkArgument(mNumBits % BITS_OF_LONG == 0, "numBits should be multiple of 64!"); - - const uint64_t * bitset = reinterpret_cast<const uint64_t *>(bitsetStr.data()); - if (isLittleEndian()) { - mBitSet.reset(new BitSet(bitset, mNumBits)); - } else { - std::vector<uint64_t> longs(mNumBits >> SHIFT_6_BITS); - for (size_t i = 0; i != longs.size(); ++i) { - // convert little-endian to big-endian - const uint64_t src = bitset[i]; - uint64_t& dst = longs[i]; - for (size_t bit = 0; bit != 64; bit += 8) { - dst |= (((src & (0xFFu << bit)) >> bit) << (56 - bit)); - } - } - - mBitSet.reset(new BitSet(longs.data(), mNumBits)); - } - } - - void BloomFilterImpl::addDouble(double data) { - addLong(reinterpret_cast<int64_t&>(data)); - } - - bool BloomFilterImpl::testDouble(double data) const{ - return 
testLong(reinterpret_cast<int64_t&>(data)); - } - - DIAGNOSTIC_POP - + } + + uint64_t BloomFilterImpl::sizeInBytes() const { + return getBitSize() >> SHIFT_3_BITS; + } + + uint64_t BloomFilterImpl::getBitSize() const { + return mBitSet->bitSize(); + } + + int32_t BloomFilterImpl::getNumHashFunctions() const { + return mNumHashFunctions; + } + + DIAGNOSTIC_PUSH + +#if defined(__clang__) + DIAGNOSTIC_IGNORE("-Wundefined-reinterpret-cast") +#endif + +#if defined(__GNUC__) + DIAGNOSTIC_IGNORE("-Wstrict-aliasing") +#endif + + // caller should make sure input proto::BloomFilter is valid since + // no check will be performed in the following constructor + BloomFilterImpl::BloomFilterImpl(const proto::BloomFilter& bloomFilter) { + mNumHashFunctions = static_cast<int32_t>(bloomFilter.numhashfunctions()); + + const std::string& bitsetStr = bloomFilter.utf8bitset(); + mNumBits = bitsetStr.size() << SHIFT_3_BITS; + checkArgument(mNumBits % BITS_OF_LONG == 0, "numBits should be multiple of 64!"); + + const uint64_t * bitset = reinterpret_cast<const uint64_t *>(bitsetStr.data()); + if (isLittleEndian()) { + mBitSet.reset(new BitSet(bitset, mNumBits)); + } else { + std::vector<uint64_t> longs(mNumBits >> SHIFT_6_BITS); + for (size_t i = 0; i != longs.size(); ++i) { + // convert little-endian to big-endian + const uint64_t src = bitset[i]; + uint64_t& dst = longs[i]; + for (size_t bit = 0; bit != 64; bit += 8) { + dst |= (((src & (0xFFu << bit)) >> bit) << (56 - bit)); + } + } + + mBitSet.reset(new BitSet(longs.data(), mNumBits)); + } + } + + void BloomFilterImpl::addDouble(double data) { + addLong(reinterpret_cast<int64_t&>(data)); + } + + bool BloomFilterImpl::testDouble(double data) const{ + return testLong(reinterpret_cast<int64_t&>(data)); + } + + DIAGNOSTIC_POP + void BloomFilterImpl::addHash(int64_t hash64) { - int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff); + int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff); // In Java codes, we use "hash64 >>> 32" which is an unsigned shift op. // So we cast hash64 to uint64_t here for an unsigned right shift. int32_t hash2 = static_cast<int32_t>(static_cast<uint64_t>(hash64) >> 32); - - for (int32_t i = 1; i <= mNumHashFunctions; ++i) { - int32_t combinedHash = hash1 + i * hash2; - // hashcode should be positive, flip all the bits if it's negative - if (combinedHash < 0) { - combinedHash = ~combinedHash; - } - uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits; - mBitSet->set(pos); - } - } - + + for (int32_t i = 1; i <= mNumHashFunctions; ++i) { + int32_t combinedHash = hash1 + i * hash2; + // hashcode should be positive, flip all the bits if it's negative + if (combinedHash < 0) { + combinedHash = ~combinedHash; + } + uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits; + mBitSet->set(pos); + } + } + bool BloomFilterImpl::testHash(int64_t hash64) const{ - int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff); + int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff); // In Java codes, we use "hash64 >>> 32" which is an unsigned shift op. // So we cast hash64 to uint64_t here for an unsigned right shift. 
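A minimal standalone sketch of the double-hashing step used by addHash() and testHash(), assuming only the arithmetic shown here; the function name and sample inputs are illustrative, and unsigned 32-bit arithmetic stands in for the Java-style wrap-around the original relies on:

    #include <cstdint>
    #include <vector>

    // Split one 64-bit hash into two 32-bit halves and combine them as
    // h1 + i*h2 to derive k probe positions (Kirsch & Mitzenmacher).
    std::vector<uint64_t> probePositions(int64_t hash64, int32_t k, uint64_t numBits) {
      int32_t h1 = static_cast<int32_t>(hash64 & 0xffffffff);
      int32_t h2 = static_cast<int32_t>(static_cast<uint64_t>(hash64) >> 32);
      std::vector<uint64_t> positions;
      for (int32_t i = 1; i <= k; ++i) {
        int32_t combined = static_cast<int32_t>(
            static_cast<uint32_t>(h1) + static_cast<uint32_t>(i) * static_cast<uint32_t>(h2));
        if (combined < 0) {
          combined = ~combined;   // keep the index non-negative, as above
        }
        positions.push_back(static_cast<uint64_t>(combined) % numBits);
      }
      return positions;
    }

    // probePositions(0x0000000500000003, 4, 64) probes bits 8, 13, 18 and 23
    // (h1 = 3, h2 = 5).
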
int32_t hash2 = static_cast<int32_t>(static_cast<uint64_t>(hash64) >> 32); - - for (int32_t i = 1; i <= mNumHashFunctions; ++i) { - int32_t combinedHash = hash1 + i * hash2; - // hashcode should be positive, flip all the bits if it's negative - if (combinedHash < 0) { - combinedHash = ~combinedHash; - } - uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits; - if (!mBitSet->get(pos)) { - return false; - } - } - return true; - } - - void BloomFilterImpl::merge(const BloomFilterImpl& other) { - if (mNumBits != other.mNumBits || mNumHashFunctions != other.mNumHashFunctions) { - std::stringstream ss; - ss << "BloomFilters are not compatible for merging: " - << "this: numBits:" << mNumBits - << ",numHashFunctions:" << mNumHashFunctions - << ", that: numBits:" << other.mNumBits - << ",numHashFunctions:" << other.mNumHashFunctions; - throw std::logic_error(ss.str()); - } - - mBitSet->merge(*other.mBitSet); - } - - void BloomFilterImpl::reset() { - mBitSet->clear(); - } - - void BloomFilterImpl::serialize(proto::BloomFilter& bloomFilter) const { - bloomFilter.set_numhashfunctions(static_cast<uint32_t>(mNumHashFunctions)); - - // According to ORC standard, the encoding is a sequence of bytes with - // a little endian encoding in the utf8bitset field. - if (isLittleEndian()) { - // bytes are already organized in little endian; thus no conversion needed - const char * bitset = reinterpret_cast<const char *>(mBitSet->getData()); - bloomFilter.set_utf8bitset(bitset, sizeInBytes()); - } else { - std::vector<uint64_t> bitset(sizeInBytes() / sizeof(uint64_t), 0); - const uint64_t * longs = mBitSet->getData(); - for (size_t i = 0; i != bitset.size(); ++i) { - uint64_t& dst = bitset[i]; - const uint64_t src = longs[i]; - // convert big-endian to little-endian - for (size_t bit = 0; bit != 64; bit += 8) { - dst |= (((src & (0xFFu << bit)) >> bit) << (56 - bit)); - } - } - bloomFilter.set_utf8bitset(bitset.data(), sizeInBytes()); - } - } - - bool BloomFilterImpl::operator==(const BloomFilterImpl& other) const { - return mNumBits == other.mNumBits && - mNumHashFunctions == other.mNumHashFunctions && - *mBitSet == *other.mBitSet; - } - - BloomFilter::~BloomFilter() { - // PASS - } - - std::unique_ptr<BloomFilter> BloomFilterUTF8Utils::deserialize( - const proto::Stream_Kind& streamKind, - const proto::ColumnEncoding& encoding, - const proto::BloomFilter& bloomFilter) { - - std::unique_ptr<BloomFilter> ret(nullptr); - - // only BLOOM_FILTER_UTF8 is supported - if (streamKind != proto::Stream_Kind_BLOOM_FILTER_UTF8) { - return ret; - } - - // make sure we don't use unknown encodings or original timestamp encodings - if (!encoding.has_bloomencoding() || encoding.bloomencoding() != 1) { - return ret; - } - - // make sure all required fields exist - if (!bloomFilter.has_numhashfunctions() || !bloomFilter.has_utf8bitset()) { - return ret; - } - - ret.reset(new BloomFilterImpl(bloomFilter)); - return ret; - } - -} + + for (int32_t i = 1; i <= mNumHashFunctions; ++i) { + int32_t combinedHash = hash1 + i * hash2; + // hashcode should be positive, flip all the bits if it's negative + if (combinedHash < 0) { + combinedHash = ~combinedHash; + } + uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits; + if (!mBitSet->get(pos)) { + return false; + } + } + return true; + } + + void BloomFilterImpl::merge(const BloomFilterImpl& other) { + if (mNumBits != other.mNumBits || mNumHashFunctions != other.mNumHashFunctions) { + std::stringstream ss; + ss << "BloomFilters are not compatible for merging: " + << "this: 
numBits:" << mNumBits + << ",numHashFunctions:" << mNumHashFunctions + << ", that: numBits:" << other.mNumBits + << ",numHashFunctions:" << other.mNumHashFunctions; + throw std::logic_error(ss.str()); + } + + mBitSet->merge(*other.mBitSet); + } + + void BloomFilterImpl::reset() { + mBitSet->clear(); + } + + void BloomFilterImpl::serialize(proto::BloomFilter& bloomFilter) const { + bloomFilter.set_numhashfunctions(static_cast<uint32_t>(mNumHashFunctions)); + + // According to ORC standard, the encoding is a sequence of bytes with + // a little endian encoding in the utf8bitset field. + if (isLittleEndian()) { + // bytes are already organized in little endian; thus no conversion needed + const char * bitset = reinterpret_cast<const char *>(mBitSet->getData()); + bloomFilter.set_utf8bitset(bitset, sizeInBytes()); + } else { + std::vector<uint64_t> bitset(sizeInBytes() / sizeof(uint64_t), 0); + const uint64_t * longs = mBitSet->getData(); + for (size_t i = 0; i != bitset.size(); ++i) { + uint64_t& dst = bitset[i]; + const uint64_t src = longs[i]; + // convert big-endian to little-endian + for (size_t bit = 0; bit != 64; bit += 8) { + dst |= (((src & (0xFFu << bit)) >> bit) << (56 - bit)); + } + } + bloomFilter.set_utf8bitset(bitset.data(), sizeInBytes()); + } + } + + bool BloomFilterImpl::operator==(const BloomFilterImpl& other) const { + return mNumBits == other.mNumBits && + mNumHashFunctions == other.mNumHashFunctions && + *mBitSet == *other.mBitSet; + } + + BloomFilter::~BloomFilter() { + // PASS + } + + std::unique_ptr<BloomFilter> BloomFilterUTF8Utils::deserialize( + const proto::Stream_Kind& streamKind, + const proto::ColumnEncoding& encoding, + const proto::BloomFilter& bloomFilter) { + + std::unique_ptr<BloomFilter> ret(nullptr); + + // only BLOOM_FILTER_UTF8 is supported + if (streamKind != proto::Stream_Kind_BLOOM_FILTER_UTF8) { + return ret; + } + + // make sure we don't use unknown encodings or original timestamp encodings + if (!encoding.has_bloomencoding() || encoding.bloomencoding() != 1) { + return ret; + } + + // make sure all required fields exist + if (!bloomFilter.has_numhashfunctions() || !bloomFilter.has_utf8bitset()) { + return ret; + } + + ret.reset(new BloomFilterImpl(bloomFilter)); + return ret; + } + +} diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.hh b/contrib/libs/apache/orc/c++/src/BloomFilter.hh index cf18a46fd9..ab2006bdae 100644 --- a/contrib/libs/apache/orc/c++/src/BloomFilter.hh +++ b/contrib/libs/apache/orc/c++/src/BloomFilter.hh @@ -1,197 +1,197 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
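The BitSet used throughout BloomFilter.cc above addresses individual bits inside 64-bit words: index >> 6 selects the word and 1ULL << (index % 64) selects the bit. A small self-contained illustration of that arithmetic (values chosen arbitrarily):

    #include <cstdint>
    #include <vector>

    int main() {
      std::vector<uint64_t> words(2, 0);            // a 128-bit set stored as two words
      uint64_t index = 70;
      words[index >> 6] |= 1ULL << (index % 64);    // set():  word 1, bit 6
      bool isSet =
          (words[index >> 6] & (1ULL << (index % 64))) != 0;   // get()
      return isSet ? 0 : 1;                         // exits with 0: the bit is set
    }
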
- */ - -#ifndef ORC_BLOOMFILTER_IMPL_HH -#define ORC_BLOOMFILTER_IMPL_HH - -#include "orc/BloomFilter.hh" -#include "wrap/orc-proto-wrapper.hh" - -#include <cmath> -#include <sstream> -#include <vector> - -namespace orc { - - /** - * Bare metal bit set implementation. For performance reasons, this implementation does not check - * for index bounds nor expand the bit set size if the specified index is greater than the size. - */ - class BitSet { - public: - /** - * Creates an empty BitSet - * - * @param numBits - number of bits used - */ - BitSet(uint64_t numBits); - - /** - * Creates BitSet from serialized uint64_t buffer - * - * @param bits - serialized uint64_t buffer of bitset - * @param numBits - number of bits used - */ - BitSet(const uint64_t * bits, uint64_t numBits); - - /** - * Sets the bit at specified index. - * - * @param index - position - */ - void set(uint64_t index); - - /** - * Returns true if the bit is set in the specified index. - * - * @param index - position - * @return - value at the bit position - */ - bool get(uint64_t index); - - /** - * Number of bits - */ - uint64_t bitSize(); - - /** - * Combines the two BitSets using bitwise OR. - */ - void merge(const BitSet& other); - - /** - * Clears the bit set. - */ - void clear(); - - /** - * Gets underlying raw data - */ - const uint64_t * getData() const; - - /** - * Compares two BitSets - */ - bool operator==(const BitSet& other) const; - - private: - std::vector<uint64_t> mData; - }; - - /** - * BloomFilter is a probabilistic data structure for set membership check. - * BloomFilters are highly space efficient when compared to using a HashSet. - * Because of the probabilistic nature of bloom filter false positive (element - * not present in bloom filter but test() says true) are possible but false - * negatives are not possible (if element is present then test() will never - * say false). The false positive probability is configurable (default: 5%) - * depending on which storage requirement may increase or decrease. Lower the - * false positive probability greater is the space requirement. - * - * Bloom filters are sensitive to number of elements that will be inserted in - * the bloom filter. During the creation of bloom filter expected number of - * entries must be specified. If the number of insertions exceed the specified - * initial number of entries then false positive probability will increase - * accordingly. - * - * Internally, this implementation of bloom filter uses Murmur3 fast - * non-cryptographic hash algorithm. Although Murmur2 is slightly faster than - * Murmur3 in Java, it suffers from hash collisions for specific sequence of - * repeating bytes. Check the following link for more info - * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw - * - * Note that this class is here for backwards compatibility, because it uses - * the JVM default character set for strings. All new users should - * BloomFilterUtf8, which always uses UTF8 for the encoding. 
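Plugging the default 5% false positive probability into the sizing formulas from BloomFilter.cc above gives a feel for the trade-off this comment describes. A rough standalone calculation (the entry count is illustrative and the printed numbers are approximate):

    #include <cmath>
    #include <iostream>

    int main() {
      // m = -n*ln(p) / (ln 2)^2 and k = round(m/n * ln 2), as computed by
      // optimalNumOfBits and optimalNumOfHashFunctions.
      double n = 10000.0;   // expected entries
      double p = 0.05;      // false positive probability (the default)
      double m = -n * std::log(p) / (std::log(2.0) * std::log(2.0));
      double k = std::round(m / n * std::log(2.0));
      std::cout << "bits ~= " << m << " (~" << m / 8 / 1024
                << " KiB), hashes = " << k << "\n";
      // prints roughly: bits ~= 62352 (~7.6 KiB), hashes = 4
      return 0;
    }

The implementation then rounds the bit count up to a multiple of 64 before allocating its BitSet.
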
- */ - class BloomFilterImpl : public BloomFilter { - public: - /** - * Creates an empty BloomFilter - * - * @param expectedEntries - number of entries it will hold - * @param fpp - false positive probability - */ - BloomFilterImpl(uint64_t expectedEntries, double fpp=DEFAULT_FPP); - - /** - * Creates a BloomFilter by deserializing the proto-buf version - * - * caller should make sure input proto::BloomFilter is valid - */ - BloomFilterImpl(const proto::BloomFilter& bloomFilter); - - /** - * Adds a new element to the BloomFilter - */ - void addBytes(const char * data, int64_t length); - void addLong(int64_t data); - void addDouble(double data); - - /** - * Test if the element exists in BloomFilter - */ - bool testBytes(const char * data, int64_t length) const override; - bool testLong(int64_t data) const override; - bool testDouble(double data) const override; - - uint64_t sizeInBytes() const; - uint64_t getBitSize() const; - int32_t getNumHashFunctions() const; - - void merge(const BloomFilterImpl& other); - - void reset(); - - bool operator==(const BloomFilterImpl& other) const; - - private: - friend struct BloomFilterUTF8Utils; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_BLOOMFILTER_IMPL_HH +#define ORC_BLOOMFILTER_IMPL_HH + +#include "orc/BloomFilter.hh" +#include "wrap/orc-proto-wrapper.hh" + +#include <cmath> +#include <sstream> +#include <vector> + +namespace orc { + + /** + * Bare metal bit set implementation. For performance reasons, this implementation does not check + * for index bounds nor expand the bit set size if the specified index is greater than the size. + */ + class BitSet { + public: + /** + * Creates an empty BitSet + * + * @param numBits - number of bits used + */ + BitSet(uint64_t numBits); + + /** + * Creates BitSet from serialized uint64_t buffer + * + * @param bits - serialized uint64_t buffer of bitset + * @param numBits - number of bits used + */ + BitSet(const uint64_t * bits, uint64_t numBits); + + /** + * Sets the bit at specified index. + * + * @param index - position + */ + void set(uint64_t index); + + /** + * Returns true if the bit is set in the specified index. + * + * @param index - position + * @return - value at the bit position + */ + bool get(uint64_t index); + + /** + * Number of bits + */ + uint64_t bitSize(); + + /** + * Combines the two BitSets using bitwise OR. + */ + void merge(const BitSet& other); + + /** + * Clears the bit set. + */ + void clear(); + + /** + * Gets underlying raw data + */ + const uint64_t * getData() const; + + /** + * Compares two BitSets + */ + bool operator==(const BitSet& other) const; + + private: + std::vector<uint64_t> mData; + }; + + /** + * BloomFilter is a probabilistic data structure for set membership check. 
+ * BloomFilters are highly space efficient when compared to using a HashSet. + * Because of the probabilistic nature of bloom filter false positive (element + * not present in bloom filter but test() says true) are possible but false + * negatives are not possible (if element is present then test() will never + * say false). The false positive probability is configurable (default: 5%) + * depending on which storage requirement may increase or decrease. Lower the + * false positive probability greater is the space requirement. + * + * Bloom filters are sensitive to number of elements that will be inserted in + * the bloom filter. During the creation of bloom filter expected number of + * entries must be specified. If the number of insertions exceed the specified + * initial number of entries then false positive probability will increase + * accordingly. + * + * Internally, this implementation of bloom filter uses Murmur3 fast + * non-cryptographic hash algorithm. Although Murmur2 is slightly faster than + * Murmur3 in Java, it suffers from hash collisions for specific sequence of + * repeating bytes. Check the following link for more info + * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw + * + * Note that this class is here for backwards compatibility, because it uses + * the JVM default character set for strings. All new users should + * BloomFilterUtf8, which always uses UTF8 for the encoding. + */ + class BloomFilterImpl : public BloomFilter { + public: + /** + * Creates an empty BloomFilter + * + * @param expectedEntries - number of entries it will hold + * @param fpp - false positive probability + */ + BloomFilterImpl(uint64_t expectedEntries, double fpp=DEFAULT_FPP); + + /** + * Creates a BloomFilter by deserializing the proto-buf version + * + * caller should make sure input proto::BloomFilter is valid + */ + BloomFilterImpl(const proto::BloomFilter& bloomFilter); + + /** + * Adds a new element to the BloomFilter + */ + void addBytes(const char * data, int64_t length); + void addLong(int64_t data); + void addDouble(double data); + + /** + * Test if the element exists in BloomFilter + */ + bool testBytes(const char * data, int64_t length) const override; + bool testLong(int64_t data) const override; + bool testDouble(double data) const override; + + uint64_t sizeInBytes() const; + uint64_t getBitSize() const; + int32_t getNumHashFunctions() const; + + void merge(const BloomFilterImpl& other); + + void reset(); + + bool operator==(const BloomFilterImpl& other) const; + + private: + friend struct BloomFilterUTF8Utils; friend class TestBloomFilter_testBloomFilterBasicOperations_Test; - - // compute k hash values from hash64 and set bits + + // compute k hash values from hash64 and set bits void addHash(int64_t hash64); - - // compute k hash values from hash64 and check bits + + // compute k hash values from hash64 and check bits bool testHash(int64_t hash64) const; - - void serialize(proto::BloomFilter& bloomFilter) const; - - private: - static constexpr double DEFAULT_FPP = 0.05; - uint64_t mNumBits; - int32_t mNumHashFunctions; - std::unique_ptr<BitSet> mBitSet; - }; - - struct BloomFilterUTF8Utils { - // serialize BloomFilter in protobuf - static void serialize(const BloomFilterImpl& in, proto::BloomFilter& out) { - in.serialize(out); - } - - // deserialize BloomFilter from protobuf - static std::unique_ptr<BloomFilter> - deserialize(const proto::Stream_Kind& streamKind, - const proto::ColumnEncoding& columnEncoding, - const proto::BloomFilter& bloomFilter); - }; - + + void 
serialize(proto::BloomFilter& bloomFilter) const; + + private: + static constexpr double DEFAULT_FPP = 0.05; + uint64_t mNumBits; + int32_t mNumHashFunctions; + std::unique_ptr<BitSet> mBitSet; + }; + + struct BloomFilterUTF8Utils { + // serialize BloomFilter in protobuf + static void serialize(const BloomFilterImpl& in, proto::BloomFilter& out) { + in.serialize(out); + } + + // deserialize BloomFilter from protobuf + static std::unique_ptr<BloomFilter> + deserialize(const proto::Stream_Kind& streamKind, + const proto::ColumnEncoding& columnEncoding, + const proto::BloomFilter& bloomFilter); + }; + // Thomas Wang's integer hash function // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm // Put this in header file so tests can use it as well. @@ -205,6 +205,6 @@ namespace orc { key = key + (key << 31); return key; } -} - -#endif //ORC_BLOOMFILTER_IMPL_HH +} + +#endif //ORC_BLOOMFILTER_IMPL_HH diff --git a/contrib/libs/apache/orc/c++/src/ByteRLE.cc b/contrib/libs/apache/orc/c++/src/ByteRLE.cc index ee1a4575dc..30f5148b7c 100644 --- a/contrib/libs/apache/orc/c++/src/ByteRLE.cc +++ b/contrib/libs/apache/orc/c++/src/ByteRLE.cc @@ -1,626 +1,626 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <algorithm> -#include <iostream> -#include <string.h> -#include <utility> - -#include "ByteRLE.hh" -#include "orc/Exceptions.hh" - -namespace orc { - - const int MINIMUM_REPEAT = 3; - const int MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT; - const int MAX_LITERAL_SIZE = 128; - - ByteRleEncoder::~ByteRleEncoder() { - // PASS - } - - class ByteRleEncoderImpl : public ByteRleEncoder { - public: - ByteRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output); - virtual ~ByteRleEncoderImpl() override; - - /** - * Encode the next batch of values. - * @param data to be encoded - * @param numValues the number of values to be encoded - * @param notNull If the pointer is null, all values are read. If the - * pointer is not null, positions that are false are skipped. - */ - virtual void add(const char* data, uint64_t numValues, - const char* notNull) override; - - /** - * Get size of buffer used so far. - */ - virtual uint64_t getBufferSize() const override; - - /** - * Flush underlying BufferedOutputStream. 
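This encoder frames its output as alternating run and literal groups: a non-negative control byte c means the next byte repeats c + MINIMUM_REPEAT times, while a negative control byte -n is followed by n literal bytes. A hand-built example of that framing (values illustrative):

    #include <cstdint>
    #include <vector>

    int main() {
      std::vector<uint8_t> encoded;

      // a run: 100 copies of 0x7f -> control byte (100 - 3), then the value
      encoded.push_back(static_cast<uint8_t>(100 - 3));   // 0x61
      encoded.push_back(0x7f);

      // a literal group: the bytes 0x01 0x02 0x04 -> control byte -3, then the bytes
      encoded.push_back(static_cast<uint8_t>(-3));        // 0xfd
      encoded.push_back(0x01);
      encoded.push_back(0x02);
      encoded.push_back(0x04);

      return encoded.size() == 6 ? 0 : 1;   // six bytes for 103 values
    }

writeValues() further below produces exactly these two shapes, and the decoder's readHeader() reverses them.
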
- */ - virtual uint64_t flush() override; - - virtual void recordPosition(PositionRecorder* recorder) const override; - - protected: - std::unique_ptr<BufferedOutputStream> outputStream; - char* literals; - int numLiterals; - bool repeat; - int tailRunLength; - int bufferPosition; - int bufferLength; - char* buffer; - - void writeByte(char c); - void writeValues(); - void write(char c); - }; - - ByteRleEncoderImpl::ByteRleEncoderImpl( - std::unique_ptr<BufferedOutputStream> output) - : outputStream(std::move(output)) { - literals = new char[MAX_LITERAL_SIZE]; - numLiterals = 0; - tailRunLength = 0; - repeat = false; - bufferPosition = 0; - bufferLength = 0; - buffer = nullptr; - } - - ByteRleEncoderImpl::~ByteRleEncoderImpl() { - // PASS - delete [] literals; - } - - void ByteRleEncoderImpl::writeByte(char c) { - if (bufferPosition == bufferLength) { - int addedSize = 0; - if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) { - throw std::bad_alloc(); - } - bufferPosition = 0; - bufferLength = addedSize; - } - buffer[bufferPosition++] = c; - } - - void ByteRleEncoderImpl::add( - const char* data, - uint64_t numValues, - const char* notNull) { - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - write(data[i]); - } - } - } - - void ByteRleEncoderImpl::writeValues() { - if (numLiterals != 0) { - if (repeat) { - writeByte( - static_cast<char>(numLiterals - static_cast<int>(MINIMUM_REPEAT))); - writeByte(literals[0]); - } else { - writeByte(static_cast<char>(-numLiterals)); - for (int i = 0; i < numLiterals; ++i) { - writeByte(literals[i]); - } - } - repeat = false; - tailRunLength = 0; - numLiterals = 0; - } - } - - uint64_t ByteRleEncoderImpl::flush() { - writeValues(); - outputStream->BackUp(bufferLength - bufferPosition); - uint64_t dataSize = outputStream->flush(); - bufferLength = bufferPosition = 0; - return dataSize; - } - - void ByteRleEncoderImpl::write(char value) { - if (numLiterals == 0) { - literals[numLiterals++] = value; - tailRunLength = 1; - } else if (repeat) { - if (value == literals[0]) { - numLiterals += 1; - if (numLiterals == MAXIMUM_REPEAT) { - writeValues(); - } - } else { - writeValues(); - literals[numLiterals++] = value; - tailRunLength = 1; - } - } else { - if (value == literals[numLiterals - 1]) { - tailRunLength += 1; - } else { - tailRunLength = 1; - } - if (tailRunLength == MINIMUM_REPEAT) { - if (numLiterals + 1 == MINIMUM_REPEAT) { - repeat = true; - numLiterals += 1; - } else { - numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1); - writeValues(); - literals[0] = value; - repeat = true; - numLiterals = MINIMUM_REPEAT; - } - } else { - literals[numLiterals++] = value; - if (numLiterals == MAX_LITERAL_SIZE) { - writeValues(); - } - } - } - } - - uint64_t ByteRleEncoderImpl::getBufferSize() const { - return outputStream->getSize(); - } - - void ByteRleEncoderImpl::recordPosition(PositionRecorder *recorder) const { - uint64_t flushedSize = outputStream->getSize(); - uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition); - if (outputStream->isCompressed()) { - // start of the compression chunk in the stream - recorder->add(flushedSize); - // number of decompressed bytes that need to be consumed - recorder->add(unflushedSize); - } else { - flushedSize -= static_cast<uint64_t>(bufferLength); - // byte offset of the RLE run’s start location - recorder->add(flushedSize + unflushedSize); - } - recorder->add(static_cast<uint64_t>(numLiterals)); - } - - std::unique_ptr<ByteRleEncoder> createByteRleEncoder - 
(std::unique_ptr<BufferedOutputStream> output) { - return std::unique_ptr<ByteRleEncoder>(new ByteRleEncoderImpl - (std::move(output))); - } - - class BooleanRleEncoderImpl : public ByteRleEncoderImpl { - public: - BooleanRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output); - virtual ~BooleanRleEncoderImpl() override; - - /** - * Encode the next batch of values - * @param data to be encoded - * @param numValues the number of values to be encoded - * @param notNull If the pointer is null, all values are read. If the - * pointer is not null, positions that are false are skipped. - */ - virtual void add(const char* data, uint64_t numValues, - const char* notNull) override; - - /** - * Flushing underlying BufferedOutputStream - */ - virtual uint64_t flush() override; - - virtual void recordPosition(PositionRecorder* recorder) const override; - - private: - int bitsRemained; - char current; - - }; - - BooleanRleEncoderImpl::BooleanRleEncoderImpl( - std::unique_ptr<BufferedOutputStream> output) - : ByteRleEncoderImpl(std::move(output)) { - bitsRemained = 8; - current = static_cast<char>(0); - } - - BooleanRleEncoderImpl::~BooleanRleEncoderImpl() { - // PASS - } - - void BooleanRleEncoderImpl::add( - const char* data, - uint64_t numValues, - const char* notNull) { - for (uint64_t i = 0; i < numValues; ++i) { - if (bitsRemained == 0) { - write(current); - current = static_cast<char>(0); - bitsRemained = 8; - } - if (!notNull || notNull[i]) { - if (!data || data[i]) { - current = - static_cast<char>(current | (0x80 >> (8 - bitsRemained))); - } - --bitsRemained; - } - } - if (bitsRemained == 0) { - write(current); - current = static_cast<char>(0); - bitsRemained = 8; - } - } - - uint64_t BooleanRleEncoderImpl::flush() { - if (bitsRemained != 8) { - write(current); - } - bitsRemained = 8; - current = static_cast<char>(0); - return ByteRleEncoderImpl::flush(); - } - - void BooleanRleEncoderImpl::recordPosition(PositionRecorder* recorder) const { - ByteRleEncoderImpl::recordPosition(recorder); - recorder->add(static_cast<uint64_t>(8 - bitsRemained)); - } - - std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder - (std::unique_ptr<BufferedOutputStream> output) { - BooleanRleEncoderImpl* encoder = - new BooleanRleEncoderImpl(std::move(output)) ; - return std::unique_ptr<ByteRleEncoder>( - reinterpret_cast<ByteRleEncoder*>(encoder)); - } - - ByteRleDecoder::~ByteRleDecoder() { - // PASS - } - - class ByteRleDecoderImpl: public ByteRleDecoder { - public: - ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input); - - virtual ~ByteRleDecoderImpl(); - - /** - * Seek to a particular spot. - */ - virtual void seek(PositionProvider&); - - /** - * Seek over a given number of values. - */ - virtual void skip(uint64_t numValues); - - /** - * Read a number of values into the batch. 
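The boolean encoder above packs eight values into one byte, most significant bit first, and only then hands the byte to the byte-RLE writer. A standalone sketch of the packing arithmetic it uses (the sample values are arbitrary):

    int main() {
      const char values[8] = {1, 0, 1, 1, 0, 0, 1, 0};
      unsigned char packed = 0;
      int bitsRemained = 8;
      for (char v : values) {
        if (v) {
          packed |= static_cast<unsigned char>(0x80 >> (8 - bitsRemained));  // MSB first
        }
        --bitsRemained;
      }
      return packed == 0xB2 ? 0 : 1;   // 1011 0010 -> 0xB2
    }
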
- */ - virtual void next(char* data, uint64_t numValues, char* notNull); - - protected: - inline void nextBuffer(); - inline signed char readByte(); - inline void readHeader(); - - std::unique_ptr<SeekableInputStream> inputStream; - size_t remainingValues; - char value; - const char* bufferStart; - const char* bufferEnd; - bool repeating; - }; - - void ByteRleDecoderImpl::nextBuffer() { - int bufferLength; - const void* bufferPointer; - bool result = inputStream->Next(&bufferPointer, &bufferLength); - if (!result) { - throw ParseError("bad read in nextBuffer"); - } - bufferStart = static_cast<const char*>(bufferPointer); - bufferEnd = bufferStart + bufferLength; - } - - signed char ByteRleDecoderImpl::readByte() { - if (bufferStart == bufferEnd) { - nextBuffer(); - } - return *(bufferStart++); - } - - void ByteRleDecoderImpl::readHeader() { - signed char ch = readByte(); - if (ch < 0) { - remainingValues = static_cast<size_t>(-ch); - repeating = false; - } else { - remainingValues = static_cast<size_t>(ch) + MINIMUM_REPEAT; - repeating = true; - value = readByte(); - } - } - - ByteRleDecoderImpl::ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> - input) { - inputStream = std::move(input); - repeating = false; - remainingValues = 0; - value = 0; - bufferStart = nullptr; - bufferEnd = nullptr; - } - - ByteRleDecoderImpl::~ByteRleDecoderImpl() { - // PASS - } - - void ByteRleDecoderImpl::seek(PositionProvider& location) { - // move the input stream - inputStream->seek(location); - // force a re-read from the stream - bufferEnd = bufferStart; - // read a new header - readHeader(); - // skip ahead the given number of records - ByteRleDecoderImpl::skip(location.next()); - } - - void ByteRleDecoderImpl::skip(uint64_t numValues) { - while (numValues > 0) { - if (remainingValues == 0) { - readHeader(); - } - size_t count = std::min(static_cast<size_t>(numValues), remainingValues); - remainingValues -= count; - numValues -= count; - // for literals we need to skip over count bytes, which may involve - // reading from the underlying stream - if (!repeating) { - size_t consumedBytes = count; - while (consumedBytes > 0) { - if (bufferStart == bufferEnd) { - nextBuffer(); - } - size_t skipSize = std::min(static_cast<size_t>(consumedBytes), - static_cast<size_t>(bufferEnd - - bufferStart)); - bufferStart += skipSize; - consumedBytes -= skipSize; - } - } - } - } - - void ByteRleDecoderImpl::next(char* data, uint64_t numValues, - char* notNull) { - uint64_t position = 0; - // skip over null values - while (notNull && position < numValues && !notNull[position]) { - position += 1; - } - while (position < numValues) { - // if we are out of values, read more - if (remainingValues == 0) { - readHeader(); - } - // how many do we read out of this block? 
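Decoding reverses the control-byte framing: readHeader() above turns each control byte into either a repeat count plus value or a literal count, and next() then copies that many values out. A compact standalone decoder over an in-memory buffer (illustrative, not the class itself):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<uint8_t> decodeByteRle(const std::vector<uint8_t>& in) {
      std::vector<uint8_t> out;
      std::size_t pos = 0;
      while (pos < in.size()) {
        int8_t control = static_cast<int8_t>(in[pos++]);
        if (control < 0) {                      // literal group: -control bytes follow
          for (int i = 0; i < -control; ++i) {
            out.push_back(in[pos++]);
          }
        } else {                                // run: (control + 3) copies of the next byte
          uint8_t value = in[pos++];
          out.insert(out.end(), static_cast<std::size_t>(control) + 3, value);
        }
      }
      return out;
    }

    int main() {
      // the six bytes from the framing example above decode back to 103 values
      std::vector<uint8_t> decoded =
          decodeByteRle({0x61, 0x7f, 0xfd, 0x01, 0x02, 0x04});
      return decoded.size() == 103 ? 0 : 1;     // 100 repeats + 3 literals
    }
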
- size_t count = std::min(static_cast<size_t>(numValues - position), - remainingValues); - uint64_t consumed = 0; - if (repeating) { - if (notNull) { - for(uint64_t i=0; i < count; ++i) { - if (notNull[position + i]) { - data[position + i] = value; - consumed += 1; - } - } - } else { - memset(data + position, value, count); - consumed = count; - } - } else { - if (notNull) { - for(uint64_t i=0; i < count; ++i) { - if (notNull[position + i]) { - data[position + i] = readByte(); - consumed += 1; - } - } - } else { - uint64_t i = 0; - while (i < count) { - if (bufferStart == bufferEnd) { - nextBuffer(); - } - uint64_t copyBytes = - std::min(static_cast<uint64_t>(count - i), - static_cast<uint64_t>(bufferEnd - bufferStart)); - memcpy(data + position + i, bufferStart, copyBytes); - bufferStart += copyBytes; - i += copyBytes; - } - consumed = count; - } - } - remainingValues -= consumed; - position += count; - // skip over any null values - while (notNull && position < numValues && !notNull[position]) { - position += 1; - } - } - } - - std::unique_ptr<ByteRleDecoder> createByteRleDecoder - (std::unique_ptr<SeekableInputStream> input) { - return std::unique_ptr<ByteRleDecoder>(new ByteRleDecoderImpl - (std::move(input))); - } - - class BooleanRleDecoderImpl: public ByteRleDecoderImpl { - public: - BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input); - - virtual ~BooleanRleDecoderImpl(); - - /** - * Seek to a particular spot. - */ - virtual void seek(PositionProvider&); - - /** - * Seek over a given number of values. - */ - virtual void skip(uint64_t numValues); - - /** - * Read a number of values into the batch. - */ - virtual void next(char* data, uint64_t numValues, char* notNull); - - protected: - size_t remainingBits; - char lastByte; - }; - - BooleanRleDecoderImpl::BooleanRleDecoderImpl - (std::unique_ptr<SeekableInputStream> input - ): ByteRleDecoderImpl(std::move(input)) { - remainingBits = 0; - lastByte = 0; - } - - BooleanRleDecoderImpl::~BooleanRleDecoderImpl() { - // PASS - } - - void BooleanRleDecoderImpl::seek(PositionProvider& location) { - ByteRleDecoderImpl::seek(location); - uint64_t consumed = location.next(); - remainingBits = 0; - if (consumed > 8) { - throw ParseError("bad position"); - } - if (consumed != 0) { - remainingBits = 8 - consumed; - ByteRleDecoderImpl::next(&lastByte, 1, nullptr); - } - } - - void BooleanRleDecoderImpl::skip(uint64_t numValues) { - if (numValues <= remainingBits) { - remainingBits -= numValues; - } else { - numValues -= remainingBits; - uint64_t bytesSkipped = numValues / 8; - ByteRleDecoderImpl::skip(bytesSkipped); - if (numValues % 8 != 0) { - ByteRleDecoderImpl::next(&lastByte, 1, nullptr); - remainingBits = 8 - (numValues % 8); - } else { - remainingBits = 0; - } - } - } - - void BooleanRleDecoderImpl::next(char* data, uint64_t numValues, - char* notNull) { - // next spot to fill in - uint64_t position = 0; - - // use up any remaining bits - if (notNull) { - while(remainingBits > 0 && position < numValues) { - if (notNull[position]) { - remainingBits -= 1; - data[position] = (static_cast<unsigned char>(lastByte) >> - remainingBits) & 0x1; - } else { - data[position] = 0; - } - position += 1; - } - } else { - while(remainingBits > 0 && position < numValues) { - remainingBits -= 1; - data[position++] = (static_cast<unsigned char>(lastByte) >> - remainingBits) & 0x1; - } - } - - // count the number of nonNulls remaining - uint64_t nonNulls = numValues - position; - if (notNull) { - for(uint64_t i=position; i < numValues; ++i) { - if 
(!notNull[i]) { - nonNulls -= 1; - } - } - } - - // fill in the remaining values - if (nonNulls == 0) { - while (position < numValues) { - data[position++] = 0; - } - } else if (position < numValues) { - // read the new bytes into the array - uint64_t bytesRead = (nonNulls + 7) / 8; - ByteRleDecoderImpl::next(data + position, bytesRead, nullptr); - lastByte = data[position + bytesRead - 1]; - remainingBits = bytesRead * 8 - nonNulls; - // expand the array backwards so that we don't clobber the data - uint64_t bitsLeft = bytesRead * 8 - remainingBits; - if (notNull) { - for(int64_t i=static_cast<int64_t>(numValues) - 1; - i >= static_cast<int64_t>(position); --i) { - if (notNull[i]) { - uint64_t shiftPosn = (-bitsLeft) % 8; - data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1; - bitsLeft -= 1; - } else { - data[i] = 0; - } - } - } else { - for(int64_t i=static_cast<int64_t>(numValues) - 1; - i >= static_cast<int64_t>(position); --i, --bitsLeft) { - uint64_t shiftPosn = (-bitsLeft) % 8; - data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1; - } - } - } - } - - std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder - (std::unique_ptr<SeekableInputStream> input) { - BooleanRleDecoderImpl* decoder = - new BooleanRleDecoderImpl(std::move(input)); - return std::unique_ptr<ByteRleDecoder>( - reinterpret_cast<ByteRleDecoder*>(decoder)); - } -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <algorithm> +#include <iostream> +#include <string.h> +#include <utility> + +#include "ByteRLE.hh" +#include "orc/Exceptions.hh" + +namespace orc { + + const int MINIMUM_REPEAT = 3; + const int MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT; + const int MAX_LITERAL_SIZE = 128; + + ByteRleEncoder::~ByteRleEncoder() { + // PASS + } + + class ByteRleEncoderImpl : public ByteRleEncoder { + public: + ByteRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output); + virtual ~ByteRleEncoderImpl() override; + + /** + * Encode the next batch of values. + * @param data to be encoded + * @param numValues the number of values to be encoded + * @param notNull If the pointer is null, all values are read. If the + * pointer is not null, positions that are false are skipped. + */ + virtual void add(const char* data, uint64_t numValues, + const char* notNull) override; + + /** + * Get size of buffer used so far. + */ + virtual uint64_t getBufferSize() const override; + + /** + * Flush underlying BufferedOutputStream. 
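recordPosition(), declared just below and implemented further down, emits the numbers a reader needs to seek back into the stream: for an uncompressed stream a byte offset plus the count of pending literals, and for a compressed stream the compression-chunk start, the decompressed bytes to consume, and the pending literals. A toy stand-in for the recorder showing the two shapes (the concrete values are made up):

    #include <cstdint>
    #include <vector>

    struct ToyRecorder {                 // stand-in for PositionRecorder
      std::vector<uint64_t> positions;
      void add(uint64_t v) { positions.push_back(v); }
    };

    int main() {
      ToyRecorder uncompressed;
      uncompressed.add(1024);   // byte offset of the current RLE run
      uncompressed.add(5);      // literals still buffered in the encoder

      ToyRecorder compressed;
      compressed.add(4096);     // start of the compression chunk
      compressed.add(200);      // decompressed bytes to consume
      compressed.add(5);        // literals still buffered in the encoder
      return 0;
    }
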
+ */ + virtual uint64_t flush() override; + + virtual void recordPosition(PositionRecorder* recorder) const override; + + protected: + std::unique_ptr<BufferedOutputStream> outputStream; + char* literals; + int numLiterals; + bool repeat; + int tailRunLength; + int bufferPosition; + int bufferLength; + char* buffer; + + void writeByte(char c); + void writeValues(); + void write(char c); + }; + + ByteRleEncoderImpl::ByteRleEncoderImpl( + std::unique_ptr<BufferedOutputStream> output) + : outputStream(std::move(output)) { + literals = new char[MAX_LITERAL_SIZE]; + numLiterals = 0; + tailRunLength = 0; + repeat = false; + bufferPosition = 0; + bufferLength = 0; + buffer = nullptr; + } + + ByteRleEncoderImpl::~ByteRleEncoderImpl() { + // PASS + delete [] literals; + } + + void ByteRleEncoderImpl::writeByte(char c) { + if (bufferPosition == bufferLength) { + int addedSize = 0; + if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) { + throw std::bad_alloc(); + } + bufferPosition = 0; + bufferLength = addedSize; + } + buffer[bufferPosition++] = c; + } + + void ByteRleEncoderImpl::add( + const char* data, + uint64_t numValues, + const char* notNull) { + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + write(data[i]); + } + } + } + + void ByteRleEncoderImpl::writeValues() { + if (numLiterals != 0) { + if (repeat) { + writeByte( + static_cast<char>(numLiterals - static_cast<int>(MINIMUM_REPEAT))); + writeByte(literals[0]); + } else { + writeByte(static_cast<char>(-numLiterals)); + for (int i = 0; i < numLiterals; ++i) { + writeByte(literals[i]); + } + } + repeat = false; + tailRunLength = 0; + numLiterals = 0; + } + } + + uint64_t ByteRleEncoderImpl::flush() { + writeValues(); + outputStream->BackUp(bufferLength - bufferPosition); + uint64_t dataSize = outputStream->flush(); + bufferLength = bufferPosition = 0; + return dataSize; + } + + void ByteRleEncoderImpl::write(char value) { + if (numLiterals == 0) { + literals[numLiterals++] = value; + tailRunLength = 1; + } else if (repeat) { + if (value == literals[0]) { + numLiterals += 1; + if (numLiterals == MAXIMUM_REPEAT) { + writeValues(); + } + } else { + writeValues(); + literals[numLiterals++] = value; + tailRunLength = 1; + } + } else { + if (value == literals[numLiterals - 1]) { + tailRunLength += 1; + } else { + tailRunLength = 1; + } + if (tailRunLength == MINIMUM_REPEAT) { + if (numLiterals + 1 == MINIMUM_REPEAT) { + repeat = true; + numLiterals += 1; + } else { + numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1); + writeValues(); + literals[0] = value; + repeat = true; + numLiterals = MINIMUM_REPEAT; + } + } else { + literals[numLiterals++] = value; + if (numLiterals == MAX_LITERAL_SIZE) { + writeValues(); + } + } + } + } + + uint64_t ByteRleEncoderImpl::getBufferSize() const { + return outputStream->getSize(); + } + + void ByteRleEncoderImpl::recordPosition(PositionRecorder *recorder) const { + uint64_t flushedSize = outputStream->getSize(); + uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition); + if (outputStream->isCompressed()) { + // start of the compression chunk in the stream + recorder->add(flushedSize); + // number of decompressed bytes that need to be consumed + recorder->add(unflushedSize); + } else { + flushedSize -= static_cast<uint64_t>(bufferLength); + // byte offset of the RLE run’s start location + recorder->add(flushedSize + unflushedSize); + } + recorder->add(static_cast<uint64_t>(numLiterals)); + } + + std::unique_ptr<ByteRleEncoder> createByteRleEncoder + 
(std::unique_ptr<BufferedOutputStream> output) { + return std::unique_ptr<ByteRleEncoder>(new ByteRleEncoderImpl + (std::move(output))); + } + + class BooleanRleEncoderImpl : public ByteRleEncoderImpl { + public: + BooleanRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output); + virtual ~BooleanRleEncoderImpl() override; + + /** + * Encode the next batch of values + * @param data to be encoded + * @param numValues the number of values to be encoded + * @param notNull If the pointer is null, all values are read. If the + * pointer is not null, positions that are false are skipped. + */ + virtual void add(const char* data, uint64_t numValues, + const char* notNull) override; + + /** + * Flushing underlying BufferedOutputStream + */ + virtual uint64_t flush() override; + + virtual void recordPosition(PositionRecorder* recorder) const override; + + private: + int bitsRemained; + char current; + + }; + + BooleanRleEncoderImpl::BooleanRleEncoderImpl( + std::unique_ptr<BufferedOutputStream> output) + : ByteRleEncoderImpl(std::move(output)) { + bitsRemained = 8; + current = static_cast<char>(0); + } + + BooleanRleEncoderImpl::~BooleanRleEncoderImpl() { + // PASS + } + + void BooleanRleEncoderImpl::add( + const char* data, + uint64_t numValues, + const char* notNull) { + for (uint64_t i = 0; i < numValues; ++i) { + if (bitsRemained == 0) { + write(current); + current = static_cast<char>(0); + bitsRemained = 8; + } + if (!notNull || notNull[i]) { + if (!data || data[i]) { + current = + static_cast<char>(current | (0x80 >> (8 - bitsRemained))); + } + --bitsRemained; + } + } + if (bitsRemained == 0) { + write(current); + current = static_cast<char>(0); + bitsRemained = 8; + } + } + + uint64_t BooleanRleEncoderImpl::flush() { + if (bitsRemained != 8) { + write(current); + } + bitsRemained = 8; + current = static_cast<char>(0); + return ByteRleEncoderImpl::flush(); + } + + void BooleanRleEncoderImpl::recordPosition(PositionRecorder* recorder) const { + ByteRleEncoderImpl::recordPosition(recorder); + recorder->add(static_cast<uint64_t>(8 - bitsRemained)); + } + + std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder + (std::unique_ptr<BufferedOutputStream> output) { + BooleanRleEncoderImpl* encoder = + new BooleanRleEncoderImpl(std::move(output)) ; + return std::unique_ptr<ByteRleEncoder>( + reinterpret_cast<ByteRleEncoder*>(encoder)); + } + + ByteRleDecoder::~ByteRleDecoder() { + // PASS + } + + class ByteRleDecoderImpl: public ByteRleDecoder { + public: + ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input); + + virtual ~ByteRleDecoderImpl(); + + /** + * Seek to a particular spot. + */ + virtual void seek(PositionProvider&); + + /** + * Seek over a given number of values. + */ + virtual void skip(uint64_t numValues); + + /** + * Read a number of values into the batch. 
+ */ + virtual void next(char* data, uint64_t numValues, char* notNull); + + protected: + inline void nextBuffer(); + inline signed char readByte(); + inline void readHeader(); + + std::unique_ptr<SeekableInputStream> inputStream; + size_t remainingValues; + char value; + const char* bufferStart; + const char* bufferEnd; + bool repeating; + }; + + void ByteRleDecoderImpl::nextBuffer() { + int bufferLength; + const void* bufferPointer; + bool result = inputStream->Next(&bufferPointer, &bufferLength); + if (!result) { + throw ParseError("bad read in nextBuffer"); + } + bufferStart = static_cast<const char*>(bufferPointer); + bufferEnd = bufferStart + bufferLength; + } + + signed char ByteRleDecoderImpl::readByte() { + if (bufferStart == bufferEnd) { + nextBuffer(); + } + return *(bufferStart++); + } + + void ByteRleDecoderImpl::readHeader() { + signed char ch = readByte(); + if (ch < 0) { + remainingValues = static_cast<size_t>(-ch); + repeating = false; + } else { + remainingValues = static_cast<size_t>(ch) + MINIMUM_REPEAT; + repeating = true; + value = readByte(); + } + } + + ByteRleDecoderImpl::ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> + input) { + inputStream = std::move(input); + repeating = false; + remainingValues = 0; + value = 0; + bufferStart = nullptr; + bufferEnd = nullptr; + } + + ByteRleDecoderImpl::~ByteRleDecoderImpl() { + // PASS + } + + void ByteRleDecoderImpl::seek(PositionProvider& location) { + // move the input stream + inputStream->seek(location); + // force a re-read from the stream + bufferEnd = bufferStart; + // read a new header + readHeader(); + // skip ahead the given number of records + ByteRleDecoderImpl::skip(location.next()); + } + + void ByteRleDecoderImpl::skip(uint64_t numValues) { + while (numValues > 0) { + if (remainingValues == 0) { + readHeader(); + } + size_t count = std::min(static_cast<size_t>(numValues), remainingValues); + remainingValues -= count; + numValues -= count; + // for literals we need to skip over count bytes, which may involve + // reading from the underlying stream + if (!repeating) { + size_t consumedBytes = count; + while (consumedBytes > 0) { + if (bufferStart == bufferEnd) { + nextBuffer(); + } + size_t skipSize = std::min(static_cast<size_t>(consumedBytes), + static_cast<size_t>(bufferEnd - + bufferStart)); + bufferStart += skipSize; + consumedBytes -= skipSize; + } + } + } + } + + void ByteRleDecoderImpl::next(char* data, uint64_t numValues, + char* notNull) { + uint64_t position = 0; + // skip over null values + while (notNull && position < numValues && !notNull[position]) { + position += 1; + } + while (position < numValues) { + // if we are out of values, read more + if (remainingValues == 0) { + readHeader(); + } + // how many do we read out of this block? 
+ size_t count = std::min(static_cast<size_t>(numValues - position), + remainingValues); + uint64_t consumed = 0; + if (repeating) { + if (notNull) { + for(uint64_t i=0; i < count; ++i) { + if (notNull[position + i]) { + data[position + i] = value; + consumed += 1; + } + } + } else { + memset(data + position, value, count); + consumed = count; + } + } else { + if (notNull) { + for(uint64_t i=0; i < count; ++i) { + if (notNull[position + i]) { + data[position + i] = readByte(); + consumed += 1; + } + } + } else { + uint64_t i = 0; + while (i < count) { + if (bufferStart == bufferEnd) { + nextBuffer(); + } + uint64_t copyBytes = + std::min(static_cast<uint64_t>(count - i), + static_cast<uint64_t>(bufferEnd - bufferStart)); + memcpy(data + position + i, bufferStart, copyBytes); + bufferStart += copyBytes; + i += copyBytes; + } + consumed = count; + } + } + remainingValues -= consumed; + position += count; + // skip over any null values + while (notNull && position < numValues && !notNull[position]) { + position += 1; + } + } + } + + std::unique_ptr<ByteRleDecoder> createByteRleDecoder + (std::unique_ptr<SeekableInputStream> input) { + return std::unique_ptr<ByteRleDecoder>(new ByteRleDecoderImpl + (std::move(input))); + } + + class BooleanRleDecoderImpl: public ByteRleDecoderImpl { + public: + BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input); + + virtual ~BooleanRleDecoderImpl(); + + /** + * Seek to a particular spot. + */ + virtual void seek(PositionProvider&); + + /** + * Seek over a given number of values. + */ + virtual void skip(uint64_t numValues); + + /** + * Read a number of values into the batch. + */ + virtual void next(char* data, uint64_t numValues, char* notNull); + + protected: + size_t remainingBits; + char lastByte; + }; + + BooleanRleDecoderImpl::BooleanRleDecoderImpl + (std::unique_ptr<SeekableInputStream> input + ): ByteRleDecoderImpl(std::move(input)) { + remainingBits = 0; + lastByte = 0; + } + + BooleanRleDecoderImpl::~BooleanRleDecoderImpl() { + // PASS + } + + void BooleanRleDecoderImpl::seek(PositionProvider& location) { + ByteRleDecoderImpl::seek(location); + uint64_t consumed = location.next(); + remainingBits = 0; + if (consumed > 8) { + throw ParseError("bad position"); + } + if (consumed != 0) { + remainingBits = 8 - consumed; + ByteRleDecoderImpl::next(&lastByte, 1, nullptr); + } + } + + void BooleanRleDecoderImpl::skip(uint64_t numValues) { + if (numValues <= remainingBits) { + remainingBits -= numValues; + } else { + numValues -= remainingBits; + uint64_t bytesSkipped = numValues / 8; + ByteRleDecoderImpl::skip(bytesSkipped); + if (numValues % 8 != 0) { + ByteRleDecoderImpl::next(&lastByte, 1, nullptr); + remainingBits = 8 - (numValues % 8); + } else { + remainingBits = 0; + } + } + } + + void BooleanRleDecoderImpl::next(char* data, uint64_t numValues, + char* notNull) { + // next spot to fill in + uint64_t position = 0; + + // use up any remaining bits + if (notNull) { + while(remainingBits > 0 && position < numValues) { + if (notNull[position]) { + remainingBits -= 1; + data[position] = (static_cast<unsigned char>(lastByte) >> + remainingBits) & 0x1; + } else { + data[position] = 0; + } + position += 1; + } + } else { + while(remainingBits > 0 && position < numValues) { + remainingBits -= 1; + data[position++] = (static_cast<unsigned char>(lastByte) >> + remainingBits) & 0x1; + } + } + + // count the number of nonNulls remaining + uint64_t nonNulls = numValues - position; + if (notNull) { + for(uint64_t i=position; i < numValues; ++i) { + if 
(!notNull[i]) { + nonNulls -= 1; + } + } + } + + // fill in the remaining values + if (nonNulls == 0) { + while (position < numValues) { + data[position++] = 0; + } + } else if (position < numValues) { + // read the new bytes into the array + uint64_t bytesRead = (nonNulls + 7) / 8; + ByteRleDecoderImpl::next(data + position, bytesRead, nullptr); + lastByte = data[position + bytesRead - 1]; + remainingBits = bytesRead * 8 - nonNulls; + // expand the array backwards so that we don't clobber the data + uint64_t bitsLeft = bytesRead * 8 - remainingBits; + if (notNull) { + for(int64_t i=static_cast<int64_t>(numValues) - 1; + i >= static_cast<int64_t>(position); --i) { + if (notNull[i]) { + uint64_t shiftPosn = (-bitsLeft) % 8; + data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1; + bitsLeft -= 1; + } else { + data[i] = 0; + } + } + } else { + for(int64_t i=static_cast<int64_t>(numValues) - 1; + i >= static_cast<int64_t>(position); --i, --bitsLeft) { + uint64_t shiftPosn = (-bitsLeft) % 8; + data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1; + } + } + } + } + + std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder + (std::unique_ptr<SeekableInputStream> input) { + BooleanRleDecoderImpl* decoder = + new BooleanRleDecoderImpl(std::move(input)); + return std::unique_ptr<ByteRleDecoder>( + reinterpret_cast<ByteRleDecoder*>(decoder)); + } +} diff --git a/contrib/libs/apache/orc/c++/src/ByteRLE.hh b/contrib/libs/apache/orc/c++/src/ByteRLE.hh index 71ca579cd3..b799675aee 100644 --- a/contrib/libs/apache/orc/c++/src/ByteRLE.hh +++ b/contrib/libs/apache/orc/c++/src/ByteRLE.hh @@ -1,117 +1,117 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_BYTE_RLE_HH -#define ORC_BYTE_RLE_HH - -#include <memory> - -#include "io/InputStream.hh" -#include "io/OutputStream.hh" - -namespace orc { - - class ByteRleEncoder { - public: - virtual ~ByteRleEncoder(); - - /** - * Encode the next batch of values - * @param data to be encoded - * @param numValues the number of values to be encoded - * @param notNull If the pointer is null, all values are read. If the - * pointer is not null, positions that are false are skipped. - */ - virtual void add(const char* data, uint64_t numValues, - const char* notNull) = 0; - - /** - * Get size of buffer used so far. - */ - virtual uint64_t getBufferSize() const = 0; - - /** - * Flushing underlying output stream - */ - virtual uint64_t flush() = 0; - - /** - * record current position - * @param recorder use the recorder to record current positions - */ - virtual void recordPosition(PositionRecorder* recorder) const = 0; - }; - - class ByteRleDecoder { - public: - virtual ~ByteRleDecoder(); - - /** - * Seek to a particular spot. 
- */ - virtual void seek(PositionProvider&) = 0; - - /** - * Seek over a given number of values. - */ - virtual void skip(uint64_t numValues) = 0; - - /** - * Read a number of values into the batch. - * @param data the array to read into - * @param numValues the number of values to read - * @param notNull If the pointer is null, all values are read. If the - * pointer is not null, positions that are false are skipped. - */ - virtual void next(char* data, uint64_t numValues, char* notNull) = 0; - }; - - /** - * Create a byte RLE encoder. - * @param output the output stream to write to - */ - std::unique_ptr<ByteRleEncoder> createByteRleEncoder - (std::unique_ptr<BufferedOutputStream> output); - - /** - * Create a boolean RLE encoder. - * @param output the output stream to write to - */ - std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder - (std::unique_ptr<BufferedOutputStream> output); - - /** - * Create a byte RLE decoder. - * @param input the input stream to read from - */ - std::unique_ptr<ByteRleDecoder> createByteRleDecoder - (std::unique_ptr<SeekableInputStream> input); - - /** - * Create a boolean RLE decoder. - * - * Unlike the other RLE decoders, the boolean decoder sets the data to 0 - * if the value is masked by notNull. This is required for the notNull stream - * processing to properly apply multiple masks from nested types. - * @param input the input stream to read from - */ - std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder - (std::unique_ptr<SeekableInputStream> input); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_BYTE_RLE_HH +#define ORC_BYTE_RLE_HH + +#include <memory> + +#include "io/InputStream.hh" +#include "io/OutputStream.hh" + +namespace orc { + + class ByteRleEncoder { + public: + virtual ~ByteRleEncoder(); + + /** + * Encode the next batch of values + * @param data to be encoded + * @param numValues the number of values to be encoded + * @param notNull If the pointer is null, all values are read. If the + * pointer is not null, positions that are false are skipped. + */ + virtual void add(const char* data, uint64_t numValues, + const char* notNull) = 0; + + /** + * Get size of buffer used so far. + */ + virtual uint64_t getBufferSize() const = 0; + + /** + * Flushing underlying output stream + */ + virtual uint64_t flush() = 0; + + /** + * record current position + * @param recorder use the recorder to record current positions + */ + virtual void recordPosition(PositionRecorder* recorder) const = 0; + }; + + class ByteRleDecoder { + public: + virtual ~ByteRleDecoder(); + + /** + * Seek to a particular spot. + */ + virtual void seek(PositionProvider&) = 0; + + /** + * Seek over a given number of values. 
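One difference worth noting: the byte decoder leaves slots masked by notNull untouched, while the boolean decoder (see the createBooleanRleDecoder note further below) writes 0 into them. A sketch of the calling convention, assuming a decoder has already been obtained from one of the factories in this header:

    #include <cstring>
    #include "ByteRLE.hh"

    void readWithNulls(orc::ByteRleDecoder& decoder) {
      char data[8];
      std::memset(data, 0, sizeof(data));
      char notNull[8] = {1, 1, 0, 1, 1, 1, 0, 1};   // positions 2 and 6 are null
      // only the six not-null slots are filled from the stream; the boolean
      // decoder additionally forces the two masked slots to 0
      decoder.next(data, 8, notNull);
    }
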
+ */ + virtual void skip(uint64_t numValues) = 0; + + /** + * Read a number of values into the batch. + * @param data the array to read into + * @param numValues the number of values to read + * @param notNull If the pointer is null, all values are read. If the + * pointer is not null, positions that are false are skipped. + */ + virtual void next(char* data, uint64_t numValues, char* notNull) = 0; + }; + + /** + * Create a byte RLE encoder. + * @param output the output stream to write to + */ + std::unique_ptr<ByteRleEncoder> createByteRleEncoder + (std::unique_ptr<BufferedOutputStream> output); + + /** + * Create a boolean RLE encoder. + * @param output the output stream to write to + */ + std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder + (std::unique_ptr<BufferedOutputStream> output); + + /** + * Create a byte RLE decoder. + * @param input the input stream to read from + */ + std::unique_ptr<ByteRleDecoder> createByteRleDecoder + (std::unique_ptr<SeekableInputStream> input); + + /** + * Create a boolean RLE decoder. + * + * Unlike the other RLE decoders, the boolean decoder sets the data to 0 + * if the value is masked by notNull. This is required for the notNull stream + * processing to properly apply multiple masks from nested types. + * @param input the input stream to read from + */ + std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder + (std::unique_ptr<SeekableInputStream> input); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc index b4b5860cad..91c2904038 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc +++ b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc @@ -1,747 +1,747 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
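As the ByteRLE.hh header above notes, the boolean decoder writes 0 for any position masked off by notNull, so nested types can apply several present masks in sequence. A hedged usage sketch of that contract; the encoded bytes and the in-memory SeekableArrayInputStream from io/InputStream.hh are assumptions made only for illustration:

    #include <memory>
    #include "ByteRLE.hh"
    #include "io/InputStream.hh"

    void decodeBooleansWithNulls() {
      // Assumed input: a byte-RLE run header 0x00 followed by 0xff, i.e. a
      // short run of all-ones bytes feeding the bit-packed boolean values.
      static const char encoded[] = {0x00, '\xff'};
      std::unique_ptr<orc::SeekableInputStream> stream(
          new orc::SeekableArrayInputStream(encoded, sizeof(encoded)));
      std::unique_ptr<orc::ByteRleDecoder> decoder =
          orc::createBooleanRleDecoder(std::move(stream));

      char values[8];
      char notNull[8] = {1, 1, 0, 1, 1, 0, 1, 1};   // rows 2 and 5 are null
      decoder->next(values, 8, notNull);
      // values[2] and values[5] come back as 0; the other six positions are
      // decoded from the stream (here, all 1s).
    }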
- */ - -#include "orc/ColumnPrinter.hh" -#include "orc/orc-config.hh" - -#include "Adaptor.hh" - -#include <limits> -#include <sstream> -#include <stdexcept> -#include <time.h> -#include <typeinfo> - -#ifdef __clang__ - #pragma clang diagnostic ignored "-Wformat-security" -#endif - -namespace orc { - - class VoidColumnPrinter: public ColumnPrinter { - public: - VoidColumnPrinter(std::string&); - ~VoidColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class BooleanColumnPrinter: public ColumnPrinter { - private: - const int64_t* data; - public: - BooleanColumnPrinter(std::string&); - ~BooleanColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class LongColumnPrinter: public ColumnPrinter { - private: - const int64_t* data; - public: - LongColumnPrinter(std::string&); - ~LongColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class DoubleColumnPrinter: public ColumnPrinter { - private: - const double* data; - const bool isFloat; - - public: - DoubleColumnPrinter(std::string&, const Type& type); - virtual ~DoubleColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class TimestampColumnPrinter: public ColumnPrinter { - private: - const int64_t* seconds; - const int64_t* nanoseconds; - - public: - TimestampColumnPrinter(std::string&); - ~TimestampColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class DateColumnPrinter: public ColumnPrinter { - private: - const int64_t* data; - - public: - DateColumnPrinter(std::string&); - ~DateColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class Decimal64ColumnPrinter: public ColumnPrinter { - private: - const int64_t* data; - int32_t scale; - public: - Decimal64ColumnPrinter(std::string&); - ~Decimal64ColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class Decimal128ColumnPrinter: public ColumnPrinter { - private: - const Int128* data; - int32_t scale; - public: - Decimal128ColumnPrinter(std::string&); - ~Decimal128ColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class StringColumnPrinter: public ColumnPrinter { - private: - const char* const * start; - const int64_t* length; - public: - StringColumnPrinter(std::string&); - virtual ~StringColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class BinaryColumnPrinter: public ColumnPrinter { - private: - const char* const * start; - const int64_t* length; - public: - BinaryColumnPrinter(std::string&); - virtual ~BinaryColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class ListColumnPrinter: public ColumnPrinter { - private: - const int64_t* offsets; - std::unique_ptr<ColumnPrinter> elementPrinter; - - public: - ListColumnPrinter(std::string&, const Type& type); - virtual ~ListColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) 
override; - }; - - class MapColumnPrinter: public ColumnPrinter { - private: - const int64_t* offsets; - std::unique_ptr<ColumnPrinter> keyPrinter; - std::unique_ptr<ColumnPrinter> elementPrinter; - - public: - MapColumnPrinter(std::string&, const Type& type); - virtual ~MapColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class UnionColumnPrinter: public ColumnPrinter { - private: - const unsigned char *tags; - const uint64_t* offsets; - std::vector<ColumnPrinter*> fieldPrinter; - - public: - UnionColumnPrinter(std::string&, const Type& type); - virtual ~UnionColumnPrinter() override; - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class StructColumnPrinter: public ColumnPrinter { - private: - std::vector<ColumnPrinter*> fieldPrinter; - std::vector<std::string> fieldNames; - public: - StructColumnPrinter(std::string&, const Type& type); - virtual ~StructColumnPrinter() override; - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - void writeChar(std::string& file, char ch) { - file += ch; - } - - void writeString(std::string& file, const char *ptr) { - size_t len = strlen(ptr); - file.append(ptr, len); - } - - ColumnPrinter::ColumnPrinter(std::string& _buffer - ): buffer(_buffer) { - notNull = nullptr; - hasNulls = false; - } - - ColumnPrinter::~ColumnPrinter() { - // PASS - } - - void ColumnPrinter::reset(const ColumnVectorBatch& batch) { - hasNulls = batch.hasNulls; - if (hasNulls) { - notNull = batch.notNull.data(); - } else { - notNull = nullptr ; - } - } - - std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer, - const Type* type) { - ColumnPrinter *result = nullptr; - if (type == nullptr) { - result = new VoidColumnPrinter(buffer); - } else { - switch(static_cast<int64_t>(type->getKind())) { - case BOOLEAN: - result = new BooleanColumnPrinter(buffer); - break; - - case BYTE: - case SHORT: - case INT: - case LONG: - result = new LongColumnPrinter(buffer); - break; - - case FLOAT: - case DOUBLE: - result = new DoubleColumnPrinter(buffer, *type); - break; - - case STRING: - case VARCHAR : - case CHAR: - result = new StringColumnPrinter(buffer); - break; - - case BINARY: - result = new BinaryColumnPrinter(buffer); - break; - - case TIMESTAMP: - result = new TimestampColumnPrinter(buffer); - break; - - case LIST: - result = new ListColumnPrinter(buffer, *type); - break; - - case MAP: - result = new MapColumnPrinter(buffer, *type); - break; - - case STRUCT: - result = new StructColumnPrinter(buffer, *type); - break; - - case DECIMAL: - if (type->getPrecision() == 0 || type->getPrecision() > 18) { - result = new Decimal128ColumnPrinter(buffer); - } else { - result = new Decimal64ColumnPrinter(buffer); - } - break; - - case DATE: - result = new DateColumnPrinter(buffer); - break; - - case UNION: - result = new UnionColumnPrinter(buffer, *type); - break; - - default: - throw std::logic_error("unknown batch type"); - } - } - return std::unique_ptr<ColumnPrinter>(result); - } - - VoidColumnPrinter::VoidColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer) { - // PASS - } - - void VoidColumnPrinter::reset(const ColumnVectorBatch&) { - // PASS - } - - void VoidColumnPrinter::printRow(uint64_t) { - writeString(buffer, "null"); - } - - LongColumnPrinter::LongColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - data(nullptr) { - // PASS - } - - void 
LongColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); - } - - void LongColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d", - static_cast<int64_t >(data[rowId])); - writeString(buffer, numBuffer); - } - } - - DoubleColumnPrinter::DoubleColumnPrinter(std::string& _buffer, - const Type& type - ): ColumnPrinter(_buffer), - data(nullptr), - isFloat(type.getKind() == FLOAT){ - // PASS - } - - void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - data = dynamic_cast<const DoubleVectorBatch&>(batch).data.data(); - } - - void DoubleColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), isFloat ? "%.7g" : "%.14g", - data[rowId]); - writeString(buffer, numBuffer); - } - } - - Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - data(nullptr), - scale(0) { - // PASS - } - - void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - data = dynamic_cast<const Decimal64VectorBatch&>(batch).values.data(); - scale = dynamic_cast<const Decimal64VectorBatch&>(batch).scale; - } - - std::string toDecimalString(int64_t value, int32_t scale) { - std::stringstream buffer; - if (scale == 0) { - buffer << value; - return buffer.str(); - } - std::string sign = ""; - if (value < 0) { - sign = "-"; - value = -value; - } - buffer << value; - std::string str = buffer.str(); - int32_t len = static_cast<int32_t>(str.length()); - if (len > scale) { - return sign + str.substr(0, static_cast<size_t>(len - scale)) + "." + - str.substr(static_cast<size_t>(len - scale), - static_cast<size_t>(scale)); - } else if (len == scale) { - return sign + "0." 
+ str; - } else { - std::string result = sign + "0."; - for(int32_t i=0; i < scale - len; ++i) { - result += "0"; - } - return result + str; - } - } - - void Decimal64ColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeString(buffer, toDecimalString(data[rowId], scale).c_str()); - } - } - - Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - data(nullptr), - scale(0) { - // PASS - } - - void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - data = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data(); - scale = dynamic_cast<const Decimal128VectorBatch&>(batch).scale; - } - - void Decimal128ColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeString(buffer, data[rowId].toDecimalString(scale).c_str()); - } - } - - StringColumnPrinter::StringColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - start(nullptr), - length(nullptr) { - // PASS - } - - void StringColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - start = dynamic_cast<const StringVectorBatch&>(batch).data.data(); - length = dynamic_cast<const StringVectorBatch&>(batch).length.data(); - } - - void StringColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeChar(buffer, '"'); - for(int64_t i=0; i < length[rowId]; ++i) { - char ch = static_cast<char>(start[rowId][i]); - switch (ch) { - case '\\': - writeString(buffer, "\\\\"); - break; - case '\b': - writeString(buffer, "\\b"); - break; - case '\f': - writeString(buffer, "\\f"); - break; - case '\n': - writeString(buffer, "\\n"); - break; - case '\r': - writeString(buffer, "\\r"); - break; - case '\t': - writeString(buffer, "\\t"); - break; - case '"': - writeString(buffer, "\\\""); - break; - default: - writeChar(buffer, ch); - break; - } - } - writeChar(buffer, '"'); - } - } - - ListColumnPrinter::ListColumnPrinter(std::string& _buffer, - const Type& type - ): ColumnPrinter(_buffer), - offsets(nullptr) { - elementPrinter = createColumnPrinter(buffer, type.getSubtype(0)); - } - - void ListColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - offsets = dynamic_cast<const ListVectorBatch&>(batch).offsets.data(); - elementPrinter->reset(*dynamic_cast<const ListVectorBatch&>(batch). 
- elements); - } - - void ListColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeChar(buffer, '['); - for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) { - if (i != offsets[rowId]) { - writeString(buffer, ", "); - } - elementPrinter->printRow(static_cast<uint64_t>(i)); - } - writeChar(buffer, ']'); - } - } - - MapColumnPrinter::MapColumnPrinter(std::string& _buffer, - const Type& type - ): ColumnPrinter(_buffer), - offsets(nullptr) { - keyPrinter = createColumnPrinter(buffer, type.getSubtype(0)); - elementPrinter = createColumnPrinter(buffer, type.getSubtype(1)); - } - - void MapColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - const MapVectorBatch& myBatch = dynamic_cast<const MapVectorBatch&>(batch); - offsets = myBatch.offsets.data(); - keyPrinter->reset(*myBatch.keys); - elementPrinter->reset(*myBatch.elements); - } - - void MapColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeChar(buffer, '['); - for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) { - if (i != offsets[rowId]) { - writeString(buffer, ", "); - } - writeString(buffer, "{\"key\": "); - keyPrinter->printRow(static_cast<uint64_t>(i)); - writeString(buffer, ", \"value\": "); - elementPrinter->printRow(static_cast<uint64_t>(i)); - writeChar(buffer, '}'); - } - writeChar(buffer, ']'); - } - } - - UnionColumnPrinter::UnionColumnPrinter(std::string& _buffer, - const Type& type - ): ColumnPrinter(_buffer), - tags(nullptr), - offsets(nullptr) { - for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { - fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i)) - .release()); - } - } - - UnionColumnPrinter::~UnionColumnPrinter() { - for (size_t i = 0; i < fieldPrinter.size(); i++) { - delete fieldPrinter[i]; - } - } - - void UnionColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - const UnionVectorBatch& unionBatch = - dynamic_cast<const UnionVectorBatch&>(batch); - tags = unionBatch.tags.data(); - offsets = unionBatch.offsets.data(); - for(size_t i=0; i < fieldPrinter.size(); ++i) { - fieldPrinter[i]->reset(*(unionBatch.children[i])); - } - } - - void UnionColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeString(buffer, "{\"tag\": "); - char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d", - static_cast<int64_t>(tags[rowId])); - writeString(buffer, numBuffer); - writeString(buffer, ", \"value\": "); - fieldPrinter[tags[rowId]]->printRow(offsets[rowId]); - writeChar(buffer, '}'); - } - } - - StructColumnPrinter::StructColumnPrinter(std::string& _buffer, - const Type& type - ): ColumnPrinter(_buffer) { - for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { - fieldNames.push_back(type.getFieldName(i)); - fieldPrinter.push_back(createColumnPrinter(buffer, - type.getSubtype(i)) - .release()); - } - } - - StructColumnPrinter::~StructColumnPrinter() { - for (size_t i = 0; i < fieldPrinter.size(); i++) { - delete fieldPrinter[i]; - } - } - - void StructColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - const StructVectorBatch& structBatch = - dynamic_cast<const StructVectorBatch&>(batch); - for(size_t i=0; i < fieldPrinter.size(); ++i) { - fieldPrinter[i]->reset(*(structBatch.fields[i])); - } - } - - void 
StructColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeChar(buffer, '{'); - for(unsigned int i=0; i < fieldPrinter.size(); ++i) { - if (i != 0) { - writeString(buffer, ", "); - } - writeChar(buffer, '"'); - writeString(buffer, fieldNames[i].c_str()); - writeString(buffer, "\": "); - fieldPrinter[i]->printRow(rowId); - } - writeChar(buffer, '}'); - } - } - - DateColumnPrinter::DateColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - data(nullptr) { - // PASS - } - - void DateColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - const time_t timeValue = data[rowId] * 24 * 60 * 60; - struct tm tmValue; - gmtime_r(&timeValue, &tmValue); - char timeBuffer[11]; - strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d", &tmValue); - writeChar(buffer, '"'); - writeString(buffer, timeBuffer); - writeChar(buffer, '"'); - } - } - - void DateColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); - } - - BooleanColumnPrinter::BooleanColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - data(nullptr) { - // PASS - } - - void BooleanColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeString(buffer, (data[rowId] ? "true" : "false")); - } - } - - void BooleanColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); - } - - BinaryColumnPrinter::BinaryColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - start(nullptr), - length(nullptr) { - // PASS - } - - void BinaryColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeChar(buffer, '['); - for(int64_t i=0; i < length[rowId]; ++i) { - if (i != 0) { - writeString(buffer, ", "); - } - char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), "%d", - (static_cast<const int>(start[rowId][i]) & 0xff)); - writeString(buffer, numBuffer); - } - writeChar(buffer, ']'); - } - } - - void BinaryColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - start = dynamic_cast<const StringVectorBatch&>(batch).data.data(); - length = dynamic_cast<const StringVectorBatch&>(batch).length.data(); - } - - TimestampColumnPrinter::TimestampColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - seconds(nullptr), - nanoseconds(nullptr) { - // PASS - } - - void TimestampColumnPrinter::printRow(uint64_t rowId) { - const int64_t NANO_DIGITS = 9; - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - int64_t nanos = nanoseconds[rowId]; - time_t secs = static_cast<time_t>(seconds[rowId]); - struct tm tmValue; - gmtime_r(&secs, &tmValue); - char timeBuffer[20]; - strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); - writeChar(buffer, '"'); - writeString(buffer, timeBuffer); - writeChar(buffer, '.'); - // remove trailing zeros off the back of the nanos value. 
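The fractional seconds above are printed with a shrinking field width: trailing zeros are stripped from the nanosecond count and the printed width drops by the same amount, so 123000000 ns becomes ".123" and 0 ns becomes ".0". A standalone sketch of the same arithmetic, using <cinttypes> in place of the INT64_FORMAT_STRING macro from Adaptor.hh:

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>
    #include <string>

    // Return the digits printed after the decimal point for a nanosecond value.
    std::string fractionDigits(int64_t nanos) {
      int zeroDigits = 0;
      if (nanos == 0) {
        zeroDigits = 8;                  // whole seconds print a single "0"
      } else {
        while (nanos % 10 == 0) {        // strip trailing zeros
          nanos /= 10;
          zeroDigits += 1;
        }
      }
      char buf[32];
      std::snprintf(buf, sizeof(buf), "%0*" PRId64, 9 - zeroDigits, nanos);
      return buf;                        // 123000000 -> "123", 0 -> "0"
    }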
- int64_t zeroDigits = 0; - if (nanos == 0) { - zeroDigits = 8; - } else { - while (nanos % 10 == 0) { - nanos /= 10; - zeroDigits += 1; - } - } - char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), - "%0*" INT64_FORMAT_STRING "d\"", - static_cast<int>(NANO_DIGITS - zeroDigits), - static_cast<int64_t >(nanos)); - writeString(buffer, numBuffer); - } - } - - void TimestampColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - const TimestampVectorBatch& ts = - dynamic_cast<const TimestampVectorBatch&>(batch); - seconds = ts.data.data(); - nanoseconds = ts.nanoseconds.data(); - } -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/ColumnPrinter.hh" +#include "orc/orc-config.hh" + +#include "Adaptor.hh" + +#include <limits> +#include <sstream> +#include <stdexcept> +#include <time.h> +#include <typeinfo> + +#ifdef __clang__ + #pragma clang diagnostic ignored "-Wformat-security" +#endif + +namespace orc { + + class VoidColumnPrinter: public ColumnPrinter { + public: + VoidColumnPrinter(std::string&); + ~VoidColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class BooleanColumnPrinter: public ColumnPrinter { + private: + const int64_t* data; + public: + BooleanColumnPrinter(std::string&); + ~BooleanColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class LongColumnPrinter: public ColumnPrinter { + private: + const int64_t* data; + public: + LongColumnPrinter(std::string&); + ~LongColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class DoubleColumnPrinter: public ColumnPrinter { + private: + const double* data; + const bool isFloat; + + public: + DoubleColumnPrinter(std::string&, const Type& type); + virtual ~DoubleColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class TimestampColumnPrinter: public ColumnPrinter { + private: + const int64_t* seconds; + const int64_t* nanoseconds; + + public: + TimestampColumnPrinter(std::string&); + ~TimestampColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class DateColumnPrinter: public ColumnPrinter { + private: + const int64_t* data; + + public: + DateColumnPrinter(std::string&); + ~DateColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class Decimal64ColumnPrinter: public ColumnPrinter { + private: + const int64_t* data; + int32_t scale; + public: + 
Decimal64ColumnPrinter(std::string&); + ~Decimal64ColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class Decimal128ColumnPrinter: public ColumnPrinter { + private: + const Int128* data; + int32_t scale; + public: + Decimal128ColumnPrinter(std::string&); + ~Decimal128ColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class StringColumnPrinter: public ColumnPrinter { + private: + const char* const * start; + const int64_t* length; + public: + StringColumnPrinter(std::string&); + virtual ~StringColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class BinaryColumnPrinter: public ColumnPrinter { + private: + const char* const * start; + const int64_t* length; + public: + BinaryColumnPrinter(std::string&); + virtual ~BinaryColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class ListColumnPrinter: public ColumnPrinter { + private: + const int64_t* offsets; + std::unique_ptr<ColumnPrinter> elementPrinter; + + public: + ListColumnPrinter(std::string&, const Type& type); + virtual ~ListColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class MapColumnPrinter: public ColumnPrinter { + private: + const int64_t* offsets; + std::unique_ptr<ColumnPrinter> keyPrinter; + std::unique_ptr<ColumnPrinter> elementPrinter; + + public: + MapColumnPrinter(std::string&, const Type& type); + virtual ~MapColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class UnionColumnPrinter: public ColumnPrinter { + private: + const unsigned char *tags; + const uint64_t* offsets; + std::vector<ColumnPrinter*> fieldPrinter; + + public: + UnionColumnPrinter(std::string&, const Type& type); + virtual ~UnionColumnPrinter() override; + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class StructColumnPrinter: public ColumnPrinter { + private: + std::vector<ColumnPrinter*> fieldPrinter; + std::vector<std::string> fieldNames; + public: + StructColumnPrinter(std::string&, const Type& type); + virtual ~StructColumnPrinter() override; + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + void writeChar(std::string& file, char ch) { + file += ch; + } + + void writeString(std::string& file, const char *ptr) { + size_t len = strlen(ptr); + file.append(ptr, len); + } + + ColumnPrinter::ColumnPrinter(std::string& _buffer + ): buffer(_buffer) { + notNull = nullptr; + hasNulls = false; + } + + ColumnPrinter::~ColumnPrinter() { + // PASS + } + + void ColumnPrinter::reset(const ColumnVectorBatch& batch) { + hasNulls = batch.hasNulls; + if (hasNulls) { + notNull = batch.notNull.data(); + } else { + notNull = nullptr ; + } + } + + std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer, + const Type* type) { + ColumnPrinter *result = nullptr; + if (type == nullptr) { + result = new VoidColumnPrinter(buffer); + } else { + switch(static_cast<int64_t>(type->getKind())) { + case BOOLEAN: + result = new BooleanColumnPrinter(buffer); + break; + + case BYTE: + case SHORT: + case INT: + case LONG: + result = new LongColumnPrinter(buffer); + 
break; + + case FLOAT: + case DOUBLE: + result = new DoubleColumnPrinter(buffer, *type); + break; + + case STRING: + case VARCHAR : + case CHAR: + result = new StringColumnPrinter(buffer); + break; + + case BINARY: + result = new BinaryColumnPrinter(buffer); + break; + + case TIMESTAMP: + result = new TimestampColumnPrinter(buffer); + break; + + case LIST: + result = new ListColumnPrinter(buffer, *type); + break; + + case MAP: + result = new MapColumnPrinter(buffer, *type); + break; + + case STRUCT: + result = new StructColumnPrinter(buffer, *type); + break; + + case DECIMAL: + if (type->getPrecision() == 0 || type->getPrecision() > 18) { + result = new Decimal128ColumnPrinter(buffer); + } else { + result = new Decimal64ColumnPrinter(buffer); + } + break; + + case DATE: + result = new DateColumnPrinter(buffer); + break; + + case UNION: + result = new UnionColumnPrinter(buffer, *type); + break; + + default: + throw std::logic_error("unknown batch type"); + } + } + return std::unique_ptr<ColumnPrinter>(result); + } + + VoidColumnPrinter::VoidColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer) { + // PASS + } + + void VoidColumnPrinter::reset(const ColumnVectorBatch&) { + // PASS + } + + void VoidColumnPrinter::printRow(uint64_t) { + writeString(buffer, "null"); + } + + LongColumnPrinter::LongColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + data(nullptr) { + // PASS + } + + void LongColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); + } + + void LongColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + char numBuffer[64]; + snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d", + static_cast<int64_t >(data[rowId])); + writeString(buffer, numBuffer); + } + } + + DoubleColumnPrinter::DoubleColumnPrinter(std::string& _buffer, + const Type& type + ): ColumnPrinter(_buffer), + data(nullptr), + isFloat(type.getKind() == FLOAT){ + // PASS + } + + void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + data = dynamic_cast<const DoubleVectorBatch&>(batch).data.data(); + } + + void DoubleColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + char numBuffer[64]; + snprintf(numBuffer, sizeof(numBuffer), isFloat ? "%.7g" : "%.14g", + data[rowId]); + writeString(buffer, numBuffer); + } + } + + Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + data(nullptr), + scale(0) { + // PASS + } + + void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + data = dynamic_cast<const Decimal64VectorBatch&>(batch).values.data(); + scale = dynamic_cast<const Decimal64VectorBatch&>(batch).scale; + } + + std::string toDecimalString(int64_t value, int32_t scale) { + std::stringstream buffer; + if (scale == 0) { + buffer << value; + return buffer.str(); + } + std::string sign = ""; + if (value < 0) { + sign = "-"; + value = -value; + } + buffer << value; + std::string str = buffer.str(); + int32_t len = static_cast<int32_t>(str.length()); + if (len > scale) { + return sign + str.substr(0, static_cast<size_t>(len - scale)) + "." + + str.substr(static_cast<size_t>(len - scale), + static_cast<size_t>(scale)); + } else if (len == scale) { + return sign + "0." 
+ str; + } else { + std::string result = sign + "0."; + for(int32_t i=0; i < scale - len; ++i) { + result += "0"; + } + return result + str; + } + } + + void Decimal64ColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeString(buffer, toDecimalString(data[rowId], scale).c_str()); + } + } + + Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + data(nullptr), + scale(0) { + // PASS + } + + void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + data = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data(); + scale = dynamic_cast<const Decimal128VectorBatch&>(batch).scale; + } + + void Decimal128ColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeString(buffer, data[rowId].toDecimalString(scale).c_str()); + } + } + + StringColumnPrinter::StringColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + start(nullptr), + length(nullptr) { + // PASS + } + + void StringColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + start = dynamic_cast<const StringVectorBatch&>(batch).data.data(); + length = dynamic_cast<const StringVectorBatch&>(batch).length.data(); + } + + void StringColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeChar(buffer, '"'); + for(int64_t i=0; i < length[rowId]; ++i) { + char ch = static_cast<char>(start[rowId][i]); + switch (ch) { + case '\\': + writeString(buffer, "\\\\"); + break; + case '\b': + writeString(buffer, "\\b"); + break; + case '\f': + writeString(buffer, "\\f"); + break; + case '\n': + writeString(buffer, "\\n"); + break; + case '\r': + writeString(buffer, "\\r"); + break; + case '\t': + writeString(buffer, "\\t"); + break; + case '"': + writeString(buffer, "\\\""); + break; + default: + writeChar(buffer, ch); + break; + } + } + writeChar(buffer, '"'); + } + } + + ListColumnPrinter::ListColumnPrinter(std::string& _buffer, + const Type& type + ): ColumnPrinter(_buffer), + offsets(nullptr) { + elementPrinter = createColumnPrinter(buffer, type.getSubtype(0)); + } + + void ListColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + offsets = dynamic_cast<const ListVectorBatch&>(batch).offsets.data(); + elementPrinter->reset(*dynamic_cast<const ListVectorBatch&>(batch). 
+ elements); + } + + void ListColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeChar(buffer, '['); + for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) { + if (i != offsets[rowId]) { + writeString(buffer, ", "); + } + elementPrinter->printRow(static_cast<uint64_t>(i)); + } + writeChar(buffer, ']'); + } + } + + MapColumnPrinter::MapColumnPrinter(std::string& _buffer, + const Type& type + ): ColumnPrinter(_buffer), + offsets(nullptr) { + keyPrinter = createColumnPrinter(buffer, type.getSubtype(0)); + elementPrinter = createColumnPrinter(buffer, type.getSubtype(1)); + } + + void MapColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + const MapVectorBatch& myBatch = dynamic_cast<const MapVectorBatch&>(batch); + offsets = myBatch.offsets.data(); + keyPrinter->reset(*myBatch.keys); + elementPrinter->reset(*myBatch.elements); + } + + void MapColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeChar(buffer, '['); + for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) { + if (i != offsets[rowId]) { + writeString(buffer, ", "); + } + writeString(buffer, "{\"key\": "); + keyPrinter->printRow(static_cast<uint64_t>(i)); + writeString(buffer, ", \"value\": "); + elementPrinter->printRow(static_cast<uint64_t>(i)); + writeChar(buffer, '}'); + } + writeChar(buffer, ']'); + } + } + + UnionColumnPrinter::UnionColumnPrinter(std::string& _buffer, + const Type& type + ): ColumnPrinter(_buffer), + tags(nullptr), + offsets(nullptr) { + for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { + fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i)) + .release()); + } + } + + UnionColumnPrinter::~UnionColumnPrinter() { + for (size_t i = 0; i < fieldPrinter.size(); i++) { + delete fieldPrinter[i]; + } + } + + void UnionColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + const UnionVectorBatch& unionBatch = + dynamic_cast<const UnionVectorBatch&>(batch); + tags = unionBatch.tags.data(); + offsets = unionBatch.offsets.data(); + for(size_t i=0; i < fieldPrinter.size(); ++i) { + fieldPrinter[i]->reset(*(unionBatch.children[i])); + } + } + + void UnionColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeString(buffer, "{\"tag\": "); + char numBuffer[64]; + snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d", + static_cast<int64_t>(tags[rowId])); + writeString(buffer, numBuffer); + writeString(buffer, ", \"value\": "); + fieldPrinter[tags[rowId]]->printRow(offsets[rowId]); + writeChar(buffer, '}'); + } + } + + StructColumnPrinter::StructColumnPrinter(std::string& _buffer, + const Type& type + ): ColumnPrinter(_buffer) { + for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { + fieldNames.push_back(type.getFieldName(i)); + fieldPrinter.push_back(createColumnPrinter(buffer, + type.getSubtype(i)) + .release()); + } + } + + StructColumnPrinter::~StructColumnPrinter() { + for (size_t i = 0; i < fieldPrinter.size(); i++) { + delete fieldPrinter[i]; + } + } + + void StructColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + const StructVectorBatch& structBatch = + dynamic_cast<const StructVectorBatch&>(batch); + for(size_t i=0; i < fieldPrinter.size(); ++i) { + fieldPrinter[i]->reset(*(structBatch.fields[i])); + } + } + + void 
StructColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeChar(buffer, '{'); + for(unsigned int i=0; i < fieldPrinter.size(); ++i) { + if (i != 0) { + writeString(buffer, ", "); + } + writeChar(buffer, '"'); + writeString(buffer, fieldNames[i].c_str()); + writeString(buffer, "\": "); + fieldPrinter[i]->printRow(rowId); + } + writeChar(buffer, '}'); + } + } + + DateColumnPrinter::DateColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + data(nullptr) { + // PASS + } + + void DateColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + const time_t timeValue = data[rowId] * 24 * 60 * 60; + struct tm tmValue; + gmtime_r(&timeValue, &tmValue); + char timeBuffer[11]; + strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d", &tmValue); + writeChar(buffer, '"'); + writeString(buffer, timeBuffer); + writeChar(buffer, '"'); + } + } + + void DateColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); + } + + BooleanColumnPrinter::BooleanColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + data(nullptr) { + // PASS + } + + void BooleanColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeString(buffer, (data[rowId] ? "true" : "false")); + } + } + + void BooleanColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); + } + + BinaryColumnPrinter::BinaryColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + start(nullptr), + length(nullptr) { + // PASS + } + + void BinaryColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeChar(buffer, '['); + for(int64_t i=0; i < length[rowId]; ++i) { + if (i != 0) { + writeString(buffer, ", "); + } + char numBuffer[64]; + snprintf(numBuffer, sizeof(numBuffer), "%d", + (static_cast<const int>(start[rowId][i]) & 0xff)); + writeString(buffer, numBuffer); + } + writeChar(buffer, ']'); + } + } + + void BinaryColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + start = dynamic_cast<const StringVectorBatch&>(batch).data.data(); + length = dynamic_cast<const StringVectorBatch&>(batch).length.data(); + } + + TimestampColumnPrinter::TimestampColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + seconds(nullptr), + nanoseconds(nullptr) { + // PASS + } + + void TimestampColumnPrinter::printRow(uint64_t rowId) { + const int64_t NANO_DIGITS = 9; + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + int64_t nanos = nanoseconds[rowId]; + time_t secs = static_cast<time_t>(seconds[rowId]); + struct tm tmValue; + gmtime_r(&secs, &tmValue); + char timeBuffer[20]; + strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); + writeChar(buffer, '"'); + writeString(buffer, timeBuffer); + writeChar(buffer, '.'); + // remove trailing zeros off the back of the nanos value. 
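The decimal printers earlier in this file rely on toDecimalString, which splits the unscaled integer on the scale and zero-pads when the digit count is at or below the scale. A few illustrative cases, assuming the check is linked against this file where the function is defined:

    #include <cassert>
    #include <cstdint>
    #include <string>

    namespace orc {
      std::string toDecimalString(int64_t value, int32_t scale);  // defined above
    }

    void checkToDecimalString() {
      assert(orc::toDecimalString(12345, 2) == "123.45");  // more digits than scale
      assert(orc::toDecimalString(45, 2)    == "0.45");    // digit count == scale
      assert(orc::toDecimalString(-45, 3)   == "-0.045");  // fewer digits: zero padded
      assert(orc::toDecimalString(7, 0)     == "7");       // scale 0: no decimal point
    }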
+ int64_t zeroDigits = 0; + if (nanos == 0) { + zeroDigits = 8; + } else { + while (nanos % 10 == 0) { + nanos /= 10; + zeroDigits += 1; + } + } + char numBuffer[64]; + snprintf(numBuffer, sizeof(numBuffer), + "%0*" INT64_FORMAT_STRING "d\"", + static_cast<int>(NANO_DIGITS - zeroDigits), + static_cast<int64_t >(nanos)); + writeString(buffer, numBuffer); + } + } + + void TimestampColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + const TimestampVectorBatch& ts = + dynamic_cast<const TimestampVectorBatch&>(batch); + seconds = ts.data.data(); + nanoseconds = ts.nanoseconds.data(); + } +} diff --git a/contrib/libs/apache/orc/c++/src/ColumnReader.cc b/contrib/libs/apache/orc/c++/src/ColumnReader.cc index 8cf660be11..aa891f5074 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnReader.cc +++ b/contrib/libs/apache/orc/c++/src/ColumnReader.cc @@ -1,1836 +1,1836 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "orc/Int128.hh" - -#include "Adaptor.hh" -#include "ByteRLE.hh" -#include "ColumnReader.hh" -#include "orc/Exceptions.hh" -#include "RLE.hh" - -#include <math.h> -#include <iostream> - -namespace orc { - - StripeStreams::~StripeStreams() { - // PASS - } - - inline RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) { - switch (static_cast<int64_t>(kind)) { - case proto::ColumnEncoding_Kind_DIRECT: - case proto::ColumnEncoding_Kind_DICTIONARY: - return RleVersion_1; - case proto::ColumnEncoding_Kind_DIRECT_V2: - case proto::ColumnEncoding_Kind_DICTIONARY_V2: - return RleVersion_2; - default: - throw ParseError("Unknown encoding in convertRleVersion"); - } - } - - ColumnReader::ColumnReader(const Type& type, - StripeStreams& stripe - ): columnId(type.getColumnId()), - memoryPool(stripe.getMemoryPool()) { - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_PRESENT, true); - if (stream.get()) { - notNullDecoder = createBooleanRleDecoder(std::move(stream)); - } - } - - ColumnReader::~ColumnReader() { - // PASS - } - - uint64_t ColumnReader::skip(uint64_t numValues) { - ByteRleDecoder* decoder = notNullDecoder.get(); - if (decoder) { - // page through the values that we want to skip - // and count how many are non-null - const size_t MAX_BUFFER_SIZE = 32768; - size_t bufferSize = std::min(MAX_BUFFER_SIZE, - static_cast<size_t>(numValues)); - char buffer[MAX_BUFFER_SIZE]; - uint64_t remaining = numValues; - while (remaining > 0) { - uint64_t chunkSize = - std::min(remaining, - static_cast<uint64_t>(bufferSize)); - decoder->next(buffer, chunkSize, nullptr); - remaining -= chunkSize; - for(uint64_t i=0; i < chunkSize; ++i) { - if (!buffer[i]) { - numValues -= 1; - } - } - } - } - return numValues; - } - - void 
ColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* incomingMask) { - if (numValues > rowBatch.capacity) { - rowBatch.resize(numValues); - } - rowBatch.numElements = numValues; - ByteRleDecoder* decoder = notNullDecoder.get(); - if (decoder) { - char* notNullArray = rowBatch.notNull.data(); - decoder->next(notNullArray, numValues, incomingMask); - // check to see if there are nulls in this batch - for(uint64_t i=0; i < numValues; ++i) { - if (!notNullArray[i]) { - rowBatch.hasNulls = true; - return; - } - } - } else if (incomingMask) { - // If we don't have a notNull stream, copy the incomingMask - rowBatch.hasNulls = true; - memcpy(rowBatch.notNull.data(), incomingMask, numValues); - return; - } - rowBatch.hasNulls = false; - } - - void ColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - if (notNullDecoder.get()) { - notNullDecoder->seek(positions.at(columnId)); - } - } - - /** - * Expand an array of bytes in place to the corresponding array of longs. - * Has to work backwards so that they data isn't clobbered during the - * expansion. - * @param buffer the array of chars and array of longs that need to be - * expanded - * @param numValues the number of bytes to convert to longs - */ - void expandBytesToLongs(int64_t* buffer, uint64_t numValues) { - for(size_t i=numValues - 1; i < numValues; --i) { - buffer[i] = reinterpret_cast<char *>(buffer)[i]; - } - } - - class BooleanColumnReader: public ColumnReader { - private: - std::unique_ptr<orc::ByteRleDecoder> rle; - - public: - BooleanColumnReader(const Type& type, StripeStreams& stipe); - ~BooleanColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - - BooleanColumnReader::BooleanColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe){ - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (stream == nullptr) - throw ParseError("DATA stream not found in Boolean column"); - rle = createBooleanRleDecoder(std::move(stream)); - } - - BooleanColumnReader::~BooleanColumnReader() { - // PASS - } - - uint64_t BooleanColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - rle->skip(numValues); - return numValues; - } - - void BooleanColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - // Since the byte rle places the output in a char* instead of long*, - // we cheat here and use the long* and then expand it in a second pass. - int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data(); - rle->next(reinterpret_cast<char*>(ptr), - numValues, rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr); - expandBytesToLongs(ptr, numValues); - } - - void BooleanColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - } - - class ByteColumnReader: public ColumnReader { - private: - std::unique_ptr<orc::ByteRleDecoder> rle; - - public: - ByteColumnReader(const Type& type, StripeStreams& stipe); - ~ByteColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - - ByteColumnReader::ByteColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe){ - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (stream == nullptr) - throw ParseError("DATA stream not found in Byte column"); - rle = createByteRleDecoder(std::move(stream)); - } - - ByteColumnReader::~ByteColumnReader() { - // PASS - } - - uint64_t ByteColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - rle->skip(numValues); - return numValues; - } - - void ByteColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - // Since the byte rle places the output in a char* instead of long*, - // we cheat here and use the long* and then expand it in a second pass. - int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data(); - rle->next(reinterpret_cast<char*>(ptr), - numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr); - expandBytesToLongs(ptr, numValues); - } - - void ByteColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - } - - class IntegerColumnReader: public ColumnReader { - protected: - std::unique_ptr<orc::RleDecoder> rle; - - public: - IntegerColumnReader(const Type& type, StripeStreams& stripe); - ~IntegerColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - - IntegerColumnReader::IntegerColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (stream == nullptr) - throw ParseError("DATA stream not found in Integer column"); - rle = createRleDecoder(std::move(stream), true, vers, memoryPool); - } - - IntegerColumnReader::~IntegerColumnReader() { - // PASS - } - - uint64_t IntegerColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - rle->skip(numValues); - return numValues; - } - - void IntegerColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - rle->next(dynamic_cast<LongVectorBatch&>(rowBatch).data.data(), - numValues, rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr); - } - - void IntegerColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - } - - class TimestampColumnReader: public ColumnReader { - private: - std::unique_ptr<orc::RleDecoder> secondsRle; - std::unique_ptr<orc::RleDecoder> nanoRle; - const Timezone& writerTimezone; - const int64_t epochOffset; - - public: - TimestampColumnReader(const Type& type, StripeStreams& stripe); - ~TimestampColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - - - TimestampColumnReader::TimestampColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe), - writerTimezone(stripe.getWriterTimezone()), - epochOffset(writerTimezone.getEpoch()) { - RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (stream == nullptr) - throw ParseError("DATA stream not found in Timestamp column"); - secondsRle = createRleDecoder(std::move(stream), true, vers, memoryPool); - stream = stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true); - if (stream == nullptr) - throw ParseError("SECONDARY stream not found in Timestamp column"); - nanoRle = createRleDecoder(std::move(stream), false, vers, memoryPool); - } - - TimestampColumnReader::~TimestampColumnReader() { - // PASS - } - - uint64_t TimestampColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - secondsRle->skip(numValues); - nanoRle->skip(numValues); - return numValues; - } - - void TimestampColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - TimestampVectorBatch& timestampBatch = - dynamic_cast<TimestampVectorBatch&>(rowBatch); - int64_t *secsBuffer = timestampBatch.data.data(); - secondsRle->next(secsBuffer, numValues, notNull); - int64_t *nanoBuffer = timestampBatch.nanoseconds.data(); - nanoRle->next(nanoBuffer, numValues, notNull); - - // Construct the values - for(uint64_t i=0; i < numValues; i++) { - if (notNull == nullptr || notNull[i]) { - uint64_t zeros = nanoBuffer[i] & 0x7; - nanoBuffer[i] >>= 3; - if (zeros != 0) { - for(uint64_t j = 0; j <= zeros; ++j) { - nanoBuffer[i] *= 10; - } - } - int64_t writerTime = secsBuffer[i] + epochOffset; - secsBuffer[i] = writerTimezone.convertToUTC(writerTime); +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
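The nanosecond decode in TimestampColumnReader::next above reverses the writer's trailing-zero compression: the low three bits of each SECONDARY-stream value hold a count z, and when z is non-zero the remaining bits are multiplied by 10 exactly z + 1 times. For example (a value assumed for illustration), a stored 0x0a decodes as base 0x0a >> 3 = 1 with z = 0x0a & 0x7 = 2, giving 1 * 10^3 = 1000 nanoseconds. A minimal sketch of the same decode:

    #include <cstdint>

    // Decode one value from the timestamp SECONDARY stream: when the low
    // 3 bits are non-zero, the value was scaled down by 10^(bits + 1).
    int64_t decodeNanos(int64_t stored) {
      uint64_t zeros = static_cast<uint64_t>(stored) & 0x7;
      int64_t nanos = stored >> 3;
      if (zeros != 0) {
        for (uint64_t j = 0; j <= zeros; ++j) {
          nanos *= 10;                   // multiply by 10^(zeros + 1)
        }
      }
      return nanos;                      // e.g. 0x0a -> 1 * 10^3 = 1000
    }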
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/Int128.hh" + +#include "Adaptor.hh" +#include "ByteRLE.hh" +#include "ColumnReader.hh" +#include "orc/Exceptions.hh" +#include "RLE.hh" + +#include <math.h> +#include <iostream> + +namespace orc { + + StripeStreams::~StripeStreams() { + // PASS + } + + inline RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) { + switch (static_cast<int64_t>(kind)) { + case proto::ColumnEncoding_Kind_DIRECT: + case proto::ColumnEncoding_Kind_DICTIONARY: + return RleVersion_1; + case proto::ColumnEncoding_Kind_DIRECT_V2: + case proto::ColumnEncoding_Kind_DICTIONARY_V2: + return RleVersion_2; + default: + throw ParseError("Unknown encoding in convertRleVersion"); + } + } + + ColumnReader::ColumnReader(const Type& type, + StripeStreams& stripe + ): columnId(type.getColumnId()), + memoryPool(stripe.getMemoryPool()) { + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_PRESENT, true); + if (stream.get()) { + notNullDecoder = createBooleanRleDecoder(std::move(stream)); + } + } + + ColumnReader::~ColumnReader() { + // PASS + } + + uint64_t ColumnReader::skip(uint64_t numValues) { + ByteRleDecoder* decoder = notNullDecoder.get(); + if (decoder) { + // page through the values that we want to skip + // and count how many are non-null + const size_t MAX_BUFFER_SIZE = 32768; + size_t bufferSize = std::min(MAX_BUFFER_SIZE, + static_cast<size_t>(numValues)); + char buffer[MAX_BUFFER_SIZE]; + uint64_t remaining = numValues; + while (remaining > 0) { + uint64_t chunkSize = + std::min(remaining, + static_cast<uint64_t>(bufferSize)); + decoder->next(buffer, chunkSize, nullptr); + remaining -= chunkSize; + for(uint64_t i=0; i < chunkSize; ++i) { + if (!buffer[i]) { + numValues -= 1; + } + } + } + } + return numValues; + } + + void ColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* incomingMask) { + if (numValues > rowBatch.capacity) { + rowBatch.resize(numValues); + } + rowBatch.numElements = numValues; + ByteRleDecoder* decoder = notNullDecoder.get(); + if (decoder) { + char* notNullArray = rowBatch.notNull.data(); + decoder->next(notNullArray, numValues, incomingMask); + // check to see if there are nulls in this batch + for(uint64_t i=0; i < numValues; ++i) { + if (!notNullArray[i]) { + rowBatch.hasNulls = true; + return; + } + } + } else if (incomingMask) { + // If we don't have a notNull stream, copy the incomingMask + rowBatch.hasNulls = true; + memcpy(rowBatch.notNull.data(), incomingMask, numValues); + return; + } + rowBatch.hasNulls = false; + } + + void ColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + if (notNullDecoder.get()) { + notNullDecoder->seek(positions.at(columnId)); + } + } + + /** + * Expand an array of bytes in place to the corresponding array of longs. + * Has to work backwards so that they data isn't clobbered during the + * expansion. 
+ * @param buffer the array of chars and array of longs that need to be + * expanded + * @param numValues the number of bytes to convert to longs + */ + void expandBytesToLongs(int64_t* buffer, uint64_t numValues) { + for(size_t i=numValues - 1; i < numValues; --i) { + buffer[i] = reinterpret_cast<char *>(buffer)[i]; + } + } + + class BooleanColumnReader: public ColumnReader { + private: + std::unique_ptr<orc::ByteRleDecoder> rle; + + public: + BooleanColumnReader(const Type& type, StripeStreams& stipe); + ~BooleanColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + }; + + BooleanColumnReader::BooleanColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe){ + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (stream == nullptr) + throw ParseError("DATA stream not found in Boolean column"); + rle = createBooleanRleDecoder(std::move(stream)); + } + + BooleanColumnReader::~BooleanColumnReader() { + // PASS + } + + uint64_t BooleanColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + rle->skip(numValues); + return numValues; + } + + void BooleanColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + // Since the byte rle places the output in a char* instead of long*, + // we cheat here and use the long* and then expand it in a second pass. + int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data(); + rle->next(reinterpret_cast<char*>(ptr), + numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr); + expandBytesToLongs(ptr, numValues); + } + + void BooleanColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + } + + class ByteColumnReader: public ColumnReader { + private: + std::unique_ptr<orc::ByteRleDecoder> rle; + + public: + ByteColumnReader(const Type& type, StripeStreams& stipe); + ~ByteColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + }; + + ByteColumnReader::ByteColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe){ + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (stream == nullptr) + throw ParseError("DATA stream not found in Byte column"); + rle = createByteRleDecoder(std::move(stream)); + } + + ByteColumnReader::~ByteColumnReader() { + // PASS + } + + uint64_t ByteColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + rle->skip(numValues); + return numValues; + } + + void ByteColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + // Since the byte rle places the output in a char* instead of long*, + // we cheat here and use the long* and then expand it in a second pass. 
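      // A minimal sketch of that second pass, assuming a hypothetical
      // 4-value batch: expandBytesToLongs walks from the last index down to
      // 0, so every packed byte is read before the widened int64_t that
      // covers it is written.
      //
      //   int64_t buf[4];
      //   char* bytes = reinterpret_cast<char*>(buf);
      //   bytes[0] = 1; bytes[1] = 0; bytes[2] = 1; bytes[3] = 1;  // packed bytes
      //   for (size_t i = 4; i-- > 0; ) {
      //     buf[i] = bytes[i];    // backwards: bytes[i] is not clobbered yet
      //   }
      //   // buf now holds {1, 0, 1, 1} as int64_t values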
+ int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data(); + rle->next(reinterpret_cast<char*>(ptr), + numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr); + expandBytesToLongs(ptr, numValues); + } + + void ByteColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + } + + class IntegerColumnReader: public ColumnReader { + protected: + std::unique_ptr<orc::RleDecoder> rle; + + public: + IntegerColumnReader(const Type& type, StripeStreams& stripe); + ~IntegerColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + }; + + IntegerColumnReader::IntegerColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe) { + RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (stream == nullptr) + throw ParseError("DATA stream not found in Integer column"); + rle = createRleDecoder(std::move(stream), true, vers, memoryPool); + } + + IntegerColumnReader::~IntegerColumnReader() { + // PASS + } + + uint64_t IntegerColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + rle->skip(numValues); + return numValues; + } + + void IntegerColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + rle->next(dynamic_cast<LongVectorBatch&>(rowBatch).data.data(), + numValues, rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr); + } + + void IntegerColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + } + + class TimestampColumnReader: public ColumnReader { + private: + std::unique_ptr<orc::RleDecoder> secondsRle; + std::unique_ptr<orc::RleDecoder> nanoRle; + const Timezone& writerTimezone; + const int64_t epochOffset; + + public: + TimestampColumnReader(const Type& type, StripeStreams& stripe); + ~TimestampColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + }; + + + TimestampColumnReader::TimestampColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe), + writerTimezone(stripe.getWriterTimezone()), + epochOffset(writerTimezone.getEpoch()) { + RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (stream == nullptr) + throw ParseError("DATA stream not found in Timestamp column"); + secondsRle = createRleDecoder(std::move(stream), true, vers, memoryPool); + stream = stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true); + if (stream == nullptr) + throw ParseError("SECONDARY stream not found in Timestamp column"); + nanoRle = createRleDecoder(std::move(stream), false, vers, memoryPool); + } + + TimestampColumnReader::~TimestampColumnReader() { + // PASS + } + + uint64_t TimestampColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + secondsRle->skip(numValues); + nanoRle->skip(numValues); + return numValues; + } + + void TimestampColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + notNull = rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr; + TimestampVectorBatch& timestampBatch = + dynamic_cast<TimestampVectorBatch&>(rowBatch); + int64_t *secsBuffer = timestampBatch.data.data(); + secondsRle->next(secsBuffer, numValues, notNull); + int64_t *nanoBuffer = timestampBatch.nanoseconds.data(); + nanoRle->next(nanoBuffer, numValues, notNull); + + // Construct the values + for(uint64_t i=0; i < numValues; i++) { + if (notNull == nullptr || notNull[i]) { + uint64_t zeros = nanoBuffer[i] & 0x7; + nanoBuffer[i] >>= 3; + if (zeros != 0) { + for(uint64_t j = 0; j <= zeros; ++j) { + nanoBuffer[i] *= 10; + } + } + int64_t writerTime = secsBuffer[i] + epochOffset; + secsBuffer[i] = writerTimezone.convertToUTC(writerTime); if (secsBuffer[i] < 0 && nanoBuffer[i] > 999999) { - secsBuffer[i] -= 1; - } - } - } - } - - void TimestampColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - secondsRle->seek(positions.at(columnId)); - nanoRle->seek(positions.at(columnId)); - } - - class DoubleColumnReader: public ColumnReader { - public: - DoubleColumnReader(const Type& type, StripeStreams& stripe); - ~DoubleColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - - private: - std::unique_ptr<SeekableInputStream> inputStream; - TypeKind columnKind; - const uint64_t bytesPerValue ; - const char *bufferPointer; - const char *bufferEnd; - - unsigned char readByte() { - if (bufferPointer == bufferEnd) { - int length; - if (!inputStream->Next - (reinterpret_cast<const void**>(&bufferPointer), &length)) { - throw ParseError("bad read in DoubleColumnReader::next()"); - } - bufferEnd = bufferPointer + length; - } - return static_cast<unsigned char>(*(bufferPointer++)); - } - - double readDouble() { - int64_t bits = 0; - for (uint64_t i=0; i < 8; i++) { - bits |= static_cast<int64_t>(readByte()) << (i*8); - } - double *result = reinterpret_cast<double*>(&bits); - return *result; - } - - double readFloat() { - int32_t bits = 0; - for (uint64_t i=0; i < 4; i++) { - bits |= readByte() << (i*8); - } - float *result = reinterpret_cast<float*>(&bits); - return static_cast<double>(*result); - } - }; - - DoubleColumnReader::DoubleColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe), - columnKind(type.getKind()), - bytesPerValue((type.getKind() == - FLOAT) ? 4 : 8), - bufferPointer(nullptr), - bufferEnd(nullptr) { - inputStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (inputStream == nullptr) - throw ParseError("DATA stream not found in Double column"); - } - - DoubleColumnReader::~DoubleColumnReader() { - // PASS - } - - uint64_t DoubleColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - - if (static_cast<size_t>(bufferEnd - bufferPointer) >= - bytesPerValue * numValues) { - bufferPointer += bytesPerValue * numValues; - } else { - size_t sizeToSkip = bytesPerValue * numValues - - static_cast<size_t>(bufferEnd - bufferPointer); - const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max()); - while (sizeToSkip != 0) { - size_t step = sizeToSkip > cap ? 
cap : sizeToSkip; - inputStream->Skip(static_cast<int>(step)); - sizeToSkip -= step; - } - bufferEnd = nullptr; - bufferPointer = nullptr; - } - - return numValues; - } - - void DoubleColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - // update the notNull from the parent class - notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - double* outArray = dynamic_cast<DoubleVectorBatch&>(rowBatch).data.data(); - - if (columnKind == FLOAT) { - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - outArray[i] = readFloat(); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - outArray[i] = readFloat(); - } - } - } else { - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - outArray[i] = readDouble(); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - outArray[i] = readDouble(); - } - } - } - } - - void readFully(char* buffer, int64_t bufferSize, SeekableInputStream* stream) { - int64_t posn = 0; - while (posn < bufferSize) { - const void* chunk; - int length; - if (!stream->Next(&chunk, &length)) { - throw ParseError("bad read in readFully"); - } - if (posn + length > bufferSize) { - throw ParseError("Corrupt dictionary blob in StringDictionaryColumn"); - } - memcpy(buffer + posn, chunk, static_cast<size_t>(length)); - posn += length; - } - } - - void DoubleColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - inputStream->seek(positions.at(columnId)); - } - - class StringDictionaryColumnReader: public ColumnReader { - private: - std::shared_ptr<StringDictionary> dictionary; - std::unique_ptr<RleDecoder> rle; - - public: - StringDictionaryColumnReader(const Type& type, StripeStreams& stipe); - ~StringDictionaryColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - - StringDictionaryColumnReader::StringDictionaryColumnReader - (const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe), - dictionary(new StringDictionary(stripe.getMemoryPool())) { - RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId) - .kind()); - uint32_t dictSize = stripe.getEncoding(columnId).dictionarysize(); - rle = createRleDecoder(stripe.getStream(columnId, - proto::Stream_Kind_DATA, - true), - false, rleVersion, memoryPool); - std::unique_ptr<RleDecoder> lengthDecoder = - createRleDecoder(stripe.getStream(columnId, - proto::Stream_Kind_LENGTH, - false), - false, rleVersion, memoryPool); - dictionary->dictionaryOffset.resize(dictSize + 1); - int64_t* lengthArray = dictionary->dictionaryOffset.data(); - lengthDecoder->next(lengthArray + 1, dictSize, nullptr); - lengthArray[0] = 0; - for(uint32_t i = 1; i < dictSize + 1; ++i) { - lengthArray[i] += lengthArray[i - 1]; - } - dictionary->dictionaryBlob.resize( - static_cast<uint64_t>(lengthArray[dictSize])); - std::unique_ptr<SeekableInputStream> blobStream = - stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false); - readFully( - dictionary->dictionaryBlob.data(), - lengthArray[dictSize], - blobStream.get()); - } - - 
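    // The constructor above turns the decoded per-entry lengths into a
    // running prefix sum, so dictionary entry e is the blob slice
    // [dictionaryOffset[e], dictionaryOffset[e+1]). A minimal worked
    // example, assuming a hypothetical three-entry dictionary:
    //
    //   int64_t offsets[4] = {0, 3, 1, 4};  // lengths {3, 1, 4} decoded into
    //                                       // indices 1..3, offsets[0] = 0
    //   for (uint32_t i = 1; i < 4; ++i) {
    //     offsets[i] += offsets[i - 1];     // offsets become {0, 3, 4, 8}
    //   }
    //   // entry 1 starts at blob + offsets[1] == blob + 3 and has length
    //   // offsets[2] - offsets[1] == 1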
StringDictionaryColumnReader::~StringDictionaryColumnReader() { - // PASS - } - - uint64_t StringDictionaryColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - rle->skip(numValues); - return numValues; - } - - void StringDictionaryColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - // update the notNull from the parent class - notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch); - char *blob = dictionary->dictionaryBlob.data(); - int64_t *dictionaryOffsets = dictionary->dictionaryOffset.data(); - char **outputStarts = byteBatch.data.data(); - int64_t *outputLengths = byteBatch.length.data(); - rle->next(outputLengths, numValues, notNull); - uint64_t dictionaryCount = dictionary->dictionaryOffset.size() - 1; - if (notNull) { - for(uint64_t i=0; i < numValues; ++i) { - if (notNull[i]) { - int64_t entry = outputLengths[i]; - if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount ) { - throw ParseError("Entry index out of range in StringDictionaryColumn"); - } - outputStarts[i] = blob + dictionaryOffsets[entry]; - outputLengths[i] = dictionaryOffsets[entry+1] - - dictionaryOffsets[entry]; - } - } - } else { - for(uint64_t i=0; i < numValues; ++i) { - int64_t entry = outputLengths[i]; - if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount) { - throw ParseError("Entry index out of range in StringDictionaryColumn"); - } - outputStarts[i] = blob + dictionaryOffsets[entry]; - outputLengths[i] = dictionaryOffsets[entry+1] - - dictionaryOffsets[entry]; - } - } - } - - void StringDictionaryColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - rowBatch.isEncoded = true; - - EncodedStringVectorBatch& batch = dynamic_cast<EncodedStringVectorBatch&>(rowBatch); - batch.dictionary = this->dictionary; - - // Length buffer is reused to save dictionary entry ids - rle->next(batch.index.data(), numValues, notNull); - } - - void StringDictionaryColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - } - - - class StringDirectColumnReader: public ColumnReader { - private: - std::unique_ptr<RleDecoder> lengthRle; - std::unique_ptr<SeekableInputStream> blobStream; - const char *lastBuffer; - size_t lastBufferLength; - - /** - * Compute the total length of the values. 
- * @param lengths the array of lengths - * @param notNull the array of notNull flags - * @param numValues the lengths of the arrays - * @return the total number of bytes for the non-null values - */ - size_t computeSize(const int64_t *lengths, const char *notNull, - uint64_t numValues); - - public: - StringDirectColumnReader(const Type& type, StripeStreams& stipe); - ~StringDirectColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - - StringDirectColumnReader::StringDirectColumnReader - (const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId) - .kind()); - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); - if (stream == nullptr) - throw ParseError("LENGTH stream not found in StringDirectColumn"); - lengthRle = createRleDecoder( - std::move(stream), false, rleVersion, memoryPool); - blobStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (blobStream == nullptr) - throw ParseError("DATA stream not found in StringDirectColumn"); - lastBuffer = nullptr; - lastBufferLength = 0; - } - - StringDirectColumnReader::~StringDirectColumnReader() { - // PASS - } - - uint64_t StringDirectColumnReader::skip(uint64_t numValues) { - const size_t BUFFER_SIZE = 1024; - numValues = ColumnReader::skip(numValues); - int64_t buffer[BUFFER_SIZE]; - uint64_t done = 0; - size_t totalBytes = 0; - // read the lengths, so we know haw many bytes to skip - while (done < numValues) { - uint64_t step = std::min(BUFFER_SIZE, - static_cast<size_t>(numValues - done)); - lengthRle->next(buffer, step, nullptr); - totalBytes += computeSize(buffer, nullptr, step); - done += step; - } - if (totalBytes <= lastBufferLength) { - // subtract the needed bytes from the ones left over - lastBufferLength -= totalBytes; - lastBuffer += totalBytes; - } else { - // move the stream forward after accounting for the buffered bytes - totalBytes -= lastBufferLength; - const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max()); - while (totalBytes != 0) { - size_t step = totalBytes > cap ? cap : totalBytes; - blobStream->Skip(static_cast<int>(step)); - totalBytes -= step; - } - lastBufferLength = 0; - lastBuffer = nullptr; - } - return numValues; - } - - size_t StringDirectColumnReader::computeSize(const int64_t* lengths, - const char* notNull, - uint64_t numValues) { - size_t totalLength = 0; - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - totalLength += static_cast<size_t>(lengths[i]); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - totalLength += static_cast<size_t>(lengths[i]); - } - } - return totalLength; - } - - void StringDirectColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - // update the notNull from the parent class - notNull = rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr; - StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch); - char **startPtr = byteBatch.data.data(); - int64_t *lengthPtr = byteBatch.length.data(); - - // read the length vector - lengthRle->next(lengthPtr, numValues, notNull); - - // figure out the total length of data we need from the blob stream - const size_t totalLength = computeSize(lengthPtr, notNull, numValues); - - // Load data from the blob stream into our buffer until we have enough - // to get the rest directly out of the stream's buffer. - size_t bytesBuffered = 0; - byteBatch.blob.resize(totalLength); - char *ptr= byteBatch.blob.data(); - while (bytesBuffered + lastBufferLength < totalLength) { - memcpy(ptr + bytesBuffered, lastBuffer, lastBufferLength); - bytesBuffered += lastBufferLength; - const void* readBuffer; - int readLength; - if (!blobStream->Next(&readBuffer, &readLength)) { - throw ParseError("failed to read in StringDirectColumnReader.next"); - } - lastBuffer = static_cast<const char*>(readBuffer); - lastBufferLength = static_cast<size_t>(readLength); - } - - if (bytesBuffered < totalLength) { - size_t moreBytes = totalLength - bytesBuffered; - memcpy(ptr + bytesBuffered, lastBuffer, moreBytes); - lastBuffer += moreBytes; - lastBufferLength -= moreBytes; - } - - size_t filledSlots = 0; - ptr = byteBatch.blob.data(); - if (notNull) { - while (filledSlots < numValues) { - if (notNull[filledSlots]) { - startPtr[filledSlots] = const_cast<char*>(ptr); - ptr += lengthPtr[filledSlots]; - } - filledSlots += 1; - } - } else { - while (filledSlots < numValues) { - startPtr[filledSlots] = const_cast<char*>(ptr); - ptr += lengthPtr[filledSlots]; - filledSlots += 1; - } - } - } - - void StringDirectColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - blobStream->seek(positions.at(columnId)); - lengthRle->seek(positions.at(columnId)); - } - - class StructColumnReader: public ColumnReader { - private: - std::vector<ColumnReader*> children; - - public: - StructColumnReader(const Type& type, StripeStreams& stipe); - ~StructColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - - private: - template<bool encoded> - void nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull); - }; - - StructColumnReader::StructColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - // count the number of selected sub-columns - const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); - switch (static_cast<int64_t>(stripe.getEncoding(columnId).kind())) { - case proto::ColumnEncoding_Kind_DIRECT: - for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { - const Type& child = *type.getSubtype(i); - if (selectedColumns[static_cast<uint64_t>(child.getColumnId())]) { - children.push_back(buildReader(child, stripe).release()); - } - } - break; - case proto::ColumnEncoding_Kind_DIRECT_V2: - case proto::ColumnEncoding_Kind_DICTIONARY: - case proto::ColumnEncoding_Kind_DICTIONARY_V2: - default: - throw ParseError("Unknown encoding for StructColumnReader"); - } - } - - StructColumnReader::~StructColumnReader() { - for (size_t i=0; 
i<children.size(); i++) { - delete children[i]; - } - } - - uint64_t StructColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - for(std::vector<ColumnReader*>::iterator ptr=children.begin(); ptr != children.end(); ++ptr) { - (*ptr)->skip(numValues); - } - return numValues; - } - - void StructColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - nextInternal<false>(rowBatch, numValues, notNull); - } - - void StructColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - nextInternal<true>(rowBatch, numValues, notNull); - } - - template<bool encoded> - void StructColumnReader::nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - uint64_t i=0; - notNull = rowBatch.hasNulls? rowBatch.notNull.data() : nullptr; - for(std::vector<ColumnReader*>::iterator ptr=children.begin(); - ptr != children.end(); ++ptr, ++i) { - if (encoded) { - (*ptr)->nextEncoded(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), - numValues, notNull); - } else { - (*ptr)->next(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), - numValues, notNull); - } - } - } - - void StructColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - - for(std::vector<ColumnReader*>::iterator ptr = children.begin(); - ptr != children.end(); - ++ptr) { - (*ptr)->seekToRowGroup(positions); - } - } - - class ListColumnReader: public ColumnReader { - private: - std::unique_ptr<ColumnReader> child; - std::unique_ptr<RleDecoder> rle; - - public: - ListColumnReader(const Type& type, StripeStreams& stipe); - ~ListColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - - private: - template<bool encoded> - void nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull); - }; - - ListColumnReader::ListColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - // count the number of selected sub-columns - const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); - RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); - if (stream == nullptr) - throw ParseError("LENGTH stream not found in List column"); - rle = createRleDecoder(std::move(stream), false, vers, memoryPool); - const Type& childType = *type.getSubtype(0); - if (selectedColumns[static_cast<uint64_t>(childType.getColumnId())]) { - child = buildReader(childType, stripe); - } - } - - ListColumnReader::~ListColumnReader() { - // PASS - } - - uint64_t ListColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - ColumnReader *childReader = child.get(); - if (childReader) { - const uint64_t BUFFER_SIZE = 1024; - int64_t buffer[BUFFER_SIZE]; - uint64_t childrenElements = 0; - uint64_t lengthsRead = 0; - while (lengthsRead < numValues) { - uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); - rle->next(buffer, chunk, nullptr); - for(size_t i=0; i < chunk; ++i) { - 
childrenElements += static_cast<size_t>(buffer[i]); - } - lengthsRead += chunk; - } - childReader->skip(childrenElements); - } else { - rle->skip(numValues); - } - return numValues; - } - - void ListColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - nextInternal<false>(rowBatch, numValues, notNull); - } - - void ListColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - nextInternal<true>(rowBatch, numValues, notNull); - } - - template<bool encoded> - void ListColumnReader::nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - ListVectorBatch &listBatch = dynamic_cast<ListVectorBatch&>(rowBatch); - int64_t* offsets = listBatch.offsets.data(); - notNull = listBatch.hasNulls ? listBatch.notNull.data() : nullptr; - rle->next(offsets, numValues, notNull); - uint64_t totalChildren = 0; - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - uint64_t tmp = static_cast<uint64_t>(offsets[i]); - offsets[i] = static_cast<int64_t>(totalChildren); - totalChildren += tmp; - } else { - offsets[i] = static_cast<int64_t>(totalChildren); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - uint64_t tmp = static_cast<uint64_t>(offsets[i]); - offsets[i] = static_cast<int64_t>(totalChildren); - totalChildren += tmp; - } - } - offsets[numValues] = static_cast<int64_t>(totalChildren); - ColumnReader *childReader = child.get(); - if (childReader) { - if (encoded) { - childReader->nextEncoded(*(listBatch.elements.get()), totalChildren, nullptr); - } else { - childReader->next(*(listBatch.elements.get()), totalChildren, nullptr); - } - } - } - - void ListColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - if (child.get()) { - child->seekToRowGroup(positions); - } - } - - class MapColumnReader: public ColumnReader { - private: - std::unique_ptr<ColumnReader> keyReader; - std::unique_ptr<ColumnReader> elementReader; - std::unique_ptr<RleDecoder> rle; - - public: - MapColumnReader(const Type& type, StripeStreams& stipe); - ~MapColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - - private: - template<bool encoded> - void nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull); - }; - - MapColumnReader::MapColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - // Determine if the key and/or value columns are selected - const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); - RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); - if (stream == nullptr) - throw ParseError("LENGTH stream not found in Map column"); - rle = createRleDecoder(std::move(stream), false, vers, memoryPool); - const Type& keyType = *type.getSubtype(0); - if (selectedColumns[static_cast<uint64_t>(keyType.getColumnId())]) { - keyReader = buildReader(keyType, stripe); - } - const Type& elementType = *type.getSubtype(1); - if 
(selectedColumns[static_cast<uint64_t>(elementType.getColumnId())]) { - elementReader = buildReader(elementType, stripe); - } - } - - MapColumnReader::~MapColumnReader() { - // PASS - } - - uint64_t MapColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - ColumnReader *rawKeyReader = keyReader.get(); - ColumnReader *rawElementReader = elementReader.get(); - if (rawKeyReader || rawElementReader) { - const uint64_t BUFFER_SIZE = 1024; - int64_t buffer[BUFFER_SIZE]; - uint64_t childrenElements = 0; - uint64_t lengthsRead = 0; - while (lengthsRead < numValues) { - uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); - rle->next(buffer, chunk, nullptr); - for(size_t i=0; i < chunk; ++i) { - childrenElements += static_cast<size_t>(buffer[i]); - } - lengthsRead += chunk; - } - if (rawKeyReader) { - rawKeyReader->skip(childrenElements); - } - if (rawElementReader) { - rawElementReader->skip(childrenElements); - } - } else { - rle->skip(numValues); - } - return numValues; - } - - void MapColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) - { - nextInternal<false>(rowBatch, numValues, notNull); - } - - void MapColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) - { - nextInternal<true>(rowBatch, numValues, notNull); - } - - template<bool encoded> - void MapColumnReader::nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - MapVectorBatch &mapBatch = dynamic_cast<MapVectorBatch&>(rowBatch); - int64_t* offsets = mapBatch.offsets.data(); - notNull = mapBatch.hasNulls ? mapBatch.notNull.data() : nullptr; - rle->next(offsets, numValues, notNull); - uint64_t totalChildren = 0; - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - uint64_t tmp = static_cast<uint64_t>(offsets[i]); - offsets[i] = static_cast<int64_t>(totalChildren); - totalChildren += tmp; - } else { - offsets[i] = static_cast<int64_t>(totalChildren); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - uint64_t tmp = static_cast<uint64_t>(offsets[i]); - offsets[i] = static_cast<int64_t>(totalChildren); - totalChildren += tmp; - } - } - offsets[numValues] = static_cast<int64_t>(totalChildren); - ColumnReader *rawKeyReader = keyReader.get(); - if (rawKeyReader) { - if (encoded) { - rawKeyReader->nextEncoded(*(mapBatch.keys.get()), totalChildren, nullptr); - } else { - rawKeyReader->next(*(mapBatch.keys.get()), totalChildren, nullptr); - } - } - ColumnReader *rawElementReader = elementReader.get(); - if (rawElementReader) { - if (encoded) { - rawElementReader->nextEncoded(*(mapBatch.elements.get()), totalChildren, nullptr); - } else { - rawElementReader->next(*(mapBatch.elements.get()), totalChildren, nullptr); - } - } - } - - void MapColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - if (keyReader.get()) { - keyReader->seekToRowGroup(positions); - } - if (elementReader.get()) { - elementReader->seekToRowGroup(positions); - } - } - - class UnionColumnReader: public ColumnReader { - private: - std::unique_ptr<ByteRleDecoder> rle; - std::vector<ColumnReader*> childrenReader; - std::vector<int64_t> childrenCounts; - uint64_t numChildren; - - public: - UnionColumnReader(const Type& type, StripeStreams& stipe); - ~UnionColumnReader() override; - - uint64_t skip(uint64_t numValues) override; 
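    // nextInternal() below decodes one tag byte per row and gives each row an
    // offset equal to the number of earlier rows that carried the same tag.
    // A small worked example, assuming three children, no null rows, and the
    // tag stream {0, 1, 0, 2}:
    //
    //   row 0: tag 0 -> offset 0, counts become {1, 0, 0}
    //   row 1: tag 1 -> offset 0, counts become {1, 1, 0}
    //   row 2: tag 0 -> offset 1, counts become {2, 1, 0}
    //   row 3: tag 2 -> offset 0, counts become {2, 1, 1}
    //
    // so child 0 is then asked for two values, and children 1 and 2 for one each.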
- - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - - private: - template<bool encoded> - void nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull); - }; - - UnionColumnReader::UnionColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - numChildren = type.getSubtypeCount(); - childrenReader.resize(numChildren); - childrenCounts.resize(numChildren); - - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (stream == nullptr) - throw ParseError("LENGTH stream not found in Union column"); - rle = createByteRleDecoder(std::move(stream)); - // figure out which types are selected - const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); - for(unsigned int i=0; i < numChildren; ++i) { - const Type &child = *type.getSubtype(i); - if (selectedColumns[static_cast<size_t>(child.getColumnId())]) { - childrenReader[i] = buildReader(child, stripe).release(); - } - } - } - - UnionColumnReader::~UnionColumnReader() { - for(std::vector<ColumnReader*>::iterator itr = childrenReader.begin(); - itr != childrenReader.end(); ++itr) { - delete *itr; - } - } - - uint64_t UnionColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - const uint64_t BUFFER_SIZE = 1024; - char buffer[BUFFER_SIZE]; - uint64_t lengthsRead = 0; - int64_t *counts = childrenCounts.data(); - memset(counts, 0, sizeof(int64_t) * numChildren); - while (lengthsRead < numValues) { - uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); - rle->next(buffer, chunk, nullptr); - for(size_t i=0; i < chunk; ++i) { - counts[static_cast<size_t>(buffer[i])] += 1; - } - lengthsRead += chunk; - } - for(size_t i=0; i < numChildren; ++i) { - if (counts[i] != 0 && childrenReader[i] != nullptr) { - childrenReader[i]->skip(static_cast<uint64_t>(counts[i])); - } - } - return numValues; - } - - void UnionColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - nextInternal<false>(rowBatch, numValues, notNull); - } - - void UnionColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - nextInternal<true>(rowBatch, numValues, notNull); - } - - template<bool encoded> - void UnionColumnReader::nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - UnionVectorBatch &unionBatch = dynamic_cast<UnionVectorBatch&>(rowBatch); - uint64_t* offsets = unionBatch.offsets.data(); - int64_t* counts = childrenCounts.data(); - memset(counts, 0, sizeof(int64_t) * numChildren); - unsigned char* tags = unionBatch.tags.data(); - notNull = unionBatch.hasNulls ? 
unionBatch.notNull.data() : nullptr; - rle->next(reinterpret_cast<char *>(tags), numValues, notNull); - // set the offsets for each row - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - offsets[i] = - static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - offsets[i] = - static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++); - } - } - // read the right number of each child column - for(size_t i=0; i < numChildren; ++i) { - if (childrenReader[i] != nullptr) { - if (encoded) { - childrenReader[i]->nextEncoded(*(unionBatch.children[i]), - static_cast<uint64_t>(counts[i]), nullptr); - } else { - childrenReader[i]->next(*(unionBatch.children[i]), - static_cast<uint64_t>(counts[i]), nullptr); - } - } - } - } - - void UnionColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - for(size_t i = 0; i < numChildren; ++i) { - if (childrenReader[i] != nullptr) { - childrenReader[i]->seekToRowGroup(positions); - } - } - } - - /** - * Destructively convert the number from zigzag encoding to the - * natural signed representation. - */ - void unZigZagInt128(Int128& value) { - bool needsNegate = value.getLowBits() & 1; - value >>= 1; - if (needsNegate) { - value.negate(); - value -= 1; - } - } - - class Decimal64ColumnReader: public ColumnReader { - public: - static const uint32_t MAX_PRECISION_64 = 18; - static const uint32_t MAX_PRECISION_128 = 38; - static const int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1]; - - protected: - std::unique_ptr<SeekableInputStream> valueStream; - int32_t precision; - int32_t scale; - const char* buffer; - const char* bufferEnd; - - std::unique_ptr<RleDecoder> scaleDecoder; - - /** - * Read the valueStream for more bytes. 
- */ - void readBuffer() { - while (buffer == bufferEnd) { - int length; - if (!valueStream->Next(reinterpret_cast<const void**>(&buffer), - &length)) { - throw ParseError("Read past end of stream in Decimal64ColumnReader "+ - valueStream->getName()); - } - bufferEnd = buffer + length; - } - } - - void readInt64(int64_t& value, int32_t currentScale) { - value = 0; - size_t offset = 0; - while (true) { - readBuffer(); - unsigned char ch = static_cast<unsigned char>(*(buffer++)); - value |= static_cast<uint64_t>(ch & 0x7f) << offset; - offset += 7; - if (!(ch & 0x80)) { - break; - } - } - value = unZigZag(static_cast<uint64_t>(value)); - if (scale > currentScale && - static_cast<uint64_t>(scale - currentScale) <= MAX_PRECISION_64) { - value *= POWERS_OF_TEN[scale - currentScale]; - } else if (scale < currentScale && - static_cast<uint64_t>(currentScale - scale) <= MAX_PRECISION_64) { - value /= POWERS_OF_TEN[currentScale - scale]; - } else if (scale != currentScale) { - throw ParseError("Decimal scale out of range"); - } - } - - public: - Decimal64ColumnReader(const Type& type, StripeStreams& stipe); - ~Decimal64ColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - const uint32_t Decimal64ColumnReader::MAX_PRECISION_64; - const uint32_t Decimal64ColumnReader::MAX_PRECISION_128; - const int64_t Decimal64ColumnReader::POWERS_OF_TEN[MAX_PRECISION_64 + 1]= - {1, - 10, - 100, - 1000, - 10000, - 100000, - 1000000, - 10000000, - 100000000, - 1000000000, - 10000000000, - 100000000000, - 1000000000000, - 10000000000000, - 100000000000000, - 1000000000000000, - 10000000000000000, - 100000000000000000, - 1000000000000000000}; - - Decimal64ColumnReader::Decimal64ColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - scale = static_cast<int32_t>(type.getScale()); - precision = static_cast<int32_t>(type.getPrecision()); - valueStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (valueStream == nullptr) - throw ParseError("DATA stream not found in Decimal64Column"); - buffer = nullptr; - bufferEnd = nullptr; - RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true); - if (stream == nullptr) - throw ParseError("SECONDARY stream not found in Decimal64Column"); - scaleDecoder = createRleDecoder(std::move(stream), true, vers, memoryPool); - } - - Decimal64ColumnReader::~Decimal64ColumnReader() { - // PASS - } - - uint64_t Decimal64ColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - uint64_t skipped = 0; - while (skipped < numValues) { - readBuffer(); - if (!(0x80 & *(buffer++))) { - skipped += 1; - } - } - scaleDecoder->skip(numValues); - return numValues; - } - - void Decimal64ColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - notNull = rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr; - Decimal64VectorBatch &batch = - dynamic_cast<Decimal64VectorBatch&>(rowBatch); - int64_t* values = batch.values.data(); - // read the next group of scales - int64_t* scaleBuffer = batch.readScales.data(); - scaleDecoder->next(scaleBuffer, numValues, notNull); - batch.precision = precision; - batch.scale = scale; - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - readInt64(values[i], static_cast<int32_t>(scaleBuffer[i])); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - readInt64(values[i], static_cast<int32_t>(scaleBuffer[i])); - } - } - } - - void scaleInt128(Int128& value, uint32_t scale, uint32_t currentScale) { - if (scale > currentScale) { - while(scale > currentScale) { - uint32_t scaleAdjust = - std::min(Decimal64ColumnReader::MAX_PRECISION_64, - scale - currentScale); - value *= Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust]; - currentScale += scaleAdjust; - } - } else if (scale < currentScale) { - Int128 remainder; - while(currentScale > scale) { - uint32_t scaleAdjust = - std::min(Decimal64ColumnReader::MAX_PRECISION_64, - currentScale - scale); - value = value.divide(Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust], - remainder); - currentScale -= scaleAdjust; - } - } - } - - void Decimal64ColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - valueStream->seek(positions.at(columnId)); - scaleDecoder->seek(positions.at(columnId)); - } - - class Decimal128ColumnReader: public Decimal64ColumnReader { - public: - Decimal128ColumnReader(const Type& type, StripeStreams& stipe); - ~Decimal128ColumnReader() override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - private: - void readInt128(Int128& value, int32_t currentScale) { - value = 0; - Int128 work; - uint32_t offset = 0; - while (true) { - readBuffer(); - unsigned char ch = static_cast<unsigned char>(*(buffer++)); - work = ch & 0x7f; - work <<= offset; - value |= work; - offset += 7; - if (!(ch & 0x80)) { - break; - } - } - unZigZagInt128(value); - scaleInt128(value, static_cast<uint32_t>(scale), - static_cast<uint32_t>(currentScale)); - } - }; - - Decimal128ColumnReader::Decimal128ColumnReader - (const Type& type, - StripeStreams& stripe - ): Decimal64ColumnReader(type, stripe) { - // PASS - } - - Decimal128ColumnReader::~Decimal128ColumnReader() { - // PASS - } - - void Decimal128ColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - Decimal128VectorBatch &batch = - dynamic_cast<Decimal128VectorBatch&>(rowBatch); - Int128* values = batch.values.data(); - // read the next group of scales - int64_t* scaleBuffer = batch.readScales.data(); - scaleDecoder->next(scaleBuffer, numValues, notNull); - batch.precision = precision; - batch.scale = scale; - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - readInt128(values[i], static_cast<int32_t>(scaleBuffer[i])); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - readInt128(values[i], static_cast<int32_t>(scaleBuffer[i])); - } - } - } - - class DecimalHive11ColumnReader: public Decimal64ColumnReader { - private: - bool throwOnOverflow; - std::ostream* errorStream; - - /** - * Read an Int128 from the stream and correct it to the desired scale. 
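   * A sketch of the encoding (values are zigzag-encoded base-128 varints):
   * the byte sequence 0xAC 0x02 decodes to the varint 300, un-zigzagging 300
   * gives 150, and the result is then rescaled from the value's own scale to
   * the column's scale by scaleInt128().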
- */ - bool readInt128(Int128& value, int32_t currentScale) { - // -/+ 99999999999999999999999999999999999999 - static const Int128 MIN_VALUE(-0x4b3b4ca85a86c47b, 0xf675ddc000000001); - static const Int128 MAX_VALUE( 0x4b3b4ca85a86c47a, 0x098a223fffffffff); - - value = 0; - Int128 work; - uint32_t offset = 0; - bool result = true; - while (true) { - readBuffer(); - unsigned char ch = static_cast<unsigned char>(*(buffer++)); - work = ch & 0x7f; - // If we have read more than 128 bits, we flag the error, but keep - // reading bytes so the stream isn't thrown off. - if (offset > 128 || (offset == 126 && work > 3)) { - result = false; - } - work <<= offset; - value |= work; - offset += 7; - if (!(ch & 0x80)) { - break; - } - } - - if (!result) { - return result; - } - unZigZagInt128(value); - scaleInt128(value, static_cast<uint32_t>(scale), - static_cast<uint32_t>(currentScale)); - return value >= MIN_VALUE && value <= MAX_VALUE; - } - - public: - DecimalHive11ColumnReader(const Type& type, StripeStreams& stipe); - ~DecimalHive11ColumnReader() override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - }; - - DecimalHive11ColumnReader::DecimalHive11ColumnReader - (const Type& type, - StripeStreams& stripe - ): Decimal64ColumnReader(type, stripe) { - scale = stripe.getForcedScaleOnHive11Decimal(); - throwOnOverflow = stripe.getThrowOnHive11DecimalOverflow(); - errorStream = stripe.getErrorStream(); - } - - DecimalHive11ColumnReader::~DecimalHive11ColumnReader() { - // PASS - } - - void DecimalHive11ColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - Decimal128VectorBatch &batch = - dynamic_cast<Decimal128VectorBatch&>(rowBatch); - Int128* values = batch.values.data(); - // read the next group of scales - int64_t* scaleBuffer = batch.readScales.data(); - - scaleDecoder->next(scaleBuffer, numValues, notNull); - - batch.precision = precision; - batch.scale = scale; - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - if (!readInt128(values[i], - static_cast<int32_t>(scaleBuffer[i]))) { - if (throwOnOverflow) { - throw ParseError("Hive 0.11 decimal was more than 38 digits."); - } else { - *errorStream << "Warning: " - << "Hive 0.11 decimal with more than 38 digits " - << "replaced by NULL.\n"; - notNull[i] = false; - } - } - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - if (!readInt128(values[i], - static_cast<int32_t>(scaleBuffer[i]))) { - if (throwOnOverflow) { - throw ParseError("Hive 0.11 decimal was more than 38 digits."); - } else { - *errorStream << "Warning: " - << "Hive 0.11 decimal with more than 38 digits " - << "replaced by NULL.\n"; - batch.hasNulls = true; - batch.notNull[i] = false; - } - } - } - } - } - - /** - * Create a reader for the given stripe. 
- */ - std::unique_ptr<ColumnReader> buildReader(const Type& type, - StripeStreams& stripe) { - switch (static_cast<int64_t>(type.getKind())) { - case DATE: - case INT: - case LONG: - case SHORT: - return std::unique_ptr<ColumnReader>( - new IntegerColumnReader(type, stripe)); - case BINARY: - case CHAR: - case STRING: - case VARCHAR: - switch (static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())){ - case proto::ColumnEncoding_Kind_DICTIONARY: - case proto::ColumnEncoding_Kind_DICTIONARY_V2: - return std::unique_ptr<ColumnReader>( - new StringDictionaryColumnReader(type, stripe)); - case proto::ColumnEncoding_Kind_DIRECT: - case proto::ColumnEncoding_Kind_DIRECT_V2: - return std::unique_ptr<ColumnReader>( - new StringDirectColumnReader(type, stripe)); - default: - throw NotImplementedYet("buildReader unhandled string encoding"); - } - - case BOOLEAN: - return std::unique_ptr<ColumnReader>( - new BooleanColumnReader(type, stripe)); - - case BYTE: - return std::unique_ptr<ColumnReader>( - new ByteColumnReader(type, stripe)); - - case LIST: - return std::unique_ptr<ColumnReader>( - new ListColumnReader(type, stripe)); - - case MAP: - return std::unique_ptr<ColumnReader>( - new MapColumnReader(type, stripe)); - - case UNION: - return std::unique_ptr<ColumnReader>( - new UnionColumnReader(type, stripe)); - - case STRUCT: - return std::unique_ptr<ColumnReader>( - new StructColumnReader(type, stripe)); - - case FLOAT: - case DOUBLE: - return std::unique_ptr<ColumnReader>( - new DoubleColumnReader(type, stripe)); - - case TIMESTAMP: - return std::unique_ptr<ColumnReader> - (new TimestampColumnReader(type, stripe)); - - case DECIMAL: - // is this a Hive 0.11 or 0.12 file? - if (type.getPrecision() == 0) { - return std::unique_ptr<ColumnReader> - (new DecimalHive11ColumnReader(type, stripe)); - - // can we represent the values using int64_t? 
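      // (MAX_PRECISION_64 is 18, so for example a decimal(10, 2) column would
      // take the Decimal64ColumnReader branch below, while a decimal(38, 10)
      // column would fall through to Decimal128ColumnReader.)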
- } else if (type.getPrecision() <= - Decimal64ColumnReader::MAX_PRECISION_64) { - return std::unique_ptr<ColumnReader> - (new Decimal64ColumnReader(type, stripe)); - - // otherwise we use the Int128 implementation - } else { - return std::unique_ptr<ColumnReader> - (new Decimal128ColumnReader(type, stripe)); - } - - default: - throw NotImplementedYet("buildReader unhandled type"); - } - } - -} + secsBuffer[i] -= 1; + } + } + } + } + + void TimestampColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + secondsRle->seek(positions.at(columnId)); + nanoRle->seek(positions.at(columnId)); + } + + class DoubleColumnReader: public ColumnReader { + public: + DoubleColumnReader(const Type& type, StripeStreams& stripe); + ~DoubleColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + + private: + std::unique_ptr<SeekableInputStream> inputStream; + TypeKind columnKind; + const uint64_t bytesPerValue ; + const char *bufferPointer; + const char *bufferEnd; + + unsigned char readByte() { + if (bufferPointer == bufferEnd) { + int length; + if (!inputStream->Next + (reinterpret_cast<const void**>(&bufferPointer), &length)) { + throw ParseError("bad read in DoubleColumnReader::next()"); + } + bufferEnd = bufferPointer + length; + } + return static_cast<unsigned char>(*(bufferPointer++)); + } + + double readDouble() { + int64_t bits = 0; + for (uint64_t i=0; i < 8; i++) { + bits |= static_cast<int64_t>(readByte()) << (i*8); + } + double *result = reinterpret_cast<double*>(&bits); + return *result; + } + + double readFloat() { + int32_t bits = 0; + for (uint64_t i=0; i < 4; i++) { + bits |= readByte() << (i*8); + } + float *result = reinterpret_cast<float*>(&bits); + return static_cast<double>(*result); + } + }; + + DoubleColumnReader::DoubleColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe), + columnKind(type.getKind()), + bytesPerValue((type.getKind() == + FLOAT) ? 4 : 8), + bufferPointer(nullptr), + bufferEnd(nullptr) { + inputStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (inputStream == nullptr) + throw ParseError("DATA stream not found in Double column"); + } + + DoubleColumnReader::~DoubleColumnReader() { + // PASS + } + + uint64_t DoubleColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + + if (static_cast<size_t>(bufferEnd - bufferPointer) >= + bytesPerValue * numValues) { + bufferPointer += bytesPerValue * numValues; + } else { + size_t sizeToSkip = bytesPerValue * numValues - + static_cast<size_t>(bufferEnd - bufferPointer); + const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max()); + while (sizeToSkip != 0) { + size_t step = sizeToSkip > cap ? cap : sizeToSkip; + inputStream->Skip(static_cast<int>(step)); + sizeToSkip -= step; + } + bufferEnd = nullptr; + bufferPointer = nullptr; + } + + return numValues; + } + + void DoubleColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + // update the notNull from the parent class + notNull = rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr; + double* outArray = dynamic_cast<DoubleVectorBatch&>(rowBatch).data.data(); + + if (columnKind == FLOAT) { + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + outArray[i] = readFloat(); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + outArray[i] = readFloat(); + } + } + } else { + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + outArray[i] = readDouble(); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + outArray[i] = readDouble(); + } + } + } + } + + void readFully(char* buffer, int64_t bufferSize, SeekableInputStream* stream) { + int64_t posn = 0; + while (posn < bufferSize) { + const void* chunk; + int length; + if (!stream->Next(&chunk, &length)) { + throw ParseError("bad read in readFully"); + } + if (posn + length > bufferSize) { + throw ParseError("Corrupt dictionary blob in StringDictionaryColumn"); + } + memcpy(buffer + posn, chunk, static_cast<size_t>(length)); + posn += length; + } + } + + void DoubleColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + inputStream->seek(positions.at(columnId)); + } + + class StringDictionaryColumnReader: public ColumnReader { + private: + std::shared_ptr<StringDictionary> dictionary; + std::unique_ptr<RleDecoder> rle; + + public: + StringDictionaryColumnReader(const Type& type, StripeStreams& stipe); + ~StringDictionaryColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + }; + + StringDictionaryColumnReader::StringDictionaryColumnReader + (const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe), + dictionary(new StringDictionary(stripe.getMemoryPool())) { + RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId) + .kind()); + uint32_t dictSize = stripe.getEncoding(columnId).dictionarysize(); + rle = createRleDecoder(stripe.getStream(columnId, + proto::Stream_Kind_DATA, + true), + false, rleVersion, memoryPool); + std::unique_ptr<RleDecoder> lengthDecoder = + createRleDecoder(stripe.getStream(columnId, + proto::Stream_Kind_LENGTH, + false), + false, rleVersion, memoryPool); + dictionary->dictionaryOffset.resize(dictSize + 1); + int64_t* lengthArray = dictionary->dictionaryOffset.data(); + lengthDecoder->next(lengthArray + 1, dictSize, nullptr); + lengthArray[0] = 0; + for(uint32_t i = 1; i < dictSize + 1; ++i) { + lengthArray[i] += lengthArray[i - 1]; + } + dictionary->dictionaryBlob.resize( + static_cast<uint64_t>(lengthArray[dictSize])); + std::unique_ptr<SeekableInputStream> blobStream = + stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false); + readFully( + dictionary->dictionaryBlob.data(), + lengthArray[dictSize], + blobStream.get()); + } + + StringDictionaryColumnReader::~StringDictionaryColumnReader() { + // PASS + } + + uint64_t StringDictionaryColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + rle->skip(numValues); + return numValues; + } + + void StringDictionaryColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + // update the notNull from the parent class + 
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; + StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch); + char *blob = dictionary->dictionaryBlob.data(); + int64_t *dictionaryOffsets = dictionary->dictionaryOffset.data(); + char **outputStarts = byteBatch.data.data(); + int64_t *outputLengths = byteBatch.length.data(); + rle->next(outputLengths, numValues, notNull); + uint64_t dictionaryCount = dictionary->dictionaryOffset.size() - 1; + if (notNull) { + for(uint64_t i=0; i < numValues; ++i) { + if (notNull[i]) { + int64_t entry = outputLengths[i]; + if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount ) { + throw ParseError("Entry index out of range in StringDictionaryColumn"); + } + outputStarts[i] = blob + dictionaryOffsets[entry]; + outputLengths[i] = dictionaryOffsets[entry+1] - + dictionaryOffsets[entry]; + } + } + } else { + for(uint64_t i=0; i < numValues; ++i) { + int64_t entry = outputLengths[i]; + if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount) { + throw ParseError("Entry index out of range in StringDictionaryColumn"); + } + outputStarts[i] = blob + dictionaryOffsets[entry]; + outputLengths[i] = dictionaryOffsets[entry+1] - + dictionaryOffsets[entry]; + } + } + } + + void StringDictionaryColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; + rowBatch.isEncoded = true; + + EncodedStringVectorBatch& batch = dynamic_cast<EncodedStringVectorBatch&>(rowBatch); + batch.dictionary = this->dictionary; + + // Length buffer is reused to save dictionary entry ids + rle->next(batch.index.data(), numValues, notNull); + } + + void StringDictionaryColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + } + + + class StringDirectColumnReader: public ColumnReader { + private: + std::unique_ptr<RleDecoder> lengthRle; + std::unique_ptr<SeekableInputStream> blobStream; + const char *lastBuffer; + size_t lastBufferLength; + + /** + * Compute the total length of the values. 
+ * @param lengths the array of lengths + * @param notNull the array of notNull flags + * @param numValues the lengths of the arrays + * @return the total number of bytes for the non-null values + */ + size_t computeSize(const int64_t *lengths, const char *notNull, + uint64_t numValues); + + public: + StringDirectColumnReader(const Type& type, StripeStreams& stipe); + ~StringDirectColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + }; + + StringDirectColumnReader::StringDirectColumnReader + (const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe) { + RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId) + .kind()); + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); + if (stream == nullptr) + throw ParseError("LENGTH stream not found in StringDirectColumn"); + lengthRle = createRleDecoder( + std::move(stream), false, rleVersion, memoryPool); + blobStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (blobStream == nullptr) + throw ParseError("DATA stream not found in StringDirectColumn"); + lastBuffer = nullptr; + lastBufferLength = 0; + } + + StringDirectColumnReader::~StringDirectColumnReader() { + // PASS + } + + uint64_t StringDirectColumnReader::skip(uint64_t numValues) { + const size_t BUFFER_SIZE = 1024; + numValues = ColumnReader::skip(numValues); + int64_t buffer[BUFFER_SIZE]; + uint64_t done = 0; + size_t totalBytes = 0; + // read the lengths, so we know haw many bytes to skip + while (done < numValues) { + uint64_t step = std::min(BUFFER_SIZE, + static_cast<size_t>(numValues - done)); + lengthRle->next(buffer, step, nullptr); + totalBytes += computeSize(buffer, nullptr, step); + done += step; + } + if (totalBytes <= lastBufferLength) { + // subtract the needed bytes from the ones left over + lastBufferLength -= totalBytes; + lastBuffer += totalBytes; + } else { + // move the stream forward after accounting for the buffered bytes + totalBytes -= lastBufferLength; + const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max()); + while (totalBytes != 0) { + size_t step = totalBytes > cap ? cap : totalBytes; + blobStream->Skip(static_cast<int>(step)); + totalBytes -= step; + } + lastBufferLength = 0; + lastBuffer = nullptr; + } + return numValues; + } + + size_t StringDirectColumnReader::computeSize(const int64_t* lengths, + const char* notNull, + uint64_t numValues) { + size_t totalLength = 0; + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + totalLength += static_cast<size_t>(lengths[i]); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + totalLength += static_cast<size_t>(lengths[i]); + } + } + return totalLength; + } + + void StringDirectColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + // update the notNull from the parent class + notNull = rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr; + StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch); + char **startPtr = byteBatch.data.data(); + int64_t *lengthPtr = byteBatch.length.data(); + + // read the length vector + lengthRle->next(lengthPtr, numValues, notNull); + + // figure out the total length of data we need from the blob stream + const size_t totalLength = computeSize(lengthPtr, notNull, numValues); + + // Load data from the blob stream into our buffer until we have enough + // to get the rest directly out of the stream's buffer. + size_t bytesBuffered = 0; + byteBatch.blob.resize(totalLength); + char *ptr= byteBatch.blob.data(); + while (bytesBuffered + lastBufferLength < totalLength) { + memcpy(ptr + bytesBuffered, lastBuffer, lastBufferLength); + bytesBuffered += lastBufferLength; + const void* readBuffer; + int readLength; + if (!blobStream->Next(&readBuffer, &readLength)) { + throw ParseError("failed to read in StringDirectColumnReader.next"); + } + lastBuffer = static_cast<const char*>(readBuffer); + lastBufferLength = static_cast<size_t>(readLength); + } + + if (bytesBuffered < totalLength) { + size_t moreBytes = totalLength - bytesBuffered; + memcpy(ptr + bytesBuffered, lastBuffer, moreBytes); + lastBuffer += moreBytes; + lastBufferLength -= moreBytes; + } + + size_t filledSlots = 0; + ptr = byteBatch.blob.data(); + if (notNull) { + while (filledSlots < numValues) { + if (notNull[filledSlots]) { + startPtr[filledSlots] = const_cast<char*>(ptr); + ptr += lengthPtr[filledSlots]; + } + filledSlots += 1; + } + } else { + while (filledSlots < numValues) { + startPtr[filledSlots] = const_cast<char*>(ptr); + ptr += lengthPtr[filledSlots]; + filledSlots += 1; + } + } + } + + void StringDirectColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + blobStream->seek(positions.at(columnId)); + lengthRle->seek(positions.at(columnId)); + } + + class StructColumnReader: public ColumnReader { + private: + std::vector<ColumnReader*> children; + + public: + StructColumnReader(const Type& type, StripeStreams& stipe); + ~StructColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + + private: + template<bool encoded> + void nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull); + }; + + StructColumnReader::StructColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe) { + // count the number of selected sub-columns + const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); + switch (static_cast<int64_t>(stripe.getEncoding(columnId).kind())) { + case proto::ColumnEncoding_Kind_DIRECT: + for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { + const Type& child = *type.getSubtype(i); + if (selectedColumns[static_cast<uint64_t>(child.getColumnId())]) { + children.push_back(buildReader(child, stripe).release()); + } + } + break; + case proto::ColumnEncoding_Kind_DIRECT_V2: + case proto::ColumnEncoding_Kind_DICTIONARY: + case proto::ColumnEncoding_Kind_DICTIONARY_V2: + default: + throw ParseError("Unknown encoding for StructColumnReader"); + } + } + + StructColumnReader::~StructColumnReader() { + for (size_t i=0; 
i<children.size(); i++) { + delete children[i]; + } + } + + uint64_t StructColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + for(std::vector<ColumnReader*>::iterator ptr=children.begin(); ptr != children.end(); ++ptr) { + (*ptr)->skip(numValues); + } + return numValues; + } + + void StructColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + nextInternal<false>(rowBatch, numValues, notNull); + } + + void StructColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + nextInternal<true>(rowBatch, numValues, notNull); + } + + template<bool encoded> + void StructColumnReader::nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + uint64_t i=0; + notNull = rowBatch.hasNulls? rowBatch.notNull.data() : nullptr; + for(std::vector<ColumnReader*>::iterator ptr=children.begin(); + ptr != children.end(); ++ptr, ++i) { + if (encoded) { + (*ptr)->nextEncoded(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), + numValues, notNull); + } else { + (*ptr)->next(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), + numValues, notNull); + } + } + } + + void StructColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + + for(std::vector<ColumnReader*>::iterator ptr = children.begin(); + ptr != children.end(); + ++ptr) { + (*ptr)->seekToRowGroup(positions); + } + } + + class ListColumnReader: public ColumnReader { + private: + std::unique_ptr<ColumnReader> child; + std::unique_ptr<RleDecoder> rle; + + public: + ListColumnReader(const Type& type, StripeStreams& stipe); + ~ListColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + + private: + template<bool encoded> + void nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull); + }; + + ListColumnReader::ListColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe) { + // count the number of selected sub-columns + const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); + RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); + if (stream == nullptr) + throw ParseError("LENGTH stream not found in List column"); + rle = createRleDecoder(std::move(stream), false, vers, memoryPool); + const Type& childType = *type.getSubtype(0); + if (selectedColumns[static_cast<uint64_t>(childType.getColumnId())]) { + child = buildReader(childType, stripe); + } + } + + ListColumnReader::~ListColumnReader() { + // PASS + } + + uint64_t ListColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + ColumnReader *childReader = child.get(); + if (childReader) { + const uint64_t BUFFER_SIZE = 1024; + int64_t buffer[BUFFER_SIZE]; + uint64_t childrenElements = 0; + uint64_t lengthsRead = 0; + while (lengthsRead < numValues) { + uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); + rle->next(buffer, chunk, nullptr); + for(size_t i=0; i < chunk; ++i) { + 
childrenElements += static_cast<size_t>(buffer[i]); + } + lengthsRead += chunk; + } + childReader->skip(childrenElements); + } else { + rle->skip(numValues); + } + return numValues; + } + + void ListColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + nextInternal<false>(rowBatch, numValues, notNull); + } + + void ListColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + nextInternal<true>(rowBatch, numValues, notNull); + } + + template<bool encoded> + void ListColumnReader::nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + ListVectorBatch &listBatch = dynamic_cast<ListVectorBatch&>(rowBatch); + int64_t* offsets = listBatch.offsets.data(); + notNull = listBatch.hasNulls ? listBatch.notNull.data() : nullptr; + rle->next(offsets, numValues, notNull); + uint64_t totalChildren = 0; + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + uint64_t tmp = static_cast<uint64_t>(offsets[i]); + offsets[i] = static_cast<int64_t>(totalChildren); + totalChildren += tmp; + } else { + offsets[i] = static_cast<int64_t>(totalChildren); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + uint64_t tmp = static_cast<uint64_t>(offsets[i]); + offsets[i] = static_cast<int64_t>(totalChildren); + totalChildren += tmp; + } + } + offsets[numValues] = static_cast<int64_t>(totalChildren); + ColumnReader *childReader = child.get(); + if (childReader) { + if (encoded) { + childReader->nextEncoded(*(listBatch.elements.get()), totalChildren, nullptr); + } else { + childReader->next(*(listBatch.elements.get()), totalChildren, nullptr); + } + } + } + + void ListColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + if (child.get()) { + child->seekToRowGroup(positions); + } + } + + class MapColumnReader: public ColumnReader { + private: + std::unique_ptr<ColumnReader> keyReader; + std::unique_ptr<ColumnReader> elementReader; + std::unique_ptr<RleDecoder> rle; + + public: + MapColumnReader(const Type& type, StripeStreams& stipe); + ~MapColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + + private: + template<bool encoded> + void nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull); + }; + + MapColumnReader::MapColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe) { + // Determine if the key and/or value columns are selected + const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); + RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); + if (stream == nullptr) + throw ParseError("LENGTH stream not found in Map column"); + rle = createRleDecoder(std::move(stream), false, vers, memoryPool); + const Type& keyType = *type.getSubtype(0); + if (selectedColumns[static_cast<uint64_t>(keyType.getColumnId())]) { + keyReader = buildReader(keyType, stripe); + } + const Type& elementType = *type.getSubtype(1); + if 
(selectedColumns[static_cast<uint64_t>(elementType.getColumnId())]) { + elementReader = buildReader(elementType, stripe); + } + } + + MapColumnReader::~MapColumnReader() { + // PASS + } + + uint64_t MapColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + ColumnReader *rawKeyReader = keyReader.get(); + ColumnReader *rawElementReader = elementReader.get(); + if (rawKeyReader || rawElementReader) { + const uint64_t BUFFER_SIZE = 1024; + int64_t buffer[BUFFER_SIZE]; + uint64_t childrenElements = 0; + uint64_t lengthsRead = 0; + while (lengthsRead < numValues) { + uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); + rle->next(buffer, chunk, nullptr); + for(size_t i=0; i < chunk; ++i) { + childrenElements += static_cast<size_t>(buffer[i]); + } + lengthsRead += chunk; + } + if (rawKeyReader) { + rawKeyReader->skip(childrenElements); + } + if (rawElementReader) { + rawElementReader->skip(childrenElements); + } + } else { + rle->skip(numValues); + } + return numValues; + } + + void MapColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) + { + nextInternal<false>(rowBatch, numValues, notNull); + } + + void MapColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) + { + nextInternal<true>(rowBatch, numValues, notNull); + } + + template<bool encoded> + void MapColumnReader::nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + MapVectorBatch &mapBatch = dynamic_cast<MapVectorBatch&>(rowBatch); + int64_t* offsets = mapBatch.offsets.data(); + notNull = mapBatch.hasNulls ? mapBatch.notNull.data() : nullptr; + rle->next(offsets, numValues, notNull); + uint64_t totalChildren = 0; + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + uint64_t tmp = static_cast<uint64_t>(offsets[i]); + offsets[i] = static_cast<int64_t>(totalChildren); + totalChildren += tmp; + } else { + offsets[i] = static_cast<int64_t>(totalChildren); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + uint64_t tmp = static_cast<uint64_t>(offsets[i]); + offsets[i] = static_cast<int64_t>(totalChildren); + totalChildren += tmp; + } + } + offsets[numValues] = static_cast<int64_t>(totalChildren); + ColumnReader *rawKeyReader = keyReader.get(); + if (rawKeyReader) { + if (encoded) { + rawKeyReader->nextEncoded(*(mapBatch.keys.get()), totalChildren, nullptr); + } else { + rawKeyReader->next(*(mapBatch.keys.get()), totalChildren, nullptr); + } + } + ColumnReader *rawElementReader = elementReader.get(); + if (rawElementReader) { + if (encoded) { + rawElementReader->nextEncoded(*(mapBatch.elements.get()), totalChildren, nullptr); + } else { + rawElementReader->next(*(mapBatch.elements.get()), totalChildren, nullptr); + } + } + } + + void MapColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + if (keyReader.get()) { + keyReader->seekToRowGroup(positions); + } + if (elementReader.get()) { + elementReader->seekToRowGroup(positions); + } + } + + class UnionColumnReader: public ColumnReader { + private: + std::unique_ptr<ByteRleDecoder> rle; + std::vector<ColumnReader*> childrenReader; + std::vector<int64_t> childrenCounts; + uint64_t numChildren; + + public: + UnionColumnReader(const Type& type, StripeStreams& stipe); + ~UnionColumnReader() override; + + uint64_t skip(uint64_t numValues) override; 
+ + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + + private: + template<bool encoded> + void nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull); + }; + + UnionColumnReader::UnionColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe) { + numChildren = type.getSubtypeCount(); + childrenReader.resize(numChildren); + childrenCounts.resize(numChildren); + + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (stream == nullptr) + throw ParseError("LENGTH stream not found in Union column"); + rle = createByteRleDecoder(std::move(stream)); + // figure out which types are selected + const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); + for(unsigned int i=0; i < numChildren; ++i) { + const Type &child = *type.getSubtype(i); + if (selectedColumns[static_cast<size_t>(child.getColumnId())]) { + childrenReader[i] = buildReader(child, stripe).release(); + } + } + } + + UnionColumnReader::~UnionColumnReader() { + for(std::vector<ColumnReader*>::iterator itr = childrenReader.begin(); + itr != childrenReader.end(); ++itr) { + delete *itr; + } + } + + uint64_t UnionColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + const uint64_t BUFFER_SIZE = 1024; + char buffer[BUFFER_SIZE]; + uint64_t lengthsRead = 0; + int64_t *counts = childrenCounts.data(); + memset(counts, 0, sizeof(int64_t) * numChildren); + while (lengthsRead < numValues) { + uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); + rle->next(buffer, chunk, nullptr); + for(size_t i=0; i < chunk; ++i) { + counts[static_cast<size_t>(buffer[i])] += 1; + } + lengthsRead += chunk; + } + for(size_t i=0; i < numChildren; ++i) { + if (counts[i] != 0 && childrenReader[i] != nullptr) { + childrenReader[i]->skip(static_cast<uint64_t>(counts[i])); + } + } + return numValues; + } + + void UnionColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + nextInternal<false>(rowBatch, numValues, notNull); + } + + void UnionColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + nextInternal<true>(rowBatch, numValues, notNull); + } + + template<bool encoded> + void UnionColumnReader::nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + UnionVectorBatch &unionBatch = dynamic_cast<UnionVectorBatch&>(rowBatch); + uint64_t* offsets = unionBatch.offsets.data(); + int64_t* counts = childrenCounts.data(); + memset(counts, 0, sizeof(int64_t) * numChildren); + unsigned char* tags = unionBatch.tags.data(); + notNull = unionBatch.hasNulls ? 
unionBatch.notNull.data() : nullptr; + rle->next(reinterpret_cast<char *>(tags), numValues, notNull); + // set the offsets for each row + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + offsets[i] = + static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + offsets[i] = + static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++); + } + } + // read the right number of each child column + for(size_t i=0; i < numChildren; ++i) { + if (childrenReader[i] != nullptr) { + if (encoded) { + childrenReader[i]->nextEncoded(*(unionBatch.children[i]), + static_cast<uint64_t>(counts[i]), nullptr); + } else { + childrenReader[i]->next(*(unionBatch.children[i]), + static_cast<uint64_t>(counts[i]), nullptr); + } + } + } + } + + void UnionColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + for(size_t i = 0; i < numChildren; ++i) { + if (childrenReader[i] != nullptr) { + childrenReader[i]->seekToRowGroup(positions); + } + } + } + + /** + * Destructively convert the number from zigzag encoding to the + * natural signed representation. + */ + void unZigZagInt128(Int128& value) { + bool needsNegate = value.getLowBits() & 1; + value >>= 1; + if (needsNegate) { + value.negate(); + value -= 1; + } + } + + class Decimal64ColumnReader: public ColumnReader { + public: + static const uint32_t MAX_PRECISION_64 = 18; + static const uint32_t MAX_PRECISION_128 = 38; + static const int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1]; + + protected: + std::unique_ptr<SeekableInputStream> valueStream; + int32_t precision; + int32_t scale; + const char* buffer; + const char* bufferEnd; + + std::unique_ptr<RleDecoder> scaleDecoder; + + /** + * Read the valueStream for more bytes. 
+ */ + void readBuffer() { + while (buffer == bufferEnd) { + int length; + if (!valueStream->Next(reinterpret_cast<const void**>(&buffer), + &length)) { + throw ParseError("Read past end of stream in Decimal64ColumnReader "+ + valueStream->getName()); + } + bufferEnd = buffer + length; + } + } + + void readInt64(int64_t& value, int32_t currentScale) { + value = 0; + size_t offset = 0; + while (true) { + readBuffer(); + unsigned char ch = static_cast<unsigned char>(*(buffer++)); + value |= static_cast<uint64_t>(ch & 0x7f) << offset; + offset += 7; + if (!(ch & 0x80)) { + break; + } + } + value = unZigZag(static_cast<uint64_t>(value)); + if (scale > currentScale && + static_cast<uint64_t>(scale - currentScale) <= MAX_PRECISION_64) { + value *= POWERS_OF_TEN[scale - currentScale]; + } else if (scale < currentScale && + static_cast<uint64_t>(currentScale - scale) <= MAX_PRECISION_64) { + value /= POWERS_OF_TEN[currentScale - scale]; + } else if (scale != currentScale) { + throw ParseError("Decimal scale out of range"); + } + } + + public: + Decimal64ColumnReader(const Type& type, StripeStreams& stipe); + ~Decimal64ColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + }; + const uint32_t Decimal64ColumnReader::MAX_PRECISION_64; + const uint32_t Decimal64ColumnReader::MAX_PRECISION_128; + const int64_t Decimal64ColumnReader::POWERS_OF_TEN[MAX_PRECISION_64 + 1]= + {1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000, + 100000000000, + 1000000000000, + 10000000000000, + 100000000000000, + 1000000000000000, + 10000000000000000, + 100000000000000000, + 1000000000000000000}; + + Decimal64ColumnReader::Decimal64ColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe) { + scale = static_cast<int32_t>(type.getScale()); + precision = static_cast<int32_t>(type.getPrecision()); + valueStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (valueStream == nullptr) + throw ParseError("DATA stream not found in Decimal64Column"); + buffer = nullptr; + bufferEnd = nullptr; + RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true); + if (stream == nullptr) + throw ParseError("SECONDARY stream not found in Decimal64Column"); + scaleDecoder = createRleDecoder(std::move(stream), true, vers, memoryPool); + } + + Decimal64ColumnReader::~Decimal64ColumnReader() { + // PASS + } + + uint64_t Decimal64ColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + uint64_t skipped = 0; + while (skipped < numValues) { + readBuffer(); + if (!(0x80 & *(buffer++))) { + skipped += 1; + } + } + scaleDecoder->skip(numValues); + return numValues; + } + + void Decimal64ColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + notNull = rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr; + Decimal64VectorBatch &batch = + dynamic_cast<Decimal64VectorBatch&>(rowBatch); + int64_t* values = batch.values.data(); + // read the next group of scales + int64_t* scaleBuffer = batch.readScales.data(); + scaleDecoder->next(scaleBuffer, numValues, notNull); + batch.precision = precision; + batch.scale = scale; + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + readInt64(values[i], static_cast<int32_t>(scaleBuffer[i])); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + readInt64(values[i], static_cast<int32_t>(scaleBuffer[i])); + } + } + } + + void scaleInt128(Int128& value, uint32_t scale, uint32_t currentScale) { + if (scale > currentScale) { + while(scale > currentScale) { + uint32_t scaleAdjust = + std::min(Decimal64ColumnReader::MAX_PRECISION_64, + scale - currentScale); + value *= Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust]; + currentScale += scaleAdjust; + } + } else if (scale < currentScale) { + Int128 remainder; + while(currentScale > scale) { + uint32_t scaleAdjust = + std::min(Decimal64ColumnReader::MAX_PRECISION_64, + currentScale - scale); + value = value.divide(Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust], + remainder); + currentScale -= scaleAdjust; + } + } + } + + void Decimal64ColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + valueStream->seek(positions.at(columnId)); + scaleDecoder->seek(positions.at(columnId)); + } + + class Decimal128ColumnReader: public Decimal64ColumnReader { + public: + Decimal128ColumnReader(const Type& type, StripeStreams& stipe); + ~Decimal128ColumnReader() override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + private: + void readInt128(Int128& value, int32_t currentScale) { + value = 0; + Int128 work; + uint32_t offset = 0; + while (true) { + readBuffer(); + unsigned char ch = static_cast<unsigned char>(*(buffer++)); + work = ch & 0x7f; + work <<= offset; + value |= work; + offset += 7; + if (!(ch & 0x80)) { + break; + } + } + unZigZagInt128(value); + scaleInt128(value, static_cast<uint32_t>(scale), + static_cast<uint32_t>(currentScale)); + } + }; + + Decimal128ColumnReader::Decimal128ColumnReader + (const Type& type, + StripeStreams& stripe + ): Decimal64ColumnReader(type, stripe) { + // PASS + } + + Decimal128ColumnReader::~Decimal128ColumnReader() { + // PASS + } + + void Decimal128ColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; + Decimal128VectorBatch &batch = + dynamic_cast<Decimal128VectorBatch&>(rowBatch); + Int128* values = batch.values.data(); + // read the next group of scales + int64_t* scaleBuffer = batch.readScales.data(); + scaleDecoder->next(scaleBuffer, numValues, notNull); + batch.precision = precision; + batch.scale = scale; + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + readInt128(values[i], static_cast<int32_t>(scaleBuffer[i])); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + readInt128(values[i], static_cast<int32_t>(scaleBuffer[i])); + } + } + } + + class DecimalHive11ColumnReader: public Decimal64ColumnReader { + private: + bool throwOnOverflow; + std::ostream* errorStream; + + /** + * Read an Int128 from the stream and correct it to the desired scale. 
+ */ + bool readInt128(Int128& value, int32_t currentScale) { + // -/+ 99999999999999999999999999999999999999 + static const Int128 MIN_VALUE(-0x4b3b4ca85a86c47b, 0xf675ddc000000001); + static const Int128 MAX_VALUE( 0x4b3b4ca85a86c47a, 0x098a223fffffffff); + + value = 0; + Int128 work; + uint32_t offset = 0; + bool result = true; + while (true) { + readBuffer(); + unsigned char ch = static_cast<unsigned char>(*(buffer++)); + work = ch & 0x7f; + // If we have read more than 128 bits, we flag the error, but keep + // reading bytes so the stream isn't thrown off. + if (offset > 128 || (offset == 126 && work > 3)) { + result = false; + } + work <<= offset; + value |= work; + offset += 7; + if (!(ch & 0x80)) { + break; + } + } + + if (!result) { + return result; + } + unZigZagInt128(value); + scaleInt128(value, static_cast<uint32_t>(scale), + static_cast<uint32_t>(currentScale)); + return value >= MIN_VALUE && value <= MAX_VALUE; + } + + public: + DecimalHive11ColumnReader(const Type& type, StripeStreams& stipe); + ~DecimalHive11ColumnReader() override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + }; + + DecimalHive11ColumnReader::DecimalHive11ColumnReader + (const Type& type, + StripeStreams& stripe + ): Decimal64ColumnReader(type, stripe) { + scale = stripe.getForcedScaleOnHive11Decimal(); + throwOnOverflow = stripe.getThrowOnHive11DecimalOverflow(); + errorStream = stripe.getErrorStream(); + } + + DecimalHive11ColumnReader::~DecimalHive11ColumnReader() { + // PASS + } + + void DecimalHive11ColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; + Decimal128VectorBatch &batch = + dynamic_cast<Decimal128VectorBatch&>(rowBatch); + Int128* values = batch.values.data(); + // read the next group of scales + int64_t* scaleBuffer = batch.readScales.data(); + + scaleDecoder->next(scaleBuffer, numValues, notNull); + + batch.precision = precision; + batch.scale = scale; + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + if (!readInt128(values[i], + static_cast<int32_t>(scaleBuffer[i]))) { + if (throwOnOverflow) { + throw ParseError("Hive 0.11 decimal was more than 38 digits."); + } else { + *errorStream << "Warning: " + << "Hive 0.11 decimal with more than 38 digits " + << "replaced by NULL.\n"; + notNull[i] = false; + } + } + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + if (!readInt128(values[i], + static_cast<int32_t>(scaleBuffer[i]))) { + if (throwOnOverflow) { + throw ParseError("Hive 0.11 decimal was more than 38 digits."); + } else { + *errorStream << "Warning: " + << "Hive 0.11 decimal with more than 38 digits " + << "replaced by NULL.\n"; + batch.hasNulls = true; + batch.notNull[i] = false; + } + } + } + } + } + + /** + * Create a reader for the given stripe. 
+ */ + std::unique_ptr<ColumnReader> buildReader(const Type& type, + StripeStreams& stripe) { + switch (static_cast<int64_t>(type.getKind())) { + case DATE: + case INT: + case LONG: + case SHORT: + return std::unique_ptr<ColumnReader>( + new IntegerColumnReader(type, stripe)); + case BINARY: + case CHAR: + case STRING: + case VARCHAR: + switch (static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())){ + case proto::ColumnEncoding_Kind_DICTIONARY: + case proto::ColumnEncoding_Kind_DICTIONARY_V2: + return std::unique_ptr<ColumnReader>( + new StringDictionaryColumnReader(type, stripe)); + case proto::ColumnEncoding_Kind_DIRECT: + case proto::ColumnEncoding_Kind_DIRECT_V2: + return std::unique_ptr<ColumnReader>( + new StringDirectColumnReader(type, stripe)); + default: + throw NotImplementedYet("buildReader unhandled string encoding"); + } + + case BOOLEAN: + return std::unique_ptr<ColumnReader>( + new BooleanColumnReader(type, stripe)); + + case BYTE: + return std::unique_ptr<ColumnReader>( + new ByteColumnReader(type, stripe)); + + case LIST: + return std::unique_ptr<ColumnReader>( + new ListColumnReader(type, stripe)); + + case MAP: + return std::unique_ptr<ColumnReader>( + new MapColumnReader(type, stripe)); + + case UNION: + return std::unique_ptr<ColumnReader>( + new UnionColumnReader(type, stripe)); + + case STRUCT: + return std::unique_ptr<ColumnReader>( + new StructColumnReader(type, stripe)); + + case FLOAT: + case DOUBLE: + return std::unique_ptr<ColumnReader>( + new DoubleColumnReader(type, stripe)); + + case TIMESTAMP: + return std::unique_ptr<ColumnReader> + (new TimestampColumnReader(type, stripe)); + + case DECIMAL: + // is this a Hive 0.11 or 0.12 file? + if (type.getPrecision() == 0) { + return std::unique_ptr<ColumnReader> + (new DecimalHive11ColumnReader(type, stripe)); + + // can we represent the values using int64_t? + } else if (type.getPrecision() <= + Decimal64ColumnReader::MAX_PRECISION_64) { + return std::unique_ptr<ColumnReader> + (new Decimal64ColumnReader(type, stripe)); + + // otherwise we use the Int128 implementation + } else { + return std::unique_ptr<ColumnReader> + (new Decimal128ColumnReader(type, stripe)); + } + + default: + throw NotImplementedYet("buildReader unhandled type"); + } + } + +} diff --git a/contrib/libs/apache/orc/c++/src/ColumnReader.hh b/contrib/libs/apache/orc/c++/src/ColumnReader.hh index 0c64e5b80f..5023cdfab5 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnReader.hh +++ b/contrib/libs/apache/orc/c++/src/ColumnReader.hh @@ -1,156 +1,156 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_COLUMN_READER_HH -#define ORC_COLUMN_READER_HH - -#include <unordered_map> - -#include "orc/Vector.hh" - -#include "ByteRLE.hh" -#include "Compression.hh" -#include "Timezone.hh" -#include "wrap/orc-proto-wrapper.hh" - -namespace orc { - - class StripeStreams { - public: - virtual ~StripeStreams(); - - /** - * Get the array of booleans for which columns are selected. - * @return the address of an array which contains true at the index of - * each columnId is selected. - */ - virtual const std::vector<bool> getSelectedColumns() const = 0; - - /** - * Get the encoding for the given column for this stripe. - */ - virtual proto::ColumnEncoding getEncoding(uint64_t columnId) const = 0; - - /** - * Get the stream for the given column/kind in this stripe. - * @param columnId the id of the column - * @param kind the kind of the stream - * @param shouldStream should the reading page the stream in - * @return the new stream - */ - virtual std::unique_ptr<SeekableInputStream> - getStream(uint64_t columnId, - proto::Stream_Kind kind, - bool shouldStream) const = 0; - - /** - * Get the memory pool for this reader. - */ - virtual MemoryPool& getMemoryPool() const = 0; - - /** - * Get the writer's timezone, so that we can convert their dates correctly. - */ - virtual const Timezone& getWriterTimezone() const = 0; - - /** - * Get the error stream. - * @return a pointer to the stream that should get error messages - */ - virtual std::ostream* getErrorStream() const = 0; - - /** - * Should the reader throw when the scale overflows when reading Hive 0.11 - * decimals. - * @return true if it should throw - */ - virtual bool getThrowOnHive11DecimalOverflow() const = 0; - - /** - * What is the scale forced on the Hive 0.11 decimals? - * @return the number of scale digits - */ - virtual int32_t getForcedScaleOnHive11Decimal() const = 0; - }; - - /** - * The interface for reading ORC data types. - */ - class ColumnReader { - protected: - std::unique_ptr<ByteRleDecoder> notNullDecoder; - uint64_t columnId; - MemoryPool& memoryPool; - - public: - ColumnReader(const Type& type, StripeStreams& stipe); - - virtual ~ColumnReader(); - - /** - * Skip number of specified rows. - * @param numValues the number of values to skip - * @return the number of non-null values skipped - */ - virtual uint64_t skip(uint64_t numValues); - - /** - * Read the next group of values into this rowBatch. - * @param rowBatch the memory to read into. - * @param numValues the number of values to read - * @param notNull if null, all values are not null. Otherwise, it is - * a mask (with at least numValues bytes) for which values to - * set. - */ - virtual void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull); - - /** - * Read the next group of values without decoding - * @param rowBatch the memory to read into. - * @param numValues the number of values to read - * @param notNull if null, all values are not null. Otherwise, it is - * a mask (with at least numValues bytes) for which values to - * set. - */ - virtual void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) - { - rowBatch.isEncoded = false; - next(rowBatch, numValues, notNull); - } - - /** - * Seek to beginning of a row group in the current stripe - * @param positions a list of PositionProviders storing the positions - */ - virtual void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions); - - }; - - /** - * Create a reader for the given stripe. 
- */ - std::unique_ptr<ColumnReader> buildReader(const Type& type, - StripeStreams& stripe); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_COLUMN_READER_HH +#define ORC_COLUMN_READER_HH + +#include <unordered_map> + +#include "orc/Vector.hh" + +#include "ByteRLE.hh" +#include "Compression.hh" +#include "Timezone.hh" +#include "wrap/orc-proto-wrapper.hh" + +namespace orc { + + class StripeStreams { + public: + virtual ~StripeStreams(); + + /** + * Get the array of booleans for which columns are selected. + * @return the address of an array which contains true at the index of + * each columnId is selected. + */ + virtual const std::vector<bool> getSelectedColumns() const = 0; + + /** + * Get the encoding for the given column for this stripe. + */ + virtual proto::ColumnEncoding getEncoding(uint64_t columnId) const = 0; + + /** + * Get the stream for the given column/kind in this stripe. + * @param columnId the id of the column + * @param kind the kind of the stream + * @param shouldStream should the reading page the stream in + * @return the new stream + */ + virtual std::unique_ptr<SeekableInputStream> + getStream(uint64_t columnId, + proto::Stream_Kind kind, + bool shouldStream) const = 0; + + /** + * Get the memory pool for this reader. + */ + virtual MemoryPool& getMemoryPool() const = 0; + + /** + * Get the writer's timezone, so that we can convert their dates correctly. + */ + virtual const Timezone& getWriterTimezone() const = 0; + + /** + * Get the error stream. + * @return a pointer to the stream that should get error messages + */ + virtual std::ostream* getErrorStream() const = 0; + + /** + * Should the reader throw when the scale overflows when reading Hive 0.11 + * decimals. + * @return true if it should throw + */ + virtual bool getThrowOnHive11DecimalOverflow() const = 0; + + /** + * What is the scale forced on the Hive 0.11 decimals? + * @return the number of scale digits + */ + virtual int32_t getForcedScaleOnHive11Decimal() const = 0; + }; + + /** + * The interface for reading ORC data types. + */ + class ColumnReader { + protected: + std::unique_ptr<ByteRleDecoder> notNullDecoder; + uint64_t columnId; + MemoryPool& memoryPool; + + public: + ColumnReader(const Type& type, StripeStreams& stipe); + + virtual ~ColumnReader(); + + /** + * Skip number of specified rows. + * @param numValues the number of values to skip + * @return the number of non-null values skipped + */ + virtual uint64_t skip(uint64_t numValues); + + /** + * Read the next group of values into this rowBatch. + * @param rowBatch the memory to read into. + * @param numValues the number of values to read + * @param notNull if null, all values are not null. 
Otherwise, it is + * a mask (with at least numValues bytes) for which values to + * set. + */ + virtual void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull); + + /** + * Read the next group of values without decoding + * @param rowBatch the memory to read into. + * @param numValues the number of values to read + * @param notNull if null, all values are not null. Otherwise, it is + * a mask (with at least numValues bytes) for which values to + * set. + */ + virtual void nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) + { + rowBatch.isEncoded = false; + next(rowBatch, numValues, notNull); + } + + /** + * Seek to beginning of a row group in the current stripe + * @param positions a list of PositionProviders storing the positions + */ + virtual void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions); + + }; + + /** + * Create a reader for the given stripe. + */ + std::unique_ptr<ColumnReader> buildReader(const Type& type, + StripeStreams& stripe); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc index 1408a15457..8d4d00cc61 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc +++ b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc @@ -1,3013 +1,3013 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "orc/Int128.hh" -#include "orc/Writer.hh" - -#include "ByteRLE.hh" -#include "ColumnWriter.hh" -#include "RLE.hh" -#include "Statistics.hh" -#include "Timezone.hh" - -namespace orc { - StreamsFactory::~StreamsFactory() { - //PASS - } - - class StreamsFactoryImpl : public StreamsFactory { - public: - StreamsFactoryImpl( - const WriterOptions& writerOptions, - OutputStream* outputStream) : - options(writerOptions), - outStream(outputStream) { - } - - virtual std::unique_ptr<BufferedOutputStream> - createStream(proto::Stream_Kind kind) const override; - private: - const WriterOptions& options; - OutputStream* outStream; - }; - - std::unique_ptr<BufferedOutputStream> StreamsFactoryImpl::createStream( - proto::Stream_Kind) const { - // In the future, we can decide compression strategy and modifier - // based on stream kind. 
But for now we just use the setting from - // WriterOption - return createCompressor( - options.getCompression(), - outStream, - options.getCompressionStrategy(), - // BufferedOutputStream initial capacity - 1 * 1024 * 1024, - options.getCompressionBlockSize(), - *options.getMemoryPool()); - } - - std::unique_ptr<StreamsFactory> createStreamsFactory( - const WriterOptions& options, - OutputStream* outStream) { - return std::unique_ptr<StreamsFactory>( - new StreamsFactoryImpl(options, outStream)); - } - - RowIndexPositionRecorder::~RowIndexPositionRecorder() { - // PASS - } - - proto::ColumnEncoding_Kind RleVersionMapper(RleVersion rleVersion) - { - switch (rleVersion) - { - case RleVersion_1: - return proto::ColumnEncoding_Kind_DIRECT; - case RleVersion_2: - return proto::ColumnEncoding_Kind_DIRECT_V2; - default: - throw InvalidArgument("Invalid param"); - } - } - - ColumnWriter::ColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - columnId(type.getColumnId()), - colIndexStatistics(), - colStripeStatistics(), - colFileStatistics(), - enableIndex(options.getEnableIndex()), - rowIndex(), - rowIndexEntry(), - rowIndexPosition(), - enableBloomFilter(false), - memPool(*options.getMemoryPool()), - indexStream(), - bloomFilterStream() { - - std::unique_ptr<BufferedOutputStream> presentStream = - factory.createStream(proto::Stream_Kind_PRESENT); - notNullEncoder = createBooleanRleEncoder(std::move(presentStream)); - - colIndexStatistics = createColumnStatistics(type); - colStripeStatistics = createColumnStatistics(type); - colFileStatistics = createColumnStatistics(type); - - if (enableIndex) { - rowIndex = std::unique_ptr<proto::RowIndex>(new proto::RowIndex()); - rowIndexEntry = - std::unique_ptr<proto::RowIndexEntry>(new proto::RowIndexEntry()); - rowIndexPosition = std::unique_ptr<RowIndexPositionRecorder>( - new RowIndexPositionRecorder(*rowIndexEntry)); - indexStream = - factory.createStream(proto::Stream_Kind_ROW_INDEX); - - // BloomFilters for non-UTF8 strings and non-UTC timestamps are not supported - if (options.isColumnUseBloomFilter(columnId) - && options.getBloomFilterVersion() == BloomFilterVersion::UTF8) { - enableBloomFilter = true; - bloomFilter.reset(new BloomFilterImpl( - options.getRowIndexStride(), options.getBloomFilterFPP())); - bloomFilterIndex.reset(new proto::BloomFilterIndex()); - bloomFilterStream = factory.createStream(proto::Stream_Kind_BLOOM_FILTER_UTF8); - } - } - } - - ColumnWriter::~ColumnWriter() { - // PASS - } - - void ColumnWriter::add(ColumnVectorBatch& batch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - notNullEncoder->add(batch.notNull.data() + offset, numValues, incomingMask); - } - - void ColumnWriter::flush(std::vector<proto::Stream>& streams) { - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_PRESENT); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(notNullEncoder->flush()); - streams.push_back(stream); - } - - uint64_t ColumnWriter::getEstimatedSize() const { - return notNullEncoder->getBufferSize(); - } - - void ColumnWriter::getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - getProtoBufStatistics(stats, colStripeStatistics.get()); - } - - void ColumnWriter::mergeStripeStatsIntoFileStats() { - colFileStatistics->merge(*colStripeStatistics); - colStripeStatistics->reset(); - } - - void ColumnWriter::mergeRowGroupStatsIntoStripeStats() { - colStripeStatistics->merge(*colIndexStatistics); - 
colIndexStatistics->reset(); - } - - void ColumnWriter::getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - getProtoBufStatistics(stats, colFileStatistics.get()); - } - - void ColumnWriter::createRowIndexEntry() { - proto::ColumnStatistics *indexStats = rowIndexEntry->mutable_statistics(); - colIndexStatistics->toProtoBuf(*indexStats); - - *rowIndex->add_entry() = *rowIndexEntry; - - rowIndexEntry->clear_positions(); - rowIndexEntry->clear_statistics(); - - colStripeStatistics->merge(*colIndexStatistics); - colIndexStatistics->reset(); - - addBloomFilterEntry(); - - recordPosition(); - } - - void ColumnWriter::addBloomFilterEntry() { - if (enableBloomFilter) { - BloomFilterUTF8Utils::serialize(*bloomFilter, *bloomFilterIndex->add_bloomfilter()); - bloomFilter->reset(); - } - } - - void ColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const { - // write row index to output stream - rowIndex->SerializeToZeroCopyStream(indexStream.get()); - - // construct row index stream - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_ROW_INDEX); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(indexStream->flush()); - streams.push_back(stream); - - // write BLOOM_FILTER_UTF8 stream - if (enableBloomFilter) { - if (!bloomFilterIndex->SerializeToZeroCopyStream(bloomFilterStream.get())) { - throw std::logic_error("Failed to write bloom filter stream."); - } - stream.set_kind(proto::Stream_Kind_BLOOM_FILTER_UTF8); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(bloomFilterStream->flush()); - streams.push_back(stream); - } - } - - void ColumnWriter::recordPosition() const { - notNullEncoder->recordPosition(rowIndexPosition.get()); - } - - void ColumnWriter::reset() { - if (enableIndex) { - // clear row index - rowIndex->clear_entry(); - rowIndexEntry->clear_positions(); - rowIndexEntry->clear_statistics(); - - // write current positions - recordPosition(); - } - - if (enableBloomFilter) { - bloomFilter->reset(); - bloomFilterIndex->clear_bloomfilter(); - } - } - - void ColumnWriter::writeDictionary() { - // PASS - } - - class StructColumnWriter : public ColumnWriter { - public: - StructColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - ~StructColumnWriter() override; - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void mergeStripeStatsIntoFileStats() override; - - virtual void mergeRowGroupStatsIntoStripeStats() override; - - virtual void createRowIndexEntry() override; - - virtual void writeIndex( - std::vector<proto::Stream> &streams) const override; - - virtual void writeDictionary() override; - - virtual void reset() override; - - private: - std::vector<ColumnWriter *> children; - }; - - StructColumnWriter::StructColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options) { - for(unsigned int i = 0; i < type.getSubtypeCount(); ++i) { - const Type& child = 
*type.getSubtype(i); - children.push_back(buildWriter(child, factory, options).release()); - } - - if (enableIndex) { - recordPosition(); - } - } - - StructColumnWriter::~StructColumnWriter() { - for (uint32_t i = 0; i < children.size(); ++i) { - delete children[i]; - } - } - - void StructColumnWriter::add( - ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const StructVectorBatch* structBatch = - dynamic_cast<const StructVectorBatch *>(&rowBatch); - if (structBatch == nullptr) { - throw InvalidArgument("Failed to cast to StructVectorBatch"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - const char* notNull = structBatch->hasNulls ? - structBatch->notNull.data() + offset : nullptr; - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->add(*structBatch->fields[i], offset, numValues, notNull); - } - - // update stats - if (!notNull) { - colIndexStatistics->increase(numValues); - } else { - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull[i]) { - ++count; - } - } - colIndexStatistics->increase(count); - if (count < numValues) { - colIndexStatistics->setHasNull(true); - } - } - } - - void StructColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->flush(streams); - } - } - - void StructColumnWriter::writeIndex( - std::vector<proto::Stream> &streams) const { - ColumnWriter::writeIndex(streams); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->writeIndex(streams); - } - } - - uint64_t StructColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - for (uint32_t i = 0; i < children.size(); ++i) { - size += children[i]->getEstimatedSize(); - } - return size; - } - - void StructColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); - encoding.set_dictionarysize(0); - encodings.push_back(encoding); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getColumnEncoding(encodings); - } - } - - void StructColumnWriter::getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - ColumnWriter::getStripeStatistics(stats); - - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getStripeStatistics(stats); - } - } - - void StructColumnWriter::mergeStripeStatsIntoFileStats() { - ColumnWriter::mergeStripeStatsIntoFileStats(); - - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->mergeStripeStatsIntoFileStats(); - } - } - - void StructColumnWriter::getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - ColumnWriter::getFileStatistics(stats); - - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getFileStatistics(stats); - } - } - - void StructColumnWriter::mergeRowGroupStatsIntoStripeStats() { - ColumnWriter::mergeRowGroupStatsIntoStripeStats(); - - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->mergeRowGroupStatsIntoStripeStats(); - } - } - - void StructColumnWriter::createRowIndexEntry() { - ColumnWriter::createRowIndexEntry(); - - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->createRowIndexEntry(); - } - } - - void StructColumnWriter::reset() { - ColumnWriter::reset(); - - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->reset(); - } - } - - void 
StructColumnWriter::writeDictionary() { - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->writeDictionary(); - } - } - - class IntegerColumnWriter : public ColumnWriter { - public: - IntegerColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void recordPosition() const override; - - protected: - std::unique_ptr<RleEncoder> rleEncoder; - - private: - RleVersion rleVersion; - }; - - IntegerColumnWriter::IntegerColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()) { - std::unique_ptr<BufferedOutputStream> dataStream = - factory.createStream(proto::Stream_Kind_DATA); - rleEncoder = createRleEncoder( - std::move(dataStream), - true, - rleVersion, - memPool, - options.getAlignedBitpacking()); - - if (enableIndex) { - recordPosition(); - } - } - - void IntegerColumnWriter::add( - ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const LongVectorBatch* longBatch = - dynamic_cast<const LongVectorBatch*>(&rowBatch); - if (longBatch == nullptr) { - throw InvalidArgument("Failed to cast to LongVectorBatch"); - } - IntegerColumnStatisticsImpl* intStats = - dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get()); - if (intStats == nullptr) { - throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - const int64_t* data = longBatch->data.data() + offset; - const char* notNull = longBatch->hasNulls ? 
- longBatch->notNull.data() + offset : nullptr; - - rleEncoder->add(data, numValues, notNull); - - // update stats - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull == nullptr || notNull[i]) { - ++count; - if (enableBloomFilter) { - bloomFilter->addLong(data[i]); - } - intStats->update(data[i], 1); - } - } - intStats->increase(count); - if (count < numValues) { - intStats->setHasNull(true); - } - } - - void IntegerColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_DATA); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(rleEncoder->flush()); - streams.push_back(stream); - } - - uint64_t IntegerColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - size += rleEncoder->getBufferSize(); - return size; - } - - void IntegerColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(RleVersionMapper(rleVersion)); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - } - - void IntegerColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - rleEncoder->recordPosition(rowIndexPosition.get()); - } - - class ByteColumnWriter : public ColumnWriter { - public: - ByteColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void recordPosition() const override; - - private: - std::unique_ptr<ByteRleEncoder> byteRleEncoder; - }; - - ByteColumnWriter::ByteColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options) { - std::unique_ptr<BufferedOutputStream> dataStream = - factory.createStream(proto::Stream_Kind_DATA); - byteRleEncoder = createByteRleEncoder(std::move(dataStream)); - - if (enableIndex) { - recordPosition(); - } - } - - void ByteColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch); - if (byteBatch == nullptr) { - throw InvalidArgument("Failed to cast to LongVectorBatch"); - } - IntegerColumnStatisticsImpl* intStats = - dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get()); - if (intStats == nullptr) { - throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - int64_t* data = byteBatch->data.data() + offset; - const char* notNull = byteBatch->hasNulls ? 
- byteBatch->notNull.data() + offset : nullptr; - - char* byteData = reinterpret_cast<char*>(data); - for (uint64_t i = 0; i < numValues; ++i) { - byteData[i] = static_cast<char>(data[i]); - } - byteRleEncoder->add(byteData, numValues, notNull); - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull == nullptr || notNull[i]) { - ++count; - if (enableBloomFilter) { - bloomFilter->addLong(data[i]); - } - intStats->update(static_cast<int64_t>(byteData[i]), 1); - } - } - intStats->increase(count); - if (count < numValues) { - intStats->setHasNull(true); - } - } - - void ByteColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_DATA); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(byteRleEncoder->flush()); - streams.push_back(stream); - } - - uint64_t ByteColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - size += byteRleEncoder->getBufferSize(); - return size; - } - - void ByteColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - } - - void ByteColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - byteRleEncoder->recordPosition(rowIndexPosition.get()); - } - - class BooleanColumnWriter : public ColumnWriter { - public: - BooleanColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void recordPosition() const override; - - private: - std::unique_ptr<ByteRleEncoder> rleEncoder; - }; - - BooleanColumnWriter::BooleanColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options) { - std::unique_ptr<BufferedOutputStream> dataStream = - factory.createStream(proto::Stream_Kind_DATA); - rleEncoder = createBooleanRleEncoder(std::move(dataStream)); - - if (enableIndex) { - recordPosition(); - } - } - - void BooleanColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch); - if (byteBatch == nullptr) { - throw InvalidArgument("Failed to cast to LongVectorBatch"); - } - BooleanColumnStatisticsImpl* boolStats = - dynamic_cast<BooleanColumnStatisticsImpl*>(colIndexStatistics.get()); - if (boolStats == nullptr) { - throw InvalidArgument("Failed to cast to BooleanColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - int64_t* data = byteBatch->data.data() + offset; - const char* notNull = byteBatch->hasNulls ? 
- byteBatch->notNull.data() + offset : nullptr; - - char* byteData = reinterpret_cast<char*>(data); - for (uint64_t i = 0; i < numValues; ++i) { - byteData[i] = static_cast<char>(data[i]); - } - rleEncoder->add(byteData, numValues, notNull); - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull == nullptr || notNull[i]) { - ++count; - if (enableBloomFilter) { - bloomFilter->addLong(data[i]); - } - boolStats->update(byteData[i] != 0, 1); - } - } - boolStats->increase(count); - if (count < numValues) { - boolStats->setHasNull(true); - } - } - - void BooleanColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_DATA); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(rleEncoder->flush()); - streams.push_back(stream); - } - - uint64_t BooleanColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - size += rleEncoder->getBufferSize(); - return size; - } - - void BooleanColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - } - - void BooleanColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - rleEncoder->recordPosition(rowIndexPosition.get()); - } - - class DoubleColumnWriter : public ColumnWriter { - public: - DoubleColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options, - bool isFloat); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void recordPosition() const override; - - private: - bool isFloat; - std::unique_ptr<AppendOnlyBufferedStream> dataStream; - DataBuffer<char> buffer; - }; - - DoubleColumnWriter::DoubleColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options, - bool isFloatType) : - ColumnWriter(type, factory, options), - isFloat(isFloatType), - buffer(*options.getMemoryPool()) { - dataStream.reset(new AppendOnlyBufferedStream( - factory.createStream(proto::Stream_Kind_DATA))); - buffer.resize(isFloat ? 4 : 8); - - if (enableIndex) { - recordPosition(); - } - } - - // Floating point types are stored using IEEE 754 floating point bit layout. - // Float columns use 4 bytes per value and double columns use 8 bytes. 
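The comment above describes the raw little-endian IEEE-754 layout that the DATA stream uses for FLOAT and DOUBLE columns. As an editorial aside (not part of the patch; every name below is illustrative only), the following standalone sketch prints that byte layout, so 1.0f comes out as 00 00 80 3f and 1.0 as 00 00 00 00 00 00 f0 3f:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Print the little-endian IEEE-754 byte layout used for FLOAT/DOUBLE data.
    template <typename FloatType, typename IntType>
    static void printFloatBytes(FloatType value) {
      IntType bits = 0;
      std::memcpy(&bits, &value, sizeof(bits));           // grab the raw bit pattern
      for (std::size_t i = 0; i < sizeof(IntType); ++i) { // least significant byte first
        std::printf("%02x ", static_cast<unsigned>((bits >> (8 * i)) & 0xff));
      }
      std::printf("\n");
    }

    int main() {
      printFloatBytes<float, std::uint32_t>(1.0f);  // prints: 00 00 80 3f
      printFloatBytes<double, std::uint64_t>(1.0);  // prints: 00 00 00 00 00 00 f0 3f
      return 0;
    }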
- template <typename FLOAT_TYPE, typename INTEGER_TYPE>
- inline void encodeFloatNum(FLOAT_TYPE input, char* output) {
- INTEGER_TYPE* intBits = reinterpret_cast<INTEGER_TYPE*>(&input);
- for (size_t i = 0; i < sizeof(INTEGER_TYPE); ++i) {
- output[i] = static_cast<char>(((*intBits) >> (8 * i)) & 0xff);
- }
- }
-
- void DoubleColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- const DoubleVectorBatch* dblBatch =
- dynamic_cast<const DoubleVectorBatch*>(&rowBatch);
- if (dblBatch == nullptr) {
- throw InvalidArgument("Failed to cast to DoubleVectorBatch");
- }
- DoubleColumnStatisticsImpl* doubleStats =
- dynamic_cast<DoubleColumnStatisticsImpl*>(colIndexStatistics.get());
- if (doubleStats == nullptr) {
- throw InvalidArgument("Failed to cast to DoubleColumnStatisticsImpl");
- }
-
- ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
-
- const double* doubleData = dblBatch->data.data() + offset;
- const char* notNull = dblBatch->hasNulls ?
- dblBatch->notNull.data() + offset : nullptr;
-
- size_t bytes = isFloat ? 4 : 8;
- char* data = buffer.data();
- uint64_t count = 0;
- for (uint64_t i = 0; i < numValues; ++i) {
- if (!notNull || notNull[i]) {
- if (isFloat) {
- encodeFloatNum<float, int32_t>(static_cast<float>(doubleData[i]), data);
- } else {
- encodeFloatNum<double, int64_t>(doubleData[i], data);
- }
- dataStream->write(data, bytes);
- ++count;
- if (enableBloomFilter) {
- bloomFilter->addDouble(doubleData[i]);
- }
- doubleStats->update(doubleData[i]);
- }
- }
- doubleStats->increase(count);
- if (count < numValues) {
- doubleStats->setHasNull(true);
- }
- }
-
- void DoubleColumnWriter::flush(std::vector<proto::Stream>& streams) {
- ColumnWriter::flush(streams);
-
- proto::Stream stream;
- stream.set_kind(proto::Stream_Kind_DATA);
- stream.set_column(static_cast<uint32_t>(columnId));
- stream.set_length(dataStream->flush());
- streams.push_back(stream);
- }
-
- uint64_t DoubleColumnWriter::getEstimatedSize() const {
- uint64_t size = ColumnWriter::getEstimatedSize();
- size += dataStream->getSize();
- return size;
- }
-
- void DoubleColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
- proto::ColumnEncoding encoding;
- encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
- encoding.set_dictionarysize(0);
- if (enableBloomFilter) {
- encoding.set_bloomencoding(BloomFilterVersion::UTF8);
- }
- encodings.push_back(encoding);
- }
-
- void DoubleColumnWriter::recordPosition() const {
- ColumnWriter::recordPosition();
- dataStream->recordPosition(rowIndexPosition.get());
- }
-
- /**
- * Implementation of increasing sorted string dictionary
- */
- class SortedStringDictionary {
- public:
- struct DictEntry {
- DictEntry(const char * str, size_t len):data(str),length(len) {}
- const char * data;
- size_t length;
- };
-
- SortedStringDictionary():totalLength(0) {}
-
- // insert a new string into dictionary, return its insertion order
- size_t insert(const char * data, size_t len);
-
- // write dictionary data & length to output buffer
- void flush(AppendOnlyBufferedStream * dataStream,
- RleEncoder * lengthEncoder) const;
-
- // reorder input index buffer from insertion order to dictionary order
- void reorder(std::vector<int64_t>& idxBuffer) const;
-
- // get dict entries in insertion order
- void getEntriesInInsertionOrder(std::vector<const DictEntry *>&) const;
-
- // return count of entries
- size_t size() const;
-
- // return total length of strings in the dictionary
- uint64_t length() const;
-
- void clear();
-
- private:
- struct LessThan {
- bool operator()(const DictEntry& left, const DictEntry& right) const {
- int ret = memcmp(left.data, right.data, std::min(left.length, right.length));
- if (ret != 0) {
- return ret < 0;
- }
- return left.length < right.length;
- }
- };
-
- std::map<DictEntry, size_t, LessThan> dict;
- std::vector<std::vector<char>> data;
- uint64_t totalLength;
-
- // use friend class here to avoid being bothered by const function calls
- friend class StringColumnWriter;
- friend class CharColumnWriter;
- friend class VarCharColumnWriter;
- // store indexes of insertion order in the dictionary for not-null rows
- std::vector<int64_t> idxInDictBuffer;
- };
-
- // insert a new string into dictionary, return its insertion order
- size_t SortedStringDictionary::insert(const char * str, size_t len) {
- auto ret = dict.insert({DictEntry(str, len), dict.size()});
- if (ret.second) {
- // make a copy to internal storage
- data.push_back(std::vector<char>(len));
- memcpy(data.back().data(), str, len);
- // update dictionary entry to link pointer to internal storage
- DictEntry * entry = const_cast<DictEntry *>(&(ret.first->first));
- entry->data = data.back().data();
- totalLength += len;
- }
- return ret.first->second;
- }
-
- // write dictionary data & length to output buffer
- void SortedStringDictionary::flush(AppendOnlyBufferedStream * dataStream,
- RleEncoder * lengthEncoder) const {
- for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
- dataStream->write(it->first.data, it->first.length);
- lengthEncoder->write(static_cast<int64_t>(it->first.length));
- }
- }
-
- /**
- * Reorder input index buffer from insertion order to dictionary order
- *
- * We require this function because string values are buffered by their
- * indexes in insertion order. Only once the entire dictionary is complete
- * can we obtain their sorted positions, and the ORC specification requires
- * the dictionary to be ordered. Therefore this function transforms the
- * indexes from insertion order to dictionary value order for the final
- * output.
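 *
 * (Illustrative example with assumed inputs, added for clarity: if "orc",
 * "apache" and "zebra" are inserted in that order, their insertion indexes
 * are 0, 1 and 2 while their sorted dictionary positions are 1, 0 and 2,
 * so an index buffer [0, 2, 1] is rewritten in place to [1, 2, 0].)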
- */
- void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const {
- // iterate the dictionary to get mapping from insertion order to value order
- std::vector<size_t> mapping(dict.size());
- size_t dictIdx = 0;
- for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
- mapping[it->second] = dictIdx++;
- }
-
- // do the transformation
- for (size_t i = 0; i != idxBuffer.size(); ++i) {
- idxBuffer[i] = static_cast<int64_t>(
- mapping[static_cast<size_t>(idxBuffer[i])]);
- }
- }
-
- // get dict entries in insertion order
- void SortedStringDictionary::getEntriesInInsertionOrder(
- std::vector<const DictEntry *>& entries) const {
- entries.resize(dict.size());
- for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
- entries[it->second] = &(it->first);
- }
- }
-
- // return count of entries
- size_t SortedStringDictionary::size() const {
- return dict.size();
- }
-
- // return total length of strings in the dictionary
- uint64_t SortedStringDictionary::length() const {
- return totalLength;
- }
-
- void SortedStringDictionary::clear() {
- totalLength = 0;
- data.clear();
- dict.clear();
- }
-
- class StringColumnWriter : public ColumnWriter {
- public:
- StringColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) override;
-
- virtual void flush(std::vector<proto::Stream>& streams) override;
-
- virtual uint64_t getEstimatedSize() const override;
-
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
-
- virtual void recordPosition() const override;
-
- virtual void createRowIndexEntry() override;
-
- virtual void writeDictionary() override;
-
- virtual void reset() override;
-
- private:
- /**
- * dictionary related functions
- */
- bool checkDictionaryKeyRatio();
- void createDirectStreams();
- void createDictStreams();
- void deleteDictStreams();
- void fallbackToDirectEncoding();
-
- protected:
- RleVersion rleVersion;
- bool useCompression;
- const StreamsFactory& streamsFactory;
- bool alignedBitPacking;
-
- // direct encoding streams
- std::unique_ptr<RleEncoder> directLengthEncoder;
- std::unique_ptr<AppendOnlyBufferedStream> directDataStream;
-
- // dictionary encoding streams
- std::unique_ptr<RleEncoder> dictDataEncoder;
- std::unique_ptr<RleEncoder> dictLengthEncoder;
- std::unique_ptr<AppendOnlyBufferedStream> dictStream;
-
- /**
- * dictionary related variables
- */
- SortedStringDictionary dictionary;
- // whether or not dictionary checking is done
- bool doneDictionaryCheck;
- // whether or not it should be used
- bool useDictionary;
- // keys in the dictionary should not exceed this ratio
- double dictSizeThreshold;
-
- // record start row of each row group; null rows are skipped
- mutable std::vector<size_t> startOfRowGroups;
- };
-
- StringColumnWriter::StringColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()),
- useCompression(options.getCompression() != CompressionKind_NONE),
- streamsFactory(factory),
- alignedBitPacking(options.getAlignedBitpacking()),
- doneDictionaryCheck(false),
- useDictionary(options.getEnableDictionary()),
- dictSizeThreshold(options.getDictionaryKeySizeThreshold()){
- if (type.getKind() == TypeKind::BINARY) {
- useDictionary = false;
- doneDictionaryCheck = true;
- }
-
- if (useDictionary) {
- 
createDictStreams(); - } else { - doneDictionaryCheck = true; - createDirectStreams(); - } - - if (enableIndex) { - recordPosition(); - } - } - - void StringColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const StringVectorBatch* stringBatch = - dynamic_cast<const StringVectorBatch*>(&rowBatch); - if (stringBatch == nullptr) { - throw InvalidArgument("Failed to cast to StringVectorBatch"); - } - - StringColumnStatisticsImpl* strStats = - dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get()); - if (strStats == nullptr) { - throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - char *const * data = stringBatch->data.data() + offset; - const int64_t* length = stringBatch->length.data() + offset; - const char* notNull = stringBatch->hasNulls ? - stringBatch->notNull.data() + offset : nullptr; - - if (!useDictionary){ - directLengthEncoder->add(length, numValues, notNull); - } - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - const size_t len = static_cast<size_t>(length[i]); - if (useDictionary) { - size_t index = dictionary.insert(data[i], len); - dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index)); - } else { - directDataStream->write(data[i], len); - } - if (enableBloomFilter) { - bloomFilter->addBytes(data[i], static_cast<int64_t>(len)); - } - strStats->update(data[i], len); - ++count; - } - } - strStats->increase(count); - if (count < numValues) { - strStats->setHasNull(true); - } - } - - void StringColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - if (useDictionary) { - proto::Stream data; - data.set_kind(proto::Stream_Kind_DATA); - data.set_column(static_cast<uint32_t>(columnId)); - data.set_length(dictDataEncoder->flush()); - streams.push_back(data); - - proto::Stream dict; - dict.set_kind(proto::Stream_Kind_DICTIONARY_DATA); - dict.set_column(static_cast<uint32_t>(columnId)); - dict.set_length(dictStream->flush()); - streams.push_back(dict); - - proto::Stream length; - length.set_kind(proto::Stream_Kind_LENGTH); - length.set_column(static_cast<uint32_t>(columnId)); - length.set_length(dictLengthEncoder->flush()); - streams.push_back(length); - } else { - proto::Stream length; - length.set_kind(proto::Stream_Kind_LENGTH); - length.set_column(static_cast<uint32_t>(columnId)); - length.set_length(directLengthEncoder->flush()); - streams.push_back(length); - - proto::Stream data; - data.set_kind(proto::Stream_Kind_DATA); - data.set_column(static_cast<uint32_t>(columnId)); - data.set_length(directDataStream->flush()); - streams.push_back(data); - } - } - - uint64_t StringColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - if (!useDictionary) { - size += directLengthEncoder->getBufferSize(); - size += directDataStream->getSize(); - } else { - size += dictionary.length(); - size += dictionary.size() * sizeof(int32_t); - size += dictionary.idxInDictBuffer.size() * sizeof(int32_t); - if (useCompression) { - size /= 3; // estimated ratio is 3:1 - } - } - return size; - } - - void StringColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - if (!useDictionary) { - encoding.set_kind(rleVersion == RleVersion_1 ? 
- proto::ColumnEncoding_Kind_DIRECT : - proto::ColumnEncoding_Kind_DIRECT_V2); - } else { - encoding.set_kind(rleVersion == RleVersion_1 ? - proto::ColumnEncoding_Kind_DICTIONARY : - proto::ColumnEncoding_Kind_DICTIONARY_V2); - } - encoding.set_dictionarysize(static_cast<uint32_t>(dictionary.size())); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - } - - void StringColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - if (!useDictionary) { - directDataStream->recordPosition(rowIndexPosition.get()); - directLengthEncoder->recordPosition(rowIndexPosition.get()); - } else { - if (enableIndex) { - startOfRowGroups.push_back(dictionary.idxInDictBuffer.size()); - } - } - } - - bool StringColumnWriter::checkDictionaryKeyRatio() { - if (!doneDictionaryCheck) { - useDictionary = dictionary.size() <= static_cast<size_t>( - static_cast<double>(dictionary.idxInDictBuffer.size()) * dictSizeThreshold); - doneDictionaryCheck = true; - } - - return useDictionary; - } - - void StringColumnWriter::createRowIndexEntry() { - if (useDictionary && !doneDictionaryCheck) { - if (!checkDictionaryKeyRatio()) { - fallbackToDirectEncoding(); - } - } - ColumnWriter::createRowIndexEntry(); - } - - void StringColumnWriter::reset() { - ColumnWriter::reset(); - - dictionary.clear(); - dictionary.idxInDictBuffer.resize(0); - startOfRowGroups.clear(); - startOfRowGroups.push_back(0); - } - - void StringColumnWriter::createDirectStreams() { - std::unique_ptr<BufferedOutputStream> directLengthStream = - streamsFactory.createStream(proto::Stream_Kind_LENGTH); - directLengthEncoder = createRleEncoder(std::move(directLengthStream), - false, - rleVersion, - memPool, - alignedBitPacking); - directDataStream.reset(new AppendOnlyBufferedStream( - streamsFactory.createStream(proto::Stream_Kind_DATA))); - } - - void StringColumnWriter::createDictStreams() { - std::unique_ptr<BufferedOutputStream> dictDataStream = - streamsFactory.createStream(proto::Stream_Kind_DATA); - dictDataEncoder = createRleEncoder(std::move(dictDataStream), - false, - rleVersion, - memPool, - alignedBitPacking); - std::unique_ptr<BufferedOutputStream> dictLengthStream = - streamsFactory.createStream(proto::Stream_Kind_LENGTH); - dictLengthEncoder = createRleEncoder(std::move(dictLengthStream), - false, - rleVersion, - memPool, - alignedBitPacking); - dictStream.reset(new AppendOnlyBufferedStream( - streamsFactory.createStream(proto::Stream_Kind_DICTIONARY_DATA))); - } - - void StringColumnWriter::deleteDictStreams() { - dictDataEncoder.reset(nullptr); - dictLengthEncoder.reset(nullptr); - dictStream.reset(nullptr); - - dictionary.clear(); - dictionary.idxInDictBuffer.clear(); - startOfRowGroups.clear(); - } - - void StringColumnWriter::writeDictionary() { - if (useDictionary && !doneDictionaryCheck) { - // when index is disabled, dictionary check happens while writing 1st stripe - if (!checkDictionaryKeyRatio()) { - fallbackToDirectEncoding(); - return; - } - } - - if (useDictionary) { - // flush dictionary data & length streams - dictionary.flush(dictStream.get(), dictLengthEncoder.get()); - - // convert index from insertion order to dictionary order - dictionary.reorder(dictionary.idxInDictBuffer); - - // write data sequences - int64_t * data = dictionary.idxInDictBuffer.data(); - if (enableIndex) { - size_t prevOffset = 0; - for (size_t i = 0; i < startOfRowGroups.size(); ++i) { - // write sequences in batch for a row group stride - size_t offset = 
startOfRowGroups[i]; - dictDataEncoder->add(data + prevOffset, offset - prevOffset, nullptr); - - // update index positions - int rowGroupId = static_cast<int>(i); - proto::RowIndexEntry* indexEntry = - (rowGroupId < rowIndex->entry_size()) ? - rowIndex->mutable_entry(rowGroupId) : rowIndexEntry.get(); - - // add positions for direct streams - RowIndexPositionRecorder recorder(*indexEntry); - dictDataEncoder->recordPosition(&recorder); - - prevOffset = offset; - } - - dictDataEncoder->add(data + prevOffset, - dictionary.idxInDictBuffer.size() - prevOffset, - nullptr); - } else { - dictDataEncoder->add(data, dictionary.idxInDictBuffer.size(), nullptr); - } - } - } - - void StringColumnWriter::fallbackToDirectEncoding() { - createDirectStreams(); - - if (enableIndex) { - // fallback happens at the 1st row group; - // simply complete positions for direct streams - proto::RowIndexEntry * indexEntry = rowIndexEntry.get(); - RowIndexPositionRecorder recorder(*indexEntry); - directDataStream->recordPosition(&recorder); - directLengthEncoder->recordPosition(&recorder); - } - - // get dictionary entries in insertion order - std::vector<const SortedStringDictionary::DictEntry *> entries; - dictionary.getEntriesInInsertionOrder(entries); - - // store each length of the data into a vector - const SortedStringDictionary::DictEntry * dictEntry = nullptr; - for (uint64_t i = 0; i != dictionary.idxInDictBuffer.size(); ++i) { - // write one row data in direct encoding - dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer[i])]; - directDataStream->write(dictEntry->data, dictEntry->length); - directLengthEncoder->write(static_cast<int64_t>(dictEntry->length)); - } - - deleteDictStreams(); - } - - struct Utf8Utils { - /** - * Counts how many utf-8 chars of the input data - */ - static uint64_t charLength(const char * data, uint64_t length) { - uint64_t chars = 0; - for (uint64_t i = 0; i < length; i++) { - if (isUtfStartByte(data[i])) { - chars++; - } - } - return chars; - } - - /** - * Return the number of bytes required to read at most maxCharLength - * characters in full from a utf-8 encoded byte array provided - * by data. This does not validate utf-8 data, but - * operates correctly on already valid utf-8 data. - * - * @param maxCharLength number of characters required - * @param data the bytes of UTF-8 - * @param length the length of data to truncate - */ - static uint64_t truncateBytesTo(uint64_t maxCharLength, - const char * data, - uint64_t length) { - uint64_t chars = 0; - if (length <= maxCharLength) { - return length; - } - for (uint64_t i = 0; i < length; i++) { - if (isUtfStartByte(data[i])) { - chars++; - } - if (chars > maxCharLength) { - return i; - } - } - // everything fits - return length; - } - - /** - * Checks if b is the first byte of a UTF-8 character. - */ - inline static bool isUtfStartByte(char b) { - return (b & 0xC0) != 0x80; - } - - /** - * Find the start of the last character that ends in the current string. 
- * @param text the bytes of the utf-8 - * @param from the first byte location - * @param until the last byte location - * @return the index of the last character - */ - static uint64_t findLastCharacter(const char * text, uint64_t from, uint64_t until) { - uint64_t posn = until; - /* we don't expect characters more than 5 bytes */ - while (posn >= from) { - if (isUtfStartByte(text[posn])) { - return posn; - } - posn -= 1; - } - /* beginning of a valid char not found */ - throw std::logic_error( - "Could not truncate string, beginning of a valid char not found"); - } - }; - - class CharColumnWriter : public StringColumnWriter { - public: - CharColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - StringColumnWriter(type, factory, options), - maxLength(type.getMaximumLength()), - padBuffer(*options.getMemoryPool()) { - // utf-8 is currently 4 bytes long, but it could be up to 6 - padBuffer.resize(maxLength * 6); - } - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - private: - uint64_t maxLength; - DataBuffer<char> padBuffer; - }; - - void CharColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch); - if (charsBatch == nullptr) { - throw InvalidArgument("Failed to cast to StringVectorBatch"); - } - - StringColumnStatisticsImpl* strStats = - dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get()); - if (strStats == nullptr) { - throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - char** data = charsBatch->data.data() + offset; - int64_t* length = charsBatch->length.data() + offset; - const char* notNull = charsBatch->hasNulls ? 
- charsBatch->notNull.data() + offset : nullptr; - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - const char * charData = nullptr; - uint64_t originLength = static_cast<uint64_t>(length[i]); - uint64_t charLength = Utf8Utils::charLength(data[i], originLength); - if (charLength >= maxLength) { - charData = data[i]; - length[i] = static_cast<int64_t>( - Utf8Utils::truncateBytesTo(maxLength, data[i], originLength)); - } else { - charData = padBuffer.data(); - // the padding is exactly 1 byte per char - length[i] = length[i] + static_cast<int64_t>(maxLength - charLength); - memcpy(padBuffer.data(), data[i], originLength); - memset(padBuffer.data() + originLength, - ' ', - static_cast<size_t>(length[i]) - originLength); - } - - if (useDictionary) { - size_t index = dictionary.insert(charData, static_cast<size_t>(length[i])); - dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index)); - } else { - directDataStream->write(charData, static_cast<size_t>(length[i])); - } - - if (enableBloomFilter) { - bloomFilter->addBytes(data[i], length[i]); - } - strStats->update(charData, static_cast<size_t>(length[i])); - ++count; - } - } - - if (!useDictionary) { - directLengthEncoder->add(length, numValues, notNull); - } - - strStats->increase(count); - if (count < numValues) { - strStats->setHasNull(true); - } - } - - class VarCharColumnWriter : public StringColumnWriter { - public: - VarCharColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - StringColumnWriter(type, factory, options), - maxLength(type.getMaximumLength()) { - // PASS - } - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - private: - uint64_t maxLength; - }; - - void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch); - if (charsBatch == nullptr) { - throw InvalidArgument("Failed to cast to StringVectorBatch"); - } - - StringColumnStatisticsImpl* strStats = - dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get()); - if (strStats == nullptr) { - throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - char* const* data = charsBatch->data.data() + offset; - int64_t* length = charsBatch->length.data() + offset; - const char* notNull = charsBatch->hasNulls ? 
- charsBatch->notNull.data() + offset : nullptr; - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - uint64_t itemLength = Utf8Utils::truncateBytesTo( - maxLength, data[i], static_cast<uint64_t>(length[i])); - length[i] = static_cast<int64_t>(itemLength); - - if (useDictionary) { - size_t index = dictionary.insert(data[i], static_cast<size_t>(length[i])); - dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index)); - } else { - directDataStream->write(data[i], static_cast<size_t>(length[i])); - } - - if (enableBloomFilter) { - bloomFilter->addBytes(data[i], length[i]); - } - strStats->update(data[i], static_cast<size_t>(length[i])); - ++count; - } - } - - if (!useDictionary) { - directLengthEncoder->add(length, numValues, notNull); - } - - strStats->increase(count); - if (count < numValues) { - strStats->setHasNull(true); - } - } - - class BinaryColumnWriter : public StringColumnWriter { - public: - BinaryColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - StringColumnWriter(type, factory, options) { - // PASS - } - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - }; - - void BinaryColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - StringVectorBatch* binBatch = dynamic_cast<StringVectorBatch*>(&rowBatch); - if (binBatch == nullptr) { - throw InvalidArgument("Failed to cast to StringVectorBatch"); - } - - BinaryColumnStatisticsImpl* binStats = - dynamic_cast<BinaryColumnStatisticsImpl*>(colIndexStatistics.get()); - if (binStats == nullptr) { - throw InvalidArgument("Failed to cast to BinaryColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - char** data = binBatch->data.data() + offset; - int64_t* length = binBatch->length.data() + offset; - const char* notNull = binBatch->hasNulls ? 
- binBatch->notNull.data() + offset : nullptr; - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - uint64_t unsignedLength = static_cast<uint64_t>(length[i]); - if (!notNull || notNull[i]) { - directDataStream->write(data[i], unsignedLength); - - binStats->update(unsignedLength); - ++count; - } - } - directLengthEncoder->add(length, numValues, notNull); - binStats->increase(count); - if (count < numValues) { - binStats->setHasNull(true); - } - } - - class TimestampColumnWriter : public ColumnWriter { - public: - TimestampColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void recordPosition() const override; - - protected: - std::unique_ptr<RleEncoder> secRleEncoder, nanoRleEncoder; - - private: - RleVersion rleVersion; - const Timezone& timezone; - }; - - TimestampColumnWriter::TimestampColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()), - timezone(getTimezoneByName("GMT")){ - std::unique_ptr<BufferedOutputStream> dataStream = - factory.createStream(proto::Stream_Kind_DATA); - std::unique_ptr<BufferedOutputStream> secondaryStream = - factory.createStream(proto::Stream_Kind_SECONDARY); - secRleEncoder = createRleEncoder(std::move(dataStream), - true, - rleVersion, - memPool, - options.getAlignedBitpacking()); - nanoRleEncoder = createRleEncoder(std::move(secondaryStream), - false, - rleVersion, - memPool, - options.getAlignedBitpacking()); - - if (enableIndex) { - recordPosition(); - } - } - - // Because the number of nanoseconds often has a large number of trailing zeros, - // the number has trailing decimal zero digits removed and the last three bits - // are used to record how many zeros were removed if the trailing zeros are - // more than 2. Thus 1000 nanoseconds would be serialized as 0x0a and - // 100000 would be serialized as 0x0c. - static int64_t formatNano(int64_t nanos) { - if (nanos == 0) { - return 0; - } else if (nanos % 100 != 0) { - return (nanos) << 3; - } else { - nanos /= 100; - int64_t trailingZeros = 1; - while (nanos % 10 == 0 && trailingZeros < 7) { - nanos /= 10; - trailingZeros += 1; - } - return (nanos) << 3 | trailingZeros; - } - } - - void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - TimestampVectorBatch* tsBatch = - dynamic_cast<TimestampVectorBatch*>(&rowBatch); - if (tsBatch == nullptr) { - throw InvalidArgument("Failed to cast to TimestampVectorBatch"); - } - - TimestampColumnStatisticsImpl* tsStats = - dynamic_cast<TimestampColumnStatisticsImpl*>(colIndexStatistics.get()); - if (tsStats == nullptr) { - throw InvalidArgument("Failed to cast to TimestampColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - const char* notNull = tsBatch->hasNulls ? 
- tsBatch->notNull.data() + offset : nullptr; - int64_t *secs = tsBatch->data.data() + offset; - int64_t *nanos = tsBatch->nanoseconds.data() + offset; - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull == nullptr || notNull[i]) { - // TimestampVectorBatch already stores data in UTC - int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000; - ++count; - if (enableBloomFilter) { - bloomFilter->addLong(millsUTC); - } - tsStats->update(millsUTC); - +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/Int128.hh" +#include "orc/Writer.hh" + +#include "ByteRLE.hh" +#include "ColumnWriter.hh" +#include "RLE.hh" +#include "Statistics.hh" +#include "Timezone.hh" + +namespace orc { + StreamsFactory::~StreamsFactory() { + //PASS + } + + class StreamsFactoryImpl : public StreamsFactory { + public: + StreamsFactoryImpl( + const WriterOptions& writerOptions, + OutputStream* outputStream) : + options(writerOptions), + outStream(outputStream) { + } + + virtual std::unique_ptr<BufferedOutputStream> + createStream(proto::Stream_Kind kind) const override; + private: + const WriterOptions& options; + OutputStream* outStream; + }; + + std::unique_ptr<BufferedOutputStream> StreamsFactoryImpl::createStream( + proto::Stream_Kind) const { + // In the future, we can decide compression strategy and modifier + // based on stream kind. 
But for now we just use the setting from + // WriterOption + return createCompressor( + options.getCompression(), + outStream, + options.getCompressionStrategy(), + // BufferedOutputStream initial capacity + 1 * 1024 * 1024, + options.getCompressionBlockSize(), + *options.getMemoryPool()); + } + + std::unique_ptr<StreamsFactory> createStreamsFactory( + const WriterOptions& options, + OutputStream* outStream) { + return std::unique_ptr<StreamsFactory>( + new StreamsFactoryImpl(options, outStream)); + } + + RowIndexPositionRecorder::~RowIndexPositionRecorder() { + // PASS + } + + proto::ColumnEncoding_Kind RleVersionMapper(RleVersion rleVersion) + { + switch (rleVersion) + { + case RleVersion_1: + return proto::ColumnEncoding_Kind_DIRECT; + case RleVersion_2: + return proto::ColumnEncoding_Kind_DIRECT_V2; + default: + throw InvalidArgument("Invalid param"); + } + } + + ColumnWriter::ColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + columnId(type.getColumnId()), + colIndexStatistics(), + colStripeStatistics(), + colFileStatistics(), + enableIndex(options.getEnableIndex()), + rowIndex(), + rowIndexEntry(), + rowIndexPosition(), + enableBloomFilter(false), + memPool(*options.getMemoryPool()), + indexStream(), + bloomFilterStream() { + + std::unique_ptr<BufferedOutputStream> presentStream = + factory.createStream(proto::Stream_Kind_PRESENT); + notNullEncoder = createBooleanRleEncoder(std::move(presentStream)); + + colIndexStatistics = createColumnStatistics(type); + colStripeStatistics = createColumnStatistics(type); + colFileStatistics = createColumnStatistics(type); + + if (enableIndex) { + rowIndex = std::unique_ptr<proto::RowIndex>(new proto::RowIndex()); + rowIndexEntry = + std::unique_ptr<proto::RowIndexEntry>(new proto::RowIndexEntry()); + rowIndexPosition = std::unique_ptr<RowIndexPositionRecorder>( + new RowIndexPositionRecorder(*rowIndexEntry)); + indexStream = + factory.createStream(proto::Stream_Kind_ROW_INDEX); + + // BloomFilters for non-UTF8 strings and non-UTC timestamps are not supported + if (options.isColumnUseBloomFilter(columnId) + && options.getBloomFilterVersion() == BloomFilterVersion::UTF8) { + enableBloomFilter = true; + bloomFilter.reset(new BloomFilterImpl( + options.getRowIndexStride(), options.getBloomFilterFPP())); + bloomFilterIndex.reset(new proto::BloomFilterIndex()); + bloomFilterStream = factory.createStream(proto::Stream_Kind_BLOOM_FILTER_UTF8); + } + } + } + + ColumnWriter::~ColumnWriter() { + // PASS + } + + void ColumnWriter::add(ColumnVectorBatch& batch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + notNullEncoder->add(batch.notNull.data() + offset, numValues, incomingMask); + } + + void ColumnWriter::flush(std::vector<proto::Stream>& streams) { + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_PRESENT); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(notNullEncoder->flush()); + streams.push_back(stream); + } + + uint64_t ColumnWriter::getEstimatedSize() const { + return notNullEncoder->getBufferSize(); + } + + void ColumnWriter::getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + getProtoBufStatistics(stats, colStripeStatistics.get()); + } + + void ColumnWriter::mergeStripeStatsIntoFileStats() { + colFileStatistics->merge(*colStripeStatistics); + colStripeStatistics->reset(); + } + + void ColumnWriter::mergeRowGroupStatsIntoStripeStats() { + colStripeStatistics->merge(*colIndexStatistics); + 
colIndexStatistics->reset(); + } + + void ColumnWriter::getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + getProtoBufStatistics(stats, colFileStatistics.get()); + } + + void ColumnWriter::createRowIndexEntry() { + proto::ColumnStatistics *indexStats = rowIndexEntry->mutable_statistics(); + colIndexStatistics->toProtoBuf(*indexStats); + + *rowIndex->add_entry() = *rowIndexEntry; + + rowIndexEntry->clear_positions(); + rowIndexEntry->clear_statistics(); + + colStripeStatistics->merge(*colIndexStatistics); + colIndexStatistics->reset(); + + addBloomFilterEntry(); + + recordPosition(); + } + + void ColumnWriter::addBloomFilterEntry() { + if (enableBloomFilter) { + BloomFilterUTF8Utils::serialize(*bloomFilter, *bloomFilterIndex->add_bloomfilter()); + bloomFilter->reset(); + } + } + + void ColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const { + // write row index to output stream + rowIndex->SerializeToZeroCopyStream(indexStream.get()); + + // construct row index stream + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_ROW_INDEX); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(indexStream->flush()); + streams.push_back(stream); + + // write BLOOM_FILTER_UTF8 stream + if (enableBloomFilter) { + if (!bloomFilterIndex->SerializeToZeroCopyStream(bloomFilterStream.get())) { + throw std::logic_error("Failed to write bloom filter stream."); + } + stream.set_kind(proto::Stream_Kind_BLOOM_FILTER_UTF8); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(bloomFilterStream->flush()); + streams.push_back(stream); + } + } + + void ColumnWriter::recordPosition() const { + notNullEncoder->recordPosition(rowIndexPosition.get()); + } + + void ColumnWriter::reset() { + if (enableIndex) { + // clear row index + rowIndex->clear_entry(); + rowIndexEntry->clear_positions(); + rowIndexEntry->clear_statistics(); + + // write current positions + recordPosition(); + } + + if (enableBloomFilter) { + bloomFilter->reset(); + bloomFilterIndex->clear_bloomfilter(); + } + } + + void ColumnWriter::writeDictionary() { + // PASS + } + + class StructColumnWriter : public ColumnWriter { + public: + StructColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + ~StructColumnWriter() override; + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void mergeStripeStatsIntoFileStats() override; + + virtual void mergeRowGroupStatsIntoStripeStats() override; + + virtual void createRowIndexEntry() override; + + virtual void writeIndex( + std::vector<proto::Stream> &streams) const override; + + virtual void writeDictionary() override; + + virtual void reset() override; + + private: + std::vector<ColumnWriter *> children; + }; + + StructColumnWriter::StructColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options) { + for(unsigned int i = 0; i < type.getSubtypeCount(); ++i) { + const Type& child = 
*type.getSubtype(i); + children.push_back(buildWriter(child, factory, options).release()); + } + + if (enableIndex) { + recordPosition(); + } + } + + StructColumnWriter::~StructColumnWriter() { + for (uint32_t i = 0; i < children.size(); ++i) { + delete children[i]; + } + } + + void StructColumnWriter::add( + ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + const StructVectorBatch* structBatch = + dynamic_cast<const StructVectorBatch *>(&rowBatch); + if (structBatch == nullptr) { + throw InvalidArgument("Failed to cast to StructVectorBatch"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + const char* notNull = structBatch->hasNulls ? + structBatch->notNull.data() + offset : nullptr; + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->add(*structBatch->fields[i], offset, numValues, notNull); + } + + // update stats + if (!notNull) { + colIndexStatistics->increase(numValues); + } else { + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull[i]) { + ++count; + } + } + colIndexStatistics->increase(count); + if (count < numValues) { + colIndexStatistics->setHasNull(true); + } + } + } + + void StructColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->flush(streams); + } + } + + void StructColumnWriter::writeIndex( + std::vector<proto::Stream> &streams) const { + ColumnWriter::writeIndex(streams); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->writeIndex(streams); + } + } + + uint64_t StructColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + for (uint32_t i = 0; i < children.size(); ++i) { + size += children[i]->getEstimatedSize(); + } + return size; + } + + void StructColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); + encoding.set_dictionarysize(0); + encodings.push_back(encoding); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->getColumnEncoding(encodings); + } + } + + void StructColumnWriter::getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + ColumnWriter::getStripeStatistics(stats); + + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->getStripeStatistics(stats); + } + } + + void StructColumnWriter::mergeStripeStatsIntoFileStats() { + ColumnWriter::mergeStripeStatsIntoFileStats(); + + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->mergeStripeStatsIntoFileStats(); + } + } + + void StructColumnWriter::getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + ColumnWriter::getFileStatistics(stats); + + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->getFileStatistics(stats); + } + } + + void StructColumnWriter::mergeRowGroupStatsIntoStripeStats() { + ColumnWriter::mergeRowGroupStatsIntoStripeStats(); + + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->mergeRowGroupStatsIntoStripeStats(); + } + } + + void StructColumnWriter::createRowIndexEntry() { + ColumnWriter::createRowIndexEntry(); + + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->createRowIndexEntry(); + } + } + + void StructColumnWriter::reset() { + ColumnWriter::reset(); + + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->reset(); + } + } + + void 
StructColumnWriter::writeDictionary() { + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->writeDictionary(); + } + } + + class IntegerColumnWriter : public ColumnWriter { + public: + IntegerColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void recordPosition() const override; + + protected: + std::unique_ptr<RleEncoder> rleEncoder; + + private: + RleVersion rleVersion; + }; + + IntegerColumnWriter::IntegerColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options), + rleVersion(options.getRleVersion()) { + std::unique_ptr<BufferedOutputStream> dataStream = + factory.createStream(proto::Stream_Kind_DATA); + rleEncoder = createRleEncoder( + std::move(dataStream), + true, + rleVersion, + memPool, + options.getAlignedBitpacking()); + + if (enableIndex) { + recordPosition(); + } + } + + void IntegerColumnWriter::add( + ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + const LongVectorBatch* longBatch = + dynamic_cast<const LongVectorBatch*>(&rowBatch); + if (longBatch == nullptr) { + throw InvalidArgument("Failed to cast to LongVectorBatch"); + } + IntegerColumnStatisticsImpl* intStats = + dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get()); + if (intStats == nullptr) { + throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const int64_t* data = longBatch->data.data() + offset; + const char* notNull = longBatch->hasNulls ? 
+ longBatch->notNull.data() + offset : nullptr; + + rleEncoder->add(data, numValues, notNull); + + // update stats + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + ++count; + if (enableBloomFilter) { + bloomFilter->addLong(data[i]); + } + intStats->update(data[i], 1); + } + } + intStats->increase(count); + if (count < numValues) { + intStats->setHasNull(true); + } + } + + void IntegerColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_DATA); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(rleEncoder->flush()); + streams.push_back(stream); + } + + uint64_t IntegerColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += rleEncoder->getBufferSize(); + return size; + } + + void IntegerColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(RleVersionMapper(rleVersion)); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + } + + void IntegerColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + rleEncoder->recordPosition(rowIndexPosition.get()); + } + + class ByteColumnWriter : public ColumnWriter { + public: + ByteColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void recordPosition() const override; + + private: + std::unique_ptr<ByteRleEncoder> byteRleEncoder; + }; + + ByteColumnWriter::ByteColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options) { + std::unique_ptr<BufferedOutputStream> dataStream = + factory.createStream(proto::Stream_Kind_DATA); + byteRleEncoder = createByteRleEncoder(std::move(dataStream)); + + if (enableIndex) { + recordPosition(); + } + } + + void ByteColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch); + if (byteBatch == nullptr) { + throw InvalidArgument("Failed to cast to LongVectorBatch"); + } + IntegerColumnStatisticsImpl* intStats = + dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get()); + if (intStats == nullptr) { + throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + int64_t* data = byteBatch->data.data() + offset; + const char* notNull = byteBatch->hasNulls ? 
+ byteBatch->notNull.data() + offset : nullptr; + + char* byteData = reinterpret_cast<char*>(data); + for (uint64_t i = 0; i < numValues; ++i) { + byteData[i] = static_cast<char>(data[i]); + } + byteRleEncoder->add(byteData, numValues, notNull); + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + ++count; + if (enableBloomFilter) { + bloomFilter->addLong(data[i]); + } + intStats->update(static_cast<int64_t>(byteData[i]), 1); + } + } + intStats->increase(count); + if (count < numValues) { + intStats->setHasNull(true); + } + } + + void ByteColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_DATA); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(byteRleEncoder->flush()); + streams.push_back(stream); + } + + uint64_t ByteColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += byteRleEncoder->getBufferSize(); + return size; + } + + void ByteColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + } + + void ByteColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + byteRleEncoder->recordPosition(rowIndexPosition.get()); + } + + class BooleanColumnWriter : public ColumnWriter { + public: + BooleanColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void recordPosition() const override; + + private: + std::unique_ptr<ByteRleEncoder> rleEncoder; + }; + + BooleanColumnWriter::BooleanColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options) { + std::unique_ptr<BufferedOutputStream> dataStream = + factory.createStream(proto::Stream_Kind_DATA); + rleEncoder = createBooleanRleEncoder(std::move(dataStream)); + + if (enableIndex) { + recordPosition(); + } + } + + void BooleanColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch); + if (byteBatch == nullptr) { + throw InvalidArgument("Failed to cast to LongVectorBatch"); + } + BooleanColumnStatisticsImpl* boolStats = + dynamic_cast<BooleanColumnStatisticsImpl*>(colIndexStatistics.get()); + if (boolStats == nullptr) { + throw InvalidArgument("Failed to cast to BooleanColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + int64_t* data = byteBatch->data.data() + offset; + const char* notNull = byteBatch->hasNulls ? 
+ byteBatch->notNull.data() + offset : nullptr; + + char* byteData = reinterpret_cast<char*>(data); + for (uint64_t i = 0; i < numValues; ++i) { + byteData[i] = static_cast<char>(data[i]); + } + rleEncoder->add(byteData, numValues, notNull); + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + ++count; + if (enableBloomFilter) { + bloomFilter->addLong(data[i]); + } + boolStats->update(byteData[i] != 0, 1); + } + } + boolStats->increase(count); + if (count < numValues) { + boolStats->setHasNull(true); + } + } + + void BooleanColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_DATA); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(rleEncoder->flush()); + streams.push_back(stream); + } + + uint64_t BooleanColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += rleEncoder->getBufferSize(); + return size; + } + + void BooleanColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + } + + void BooleanColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + rleEncoder->recordPosition(rowIndexPosition.get()); + } + + class DoubleColumnWriter : public ColumnWriter { + public: + DoubleColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options, + bool isFloat); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void recordPosition() const override; + + private: + bool isFloat; + std::unique_ptr<AppendOnlyBufferedStream> dataStream; + DataBuffer<char> buffer; + }; + + DoubleColumnWriter::DoubleColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options, + bool isFloatType) : + ColumnWriter(type, factory, options), + isFloat(isFloatType), + buffer(*options.getMemoryPool()) { + dataStream.reset(new AppendOnlyBufferedStream( + factory.createStream(proto::Stream_Kind_DATA))); + buffer.resize(isFloat ? 4 : 8); + + if (enableIndex) { + recordPosition(); + } + } + + // Floating point types are stored using IEEE 754 floating point bit layout. + // Float columns use 4 bytes per value and double columns use 8 bytes. 
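For reference, the little-endian byte layout that the encodeFloatNum helper below produces can be reproduced with a short standalone snippet. This is an illustrative sketch only; encodeLittleEndian and main are names introduced here and are not part of the ORC sources, and memcpy is used instead of the pointer cast purely to keep the example self-contained.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Copy the IEEE 754 bit pattern of a double and emit it as 8
    // little-endian bytes, the same layout encodeFloatNum<double, int64_t>
    // writes with shifts.
    static void encodeLittleEndian(double value, unsigned char* out) {
      uint64_t bits;
      std::memcpy(&bits, &value, sizeof bits);
      for (std::size_t i = 0; i < sizeof bits; ++i) {
        out[i] = static_cast<unsigned char>((bits >> (8 * i)) & 0xff);
      }
    }

    int main() {
      unsigned char buf[8];
      encodeLittleEndian(1.0, buf);   // 1.0 has bit pattern 0x3FF0000000000000
      for (unsigned char b : buf) {
        std::printf("%02x ", b);      // prints: 00 00 00 00 00 00 f0 3f
      }
      std::printf("\n");
      return 0;
    }

Float columns follow the same path with a 4-byte integer type.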
+ template <typename FLOAT_TYPE, typename INTEGER_TYPE> + inline void encodeFloatNum(FLOAT_TYPE input, char* output) { + INTEGER_TYPE* intBits = reinterpret_cast<INTEGER_TYPE*>(&input); + for (size_t i = 0; i < sizeof(INTEGER_TYPE); ++i) { + output[i] = static_cast<char>(((*intBits) >> (8 * i)) & 0xff); + } + } + + void DoubleColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + const DoubleVectorBatch* dblBatch = + dynamic_cast<const DoubleVectorBatch*>(&rowBatch); + if (dblBatch == nullptr) { + throw InvalidArgument("Failed to cast to DoubleVectorBatch"); + } + DoubleColumnStatisticsImpl* doubleStats = + dynamic_cast<DoubleColumnStatisticsImpl*>(colIndexStatistics.get()); + if (doubleStats == nullptr) { + throw InvalidArgument("Failed to cast to DoubleColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const double* doubleData = dblBatch->data.data() + offset; + const char* notNull = dblBatch->hasNulls ? + dblBatch->notNull.data() + offset : nullptr; + + size_t bytes = isFloat ? 4 : 8; + char* data = buffer.data(); + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + if (isFloat) { + encodeFloatNum<float, int32_t>(static_cast<float>(doubleData[i]), data); + } else { + encodeFloatNum<double, int64_t>(doubleData[i], data); + } + dataStream->write(data, bytes); + ++count; + if (enableBloomFilter) { + bloomFilter->addDouble(doubleData[i]); + } + doubleStats->update(doubleData[i]); + } + } + doubleStats->increase(count); + if (count < numValues) { + doubleStats->setHasNull(true); + } + } + + void DoubleColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_DATA); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(dataStream->flush()); + streams.push_back(stream); + } + + uint64_t DoubleColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += dataStream->getSize(); + return size; + } + + void DoubleColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + } + + void DoubleColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + dataStream->recordPosition(rowIndexPosition.get()); + } + + /** + * Implementation of increasing sorted string dictionary + */ + class SortedStringDictionary { + public: + struct DictEntry { + DictEntry(const char * str, size_t len):data(str),length(len) {} + const char * data; + size_t length; + }; + + SortedStringDictionary():totalLength(0) {} + + // insert a new string into dictionary, return its insertion order + size_t insert(const char * data, size_t len); + + // write dictionary data & length to output buffer + void flush(AppendOnlyBufferedStream * dataStream, + RleEncoder * lengthEncoder) const; + + // reorder input index buffer from insertion order to dictionary order + void reorder(std::vector<int64_t>& idxBuffer) const; + + // get dict entries in insertion order + void getEntriesInInsertionOrder(std::vector<const DictEntry *>&) const; + + // return count of entries + size_t size() const; + + // return total length of strings in the dictioanry + 
uint64_t length() const; + + void clear(); + + private: + struct LessThan { + bool operator()(const DictEntry& left, const DictEntry& right) const { + int ret = memcmp(left.data, right.data, std::min(left.length, right.length)); + if (ret != 0) { + return ret < 0; + } + return left.length < right.length; + } + }; + + std::map<DictEntry, size_t, LessThan> dict; + std::vector<std::vector<char>> data; + uint64_t totalLength; + + // use friend class here to avoid being bothered by const function calls + friend class StringColumnWriter; + friend class CharColumnWriter; + friend class VarCharColumnWriter; + // store indexes of insertion order in the dictionary for not-null rows + std::vector<int64_t> idxInDictBuffer; + }; + + // insert a new string into dictionary, return its insertion order + size_t SortedStringDictionary::insert(const char * str, size_t len) { + auto ret = dict.insert({DictEntry(str, len), dict.size()}); + if (ret.second) { + // make a copy to internal storage + data.push_back(std::vector<char>(len)); + memcpy(data.back().data(), str, len); + // update dictionary entry to link pointer to internal storage + DictEntry * entry = const_cast<DictEntry *>(&(ret.first->first)); + entry->data = data.back().data(); + totalLength += len; + } + return ret.first->second; + } + + // write dictionary data & length to output buffer + void SortedStringDictionary::flush(AppendOnlyBufferedStream * dataStream, + RleEncoder * lengthEncoder) const { + for (auto it = dict.cbegin(); it != dict.cend(); ++it) { + dataStream->write(it->first.data, it->first.length); + lengthEncoder->write(static_cast<int64_t>(it->first.length)); + } + } + + /** + * Reorder input index buffer from insertion order to dictionary order + * + * We require this function because string values are buffered by indexes + * in their insertion order. Until the entire dictionary is complete can + * we get their sorted indexes in the dictionary in that ORC specification + * demands dictionary should be ordered. Therefore this function transforms + * the indexes from insertion order to dictionary value order for final + * output. 
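   * For example, if the values "banana", "apple" and "cherry" were inserted
   * in that order, their insertion indexes are 0, 1 and 2, while their
   * positions in the sorted dictionary are 1, 0 and 2. reorder() therefore
   * rewrites buffered indexes 0 -> 1, 1 -> 0 and 2 -> 2.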
+ */ + void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const { + // iterate the dictionary to get mapping from insertion order to value order + std::vector<size_t> mapping(dict.size()); + size_t dictIdx = 0; + for (auto it = dict.cbegin(); it != dict.cend(); ++it) { + mapping[it->second] = dictIdx++; + } + + // do the transformation + for (size_t i = 0; i != idxBuffer.size(); ++i) { + idxBuffer[i] = static_cast<int64_t>( + mapping[static_cast<size_t>(idxBuffer[i])]); + } + } + + // get dict entries in insertion order + void SortedStringDictionary::getEntriesInInsertionOrder( + std::vector<const DictEntry *>& entries) const { + entries.resize(dict.size()); + for (auto it = dict.cbegin(); it != dict.cend(); ++it) { + entries[it->second] = &(it->first); + } + } + + // return count of entries + size_t SortedStringDictionary::size() const { + return dict.size(); + } + + // return total length of strings in the dictioanry + uint64_t SortedStringDictionary::length() const { + return totalLength; + } + + void SortedStringDictionary::clear() { + totalLength = 0; + data.clear(); + dict.clear(); + } + + class StringColumnWriter : public ColumnWriter { + public: + StringColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void recordPosition() const override; + + virtual void createRowIndexEntry() override; + + virtual void writeDictionary() override; + + virtual void reset() override; + + private: + /** + * dictionary related functions + */ + bool checkDictionaryKeyRatio(); + void createDirectStreams(); + void createDictStreams(); + void deleteDictStreams(); + void fallbackToDirectEncoding(); + + protected: + RleVersion rleVersion; + bool useCompression; + const StreamsFactory& streamsFactory; + bool alignedBitPacking; + + // direct encoding streams + std::unique_ptr<RleEncoder> directLengthEncoder; + std::unique_ptr<AppendOnlyBufferedStream> directDataStream; + + // dictionary encoding streams + std::unique_ptr<RleEncoder> dictDataEncoder; + std::unique_ptr<RleEncoder> dictLengthEncoder; + std::unique_ptr<AppendOnlyBufferedStream> dictStream; + + /** + * dictionary related variables + */ + SortedStringDictionary dictionary; + // whether or not dictionary checking is done + bool doneDictionaryCheck; + // whether or not it should be used + bool useDictionary; + // keys in the dictionary should not exceed this ratio + double dictSizeThreshold; + + // record start row of each row group; null rows are skipped + mutable std::vector<size_t> startOfRowGroups; + }; + + StringColumnWriter::StringColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options), + rleVersion(options.getRleVersion()), + useCompression(options.getCompression() != CompressionKind_NONE), + streamsFactory(factory), + alignedBitPacking(options.getAlignedBitpacking()), + doneDictionaryCheck(false), + useDictionary(options.getEnableDictionary()), + dictSizeThreshold(options.getDictionaryKeySizeThreshold()){ + if (type.getKind() == TypeKind::BINARY) { + useDictionary = false; + doneDictionaryCheck = true; + } + + if (useDictionary) { + 
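        // Dictionary streams are created first; if the key ratio check
        // fails later, fallbackToDirectEncoding() switches this writer to
        // direct streams and rewrites the rows buffered so far.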
createDictStreams(); + } else { + doneDictionaryCheck = true; + createDirectStreams(); + } + + if (enableIndex) { + recordPosition(); + } + } + + void StringColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + const StringVectorBatch* stringBatch = + dynamic_cast<const StringVectorBatch*>(&rowBatch); + if (stringBatch == nullptr) { + throw InvalidArgument("Failed to cast to StringVectorBatch"); + } + + StringColumnStatisticsImpl* strStats = + dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get()); + if (strStats == nullptr) { + throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + char *const * data = stringBatch->data.data() + offset; + const int64_t* length = stringBatch->length.data() + offset; + const char* notNull = stringBatch->hasNulls ? + stringBatch->notNull.data() + offset : nullptr; + + if (!useDictionary){ + directLengthEncoder->add(length, numValues, notNull); + } + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + const size_t len = static_cast<size_t>(length[i]); + if (useDictionary) { + size_t index = dictionary.insert(data[i], len); + dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index)); + } else { + directDataStream->write(data[i], len); + } + if (enableBloomFilter) { + bloomFilter->addBytes(data[i], static_cast<int64_t>(len)); + } + strStats->update(data[i], len); + ++count; + } + } + strStats->increase(count); + if (count < numValues) { + strStats->setHasNull(true); + } + } + + void StringColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + if (useDictionary) { + proto::Stream data; + data.set_kind(proto::Stream_Kind_DATA); + data.set_column(static_cast<uint32_t>(columnId)); + data.set_length(dictDataEncoder->flush()); + streams.push_back(data); + + proto::Stream dict; + dict.set_kind(proto::Stream_Kind_DICTIONARY_DATA); + dict.set_column(static_cast<uint32_t>(columnId)); + dict.set_length(dictStream->flush()); + streams.push_back(dict); + + proto::Stream length; + length.set_kind(proto::Stream_Kind_LENGTH); + length.set_column(static_cast<uint32_t>(columnId)); + length.set_length(dictLengthEncoder->flush()); + streams.push_back(length); + } else { + proto::Stream length; + length.set_kind(proto::Stream_Kind_LENGTH); + length.set_column(static_cast<uint32_t>(columnId)); + length.set_length(directLengthEncoder->flush()); + streams.push_back(length); + + proto::Stream data; + data.set_kind(proto::Stream_Kind_DATA); + data.set_column(static_cast<uint32_t>(columnId)); + data.set_length(directDataStream->flush()); + streams.push_back(data); + } + } + + uint64_t StringColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + if (!useDictionary) { + size += directLengthEncoder->getBufferSize(); + size += directDataStream->getSize(); + } else { + size += dictionary.length(); + size += dictionary.size() * sizeof(int32_t); + size += dictionary.idxInDictBuffer.size() * sizeof(int32_t); + if (useCompression) { + size /= 3; // estimated ratio is 3:1 + } + } + return size; + } + + void StringColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + if (!useDictionary) { + encoding.set_kind(rleVersion == RleVersion_1 ? 
+ proto::ColumnEncoding_Kind_DIRECT : + proto::ColumnEncoding_Kind_DIRECT_V2); + } else { + encoding.set_kind(rleVersion == RleVersion_1 ? + proto::ColumnEncoding_Kind_DICTIONARY : + proto::ColumnEncoding_Kind_DICTIONARY_V2); + } + encoding.set_dictionarysize(static_cast<uint32_t>(dictionary.size())); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + } + + void StringColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + if (!useDictionary) { + directDataStream->recordPosition(rowIndexPosition.get()); + directLengthEncoder->recordPosition(rowIndexPosition.get()); + } else { + if (enableIndex) { + startOfRowGroups.push_back(dictionary.idxInDictBuffer.size()); + } + } + } + + bool StringColumnWriter::checkDictionaryKeyRatio() { + if (!doneDictionaryCheck) { + useDictionary = dictionary.size() <= static_cast<size_t>( + static_cast<double>(dictionary.idxInDictBuffer.size()) * dictSizeThreshold); + doneDictionaryCheck = true; + } + + return useDictionary; + } + + void StringColumnWriter::createRowIndexEntry() { + if (useDictionary && !doneDictionaryCheck) { + if (!checkDictionaryKeyRatio()) { + fallbackToDirectEncoding(); + } + } + ColumnWriter::createRowIndexEntry(); + } + + void StringColumnWriter::reset() { + ColumnWriter::reset(); + + dictionary.clear(); + dictionary.idxInDictBuffer.resize(0); + startOfRowGroups.clear(); + startOfRowGroups.push_back(0); + } + + void StringColumnWriter::createDirectStreams() { + std::unique_ptr<BufferedOutputStream> directLengthStream = + streamsFactory.createStream(proto::Stream_Kind_LENGTH); + directLengthEncoder = createRleEncoder(std::move(directLengthStream), + false, + rleVersion, + memPool, + alignedBitPacking); + directDataStream.reset(new AppendOnlyBufferedStream( + streamsFactory.createStream(proto::Stream_Kind_DATA))); + } + + void StringColumnWriter::createDictStreams() { + std::unique_ptr<BufferedOutputStream> dictDataStream = + streamsFactory.createStream(proto::Stream_Kind_DATA); + dictDataEncoder = createRleEncoder(std::move(dictDataStream), + false, + rleVersion, + memPool, + alignedBitPacking); + std::unique_ptr<BufferedOutputStream> dictLengthStream = + streamsFactory.createStream(proto::Stream_Kind_LENGTH); + dictLengthEncoder = createRleEncoder(std::move(dictLengthStream), + false, + rleVersion, + memPool, + alignedBitPacking); + dictStream.reset(new AppendOnlyBufferedStream( + streamsFactory.createStream(proto::Stream_Kind_DICTIONARY_DATA))); + } + + void StringColumnWriter::deleteDictStreams() { + dictDataEncoder.reset(nullptr); + dictLengthEncoder.reset(nullptr); + dictStream.reset(nullptr); + + dictionary.clear(); + dictionary.idxInDictBuffer.clear(); + startOfRowGroups.clear(); + } + + void StringColumnWriter::writeDictionary() { + if (useDictionary && !doneDictionaryCheck) { + // when index is disabled, dictionary check happens while writing 1st stripe + if (!checkDictionaryKeyRatio()) { + fallbackToDirectEncoding(); + return; + } + } + + if (useDictionary) { + // flush dictionary data & length streams + dictionary.flush(dictStream.get(), dictLengthEncoder.get()); + + // convert index from insertion order to dictionary order + dictionary.reorder(dictionary.idxInDictBuffer); + + // write data sequences + int64_t * data = dictionary.idxInDictBuffer.data(); + if (enableIndex) { + size_t prevOffset = 0; + for (size_t i = 0; i < startOfRowGroups.size(); ++i) { + // write sequences in batch for a row group stride + size_t offset = 
startOfRowGroups[i]; + dictDataEncoder->add(data + prevOffset, offset - prevOffset, nullptr); + + // update index positions + int rowGroupId = static_cast<int>(i); + proto::RowIndexEntry* indexEntry = + (rowGroupId < rowIndex->entry_size()) ? + rowIndex->mutable_entry(rowGroupId) : rowIndexEntry.get(); + + // add positions for direct streams + RowIndexPositionRecorder recorder(*indexEntry); + dictDataEncoder->recordPosition(&recorder); + + prevOffset = offset; + } + + dictDataEncoder->add(data + prevOffset, + dictionary.idxInDictBuffer.size() - prevOffset, + nullptr); + } else { + dictDataEncoder->add(data, dictionary.idxInDictBuffer.size(), nullptr); + } + } + } + + void StringColumnWriter::fallbackToDirectEncoding() { + createDirectStreams(); + + if (enableIndex) { + // fallback happens at the 1st row group; + // simply complete positions for direct streams + proto::RowIndexEntry * indexEntry = rowIndexEntry.get(); + RowIndexPositionRecorder recorder(*indexEntry); + directDataStream->recordPosition(&recorder); + directLengthEncoder->recordPosition(&recorder); + } + + // get dictionary entries in insertion order + std::vector<const SortedStringDictionary::DictEntry *> entries; + dictionary.getEntriesInInsertionOrder(entries); + + // store each length of the data into a vector + const SortedStringDictionary::DictEntry * dictEntry = nullptr; + for (uint64_t i = 0; i != dictionary.idxInDictBuffer.size(); ++i) { + // write one row data in direct encoding + dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer[i])]; + directDataStream->write(dictEntry->data, dictEntry->length); + directLengthEncoder->write(static_cast<int64_t>(dictEntry->length)); + } + + deleteDictStreams(); + } + + struct Utf8Utils { + /** + * Counts how many utf-8 chars of the input data + */ + static uint64_t charLength(const char * data, uint64_t length) { + uint64_t chars = 0; + for (uint64_t i = 0; i < length; i++) { + if (isUtfStartByte(data[i])) { + chars++; + } + } + return chars; + } + + /** + * Return the number of bytes required to read at most maxCharLength + * characters in full from a utf-8 encoded byte array provided + * by data. This does not validate utf-8 data, but + * operates correctly on already valid utf-8 data. + * + * @param maxCharLength number of characters required + * @param data the bytes of UTF-8 + * @param length the length of data to truncate + */ + static uint64_t truncateBytesTo(uint64_t maxCharLength, + const char * data, + uint64_t length) { + uint64_t chars = 0; + if (length <= maxCharLength) { + return length; + } + for (uint64_t i = 0; i < length; i++) { + if (isUtfStartByte(data[i])) { + chars++; + } + if (chars > maxCharLength) { + return i; + } + } + // everything fits + return length; + } + + /** + * Checks if b is the first byte of a UTF-8 character. + */ + inline static bool isUtfStartByte(char b) { + return (b & 0xC0) != 0x80; + } + + /** + * Find the start of the last character that ends in the current string. 
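   * For example, with text holding "a" followed by the three-byte euro
   * sign (bytes 0x61 0xE2 0x82 0xAC), findLastCharacter(text, 0, 3) steps
   * back over the continuation bytes 0xAC and 0x82 and returns 1, the
   * index of the start byte 0xE2.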
+ * @param text the bytes of the utf-8 + * @param from the first byte location + * @param until the last byte location + * @return the index of the last character + */ + static uint64_t findLastCharacter(const char * text, uint64_t from, uint64_t until) { + uint64_t posn = until; + /* we don't expect characters more than 5 bytes */ + while (posn >= from) { + if (isUtfStartByte(text[posn])) { + return posn; + } + posn -= 1; + } + /* beginning of a valid char not found */ + throw std::logic_error( + "Could not truncate string, beginning of a valid char not found"); + } + }; + + class CharColumnWriter : public StringColumnWriter { + public: + CharColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + StringColumnWriter(type, factory, options), + maxLength(type.getMaximumLength()), + padBuffer(*options.getMemoryPool()) { + // utf-8 is currently 4 bytes long, but it could be up to 6 + padBuffer.resize(maxLength * 6); + } + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + private: + uint64_t maxLength; + DataBuffer<char> padBuffer; + }; + + void CharColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch); + if (charsBatch == nullptr) { + throw InvalidArgument("Failed to cast to StringVectorBatch"); + } + + StringColumnStatisticsImpl* strStats = + dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get()); + if (strStats == nullptr) { + throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + char** data = charsBatch->data.data() + offset; + int64_t* length = charsBatch->length.data() + offset; + const char* notNull = charsBatch->hasNulls ? 
+ charsBatch->notNull.data() + offset : nullptr; + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + const char * charData = nullptr; + uint64_t originLength = static_cast<uint64_t>(length[i]); + uint64_t charLength = Utf8Utils::charLength(data[i], originLength); + if (charLength >= maxLength) { + charData = data[i]; + length[i] = static_cast<int64_t>( + Utf8Utils::truncateBytesTo(maxLength, data[i], originLength)); + } else { + charData = padBuffer.data(); + // the padding is exactly 1 byte per char + length[i] = length[i] + static_cast<int64_t>(maxLength - charLength); + memcpy(padBuffer.data(), data[i], originLength); + memset(padBuffer.data() + originLength, + ' ', + static_cast<size_t>(length[i]) - originLength); + } + + if (useDictionary) { + size_t index = dictionary.insert(charData, static_cast<size_t>(length[i])); + dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index)); + } else { + directDataStream->write(charData, static_cast<size_t>(length[i])); + } + + if (enableBloomFilter) { + bloomFilter->addBytes(data[i], length[i]); + } + strStats->update(charData, static_cast<size_t>(length[i])); + ++count; + } + } + + if (!useDictionary) { + directLengthEncoder->add(length, numValues, notNull); + } + + strStats->increase(count); + if (count < numValues) { + strStats->setHasNull(true); + } + } + + class VarCharColumnWriter : public StringColumnWriter { + public: + VarCharColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + StringColumnWriter(type, factory, options), + maxLength(type.getMaximumLength()) { + // PASS + } + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + private: + uint64_t maxLength; + }; + + void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch); + if (charsBatch == nullptr) { + throw InvalidArgument("Failed to cast to StringVectorBatch"); + } + + StringColumnStatisticsImpl* strStats = + dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get()); + if (strStats == nullptr) { + throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + char* const* data = charsBatch->data.data() + offset; + int64_t* length = charsBatch->length.data() + offset; + const char* notNull = charsBatch->hasNulls ? 
+ charsBatch->notNull.data() + offset : nullptr; + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + uint64_t itemLength = Utf8Utils::truncateBytesTo( + maxLength, data[i], static_cast<uint64_t>(length[i])); + length[i] = static_cast<int64_t>(itemLength); + + if (useDictionary) { + size_t index = dictionary.insert(data[i], static_cast<size_t>(length[i])); + dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index)); + } else { + directDataStream->write(data[i], static_cast<size_t>(length[i])); + } + + if (enableBloomFilter) { + bloomFilter->addBytes(data[i], length[i]); + } + strStats->update(data[i], static_cast<size_t>(length[i])); + ++count; + } + } + + if (!useDictionary) { + directLengthEncoder->add(length, numValues, notNull); + } + + strStats->increase(count); + if (count < numValues) { + strStats->setHasNull(true); + } + } + + class BinaryColumnWriter : public StringColumnWriter { + public: + BinaryColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + StringColumnWriter(type, factory, options) { + // PASS + } + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + }; + + void BinaryColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + StringVectorBatch* binBatch = dynamic_cast<StringVectorBatch*>(&rowBatch); + if (binBatch == nullptr) { + throw InvalidArgument("Failed to cast to StringVectorBatch"); + } + + BinaryColumnStatisticsImpl* binStats = + dynamic_cast<BinaryColumnStatisticsImpl*>(colIndexStatistics.get()); + if (binStats == nullptr) { + throw InvalidArgument("Failed to cast to BinaryColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + char** data = binBatch->data.data() + offset; + int64_t* length = binBatch->length.data() + offset; + const char* notNull = binBatch->hasNulls ? 
+ binBatch->notNull.data() + offset : nullptr; + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + uint64_t unsignedLength = static_cast<uint64_t>(length[i]); + if (!notNull || notNull[i]) { + directDataStream->write(data[i], unsignedLength); + + binStats->update(unsignedLength); + ++count; + } + } + directLengthEncoder->add(length, numValues, notNull); + binStats->increase(count); + if (count < numValues) { + binStats->setHasNull(true); + } + } + + class TimestampColumnWriter : public ColumnWriter { + public: + TimestampColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void recordPosition() const override; + + protected: + std::unique_ptr<RleEncoder> secRleEncoder, nanoRleEncoder; + + private: + RleVersion rleVersion; + const Timezone& timezone; + }; + + TimestampColumnWriter::TimestampColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options), + rleVersion(options.getRleVersion()), + timezone(getTimezoneByName("GMT")){ + std::unique_ptr<BufferedOutputStream> dataStream = + factory.createStream(proto::Stream_Kind_DATA); + std::unique_ptr<BufferedOutputStream> secondaryStream = + factory.createStream(proto::Stream_Kind_SECONDARY); + secRleEncoder = createRleEncoder(std::move(dataStream), + true, + rleVersion, + memPool, + options.getAlignedBitpacking()); + nanoRleEncoder = createRleEncoder(std::move(secondaryStream), + false, + rleVersion, + memPool, + options.getAlignedBitpacking()); + + if (enableIndex) { + recordPosition(); + } + } + + // Because the number of nanoseconds often has a large number of trailing zeros, + // the number has trailing decimal zero digits removed and the last three bits + // are used to record how many zeros were removed if the trailing zeros are + // more than 2. Thus 1000 nanoseconds would be serialized as 0x0a and + // 100000 would be serialized as 0x0c. + static int64_t formatNano(int64_t nanos) { + if (nanos == 0) { + return 0; + } else if (nanos % 100 != 0) { + return (nanos) << 3; + } else { + nanos /= 100; + int64_t trailingZeros = 1; + while (nanos % 10 == 0 && trailingZeros < 7) { + nanos /= 10; + trailingZeros += 1; + } + return (nanos) << 3 | trailingZeros; + } + } + + void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + TimestampVectorBatch* tsBatch = + dynamic_cast<TimestampVectorBatch*>(&rowBatch); + if (tsBatch == nullptr) { + throw InvalidArgument("Failed to cast to TimestampVectorBatch"); + } + + TimestampColumnStatisticsImpl* tsStats = + dynamic_cast<TimestampColumnStatisticsImpl*>(colIndexStatistics.get()); + if (tsStats == nullptr) { + throw InvalidArgument("Failed to cast to TimestampColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const char* notNull = tsBatch->hasNulls ? 
+ tsBatch->notNull.data() + offset : nullptr; + int64_t *secs = tsBatch->data.data() + offset; + int64_t *nanos = tsBatch->nanoseconds.data() + offset; + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + // TimestampVectorBatch already stores data in UTC + int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000; + ++count; + if (enableBloomFilter) { + bloomFilter->addLong(millsUTC); + } + tsStats->update(millsUTC); + if (secs[i] < 0 && nanos[i] > 999999) { - secs[i] += 1; - } - - secs[i] -= timezone.getEpoch(); - nanos[i] = formatNano(nanos[i]); - } - } - tsStats->increase(count); - if (count < numValues) { - tsStats->setHasNull(true); - } - - secRleEncoder->add(secs, numValues, notNull); - nanoRleEncoder->add(nanos, numValues, notNull); - } - - void TimestampColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream dataStream; - dataStream.set_kind(proto::Stream_Kind_DATA); - dataStream.set_column(static_cast<uint32_t>(columnId)); - dataStream.set_length(secRleEncoder->flush()); - streams.push_back(dataStream); - - proto::Stream secondaryStream; - secondaryStream.set_kind(proto::Stream_Kind_SECONDARY); - secondaryStream.set_column(static_cast<uint32_t>(columnId)); - secondaryStream.set_length(nanoRleEncoder->flush()); - streams.push_back(secondaryStream); - } - - uint64_t TimestampColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - size += secRleEncoder->getBufferSize(); - size += nanoRleEncoder->getBufferSize(); - return size; - } - - void TimestampColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(RleVersionMapper(rleVersion)); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - } - - void TimestampColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - secRleEncoder->recordPosition(rowIndexPosition.get()); - nanoRleEncoder->recordPosition(rowIndexPosition.get()); - } - - class DateColumnWriter : public IntegerColumnWriter { - public: - DateColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - }; - - DateColumnWriter::DateColumnWriter( - const Type &type, - const StreamsFactory &factory, - const WriterOptions &options) : - IntegerColumnWriter(type, factory, options) { - // PASS - } - - void DateColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const LongVectorBatch* longBatch = - dynamic_cast<const LongVectorBatch*>(&rowBatch); - if (longBatch == nullptr) { - throw InvalidArgument("Failed to cast to LongVectorBatch"); - } - - DateColumnStatisticsImpl* dateStats = - dynamic_cast<DateColumnStatisticsImpl*>(colIndexStatistics.get()); - if (dateStats == nullptr) { - throw InvalidArgument("Failed to cast to DateColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - const int64_t* data = longBatch->data.data() + offset; - const char* notNull = longBatch->hasNulls ? 
- longBatch->notNull.data() + offset : nullptr; - - rleEncoder->add(data, numValues, notNull); - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - ++count; - dateStats->update(static_cast<int32_t>(data[i])); - if (enableBloomFilter) { - bloomFilter->addLong(data[i]); - } - } - } - dateStats->increase(count); - if (count < numValues) { - dateStats->setHasNull(true); - } - } - - class Decimal64ColumnWriter : public ColumnWriter { - public: - static const uint32_t MAX_PRECISION_64 = 18; - static const uint32_t MAX_PRECISION_128 = 38; - - Decimal64ColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void recordPosition() const override; - - protected: - RleVersion rleVersion; - uint64_t precision; - uint64_t scale; - std::unique_ptr<AppendOnlyBufferedStream> valueStream; - std::unique_ptr<RleEncoder> scaleEncoder; - - private: - char buffer[10]; - }; - - Decimal64ColumnWriter::Decimal64ColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()), - precision(type.getPrecision()), - scale(type.getScale()) { - valueStream.reset(new AppendOnlyBufferedStream( - factory.createStream(proto::Stream_Kind_DATA))); - std::unique_ptr<BufferedOutputStream> scaleStream = - factory.createStream(proto::Stream_Kind_SECONDARY); - scaleEncoder = createRleEncoder(std::move(scaleStream), - true, - rleVersion, - memPool, - options.getAlignedBitpacking()); - - if (enableIndex) { - recordPosition(); - } - } - - void Decimal64ColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const Decimal64VectorBatch* decBatch = - dynamic_cast<const Decimal64VectorBatch*>(&rowBatch); - if (decBatch == nullptr) { - throw InvalidArgument("Failed to cast to Decimal64VectorBatch"); - } - - DecimalColumnStatisticsImpl* decStats = - dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get()); - if (decStats == nullptr) { - throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - const char* notNull = decBatch->hasNulls ? 
- decBatch->notNull.data() + offset : nullptr; - const int64_t* values = decBatch->values.data() + offset; - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - int64_t val = zigZag(values[i]); - char* data = buffer; - while (true) { - if ((val & ~0x7f) == 0) { - *(data++) = (static_cast<char>(val)); - break; - } else { - *(data++) = static_cast<char>(0x80 | (val & 0x7f)); - // cast val to unsigned so as to force 0-fill right shift - val = (static_cast<uint64_t>(val) >> 7); - } - } - valueStream->write(buffer, static_cast<size_t>(data - buffer)); - ++count; - if (enableBloomFilter) { - std::string decimal = Decimal( - values[i], static_cast<int32_t>(scale)).toString(); - bloomFilter->addBytes( - decimal.c_str(), static_cast<int64_t>(decimal.size())); - } - decStats->update(Decimal(values[i], static_cast<int32_t>(scale))); - } - } - decStats->increase(count); - if (count < numValues) { - decStats->setHasNull(true); - } - std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale)); - scaleEncoder->add(scales.data(), numValues, notNull); - } - - void Decimal64ColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream dataStream; - dataStream.set_kind(proto::Stream_Kind_DATA); - dataStream.set_column(static_cast<uint32_t>(columnId)); - dataStream.set_length(valueStream->flush()); - streams.push_back(dataStream); - - proto::Stream secondaryStream; - secondaryStream.set_kind(proto::Stream_Kind_SECONDARY); - secondaryStream.set_column(static_cast<uint32_t>(columnId)); - secondaryStream.set_length(scaleEncoder->flush()); - streams.push_back(secondaryStream); - } - - uint64_t Decimal64ColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - size += valueStream->getSize(); - size += scaleEncoder->getBufferSize(); - return size; - } - - void Decimal64ColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(RleVersionMapper(rleVersion)); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - } - - void Decimal64ColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - valueStream->recordPosition(rowIndexPosition.get()); - scaleEncoder->recordPosition(rowIndexPosition.get()); - } - - class Decimal128ColumnWriter : public Decimal64ColumnWriter { - public: - Decimal128ColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - private: - char buffer[20]; - }; - - Decimal128ColumnWriter::Decimal128ColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - Decimal64ColumnWriter(type, factory, options) { - // PASS - } - - // Zigzag encoding moves the sign bit to the least significant bit using the - // expression (val « 1) ^ (val » 63) and derives its name from the fact that - // positive and negative numbers alternate once encoded. 
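The zigzag plus base-128 varint scheme described in the comment above mirrors the loop in Decimal64ColumnWriter::add. A minimal standalone sketch of the 64-bit case follows; zigZag64, writeVarint and main are names introduced here for illustration and are not part of the ORC sources.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Zigzag encoding: (n << 1) ^ (n >> 63) moves the sign bit into bit 0,
    // so 0, -1, 1, -2, ... map to 0, 1, 2, 3, ...
    static uint64_t zigZag64(int64_t n) {
      return (static_cast<uint64_t>(n) << 1) ^ static_cast<uint64_t>(n >> 63);
    }

    // Base-128 varint: emit 7 bits per byte, least-significant group first,
    // with the high bit set on every byte except the last.
    static std::size_t writeVarint(uint64_t v, unsigned char* out) {
      std::size_t n = 0;
      while (v >= 0x80) {
        out[n++] = static_cast<unsigned char>(0x80 | (v & 0x7f));
        v >>= 7;
      }
      out[n++] = static_cast<unsigned char>(v);
      return n;
    }

    int main() {
      std::printf("%llu %llu %llu\n",
                  static_cast<unsigned long long>(zigZag64(-1)),
                  static_cast<unsigned long long>(zigZag64(1)),
                  static_cast<unsigned long long>(zigZag64(-2)));  // 1 2 3
      unsigned char buf[10];
      std::size_t n = writeVarint(zigZag64(-300), buf);  // -300 -> 599 -> d7 04
      for (std::size_t i = 0; i < n; ++i) {
        std::printf("%02x ", buf[i]);
      }
      std::printf("\n");
      return 0;
    }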
- Int128 zigZagInt128(const Int128& value) { - bool isNegative = value < 0; - Int128 val = value.abs(); - val <<= 1; - if (isNegative) { - val -= 1; - } - return val; - } - - void Decimal128ColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const Decimal128VectorBatch* decBatch = - dynamic_cast<const Decimal128VectorBatch*>(&rowBatch); - if (decBatch == nullptr) { - throw InvalidArgument("Failed to cast to Decimal128VectorBatch"); - } - - DecimalColumnStatisticsImpl* decStats = - dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get()); - if (decStats == nullptr) { - throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - const char* notNull = decBatch->hasNulls ? - decBatch->notNull.data() + offset : nullptr; - const Int128* values = decBatch->values.data() + offset; - - // The current encoding of decimal columns stores the integer representation - // of the value as an unbounded length zigzag encoded base 128 varint. - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - Int128 val = zigZagInt128(values[i]); - char* data = buffer; - while (true) { - if ((val & ~0x7f) == 0) { - *(data++) = (static_cast<char>(val.getLowBits())); - break; - } else { - *(data++) = static_cast<char>(0x80 | (val.getLowBits() & 0x7f)); - val >>= 7; - } - } - valueStream->write(buffer, static_cast<size_t>(data - buffer)); - - ++count; - if (enableBloomFilter) { - std::string decimal = Decimal( - values[i], static_cast<int32_t>(scale)).toString(); - bloomFilter->addBytes( - decimal.c_str(), static_cast<int64_t>(decimal.size())); - } - decStats->update(Decimal(values[i], static_cast<int32_t>(scale))); - } - } - decStats->increase(count); - if (count < numValues) { - decStats->setHasNull(true); - } - std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale)); - scaleEncoder->add(scales.data(), numValues, notNull); - } - - class ListColumnWriter : public ColumnWriter { - public: - ListColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - ~ListColumnWriter() override; - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void mergeStripeStatsIntoFileStats() override; - - virtual void mergeRowGroupStatsIntoStripeStats() override; - - virtual void createRowIndexEntry() override; - - virtual void writeIndex( - std::vector<proto::Stream> &streams) const override; - - virtual void recordPosition() const override; - - virtual void writeDictionary() override; - - virtual void reset() override; - - private: - std::unique_ptr<RleEncoder> lengthEncoder; - RleVersion rleVersion; - std::unique_ptr<ColumnWriter> child; - }; - - ListColumnWriter::ListColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()){ - - 
std::unique_ptr<BufferedOutputStream> lengthStream = - factory.createStream(proto::Stream_Kind_LENGTH); - lengthEncoder = createRleEncoder(std::move(lengthStream), - false, - rleVersion, - memPool, - options.getAlignedBitpacking()); - - if (type.getSubtypeCount() == 1) { - child = buildWriter(*type.getSubtype(0), factory, options); - } - - if (enableIndex) { - recordPosition(); - } - } - - ListColumnWriter::~ListColumnWriter() { - // PASS - } - - void ListColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - ListVectorBatch* listBatch = dynamic_cast<ListVectorBatch*>(&rowBatch); - if (listBatch == nullptr) { - throw InvalidArgument("Failed to cast to ListVectorBatch"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - int64_t* offsets = listBatch->offsets.data() + offset; - const char* notNull = listBatch->hasNulls ? - listBatch->notNull.data() + offset : nullptr; - - uint64_t elemOffset = static_cast<uint64_t>(offsets[0]); - uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]); - - // translate offsets to lengths - for (uint64_t i = 0; i != numValues; ++i) { - offsets[i] = offsets[i + 1] - offsets[i]; - } - - // unnecessary to deal with null as elements are packed together - if (child.get()) { - child->add(*listBatch->elements, elemOffset, totalNumValues, nullptr); - } - lengthEncoder->add(offsets, numValues, notNull); - - if (enableIndex) { - if (!notNull) { - colIndexStatistics->increase(numValues); - } else { - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull[i]) { - ++count; - if (enableBloomFilter) { - bloomFilter->addLong(offsets[i]); - } - } - } - colIndexStatistics->increase(count); - if (count < numValues) { - colIndexStatistics->setHasNull(true); - } - } - } - } - - void ListColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_LENGTH); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(lengthEncoder->flush()); - streams.push_back(stream); - - if (child.get()) { - child->flush(streams); - } - } - - void ListColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const { - ColumnWriter::writeIndex(streams); - if (child.get()) { - child->writeIndex(streams); - } - } - - uint64_t ListColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - if (child.get()) { - size += lengthEncoder->getBufferSize(); - size += child->getEstimatedSize(); - } - return size; - } - - void ListColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(RleVersionMapper(rleVersion)); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - if (child.get()) { - child->getColumnEncoding(encodings); - } - } - - void ListColumnWriter::getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - ColumnWriter::getStripeStatistics(stats); - if (child.get()) { - child->getStripeStatistics(stats); - } - } - - void ListColumnWriter::mergeStripeStatsIntoFileStats() { - ColumnWriter::mergeStripeStatsIntoFileStats(); - if (child.get()) { - child->mergeStripeStatsIntoFileStats(); - } - } - - void ListColumnWriter::getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - 
ColumnWriter::getFileStatistics(stats); - if (child.get()) { - child->getFileStatistics(stats); - } - } - - void ListColumnWriter::mergeRowGroupStatsIntoStripeStats() { - ColumnWriter::mergeRowGroupStatsIntoStripeStats(); - if (child.get()) { - child->mergeRowGroupStatsIntoStripeStats(); - } - } - - void ListColumnWriter::createRowIndexEntry() { - ColumnWriter::createRowIndexEntry(); - if (child.get()) { - child->createRowIndexEntry(); - } - } - - void ListColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - lengthEncoder->recordPosition(rowIndexPosition.get()); - } - - void ListColumnWriter::reset() { - ColumnWriter::reset(); - if (child) { - child->reset(); - } - } - - void ListColumnWriter::writeDictionary() { - if (child) { - child->writeDictionary(); - } - } - - class MapColumnWriter : public ColumnWriter { - public: - MapColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - ~MapColumnWriter() override; - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void mergeStripeStatsIntoFileStats() override; - - virtual void mergeRowGroupStatsIntoStripeStats() override; - - virtual void createRowIndexEntry() override; - - virtual void writeIndex( - std::vector<proto::Stream> &streams) const override; - - virtual void recordPosition() const override; - - virtual void writeDictionary() override; - - virtual void reset() override; - - private: - std::unique_ptr<ColumnWriter> keyWriter; - std::unique_ptr<ColumnWriter> elemWriter; - std::unique_ptr<RleEncoder> lengthEncoder; - RleVersion rleVersion; - }; - - MapColumnWriter::MapColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()){ - std::unique_ptr<BufferedOutputStream> lengthStream = - factory.createStream(proto::Stream_Kind_LENGTH); - lengthEncoder = createRleEncoder(std::move(lengthStream), - false, - rleVersion, - memPool, - options.getAlignedBitpacking()); - - if (type.getSubtypeCount() > 0) { - keyWriter = buildWriter(*type.getSubtype(0), factory, options); - } - - if (type.getSubtypeCount() > 1) { - elemWriter = buildWriter(*type.getSubtype(1), factory, options); - } - - if (enableIndex) { - recordPosition(); - } - } - - MapColumnWriter::~MapColumnWriter() { - // PASS - } - - void MapColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - MapVectorBatch* mapBatch = dynamic_cast<MapVectorBatch*>(&rowBatch); - if (mapBatch == nullptr) { - throw InvalidArgument("Failed to cast to MapVectorBatch"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - int64_t* offsets = mapBatch->offsets.data() + offset; - const char* notNull = mapBatch->hasNulls ? 
- mapBatch->notNull.data() + offset : nullptr; - - uint64_t elemOffset = static_cast<uint64_t>(offsets[0]); - uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]); - - // translate offsets to lengths - for (uint64_t i = 0; i != numValues; ++i) { - offsets[i] = offsets[i + 1] - offsets[i]; - } - - lengthEncoder->add(offsets, numValues, notNull); - - // unnecessary to deal with null as keys and values are packed together - if (keyWriter.get()) { - keyWriter->add(*mapBatch->keys, elemOffset, totalNumValues, nullptr); - } - if (elemWriter.get()) { - elemWriter->add(*mapBatch->elements, elemOffset, totalNumValues, nullptr); - } - - if (enableIndex) { - if (!notNull) { - colIndexStatistics->increase(numValues); - } else { - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull[i]) { - ++count; - if (enableBloomFilter) { - bloomFilter->addLong(offsets[i]); - } - } - } - colIndexStatistics->increase(count); - if (count < numValues) { - colIndexStatistics->setHasNull(true); - } - } - } - } - - void MapColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_LENGTH); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(lengthEncoder->flush()); - streams.push_back(stream); - - if (keyWriter.get()) { - keyWriter->flush(streams); - } - if (elemWriter.get()) { - elemWriter->flush(streams); - } - } - - void MapColumnWriter::writeIndex( - std::vector<proto::Stream> &streams) const { - ColumnWriter::writeIndex(streams); - if (keyWriter.get()) { - keyWriter->writeIndex(streams); - } - if (elemWriter.get()) { - elemWriter->writeIndex(streams); - } - } - - uint64_t MapColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - size += lengthEncoder->getBufferSize(); - if (keyWriter.get()) { - size += keyWriter->getEstimatedSize(); - } - if (elemWriter.get()) { - size += elemWriter->getEstimatedSize(); - } - return size; - } - - void MapColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(RleVersionMapper(rleVersion)); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - if (keyWriter.get()) { - keyWriter->getColumnEncoding(encodings); - } - if (elemWriter.get()) { - elemWriter->getColumnEncoding(encodings); - } - } - - void MapColumnWriter::getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - ColumnWriter::getStripeStatistics(stats); - if (keyWriter.get()) { - keyWriter->getStripeStatistics(stats); - } - if (elemWriter.get()) { - elemWriter->getStripeStatistics(stats); - } - } - - void MapColumnWriter::mergeStripeStatsIntoFileStats() { - ColumnWriter::mergeStripeStatsIntoFileStats(); - if (keyWriter.get()) { - keyWriter->mergeStripeStatsIntoFileStats(); - } - if (elemWriter.get()) { - elemWriter->mergeStripeStatsIntoFileStats(); - } - } - - void MapColumnWriter::getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - ColumnWriter::getFileStatistics(stats); - if (keyWriter.get()) { - keyWriter->getFileStatistics(stats); - } - if (elemWriter.get()) { - elemWriter->getFileStatistics(stats); - } - } - - void MapColumnWriter::mergeRowGroupStatsIntoStripeStats() { - ColumnWriter::mergeRowGroupStatsIntoStripeStats(); - if (keyWriter.get()) { - 
keyWriter->mergeRowGroupStatsIntoStripeStats(); - } - if (elemWriter.get()) { - elemWriter->mergeRowGroupStatsIntoStripeStats(); - } - } - - void MapColumnWriter::createRowIndexEntry() { - ColumnWriter::createRowIndexEntry(); - if (keyWriter.get()) { - keyWriter->createRowIndexEntry(); - } - if (elemWriter.get()) { - elemWriter->createRowIndexEntry(); - } - } - - void MapColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - lengthEncoder->recordPosition(rowIndexPosition.get()); - } - - void MapColumnWriter::reset() { - ColumnWriter::reset(); - if (keyWriter) { - keyWriter->reset(); - } - if (elemWriter) { - elemWriter->reset(); - } - } - - void MapColumnWriter::writeDictionary() { - if (keyWriter) { - keyWriter->writeDictionary(); - } - if (elemWriter) { - elemWriter->writeDictionary(); - } - } - - class UnionColumnWriter : public ColumnWriter { - public: - UnionColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - ~UnionColumnWriter() override; - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void mergeStripeStatsIntoFileStats() override; - - virtual void mergeRowGroupStatsIntoStripeStats() override; - - virtual void createRowIndexEntry() override; - - virtual void writeIndex( - std::vector<proto::Stream> &streams) const override; - - virtual void recordPosition() const override; - - virtual void writeDictionary() override; - - virtual void reset() override; - - private: - std::unique_ptr<ByteRleEncoder> rleEncoder; - std::vector<ColumnWriter*> children; - }; - - UnionColumnWriter::UnionColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options) { - - std::unique_ptr<BufferedOutputStream> dataStream = - factory.createStream(proto::Stream_Kind_DATA); - rleEncoder = createByteRleEncoder(std::move(dataStream)); - - for (uint64_t i = 0; i != type.getSubtypeCount(); ++i) { - children.push_back(buildWriter(*type.getSubtype(i), - factory, - options).release()); - } - - if (enableIndex) { - recordPosition(); - } - } - - UnionColumnWriter::~UnionColumnWriter() { - for (uint32_t i = 0; i < children.size(); ++i) { - delete children[i]; - } - } - - void UnionColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - UnionVectorBatch* unionBatch = dynamic_cast<UnionVectorBatch*>(&rowBatch); - if (unionBatch == nullptr) { - throw InvalidArgument("Failed to cast to UnionVectorBatch"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - const char* notNull = unionBatch->hasNulls ? 
- unionBatch->notNull.data() + offset : nullptr; - unsigned char * tags = unionBatch->tags.data() + offset; - uint64_t * offsets = unionBatch->offsets.data() + offset; - - std::vector<int64_t> childOffset(children.size(), -1); - std::vector<uint64_t> childLength(children.size(), 0); - - for (uint64_t i = 0; i != numValues; ++i) { - if (childOffset[tags[i]] == -1) { - childOffset[tags[i]] = static_cast<int64_t>(offsets[i]); - } - ++childLength[tags[i]]; - } - - rleEncoder->add(reinterpret_cast<char*>(tags), numValues, notNull); - - for (uint32_t i = 0; i < children.size(); ++i) { - if (childLength[i] > 0) { - children[i]->add(*unionBatch->children[i], - static_cast<uint64_t>(childOffset[i]), - childLength[i], nullptr); - } - } - - // update stats - if (enableIndex) { - if (!notNull) { - colIndexStatistics->increase(numValues); - } else { - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull[i]) { - ++count; - if (enableBloomFilter) { - bloomFilter->addLong(tags[i]); - } - } - } - colIndexStatistics->increase(count); - if (count < numValues) { - colIndexStatistics->setHasNull(true); - } - } - } - } - - void UnionColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_DATA); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(rleEncoder->flush()); - streams.push_back(stream); - - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->flush(streams); - } - } - - void UnionColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const { - ColumnWriter::writeIndex(streams); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->writeIndex(streams); - } - } - - uint64_t UnionColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - size += rleEncoder->getBufferSize(); - for (uint32_t i = 0; i < children.size(); ++i) { - size += children[i]->getEstimatedSize(); - } - return size; - } - - void UnionColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getColumnEncoding(encodings); - } - } - - void UnionColumnWriter::getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - ColumnWriter::getStripeStatistics(stats); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getStripeStatistics(stats); - } - } - - void UnionColumnWriter::mergeStripeStatsIntoFileStats() { - ColumnWriter::mergeStripeStatsIntoFileStats(); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->mergeStripeStatsIntoFileStats(); - } - } - - void UnionColumnWriter::getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - ColumnWriter::getFileStatistics(stats); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getFileStatistics(stats); - } - } - - void UnionColumnWriter::mergeRowGroupStatsIntoStripeStats() { - ColumnWriter::mergeRowGroupStatsIntoStripeStats(); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->mergeRowGroupStatsIntoStripeStats(); - } - } - - void UnionColumnWriter::createRowIndexEntry() { - ColumnWriter::createRowIndexEntry(); - for (uint32_t i = 0; i < children.size(); ++i) { - 
children[i]->createRowIndexEntry(); - } - } - - void UnionColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - rleEncoder->recordPosition(rowIndexPosition.get()); - } - - void UnionColumnWriter::reset() { - ColumnWriter::reset(); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->reset(); - } - } - - void UnionColumnWriter::writeDictionary() { - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->writeDictionary(); - } - } - - std::unique_ptr<ColumnWriter> buildWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) { - switch (static_cast<int64_t>(type.getKind())) { - case STRUCT: - return std::unique_ptr<ColumnWriter>( - new StructColumnWriter( - type, - factory, - options)); - case INT: - case LONG: - case SHORT: - return std::unique_ptr<ColumnWriter>( - new IntegerColumnWriter( - type, - factory, - options)); - case BYTE: - return std::unique_ptr<ColumnWriter>( - new ByteColumnWriter( - type, - factory, - options)); - case BOOLEAN: - return std::unique_ptr<ColumnWriter>( - new BooleanColumnWriter( - type, - factory, - options)); - case DOUBLE: - return std::unique_ptr<ColumnWriter>( - new DoubleColumnWriter( - type, - factory, - options, - false)); - case FLOAT: - return std::unique_ptr<ColumnWriter>( - new DoubleColumnWriter( - type, - factory, - options, - true)); - case BINARY: - return std::unique_ptr<ColumnWriter>( - new BinaryColumnWriter( - type, - factory, - options)); - case STRING: - return std::unique_ptr<ColumnWriter>( - new StringColumnWriter( - type, - factory, - options)); - case CHAR: - return std::unique_ptr<ColumnWriter>( - new CharColumnWriter( - type, - factory, - options)); - case VARCHAR: - return std::unique_ptr<ColumnWriter>( - new VarCharColumnWriter( - type, - factory, - options)); - case DATE: - return std::unique_ptr<ColumnWriter>( - new DateColumnWriter( - type, - factory, - options)); - case TIMESTAMP: - return std::unique_ptr<ColumnWriter>( - new TimestampColumnWriter( - type, - factory, - options)); - case DECIMAL: - if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_64) { - return std::unique_ptr<ColumnWriter>( - new Decimal64ColumnWriter( - type, - factory, - options)); - } else if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_128) { - return std::unique_ptr<ColumnWriter>( - new Decimal128ColumnWriter( - type, - factory, - options)); - } else { - throw NotImplementedYet("Decimal precision more than 38 is not " - "supported"); - } - case LIST: - return std::unique_ptr<ColumnWriter>( - new ListColumnWriter( - type, - factory, - options)); - case MAP: - return std::unique_ptr<ColumnWriter>( - new MapColumnWriter( - type, - factory, - options)); - case UNION: - return std::unique_ptr<ColumnWriter>( - new UnionColumnWriter( - type, - factory, - options)); - default: - throw NotImplementedYet("Type is not supported yet for creating " - "ColumnWriter."); - } - } -} + secs[i] += 1; + } + + secs[i] -= timezone.getEpoch(); + nanos[i] = formatNano(nanos[i]); + } + } + tsStats->increase(count); + if (count < numValues) { + tsStats->setHasNull(true); + } + + secRleEncoder->add(secs, numValues, notNull); + nanoRleEncoder->add(nanos, numValues, notNull); + } + + void TimestampColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream dataStream; + dataStream.set_kind(proto::Stream_Kind_DATA); + dataStream.set_column(static_cast<uint32_t>(columnId)); + dataStream.set_length(secRleEncoder->flush()); + 
streams.push_back(dataStream); + + proto::Stream secondaryStream; + secondaryStream.set_kind(proto::Stream_Kind_SECONDARY); + secondaryStream.set_column(static_cast<uint32_t>(columnId)); + secondaryStream.set_length(nanoRleEncoder->flush()); + streams.push_back(secondaryStream); + } + + uint64_t TimestampColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += secRleEncoder->getBufferSize(); + size += nanoRleEncoder->getBufferSize(); + return size; + } + + void TimestampColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(RleVersionMapper(rleVersion)); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + } + + void TimestampColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + secRleEncoder->recordPosition(rowIndexPosition.get()); + nanoRleEncoder->recordPosition(rowIndexPosition.get()); + } + + class DateColumnWriter : public IntegerColumnWriter { + public: + DateColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + }; + + DateColumnWriter::DateColumnWriter( + const Type &type, + const StreamsFactory &factory, + const WriterOptions &options) : + IntegerColumnWriter(type, factory, options) { + // PASS + } + + void DateColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + const LongVectorBatch* longBatch = + dynamic_cast<const LongVectorBatch*>(&rowBatch); + if (longBatch == nullptr) { + throw InvalidArgument("Failed to cast to LongVectorBatch"); + } + + DateColumnStatisticsImpl* dateStats = + dynamic_cast<DateColumnStatisticsImpl*>(colIndexStatistics.get()); + if (dateStats == nullptr) { + throw InvalidArgument("Failed to cast to DateColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const int64_t* data = longBatch->data.data() + offset; + const char* notNull = longBatch->hasNulls ? 
+ longBatch->notNull.data() + offset : nullptr; + + rleEncoder->add(data, numValues, notNull); + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + ++count; + dateStats->update(static_cast<int32_t>(data[i])); + if (enableBloomFilter) { + bloomFilter->addLong(data[i]); + } + } + } + dateStats->increase(count); + if (count < numValues) { + dateStats->setHasNull(true); + } + } + + class Decimal64ColumnWriter : public ColumnWriter { + public: + static const uint32_t MAX_PRECISION_64 = 18; + static const uint32_t MAX_PRECISION_128 = 38; + + Decimal64ColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void recordPosition() const override; + + protected: + RleVersion rleVersion; + uint64_t precision; + uint64_t scale; + std::unique_ptr<AppendOnlyBufferedStream> valueStream; + std::unique_ptr<RleEncoder> scaleEncoder; + + private: + char buffer[10]; + }; + + Decimal64ColumnWriter::Decimal64ColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options), + rleVersion(options.getRleVersion()), + precision(type.getPrecision()), + scale(type.getScale()) { + valueStream.reset(new AppendOnlyBufferedStream( + factory.createStream(proto::Stream_Kind_DATA))); + std::unique_ptr<BufferedOutputStream> scaleStream = + factory.createStream(proto::Stream_Kind_SECONDARY); + scaleEncoder = createRleEncoder(std::move(scaleStream), + true, + rleVersion, + memPool, + options.getAlignedBitpacking()); + + if (enableIndex) { + recordPosition(); + } + } + + void Decimal64ColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + const Decimal64VectorBatch* decBatch = + dynamic_cast<const Decimal64VectorBatch*>(&rowBatch); + if (decBatch == nullptr) { + throw InvalidArgument("Failed to cast to Decimal64VectorBatch"); + } + + DecimalColumnStatisticsImpl* decStats = + dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get()); + if (decStats == nullptr) { + throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const char* notNull = decBatch->hasNulls ? 
+ decBatch->notNull.data() + offset : nullptr; + const int64_t* values = decBatch->values.data() + offset; + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + int64_t val = zigZag(values[i]); + char* data = buffer; + while (true) { + if ((val & ~0x7f) == 0) { + *(data++) = (static_cast<char>(val)); + break; + } else { + *(data++) = static_cast<char>(0x80 | (val & 0x7f)); + // cast val to unsigned so as to force 0-fill right shift + val = (static_cast<uint64_t>(val) >> 7); + } + } + valueStream->write(buffer, static_cast<size_t>(data - buffer)); + ++count; + if (enableBloomFilter) { + std::string decimal = Decimal( + values[i], static_cast<int32_t>(scale)).toString(); + bloomFilter->addBytes( + decimal.c_str(), static_cast<int64_t>(decimal.size())); + } + decStats->update(Decimal(values[i], static_cast<int32_t>(scale))); + } + } + decStats->increase(count); + if (count < numValues) { + decStats->setHasNull(true); + } + std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale)); + scaleEncoder->add(scales.data(), numValues, notNull); + } + + void Decimal64ColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream dataStream; + dataStream.set_kind(proto::Stream_Kind_DATA); + dataStream.set_column(static_cast<uint32_t>(columnId)); + dataStream.set_length(valueStream->flush()); + streams.push_back(dataStream); + + proto::Stream secondaryStream; + secondaryStream.set_kind(proto::Stream_Kind_SECONDARY); + secondaryStream.set_column(static_cast<uint32_t>(columnId)); + secondaryStream.set_length(scaleEncoder->flush()); + streams.push_back(secondaryStream); + } + + uint64_t Decimal64ColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += valueStream->getSize(); + size += scaleEncoder->getBufferSize(); + return size; + } + + void Decimal64ColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(RleVersionMapper(rleVersion)); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + } + + void Decimal64ColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + valueStream->recordPosition(rowIndexPosition.get()); + scaleEncoder->recordPosition(rowIndexPosition.get()); + } + + class Decimal128ColumnWriter : public Decimal64ColumnWriter { + public: + Decimal128ColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + private: + char buffer[20]; + }; + + Decimal128ColumnWriter::Decimal128ColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + Decimal64ColumnWriter(type, factory, options) { + // PASS + } + + // Zigzag encoding moves the sign bit to the least significant bit using the + // expression (val « 1) ^ (val » 63) and derives its name from the fact that + // positive and negative numbers alternate once encoded. 
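Editor's note: the comment above describes the zigzag transform that Decimal64ColumnWriter::add and Decimal128ColumnWriter::add apply before writing each value as an unbounded base-128 varint to the DATA stream. The following standalone sketch is an editorial illustration only (it is not part of this commit or of the ORC sources); the helper names zigZagEncode and writeVarint are invented for the example, and it assumes ordinary two's-complement int64_t arithmetic.

    // Minimal sketch of zigzag + base-128 varint encoding, mirroring the
    // inner while(true) loop of the decimal writers above.
    #include <cstdint>
    #include <cstdio>
    #include <string>

    // Zigzag: move the sign bit to the least significant bit so that small
    // magnitudes stay small: 0->0, -1->1, 1->2, -2->3, 2->4, ...
    static uint64_t zigZagEncode(int64_t value) {
      return (static_cast<uint64_t>(value) << 1) ^
             static_cast<uint64_t>(value >> 63);
    }

    // Base-128 varint: emit 7 payload bits per byte, setting the high bit on
    // every byte except the last.
    static std::string writeVarint(uint64_t val) {
      std::string out;
      while (true) {
        if ((val & ~UINT64_C(0x7f)) == 0) {
          out.push_back(static_cast<char>(val));
          break;
        }
        out.push_back(static_cast<char>(0x80 | (val & 0x7f)));
        val >>= 7;
      }
      return out;
    }

    int main() {
      const int64_t samples[] = {0, -1, 1, -2, 300, -12345};
      for (int64_t v : samples) {
        const uint64_t zz = zigZagEncode(v);
        const std::string bytes = writeVarint(zz);
        std::printf("%8lld -> zigzag %llu -> %zu byte(s)\n",
                    static_cast<long long>(v),
                    static_cast<unsigned long long>(zz),
                    bytes.size());
      }
      return 0;
    }

In the writers above the same byte sequence is appended to valueStream, while a constant per-row scale is written to the SECONDARY stream by scaleEncoder.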
+ Int128 zigZagInt128(const Int128& value) { + bool isNegative = value < 0; + Int128 val = value.abs(); + val <<= 1; + if (isNegative) { + val -= 1; + } + return val; + } + + void Decimal128ColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + const Decimal128VectorBatch* decBatch = + dynamic_cast<const Decimal128VectorBatch*>(&rowBatch); + if (decBatch == nullptr) { + throw InvalidArgument("Failed to cast to Decimal128VectorBatch"); + } + + DecimalColumnStatisticsImpl* decStats = + dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get()); + if (decStats == nullptr) { + throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const char* notNull = decBatch->hasNulls ? + decBatch->notNull.data() + offset : nullptr; + const Int128* values = decBatch->values.data() + offset; + + // The current encoding of decimal columns stores the integer representation + // of the value as an unbounded length zigzag encoded base 128 varint. + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + Int128 val = zigZagInt128(values[i]); + char* data = buffer; + while (true) { + if ((val & ~0x7f) == 0) { + *(data++) = (static_cast<char>(val.getLowBits())); + break; + } else { + *(data++) = static_cast<char>(0x80 | (val.getLowBits() & 0x7f)); + val >>= 7; + } + } + valueStream->write(buffer, static_cast<size_t>(data - buffer)); + + ++count; + if (enableBloomFilter) { + std::string decimal = Decimal( + values[i], static_cast<int32_t>(scale)).toString(); + bloomFilter->addBytes( + decimal.c_str(), static_cast<int64_t>(decimal.size())); + } + decStats->update(Decimal(values[i], static_cast<int32_t>(scale))); + } + } + decStats->increase(count); + if (count < numValues) { + decStats->setHasNull(true); + } + std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale)); + scaleEncoder->add(scales.data(), numValues, notNull); + } + + class ListColumnWriter : public ColumnWriter { + public: + ListColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + ~ListColumnWriter() override; + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void mergeStripeStatsIntoFileStats() override; + + virtual void mergeRowGroupStatsIntoStripeStats() override; + + virtual void createRowIndexEntry() override; + + virtual void writeIndex( + std::vector<proto::Stream> &streams) const override; + + virtual void recordPosition() const override; + + virtual void writeDictionary() override; + + virtual void reset() override; + + private: + std::unique_ptr<RleEncoder> lengthEncoder; + RleVersion rleVersion; + std::unique_ptr<ColumnWriter> child; + }; + + ListColumnWriter::ListColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options), + rleVersion(options.getRleVersion()){ + + 
std::unique_ptr<BufferedOutputStream> lengthStream = + factory.createStream(proto::Stream_Kind_LENGTH); + lengthEncoder = createRleEncoder(std::move(lengthStream), + false, + rleVersion, + memPool, + options.getAlignedBitpacking()); + + if (type.getSubtypeCount() == 1) { + child = buildWriter(*type.getSubtype(0), factory, options); + } + + if (enableIndex) { + recordPosition(); + } + } + + ListColumnWriter::~ListColumnWriter() { + // PASS + } + + void ListColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + ListVectorBatch* listBatch = dynamic_cast<ListVectorBatch*>(&rowBatch); + if (listBatch == nullptr) { + throw InvalidArgument("Failed to cast to ListVectorBatch"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + int64_t* offsets = listBatch->offsets.data() + offset; + const char* notNull = listBatch->hasNulls ? + listBatch->notNull.data() + offset : nullptr; + + uint64_t elemOffset = static_cast<uint64_t>(offsets[0]); + uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]); + + // translate offsets to lengths + for (uint64_t i = 0; i != numValues; ++i) { + offsets[i] = offsets[i + 1] - offsets[i]; + } + + // unnecessary to deal with null as elements are packed together + if (child.get()) { + child->add(*listBatch->elements, elemOffset, totalNumValues, nullptr); + } + lengthEncoder->add(offsets, numValues, notNull); + + if (enableIndex) { + if (!notNull) { + colIndexStatistics->increase(numValues); + } else { + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull[i]) { + ++count; + if (enableBloomFilter) { + bloomFilter->addLong(offsets[i]); + } + } + } + colIndexStatistics->increase(count); + if (count < numValues) { + colIndexStatistics->setHasNull(true); + } + } + } + } + + void ListColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_LENGTH); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(lengthEncoder->flush()); + streams.push_back(stream); + + if (child.get()) { + child->flush(streams); + } + } + + void ListColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const { + ColumnWriter::writeIndex(streams); + if (child.get()) { + child->writeIndex(streams); + } + } + + uint64_t ListColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + if (child.get()) { + size += lengthEncoder->getBufferSize(); + size += child->getEstimatedSize(); + } + return size; + } + + void ListColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(RleVersionMapper(rleVersion)); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + if (child.get()) { + child->getColumnEncoding(encodings); + } + } + + void ListColumnWriter::getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + ColumnWriter::getStripeStatistics(stats); + if (child.get()) { + child->getStripeStatistics(stats); + } + } + + void ListColumnWriter::mergeStripeStatsIntoFileStats() { + ColumnWriter::mergeStripeStatsIntoFileStats(); + if (child.get()) { + child->mergeStripeStatsIntoFileStats(); + } + } + + void ListColumnWriter::getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + 
ColumnWriter::getFileStatistics(stats); + if (child.get()) { + child->getFileStatistics(stats); + } + } + + void ListColumnWriter::mergeRowGroupStatsIntoStripeStats() { + ColumnWriter::mergeRowGroupStatsIntoStripeStats(); + if (child.get()) { + child->mergeRowGroupStatsIntoStripeStats(); + } + } + + void ListColumnWriter::createRowIndexEntry() { + ColumnWriter::createRowIndexEntry(); + if (child.get()) { + child->createRowIndexEntry(); + } + } + + void ListColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + lengthEncoder->recordPosition(rowIndexPosition.get()); + } + + void ListColumnWriter::reset() { + ColumnWriter::reset(); + if (child) { + child->reset(); + } + } + + void ListColumnWriter::writeDictionary() { + if (child) { + child->writeDictionary(); + } + } + + class MapColumnWriter : public ColumnWriter { + public: + MapColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + ~MapColumnWriter() override; + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void mergeStripeStatsIntoFileStats() override; + + virtual void mergeRowGroupStatsIntoStripeStats() override; + + virtual void createRowIndexEntry() override; + + virtual void writeIndex( + std::vector<proto::Stream> &streams) const override; + + virtual void recordPosition() const override; + + virtual void writeDictionary() override; + + virtual void reset() override; + + private: + std::unique_ptr<ColumnWriter> keyWriter; + std::unique_ptr<ColumnWriter> elemWriter; + std::unique_ptr<RleEncoder> lengthEncoder; + RleVersion rleVersion; + }; + + MapColumnWriter::MapColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options), + rleVersion(options.getRleVersion()){ + std::unique_ptr<BufferedOutputStream> lengthStream = + factory.createStream(proto::Stream_Kind_LENGTH); + lengthEncoder = createRleEncoder(std::move(lengthStream), + false, + rleVersion, + memPool, + options.getAlignedBitpacking()); + + if (type.getSubtypeCount() > 0) { + keyWriter = buildWriter(*type.getSubtype(0), factory, options); + } + + if (type.getSubtypeCount() > 1) { + elemWriter = buildWriter(*type.getSubtype(1), factory, options); + } + + if (enableIndex) { + recordPosition(); + } + } + + MapColumnWriter::~MapColumnWriter() { + // PASS + } + + void MapColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + MapVectorBatch* mapBatch = dynamic_cast<MapVectorBatch*>(&rowBatch); + if (mapBatch == nullptr) { + throw InvalidArgument("Failed to cast to MapVectorBatch"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + int64_t* offsets = mapBatch->offsets.data() + offset; + const char* notNull = mapBatch->hasNulls ? 
+ mapBatch->notNull.data() + offset : nullptr; + + uint64_t elemOffset = static_cast<uint64_t>(offsets[0]); + uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]); + + // translate offsets to lengths + for (uint64_t i = 0; i != numValues; ++i) { + offsets[i] = offsets[i + 1] - offsets[i]; + } + + lengthEncoder->add(offsets, numValues, notNull); + + // unnecessary to deal with null as keys and values are packed together + if (keyWriter.get()) { + keyWriter->add(*mapBatch->keys, elemOffset, totalNumValues, nullptr); + } + if (elemWriter.get()) { + elemWriter->add(*mapBatch->elements, elemOffset, totalNumValues, nullptr); + } + + if (enableIndex) { + if (!notNull) { + colIndexStatistics->increase(numValues); + } else { + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull[i]) { + ++count; + if (enableBloomFilter) { + bloomFilter->addLong(offsets[i]); + } + } + } + colIndexStatistics->increase(count); + if (count < numValues) { + colIndexStatistics->setHasNull(true); + } + } + } + } + + void MapColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_LENGTH); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(lengthEncoder->flush()); + streams.push_back(stream); + + if (keyWriter.get()) { + keyWriter->flush(streams); + } + if (elemWriter.get()) { + elemWriter->flush(streams); + } + } + + void MapColumnWriter::writeIndex( + std::vector<proto::Stream> &streams) const { + ColumnWriter::writeIndex(streams); + if (keyWriter.get()) { + keyWriter->writeIndex(streams); + } + if (elemWriter.get()) { + elemWriter->writeIndex(streams); + } + } + + uint64_t MapColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += lengthEncoder->getBufferSize(); + if (keyWriter.get()) { + size += keyWriter->getEstimatedSize(); + } + if (elemWriter.get()) { + size += elemWriter->getEstimatedSize(); + } + return size; + } + + void MapColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(RleVersionMapper(rleVersion)); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + if (keyWriter.get()) { + keyWriter->getColumnEncoding(encodings); + } + if (elemWriter.get()) { + elemWriter->getColumnEncoding(encodings); + } + } + + void MapColumnWriter::getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + ColumnWriter::getStripeStatistics(stats); + if (keyWriter.get()) { + keyWriter->getStripeStatistics(stats); + } + if (elemWriter.get()) { + elemWriter->getStripeStatistics(stats); + } + } + + void MapColumnWriter::mergeStripeStatsIntoFileStats() { + ColumnWriter::mergeStripeStatsIntoFileStats(); + if (keyWriter.get()) { + keyWriter->mergeStripeStatsIntoFileStats(); + } + if (elemWriter.get()) { + elemWriter->mergeStripeStatsIntoFileStats(); + } + } + + void MapColumnWriter::getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + ColumnWriter::getFileStatistics(stats); + if (keyWriter.get()) { + keyWriter->getFileStatistics(stats); + } + if (elemWriter.get()) { + elemWriter->getFileStatistics(stats); + } + } + + void MapColumnWriter::mergeRowGroupStatsIntoStripeStats() { + ColumnWriter::mergeRowGroupStatsIntoStripeStats(); + if (keyWriter.get()) { + 
keyWriter->mergeRowGroupStatsIntoStripeStats(); + } + if (elemWriter.get()) { + elemWriter->mergeRowGroupStatsIntoStripeStats(); + } + } + + void MapColumnWriter::createRowIndexEntry() { + ColumnWriter::createRowIndexEntry(); + if (keyWriter.get()) { + keyWriter->createRowIndexEntry(); + } + if (elemWriter.get()) { + elemWriter->createRowIndexEntry(); + } + } + + void MapColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + lengthEncoder->recordPosition(rowIndexPosition.get()); + } + + void MapColumnWriter::reset() { + ColumnWriter::reset(); + if (keyWriter) { + keyWriter->reset(); + } + if (elemWriter) { + elemWriter->reset(); + } + } + + void MapColumnWriter::writeDictionary() { + if (keyWriter) { + keyWriter->writeDictionary(); + } + if (elemWriter) { + elemWriter->writeDictionary(); + } + } + + class UnionColumnWriter : public ColumnWriter { + public: + UnionColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + ~UnionColumnWriter() override; + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void mergeStripeStatsIntoFileStats() override; + + virtual void mergeRowGroupStatsIntoStripeStats() override; + + virtual void createRowIndexEntry() override; + + virtual void writeIndex( + std::vector<proto::Stream> &streams) const override; + + virtual void recordPosition() const override; + + virtual void writeDictionary() override; + + virtual void reset() override; + + private: + std::unique_ptr<ByteRleEncoder> rleEncoder; + std::vector<ColumnWriter*> children; + }; + + UnionColumnWriter::UnionColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options) { + + std::unique_ptr<BufferedOutputStream> dataStream = + factory.createStream(proto::Stream_Kind_DATA); + rleEncoder = createByteRleEncoder(std::move(dataStream)); + + for (uint64_t i = 0; i != type.getSubtypeCount(); ++i) { + children.push_back(buildWriter(*type.getSubtype(i), + factory, + options).release()); + } + + if (enableIndex) { + recordPosition(); + } + } + + UnionColumnWriter::~UnionColumnWriter() { + for (uint32_t i = 0; i < children.size(); ++i) { + delete children[i]; + } + } + + void UnionColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + UnionVectorBatch* unionBatch = dynamic_cast<UnionVectorBatch*>(&rowBatch); + if (unionBatch == nullptr) { + throw InvalidArgument("Failed to cast to UnionVectorBatch"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const char* notNull = unionBatch->hasNulls ? 
+ unionBatch->notNull.data() + offset : nullptr; + unsigned char * tags = unionBatch->tags.data() + offset; + uint64_t * offsets = unionBatch->offsets.data() + offset; + + std::vector<int64_t> childOffset(children.size(), -1); + std::vector<uint64_t> childLength(children.size(), 0); + + for (uint64_t i = 0; i != numValues; ++i) { + if (childOffset[tags[i]] == -1) { + childOffset[tags[i]] = static_cast<int64_t>(offsets[i]); + } + ++childLength[tags[i]]; + } + + rleEncoder->add(reinterpret_cast<char*>(tags), numValues, notNull); + + for (uint32_t i = 0; i < children.size(); ++i) { + if (childLength[i] > 0) { + children[i]->add(*unionBatch->children[i], + static_cast<uint64_t>(childOffset[i]), + childLength[i], nullptr); + } + } + + // update stats + if (enableIndex) { + if (!notNull) { + colIndexStatistics->increase(numValues); + } else { + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull[i]) { + ++count; + if (enableBloomFilter) { + bloomFilter->addLong(tags[i]); + } + } + } + colIndexStatistics->increase(count); + if (count < numValues) { + colIndexStatistics->setHasNull(true); + } + } + } + } + + void UnionColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_DATA); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(rleEncoder->flush()); + streams.push_back(stream); + + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->flush(streams); + } + } + + void UnionColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const { + ColumnWriter::writeIndex(streams); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->writeIndex(streams); + } + } + + uint64_t UnionColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += rleEncoder->getBufferSize(); + for (uint32_t i = 0; i < children.size(); ++i) { + size += children[i]->getEstimatedSize(); + } + return size; + } + + void UnionColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->getColumnEncoding(encodings); + } + } + + void UnionColumnWriter::getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + ColumnWriter::getStripeStatistics(stats); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->getStripeStatistics(stats); + } + } + + void UnionColumnWriter::mergeStripeStatsIntoFileStats() { + ColumnWriter::mergeStripeStatsIntoFileStats(); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->mergeStripeStatsIntoFileStats(); + } + } + + void UnionColumnWriter::getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + ColumnWriter::getFileStatistics(stats); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->getFileStatistics(stats); + } + } + + void UnionColumnWriter::mergeRowGroupStatsIntoStripeStats() { + ColumnWriter::mergeRowGroupStatsIntoStripeStats(); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->mergeRowGroupStatsIntoStripeStats(); + } + } + + void UnionColumnWriter::createRowIndexEntry() { + ColumnWriter::createRowIndexEntry(); + for (uint32_t i = 0; i < children.size(); ++i) { + 
children[i]->createRowIndexEntry(); + } + } + + void UnionColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + rleEncoder->recordPosition(rowIndexPosition.get()); + } + + void UnionColumnWriter::reset() { + ColumnWriter::reset(); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->reset(); + } + } + + void UnionColumnWriter::writeDictionary() { + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->writeDictionary(); + } + } + + std::unique_ptr<ColumnWriter> buildWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) { + switch (static_cast<int64_t>(type.getKind())) { + case STRUCT: + return std::unique_ptr<ColumnWriter>( + new StructColumnWriter( + type, + factory, + options)); + case INT: + case LONG: + case SHORT: + return std::unique_ptr<ColumnWriter>( + new IntegerColumnWriter( + type, + factory, + options)); + case BYTE: + return std::unique_ptr<ColumnWriter>( + new ByteColumnWriter( + type, + factory, + options)); + case BOOLEAN: + return std::unique_ptr<ColumnWriter>( + new BooleanColumnWriter( + type, + factory, + options)); + case DOUBLE: + return std::unique_ptr<ColumnWriter>( + new DoubleColumnWriter( + type, + factory, + options, + false)); + case FLOAT: + return std::unique_ptr<ColumnWriter>( + new DoubleColumnWriter( + type, + factory, + options, + true)); + case BINARY: + return std::unique_ptr<ColumnWriter>( + new BinaryColumnWriter( + type, + factory, + options)); + case STRING: + return std::unique_ptr<ColumnWriter>( + new StringColumnWriter( + type, + factory, + options)); + case CHAR: + return std::unique_ptr<ColumnWriter>( + new CharColumnWriter( + type, + factory, + options)); + case VARCHAR: + return std::unique_ptr<ColumnWriter>( + new VarCharColumnWriter( + type, + factory, + options)); + case DATE: + return std::unique_ptr<ColumnWriter>( + new DateColumnWriter( + type, + factory, + options)); + case TIMESTAMP: + return std::unique_ptr<ColumnWriter>( + new TimestampColumnWriter( + type, + factory, + options)); + case DECIMAL: + if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_64) { + return std::unique_ptr<ColumnWriter>( + new Decimal64ColumnWriter( + type, + factory, + options)); + } else if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_128) { + return std::unique_ptr<ColumnWriter>( + new Decimal128ColumnWriter( + type, + factory, + options)); + } else { + throw NotImplementedYet("Decimal precision more than 38 is not " + "supported"); + } + case LIST: + return std::unique_ptr<ColumnWriter>( + new ListColumnWriter( + type, + factory, + options)); + case MAP: + return std::unique_ptr<ColumnWriter>( + new MapColumnWriter( + type, + factory, + options)); + case UNION: + return std::unique_ptr<ColumnWriter>( + new UnionColumnWriter( + type, + factory, + options)); + default: + throw NotImplementedYet("Type is not supported yet for creating " + "ColumnWriter."); + } + } +} diff --git a/contrib/libs/apache/orc/c++/src/ColumnWriter.hh b/contrib/libs/apache/orc/c++/src/ColumnWriter.hh index cbbb5d00dc..4d7d71cb37 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnWriter.hh +++ b/contrib/libs/apache/orc/c++/src/ColumnWriter.hh @@ -1,221 +1,221 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_COLUMN_WRITER_HH -#define ORC_COLUMN_WRITER_HH - -#include "orc/Vector.hh" - -#include "BloomFilter.hh" -#include "ByteRLE.hh" -#include "Compression.hh" -#include "orc/Exceptions.hh" -#include "Statistics.hh" - -#include "wrap/orc-proto-wrapper.hh" - -namespace orc { - - class StreamsFactory { - public: - virtual ~StreamsFactory(); - - /** - * Get the stream for the given column/kind in this stripe. - * @param kind the kind of the stream - * @return the buffered output stream - */ - virtual std::unique_ptr<BufferedOutputStream> - createStream(proto::Stream_Kind kind) const = 0; - }; - - std::unique_ptr<StreamsFactory> createStreamsFactory( - const WriterOptions& options, - OutputStream * outStream); - - /** - * record stream positions for row index - */ - class RowIndexPositionRecorder : public PositionRecorder { - public: - virtual ~RowIndexPositionRecorder() override; - - RowIndexPositionRecorder(proto::RowIndexEntry& entry): - rowIndexEntry(entry) {} - - virtual void add(uint64_t pos) override { - rowIndexEntry.add_positions(pos); - } - - private: - proto::RowIndexEntry& rowIndexEntry; - }; - - /** - * The interface for writing ORC data types. - */ - class ColumnWriter { - protected: - std::unique_ptr<ByteRleEncoder> notNullEncoder; - uint64_t columnId; - std::unique_ptr<MutableColumnStatistics> colIndexStatistics; - std::unique_ptr<MutableColumnStatistics> colStripeStatistics; - std::unique_ptr<MutableColumnStatistics> colFileStatistics; - - bool enableIndex; - // row index for this column, contains all RowIndexEntries in 1 stripe - std::unique_ptr<proto::RowIndex> rowIndex; - std::unique_ptr<proto::RowIndexEntry> rowIndexEntry; - std::unique_ptr<RowIndexPositionRecorder> rowIndexPosition; - - // bloom filters are recorded per row group - bool enableBloomFilter; - std::unique_ptr<BloomFilterImpl> bloomFilter; - std::unique_ptr<proto::BloomFilterIndex> bloomFilterIndex; - - public: - ColumnWriter(const Type& type, const StreamsFactory& factory, - const WriterOptions& options); - - virtual ~ColumnWriter(); - - /** - * Write the next group of values from this rowBatch. - * @param rowBatch the row batch data to write - * @param offset the starting point of row batch to write - * @param numValues the number of values to write - * @param incomingMask if null, all values are not null. Otherwise, it is - * a mask (with at least numValues bytes) for which - * values to write. - */ - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char * incomingMask); - /** - * Flush column writer output streams. - * @param streams vector to store streams generated by flush() - */ - virtual void flush(std::vector<proto::Stream>& streams); - - /** - * Get estimated size of buffer used. - * @return estimated size of buffer used - */ - virtual uint64_t getEstimatedSize() const; - - /** - * Get the encoding used by the writer for this column. 
- * @param encodings vector to store the returned ColumnEncoding info - */ - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const = 0; - - /** - * Get the stripe statistics for this column. - * @param stats vector to store the returned stripe statistics - */ - virtual void getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const; - - /** - * Get the file statistics for this column. - * @param stats vector to store the returned file statistics - */ - virtual void getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const; - - /** - * Merge index stats into stripe stats and reset index stats. - */ - virtual void mergeRowGroupStatsIntoStripeStats(); - - /** - * Merge stripe stats into file stats and reset stripe stats. - */ - virtual void mergeStripeStatsIntoFileStats(); - - /** - * Create a row index entry with the previous location and the current - * index statistics. Also merges the index statistics into the stripe - * statistics before they are cleared. Finally, it records the start of the - * next index and ensures all of the children columns also create an entry. - */ - virtual void createRowIndexEntry(); - - /** - * Create a new BloomFilter entry and add the previous one to BloomFilterIndex - */ - virtual void addBloomFilterEntry(); - - /** - * Write row index streams for this column. - * @param streams output list of ROW_INDEX streams - */ - virtual void writeIndex(std::vector<proto::Stream> &streams) const; - - /** - * Record positions for index. - * - * This function is called by createRowIndexEntry() and ColumnWriter's - * constructor. So base classes do not need to call inherited classes' - * recordPosition() function. - */ - virtual void recordPosition() const; - - /** - * Reset positions for index. - */ - virtual void reset(); - - /** - * Write dictionary to streams for string columns - */ - virtual void writeDictionary(); - - protected: - /** - * Utility function to translate ColumnStatistics into protobuf form and - * add it to output list. - * @param statsList output list for protobuf stats - * @param stats ColumnStatistics to be transformed and added - */ - void getProtoBufStatistics( - std::vector<proto::ColumnStatistics>& statsList, - const MutableColumnStatistics* stats) const { - proto::ColumnStatistics pbStats; - stats->toProtoBuf(pbStats); - statsList.push_back(pbStats); - } - - protected: - MemoryPool& memPool; - std::unique_ptr<BufferedOutputStream> indexStream; - std::unique_ptr<BufferedOutputStream> bloomFilterStream; - }; - - /** - * Create a writer for the given type. - */ - std::unique_ptr<ColumnWriter> buildWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_COLUMN_WRITER_HH +#define ORC_COLUMN_WRITER_HH + +#include "orc/Vector.hh" + +#include "BloomFilter.hh" +#include "ByteRLE.hh" +#include "Compression.hh" +#include "orc/Exceptions.hh" +#include "Statistics.hh" + +#include "wrap/orc-proto-wrapper.hh" + +namespace orc { + + class StreamsFactory { + public: + virtual ~StreamsFactory(); + + /** + * Get the stream for the given column/kind in this stripe. + * @param kind the kind of the stream + * @return the buffered output stream + */ + virtual std::unique_ptr<BufferedOutputStream> + createStream(proto::Stream_Kind kind) const = 0; + }; + + std::unique_ptr<StreamsFactory> createStreamsFactory( + const WriterOptions& options, + OutputStream * outStream); + + /** + * record stream positions for row index + */ + class RowIndexPositionRecorder : public PositionRecorder { + public: + virtual ~RowIndexPositionRecorder() override; + + RowIndexPositionRecorder(proto::RowIndexEntry& entry): + rowIndexEntry(entry) {} + + virtual void add(uint64_t pos) override { + rowIndexEntry.add_positions(pos); + } + + private: + proto::RowIndexEntry& rowIndexEntry; + }; + + /** + * The interface for writing ORC data types. + */ + class ColumnWriter { + protected: + std::unique_ptr<ByteRleEncoder> notNullEncoder; + uint64_t columnId; + std::unique_ptr<MutableColumnStatistics> colIndexStatistics; + std::unique_ptr<MutableColumnStatistics> colStripeStatistics; + std::unique_ptr<MutableColumnStatistics> colFileStatistics; + + bool enableIndex; + // row index for this column, contains all RowIndexEntries in 1 stripe + std::unique_ptr<proto::RowIndex> rowIndex; + std::unique_ptr<proto::RowIndexEntry> rowIndexEntry; + std::unique_ptr<RowIndexPositionRecorder> rowIndexPosition; + + // bloom filters are recorded per row group + bool enableBloomFilter; + std::unique_ptr<BloomFilterImpl> bloomFilter; + std::unique_ptr<proto::BloomFilterIndex> bloomFilterIndex; + + public: + ColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options); + + virtual ~ColumnWriter(); + + /** + * Write the next group of values from this rowBatch. + * @param rowBatch the row batch data to write + * @param offset the starting point of row batch to write + * @param numValues the number of values to write + * @param incomingMask if null, all values are not null. Otherwise, it is + * a mask (with at least numValues bytes) for which + * values to write. + */ + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char * incomingMask); + /** + * Flush column writer output streams. + * @param streams vector to store streams generated by flush() + */ + virtual void flush(std::vector<proto::Stream>& streams); + + /** + * Get estimated size of buffer used. + * @return estimated size of buffer used + */ + virtual uint64_t getEstimatedSize() const; + + /** + * Get the encoding used by the writer for this column. + * @param encodings vector to store the returned ColumnEncoding info + */ + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const = 0; + + /** + * Get the stripe statistics for this column. + * @param stats vector to store the returned stripe statistics + */ + virtual void getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const; + + /** + * Get the file statistics for this column. 
+ * @param stats vector to store the returned file statistics + */ + virtual void getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const; + + /** + * Merge index stats into stripe stats and reset index stats. + */ + virtual void mergeRowGroupStatsIntoStripeStats(); + + /** + * Merge stripe stats into file stats and reset stripe stats. + */ + virtual void mergeStripeStatsIntoFileStats(); + + /** + * Create a row index entry with the previous location and the current + * index statistics. Also merges the index statistics into the stripe + * statistics before they are cleared. Finally, it records the start of the + * next index and ensures all of the children columns also create an entry. + */ + virtual void createRowIndexEntry(); + + /** + * Create a new BloomFilter entry and add the previous one to BloomFilterIndex + */ + virtual void addBloomFilterEntry(); + + /** + * Write row index streams for this column. + * @param streams output list of ROW_INDEX streams + */ + virtual void writeIndex(std::vector<proto::Stream> &streams) const; + + /** + * Record positions for index. + * + * This function is called by createRowIndexEntry() and ColumnWriter's + * constructor. So base classes do not need to call inherited classes' + * recordPosition() function. + */ + virtual void recordPosition() const; + + /** + * Reset positions for index. + */ + virtual void reset(); + + /** + * Write dictionary to streams for string columns + */ + virtual void writeDictionary(); + + protected: + /** + * Utility function to translate ColumnStatistics into protobuf form and + * add it to output list. + * @param statsList output list for protobuf stats + * @param stats ColumnStatistics to be transformed and added + */ + void getProtoBufStatistics( + std::vector<proto::ColumnStatistics>& statsList, + const MutableColumnStatistics* stats) const { + proto::ColumnStatistics pbStats; + stats->toProtoBuf(pbStats); + statsList.push_back(pbStats); + } + + protected: + MemoryPool& memPool; + std::unique_ptr<BufferedOutputStream> indexStream; + std::unique_ptr<BufferedOutputStream> bloomFilterStream; + }; + + /** + * Create a writer for the given type. + */ + std::unique_ptr<ColumnWriter> buildWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/Common.cc b/contrib/libs/apache/orc/c++/src/Common.cc index dbf073797e..e50f085d30 100644 --- a/contrib/libs/apache/orc/c++/src/Common.cc +++ b/contrib/libs/apache/orc/c++/src/Common.cc @@ -1,75 +1,75 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "orc/Common.hh" - -#include <sstream> - -namespace orc { - - std::string compressionKindToString(CompressionKind kind) { - switch (static_cast<int>(kind)) { - case CompressionKind_NONE: - return "none"; - case CompressionKind_ZLIB: - return "zlib"; - case CompressionKind_SNAPPY: - return "snappy"; - case CompressionKind_LZO: - return "lzo"; - case CompressionKind_LZ4: - return "lz4"; - case CompressionKind_ZSTD: - return "zstd"; - } - std::stringstream buffer; - buffer << "unknown - " << kind; - return buffer.str(); - } - - std::string writerVersionToString(WriterVersion version) { - switch (static_cast<int>(version)) { - case WriterVersion_ORIGINAL: - return "original"; - case WriterVersion_HIVE_8732: - return "HIVE-8732"; - case WriterVersion_HIVE_4243: - return "HIVE-4243"; - case WriterVersion_HIVE_12055: - return "HIVE-12055"; - case WriterVersion_HIVE_13083: - return "HIVE-13083"; - case WriterVersion_ORC_101: - return "ORC-101"; - case WriterVersion_ORC_135: - return "ORC-135"; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Common.hh" + +#include <sstream> + +namespace orc { + + std::string compressionKindToString(CompressionKind kind) { + switch (static_cast<int>(kind)) { + case CompressionKind_NONE: + return "none"; + case CompressionKind_ZLIB: + return "zlib"; + case CompressionKind_SNAPPY: + return "snappy"; + case CompressionKind_LZO: + return "lzo"; + case CompressionKind_LZ4: + return "lz4"; + case CompressionKind_ZSTD: + return "zstd"; + } + std::stringstream buffer; + buffer << "unknown - " << kind; + return buffer.str(); + } + + std::string writerVersionToString(WriterVersion version) { + switch (static_cast<int>(version)) { + case WriterVersion_ORIGINAL: + return "original"; + case WriterVersion_HIVE_8732: + return "HIVE-8732"; + case WriterVersion_HIVE_4243: + return "HIVE-4243"; + case WriterVersion_HIVE_12055: + return "HIVE-12055"; + case WriterVersion_HIVE_13083: + return "HIVE-13083"; + case WriterVersion_ORC_101: + return "ORC-101"; + case WriterVersion_ORC_135: + return "ORC-135"; case WriterVersion_ORC_517: return "ORC-517"; case WriterVersion_ORC_203: return "ORC-203"; case WriterVersion_ORC_14: return "ORC-14"; - } - std::stringstream buffer; - buffer << "future - " << version; - return buffer.str(); - } - + } + std::stringstream buffer; + buffer << "future - " << version; + return buffer.str(); + } + std::string writerIdToString(uint32_t id) { switch (id) { case ORC_JAVA_WRITER: @@ -90,59 +90,59 @@ namespace orc { } } - std::string streamKindToString(StreamKind kind) { - switch (static_cast<int>(kind)) { - case StreamKind_PRESENT: - return "present"; - case StreamKind_DATA: - return "data"; - case StreamKind_LENGTH: - return "length"; - case StreamKind_DICTIONARY_DATA: - return "dictionary"; - case StreamKind_DICTIONARY_COUNT: - return "dictionary count"; - case StreamKind_SECONDARY: - return "secondary"; - case StreamKind_ROW_INDEX: - return "index"; - case StreamKind_BLOOM_FILTER: - return "bloom"; - } - std::stringstream buffer; - buffer << "unknown - " << kind; - return buffer.str(); - } - - std::string columnEncodingKindToString(ColumnEncodingKind kind) { - switch (static_cast<int>(kind)) { - case ColumnEncodingKind_DIRECT: - return "direct"; - case ColumnEncodingKind_DICTIONARY: - return "dictionary"; - case ColumnEncodingKind_DIRECT_V2: - return "direct rle2"; - case ColumnEncodingKind_DICTIONARY_V2: - return "dictionary rle2"; - } - std::stringstream buffer; - buffer << "unknown - " << kind; - return buffer.str(); - } - - std::string FileVersion::toString() const { - std::stringstream ss; - ss << getMajor() << '.' 
<< getMinor(); - return ss.str(); - } - - const FileVersion& FileVersion::v_0_11(){ - static FileVersion version(0,11); - return version; - } - - const FileVersion& FileVersion::v_0_12(){ - static FileVersion version(0,12); - return version; - } -} + std::string streamKindToString(StreamKind kind) { + switch (static_cast<int>(kind)) { + case StreamKind_PRESENT: + return "present"; + case StreamKind_DATA: + return "data"; + case StreamKind_LENGTH: + return "length"; + case StreamKind_DICTIONARY_DATA: + return "dictionary"; + case StreamKind_DICTIONARY_COUNT: + return "dictionary count"; + case StreamKind_SECONDARY: + return "secondary"; + case StreamKind_ROW_INDEX: + return "index"; + case StreamKind_BLOOM_FILTER: + return "bloom"; + } + std::stringstream buffer; + buffer << "unknown - " << kind; + return buffer.str(); + } + + std::string columnEncodingKindToString(ColumnEncodingKind kind) { + switch (static_cast<int>(kind)) { + case ColumnEncodingKind_DIRECT: + return "direct"; + case ColumnEncodingKind_DICTIONARY: + return "dictionary"; + case ColumnEncodingKind_DIRECT_V2: + return "direct rle2"; + case ColumnEncodingKind_DICTIONARY_V2: + return "dictionary rle2"; + } + std::stringstream buffer; + buffer << "unknown - " << kind; + return buffer.str(); + } + + std::string FileVersion::toString() const { + std::stringstream ss; + ss << getMajor() << '.' << getMinor(); + return ss.str(); + } + + const FileVersion& FileVersion::v_0_11(){ + static FileVersion version(0,11); + return version; + } + + const FileVersion& FileVersion::v_0_12(){ + static FileVersion version(0,12); + return version; + } +} diff --git a/contrib/libs/apache/orc/c++/src/Compression.cc b/contrib/libs/apache/orc/c++/src/Compression.cc index 4278ed7aae..057641ec1f 100644 --- a/contrib/libs/apache/orc/c++/src/Compression.cc +++ b/contrib/libs/apache/orc/c++/src/Compression.cc @@ -1,1071 +1,1071 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "Adaptor.hh" -#include "Compression.hh" -#include "orc/Exceptions.hh" -#include "LzoDecompressor.hh" -#include "lz4.h" - -#include <algorithm> -#include <iomanip> -#include <iostream> -#include <sstream> - -#include "zlib.h" -#include "zstd.h" - -#include "wrap/snappy-wrapper.h" - -#ifndef ZSTD_CLEVEL_DEFAULT -#define ZSTD_CLEVEL_DEFAULT 3 -#endif - -namespace orc { - - class CompressionStreamBase: public BufferedOutputStream { - public: - CompressionStreamBase(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool); - - virtual bool Next(void** data, int*size) override = 0; - virtual void BackUp(int count) override; - - virtual std::string getName() const override = 0; - virtual uint64_t flush() override; - - virtual bool isCompressed() const override { return true; } - virtual uint64_t getSize() const override; - - protected: - void writeHeader(char * buffer, size_t compressedSize, bool original) { - buffer[0] = static_cast<char>((compressedSize << 1) + (original ? 1 : 0)); - buffer[1] = static_cast<char>(compressedSize >> 7); - buffer[2] = static_cast<char>(compressedSize >> 15); - } - - // ensure enough room for compression block header - void ensureHeader(); - - // Buffer to hold uncompressed data until user calls Next() - DataBuffer<unsigned char> rawInputBuffer; - - // Compress level - int level; - - // Compressed data output buffer - char * outputBuffer; - - // Size for compressionBuffer - int bufferSize; - - // Compress output position - int outputPosition; - - // Compress output buffer size - int outputSize; - }; - - CompressionStreamBase::CompressionStreamBase(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) : - BufferedOutputStream(pool, - outStream, - capacity, - blockSize), - rawInputBuffer(pool, blockSize), - level(compressionLevel), - outputBuffer(nullptr), - bufferSize(0), - outputPosition(0), - outputSize(0) { - // PASS - } - - void CompressionStreamBase::BackUp(int count) { - if (count > bufferSize) { - throw std::logic_error("Can't backup that much!"); - } - bufferSize -= count; - } - - uint64_t CompressionStreamBase::flush() { - void * data; - int size; - if (!Next(&data, &size)) { - throw std::runtime_error("Failed to flush compression buffer."); - } - BufferedOutputStream::BackUp(outputSize - outputPosition); - bufferSize = outputSize = outputPosition = 0; - return BufferedOutputStream::flush(); - } - - uint64_t CompressionStreamBase::getSize() const { - return BufferedOutputStream::getSize() - - static_cast<uint64_t>(outputSize - outputPosition); - } - - void CompressionStreamBase::ensureHeader() { - // adjust 3 bytes for the compression header - if (outputPosition + 3 >= outputSize) { - int newPosition = outputPosition + 3 - outputSize; - if (!BufferedOutputStream::Next( - reinterpret_cast<void **>(&outputBuffer), - &outputSize)) { - throw std::runtime_error( - "Failed to get next output buffer from output stream."); - } - outputPosition = newPosition; - } else { - outputPosition += 3; - } - } - - /** - * Streaming compression base class - */ - class CompressionStream: public CompressionStreamBase { - public: - CompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool); - - virtual bool Next(void** data, int*size) override; - virtual std::string getName() const override = 0; - - protected: - // return total compressed size - virtual uint64_t 
doStreamingCompression() = 0; - }; - - CompressionStream::CompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) : - CompressionStreamBase(outStream, - compressionLevel, - capacity, - blockSize, - pool) { - // PASS - } - - bool CompressionStream::Next(void** data, int*size) { - if (bufferSize != 0) { - ensureHeader(); - - uint64_t totalCompressedSize = doStreamingCompression(); - - char * header = outputBuffer + outputPosition - totalCompressedSize - 3; - if (totalCompressedSize >= static_cast<unsigned long>(bufferSize)) { - writeHeader(header, static_cast<size_t>(bufferSize), true); - memcpy( - header + 3, - rawInputBuffer.data(), - static_cast<size_t>(bufferSize)); - - int backup = static_cast<int>(totalCompressedSize) - bufferSize; - BufferedOutputStream::BackUp(backup); - outputPosition -= backup; - outputSize -= backup; - } else { - writeHeader(header, totalCompressedSize, false); - } - } - - *data = rawInputBuffer.data(); - *size = static_cast<int>(rawInputBuffer.size()); - bufferSize = *size; - - return true; - } - - class ZlibCompressionStream: public CompressionStream { - public: - ZlibCompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool); - - virtual ~ZlibCompressionStream() override { - end(); - } - - virtual std::string getName() const override; - - protected: - virtual uint64_t doStreamingCompression() override; - - private: - void init(); - void end(); - z_stream strm; - }; - - ZlibCompressionStream::ZlibCompressionStream( - OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) - : CompressionStream(outStream, - compressionLevel, - capacity, - blockSize, - pool) { - init(); - } - - uint64_t ZlibCompressionStream::doStreamingCompression() { - if (deflateReset(&strm) != Z_OK) { - throw std::runtime_error("Failed to reset inflate."); - } - - strm.avail_in = static_cast<unsigned int>(bufferSize); - strm.next_in = rawInputBuffer.data(); - - do { - if (outputPosition >= outputSize) { - if (!BufferedOutputStream::Next( - reinterpret_cast<void **>(&outputBuffer), - &outputSize)) { - throw std::runtime_error( - "Failed to get next output buffer from output stream."); - } - outputPosition = 0; - } - strm.next_out = reinterpret_cast<unsigned char *> - (outputBuffer + outputPosition); - strm.avail_out = static_cast<unsigned int> - (outputSize - outputPosition); - - int ret = deflate(&strm, Z_FINISH); - outputPosition = outputSize - static_cast<int>(strm.avail_out); - - if (ret == Z_STREAM_END) { - break; - } else if (ret == Z_OK) { - // needs more buffer so will continue the loop - } else { - throw std::runtime_error("Failed to deflate input data."); - } - } while (strm.avail_out == 0); - - return strm.total_out; - } - - std::string ZlibCompressionStream::getName() const { - return "ZlibCompressionStream"; - } - -DIAGNOSTIC_PUSH - -#if defined(__GNUC__) || defined(__clang__) - DIAGNOSTIC_IGNORE("-Wold-style-cast") -#endif - - void ZlibCompressionStream::init() { - strm.zalloc = nullptr; - strm.zfree = nullptr; - strm.opaque = nullptr; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Adaptor.hh" +#include "Compression.hh" +#include "orc/Exceptions.hh" +#include "LzoDecompressor.hh" +#include "lz4.h" + +#include <algorithm> +#include <iomanip> +#include <iostream> +#include <sstream> + +#include "zlib.h" +#include "zstd.h" + +#include "wrap/snappy-wrapper.h" + +#ifndef ZSTD_CLEVEL_DEFAULT +#define ZSTD_CLEVEL_DEFAULT 3 +#endif + +namespace orc { + + class CompressionStreamBase: public BufferedOutputStream { + public: + CompressionStreamBase(OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool); + + virtual bool Next(void** data, int*size) override = 0; + virtual void BackUp(int count) override; + + virtual std::string getName() const override = 0; + virtual uint64_t flush() override; + + virtual bool isCompressed() const override { return true; } + virtual uint64_t getSize() const override; + + protected: + void writeHeader(char * buffer, size_t compressedSize, bool original) { + buffer[0] = static_cast<char>((compressedSize << 1) + (original ? 1 : 0)); + buffer[1] = static_cast<char>(compressedSize >> 7); + buffer[2] = static_cast<char>(compressedSize >> 15); + } + + // ensure enough room for compression block header + void ensureHeader(); + + // Buffer to hold uncompressed data until user calls Next() + DataBuffer<unsigned char> rawInputBuffer; + + // Compress level + int level; + + // Compressed data output buffer + char * outputBuffer; + + // Size for compressionBuffer + int bufferSize; + + // Compress output position + int outputPosition; + + // Compress output buffer size + int outputSize; + }; + + CompressionStreamBase::CompressionStreamBase(OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool) : + BufferedOutputStream(pool, + outStream, + capacity, + blockSize), + rawInputBuffer(pool, blockSize), + level(compressionLevel), + outputBuffer(nullptr), + bufferSize(0), + outputPosition(0), + outputSize(0) { + // PASS + } + + void CompressionStreamBase::BackUp(int count) { + if (count > bufferSize) { + throw std::logic_error("Can't backup that much!"); + } + bufferSize -= count; + } + + uint64_t CompressionStreamBase::flush() { + void * data; + int size; + if (!Next(&data, &size)) { + throw std::runtime_error("Failed to flush compression buffer."); + } + BufferedOutputStream::BackUp(outputSize - outputPosition); + bufferSize = outputSize = outputPosition = 0; + return BufferedOutputStream::flush(); + } + + uint64_t CompressionStreamBase::getSize() const { + return BufferedOutputStream::getSize() - + static_cast<uint64_t>(outputSize - outputPosition); + } + + void CompressionStreamBase::ensureHeader() { + // adjust 3 bytes for the compression header + if (outputPosition + 3 >= outputSize) { + int newPosition = outputPosition + 3 - outputSize; + if (!BufferedOutputStream::Next( + reinterpret_cast<void **>(&outputBuffer), + &outputSize)) { + throw 
std::runtime_error( + "Failed to get next output buffer from output stream."); + } + outputPosition = newPosition; + } else { + outputPosition += 3; + } + } + + /** + * Streaming compression base class + */ + class CompressionStream: public CompressionStreamBase { + public: + CompressionStream(OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool); + + virtual bool Next(void** data, int*size) override; + virtual std::string getName() const override = 0; + + protected: + // return total compressed size + virtual uint64_t doStreamingCompression() = 0; + }; + + CompressionStream::CompressionStream(OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool) : + CompressionStreamBase(outStream, + compressionLevel, + capacity, + blockSize, + pool) { + // PASS + } + + bool CompressionStream::Next(void** data, int*size) { + if (bufferSize != 0) { + ensureHeader(); + + uint64_t totalCompressedSize = doStreamingCompression(); + + char * header = outputBuffer + outputPosition - totalCompressedSize - 3; + if (totalCompressedSize >= static_cast<unsigned long>(bufferSize)) { + writeHeader(header, static_cast<size_t>(bufferSize), true); + memcpy( + header + 3, + rawInputBuffer.data(), + static_cast<size_t>(bufferSize)); + + int backup = static_cast<int>(totalCompressedSize) - bufferSize; + BufferedOutputStream::BackUp(backup); + outputPosition -= backup; + outputSize -= backup; + } else { + writeHeader(header, totalCompressedSize, false); + } + } + + *data = rawInputBuffer.data(); + *size = static_cast<int>(rawInputBuffer.size()); + bufferSize = *size; + + return true; + } + + class ZlibCompressionStream: public CompressionStream { + public: + ZlibCompressionStream(OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool); + + virtual ~ZlibCompressionStream() override { + end(); + } + + virtual std::string getName() const override; + + protected: + virtual uint64_t doStreamingCompression() override; + + private: + void init(); + void end(); + z_stream strm; + }; + + ZlibCompressionStream::ZlibCompressionStream( + OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool) + : CompressionStream(outStream, + compressionLevel, + capacity, + blockSize, + pool) { + init(); + } + + uint64_t ZlibCompressionStream::doStreamingCompression() { + if (deflateReset(&strm) != Z_OK) { + throw std::runtime_error("Failed to reset inflate."); + } + + strm.avail_in = static_cast<unsigned int>(bufferSize); + strm.next_in = rawInputBuffer.data(); + + do { + if (outputPosition >= outputSize) { + if (!BufferedOutputStream::Next( + reinterpret_cast<void **>(&outputBuffer), + &outputSize)) { + throw std::runtime_error( + "Failed to get next output buffer from output stream."); + } + outputPosition = 0; + } + strm.next_out = reinterpret_cast<unsigned char *> + (outputBuffer + outputPosition); + strm.avail_out = static_cast<unsigned int> + (outputSize - outputPosition); + + int ret = deflate(&strm, Z_FINISH); + outputPosition = outputSize - static_cast<int>(strm.avail_out); + + if (ret == Z_STREAM_END) { + break; + } else if (ret == Z_OK) { + // needs more buffer so will continue the loop + } else { + throw std::runtime_error("Failed to deflate input data."); + } + } while (strm.avail_out == 0); + + return strm.total_out; + } + + std::string ZlibCompressionStream::getName() const { + return "ZlibCompressionStream"; 
+ } + +DIAGNOSTIC_PUSH + +#if defined(__GNUC__) || defined(__clang__) + DIAGNOSTIC_IGNORE("-Wold-style-cast") +#endif + + void ZlibCompressionStream::init() { + strm.zalloc = nullptr; + strm.zfree = nullptr; + strm.opaque = nullptr; strm.next_in = nullptr; - - if (deflateInit2(&strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) - != Z_OK) { - throw std::runtime_error("Error while calling deflateInit2() for zlib."); - } - } - - void ZlibCompressionStream::end() { - (void)deflateEnd(&strm); - } - -DIAGNOSTIC_PUSH - - enum DecompressState { DECOMPRESS_HEADER, - DECOMPRESS_START, - DECOMPRESS_CONTINUE, - DECOMPRESS_ORIGINAL, - DECOMPRESS_EOF}; - - class ZlibDecompressionStream: public SeekableInputStream { - public: - ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& pool); - virtual ~ZlibDecompressionStream() override; - virtual bool Next(const void** data, int*size) override; - virtual void BackUp(int count) override; - virtual bool Skip(int count) override; - virtual int64_t ByteCount() const override; - virtual void seek(PositionProvider& position) override; - virtual std::string getName() const override; - - private: - void readBuffer(bool failOnEof) { - int length; - if (!input->Next(reinterpret_cast<const void**>(&inputBuffer), - &length)) { - if (failOnEof) { - throw ParseError("Read past EOF in " - "ZlibDecompressionStream::readBuffer"); - } - state = DECOMPRESS_EOF; - inputBuffer = nullptr; - inputBufferEnd = nullptr; - } else { - inputBufferEnd = inputBuffer + length; - } - } - - uint32_t readByte(bool failOnEof) { - if (inputBuffer == inputBufferEnd) { - readBuffer(failOnEof); - if (state == DECOMPRESS_EOF) { - return 0; - } - } - return static_cast<unsigned char>(*(inputBuffer++)); - } - - void readHeader() { - uint32_t header = readByte(false); - if (state != DECOMPRESS_EOF) { - header |= readByte(true) << 8; - header |= readByte(true) << 16; - if (header & 1) { - state = DECOMPRESS_ORIGINAL; - } else { - state = DECOMPRESS_START; - } - remainingLength = header >> 1; - } else { - remainingLength = 0; - } - } - - MemoryPool& pool; - const size_t blockSize; - std::unique_ptr<SeekableInputStream> input; - z_stream zstream; - DataBuffer<char> buffer; - - // the current state - DecompressState state; - - // the start of the current buffer - // This pointer is not owned by us. It is either owned by zstream or - // the underlying stream. 
- const char* outputBuffer; - // the size of the current buffer - size_t outputBufferLength; - // the size of the current chunk - size_t remainingLength; - - // the last buffer returned from the input - const char *inputBuffer; - const char *inputBufferEnd; - - // roughly the number of bytes returned - off_t bytesReturned; - }; - -DIAGNOSTIC_PUSH - -#if defined(__GNUC__) || defined(__clang__) - DIAGNOSTIC_IGNORE("-Wold-style-cast") -#endif - - ZlibDecompressionStream::ZlibDecompressionStream - (std::unique_ptr<SeekableInputStream> inStream, - size_t _blockSize, - MemoryPool& _pool - ): pool(_pool), - blockSize(_blockSize), - buffer(pool, _blockSize) { - input.reset(inStream.release()); - zstream.next_in = nullptr; - zstream.avail_in = 0; - zstream.zalloc = nullptr; - zstream.zfree = nullptr; - zstream.opaque = nullptr; - zstream.next_out = reinterpret_cast<Bytef*>(buffer.data()); - zstream.avail_out = static_cast<uInt>(blockSize); - int64_t result = inflateInit2(&zstream, -15); - switch (result) { - case Z_OK: - break; - case Z_MEM_ERROR: - throw std::logic_error("Memory error from inflateInit2"); - case Z_VERSION_ERROR: - throw std::logic_error("Version error from inflateInit2"); - case Z_STREAM_ERROR: - throw std::logic_error("Stream error from inflateInit2"); - default: - throw std::logic_error("Unknown error from inflateInit2"); - } - outputBuffer = nullptr; - outputBufferLength = 0; - remainingLength = 0; - state = DECOMPRESS_HEADER; - inputBuffer = nullptr; - inputBufferEnd = nullptr; - bytesReturned = 0; - } - -DIAGNOSTIC_POP - - ZlibDecompressionStream::~ZlibDecompressionStream() { - int64_t result = inflateEnd(&zstream); - if (result != Z_OK) { - // really can't throw in destructors - std::cout << "Error in ~ZlibDecompressionStream() " << result << "\n"; - } - } - - bool ZlibDecompressionStream::Next(const void** data, int*size) { - // if the user pushed back, return them the partial buffer - if (outputBufferLength) { - *data = outputBuffer; - *size = static_cast<int>(outputBufferLength); - outputBuffer += outputBufferLength; - outputBufferLength = 0; - return true; - } - if (state == DECOMPRESS_HEADER || remainingLength == 0) { - readHeader(); - } - if (state == DECOMPRESS_EOF) { - return false; - } - if (inputBuffer == inputBufferEnd) { - readBuffer(true); - } - size_t availSize = - std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), - remainingLength); - if (state == DECOMPRESS_ORIGINAL) { - *data = inputBuffer; - *size = static_cast<int>(availSize); - outputBuffer = inputBuffer + availSize; - outputBufferLength = 0; - } else if (state == DECOMPRESS_START) { - zstream.next_in = - reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); - zstream.avail_in = static_cast<uInt>(availSize); - outputBuffer = buffer.data(); - zstream.next_out = - reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer)); - zstream.avail_out = static_cast<uInt>(blockSize); - if (inflateReset(&zstream) != Z_OK) { - throw std::logic_error("Bad inflateReset in " - "ZlibDecompressionStream::Next"); - } - int64_t result; - do { - result = inflate(&zstream, availSize == remainingLength ? 
Z_FINISH : - Z_SYNC_FLUSH); - switch (result) { - case Z_OK: - remainingLength -= availSize; - inputBuffer += availSize; - readBuffer(true); - availSize = - std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), - remainingLength); - zstream.next_in = - reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); - zstream.avail_in = static_cast<uInt>(availSize); - break; - case Z_STREAM_END: - break; - case Z_BUF_ERROR: - throw std::logic_error("Buffer error in " - "ZlibDecompressionStream::Next"); - case Z_DATA_ERROR: - throw std::logic_error("Data error in " - "ZlibDecompressionStream::Next"); - case Z_STREAM_ERROR: - throw std::logic_error("Stream error in " - "ZlibDecompressionStream::Next"); - default: - throw std::logic_error("Unknown error in " - "ZlibDecompressionStream::Next"); - } - } while (result != Z_STREAM_END); - *size = static_cast<int>(blockSize - zstream.avail_out); - *data = outputBuffer; - outputBufferLength = 0; - outputBuffer += *size; - } else { - throw std::logic_error("Unknown compression state in " - "ZlibDecompressionStream::Next"); - } - inputBuffer += availSize; - remainingLength -= availSize; - bytesReturned += *size; - return true; - } - - void ZlibDecompressionStream::BackUp(int count) { - if (outputBuffer == nullptr || outputBufferLength != 0) { - throw std::logic_error("Backup without previous Next in " - "ZlibDecompressionStream"); - } - outputBuffer -= static_cast<size_t>(count); - outputBufferLength = static_cast<size_t>(count); - bytesReturned -= count; - } - - bool ZlibDecompressionStream::Skip(int count) { - bytesReturned += count; - // this is a stupid implementation for now. - // should skip entire blocks without decompressing - while (count > 0) { - const void *ptr; - int len; - if (!Next(&ptr, &len)) { - return false; - } - if (len > count) { - BackUp(len - count); - count = 0; - } else { - count -= len; - } - } - return true; - } - - int64_t ZlibDecompressionStream::ByteCount() const { - return bytesReturned; - } - - void ZlibDecompressionStream::seek(PositionProvider& position) { - // clear state to force seek to read from the right position - state = DECOMPRESS_HEADER; - outputBuffer = nullptr; - outputBufferLength = 0; - remainingLength = 0; - inputBuffer = nullptr; - inputBufferEnd = nullptr; - - input->seek(position); - bytesReturned = static_cast<off_t>(input->ByteCount()); - if (!Skip(static_cast<int>(position.next()))) { - throw ParseError("Bad skip in ZlibDecompressionStream::seek"); - } - } - - std::string ZlibDecompressionStream::getName() const { - std::ostringstream result; - result << "zlib(" << input->getName() << ")"; - return result.str(); - } - - class BlockDecompressionStream: public SeekableInputStream { - public: - BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& pool); - - virtual ~BlockDecompressionStream() override {} - virtual bool Next(const void** data, int*size) override; - virtual void BackUp(int count) override; - virtual bool Skip(int count) override; - virtual int64_t ByteCount() const override; - virtual void seek(PositionProvider& position) override; - virtual std::string getName() const override = 0; - - protected: - virtual uint64_t decompress(const char *input, uint64_t length, - char *output, size_t maxOutputLength) = 0; - - std::string getStreamName() const { - return input->getName(); - } - - private: - void readBuffer(bool failOnEof) { - int length; - if (!input->Next(reinterpret_cast<const void**>(&inputBufferPtr), - &length)) { - if (failOnEof) 
{ - throw ParseError(getName() + "read past EOF"); - } - state = DECOMPRESS_EOF; - inputBufferPtr = nullptr; - inputBufferPtrEnd = nullptr; - } else { - inputBufferPtrEnd = inputBufferPtr + length; - } - } - - uint32_t readByte(bool failOnEof) { - if (inputBufferPtr == inputBufferPtrEnd) { - readBuffer(failOnEof); - if (state == DECOMPRESS_EOF) { - return 0; - } - } - return static_cast<unsigned char>(*(inputBufferPtr++)); - } - - void readHeader() { - uint32_t header = readByte(false); - if (state != DECOMPRESS_EOF) { - header |= readByte(true) << 8; - header |= readByte(true) << 16; - if (header & 1) { - state = DECOMPRESS_ORIGINAL; - } else { - state = DECOMPRESS_START; - } - remainingLength = header >> 1; - } else { - remainingLength = 0; - } - } - - std::unique_ptr<SeekableInputStream> input; - MemoryPool& pool; - - // may need to stitch together multiple input buffers; - // to give snappy a contiguous block - DataBuffer<char> inputBuffer; - - // uncompressed output - DataBuffer<char> outputBuffer; - - // the current state - DecompressState state; - - // the start of the current output buffer - const char* outputBufferPtr; - // the size of the current output buffer - size_t outputBufferLength; - - // the size of the current chunk - size_t remainingLength; - - // the last buffer returned from the input - const char *inputBufferPtr; - const char *inputBufferPtrEnd; - - // bytes returned by this stream - off_t bytesReturned; - }; - - BlockDecompressionStream::BlockDecompressionStream - (std::unique_ptr<SeekableInputStream> inStream, - size_t bufferSize, - MemoryPool& _pool - ) : pool(_pool), - inputBuffer(pool, bufferSize), - outputBuffer(pool, bufferSize), - state(DECOMPRESS_HEADER), - outputBufferPtr(nullptr), - outputBufferLength(0), - remainingLength(0), - inputBufferPtr(nullptr), - inputBufferPtrEnd(nullptr), - bytesReturned(0) { - input.reset(inStream.release()); - } - - bool BlockDecompressionStream::Next(const void** data, int*size) { - // if the user pushed back, return them the partial buffer - if (outputBufferLength) { - *data = outputBufferPtr; - *size = static_cast<int>(outputBufferLength); - outputBufferPtr += outputBufferLength; - bytesReturned += static_cast<off_t>(outputBufferLength); - outputBufferLength = 0; - return true; - } - if (state == DECOMPRESS_HEADER || remainingLength == 0) { - readHeader(); - } - if (state == DECOMPRESS_EOF) { - return false; - } - if (inputBufferPtr == inputBufferPtrEnd) { - readBuffer(true); - } - - size_t availSize = - std::min(static_cast<size_t>(inputBufferPtrEnd - inputBufferPtr), - remainingLength); - if (state == DECOMPRESS_ORIGINAL) { - *data = inputBufferPtr; - *size = static_cast<int>(availSize); - outputBufferPtr = inputBufferPtr + availSize; - outputBufferLength = 0; - inputBufferPtr += availSize; - remainingLength -= availSize; - } else if (state == DECOMPRESS_START) { - // Get contiguous bytes of compressed block. - const char *compressed = inputBufferPtr; - if (remainingLength == availSize) { - inputBufferPtr += availSize; - } else { - // Did not read enough from input. 
- if (inputBuffer.capacity() < remainingLength) { - inputBuffer.resize(remainingLength); - } - ::memcpy(inputBuffer.data(), inputBufferPtr, availSize); - inputBufferPtr += availSize; - compressed = inputBuffer.data(); - - for (size_t pos = availSize; pos < remainingLength; ) { - readBuffer(true); - size_t avail = - std::min(static_cast<size_t>(inputBufferPtrEnd - - inputBufferPtr), - remainingLength - pos); - ::memcpy(inputBuffer.data() + pos, inputBufferPtr, avail); - pos += avail; - inputBufferPtr += avail; - } - } - - outputBufferLength = decompress(compressed, remainingLength, - outputBuffer.data(), - outputBuffer.capacity()); - - remainingLength = 0; - state = DECOMPRESS_HEADER; - *data = outputBuffer.data(); - *size = static_cast<int>(outputBufferLength); - outputBufferPtr = outputBuffer.data() + outputBufferLength; - outputBufferLength = 0; - } - - bytesReturned += *size; - return true; - } - - void BlockDecompressionStream::BackUp(int count) { - if (outputBufferPtr == nullptr || outputBufferLength != 0) { - throw std::logic_error("Backup without previous Next in "+getName()); - } - outputBufferPtr -= static_cast<size_t>(count); - outputBufferLength = static_cast<size_t>(count); - bytesReturned -= count; - } - - bool BlockDecompressionStream::Skip(int count) { - bytesReturned += count; - // this is a stupid implementation for now. - // should skip entire blocks without decompressing - while (count > 0) { - const void *ptr; - int len; - if (!Next(&ptr, &len)) { - return false; - } - if (len > count) { - BackUp(len - count); - count = 0; - } else { - count -= len; - } - } - return true; - } - - int64_t BlockDecompressionStream::ByteCount() const { - return bytesReturned; - } - - void BlockDecompressionStream::seek(PositionProvider& position) { - // clear state to force seek to read from the right position - state = DECOMPRESS_HEADER; - outputBufferPtr = nullptr; - outputBufferLength = 0; - remainingLength = 0; - inputBufferPtr = nullptr; - inputBufferPtrEnd = nullptr; - - input->seek(position); - if (!Skip(static_cast<int>(position.next()))) { - throw ParseError("Bad skip in " + getName()); - } - } - - class SnappyDecompressionStream: public BlockDecompressionStream { - public: - SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& pool - ): BlockDecompressionStream - (std::move(inStream), - blockSize, - pool) { - // PASS - } - - std::string getName() const override { - std::ostringstream result; - result << "snappy(" << getStreamName() << ")"; - return result.str(); - } - - protected: - virtual uint64_t decompress(const char *input, uint64_t length, - char *output, size_t maxOutputLength - ) override; - }; - - uint64_t SnappyDecompressionStream::decompress(const char *input, - uint64_t length, - char *output, - size_t maxOutputLength) { - size_t outLength; - if (!snappy::GetUncompressedLength(input, length, &outLength)) { - throw ParseError("SnappyDecompressionStream choked on corrupt input"); - } - - if (outLength > maxOutputLength) { - throw std::logic_error("Snappy length exceeds block size"); - } - - if (!snappy::RawUncompress(input, length, output)) { - throw ParseError("SnappyDecompressionStream choked on corrupt input"); - } - return outLength; - } - - class LzoDecompressionStream: public BlockDecompressionStream { - public: - LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& pool - ): BlockDecompressionStream - (std::move(inStream), - blockSize, - pool) { - // PASS - } 
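  // [Editor's note -- annotation added for this review, not part of the diffed
  //  ORC sources] The decompression streams in this region all implement the
  //  SeekableInputStream contract seen above: Next(&data, &size) hands back a
  //  pointer either into the raw input (for "original", i.e. uncompressed,
  //  chunks) or into the codec's output buffer; BackUp(count) pushes back the
  //  unused tail of the last buffer; Skip(count) is currently implemented by
  //  decompressing and discarding (as the in-source comment notes); and
  //  seek(PositionProvider&) repositions the underlying stream and then skips
  //  to the recorded uncompressed offset.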
- - std::string getName() const override { - std::ostringstream result; - result << "lzo(" << getStreamName() << ")"; - return result.str(); - } - - protected: - virtual uint64_t decompress(const char *input, uint64_t length, - char *output, size_t maxOutputLength - ) override; - }; - - uint64_t LzoDecompressionStream::decompress(const char *input, - uint64_t length, - char *output, - size_t maxOutputLength) { - return lzoDecompress(input, input + length, output, - output + maxOutputLength); - } - - class Lz4DecompressionStream: public BlockDecompressionStream { - public: - Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& pool - ): BlockDecompressionStream - (std::move(inStream), - blockSize, - pool) { - // PASS - } - - std::string getName() const override { - std::ostringstream result; - result << "lz4(" << getStreamName() << ")"; - return result.str(); - } - - protected: - virtual uint64_t decompress(const char *input, uint64_t length, - char *output, size_t maxOutputLength - ) override; - }; - - uint64_t Lz4DecompressionStream::decompress(const char *input, - uint64_t length, - char *output, - size_t maxOutputLength) { - int result = LZ4_decompress_safe(input, output, static_cast<int>(length), - static_cast<int>(maxOutputLength)); - if (result < 0) { - throw ParseError(getName() + " - failed to decompress"); - } - return static_cast<uint64_t>(result); - } - - /** - * Block compression base class - */ - class BlockCompressionStream: public CompressionStreamBase { - public: - BlockCompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) - : CompressionStreamBase(outStream, - compressionLevel, - capacity, - blockSize, - pool) - , compressorBuffer(pool) { - // PASS - } - - virtual bool Next(void** data, int*size) override; - virtual std::string getName() const override = 0; - - protected: - // compresses a block and returns the compressed size - virtual uint64_t doBlockCompression() = 0; - - // return maximum possible compression size for allocating space for - // compressorBuffer below - virtual uint64_t estimateMaxCompressionSize() = 0; - - // should allocate max possible compressed size - DataBuffer<unsigned char> compressorBuffer; - }; - - bool BlockCompressionStream::Next(void** data, int*size) { - if (bufferSize != 0) { - ensureHeader(); - - // perform compression - size_t totalCompressedSize = doBlockCompression(); - - const unsigned char * dataToWrite = nullptr; - int totalSizeToWrite = 0; - char * header = outputBuffer + outputPosition - 3; - - if (totalCompressedSize >= static_cast<size_t>(bufferSize)) { - writeHeader(header, static_cast<size_t>(bufferSize), true); - dataToWrite = rawInputBuffer.data(); - totalSizeToWrite = bufferSize; - } else { - writeHeader(header, totalCompressedSize, false); - dataToWrite = compressorBuffer.data(); - totalSizeToWrite = static_cast<int>(totalCompressedSize); - } - - char * dst = header + 3; - while (totalSizeToWrite > 0) { - if (outputPosition == outputSize) { - if (!BufferedOutputStream::Next(reinterpret_cast<void **>(&outputBuffer), - &outputSize)) { - throw std::logic_error( - "Failed to get next output buffer from output stream."); - } - outputPosition = 0; - dst = outputBuffer; - } else if (outputPosition > outputSize) { - // this will unlikely happen, but we have seen a few on zstd v1.1.0 - throw std::logic_error("Write to an out-of-bound place!"); - } - - int sizeToWrite = std::min(totalSizeToWrite, outputSize 
- outputPosition); - std::memcpy(dst, dataToWrite, static_cast<size_t>(sizeToWrite)); - - outputPosition += sizeToWrite; - dataToWrite += sizeToWrite; - totalSizeToWrite -= sizeToWrite; - dst += sizeToWrite; - } - } - - *data = rawInputBuffer.data(); - *size = static_cast<int>(rawInputBuffer.size()); - bufferSize = *size; - compressorBuffer.resize(estimateMaxCompressionSize()); - - return true; - } - - /** - * ZSTD block compression - */ - class ZSTDCompressionStream: public BlockCompressionStream { - public: - ZSTDCompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) - : BlockCompressionStream(outStream, - compressionLevel, - capacity, - blockSize, - pool) { + + if (deflateInit2(&strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) + != Z_OK) { + throw std::runtime_error("Error while calling deflateInit2() for zlib."); + } + } + + void ZlibCompressionStream::end() { + (void)deflateEnd(&strm); + } + +DIAGNOSTIC_PUSH + + enum DecompressState { DECOMPRESS_HEADER, + DECOMPRESS_START, + DECOMPRESS_CONTINUE, + DECOMPRESS_ORIGINAL, + DECOMPRESS_EOF}; + + class ZlibDecompressionStream: public SeekableInputStream { + public: + ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, + size_t blockSize, + MemoryPool& pool); + virtual ~ZlibDecompressionStream() override; + virtual bool Next(const void** data, int*size) override; + virtual void BackUp(int count) override; + virtual bool Skip(int count) override; + virtual int64_t ByteCount() const override; + virtual void seek(PositionProvider& position) override; + virtual std::string getName() const override; + + private: + void readBuffer(bool failOnEof) { + int length; + if (!input->Next(reinterpret_cast<const void**>(&inputBuffer), + &length)) { + if (failOnEof) { + throw ParseError("Read past EOF in " + "ZlibDecompressionStream::readBuffer"); + } + state = DECOMPRESS_EOF; + inputBuffer = nullptr; + inputBufferEnd = nullptr; + } else { + inputBufferEnd = inputBuffer + length; + } + } + + uint32_t readByte(bool failOnEof) { + if (inputBuffer == inputBufferEnd) { + readBuffer(failOnEof); + if (state == DECOMPRESS_EOF) { + return 0; + } + } + return static_cast<unsigned char>(*(inputBuffer++)); + } + + void readHeader() { + uint32_t header = readByte(false); + if (state != DECOMPRESS_EOF) { + header |= readByte(true) << 8; + header |= readByte(true) << 16; + if (header & 1) { + state = DECOMPRESS_ORIGINAL; + } else { + state = DECOMPRESS_START; + } + remainingLength = header >> 1; + } else { + remainingLength = 0; + } + } + + MemoryPool& pool; + const size_t blockSize; + std::unique_ptr<SeekableInputStream> input; + z_stream zstream; + DataBuffer<char> buffer; + + // the current state + DecompressState state; + + // the start of the current buffer + // This pointer is not owned by us. It is either owned by zstream or + // the underlying stream. 
+ const char* outputBuffer; + // the size of the current buffer + size_t outputBufferLength; + // the size of the current chunk + size_t remainingLength; + + // the last buffer returned from the input + const char *inputBuffer; + const char *inputBufferEnd; + + // roughly the number of bytes returned + off_t bytesReturned; + }; + +DIAGNOSTIC_PUSH + +#if defined(__GNUC__) || defined(__clang__) + DIAGNOSTIC_IGNORE("-Wold-style-cast") +#endif + + ZlibDecompressionStream::ZlibDecompressionStream + (std::unique_ptr<SeekableInputStream> inStream, + size_t _blockSize, + MemoryPool& _pool + ): pool(_pool), + blockSize(_blockSize), + buffer(pool, _blockSize) { + input.reset(inStream.release()); + zstream.next_in = nullptr; + zstream.avail_in = 0; + zstream.zalloc = nullptr; + zstream.zfree = nullptr; + zstream.opaque = nullptr; + zstream.next_out = reinterpret_cast<Bytef*>(buffer.data()); + zstream.avail_out = static_cast<uInt>(blockSize); + int64_t result = inflateInit2(&zstream, -15); + switch (result) { + case Z_OK: + break; + case Z_MEM_ERROR: + throw std::logic_error("Memory error from inflateInit2"); + case Z_VERSION_ERROR: + throw std::logic_error("Version error from inflateInit2"); + case Z_STREAM_ERROR: + throw std::logic_error("Stream error from inflateInit2"); + default: + throw std::logic_error("Unknown error from inflateInit2"); + } + outputBuffer = nullptr; + outputBufferLength = 0; + remainingLength = 0; + state = DECOMPRESS_HEADER; + inputBuffer = nullptr; + inputBufferEnd = nullptr; + bytesReturned = 0; + } + +DIAGNOSTIC_POP + + ZlibDecompressionStream::~ZlibDecompressionStream() { + int64_t result = inflateEnd(&zstream); + if (result != Z_OK) { + // really can't throw in destructors + std::cout << "Error in ~ZlibDecompressionStream() " << result << "\n"; + } + } + + bool ZlibDecompressionStream::Next(const void** data, int*size) { + // if the user pushed back, return them the partial buffer + if (outputBufferLength) { + *data = outputBuffer; + *size = static_cast<int>(outputBufferLength); + outputBuffer += outputBufferLength; + outputBufferLength = 0; + return true; + } + if (state == DECOMPRESS_HEADER || remainingLength == 0) { + readHeader(); + } + if (state == DECOMPRESS_EOF) { + return false; + } + if (inputBuffer == inputBufferEnd) { + readBuffer(true); + } + size_t availSize = + std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), + remainingLength); + if (state == DECOMPRESS_ORIGINAL) { + *data = inputBuffer; + *size = static_cast<int>(availSize); + outputBuffer = inputBuffer + availSize; + outputBufferLength = 0; + } else if (state == DECOMPRESS_START) { + zstream.next_in = + reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); + zstream.avail_in = static_cast<uInt>(availSize); + outputBuffer = buffer.data(); + zstream.next_out = + reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer)); + zstream.avail_out = static_cast<uInt>(blockSize); + if (inflateReset(&zstream) != Z_OK) { + throw std::logic_error("Bad inflateReset in " + "ZlibDecompressionStream::Next"); + } + int64_t result; + do { + result = inflate(&zstream, availSize == remainingLength ? 
Z_FINISH : + Z_SYNC_FLUSH); + switch (result) { + case Z_OK: + remainingLength -= availSize; + inputBuffer += availSize; + readBuffer(true); + availSize = + std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), + remainingLength); + zstream.next_in = + reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); + zstream.avail_in = static_cast<uInt>(availSize); + break; + case Z_STREAM_END: + break; + case Z_BUF_ERROR: + throw std::logic_error("Buffer error in " + "ZlibDecompressionStream::Next"); + case Z_DATA_ERROR: + throw std::logic_error("Data error in " + "ZlibDecompressionStream::Next"); + case Z_STREAM_ERROR: + throw std::logic_error("Stream error in " + "ZlibDecompressionStream::Next"); + default: + throw std::logic_error("Unknown error in " + "ZlibDecompressionStream::Next"); + } + } while (result != Z_STREAM_END); + *size = static_cast<int>(blockSize - zstream.avail_out); + *data = outputBuffer; + outputBufferLength = 0; + outputBuffer += *size; + } else { + throw std::logic_error("Unknown compression state in " + "ZlibDecompressionStream::Next"); + } + inputBuffer += availSize; + remainingLength -= availSize; + bytesReturned += *size; + return true; + } + + void ZlibDecompressionStream::BackUp(int count) { + if (outputBuffer == nullptr || outputBufferLength != 0) { + throw std::logic_error("Backup without previous Next in " + "ZlibDecompressionStream"); + } + outputBuffer -= static_cast<size_t>(count); + outputBufferLength = static_cast<size_t>(count); + bytesReturned -= count; + } + + bool ZlibDecompressionStream::Skip(int count) { + bytesReturned += count; + // this is a stupid implementation for now. + // should skip entire blocks without decompressing + while (count > 0) { + const void *ptr; + int len; + if (!Next(&ptr, &len)) { + return false; + } + if (len > count) { + BackUp(len - count); + count = 0; + } else { + count -= len; + } + } + return true; + } + + int64_t ZlibDecompressionStream::ByteCount() const { + return bytesReturned; + } + + void ZlibDecompressionStream::seek(PositionProvider& position) { + // clear state to force seek to read from the right position + state = DECOMPRESS_HEADER; + outputBuffer = nullptr; + outputBufferLength = 0; + remainingLength = 0; + inputBuffer = nullptr; + inputBufferEnd = nullptr; + + input->seek(position); + bytesReturned = static_cast<off_t>(input->ByteCount()); + if (!Skip(static_cast<int>(position.next()))) { + throw ParseError("Bad skip in ZlibDecompressionStream::seek"); + } + } + + std::string ZlibDecompressionStream::getName() const { + std::ostringstream result; + result << "zlib(" << input->getName() << ")"; + return result.str(); + } + + class BlockDecompressionStream: public SeekableInputStream { + public: + BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, + size_t blockSize, + MemoryPool& pool); + + virtual ~BlockDecompressionStream() override {} + virtual bool Next(const void** data, int*size) override; + virtual void BackUp(int count) override; + virtual bool Skip(int count) override; + virtual int64_t ByteCount() const override; + virtual void seek(PositionProvider& position) override; + virtual std::string getName() const override = 0; + + protected: + virtual uint64_t decompress(const char *input, uint64_t length, + char *output, size_t maxOutputLength) = 0; + + std::string getStreamName() const { + return input->getName(); + } + + private: + void readBuffer(bool failOnEof) { + int length; + if (!input->Next(reinterpret_cast<const void**>(&inputBufferPtr), + &length)) { + if (failOnEof) 
{ + throw ParseError(getName() + "read past EOF"); + } + state = DECOMPRESS_EOF; + inputBufferPtr = nullptr; + inputBufferPtrEnd = nullptr; + } else { + inputBufferPtrEnd = inputBufferPtr + length; + } + } + + uint32_t readByte(bool failOnEof) { + if (inputBufferPtr == inputBufferPtrEnd) { + readBuffer(failOnEof); + if (state == DECOMPRESS_EOF) { + return 0; + } + } + return static_cast<unsigned char>(*(inputBufferPtr++)); + } + + void readHeader() { + uint32_t header = readByte(false); + if (state != DECOMPRESS_EOF) { + header |= readByte(true) << 8; + header |= readByte(true) << 16; + if (header & 1) { + state = DECOMPRESS_ORIGINAL; + } else { + state = DECOMPRESS_START; + } + remainingLength = header >> 1; + } else { + remainingLength = 0; + } + } + + std::unique_ptr<SeekableInputStream> input; + MemoryPool& pool; + + // may need to stitch together multiple input buffers; + // to give snappy a contiguous block + DataBuffer<char> inputBuffer; + + // uncompressed output + DataBuffer<char> outputBuffer; + + // the current state + DecompressState state; + + // the start of the current output buffer + const char* outputBufferPtr; + // the size of the current output buffer + size_t outputBufferLength; + + // the size of the current chunk + size_t remainingLength; + + // the last buffer returned from the input + const char *inputBufferPtr; + const char *inputBufferPtrEnd; + + // bytes returned by this stream + off_t bytesReturned; + }; + + BlockDecompressionStream::BlockDecompressionStream + (std::unique_ptr<SeekableInputStream> inStream, + size_t bufferSize, + MemoryPool& _pool + ) : pool(_pool), + inputBuffer(pool, bufferSize), + outputBuffer(pool, bufferSize), + state(DECOMPRESS_HEADER), + outputBufferPtr(nullptr), + outputBufferLength(0), + remainingLength(0), + inputBufferPtr(nullptr), + inputBufferPtrEnd(nullptr), + bytesReturned(0) { + input.reset(inStream.release()); + } + + bool BlockDecompressionStream::Next(const void** data, int*size) { + // if the user pushed back, return them the partial buffer + if (outputBufferLength) { + *data = outputBufferPtr; + *size = static_cast<int>(outputBufferLength); + outputBufferPtr += outputBufferLength; + bytesReturned += static_cast<off_t>(outputBufferLength); + outputBufferLength = 0; + return true; + } + if (state == DECOMPRESS_HEADER || remainingLength == 0) { + readHeader(); + } + if (state == DECOMPRESS_EOF) { + return false; + } + if (inputBufferPtr == inputBufferPtrEnd) { + readBuffer(true); + } + + size_t availSize = + std::min(static_cast<size_t>(inputBufferPtrEnd - inputBufferPtr), + remainingLength); + if (state == DECOMPRESS_ORIGINAL) { + *data = inputBufferPtr; + *size = static_cast<int>(availSize); + outputBufferPtr = inputBufferPtr + availSize; + outputBufferLength = 0; + inputBufferPtr += availSize; + remainingLength -= availSize; + } else if (state == DECOMPRESS_START) { + // Get contiguous bytes of compressed block. + const char *compressed = inputBufferPtr; + if (remainingLength == availSize) { + inputBufferPtr += availSize; + } else { + // Did not read enough from input. 
+ if (inputBuffer.capacity() < remainingLength) { + inputBuffer.resize(remainingLength); + } + ::memcpy(inputBuffer.data(), inputBufferPtr, availSize); + inputBufferPtr += availSize; + compressed = inputBuffer.data(); + + for (size_t pos = availSize; pos < remainingLength; ) { + readBuffer(true); + size_t avail = + std::min(static_cast<size_t>(inputBufferPtrEnd - + inputBufferPtr), + remainingLength - pos); + ::memcpy(inputBuffer.data() + pos, inputBufferPtr, avail); + pos += avail; + inputBufferPtr += avail; + } + } + + outputBufferLength = decompress(compressed, remainingLength, + outputBuffer.data(), + outputBuffer.capacity()); + + remainingLength = 0; + state = DECOMPRESS_HEADER; + *data = outputBuffer.data(); + *size = static_cast<int>(outputBufferLength); + outputBufferPtr = outputBuffer.data() + outputBufferLength; + outputBufferLength = 0; + } + + bytesReturned += *size; + return true; + } + + void BlockDecompressionStream::BackUp(int count) { + if (outputBufferPtr == nullptr || outputBufferLength != 0) { + throw std::logic_error("Backup without previous Next in "+getName()); + } + outputBufferPtr -= static_cast<size_t>(count); + outputBufferLength = static_cast<size_t>(count); + bytesReturned -= count; + } + + bool BlockDecompressionStream::Skip(int count) { + bytesReturned += count; + // this is a stupid implementation for now. + // should skip entire blocks without decompressing + while (count > 0) { + const void *ptr; + int len; + if (!Next(&ptr, &len)) { + return false; + } + if (len > count) { + BackUp(len - count); + count = 0; + } else { + count -= len; + } + } + return true; + } + + int64_t BlockDecompressionStream::ByteCount() const { + return bytesReturned; + } + + void BlockDecompressionStream::seek(PositionProvider& position) { + // clear state to force seek to read from the right position + state = DECOMPRESS_HEADER; + outputBufferPtr = nullptr; + outputBufferLength = 0; + remainingLength = 0; + inputBufferPtr = nullptr; + inputBufferPtrEnd = nullptr; + + input->seek(position); + if (!Skip(static_cast<int>(position.next()))) { + throw ParseError("Bad skip in " + getName()); + } + } + + class SnappyDecompressionStream: public BlockDecompressionStream { + public: + SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, + size_t blockSize, + MemoryPool& pool + ): BlockDecompressionStream + (std::move(inStream), + blockSize, + pool) { + // PASS + } + + std::string getName() const override { + std::ostringstream result; + result << "snappy(" << getStreamName() << ")"; + return result.str(); + } + + protected: + virtual uint64_t decompress(const char *input, uint64_t length, + char *output, size_t maxOutputLength + ) override; + }; + + uint64_t SnappyDecompressionStream::decompress(const char *input, + uint64_t length, + char *output, + size_t maxOutputLength) { + size_t outLength; + if (!snappy::GetUncompressedLength(input, length, &outLength)) { + throw ParseError("SnappyDecompressionStream choked on corrupt input"); + } + + if (outLength > maxOutputLength) { + throw std::logic_error("Snappy length exceeds block size"); + } + + if (!snappy::RawUncompress(input, length, output)) { + throw ParseError("SnappyDecompressionStream choked on corrupt input"); + } + return outLength; + } + + class LzoDecompressionStream: public BlockDecompressionStream { + public: + LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, + size_t blockSize, + MemoryPool& pool + ): BlockDecompressionStream + (std::move(inStream), + blockSize, + pool) { + // PASS + } 
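  // [Editor's note -- illustrative sketch only, not part of the ORC sources]
  // Every compressed stream in this file is framed into chunks that begin with
  // the 3-byte header emitted by writeHeader() and parsed by readHeader():
  // bit 0 is the "original" flag (chunk stored uncompressed) and bits 1..23
  // hold the chunk length, stored little-endian. The hypothetical helpers
  // below restate that layout in isolation; they assume <cstdint> is available,
  // as it already is in this translation unit.

  static inline void encodeChunkHeader(unsigned char* buf,
                                        uint32_t length,
                                        bool original) {
    const uint32_t header = (length << 1) | (original ? 1u : 0u);
    buf[0] = static_cast<unsigned char>(header);        // low byte, includes the flag bit
    buf[1] = static_cast<unsigned char>(header >> 8);   // equals length >> 7
    buf[2] = static_cast<unsigned char>(header >> 16);  // equals length >> 15
  }

  static inline void decodeChunkHeader(const unsigned char* buf,
                                        uint32_t& length,
                                        bool& original) {
    const uint32_t header = static_cast<uint32_t>(buf[0]) |
                            (static_cast<uint32_t>(buf[1]) << 8) |
                            (static_cast<uint32_t>(buf[2]) << 16);
    original = (header & 1) != 0;
    length = header >> 1;
  }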
+ + std::string getName() const override { + std::ostringstream result; + result << "lzo(" << getStreamName() << ")"; + return result.str(); + } + + protected: + virtual uint64_t decompress(const char *input, uint64_t length, + char *output, size_t maxOutputLength + ) override; + }; + + uint64_t LzoDecompressionStream::decompress(const char *input, + uint64_t length, + char *output, + size_t maxOutputLength) { + return lzoDecompress(input, input + length, output, + output + maxOutputLength); + } + + class Lz4DecompressionStream: public BlockDecompressionStream { + public: + Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, + size_t blockSize, + MemoryPool& pool + ): BlockDecompressionStream + (std::move(inStream), + blockSize, + pool) { + // PASS + } + + std::string getName() const override { + std::ostringstream result; + result << "lz4(" << getStreamName() << ")"; + return result.str(); + } + + protected: + virtual uint64_t decompress(const char *input, uint64_t length, + char *output, size_t maxOutputLength + ) override; + }; + + uint64_t Lz4DecompressionStream::decompress(const char *input, + uint64_t length, + char *output, + size_t maxOutputLength) { + int result = LZ4_decompress_safe(input, output, static_cast<int>(length), + static_cast<int>(maxOutputLength)); + if (result < 0) { + throw ParseError(getName() + " - failed to decompress"); + } + return static_cast<uint64_t>(result); + } + + /** + * Block compression base class + */ + class BlockCompressionStream: public CompressionStreamBase { + public: + BlockCompressionStream(OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool) + : CompressionStreamBase(outStream, + compressionLevel, + capacity, + blockSize, + pool) + , compressorBuffer(pool) { + // PASS + } + + virtual bool Next(void** data, int*size) override; + virtual std::string getName() const override = 0; + + protected: + // compresses a block and returns the compressed size + virtual uint64_t doBlockCompression() = 0; + + // return maximum possible compression size for allocating space for + // compressorBuffer below + virtual uint64_t estimateMaxCompressionSize() = 0; + + // should allocate max possible compressed size + DataBuffer<unsigned char> compressorBuffer; + }; + + bool BlockCompressionStream::Next(void** data, int*size) { + if (bufferSize != 0) { + ensureHeader(); + + // perform compression + size_t totalCompressedSize = doBlockCompression(); + + const unsigned char * dataToWrite = nullptr; + int totalSizeToWrite = 0; + char * header = outputBuffer + outputPosition - 3; + + if (totalCompressedSize >= static_cast<size_t>(bufferSize)) { + writeHeader(header, static_cast<size_t>(bufferSize), true); + dataToWrite = rawInputBuffer.data(); + totalSizeToWrite = bufferSize; + } else { + writeHeader(header, totalCompressedSize, false); + dataToWrite = compressorBuffer.data(); + totalSizeToWrite = static_cast<int>(totalCompressedSize); + } + + char * dst = header + 3; + while (totalSizeToWrite > 0) { + if (outputPosition == outputSize) { + if (!BufferedOutputStream::Next(reinterpret_cast<void **>(&outputBuffer), + &outputSize)) { + throw std::logic_error( + "Failed to get next output buffer from output stream."); + } + outputPosition = 0; + dst = outputBuffer; + } else if (outputPosition > outputSize) { + // this will unlikely happen, but we have seen a few on zstd v1.1.0 + throw std::logic_error("Write to an out-of-bound place!"); + } + + int sizeToWrite = std::min(totalSizeToWrite, outputSize 
- outputPosition); + std::memcpy(dst, dataToWrite, static_cast<size_t>(sizeToWrite)); + + outputPosition += sizeToWrite; + dataToWrite += sizeToWrite; + totalSizeToWrite -= sizeToWrite; + dst += sizeToWrite; + } + } + + *data = rawInputBuffer.data(); + *size = static_cast<int>(rawInputBuffer.size()); + bufferSize = *size; + compressorBuffer.resize(estimateMaxCompressionSize()); + + return true; + } + + /** + * ZSTD block compression + */ + class ZSTDCompressionStream: public BlockCompressionStream { + public: + ZSTDCompressionStream(OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool) + : BlockCompressionStream(outStream, + compressionLevel, + capacity, + blockSize, + pool) { this->init(); - } - - virtual std::string getName() const override { - return "ZstdCompressionStream"; - } + } + + virtual std::string getName() const override { + return "ZstdCompressionStream"; + } virtual ~ZSTDCompressionStream() override { this->end(); } - - protected: - virtual uint64_t doBlockCompression() override; - - virtual uint64_t estimateMaxCompressionSize() override { - return ZSTD_compressBound(static_cast<size_t>(bufferSize)); - } + + protected: + virtual uint64_t doBlockCompression() override; + + virtual uint64_t estimateMaxCompressionSize() override { + return ZSTD_compressBound(static_cast<size_t>(bufferSize)); + } private: void init(); void end(); ZSTD_CCtx *cctx; - }; - - uint64_t ZSTDCompressionStream::doBlockCompression() { + }; + + uint64_t ZSTDCompressionStream::doBlockCompression() { return ZSTD_compressCCtx(cctx, compressorBuffer.data(), compressorBuffer.size(), rawInputBuffer.data(), static_cast<size_t>(bufferSize), level); - } + } DIAGNOSTIC_PUSH - + #if defined(__GNUC__) || defined(__clang__) DIAGNOSTIC_IGNORE("-Wold-style-cast") #endif @@ -1086,53 +1086,53 @@ DIAGNOSTIC_PUSH DIAGNOSTIC_PUSH - /** - * ZSTD block decompression - */ - class ZSTDDecompressionStream: public BlockDecompressionStream { - public: - ZSTDDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& pool) - : BlockDecompressionStream(std::move(inStream), - blockSize, - pool) { + /** + * ZSTD block decompression + */ + class ZSTDDecompressionStream: public BlockDecompressionStream { + public: + ZSTDDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, + size_t blockSize, + MemoryPool& pool) + : BlockDecompressionStream(std::move(inStream), + blockSize, + pool) { this->init(); - } - + } + virtual ~ZSTDDecompressionStream() override { this->end(); } - std::string getName() const override { - std::ostringstream result; - result << "zstd(" << getStreamName() << ")"; - return result.str(); - } - - protected: - virtual uint64_t decompress(const char *input, - uint64_t length, - char *output, - size_t maxOutputLength) override; + std::string getName() const override { + std::ostringstream result; + result << "zstd(" << getStreamName() << ")"; + return result.str(); + } + + protected: + virtual uint64_t decompress(const char *input, + uint64_t length, + char *output, + size_t maxOutputLength) override; private: void init(); void end(); ZSTD_DCtx *dctx; - }; - - uint64_t ZSTDDecompressionStream::decompress(const char *input, - uint64_t length, - char *output, - size_t maxOutputLength) { + }; + + uint64_t ZSTDDecompressionStream::decompress(const char *input, + uint64_t length, + char *output, + size_t maxOutputLength) { return static_cast<uint64_t>(ZSTD_decompressDCtx(dctx, output, maxOutputLength, input, 
length)); - } - + } + DIAGNOSTIC_PUSH #if defined(__GNUC__) || defined(__clang__) @@ -1155,71 +1155,71 @@ DIAGNOSTIC_PUSH DIAGNOSTIC_PUSH - std::unique_ptr<BufferedOutputStream> - createCompressor( - CompressionKind kind, - OutputStream * outStream, - CompressionStrategy strategy, - uint64_t bufferCapacity, - uint64_t compressionBlockSize, - MemoryPool& pool) { - switch (static_cast<int64_t>(kind)) { - case CompressionKind_NONE: { - return std::unique_ptr<BufferedOutputStream> - (new BufferedOutputStream( - pool, outStream, bufferCapacity, compressionBlockSize)); - } - case CompressionKind_ZLIB: { - int level = (strategy == CompressionStrategy_SPEED) ? - Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION; - return std::unique_ptr<BufferedOutputStream> - (new ZlibCompressionStream( - outStream, level, bufferCapacity, compressionBlockSize, pool)); - } - case CompressionKind_ZSTD: { - int level = (strategy == CompressionStrategy_SPEED) ? - 1 : ZSTD_CLEVEL_DEFAULT; - return std::unique_ptr<BufferedOutputStream> - (new ZSTDCompressionStream( - outStream, level, bufferCapacity, compressionBlockSize, pool)); - } - case CompressionKind_SNAPPY: - case CompressionKind_LZO: - case CompressionKind_LZ4: - default: - throw NotImplementedYet("compression codec"); - } - } - - std::unique_ptr<SeekableInputStream> - createDecompressor(CompressionKind kind, - std::unique_ptr<SeekableInputStream> input, - uint64_t blockSize, - MemoryPool& pool) { - switch (static_cast<int64_t>(kind)) { - case CompressionKind_NONE: - return REDUNDANT_MOVE(input); - case CompressionKind_ZLIB: - return std::unique_ptr<SeekableInputStream> - (new ZlibDecompressionStream(std::move(input), blockSize, pool)); - case CompressionKind_SNAPPY: - return std::unique_ptr<SeekableInputStream> - (new SnappyDecompressionStream(std::move(input), blockSize, pool)); - case CompressionKind_LZO: - return std::unique_ptr<SeekableInputStream> - (new LzoDecompressionStream(std::move(input), blockSize, pool)); - case CompressionKind_LZ4: - return std::unique_ptr<SeekableInputStream> - (new Lz4DecompressionStream(std::move(input), blockSize, pool)); - case CompressionKind_ZSTD: - return std::unique_ptr<SeekableInputStream> - (new ZSTDDecompressionStream(std::move(input), blockSize, pool)); - default: { - std::ostringstream buffer; - buffer << "Unknown compression codec " << kind; - throw NotImplementedYet(buffer.str()); - } - } - } - -} + std::unique_ptr<BufferedOutputStream> + createCompressor( + CompressionKind kind, + OutputStream * outStream, + CompressionStrategy strategy, + uint64_t bufferCapacity, + uint64_t compressionBlockSize, + MemoryPool& pool) { + switch (static_cast<int64_t>(kind)) { + case CompressionKind_NONE: { + return std::unique_ptr<BufferedOutputStream> + (new BufferedOutputStream( + pool, outStream, bufferCapacity, compressionBlockSize)); + } + case CompressionKind_ZLIB: { + int level = (strategy == CompressionStrategy_SPEED) ? + Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION; + return std::unique_ptr<BufferedOutputStream> + (new ZlibCompressionStream( + outStream, level, bufferCapacity, compressionBlockSize, pool)); + } + case CompressionKind_ZSTD: { + int level = (strategy == CompressionStrategy_SPEED) ? 
+ 1 : ZSTD_CLEVEL_DEFAULT; + return std::unique_ptr<BufferedOutputStream> + (new ZSTDCompressionStream( + outStream, level, bufferCapacity, compressionBlockSize, pool)); + } + case CompressionKind_SNAPPY: + case CompressionKind_LZO: + case CompressionKind_LZ4: + default: + throw NotImplementedYet("compression codec"); + } + } + + std::unique_ptr<SeekableInputStream> + createDecompressor(CompressionKind kind, + std::unique_ptr<SeekableInputStream> input, + uint64_t blockSize, + MemoryPool& pool) { + switch (static_cast<int64_t>(kind)) { + case CompressionKind_NONE: + return REDUNDANT_MOVE(input); + case CompressionKind_ZLIB: + return std::unique_ptr<SeekableInputStream> + (new ZlibDecompressionStream(std::move(input), blockSize, pool)); + case CompressionKind_SNAPPY: + return std::unique_ptr<SeekableInputStream> + (new SnappyDecompressionStream(std::move(input), blockSize, pool)); + case CompressionKind_LZO: + return std::unique_ptr<SeekableInputStream> + (new LzoDecompressionStream(std::move(input), blockSize, pool)); + case CompressionKind_LZ4: + return std::unique_ptr<SeekableInputStream> + (new Lz4DecompressionStream(std::move(input), blockSize, pool)); + case CompressionKind_ZSTD: + return std::unique_ptr<SeekableInputStream> + (new ZSTDDecompressionStream(std::move(input), blockSize, pool)); + default: { + std::ostringstream buffer; + buffer << "Unknown compression codec " << kind; + throw NotImplementedYet(buffer.str()); + } + } + } + +} diff --git a/contrib/libs/apache/orc/c++/src/Compression.hh b/contrib/libs/apache/orc/c++/src/Compression.hh index ff79377d83..84e85bddaf 100644 --- a/contrib/libs/apache/orc/c++/src/Compression.hh +++ b/contrib/libs/apache/orc/c++/src/Compression.hh @@ -1,58 +1,58 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_COMPRESSION_HH -#define ORC_COMPRESSION_HH - -#include "io/InputStream.hh" -#include "io/OutputStream.hh" - -namespace orc { - - /** - * Create a decompressor for the given compression kind. - * @param kind the compression type to implement - * @param input the input stream that is the underlying source - * @param bufferSize the maximum size of the buffer - * @param pool the memory pool - */ - std::unique_ptr<SeekableInputStream> - createDecompressor(CompressionKind kind, - std::unique_ptr<SeekableInputStream> input, - uint64_t bufferSize, - MemoryPool& pool); - - /** - * Create a compressor for the given compression kind. 
- * @param kind the compression type to implement - * @param outStream the output stream that is the underlying target - * @param strategy compression strategy - * @param bufferCapacity compression stream buffer total capacity - * @param compressionBlockSize compression buffer block size - * @param pool the memory pool - */ - std::unique_ptr<BufferedOutputStream> - createCompressor(CompressionKind kind, - OutputStream * outStream, - CompressionStrategy strategy, - uint64_t bufferCapacity, - uint64_t compressionBlockSize, - MemoryPool& pool); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_COMPRESSION_HH +#define ORC_COMPRESSION_HH + +#include "io/InputStream.hh" +#include "io/OutputStream.hh" + +namespace orc { + + /** + * Create a decompressor for the given compression kind. + * @param kind the compression type to implement + * @param input the input stream that is the underlying source + * @param bufferSize the maximum size of the buffer + * @param pool the memory pool + */ + std::unique_ptr<SeekableInputStream> + createDecompressor(CompressionKind kind, + std::unique_ptr<SeekableInputStream> input, + uint64_t bufferSize, + MemoryPool& pool); + + /** + * Create a compressor for the given compression kind. + * @param kind the compression type to implement + * @param outStream the output stream that is the underlying target + * @param strategy compression strategy + * @param bufferCapacity compression stream buffer total capacity + * @param compressionBlockSize compression buffer block size + * @param pool the memory pool + */ + std::unique_ptr<BufferedOutputStream> + createCompressor(CompressionKind kind, + OutputStream * outStream, + CompressionStrategy strategy, + uint64_t bufferCapacity, + uint64_t compressionBlockSize, + MemoryPool& pool); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/Exceptions.cc b/contrib/libs/apache/orc/c++/src/Exceptions.cc index 2077b27df4..f721c05a88 100644 --- a/contrib/libs/apache/orc/c++/src/Exceptions.cc +++ b/contrib/libs/apache/orc/c++/src/Exceptions.cc @@ -1,78 +1,78 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "orc/Exceptions.hh" - -namespace orc { - - NotImplementedYet::NotImplementedYet(const std::string& what_arg - ) : logic_error(what_arg) { - // PASS - } - - NotImplementedYet::NotImplementedYet(const char* what_arg - ) :logic_error(what_arg) { - // PASS - } - - NotImplementedYet::NotImplementedYet(const NotImplementedYet& error - ): logic_error(error) { - // PASS - } - - NotImplementedYet::~NotImplementedYet() ORC_NOEXCEPT { - // PASS - } - - ParseError::ParseError(const std::string& what_arg - ): runtime_error(what_arg) { - // PASS - } - - ParseError::ParseError(const char* what_arg - ): runtime_error(what_arg) { - // PASS - } - - ParseError::ParseError(const ParseError& error): runtime_error(error) { - // PASS - } - - ParseError::~ParseError() ORC_NOEXCEPT { - // PASS - } - - InvalidArgument::InvalidArgument(const std::string& what_arg - ): runtime_error(what_arg) { - // PASS - } - - InvalidArgument::InvalidArgument(const char* what_arg - ): runtime_error(what_arg) { - // PASS - } - - InvalidArgument::InvalidArgument(const InvalidArgument& error - ): runtime_error(error) { - // PASS - } - - InvalidArgument::~InvalidArgument() ORC_NOEXCEPT { - // PASS - } -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Exceptions.hh" + +namespace orc { + + NotImplementedYet::NotImplementedYet(const std::string& what_arg + ) : logic_error(what_arg) { + // PASS + } + + NotImplementedYet::NotImplementedYet(const char* what_arg + ) :logic_error(what_arg) { + // PASS + } + + NotImplementedYet::NotImplementedYet(const NotImplementedYet& error + ): logic_error(error) { + // PASS + } + + NotImplementedYet::~NotImplementedYet() ORC_NOEXCEPT { + // PASS + } + + ParseError::ParseError(const std::string& what_arg + ): runtime_error(what_arg) { + // PASS + } + + ParseError::ParseError(const char* what_arg + ): runtime_error(what_arg) { + // PASS + } + + ParseError::ParseError(const ParseError& error): runtime_error(error) { + // PASS + } + + ParseError::~ParseError() ORC_NOEXCEPT { + // PASS + } + + InvalidArgument::InvalidArgument(const std::string& what_arg + ): runtime_error(what_arg) { + // PASS + } + + InvalidArgument::InvalidArgument(const char* what_arg + ): runtime_error(what_arg) { + // PASS + } + + InvalidArgument::InvalidArgument(const InvalidArgument& error + ): runtime_error(error) { + // PASS + } + + InvalidArgument::~InvalidArgument() ORC_NOEXCEPT { + // PASS + } +} diff --git a/contrib/libs/apache/orc/c++/src/Int128.cc b/contrib/libs/apache/orc/c++/src/Int128.cc index 433e6fa193..96266e855c 100644 --- a/contrib/libs/apache/orc/c++/src/Int128.cc +++ b/contrib/libs/apache/orc/c++/src/Int128.cc @@ -1,494 +1,494 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "orc/Int128.hh" -#include "Adaptor.hh" - -#include <algorithm> -#include <iomanip> -#include <iostream> -#include <sstream> - -namespace orc { - - Int128 Int128::maximumValue() { - return Int128(0x7fffffffffffffff, 0xfffffffffffffff); - } - - Int128 Int128::minimumValue() { - return Int128(static_cast<int64_t>(0x8000000000000000), 0x0); - } - - Int128::Int128(const std::string& str) { - lowbits = 0; - highbits = 0; - size_t length = str.length(); - if (length > 0) { - bool isNegative = str[0] == '-'; - size_t posn = isNegative ? 1 : 0; - while (posn < length) { - size_t group = std::min(static_cast<size_t>(18), length - posn); - int64_t chunk = std::stoll(str.substr(posn, group)); - int64_t multiple = 1; - for(size_t i=0; i < group; ++i) { - multiple *= 10; - } - *this *= multiple; - *this += chunk; - posn += group; - } - if (isNegative) { - negate(); - } - } - } - - Int128& Int128::operator*=(const Int128 &right) { - const uint64_t INT_MASK = 0xffffffff; - const uint64_t CARRY_BIT = INT_MASK + 1; - - // Break the left and right numbers into 32 bit chunks - // so that we can multiply them without overflow. 
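The multiplication above relies on splitting each 64-bit half into 32-bit limbs so that no partial product can overflow. For reference, a minimal self-contained sketch of the same idea applied to a plain 64x64 -> 128-bit multiply; mul64To128 and its (high, low) return convention are illustrative names, not part of the ORC sources.

#include <cstdint>
#include <utility>

// Sketch of the 32-bit-chunk idea: multiply two unsigned 64-bit values into a
// 128-bit result, returned as a (high, low) pair, without overflowing any
// intermediate 64-bit product.
inline std::pair<uint64_t, uint64_t> mul64To128(uint64_t a, uint64_t b) {
  const uint64_t MASK = 0xffffffff;
  uint64_t aHi = a >> 32, aLo = a & MASK;
  uint64_t bHi = b >> 32, bLo = b & MASK;

  uint64_t lo   = aLo * bLo;   // contributes to bits [0, 64)
  uint64_t mid1 = aHi * bLo;   // contributes to bits [32, 96)
  uint64_t mid2 = aLo * bHi;   // contributes to bits [32, 96)
  uint64_t hi   = aHi * bHi;   // contributes to bits [64, 128)

  // Sum everything that lands in bits [32, 64) and keep the carry.
  uint64_t cross  = (lo >> 32) + (mid1 & MASK) + (mid2 & MASK);
  uint64_t low64  = (cross << 32) | (lo & MASK);
  uint64_t high64 = hi + (mid1 >> 32) + (mid2 >> 32) + (cross >> 32);
  return {high64, low64};
}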
- uint64_t L0 = static_cast<uint64_t>(highbits) >> 32; - uint64_t L1 = static_cast<uint64_t>(highbits) & INT_MASK; - uint64_t L2 = lowbits >> 32; - uint64_t L3 = lowbits & INT_MASK; - uint64_t R0 = static_cast<uint64_t>(right.highbits) >> 32; - uint64_t R1 = static_cast<uint64_t>(right.highbits) & INT_MASK; - uint64_t R2 = right.lowbits >> 32; - uint64_t R3 = right.lowbits & INT_MASK; - - uint64_t product = L3 * R3; - lowbits = product & INT_MASK; - uint64_t sum = product >> 32; - product = L2 * R3; - sum += product; - highbits = sum < product ? CARRY_BIT : 0; - product = L3 * R2; - sum += product; - if (sum < product) { - highbits += CARRY_BIT; - } - lowbits += sum << 32; - highbits += static_cast<int64_t>(sum >> 32); - highbits += L1 * R3 + L2 * R2 + L3 * R1; - highbits += (L0 * R3 + L1 * R2 + L2 * R1 + L3 * R0) << 32; - return *this; - } - - /** - * Expands the given value into an array of ints so that we can work on - * it. The array will be converted to an absolute value and the wasNegative - * flag will be set appropriately. The array will remove leading zeros from - * the value. - * @param array an array of length 4 to set with the value - * @param wasNegative a flag for whether the value was original negative - * @result the output length of the array - */ - int64_t Int128::fillInArray(uint32_t* array, bool &wasNegative) const { - uint64_t high; - uint64_t low; - if (highbits < 0) { - low = ~lowbits + 1; - high = static_cast<uint64_t>(~highbits); - if (low == 0) { - high += 1; - } - wasNegative = true; - } else { - low = lowbits; - high = static_cast<uint64_t>(highbits); - wasNegative = false; - } - if (high != 0) { - if (high > UINT32_MAX) { - array[0] = static_cast<uint32_t>(high >> 32); - array[1] = static_cast<uint32_t>(high); - array[2] = static_cast<uint32_t>(low >> 32); - array[3] = static_cast<uint32_t>(low); - return 4; - } else { - array[0] = static_cast<uint32_t>(high); - array[1] = static_cast<uint32_t>(low >> 32); - array[2] = static_cast<uint32_t>(low); - return 3; - } - } else if (low >= UINT32_MAX) { - array[0] = static_cast<uint32_t>(low >> 32); - array[1] = static_cast<uint32_t>(low); - return 2; - } else if (low == 0) { - return 0; - } else { - array[0] = static_cast<uint32_t>(low); - return 1; - } - } - - - /** - * Find last set bit in a 32 bit integer. Bit 1 is the LSB and bit 32 is - * the MSB. We can replace this with bsrq asm instruction on x64. - */ - int64_t fls(uint32_t x) { - int64_t bitpos = 0; - while (x) { - x >>= 1; - bitpos += 1; - } - return bitpos; - } - - /** - * Shift the number in the array left by bits positions. - * @param array the number to shift, must have length elements - * @param length the number of entries in the array - * @param bits the number of bits to shift (0 <= bits < 32) - */ - void shiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) { - if (length > 0 && bits != 0) { - for(int64_t i=0; i < length-1; ++i) { - array[i] = (array[i] << bits) | (array[i+1] >> (32 - bits)); - } - array[length-1] <<= bits; - } - } - - /** - * Shift the number in the array right by bits positions. 
- * @param array the number to shift, must have length elements - * @param length the number of entries in the array - * @param bits the number of bits to shift (0 <= bits < 32) - */ - void shiftArrayRight(uint32_t* array, int64_t length, int64_t bits) { - if (length > 0 && bits != 0) { - for(int64_t i=length-1; i > 0; --i) { - array[i] = (array[i] >> bits) | (array[i-1] << (32 - bits)); - } - array[0] >>= bits; - } - } - - /** - * Fix the signs of the result and remainder at the end of the division - * based on the signs of the dividend and divisor. - */ - void fixDivisionSigns(Int128 &result, Int128 &remainder, - bool dividendWasNegative, bool divisorWasNegative) { - if (dividendWasNegative != divisorWasNegative) { - result.negate(); - } - if (dividendWasNegative) { - remainder.negate(); - } - } - - /** - * Build a Int128 from a list of ints. - */ - void buildFromArray(Int128& value, uint32_t* array, int64_t length) { - switch (length) { - case 0: - value = 0; - break; - case 1: - value = array[0]; - break; - case 2: - value = Int128(0, (static_cast<uint64_t>(array[0]) << 32) + array[1]); - break; - case 3: - value = Int128(array[0], - (static_cast<uint64_t>(array[1]) << 32) + array[2]); - break; - case 4: - value = Int128((static_cast<int64_t>(array[0]) << 32) + array[1], - (static_cast<uint64_t>(array[2]) << 32) + array[3]); - break; - case 5: - if (array[0] != 0) { - throw std::logic_error("Can't build Int128 with 5 ints."); - } - value = Int128((static_cast<int64_t>(array[1]) << 32) + array[2], - (static_cast<uint64_t>(array[3]) << 32) + array[4]); - break; - default: - throw std::logic_error("Unsupported length for building Int128"); - } - } - - /** - * Do a division where the divisor fits into a single 32 bit value. - */ - Int128 singleDivide(uint32_t* dividend, int64_t dividendLength, - uint32_t divisor, Int128& remainder, - bool dividendWasNegative, bool divisorWasNegative) { - uint64_t r = 0; - uint32_t resultArray[5]; - for(int64_t j=0; j < dividendLength; j++) { - r <<= 32; - r += dividend[j]; - resultArray[j] = static_cast<uint32_t>(r / divisor); - r %= divisor; - } - Int128 result; - buildFromArray(result, resultArray, dividendLength); - remainder = static_cast<int64_t>(r); - fixDivisionSigns(result, remainder, dividendWasNegative, - divisorWasNegative); - return result; - } - - Int128 Int128::divide(const Int128 &divisor, Int128 &remainder) const { - // Split the dividend and divisor into integer pieces so that we can - // work on them. - uint32_t dividendArray[5]; - uint32_t divisorArray[4]; - bool dividendWasNegative; - bool divisorWasNegative; - // leave an extra zero before the dividend - dividendArray[0] = 0; - int64_t dividendLength = fillInArray(dividendArray + 1, dividendWasNegative)+1; - int64_t divisorLength = divisor.fillInArray(divisorArray, divisorWasNegative); - - // Handle some of the easy cases. - if (dividendLength <= divisorLength) { - remainder = *this; - return 0; - } else if (divisorLength == 0) { - throw std::range_error("Division by 0 in Int128"); - } else if (divisorLength == 1) { - return singleDivide(dividendArray, dividendLength, divisorArray[0], - remainder, dividendWasNegative, divisorWasNegative); - } - - int64_t resultLength = dividendLength - divisorLength; - uint32_t resultArray[4]; - - // Normalize by shifting both by a multiple of 2 so that - // the digit guessing is better. The requirement is that - // divisorArray[0] is greater than 2**31. 
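The normalization mentioned just above is the standard precondition of schoolbook long division: once the divisor's top limb is at least 2**31, each guessed quotient digit is at most two too large. A minimal sketch of just that single-limb invariant (the surrounding code applies the same shift across whole limb arrays and undoes it on the remainder); fls32 and normalizeShift are illustrative names, not part of the ORC sources.

#include <cassert>
#include <cstdint>

// 1-based position of the most significant set bit; 0 for x == 0.
inline int fls32(uint32_t x) {
  int pos = 0;
  while (x != 0) { x >>= 1; ++pos; }
  return pos;
}

// Shift amount that makes the divisor's top limb >= 2^31, which is what keeps
// the per-digit quotient guess within two of the true digit.
inline int normalizeShift(uint32_t topLimb) {
  assert(topLimb != 0);
  int shift = 32 - fls32(topLimb);
  assert((topLimb << shift) >= 0x80000000u);  // invariant after shifting
  return shift;
}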
- int64_t normalizeBits = 32 - fls(divisorArray[0]); - shiftArrayLeft(divisorArray, divisorLength, normalizeBits); - shiftArrayLeft(dividendArray, dividendLength, normalizeBits); - - // compute each digit in the result - for(int64_t j=0; j < resultLength; ++j) { - // Guess the next digit. At worst it is two too large - uint32_t guess = UINT32_MAX; - uint64_t highDividend = static_cast<uint64_t>(dividendArray[j]) << 32 | - dividendArray[j+1]; - if (dividendArray[j] != divisorArray[0]) { - guess = static_cast<uint32_t>(highDividend / divisorArray[0]); - } - - // catch all of the cases where guess is two too large and most of the - // cases where it is one too large - uint32_t rhat = - static_cast<uint32_t>(highDividend - guess * - static_cast<uint64_t>(divisorArray[0])); - while (static_cast<uint64_t>(divisorArray[1]) * guess > - (static_cast<uint64_t>(rhat) << 32) + dividendArray[j+2]) { - guess -= 1; - rhat += divisorArray[0]; - if (static_cast<uint64_t>(rhat) < divisorArray[0]) { - break; - } - } - - // subtract off the guess * divisor from the dividend - uint64_t mult = 0; - for(int64_t i=divisorLength-1; i >= 0; --i) { - mult += static_cast<uint64_t>(guess) * divisorArray[i]; - uint32_t prev = dividendArray[j+i+1]; - dividendArray[j+i+1] -= static_cast<uint32_t>(mult); - mult >>= 32; - if (dividendArray[j+i+1] > prev) { - mult += 1; - } - } - uint32_t prev = dividendArray[j]; - dividendArray[j] -= static_cast<uint32_t>(mult); - - // if guess was too big, we add back divisor - if (dividendArray[j] > prev) { - guess -= 1; - uint32_t carry = 0; - for(int64_t i=divisorLength-1; i >= 0; --i) { - uint64_t sum = static_cast<uint64_t>(divisorArray[i]) + - dividendArray[j+i+1] + carry; - dividendArray[j+i+1] = static_cast<uint32_t>(sum); - carry = static_cast<uint32_t>(sum >> 32); - } - dividendArray[j] += carry; - } - - resultArray[j] = guess; - } - - // denormalize the remainder - shiftArrayRight(dividendArray, dividendLength, normalizeBits); - - // return result and remainder - Int128 result; - buildFromArray(result, resultArray, resultLength); - buildFromArray(remainder, dividendArray, dividendLength); - fixDivisionSigns(result, remainder, - dividendWasNegative, divisorWasNegative); - return result; - } - - std::string Int128::toString() const { - // 10**18 - the largest power of 10 less than 63 bits - const Int128 tenTo18(0xde0b6b3a7640000); - // 10**36 - const Int128 tenTo36(0xc097ce7bc90715, 0xb34b9f1000000000); - Int128 remainder; - std::stringstream buf; - bool needFill = false; - - // get anything above 10**36 and print it - Int128 top = divide(tenTo36, remainder); - if (top != 0) { - buf << top.toLong(); - remainder.abs(); - needFill = true; - } - - // now get anything above 10**18 and print it - Int128 tail; - top = remainder.divide(tenTo18, tail); - if (needFill || top != 0) { - if (needFill) { - buf << std::setw(18) << std::setfill('0'); - } else { - needFill = true; - tail.abs(); - } - buf << top.toLong(); - } - - // finally print the tail, which is less than 10**18 - if (needFill) { - buf << std::setw(18) << std::setfill('0'); - } - buf << tail.toLong(); - return buf.str(); - } - - std::string Int128::toDecimalString(int32_t scale) const { - std::string str = toString(); - if (scale == 0) { - return str; - } else if (*this < 0) { - int32_t len = static_cast<int32_t>(str.length()); - if (len - 1 > scale) { - return str.substr(0, static_cast<size_t>(len - scale)) + "." 
+ - str.substr(static_cast<size_t>(len - scale), - static_cast<size_t>(scale)); - } else if (len - 1 == scale) { - return "-0." + str.substr(1, std::string::npos); - } else { - std::string result = "-0."; - for(int32_t i=0; i < scale - len + 1; ++i) { - result += "0"; - } - return result + str.substr(1, std::string::npos); - } - } else { - int32_t len = static_cast<int32_t>(str.length()); - if (len > scale) { - return str.substr(0, static_cast<size_t>(len - scale)) + "." + - str.substr(static_cast<size_t>(len - scale), - static_cast<size_t>(scale)); - } else if (len == scale) { - return "0." + str; - } else { - std::string result = "0."; - for(int32_t i=0; i < scale - len; ++i) { - result += "0"; - } - return result + str; - } - } - } - - std::string Int128::toHexString() const { - std::stringstream buf; - buf << std::hex << "0x" - << std::setw(16) << std::setfill('0') << highbits - << std::setw(16) << std::setfill('0') << lowbits; - return buf.str(); - } - - const static int32_t MAX_PRECISION_64 = 18; - const static int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1] = - {1, - 10, - 100, - 1000, - 10000, - 100000, - 1000000, - 10000000, - 100000000, - 1000000000, - 10000000000, - 100000000000, - 1000000000000, - 10000000000000, - 100000000000000, - 1000000000000000, - 10000000000000000, - 100000000000000000, - 1000000000000000000}; - - Int128 scaleUpInt128ByPowerOfTen(Int128 value, - int32_t power, - bool &overflow) { - overflow = false; - Int128 remainder; - - while (power > 0) { - int32_t step = std::min(power, MAX_PRECISION_64); - if (value > 0 && Int128::maximumValue().divide(POWERS_OF_TEN[step], remainder) < value) { - overflow = true; - return Int128::maximumValue(); - } else if (value < 0 && Int128::minimumValue().divide(POWERS_OF_TEN[step], remainder) > value) { - overflow = true; - return Int128::minimumValue(); - } - - value *= POWERS_OF_TEN[step]; - power -= step; - } - - return value; - } - - Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power) { - Int128 remainder; - while (power > 0) { - int32_t step = std::min(std::abs(power), MAX_PRECISION_64); - value = value.divide(POWERS_OF_TEN[step], remainder); - power -= step; - } - return value; - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Int128.hh" +#include "Adaptor.hh" + +#include <algorithm> +#include <iomanip> +#include <iostream> +#include <sstream> + +namespace orc { + + Int128 Int128::maximumValue() { + return Int128(0x7fffffffffffffff, 0xfffffffffffffff); + } + + Int128 Int128::minimumValue() { + return Int128(static_cast<int64_t>(0x8000000000000000), 0x0); + } + + Int128::Int128(const std::string& str) { + lowbits = 0; + highbits = 0; + size_t length = str.length(); + if (length > 0) { + bool isNegative = str[0] == '-'; + size_t posn = isNegative ? 1 : 0; + while (posn < length) { + size_t group = std::min(static_cast<size_t>(18), length - posn); + int64_t chunk = std::stoll(str.substr(posn, group)); + int64_t multiple = 1; + for(size_t i=0; i < group; ++i) { + multiple *= 10; + } + *this *= multiple; + *this += chunk; + posn += group; + } + if (isNegative) { + negate(); + } + } + } + + Int128& Int128::operator*=(const Int128 &right) { + const uint64_t INT_MASK = 0xffffffff; + const uint64_t CARRY_BIT = INT_MASK + 1; + + // Break the left and right numbers into 32 bit chunks + // so that we can multiply them without overflow. + uint64_t L0 = static_cast<uint64_t>(highbits) >> 32; + uint64_t L1 = static_cast<uint64_t>(highbits) & INT_MASK; + uint64_t L2 = lowbits >> 32; + uint64_t L3 = lowbits & INT_MASK; + uint64_t R0 = static_cast<uint64_t>(right.highbits) >> 32; + uint64_t R1 = static_cast<uint64_t>(right.highbits) & INT_MASK; + uint64_t R2 = right.lowbits >> 32; + uint64_t R3 = right.lowbits & INT_MASK; + + uint64_t product = L3 * R3; + lowbits = product & INT_MASK; + uint64_t sum = product >> 32; + product = L2 * R3; + sum += product; + highbits = sum < product ? CARRY_BIT : 0; + product = L3 * R2; + sum += product; + if (sum < product) { + highbits += CARRY_BIT; + } + lowbits += sum << 32; + highbits += static_cast<int64_t>(sum >> 32); + highbits += L1 * R3 + L2 * R2 + L3 * R1; + highbits += (L0 * R3 + L1 * R2 + L2 * R1 + L3 * R0) << 32; + return *this; + } + + /** + * Expands the given value into an array of ints so that we can work on + * it. The array will be converted to an absolute value and the wasNegative + * flag will be set appropriately. The array will remove leading zeros from + * the value. + * @param array an array of length 4 to set with the value + * @param wasNegative a flag for whether the value was original negative + * @result the output length of the array + */ + int64_t Int128::fillInArray(uint32_t* array, bool &wasNegative) const { + uint64_t high; + uint64_t low; + if (highbits < 0) { + low = ~lowbits + 1; + high = static_cast<uint64_t>(~highbits); + if (low == 0) { + high += 1; + } + wasNegative = true; + } else { + low = lowbits; + high = static_cast<uint64_t>(highbits); + wasNegative = false; + } + if (high != 0) { + if (high > UINT32_MAX) { + array[0] = static_cast<uint32_t>(high >> 32); + array[1] = static_cast<uint32_t>(high); + array[2] = static_cast<uint32_t>(low >> 32); + array[3] = static_cast<uint32_t>(low); + return 4; + } else { + array[0] = static_cast<uint32_t>(high); + array[1] = static_cast<uint32_t>(low >> 32); + array[2] = static_cast<uint32_t>(low); + return 3; + } + } else if (low >= UINT32_MAX) { + array[0] = static_cast<uint32_t>(low >> 32); + array[1] = static_cast<uint32_t>(low); + return 2; + } else if (low == 0) { + return 0; + } else { + array[0] = static_cast<uint32_t>(low); + return 1; + } + } + + + /** + * Find last set bit in a 32 bit integer. Bit 1 is the LSB and bit 32 is + * the MSB. 
We can replace this with bsrq asm instruction on x64. + */ + int64_t fls(uint32_t x) { + int64_t bitpos = 0; + while (x) { + x >>= 1; + bitpos += 1; + } + return bitpos; + } + + /** + * Shift the number in the array left by bits positions. + * @param array the number to shift, must have length elements + * @param length the number of entries in the array + * @param bits the number of bits to shift (0 <= bits < 32) + */ + void shiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) { + if (length > 0 && bits != 0) { + for(int64_t i=0; i < length-1; ++i) { + array[i] = (array[i] << bits) | (array[i+1] >> (32 - bits)); + } + array[length-1] <<= bits; + } + } + + /** + * Shift the number in the array right by bits positions. + * @param array the number to shift, must have length elements + * @param length the number of entries in the array + * @param bits the number of bits to shift (0 <= bits < 32) + */ + void shiftArrayRight(uint32_t* array, int64_t length, int64_t bits) { + if (length > 0 && bits != 0) { + for(int64_t i=length-1; i > 0; --i) { + array[i] = (array[i] >> bits) | (array[i-1] << (32 - bits)); + } + array[0] >>= bits; + } + } + + /** + * Fix the signs of the result and remainder at the end of the division + * based on the signs of the dividend and divisor. + */ + void fixDivisionSigns(Int128 &result, Int128 &remainder, + bool dividendWasNegative, bool divisorWasNegative) { + if (dividendWasNegative != divisorWasNegative) { + result.negate(); + } + if (dividendWasNegative) { + remainder.negate(); + } + } + + /** + * Build a Int128 from a list of ints. + */ + void buildFromArray(Int128& value, uint32_t* array, int64_t length) { + switch (length) { + case 0: + value = 0; + break; + case 1: + value = array[0]; + break; + case 2: + value = Int128(0, (static_cast<uint64_t>(array[0]) << 32) + array[1]); + break; + case 3: + value = Int128(array[0], + (static_cast<uint64_t>(array[1]) << 32) + array[2]); + break; + case 4: + value = Int128((static_cast<int64_t>(array[0]) << 32) + array[1], + (static_cast<uint64_t>(array[2]) << 32) + array[3]); + break; + case 5: + if (array[0] != 0) { + throw std::logic_error("Can't build Int128 with 5 ints."); + } + value = Int128((static_cast<int64_t>(array[1]) << 32) + array[2], + (static_cast<uint64_t>(array[3]) << 32) + array[4]); + break; + default: + throw std::logic_error("Unsupported length for building Int128"); + } + } + + /** + * Do a division where the divisor fits into a single 32 bit value. + */ + Int128 singleDivide(uint32_t* dividend, int64_t dividendLength, + uint32_t divisor, Int128& remainder, + bool dividendWasNegative, bool divisorWasNegative) { + uint64_t r = 0; + uint32_t resultArray[5]; + for(int64_t j=0; j < dividendLength; j++) { + r <<= 32; + r += dividend[j]; + resultArray[j] = static_cast<uint32_t>(r / divisor); + r %= divisor; + } + Int128 result; + buildFromArray(result, resultArray, dividendLength); + remainder = static_cast<int64_t>(r); + fixDivisionSigns(result, remainder, dividendWasNegative, + divisorWasNegative); + return result; + } + + Int128 Int128::divide(const Int128 &divisor, Int128 &remainder) const { + // Split the dividend and divisor into integer pieces so that we can + // work on them. 
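For context, a small usage sketch of the Int128 arithmetic restored in this file, using only members visible in this diff (the decimal-string constructor, the single-integer constructor, divide with an out-parameter remainder, toString, and toDecimalString); the sample values are arbitrary.

#include "orc/Int128.hh"

#include <iostream>

int main() {
  // Parse a decimal value that does not fit in 64 bits.
  orc::Int128 value("123456789012345678901234567890");

  // divide() returns the quotient and reports the remainder through the
  // second argument.
  orc::Int128 remainder;
  orc::Int128 quotient = value.divide(orc::Int128(1000000007), remainder);
  std::cout << quotient.toString() << " r " << remainder.toString() << "\n";

  // Render the same value as a decimal with 9 fractional digits.
  std::cout << value.toDecimalString(9) << "\n";
  return 0;
}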
+ uint32_t dividendArray[5]; + uint32_t divisorArray[4]; + bool dividendWasNegative; + bool divisorWasNegative; + // leave an extra zero before the dividend + dividendArray[0] = 0; + int64_t dividendLength = fillInArray(dividendArray + 1, dividendWasNegative)+1; + int64_t divisorLength = divisor.fillInArray(divisorArray, divisorWasNegative); + + // Handle some of the easy cases. + if (dividendLength <= divisorLength) { + remainder = *this; + return 0; + } else if (divisorLength == 0) { + throw std::range_error("Division by 0 in Int128"); + } else if (divisorLength == 1) { + return singleDivide(dividendArray, dividendLength, divisorArray[0], + remainder, dividendWasNegative, divisorWasNegative); + } + + int64_t resultLength = dividendLength - divisorLength; + uint32_t resultArray[4]; + + // Normalize by shifting both by a multiple of 2 so that + // the digit guessing is better. The requirement is that + // divisorArray[0] is greater than 2**31. + int64_t normalizeBits = 32 - fls(divisorArray[0]); + shiftArrayLeft(divisorArray, divisorLength, normalizeBits); + shiftArrayLeft(dividendArray, dividendLength, normalizeBits); + + // compute each digit in the result + for(int64_t j=0; j < resultLength; ++j) { + // Guess the next digit. At worst it is two too large + uint32_t guess = UINT32_MAX; + uint64_t highDividend = static_cast<uint64_t>(dividendArray[j]) << 32 | + dividendArray[j+1]; + if (dividendArray[j] != divisorArray[0]) { + guess = static_cast<uint32_t>(highDividend / divisorArray[0]); + } + + // catch all of the cases where guess is two too large and most of the + // cases where it is one too large + uint32_t rhat = + static_cast<uint32_t>(highDividend - guess * + static_cast<uint64_t>(divisorArray[0])); + while (static_cast<uint64_t>(divisorArray[1]) * guess > + (static_cast<uint64_t>(rhat) << 32) + dividendArray[j+2]) { + guess -= 1; + rhat += divisorArray[0]; + if (static_cast<uint64_t>(rhat) < divisorArray[0]) { + break; + } + } + + // subtract off the guess * divisor from the dividend + uint64_t mult = 0; + for(int64_t i=divisorLength-1; i >= 0; --i) { + mult += static_cast<uint64_t>(guess) * divisorArray[i]; + uint32_t prev = dividendArray[j+i+1]; + dividendArray[j+i+1] -= static_cast<uint32_t>(mult); + mult >>= 32; + if (dividendArray[j+i+1] > prev) { + mult += 1; + } + } + uint32_t prev = dividendArray[j]; + dividendArray[j] -= static_cast<uint32_t>(mult); + + // if guess was too big, we add back divisor + if (dividendArray[j] > prev) { + guess -= 1; + uint32_t carry = 0; + for(int64_t i=divisorLength-1; i >= 0; --i) { + uint64_t sum = static_cast<uint64_t>(divisorArray[i]) + + dividendArray[j+i+1] + carry; + dividendArray[j+i+1] = static_cast<uint32_t>(sum); + carry = static_cast<uint32_t>(sum >> 32); + } + dividendArray[j] += carry; + } + + resultArray[j] = guess; + } + + // denormalize the remainder + shiftArrayRight(dividendArray, dividendLength, normalizeBits); + + // return result and remainder + Int128 result; + buildFromArray(result, resultArray, resultLength); + buildFromArray(remainder, dividendArray, dividendLength); + fixDivisionSigns(result, remainder, + dividendWasNegative, divisorWasNegative); + return result; + } + + std::string Int128::toString() const { + // 10**18 - the largest power of 10 less than 63 bits + const Int128 tenTo18(0xde0b6b3a7640000); + // 10**36 + const Int128 tenTo36(0xc097ce7bc90715, 0xb34b9f1000000000); + Int128 remainder; + std::stringstream buf; + bool needFill = false; + + // get anything above 10**36 and print it + Int128 top = 
divide(tenTo36, remainder); + if (top != 0) { + buf << top.toLong(); + remainder.abs(); + needFill = true; + } + + // now get anything above 10**18 and print it + Int128 tail; + top = remainder.divide(tenTo18, tail); + if (needFill || top != 0) { + if (needFill) { + buf << std::setw(18) << std::setfill('0'); + } else { + needFill = true; + tail.abs(); + } + buf << top.toLong(); + } + + // finally print the tail, which is less than 10**18 + if (needFill) { + buf << std::setw(18) << std::setfill('0'); + } + buf << tail.toLong(); + return buf.str(); + } + + std::string Int128::toDecimalString(int32_t scale) const { + std::string str = toString(); + if (scale == 0) { + return str; + } else if (*this < 0) { + int32_t len = static_cast<int32_t>(str.length()); + if (len - 1 > scale) { + return str.substr(0, static_cast<size_t>(len - scale)) + "." + + str.substr(static_cast<size_t>(len - scale), + static_cast<size_t>(scale)); + } else if (len - 1 == scale) { + return "-0." + str.substr(1, std::string::npos); + } else { + std::string result = "-0."; + for(int32_t i=0; i < scale - len + 1; ++i) { + result += "0"; + } + return result + str.substr(1, std::string::npos); + } + } else { + int32_t len = static_cast<int32_t>(str.length()); + if (len > scale) { + return str.substr(0, static_cast<size_t>(len - scale)) + "." + + str.substr(static_cast<size_t>(len - scale), + static_cast<size_t>(scale)); + } else if (len == scale) { + return "0." + str; + } else { + std::string result = "0."; + for(int32_t i=0; i < scale - len; ++i) { + result += "0"; + } + return result + str; + } + } + } + + std::string Int128::toHexString() const { + std::stringstream buf; + buf << std::hex << "0x" + << std::setw(16) << std::setfill('0') << highbits + << std::setw(16) << std::setfill('0') << lowbits; + return buf.str(); + } + + const static int32_t MAX_PRECISION_64 = 18; + const static int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1] = + {1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000, + 100000000000, + 1000000000000, + 10000000000000, + 100000000000000, + 1000000000000000, + 10000000000000000, + 100000000000000000, + 1000000000000000000}; + + Int128 scaleUpInt128ByPowerOfTen(Int128 value, + int32_t power, + bool &overflow) { + overflow = false; + Int128 remainder; + + while (power > 0) { + int32_t step = std::min(power, MAX_PRECISION_64); + if (value > 0 && Int128::maximumValue().divide(POWERS_OF_TEN[step], remainder) < value) { + overflow = true; + return Int128::maximumValue(); + } else if (value < 0 && Int128::minimumValue().divide(POWERS_OF_TEN[step], remainder) > value) { + overflow = true; + return Int128::minimumValue(); + } + + value *= POWERS_OF_TEN[step]; + power -= step; + } + + return value; + } + + Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power) { + Int128 remainder; + while (power > 0) { + int32_t step = std::min(std::abs(power), MAX_PRECISION_64); + value = value.divide(POWERS_OF_TEN[step], remainder); + power -= step; + } + return value; + } + +} diff --git a/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc b/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc index d1ba183aeb..7bf91dee13 100644 --- a/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc +++ b/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc @@ -1,391 +1,391 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Adaptor.hh" -#include "Compression.hh" -#include "orc/Exceptions.hh" - -#include <string> - -namespace orc { - - static const int32_t DEC_32_TABLE[] = {4, 1, 2, 1, 4, 4, 4, 4}; - static const int32_t DEC_64_TABLE[] = {0, 0, 0, -1, 0, 1, 2, 3}; - - static const int32_t SIZE_OF_SHORT = 2; - static const int32_t SIZE_OF_INT = 4; - static const int32_t SIZE_OF_LONG = 8; - - static std::string toHex(uint64_t val) { - std::ostringstream out; - out << "0x" << std::hex << val; - return out.str(); - } - - static std::string toString(int64_t val) { - std::ostringstream out; - out << val; - return out.str(); - } - - class MalformedInputException: public ParseError { - public: - MalformedInputException(int64_t off - ) :ParseError("MalformedInputException at " + - toString(off)) { - } - - MalformedInputException(int64_t off, const std::string& msg - ): ParseError("MalformedInputException " + msg + - " at " + toString(off)) { - } - - MalformedInputException(const MalformedInputException& other - ): ParseError(other.what()) { - } - - virtual ~MalformedInputException() noexcept; - }; - - MalformedInputException::~MalformedInputException() noexcept { - // PASS - } - - uint64_t lzoDecompress(const char *inputAddress, - const char *inputLimit, - char *outputAddress, - char *outputLimit) { - // nothing compresses to nothing - if (inputAddress == inputLimit) { - return 0; - } - - // maximum offset in buffers to which it's safe to write long-at-a-time - char * const fastOutputLimit = outputLimit - SIZE_OF_LONG; - - // LZO can concat two blocks together so, decode until the input data is - // consumed - const char *input = inputAddress; - char *output = outputAddress; - while (input < inputLimit) { - // - // Note: For safety some of the code below may stop decoding early or - // skip decoding, because input is not available. This makes the code - // safe, and since LZO requires an explicit "stop" command, the decoder - // will still throw a exception. - // - - bool firstCommand = true; - uint32_t lastLiteralLength = 0; - while (true) { - if (input >= inputLimit) { - throw MalformedInputException(input - inputAddress); - } - uint32_t command = *(input++) & 0xFF; - if (command == 0x11) { - break; - } - - // Commands are described using a bit pattern notation: - // 0: bit is not set - // 1: bit is set - // L: part of literal length - // P: part of match offset position - // M: part of match length - // ?: see documentation in command decoder - - int32_t matchLength; - int32_t matchOffset; - uint32_t literalLength; - if ((command & 0xf0) == 0) { - if (lastLiteralLength == 0) { - // 0b0000_LLLL (0bLLLL_LLLL)* - - // copy length :: fixed - // 0 - matchOffset = 0; - - // copy offset :: fixed - // 0 - matchLength = 0; - - // literal length - 3 :: variable bits :: valid range [4..] 
- // 3 + variableLength(command bits [0..3], 4) - literalLength = command & 0xf; - if (literalLength == 0) { - literalLength = 0xf; - - uint32_t nextByte = 0; - while (input < inputLimit && - (nextByte = *(input++) & 0xFF) == 0) { - literalLength += 0xff; - } - literalLength += nextByte; - } - literalLength += 3; - } else if (lastLiteralLength <= 3) { - // 0b0000_PPLL 0bPPPP_PPPP - - // copy length: fixed - // 3 - matchLength = 3; - - // copy offset :: 12 bits :: valid range [2048..3071] - // [0..1] from command [2..3] - // [2..9] from trailer [0..7] - // [10] unset - // [11] set - if (input >= inputLimit) { - throw MalformedInputException(input - inputAddress); - } - matchOffset = (command & 0xc) >> 2; - matchOffset |= (*(input++) & 0xFF) << 2; - matchOffset |= 0x800; - - // literal length :: 2 bits :: valid range [0..3] - // [0..1] from command [0..1] - literalLength = (command & 0x3); - } else { - // 0b0000_PPLL 0bPPPP_PPPP - - // copy length :: fixed - // 2 - matchLength = 2; - - // copy offset :: 10 bits :: valid range [0..1023] - // [0..1] from command [2..3] - // [2..9] from trailer [0..7] - if (input >= inputLimit) { - throw MalformedInputException(input - inputAddress); - } - matchOffset = (command & 0xc) >> 2; - matchOffset |= (*(input++) & 0xFF) << 2; - - // literal length :: 2 bits :: valid range [0..3] - // [0..1] from command [0..1] - literalLength = (command & 0x3); - } - } else if (firstCommand) { - // first command has special handling when high nibble is set - matchLength = 0; - matchOffset = 0; - literalLength = command - 17; - } else if ((command & 0xf0) == 0x10) { - // 0b0001_?MMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL - - // copy length - 2 :: variable bits :: valid range [3..] - // 2 + variableLength(command bits [0..2], 3) - matchLength = command & 0x7; - if (matchLength == 0) { - matchLength = 0x7; - - int32_t nextByte = 0; - while (input < inputLimit && - (nextByte = *(input++) & 0xFF) == 0) { - matchLength += 0xff; - } - matchLength += nextByte; - } - matchLength += 2; - - // read trailer - if (input + SIZE_OF_SHORT > inputLimit) { - throw MalformedInputException(input - inputAddress); - } - uint32_t trailer = *reinterpret_cast<const uint16_t*>(input) & 0xFFFF; - input += SIZE_OF_SHORT; - - // copy offset :: 16 bits :: valid range [32767..49151] - // [0..13] from trailer [2..15] - // [14] if command bit [3] unset - // [15] if command bit [3] set - matchOffset = trailer >> 2; - if ((command & 0x8) == 0) { - matchOffset |= 0x4000; - } else { - matchOffset |= 0x8000; - } - matchOffset--; - - // literal length :: 2 bits :: valid range [0..3] - // [0..1] from trailer [0..1] - literalLength = trailer & 0x3; - } else if ((command & 0xe0) == 0x20) { - // 0b001M_MMMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL - - // copy length - 2 :: variable bits :: valid range [3..] 
- // 2 + variableLength(command bits [0..4], 5) - matchLength = command & 0x1f; - if (matchLength == 0) { - matchLength = 0x1f; - - int nextByte = 0; - while (input < inputLimit && - (nextByte = *(input++) & 0xFF) == 0) { - matchLength += 0xff; - } - matchLength += nextByte; - } - matchLength += 2; - - // read trailer - if (input + SIZE_OF_SHORT > inputLimit) { - throw MalformedInputException(input - inputAddress); - } - int32_t trailer = *reinterpret_cast<const int16_t*>(input) & 0xFFFF; - input += SIZE_OF_SHORT; - - // copy offset :: 14 bits :: valid range [0..16383] - // [0..13] from trailer [2..15] - matchOffset = trailer >> 2; - - // literal length :: 2 bits :: valid range [0..3] - // [0..1] from trailer [0..1] - literalLength = trailer & 0x3; - } else if ((command & 0xc0) != 0) { - // 0bMMMP_PPLL 0bPPPP_PPPP - - // copy length - 1 :: 3 bits :: valid range [1..8] - // [0..2] from command [5..7] - // add 1 - matchLength = (command & 0xe0) >> 5; - matchLength += 1; - - // copy offset :: 11 bits :: valid range [0..4095] - // [0..2] from command [2..4] - // [3..10] from trailer [0..7] - if (input >= inputLimit) { - throw MalformedInputException(input - inputAddress); - } - matchOffset = (command & 0x1c) >> 2; - matchOffset |= (*(input++) & 0xFF) << 3; - - // literal length :: 2 bits :: valid range [0..3] - // [0..1] from command [0..1] - literalLength = (command & 0x3); - } else { - throw MalformedInputException(input - inputAddress - 1, - "Invalid LZO command " + - toHex(command)); - } - firstCommand = false; - - // copy match - if (matchLength != 0) { - // lzo encodes match offset minus one - matchOffset++; - - char *matchAddress = output - matchOffset; - if (matchAddress < outputAddress || - output + matchLength > outputLimit) { - throw MalformedInputException(input - inputAddress); - } - char *matchOutputLimit = output + matchLength; - - if (output > fastOutputLimit) { - // slow match copy - while (output < matchOutputLimit) { - *(output++) = *(matchAddress++); - } - } else { - // copy repeated sequence - if (matchOffset < SIZE_OF_LONG) { - // 8 bytes apart so that we can copy long-at-a-time below - int32_t increment32 = DEC_32_TABLE[matchOffset]; - int32_t decrement64 = DEC_64_TABLE[matchOffset]; - - output[0] = *matchAddress; - output[1] = *(matchAddress + 1); - output[2] = *(matchAddress + 2); - output[3] = *(matchAddress + 3); - output += SIZE_OF_INT; - matchAddress += increment32; - - *reinterpret_cast<int32_t*>(output) = - *reinterpret_cast<int32_t*>(matchAddress); - output += SIZE_OF_INT; - matchAddress -= decrement64; - } else { - *reinterpret_cast<int64_t*>(output) = - *reinterpret_cast<int64_t*>(matchAddress); - matchAddress += SIZE_OF_LONG; - output += SIZE_OF_LONG; - } - - if (matchOutputLimit >= fastOutputLimit) { - if (matchOutputLimit > outputLimit) { - throw MalformedInputException(input - inputAddress); - } - - while (output < fastOutputLimit) { - *reinterpret_cast<int64_t*>(output) = - *reinterpret_cast<int64_t*>(matchAddress); - matchAddress += SIZE_OF_LONG; - output += SIZE_OF_LONG; - } - - while (output < matchOutputLimit) { - *(output++) = *(matchAddress++); - } - } else { - while (output < matchOutputLimit) { - *reinterpret_cast<int64_t*>(output) = - *reinterpret_cast<int64_t*>(matchAddress); - matchAddress += SIZE_OF_LONG; - output += SIZE_OF_LONG; - } - } - } - output = matchOutputLimit; // correction in case we over-copied - } - - // copy literal - char *literalOutputLimit = output + literalLength; - if (literalOutputLimit > fastOutputLimit || - input + 
literalLength > inputLimit - SIZE_OF_LONG) { - if (literalOutputLimit > outputLimit) { - throw MalformedInputException(input - inputAddress); - } - - // slow, precise copy - memcpy(output, input, literalLength); - input += literalLength; - output += literalLength; - } else { - // fast copy. We may over-copy but there's enough room in input - // and output to not overrun them - do { - *reinterpret_cast<int64_t*>(output) = - *reinterpret_cast<const int64_t*>(input); - input += SIZE_OF_LONG; - output += SIZE_OF_LONG; - } while (output < literalOutputLimit); - // adjust index if we over-copied - input -= (output - literalOutputLimit); - output = literalOutputLimit; - } - lastLiteralLength = literalLength; - } - - if (input + SIZE_OF_SHORT > inputLimit && - *reinterpret_cast<const int16_t*>(input) != 0) { - throw MalformedInputException(input - inputAddress); - } - input += SIZE_OF_SHORT; - } - - return static_cast<uint64_t>(output - outputAddress); - } - -} +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Adaptor.hh" +#include "Compression.hh" +#include "orc/Exceptions.hh" + +#include <string> + +namespace orc { + + static const int32_t DEC_32_TABLE[] = {4, 1, 2, 1, 4, 4, 4, 4}; + static const int32_t DEC_64_TABLE[] = {0, 0, 0, -1, 0, 1, 2, 3}; + + static const int32_t SIZE_OF_SHORT = 2; + static const int32_t SIZE_OF_INT = 4; + static const int32_t SIZE_OF_LONG = 8; + + static std::string toHex(uint64_t val) { + std::ostringstream out; + out << "0x" << std::hex << val; + return out.str(); + } + + static std::string toString(int64_t val) { + std::ostringstream out; + out << val; + return out.str(); + } + + class MalformedInputException: public ParseError { + public: + MalformedInputException(int64_t off + ) :ParseError("MalformedInputException at " + + toString(off)) { + } + + MalformedInputException(int64_t off, const std::string& msg + ): ParseError("MalformedInputException " + msg + + " at " + toString(off)) { + } + + MalformedInputException(const MalformedInputException& other + ): ParseError(other.what()) { + } + + virtual ~MalformedInputException() noexcept; + }; + + MalformedInputException::~MalformedInputException() noexcept { + // PASS + } + + uint64_t lzoDecompress(const char *inputAddress, + const char *inputLimit, + char *outputAddress, + char *outputLimit) { + // nothing compresses to nothing + if (inputAddress == inputLimit) { + return 0; + } + + // maximum offset in buffers to which it's safe to write long-at-a-time + char * const fastOutputLimit = outputLimit - SIZE_OF_LONG; + + // LZO can concat two blocks together so, decode until the input data is + // consumed + const char *input = inputAddress; + char *output = outputAddress; + while (input < inputLimit) { + // + // Note: For safety some of the code below may stop decoding early or + // skip decoding, because input is not available. This makes the code + // safe, and since LZO requires an explicit "stop" command, the decoder + // will still throw a exception. 
+ // + + bool firstCommand = true; + uint32_t lastLiteralLength = 0; + while (true) { + if (input >= inputLimit) { + throw MalformedInputException(input - inputAddress); + } + uint32_t command = *(input++) & 0xFF; + if (command == 0x11) { + break; + } + + // Commands are described using a bit pattern notation: + // 0: bit is not set + // 1: bit is set + // L: part of literal length + // P: part of match offset position + // M: part of match length + // ?: see documentation in command decoder + + int32_t matchLength; + int32_t matchOffset; + uint32_t literalLength; + if ((command & 0xf0) == 0) { + if (lastLiteralLength == 0) { + // 0b0000_LLLL (0bLLLL_LLLL)* + + // copy length :: fixed + // 0 + matchOffset = 0; + + // copy offset :: fixed + // 0 + matchLength = 0; + + // literal length - 3 :: variable bits :: valid range [4..] + // 3 + variableLength(command bits [0..3], 4) + literalLength = command & 0xf; + if (literalLength == 0) { + literalLength = 0xf; + + uint32_t nextByte = 0; + while (input < inputLimit && + (nextByte = *(input++) & 0xFF) == 0) { + literalLength += 0xff; + } + literalLength += nextByte; + } + literalLength += 3; + } else if (lastLiteralLength <= 3) { + // 0b0000_PPLL 0bPPPP_PPPP + + // copy length: fixed + // 3 + matchLength = 3; + + // copy offset :: 12 bits :: valid range [2048..3071] + // [0..1] from command [2..3] + // [2..9] from trailer [0..7] + // [10] unset + // [11] set + if (input >= inputLimit) { + throw MalformedInputException(input - inputAddress); + } + matchOffset = (command & 0xc) >> 2; + matchOffset |= (*(input++) & 0xFF) << 2; + matchOffset |= 0x800; + + // literal length :: 2 bits :: valid range [0..3] + // [0..1] from command [0..1] + literalLength = (command & 0x3); + } else { + // 0b0000_PPLL 0bPPPP_PPPP + + // copy length :: fixed + // 2 + matchLength = 2; + + // copy offset :: 10 bits :: valid range [0..1023] + // [0..1] from command [2..3] + // [2..9] from trailer [0..7] + if (input >= inputLimit) { + throw MalformedInputException(input - inputAddress); + } + matchOffset = (command & 0xc) >> 2; + matchOffset |= (*(input++) & 0xFF) << 2; + + // literal length :: 2 bits :: valid range [0..3] + // [0..1] from command [0..1] + literalLength = (command & 0x3); + } + } else if (firstCommand) { + // first command has special handling when high nibble is set + matchLength = 0; + matchOffset = 0; + literalLength = command - 17; + } else if ((command & 0xf0) == 0x10) { + // 0b0001_?MMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL + + // copy length - 2 :: variable bits :: valid range [3..] 
+ // 2 + variableLength(command bits [0..2], 3) + matchLength = command & 0x7; + if (matchLength == 0) { + matchLength = 0x7; + + int32_t nextByte = 0; + while (input < inputLimit && + (nextByte = *(input++) & 0xFF) == 0) { + matchLength += 0xff; + } + matchLength += nextByte; + } + matchLength += 2; + + // read trailer + if (input + SIZE_OF_SHORT > inputLimit) { + throw MalformedInputException(input - inputAddress); + } + uint32_t trailer = *reinterpret_cast<const uint16_t*>(input) & 0xFFFF; + input += SIZE_OF_SHORT; + + // copy offset :: 16 bits :: valid range [32767..49151] + // [0..13] from trailer [2..15] + // [14] if command bit [3] unset + // [15] if command bit [3] set + matchOffset = trailer >> 2; + if ((command & 0x8) == 0) { + matchOffset |= 0x4000; + } else { + matchOffset |= 0x8000; + } + matchOffset--; + + // literal length :: 2 bits :: valid range [0..3] + // [0..1] from trailer [0..1] + literalLength = trailer & 0x3; + } else if ((command & 0xe0) == 0x20) { + // 0b001M_MMMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL + + // copy length - 2 :: variable bits :: valid range [3..] + // 2 + variableLength(command bits [0..4], 5) + matchLength = command & 0x1f; + if (matchLength == 0) { + matchLength = 0x1f; + + int nextByte = 0; + while (input < inputLimit && + (nextByte = *(input++) & 0xFF) == 0) { + matchLength += 0xff; + } + matchLength += nextByte; + } + matchLength += 2; + + // read trailer + if (input + SIZE_OF_SHORT > inputLimit) { + throw MalformedInputException(input - inputAddress); + } + int32_t trailer = *reinterpret_cast<const int16_t*>(input) & 0xFFFF; + input += SIZE_OF_SHORT; + + // copy offset :: 14 bits :: valid range [0..16383] + // [0..13] from trailer [2..15] + matchOffset = trailer >> 2; + + // literal length :: 2 bits :: valid range [0..3] + // [0..1] from trailer [0..1] + literalLength = trailer & 0x3; + } else if ((command & 0xc0) != 0) { + // 0bMMMP_PPLL 0bPPPP_PPPP + + // copy length - 1 :: 3 bits :: valid range [1..8] + // [0..2] from command [5..7] + // add 1 + matchLength = (command & 0xe0) >> 5; + matchLength += 1; + + // copy offset :: 11 bits :: valid range [0..4095] + // [0..2] from command [2..4] + // [3..10] from trailer [0..7] + if (input >= inputLimit) { + throw MalformedInputException(input - inputAddress); + } + matchOffset = (command & 0x1c) >> 2; + matchOffset |= (*(input++) & 0xFF) << 3; + + // literal length :: 2 bits :: valid range [0..3] + // [0..1] from command [0..1] + literalLength = (command & 0x3); + } else { + throw MalformedInputException(input - inputAddress - 1, + "Invalid LZO command " + + toHex(command)); + } + firstCommand = false; + + // copy match + if (matchLength != 0) { + // lzo encodes match offset minus one + matchOffset++; + + char *matchAddress = output - matchOffset; + if (matchAddress < outputAddress || + output + matchLength > outputLimit) { + throw MalformedInputException(input - inputAddress); + } + char *matchOutputLimit = output + matchLength; + + if (output > fastOutputLimit) { + // slow match copy + while (output < matchOutputLimit) { + *(output++) = *(matchAddress++); + } + } else { + // copy repeated sequence + if (matchOffset < SIZE_OF_LONG) { + // 8 bytes apart so that we can copy long-at-a-time below + int32_t increment32 = DEC_32_TABLE[matchOffset]; + int32_t decrement64 = DEC_64_TABLE[matchOffset]; + + output[0] = *matchAddress; + output[1] = *(matchAddress + 1); + output[2] = *(matchAddress + 2); + output[3] = *(matchAddress + 3); + output += SIZE_OF_INT; + matchAddress += increment32; + + 
*reinterpret_cast<int32_t*>(output) = + *reinterpret_cast<int32_t*>(matchAddress); + output += SIZE_OF_INT; + matchAddress -= decrement64; + } else { + *reinterpret_cast<int64_t*>(output) = + *reinterpret_cast<int64_t*>(matchAddress); + matchAddress += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + + if (matchOutputLimit >= fastOutputLimit) { + if (matchOutputLimit > outputLimit) { + throw MalformedInputException(input - inputAddress); + } + + while (output < fastOutputLimit) { + *reinterpret_cast<int64_t*>(output) = + *reinterpret_cast<int64_t*>(matchAddress); + matchAddress += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + + while (output < matchOutputLimit) { + *(output++) = *(matchAddress++); + } + } else { + while (output < matchOutputLimit) { + *reinterpret_cast<int64_t*>(output) = + *reinterpret_cast<int64_t*>(matchAddress); + matchAddress += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + } + } + output = matchOutputLimit; // correction in case we over-copied + } + + // copy literal + char *literalOutputLimit = output + literalLength; + if (literalOutputLimit > fastOutputLimit || + input + literalLength > inputLimit - SIZE_OF_LONG) { + if (literalOutputLimit > outputLimit) { + throw MalformedInputException(input - inputAddress); + } + + // slow, precise copy + memcpy(output, input, literalLength); + input += literalLength; + output += literalLength; + } else { + // fast copy. We may over-copy but there's enough room in input + // and output to not overrun them + do { + *reinterpret_cast<int64_t*>(output) = + *reinterpret_cast<const int64_t*>(input); + input += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } while (output < literalOutputLimit); + // adjust index if we over-copied + input -= (output - literalOutputLimit); + output = literalOutputLimit; + } + lastLiteralLength = literalLength; + } + + if (input + SIZE_OF_SHORT > inputLimit && + *reinterpret_cast<const int16_t*>(input) != 0) { + throw MalformedInputException(input - inputAddress); + } + input += SIZE_OF_SHORT; + } + + return static_cast<uint64_t>(output - outputAddress); + } + +} diff --git a/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh b/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh index 9de8537dd8..32d8085174 100644 --- a/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh +++ b/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh @@ -1,42 +1,42 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_LZO_HH -#define ORC_LZO_HH - -#include "orc/OrcFile.hh" - -#include "Adaptor.hh" - -namespace orc { - - /** - * Decompress the bytes in to the output buffer. 
- * @param inputAddress the start of the input - * @param inputLimit one past the last byte of the input - * @param outputAddress the start of the output buffer - * @param outputLimit one past the last byte of the output buffer - * @result the number of bytes decompressed - */ - uint64_t lzoDecompress(const char *inputAddress, - const char *inputLimit, - char *outputAddress, - char *outputLimit); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_LZO_HH +#define ORC_LZO_HH + +#include "orc/OrcFile.hh" + +#include "Adaptor.hh" + +namespace orc { + + /** + * Decompress the bytes in to the output buffer. + * @param inputAddress the start of the input + * @param inputLimit one past the last byte of the input + * @param outputAddress the start of the output buffer + * @param outputLimit one past the last byte of the output buffer + * @result the number of bytes decompressed + */ + uint64_t lzoDecompress(const char *inputAddress, + const char *inputLimit, + char *outputAddress, + char *outputLimit); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/MemoryPool.cc b/contrib/libs/apache/orc/c++/src/MemoryPool.cc index ecfb295bae..178e9cc316 100644 --- a/contrib/libs/apache/orc/c++/src/MemoryPool.cc +++ b/contrib/libs/apache/orc/c++/src/MemoryPool.cc @@ -1,244 +1,244 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "orc/Int128.hh" -#include "orc/MemoryPool.hh" - -#include "Adaptor.hh" - -#include <cstdlib> -#include <iostream> -#include <string.h> - -namespace orc { - - MemoryPool::~MemoryPool() { - // PASS - } - - class MemoryPoolImpl: public MemoryPool { - public: - virtual ~MemoryPoolImpl() override; - - char* malloc(uint64_t size) override; - void free(char* p) override; - }; - - char* MemoryPoolImpl::malloc(uint64_t size) { - return static_cast<char*>(std::malloc(size)); - } - - void MemoryPoolImpl::free(char* p) { - std::free(p); - } - - MemoryPoolImpl::~MemoryPoolImpl() { - // PASS - } - - template <class T> - DataBuffer<T>::DataBuffer(MemoryPool& pool, - uint64_t newSize - ): memoryPool(pool), - buf(nullptr), - currentSize(0), - currentCapacity(0) { - resize(newSize); - } - - template <class T> - DataBuffer<T>::DataBuffer(DataBuffer<T>&& buffer - ) noexcept: - memoryPool(buffer.memoryPool), - buf(buffer.buf), - currentSize(buffer.currentSize), - currentCapacity(buffer.currentCapacity) { - buffer.buf = nullptr; - buffer.currentSize = 0; - buffer.currentCapacity = 0; - } - - template <class T> - DataBuffer<T>::~DataBuffer(){ - for(uint64_t i=currentSize; i > 0; --i) { - (buf + i - 1)->~T(); - } - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); - } - } - - template <class T> - void DataBuffer<T>::resize(uint64_t newSize) { - reserve(newSize); - if (currentSize > newSize) { - for(uint64_t i=currentSize; i > newSize; --i) { - (buf + i - 1)->~T(); - } - } else if (newSize > currentSize) { - for(uint64_t i=currentSize; i < newSize; ++i) { - new (buf + i) T(); - } - } - currentSize = newSize; - } - - template <class T> - void DataBuffer<T>::reserve(uint64_t newCapacity){ - if (newCapacity > currentCapacity || !buf) { - if (buf) { - T* buf_old = buf; - buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity)); - memcpy(buf, buf_old, sizeof(T) * currentSize); - memoryPool.free(reinterpret_cast<char*>(buf_old)); - } else { - buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity)); - } - currentCapacity = newCapacity; - } - } - - // Specializations for char - - template <> - DataBuffer<char>::~DataBuffer(){ - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); - } - } - - template <> - void DataBuffer<char>::resize(uint64_t newSize) { - reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, newSize - currentSize); - } - currentSize = newSize; - } - - // Specializations for char* - - template <> - DataBuffer<char*>::~DataBuffer(){ - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); - } - } - - template <> - void DataBuffer<char*>::resize(uint64_t newSize) { - reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(char*)); - } - currentSize = newSize; - } - - // Specializations for double - - template <> - DataBuffer<double>::~DataBuffer(){ - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); - } - } - - template <> - void DataBuffer<double>::resize(uint64_t newSize) { - reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(double)); - } - currentSize = newSize; - } - - // Specializations for int64_t - - template <> - DataBuffer<int64_t>::~DataBuffer(){ - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); - } - } - - template <> - void DataBuffer<int64_t>::resize(uint64_t newSize) { - reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - 
currentSize) * sizeof(int64_t)); - } - currentSize = newSize; - } - - // Specializations for uint64_t - - template <> - DataBuffer<uint64_t>::~DataBuffer(){ - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); - } - } - - template <> - void DataBuffer<uint64_t>::resize(uint64_t newSize) { - reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(uint64_t)); - } - currentSize = newSize; - } - - // Specializations for unsigned char - - template <> - DataBuffer<unsigned char>::~DataBuffer(){ - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); - } - } - - template <> - void DataBuffer<unsigned char>::resize(uint64_t newSize) { - reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, newSize - currentSize); - } - currentSize = newSize; - } - - #ifdef __clang__ - #pragma clang diagnostic ignored "-Wweak-template-vtables" - #endif - - template class DataBuffer<char>; - template class DataBuffer<char*>; - template class DataBuffer<double>; - template class DataBuffer<Int128>; - template class DataBuffer<int64_t>; - template class DataBuffer<uint64_t>; - template class DataBuffer<unsigned char>; - - #ifdef __clang__ - #pragma clang diagnostic ignored "-Wexit-time-destructors" - #endif - - MemoryPool* getDefaultPool() { - static MemoryPoolImpl internal; - return &internal; - } -} // namespace orc +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Int128.hh" +#include "orc/MemoryPool.hh" + +#include "Adaptor.hh" + +#include <cstdlib> +#include <iostream> +#include <string.h> + +namespace orc { + + MemoryPool::~MemoryPool() { + // PASS + } + + class MemoryPoolImpl: public MemoryPool { + public: + virtual ~MemoryPoolImpl() override; + + char* malloc(uint64_t size) override; + void free(char* p) override; + }; + + char* MemoryPoolImpl::malloc(uint64_t size) { + return static_cast<char*>(std::malloc(size)); + } + + void MemoryPoolImpl::free(char* p) { + std::free(p); + } + + MemoryPoolImpl::~MemoryPoolImpl() { + // PASS + } + + template <class T> + DataBuffer<T>::DataBuffer(MemoryPool& pool, + uint64_t newSize + ): memoryPool(pool), + buf(nullptr), + currentSize(0), + currentCapacity(0) { + resize(newSize); + } + + template <class T> + DataBuffer<T>::DataBuffer(DataBuffer<T>&& buffer + ) noexcept: + memoryPool(buffer.memoryPool), + buf(buffer.buf), + currentSize(buffer.currentSize), + currentCapacity(buffer.currentCapacity) { + buffer.buf = nullptr; + buffer.currentSize = 0; + buffer.currentCapacity = 0; + } + + template <class T> + DataBuffer<T>::~DataBuffer(){ + for(uint64_t i=currentSize; i > 0; --i) { + (buf + i - 1)->~T(); + } + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <class T> + void DataBuffer<T>::resize(uint64_t newSize) { + reserve(newSize); + if (currentSize > newSize) { + for(uint64_t i=currentSize; i > newSize; --i) { + (buf + i - 1)->~T(); + } + } else if (newSize > currentSize) { + for(uint64_t i=currentSize; i < newSize; ++i) { + new (buf + i) T(); + } + } + currentSize = newSize; + } + + template <class T> + void DataBuffer<T>::reserve(uint64_t newCapacity){ + if (newCapacity > currentCapacity || !buf) { + if (buf) { + T* buf_old = buf; + buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity)); + memcpy(buf, buf_old, sizeof(T) * currentSize); + memoryPool.free(reinterpret_cast<char*>(buf_old)); + } else { + buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity)); + } + currentCapacity = newCapacity; + } + } + + // Specializations for char + + template <> + DataBuffer<char>::~DataBuffer(){ + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<char>::resize(uint64_t newSize) { + reserve(newSize); + if (newSize > currentSize) { + memset(buf + currentSize, 0, newSize - currentSize); + } + currentSize = newSize; + } + + // Specializations for char* + + template <> + DataBuffer<char*>::~DataBuffer(){ + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<char*>::resize(uint64_t newSize) { + reserve(newSize); + if (newSize > currentSize) { + memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(char*)); + } + currentSize = newSize; + } + + // Specializations for double + + template <> + DataBuffer<double>::~DataBuffer(){ + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<double>::resize(uint64_t newSize) { + reserve(newSize); + if (newSize > currentSize) { + memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(double)); + } + currentSize = newSize; + } + + // Specializations for int64_t + + template <> + DataBuffer<int64_t>::~DataBuffer(){ + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<int64_t>::resize(uint64_t newSize) { + reserve(newSize); + if (newSize > currentSize) { + memset(buf + currentSize, 0, (newSize - 
currentSize) * sizeof(int64_t)); + } + currentSize = newSize; + } + + // Specializations for uint64_t + + template <> + DataBuffer<uint64_t>::~DataBuffer(){ + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<uint64_t>::resize(uint64_t newSize) { + reserve(newSize); + if (newSize > currentSize) { + memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(uint64_t)); + } + currentSize = newSize; + } + + // Specializations for unsigned char + + template <> + DataBuffer<unsigned char>::~DataBuffer(){ + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<unsigned char>::resize(uint64_t newSize) { + reserve(newSize); + if (newSize > currentSize) { + memset(buf + currentSize, 0, newSize - currentSize); + } + currentSize = newSize; + } + + #ifdef __clang__ + #pragma clang diagnostic ignored "-Wweak-template-vtables" + #endif + + template class DataBuffer<char>; + template class DataBuffer<char*>; + template class DataBuffer<double>; + template class DataBuffer<Int128>; + template class DataBuffer<int64_t>; + template class DataBuffer<uint64_t>; + template class DataBuffer<unsigned char>; + + #ifdef __clang__ + #pragma clang diagnostic ignored "-Wexit-time-destructors" + #endif + + MemoryPool* getDefaultPool() { + static MemoryPoolImpl internal; + return &internal; + } +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/Murmur3.cc b/contrib/libs/apache/orc/c++/src/Murmur3.cc index b45bd6d492..63cf797a04 100644 --- a/contrib/libs/apache/orc/c++/src/Murmur3.cc +++ b/contrib/libs/apache/orc/c++/src/Murmur3.cc @@ -1,98 +1,98 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "Adaptor.hh" -#include "Murmur3.hh" - -#define ROTL64(x, r) ((x << r) | (x >> (64 - r))) - -namespace orc { - - inline uint64_t rotl64 ( uint64_t x, int8_t r ) { - return (x << r) | (x >> (64 - r)); - } - - inline uint64_t Murmur3::fmix64(uint64_t value) { - value ^= (value >> 33); - value *= 0xff51afd7ed558ccdL; - value ^= (value >> 33); - value *= 0xc4ceb9fe1a85ec53L; - value ^= (value >> 33); - return value; - } - - uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len) { - return hash64(data, len, DEFAULT_SEED); - } - - DIAGNOSTIC_PUSH - -#if defined(__clang__) - DIAGNOSTIC_IGNORE("-Wimplicit-fallthrough") -#endif - - uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len, uint32_t seed) { - uint64_t h = seed; - uint32_t blocks = len >> 3; - - const uint64_t* src = reinterpret_cast<const uint64_t*>(data); - uint64_t c1 = 0x87c37b91114253d5L; - uint64_t c2 = 0x4cf5ad432745937fL; - for (uint32_t i = 0; i < blocks; i++) { - uint64_t k = src[i]; - k *= c1; - k = ROTL64(k, 31); - k *= c2; - - h ^= k; - h = ROTL64(h, 27); - h = h * 5 + 0x52dce729; - } - - uint64_t k = 0; - uint32_t idx = blocks << 3; - switch (len - idx) { - case 7: - k ^= static_cast<uint64_t>(data[idx + 6]) << 48; - case 6: - k ^= static_cast<uint64_t>(data[idx + 5]) << 40; - case 5: - k ^= static_cast<uint64_t>(data[idx + 4]) << 32; - case 4: - k ^= static_cast<uint64_t>(data[idx + 3]) << 24; - case 3: - k ^= static_cast<uint64_t>(data[idx + 2]) << 16; - case 2: - k ^= static_cast<uint64_t>(data[idx + 1]) << 8; - case 1: - k ^= static_cast<uint64_t>(data[idx + 0]); - - k *= c1; - k = ROTL64(k, 31); - k *= c2; - h ^= k; - } - - h ^= len; - h = fmix64(h); - return h; - } - - DIAGNOSTIC_POP - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Adaptor.hh" +#include "Murmur3.hh" + +#define ROTL64(x, r) ((x << r) | (x >> (64 - r))) + +namespace orc { + + inline uint64_t rotl64 ( uint64_t x, int8_t r ) { + return (x << r) | (x >> (64 - r)); + } + + inline uint64_t Murmur3::fmix64(uint64_t value) { + value ^= (value >> 33); + value *= 0xff51afd7ed558ccdL; + value ^= (value >> 33); + value *= 0xc4ceb9fe1a85ec53L; + value ^= (value >> 33); + return value; + } + + uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len) { + return hash64(data, len, DEFAULT_SEED); + } + + DIAGNOSTIC_PUSH + +#if defined(__clang__) + DIAGNOSTIC_IGNORE("-Wimplicit-fallthrough") +#endif + + uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len, uint32_t seed) { + uint64_t h = seed; + uint32_t blocks = len >> 3; + + const uint64_t* src = reinterpret_cast<const uint64_t*>(data); + uint64_t c1 = 0x87c37b91114253d5L; + uint64_t c2 = 0x4cf5ad432745937fL; + for (uint32_t i = 0; i < blocks; i++) { + uint64_t k = src[i]; + k *= c1; + k = ROTL64(k, 31); + k *= c2; + + h ^= k; + h = ROTL64(h, 27); + h = h * 5 + 0x52dce729; + } + + uint64_t k = 0; + uint32_t idx = blocks << 3; + switch (len - idx) { + case 7: + k ^= static_cast<uint64_t>(data[idx + 6]) << 48; + case 6: + k ^= static_cast<uint64_t>(data[idx + 5]) << 40; + case 5: + k ^= static_cast<uint64_t>(data[idx + 4]) << 32; + case 4: + k ^= static_cast<uint64_t>(data[idx + 3]) << 24; + case 3: + k ^= static_cast<uint64_t>(data[idx + 2]) << 16; + case 2: + k ^= static_cast<uint64_t>(data[idx + 1]) << 8; + case 1: + k ^= static_cast<uint64_t>(data[idx + 0]); + + k *= c1; + k = ROTL64(k, 31); + k *= c2; + h ^= k; + } + + h ^= len; + h = fmix64(h); + return h; + } + + DIAGNOSTIC_POP + +} diff --git a/contrib/libs/apache/orc/c++/src/Murmur3.hh b/contrib/libs/apache/orc/c++/src/Murmur3.hh index 02391811b0..9cf1de138f 100644 --- a/contrib/libs/apache/orc/c++/src/Murmur3.hh +++ b/contrib/libs/apache/orc/c++/src/Murmur3.hh @@ -1,40 +1,40 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_MURMUR3_HH -#define ORC_MURMUR3_HH - -#include "orc/orc-config.hh" - -namespace orc { - - class Murmur3 { - public: - static const uint32_t DEFAULT_SEED = 104729; - static const uint64_t NULL_HASHCODE = 2862933555777941757LL; - - static uint64_t hash64(const uint8_t *data, uint32_t len); - - private: - static uint64_t fmix64(uint64_t value); - static uint64_t hash64(const uint8_t* data, uint32_t len, uint32_t seed); - }; - -} - -#endif //ORC_MURMUR3_HH +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_MURMUR3_HH +#define ORC_MURMUR3_HH + +#include "orc/orc-config.hh" + +namespace orc { + + class Murmur3 { + public: + static const uint32_t DEFAULT_SEED = 104729; + static const uint64_t NULL_HASHCODE = 2862933555777941757LL; + + static uint64_t hash64(const uint8_t *data, uint32_t len); + + private: + static uint64_t fmix64(uint64_t value); + static uint64_t hash64(const uint8_t* data, uint32_t len, uint32_t seed); + }; + +} + +#endif //ORC_MURMUR3_HH diff --git a/contrib/libs/apache/orc/c++/src/Options.hh b/contrib/libs/apache/orc/c++/src/Options.hh index 795e166138..ee9982cdc2 100644 --- a/contrib/libs/apache/orc/c++/src/Options.hh +++ b/contrib/libs/apache/orc/c++/src/Options.hh @@ -1,258 +1,258 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_OPTIONS_HH -#define ORC_OPTIONS_HH - -#include "orc/Int128.hh" -#include "orc/OrcFile.hh" -#include "orc/Reader.hh" - -#include <limits> - -namespace orc { - - enum ColumnSelection { - ColumnSelection_NONE = 0, - ColumnSelection_NAMES = 1, - ColumnSelection_FIELD_IDS = 2, - ColumnSelection_TYPE_IDS = 3, - }; - -/** - * ReaderOptions Implementation - */ - struct ReaderOptionsPrivate { - uint64_t tailLocation; - std::ostream* errorStream; - MemoryPool* memoryPool; - std::string serializedTail; - - ReaderOptionsPrivate() { - tailLocation = std::numeric_limits<uint64_t>::max(); - errorStream = &std::cerr; - memoryPool = getDefaultPool(); - } - }; - - ReaderOptions::ReaderOptions(): - privateBits(std::unique_ptr<ReaderOptionsPrivate> - (new ReaderOptionsPrivate())) { - // PASS - } - - ReaderOptions::ReaderOptions(const ReaderOptions& rhs): - privateBits(std::unique_ptr<ReaderOptionsPrivate> - (new ReaderOptionsPrivate(*(rhs.privateBits.get())))) { - // PASS - } - - ReaderOptions::ReaderOptions(ReaderOptions& rhs) { - // swap privateBits with rhs - ReaderOptionsPrivate* l = privateBits.release(); - privateBits.reset(rhs.privateBits.release()); - rhs.privateBits.reset(l); - } - - ReaderOptions& ReaderOptions::operator=(const ReaderOptions& rhs) { - if (this != &rhs) { - privateBits.reset(new ReaderOptionsPrivate(*(rhs.privateBits.get()))); - } - return *this; - } - - ReaderOptions::~ReaderOptions() { - // PASS - } - - ReaderOptions& ReaderOptions::setMemoryPool(MemoryPool& pool) { - privateBits->memoryPool = &pool; - return *this; - } - - MemoryPool* ReaderOptions::getMemoryPool() const{ - return privateBits->memoryPool; - } - - ReaderOptions& ReaderOptions::setTailLocation(uint64_t offset) { - privateBits->tailLocation = offset; - return *this; - } - - uint64_t ReaderOptions::getTailLocation() const { - return privateBits->tailLocation; - } - - ReaderOptions& ReaderOptions::setSerializedFileTail(const std::string& value - ) { - privateBits->serializedTail = value; - return *this; - } - - std::string ReaderOptions::getSerializedFileTail() const { - return privateBits->serializedTail; - } - - ReaderOptions& ReaderOptions::setErrorStream(std::ostream& stream) { - privateBits->errorStream = &stream; - return *this; - } - - std::ostream* ReaderOptions::getErrorStream() const { - return privateBits->errorStream; - } - -/** - * RowReaderOptions Implementation - */ - - struct RowReaderOptionsPrivate { - ColumnSelection selection; - std::list<uint64_t> includedColumnIndexes; - std::list<std::string> includedColumnNames; - uint64_t dataStart; - uint64_t dataLength; - bool throwOnHive11DecimalOverflow; - int32_t forcedScaleOnHive11Decimal; - bool enableLazyDecoding; - - RowReaderOptionsPrivate() { - selection = ColumnSelection_NONE; - dataStart = 0; - dataLength = std::numeric_limits<uint64_t>::max(); - throwOnHive11DecimalOverflow = true; - forcedScaleOnHive11Decimal = 6; - enableLazyDecoding = false; - } - }; - - RowReaderOptions::RowReaderOptions(): - privateBits(std::unique_ptr<RowReaderOptionsPrivate> - (new RowReaderOptionsPrivate())) { - // PASS - } - - RowReaderOptions::RowReaderOptions(const RowReaderOptions& rhs): - privateBits(std::unique_ptr<RowReaderOptionsPrivate> - (new RowReaderOptionsPrivate(*(rhs.privateBits.get())))) { - // PASS - } - - RowReaderOptions::RowReaderOptions(RowReaderOptions& rhs) { - // swap privateBits with rhs - RowReaderOptionsPrivate* l = privateBits.release(); - privateBits.reset(rhs.privateBits.release()); - rhs.privateBits.reset(l); - } - - 
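The constructor just above transfers ownership of the private implementation by releasing and resetting the two unique_ptr members rather than copying the struct. A minimal, self-contained sketch of that release/reset swap idiom, using hypothetical Widget/WidgetImpl names that are not part of the ORC sources:

#include <memory>
#include <string>

struct WidgetImpl {
  std::string name;
};

class Widget {
 public:
  Widget(): impl(new WidgetImpl()) {}

  // Swap-style "move": take over rhs's implementation and hand rhs the
  // freshly created one, so both objects remain in a valid state.
  Widget(Widget& rhs): impl(new WidgetImpl()) {
    WidgetImpl* mine = impl.release();
    impl.reset(rhs.impl.release());
    rhs.impl.reset(mine);
  }

 private:
  std::unique_ptr<WidgetImpl> impl;
};
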
RowReaderOptions& RowReaderOptions::operator=(const RowReaderOptions& rhs) { - if (this != &rhs) { - privateBits.reset(new RowReaderOptionsPrivate(*(rhs.privateBits.get()))); - } - return *this; - } - - RowReaderOptions::~RowReaderOptions() { - // PASS - } - - RowReaderOptions& RowReaderOptions::include(const std::list<uint64_t>& include) { - privateBits->selection = ColumnSelection_FIELD_IDS; - privateBits->includedColumnIndexes.assign(include.begin(), include.end()); - privateBits->includedColumnNames.clear(); - return *this; - } - - RowReaderOptions& RowReaderOptions::include(const std::list<std::string>& include) { - privateBits->selection = ColumnSelection_NAMES; - privateBits->includedColumnNames.assign(include.begin(), include.end()); - privateBits->includedColumnIndexes.clear(); - return *this; - } - - RowReaderOptions& RowReaderOptions::includeTypes(const std::list<uint64_t>& types) { - privateBits->selection = ColumnSelection_TYPE_IDS; - privateBits->includedColumnIndexes.assign(types.begin(), types.end()); - privateBits->includedColumnNames.clear(); - return *this; - } - - RowReaderOptions& RowReaderOptions::range(uint64_t offset, uint64_t length) { - privateBits->dataStart = offset; - privateBits->dataLength = length; - return *this; - } - - bool RowReaderOptions::getIndexesSet() const { - return privateBits->selection == ColumnSelection_FIELD_IDS; - } - - bool RowReaderOptions::getTypeIdsSet() const { - return privateBits->selection == ColumnSelection_TYPE_IDS; - } - - const std::list<uint64_t>& RowReaderOptions::getInclude() const { - return privateBits->includedColumnIndexes; - } - - bool RowReaderOptions::getNamesSet() const { - return privateBits->selection == ColumnSelection_NAMES; - } - - const std::list<std::string>& RowReaderOptions::getIncludeNames() const { - return privateBits->includedColumnNames; - } - - uint64_t RowReaderOptions::getOffset() const { - return privateBits->dataStart; - } - - uint64_t RowReaderOptions::getLength() const { - return privateBits->dataLength; - } - - RowReaderOptions& RowReaderOptions::throwOnHive11DecimalOverflow(bool shouldThrow){ - privateBits->throwOnHive11DecimalOverflow = shouldThrow; - return *this; - } - - bool RowReaderOptions::getThrowOnHive11DecimalOverflow() const { - return privateBits->throwOnHive11DecimalOverflow; - } - - RowReaderOptions& RowReaderOptions::forcedScaleOnHive11Decimal(int32_t forcedScale - ) { - privateBits->forcedScaleOnHive11Decimal = forcedScale; - return *this; - } - - int32_t RowReaderOptions::getForcedScaleOnHive11Decimal() const { - return privateBits->forcedScaleOnHive11Decimal; - } - - bool RowReaderOptions::getEnableLazyDecoding() const { - return privateBits->enableLazyDecoding; - } - - RowReaderOptions& RowReaderOptions::setEnableLazyDecoding(bool enable) { - privateBits->enableLazyDecoding = enable; - return *this; - } -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_OPTIONS_HH +#define ORC_OPTIONS_HH + +#include "orc/Int128.hh" +#include "orc/OrcFile.hh" +#include "orc/Reader.hh" + +#include <limits> + +namespace orc { + + enum ColumnSelection { + ColumnSelection_NONE = 0, + ColumnSelection_NAMES = 1, + ColumnSelection_FIELD_IDS = 2, + ColumnSelection_TYPE_IDS = 3, + }; + +/** + * ReaderOptions Implementation + */ + struct ReaderOptionsPrivate { + uint64_t tailLocation; + std::ostream* errorStream; + MemoryPool* memoryPool; + std::string serializedTail; + + ReaderOptionsPrivate() { + tailLocation = std::numeric_limits<uint64_t>::max(); + errorStream = &std::cerr; + memoryPool = getDefaultPool(); + } + }; + + ReaderOptions::ReaderOptions(): + privateBits(std::unique_ptr<ReaderOptionsPrivate> + (new ReaderOptionsPrivate())) { + // PASS + } + + ReaderOptions::ReaderOptions(const ReaderOptions& rhs): + privateBits(std::unique_ptr<ReaderOptionsPrivate> + (new ReaderOptionsPrivate(*(rhs.privateBits.get())))) { + // PASS + } + + ReaderOptions::ReaderOptions(ReaderOptions& rhs) { + // swap privateBits with rhs + ReaderOptionsPrivate* l = privateBits.release(); + privateBits.reset(rhs.privateBits.release()); + rhs.privateBits.reset(l); + } + + ReaderOptions& ReaderOptions::operator=(const ReaderOptions& rhs) { + if (this != &rhs) { + privateBits.reset(new ReaderOptionsPrivate(*(rhs.privateBits.get()))); + } + return *this; + } + + ReaderOptions::~ReaderOptions() { + // PASS + } + + ReaderOptions& ReaderOptions::setMemoryPool(MemoryPool& pool) { + privateBits->memoryPool = &pool; + return *this; + } + + MemoryPool* ReaderOptions::getMemoryPool() const{ + return privateBits->memoryPool; + } + + ReaderOptions& ReaderOptions::setTailLocation(uint64_t offset) { + privateBits->tailLocation = offset; + return *this; + } + + uint64_t ReaderOptions::getTailLocation() const { + return privateBits->tailLocation; + } + + ReaderOptions& ReaderOptions::setSerializedFileTail(const std::string& value + ) { + privateBits->serializedTail = value; + return *this; + } + + std::string ReaderOptions::getSerializedFileTail() const { + return privateBits->serializedTail; + } + + ReaderOptions& ReaderOptions::setErrorStream(std::ostream& stream) { + privateBits->errorStream = &stream; + return *this; + } + + std::ostream* ReaderOptions::getErrorStream() const { + return privateBits->errorStream; + } + +/** + * RowReaderOptions Implementation + */ + + struct RowReaderOptionsPrivate { + ColumnSelection selection; + std::list<uint64_t> includedColumnIndexes; + std::list<std::string> includedColumnNames; + uint64_t dataStart; + uint64_t dataLength; + bool throwOnHive11DecimalOverflow; + int32_t forcedScaleOnHive11Decimal; + bool enableLazyDecoding; + + RowReaderOptionsPrivate() { + selection = ColumnSelection_NONE; + dataStart = 0; + dataLength = std::numeric_limits<uint64_t>::max(); + throwOnHive11DecimalOverflow = true; + forcedScaleOnHive11Decimal = 6; + enableLazyDecoding = false; + } + }; + + RowReaderOptions::RowReaderOptions(): + privateBits(std::unique_ptr<RowReaderOptionsPrivate> + (new RowReaderOptionsPrivate())) { + // PASS + } + + 
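The option objects defined here are chainable: each setter returns a reference to the object itself. A hedged usage sketch that relies only on the setters and selection methods declared in this file plus getDefaultPool() from MemoryPool.cc (the column names and the 128 KiB range are illustrative assumptions):

#include "orc/MemoryPool.hh"
#include "orc/Reader.hh"   // assumed to declare ReaderOptions/RowReaderOptions, as included above

#include <list>
#include <ostream>
#include <string>

// Configure a reader: choose the memory pool and error stream, then select
// columns by name and restrict the byte range that will be scanned.
void configureOptions(std::ostream& errs) {
  orc::ReaderOptions readerOpts;
  readerOpts.setMemoryPool(*orc::getDefaultPool())
            .setErrorStream(errs);

  orc::RowReaderOptions rowOpts;
  std::list<std::string> columns = {"id", "name"};   // hypothetical column names
  rowOpts.include(columns)        // switches selection to ColumnSelection_NAMES
         .range(0, 128 * 1024);   // only read data that starts in the first 128 KiB
}
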
RowReaderOptions::RowReaderOptions(const RowReaderOptions& rhs): + privateBits(std::unique_ptr<RowReaderOptionsPrivate> + (new RowReaderOptionsPrivate(*(rhs.privateBits.get())))) { + // PASS + } + + RowReaderOptions::RowReaderOptions(RowReaderOptions& rhs) { + // swap privateBits with rhs + RowReaderOptionsPrivate* l = privateBits.release(); + privateBits.reset(rhs.privateBits.release()); + rhs.privateBits.reset(l); + } + + RowReaderOptions& RowReaderOptions::operator=(const RowReaderOptions& rhs) { + if (this != &rhs) { + privateBits.reset(new RowReaderOptionsPrivate(*(rhs.privateBits.get()))); + } + return *this; + } + + RowReaderOptions::~RowReaderOptions() { + // PASS + } + + RowReaderOptions& RowReaderOptions::include(const std::list<uint64_t>& include) { + privateBits->selection = ColumnSelection_FIELD_IDS; + privateBits->includedColumnIndexes.assign(include.begin(), include.end()); + privateBits->includedColumnNames.clear(); + return *this; + } + + RowReaderOptions& RowReaderOptions::include(const std::list<std::string>& include) { + privateBits->selection = ColumnSelection_NAMES; + privateBits->includedColumnNames.assign(include.begin(), include.end()); + privateBits->includedColumnIndexes.clear(); + return *this; + } + + RowReaderOptions& RowReaderOptions::includeTypes(const std::list<uint64_t>& types) { + privateBits->selection = ColumnSelection_TYPE_IDS; + privateBits->includedColumnIndexes.assign(types.begin(), types.end()); + privateBits->includedColumnNames.clear(); + return *this; + } + + RowReaderOptions& RowReaderOptions::range(uint64_t offset, uint64_t length) { + privateBits->dataStart = offset; + privateBits->dataLength = length; + return *this; + } + + bool RowReaderOptions::getIndexesSet() const { + return privateBits->selection == ColumnSelection_FIELD_IDS; + } + + bool RowReaderOptions::getTypeIdsSet() const { + return privateBits->selection == ColumnSelection_TYPE_IDS; + } + + const std::list<uint64_t>& RowReaderOptions::getInclude() const { + return privateBits->includedColumnIndexes; + } + + bool RowReaderOptions::getNamesSet() const { + return privateBits->selection == ColumnSelection_NAMES; + } + + const std::list<std::string>& RowReaderOptions::getIncludeNames() const { + return privateBits->includedColumnNames; + } + + uint64_t RowReaderOptions::getOffset() const { + return privateBits->dataStart; + } + + uint64_t RowReaderOptions::getLength() const { + return privateBits->dataLength; + } + + RowReaderOptions& RowReaderOptions::throwOnHive11DecimalOverflow(bool shouldThrow){ + privateBits->throwOnHive11DecimalOverflow = shouldThrow; + return *this; + } + + bool RowReaderOptions::getThrowOnHive11DecimalOverflow() const { + return privateBits->throwOnHive11DecimalOverflow; + } + + RowReaderOptions& RowReaderOptions::forcedScaleOnHive11Decimal(int32_t forcedScale + ) { + privateBits->forcedScaleOnHive11Decimal = forcedScale; + return *this; + } + + int32_t RowReaderOptions::getForcedScaleOnHive11Decimal() const { + return privateBits->forcedScaleOnHive11Decimal; + } + + bool RowReaderOptions::getEnableLazyDecoding() const { + return privateBits->enableLazyDecoding; + } + + RowReaderOptions& RowReaderOptions::setEnableLazyDecoding(bool enable) { + privateBits->enableLazyDecoding = enable; + return *this; + } +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/OrcFile.cc b/contrib/libs/apache/orc/c++/src/OrcFile.cc index a0158bbadf..5856db692e 100644 --- a/contrib/libs/apache/orc/c++/src/OrcFile.cc +++ b/contrib/libs/apache/orc/c++/src/OrcFile.cc @@ -1,184 
+1,184 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Adaptor.hh" -#include "orc/OrcFile.hh" -#include "orc/Exceptions.hh" - -#include <errno.h> -#include <fcntl.h> -#include <stdio.h> -#include <sys/stat.h> -#include <string.h> - -#ifdef _MSC_VER -#include <io.h> -#define S_IRUSR _S_IREAD -#define S_IWUSR _S_IWRITE +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Adaptor.hh" +#include "orc/OrcFile.hh" +#include "orc/Exceptions.hh" + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <sys/stat.h> +#include <string.h> + +#ifdef _MSC_VER +#include <io.h> +#define S_IRUSR _S_IREAD +#define S_IWUSR _S_IWRITE #define stat _stat64 #define fstat _fstat64 -#else -#include <unistd.h> -#define O_BINARY 0 -#endif - -namespace orc { - - class FileInputStream : public InputStream { - private: - std::string filename; - int file; - uint64_t totalLength; - - public: - FileInputStream(std::string _filename) { - filename = _filename; - file = open(filename.c_str(), O_BINARY | O_RDONLY); - if (file == -1) { - throw ParseError("Can't open " + filename); - } - struct stat fileStat; - if (fstat(file, &fileStat) == -1) { - throw ParseError("Can't stat " + filename); - } - totalLength = static_cast<uint64_t>(fileStat.st_size); - } - - ~FileInputStream() override; - - uint64_t getLength() const override { - return totalLength; - } - - uint64_t getNaturalReadSize() const override { - return 128 * 1024; - } - - void read(void* buf, - uint64_t length, - uint64_t offset) override { - if (!buf) { - throw ParseError("Buffer is null"); - } - ssize_t bytesRead = pread(file, buf, length, static_cast<off_t>(offset)); - - if (bytesRead == -1) { - throw ParseError("Bad read of " + filename); - } - if (static_cast<uint64_t>(bytesRead) != length) { - throw ParseError("Short read of " + filename); - } - } - - const std::string& getName() const override { - return filename; - } - }; - - FileInputStream::~FileInputStream() { - close(file); - } - - std::unique_ptr<InputStream> readFile(const std::string& path) { -#ifdef BUILD_LIBHDFSPP - if(strncmp (path.c_str(), "hdfs://", 7) == 0){ - return orc::readHdfsFile(std::string(path)); - } else { -#endif - return orc::readLocalFile(std::string(path)); -#ifdef BUILD_LIBHDFSPP - } -#endif - } - - std::unique_ptr<InputStream> readLocalFile(const std::string& path) { - return std::unique_ptr<InputStream>(new FileInputStream(path)); - } - - OutputStream::~OutputStream() { - // PASS - }; - - class FileOutputStream : public OutputStream { - private: - std::string filename; - int file; - uint64_t bytesWritten; - bool closed; - - public: - FileOutputStream(std::string _filename) { - bytesWritten = 0; - filename = _filename; - closed = false; - file = open( - filename.c_str(), - O_BINARY | O_CREAT | O_WRONLY | O_TRUNC, - S_IRUSR | S_IWUSR); - if (file == -1) { - throw ParseError("Can't open " + filename); - } - } - - ~FileOutputStream() override; - - uint64_t getLength() const override { - return bytesWritten; - } - - uint64_t getNaturalWriteSize() const override { - return 128 * 1024; - } - - void write(const void* buf, size_t length) override { - if (closed) { - throw std::logic_error("Cannot write to closed stream."); - } - ssize_t bytesWrite = ::write(file, buf, length); - if (bytesWrite == -1) { - throw ParseError("Bad write of " + filename); - } - if (static_cast<uint64_t>(bytesWrite) != length) { - throw ParseError("Short write of " + filename); - } - bytesWritten += static_cast<uint64_t>(bytesWrite); - } - - const std::string& getName() const override { - return filename; - } - - void close() override { - if (!closed) { - ::close(file); - closed = true; - } - } - }; - - FileOutputStream::~FileOutputStream() { - if (!closed) { - ::close(file); - closed = true; - } - } - - std::unique_ptr<OutputStream> writeLocalFile(const std::string& path) { - return std::unique_ptr<OutputStream>(new FileOutputStream(path)); - } 
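The factory functions in this file are the usual entry points for local file I/O. A hedged round-trip sketch that uses only readLocalFile()/writeLocalFile() and the stream members shown here (the file path is a hypothetical example):

#include "orc/OrcFile.hh"

#include <memory>
#include <string>
#include <vector>

// Write a few bytes with writeLocalFile(), then read them back through
// readLocalFile() using the positional read(buf, length, offset) API.
void roundTrip() {
  const std::string path = "/tmp/orc-io-demo.bin";   // hypothetical path

  std::unique_ptr<orc::OutputStream> out = orc::writeLocalFile(path);
  const char payload[] = "hello orc";
  out->write(payload, sizeof(payload));
  out->close();

  std::unique_ptr<orc::InputStream> in = orc::readLocalFile(path);
  std::vector<char> buf(in->getLength());
  in->read(buf.data(), buf.size(), 0);
}
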
-} +#else +#include <unistd.h> +#define O_BINARY 0 +#endif + +namespace orc { + + class FileInputStream : public InputStream { + private: + std::string filename; + int file; + uint64_t totalLength; + + public: + FileInputStream(std::string _filename) { + filename = _filename; + file = open(filename.c_str(), O_BINARY | O_RDONLY); + if (file == -1) { + throw ParseError("Can't open " + filename); + } + struct stat fileStat; + if (fstat(file, &fileStat) == -1) { + throw ParseError("Can't stat " + filename); + } + totalLength = static_cast<uint64_t>(fileStat.st_size); + } + + ~FileInputStream() override; + + uint64_t getLength() const override { + return totalLength; + } + + uint64_t getNaturalReadSize() const override { + return 128 * 1024; + } + + void read(void* buf, + uint64_t length, + uint64_t offset) override { + if (!buf) { + throw ParseError("Buffer is null"); + } + ssize_t bytesRead = pread(file, buf, length, static_cast<off_t>(offset)); + + if (bytesRead == -1) { + throw ParseError("Bad read of " + filename); + } + if (static_cast<uint64_t>(bytesRead) != length) { + throw ParseError("Short read of " + filename); + } + } + + const std::string& getName() const override { + return filename; + } + }; + + FileInputStream::~FileInputStream() { + close(file); + } + + std::unique_ptr<InputStream> readFile(const std::string& path) { +#ifdef BUILD_LIBHDFSPP + if(strncmp (path.c_str(), "hdfs://", 7) == 0){ + return orc::readHdfsFile(std::string(path)); + } else { +#endif + return orc::readLocalFile(std::string(path)); +#ifdef BUILD_LIBHDFSPP + } +#endif + } + + std::unique_ptr<InputStream> readLocalFile(const std::string& path) { + return std::unique_ptr<InputStream>(new FileInputStream(path)); + } + + OutputStream::~OutputStream() { + // PASS + }; + + class FileOutputStream : public OutputStream { + private: + std::string filename; + int file; + uint64_t bytesWritten; + bool closed; + + public: + FileOutputStream(std::string _filename) { + bytesWritten = 0; + filename = _filename; + closed = false; + file = open( + filename.c_str(), + O_BINARY | O_CREAT | O_WRONLY | O_TRUNC, + S_IRUSR | S_IWUSR); + if (file == -1) { + throw ParseError("Can't open " + filename); + } + } + + ~FileOutputStream() override; + + uint64_t getLength() const override { + return bytesWritten; + } + + uint64_t getNaturalWriteSize() const override { + return 128 * 1024; + } + + void write(const void* buf, size_t length) override { + if (closed) { + throw std::logic_error("Cannot write to closed stream."); + } + ssize_t bytesWrite = ::write(file, buf, length); + if (bytesWrite == -1) { + throw ParseError("Bad write of " + filename); + } + if (static_cast<uint64_t>(bytesWrite) != length) { + throw ParseError("Short write of " + filename); + } + bytesWritten += static_cast<uint64_t>(bytesWrite); + } + + const std::string& getName() const override { + return filename; + } + + void close() override { + if (!closed) { + ::close(file); + closed = true; + } + } + }; + + FileOutputStream::~FileOutputStream() { + if (!closed) { + ::close(file); + closed = true; + } + } + + std::unique_ptr<OutputStream> writeLocalFile(const std::string& path) { + return std::unique_ptr<OutputStream>(new FileOutputStream(path)); + } +} diff --git a/contrib/libs/apache/orc/c++/src/RLE.cc b/contrib/libs/apache/orc/c++/src/RLE.cc index 21f9082216..ea0181deaf 100644 --- a/contrib/libs/apache/orc/c++/src/RLE.cc +++ b/contrib/libs/apache/orc/c++/src/RLE.cc @@ -1,121 +1,121 @@ -/** -* Licensed to the Apache Software Foundation (ASF) under one -* or more 
contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -#include "RLEv1.hh" -#include "RLEv2.hh" -#include "orc/Exceptions.hh" - -namespace orc { - - RleEncoder::~RleEncoder() { - // PASS - } - - RleDecoder::~RleDecoder() { - // PASS - } - - std::unique_ptr<RleEncoder> createRleEncoder - (std::unique_ptr<BufferedOutputStream> output, - bool isSigned, - RleVersion version, - MemoryPool&, - bool alignedBitpacking) { - switch (static_cast<int64_t>(version)) { - case RleVersion_1: - // We don't have std::make_unique() yet. - return std::unique_ptr<RleEncoder>(new RleEncoderV1(std::move(output), - isSigned)); - case RleVersion_2: - return std::unique_ptr<RleEncoder>(new RleEncoderV2(std::move(output), - isSigned, alignedBitpacking)); - default: - throw NotImplementedYet("Not implemented yet"); - } - } - - std::unique_ptr<RleDecoder> createRleDecoder - (std::unique_ptr<SeekableInputStream> input, - bool isSigned, - RleVersion version, - MemoryPool& pool) { - switch (static_cast<int64_t>(version)) { - case RleVersion_1: - // We don't have std::make_unique() yet. - return std::unique_ptr<RleDecoder>(new RleDecoderV1(std::move(input), - isSigned)); - case RleVersion_2: - return std::unique_ptr<RleDecoder>(new RleDecoderV2(std::move(input), - isSigned, pool)); - default: - throw NotImplementedYet("Not implemented yet"); - } - } - - void RleEncoder::add(const int64_t* data, uint64_t numValues, - const char* notNull) { - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - write(data[i]); - } - } - } - - void RleEncoder::writeVslong(int64_t val) { - writeVulong((val << 1) ^ (val >> 63)); - } - - void RleEncoder::writeVulong(int64_t val) { - while (true) { - if ((val & ~0x7f) == 0) { - writeByte(static_cast<char>(val)); - return; - } else { - writeByte(static_cast<char>(0x80 | (val & 0x7f))); - // cast val to unsigned so as to force 0-fill right shift - val = (static_cast<uint64_t>(val) >> 7); - } - } - } - - void RleEncoder::writeByte(char c) { - if (bufferPosition == bufferLength) { - int addedSize = 0; - if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) { - throw std::bad_alloc(); - } - bufferPosition = 0; - bufferLength = static_cast<size_t>(addedSize); - } - buffer[bufferPosition++] = c; - } - - void RleEncoder::recordPosition(PositionRecorder* recorder) const { - uint64_t flushedSize = outputStream->getSize(); - uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition); - if (outputStream->isCompressed()) { - recorder->add(flushedSize); - recorder->add(unflushedSize); - } else { - flushedSize -= static_cast<uint64_t>(bufferLength); - recorder->add(flushedSize + unflushedSize); - } - recorder->add(static_cast<uint64_t>(numLiterals)); - } - -} // namespace orc +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. 
See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include "RLEv1.hh" +#include "RLEv2.hh" +#include "orc/Exceptions.hh" + +namespace orc { + + RleEncoder::~RleEncoder() { + // PASS + } + + RleDecoder::~RleDecoder() { + // PASS + } + + std::unique_ptr<RleEncoder> createRleEncoder + (std::unique_ptr<BufferedOutputStream> output, + bool isSigned, + RleVersion version, + MemoryPool&, + bool alignedBitpacking) { + switch (static_cast<int64_t>(version)) { + case RleVersion_1: + // We don't have std::make_unique() yet. + return std::unique_ptr<RleEncoder>(new RleEncoderV1(std::move(output), + isSigned)); + case RleVersion_2: + return std::unique_ptr<RleEncoder>(new RleEncoderV2(std::move(output), + isSigned, alignedBitpacking)); + default: + throw NotImplementedYet("Not implemented yet"); + } + } + + std::unique_ptr<RleDecoder> createRleDecoder + (std::unique_ptr<SeekableInputStream> input, + bool isSigned, + RleVersion version, + MemoryPool& pool) { + switch (static_cast<int64_t>(version)) { + case RleVersion_1: + // We don't have std::make_unique() yet. + return std::unique_ptr<RleDecoder>(new RleDecoderV1(std::move(input), + isSigned)); + case RleVersion_2: + return std::unique_ptr<RleDecoder>(new RleDecoderV2(std::move(input), + isSigned, pool)); + default: + throw NotImplementedYet("Not implemented yet"); + } + } + + void RleEncoder::add(const int64_t* data, uint64_t numValues, + const char* notNull) { + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + write(data[i]); + } + } + } + + void RleEncoder::writeVslong(int64_t val) { + writeVulong((val << 1) ^ (val >> 63)); + } + + void RleEncoder::writeVulong(int64_t val) { + while (true) { + if ((val & ~0x7f) == 0) { + writeByte(static_cast<char>(val)); + return; + } else { + writeByte(static_cast<char>(0x80 | (val & 0x7f))); + // cast val to unsigned so as to force 0-fill right shift + val = (static_cast<uint64_t>(val) >> 7); + } + } + } + + void RleEncoder::writeByte(char c) { + if (bufferPosition == bufferLength) { + int addedSize = 0; + if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) { + throw std::bad_alloc(); + } + bufferPosition = 0; + bufferLength = static_cast<size_t>(addedSize); + } + buffer[bufferPosition++] = c; + } + + void RleEncoder::recordPosition(PositionRecorder* recorder) const { + uint64_t flushedSize = outputStream->getSize(); + uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition); + if (outputStream->isCompressed()) { + recorder->add(flushedSize); + recorder->add(unflushedSize); + } else { + flushedSize -= static_cast<uint64_t>(bufferLength); + recorder->add(flushedSize + unflushedSize); + } + recorder->add(static_cast<uint64_t>(numLiterals)); + } + +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/RLE.hh b/contrib/libs/apache/orc/c++/src/RLE.hh index 6822bd812e..ec0330559e 100644 --- 
a/contrib/libs/apache/orc/c++/src/RLE.hh +++ b/contrib/libs/apache/orc/c++/src/RLE.hh @@ -1,155 +1,155 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_RLE_HH -#define ORC_RLE_HH - -#include "io/InputStream.hh" -#include "io/OutputStream.hh" - -#include <memory> - -namespace orc { - - inline int64_t zigZag(int64_t value) { - return (value << 1) ^ (value >> 63); - } - - inline int64_t unZigZag(uint64_t value) { - return value >> 1 ^ -(value & 1); - } - - class RleEncoder { - public: - // must be non-inline! - virtual ~RleEncoder(); - - RleEncoder( - std::unique_ptr<BufferedOutputStream> outStream, - bool hasSigned): - outputStream(std::move(outStream)), - bufferPosition(0), - bufferLength(0), - numLiterals(0), - isSigned(hasSigned), - buffer(nullptr){ - //pass - } - - /** - * Encode the next batch of values. - * @param data the array to read from - * @param numValues the number of values to write - * @param notNull If the pointer is null, all values are read. If the - * pointer is not null, positions that are false are skipped. - */ - virtual void add(const int64_t* data, uint64_t numValues, - const char* notNull); - - /** - * Get size of buffer used so far. - */ - uint64_t getBufferSize() const { - return outputStream->getSize(); - } - - /** - * Flushing underlying BufferedOutputStream - */ - virtual uint64_t flush() = 0; - - /** - * record current position - * @param recorder use the recorder to record current positions - */ - virtual void recordPosition(PositionRecorder* recorder) const; - - virtual void write(int64_t val) = 0; - - protected: - std::unique_ptr<BufferedOutputStream> outputStream; - size_t bufferPosition; - size_t bufferLength; - size_t numLiterals; - int64_t* literals; - bool isSigned; - char* buffer; - - virtual void writeByte(char c); - - virtual void writeVulong(int64_t val); - - virtual void writeVslong(int64_t val); - }; - - class RleDecoder { - public: - // must be non-inline! - virtual ~RleDecoder(); - - /** - * Seek to a particular spot. - */ - virtual void seek(PositionProvider&) = 0; - - /** - * Seek over a given number of values. - */ - virtual void skip(uint64_t numValues) = 0; - - /** - * Read a number of values into the batch. - * @param data the array to read into - * @param numValues the number of values to read - * @param notNull If the pointer is null, all values are read. If the - * pointer is not null, positions that are false are skipped. - */ - virtual void next(int64_t* data, uint64_t numValues, - const char* notNull) = 0; - }; - - /** - * Create an RLE encoder. 
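
As an aside on the integer encodings used by both RLE versions, the sketch below restates the zigZag/unZigZag helpers from RLE.hh and the base-128 byte layout produced by RleEncoder::writeVulong as a self-contained program (illustration only; the helpers are copied here rather than included).

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Copies of the inline helpers from RLE.hh (restated for illustration).
    static int64_t zigZag(int64_t value)    { return (value << 1) ^ (value >> 63); }
    static int64_t unZigZag(uint64_t value) { return value >> 1 ^ -(value & 1); }

    // Same byte layout as RleEncoder::writeVulong: 7 payload bits per byte,
    // high bit set on every byte except the last.
    static void writeVulong(std::vector<uint8_t>& out, uint64_t val) {
      while (val & ~UINT64_C(0x7f)) {
        out.push_back(static_cast<uint8_t>(0x80 | (val & 0x7f)));
        val >>= 7;
      }
      out.push_back(static_cast<uint8_t>(val));
    }

    int main() {
      // Zigzag maps small negative numbers to small unsigned codes: -3 -> 5.
      assert(zigZag(-3) == 5);
      assert(unZigZag(static_cast<uint64_t>(zigZag(-3))) == -3);

      // 300 = 0b10'0101100 needs two varint bytes: 0xac (low 7 bits + flag), 0x02.
      std::vector<uint8_t> buf;
      writeVulong(buf, 300);
      assert(buf.size() == 2 && buf[0] == 0xac && buf[1] == 0x02);
      return 0;
    }
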
- * @param output the output stream to write to - * @param isSigned true if the number sequence is signed - * @param version version of RLE decoding to do - * @param pool memory pool to use for allocation - */ - std::unique_ptr<RleEncoder> createRleEncoder - (std::unique_ptr<BufferedOutputStream> output, - bool isSigned, - RleVersion version, - MemoryPool& pool, - bool alignedBitpacking); - - /** - * Create an RLE decoder. - * @param input the input stream to read from - * @param isSigned true if the number sequence is signed - * @param version version of RLE decoding to do - * @param pool memory pool to use for allocation - */ - std::unique_ptr<RleDecoder> createRleDecoder - (std::unique_ptr<SeekableInputStream> input, - bool isSigned, - RleVersion version, - MemoryPool& pool); - -} // namespace orc - -#endif // ORC_RLE_HH +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_RLE_HH +#define ORC_RLE_HH + +#include "io/InputStream.hh" +#include "io/OutputStream.hh" + +#include <memory> + +namespace orc { + + inline int64_t zigZag(int64_t value) { + return (value << 1) ^ (value >> 63); + } + + inline int64_t unZigZag(uint64_t value) { + return value >> 1 ^ -(value & 1); + } + + class RleEncoder { + public: + // must be non-inline! + virtual ~RleEncoder(); + + RleEncoder( + std::unique_ptr<BufferedOutputStream> outStream, + bool hasSigned): + outputStream(std::move(outStream)), + bufferPosition(0), + bufferLength(0), + numLiterals(0), + isSigned(hasSigned), + buffer(nullptr){ + //pass + } + + /** + * Encode the next batch of values. + * @param data the array to read from + * @param numValues the number of values to write + * @param notNull If the pointer is null, all values are read. If the + * pointer is not null, positions that are false are skipped. + */ + virtual void add(const int64_t* data, uint64_t numValues, + const char* notNull); + + /** + * Get size of buffer used so far. + */ + uint64_t getBufferSize() const { + return outputStream->getSize(); + } + + /** + * Flushing underlying BufferedOutputStream + */ + virtual uint64_t flush() = 0; + + /** + * record current position + * @param recorder use the recorder to record current positions + */ + virtual void recordPosition(PositionRecorder* recorder) const; + + virtual void write(int64_t val) = 0; + + protected: + std::unique_ptr<BufferedOutputStream> outputStream; + size_t bufferPosition; + size_t bufferLength; + size_t numLiterals; + int64_t* literals; + bool isSigned; + char* buffer; + + virtual void writeByte(char c); + + virtual void writeVulong(int64_t val); + + virtual void writeVslong(int64_t val); + }; + + class RleDecoder { + public: + // must be non-inline! + virtual ~RleDecoder(); + + /** + * Seek to a particular spot. 
+ */ + virtual void seek(PositionProvider&) = 0; + + /** + * Seek over a given number of values. + */ + virtual void skip(uint64_t numValues) = 0; + + /** + * Read a number of values into the batch. + * @param data the array to read into + * @param numValues the number of values to read + * @param notNull If the pointer is null, all values are read. If the + * pointer is not null, positions that are false are skipped. + */ + virtual void next(int64_t* data, uint64_t numValues, + const char* notNull) = 0; + }; + + /** + * Create an RLE encoder. + * @param output the output stream to write to + * @param isSigned true if the number sequence is signed + * @param version version of RLE decoding to do + * @param pool memory pool to use for allocation + */ + std::unique_ptr<RleEncoder> createRleEncoder + (std::unique_ptr<BufferedOutputStream> output, + bool isSigned, + RleVersion version, + MemoryPool& pool, + bool alignedBitpacking); + + /** + * Create an RLE decoder. + * @param input the input stream to read from + * @param isSigned true if the number sequence is signed + * @param version version of RLE decoding to do + * @param pool memory pool to use for allocation + */ + std::unique_ptr<RleDecoder> createRleDecoder + (std::unique_ptr<SeekableInputStream> input, + bool isSigned, + RleVersion version, + MemoryPool& pool); + +} // namespace orc + +#endif // ORC_RLE_HH diff --git a/contrib/libs/apache/orc/c++/src/RLEV2Util.cc b/contrib/libs/apache/orc/c++/src/RLEV2Util.cc index 12e2d057cd..20fc0931ef 100644 --- a/contrib/libs/apache/orc/c++/src/RLEV2Util.cc +++ b/contrib/libs/apache/orc/c++/src/RLEV2Util.cc @@ -1,70 +1,70 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with option work for additional information - * regarding copyright ownership. The ASF licenses option file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use option file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "RLEV2Util.hh" - -namespace orc { - - // Map FBS enum to bit width value. - const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 26, 28, 30, 32, 40, 48, 56, 64 - }; - - // Map bit length i to closest fixed bit width that can contain i bits. - const uint8_t ClosestFixedBitsMap[65] = { - 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 26, 26, 28, 28, 30, 30, 32, 32, - 40, 40, 40, 40, 40, 40, 40, 40, - 48, 48, 48, 48, 48, 48, 48, 48, - 56, 56, 56, 56, 56, 56, 56, 56, - 64, 64, 64, 64, 64, 64, 64, 64 - }; - - // Map bit length i to closest aligned fixed bit width that can contain i bits. 
- const uint8_t ClosestAlignedFixedBitsMap[65] = { - 1, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16, 24, 24, 24, 24, 24, 24, 24, 24, - 32, 32, 32, 32, 32, 32, 32, 32, - 40, 40, 40, 40, 40, 40, 40, 40, - 48, 48, 48, 48, 48, 48, 48, 48, - 56, 56, 56, 56, 56, 56, 56, 56, - 64, 64, 64, 64, 64, 64, 64, 64 - }; - - // Map bit width to FBS enum. - const uint8_t BitWidthToFBSMap[65] = { - FixedBitSizes::ONE, FixedBitSizes::ONE, FixedBitSizes::TWO, FixedBitSizes::THREE, FixedBitSizes::FOUR, - FixedBitSizes::FIVE, FixedBitSizes::SIX, FixedBitSizes::SEVEN, FixedBitSizes::EIGHT, - FixedBitSizes::NINE, FixedBitSizes::TEN, FixedBitSizes::ELEVEN, FixedBitSizes::TWELVE, - FixedBitSizes::THIRTEEN, FixedBitSizes::FOURTEEN, FixedBitSizes::FIFTEEN, FixedBitSizes::SIXTEEN, - FixedBitSizes::SEVENTEEN, FixedBitSizes::EIGHTEEN, FixedBitSizes::NINETEEN, FixedBitSizes::TWENTY, - FixedBitSizes::TWENTYONE, FixedBitSizes::TWENTYTWO, FixedBitSizes::TWENTYTHREE, FixedBitSizes::TWENTYFOUR, - FixedBitSizes::TWENTYSIX, FixedBitSizes::TWENTYSIX, - FixedBitSizes::TWENTYEIGHT, FixedBitSizes::TWENTYEIGHT, - FixedBitSizes::THIRTY, FixedBitSizes::THIRTY, - FixedBitSizes::THIRTYTWO, FixedBitSizes::THIRTYTWO, - FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, - FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, - FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, - FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, - FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, - FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, - FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, - FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR - }; -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with option work for additional information + * regarding copyright ownership. The ASF licenses option file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use option file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "RLEV2Util.hh" + +namespace orc { + + // Map FBS enum to bit width value. + const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 26, 28, 30, 32, 40, 48, 56, 64 + }; + + // Map bit length i to closest fixed bit width that can contain i bits. 
+ const uint8_t ClosestFixedBitsMap[65] = { + 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 26, 26, 28, 28, 30, 30, 32, 32, + 40, 40, 40, 40, 40, 40, 40, 40, + 48, 48, 48, 48, 48, 48, 48, 48, + 56, 56, 56, 56, 56, 56, 56, 56, + 64, 64, 64, 64, 64, 64, 64, 64 + }; + + // Map bit length i to closest aligned fixed bit width that can contain i bits. + const uint8_t ClosestAlignedFixedBitsMap[65] = { + 1, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16, 24, 24, 24, 24, 24, 24, 24, 24, + 32, 32, 32, 32, 32, 32, 32, 32, + 40, 40, 40, 40, 40, 40, 40, 40, + 48, 48, 48, 48, 48, 48, 48, 48, + 56, 56, 56, 56, 56, 56, 56, 56, + 64, 64, 64, 64, 64, 64, 64, 64 + }; + + // Map bit width to FBS enum. + const uint8_t BitWidthToFBSMap[65] = { + FixedBitSizes::ONE, FixedBitSizes::ONE, FixedBitSizes::TWO, FixedBitSizes::THREE, FixedBitSizes::FOUR, + FixedBitSizes::FIVE, FixedBitSizes::SIX, FixedBitSizes::SEVEN, FixedBitSizes::EIGHT, + FixedBitSizes::NINE, FixedBitSizes::TEN, FixedBitSizes::ELEVEN, FixedBitSizes::TWELVE, + FixedBitSizes::THIRTEEN, FixedBitSizes::FOURTEEN, FixedBitSizes::FIFTEEN, FixedBitSizes::SIXTEEN, + FixedBitSizes::SEVENTEEN, FixedBitSizes::EIGHTEEN, FixedBitSizes::NINETEEN, FixedBitSizes::TWENTY, + FixedBitSizes::TWENTYONE, FixedBitSizes::TWENTYTWO, FixedBitSizes::TWENTYTHREE, FixedBitSizes::TWENTYFOUR, + FixedBitSizes::TWENTYSIX, FixedBitSizes::TWENTYSIX, + FixedBitSizes::TWENTYEIGHT, FixedBitSizes::TWENTYEIGHT, + FixedBitSizes::THIRTY, FixedBitSizes::THIRTY, + FixedBitSizes::THIRTYTWO, FixedBitSizes::THIRTYTWO, + FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, + FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, + FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, + FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, + FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, + FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, + FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, + FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR + }; +} diff --git a/contrib/libs/apache/orc/c++/src/RLEV2Util.hh b/contrib/libs/apache/orc/c++/src/RLEV2Util.hh index 95a6826eaa..67a94c7c48 100644 --- a/contrib/libs/apache/orc/c++/src/RLEV2Util.hh +++ b/contrib/libs/apache/orc/c++/src/RLEV2Util.hh @@ -1,81 +1,81 @@ -/** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ - -#ifndef ORC_RLEV2UTIL_HH -#define ORC_RLEV2UTIL_HH - -#include "RLEv2.hh" - -namespace orc { - extern const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE]; - extern const uint8_t ClosestFixedBitsMap[65]; - extern const uint8_t ClosestAlignedFixedBitsMap[65]; - extern const uint8_t BitWidthToFBSMap[65]; - - // The input n must be less than FixedBitSizes::SIZE. - inline uint32_t decodeBitWidth(uint32_t n) { - return FBSToBitWidthMap[n]; - } - - inline uint32_t getClosestFixedBits(uint32_t n) { - if (n <= 64) { - return ClosestFixedBitsMap[n]; - } else { - return 64; - } - } - - inline uint32_t getClosestAlignedFixedBits(uint32_t n) { - if (n <= 64) { - return ClosestAlignedFixedBitsMap[n]; - } else { - return 64; - } - } - - inline uint32_t encodeBitWidth(uint32_t n) { - if (n <= 64) { - return BitWidthToFBSMap[n]; - } else { - return FixedBitSizes::SIXTYFOUR; - } - } - - inline uint32_t findClosestNumBits(int64_t value) { - if (value < 0) { - return getClosestFixedBits(64); - } - - uint32_t count = 0; - while (value != 0) { - count++; - value = value >> 1; - } - return getClosestFixedBits(count); - } - - inline bool isSafeSubtract(int64_t left, int64_t right) { - return ((left ^ right) >= 0) || ((left ^ (left - right)) >= 0); - } - - inline uint32_t RleEncoderV2::getOpCode(EncodingType encoding) { - return static_cast<uint32_t >(encoding << 6); - } -} - -#endif //ORC_RLEV2UTIL_HH +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#ifndef ORC_RLEV2UTIL_HH +#define ORC_RLEV2UTIL_HH + +#include "RLEv2.hh" + +namespace orc { + extern const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE]; + extern const uint8_t ClosestFixedBitsMap[65]; + extern const uint8_t ClosestAlignedFixedBitsMap[65]; + extern const uint8_t BitWidthToFBSMap[65]; + + // The input n must be less than FixedBitSizes::SIZE. 
+ inline uint32_t decodeBitWidth(uint32_t n) { + return FBSToBitWidthMap[n]; + } + + inline uint32_t getClosestFixedBits(uint32_t n) { + if (n <= 64) { + return ClosestFixedBitsMap[n]; + } else { + return 64; + } + } + + inline uint32_t getClosestAlignedFixedBits(uint32_t n) { + if (n <= 64) { + return ClosestAlignedFixedBitsMap[n]; + } else { + return 64; + } + } + + inline uint32_t encodeBitWidth(uint32_t n) { + if (n <= 64) { + return BitWidthToFBSMap[n]; + } else { + return FixedBitSizes::SIXTYFOUR; + } + } + + inline uint32_t findClosestNumBits(int64_t value) { + if (value < 0) { + return getClosestFixedBits(64); + } + + uint32_t count = 0; + while (value != 0) { + count++; + value = value >> 1; + } + return getClosestFixedBits(count); + } + + inline bool isSafeSubtract(int64_t left, int64_t right) { + return ((left ^ right) >= 0) || ((left ^ (left - right)) >= 0); + } + + inline uint32_t RleEncoderV2::getOpCode(EncodingType encoding) { + return static_cast<uint32_t >(encoding << 6); + } +} + +#endif //ORC_RLEV2UTIL_HH diff --git a/contrib/libs/apache/orc/c++/src/RLEv1.cc b/contrib/libs/apache/orc/c++/src/RLEv1.cc index fe333978db..aae9726bf6 100644 --- a/contrib/libs/apache/orc/c++/src/RLEv1.cc +++ b/contrib/libs/apache/orc/c++/src/RLEv1.cc @@ -1,302 +1,302 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
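
The lookup tables and inline helpers in RLEV2Util.cc/RLEV2Util.hh above round a required bit count up to the nearest width RLEv2 can actually encode (1 to 24, then 26, 28, 30, 32, then multiples of 8 up to 64). Below is a small self-contained restatement of that rounding rule with a couple of spot checks (illustration only; the real code uses the ClosestFixedBitsMap table rather than comparisons).

    #include <cassert>
    #include <cstdint>

    // Same result as getClosestFixedBits()/ClosestFixedBitsMap, written out as
    // comparisons so the rounding rule is visible.
    static uint32_t closestFixedBits(uint32_t n) {
      if (n == 0)  return 1;
      if (n <= 24) return n;
      if (n <= 26) return 26;
      if (n <= 28) return 28;
      if (n <= 30) return 30;
      if (n <= 32) return 32;
      if (n <= 40) return 40;
      if (n <= 48) return 48;
      if (n <= 56) return 56;
      return 64;
    }

    int main() {
      assert(closestFixedBits(10) == 10);   // 1..24 are representable directly
      assert(closestFixedBits(25) == 26);   // 25 rounds up to the next width
      assert(closestFixedBits(33) == 40);   // above 32, widths are byte-aligned
      // findClosestNumBits() treats negative values as needing all 64 bits.
      assert(closestFixedBits(64) == 64);
      return 0;
    }
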
- */ - -#include "Adaptor.hh" -#include "Compression.hh" -#include "orc/Exceptions.hh" -#include "RLEv1.hh" - -#include <algorithm> - -namespace orc { - -const uint64_t MINIMUM_REPEAT = 3; -const uint64_t MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT; - -const int64_t BASE_128_MASK = 0x7f; - -const int64_t MAX_DELTA = 127; -const int64_t MIN_DELTA = -128; -const uint64_t MAX_LITERAL_SIZE = 128; - -RleEncoderV1::RleEncoderV1( - std::unique_ptr<BufferedOutputStream> outStream, - bool hasSigned): - RleEncoder(std::move(outStream), hasSigned) { - literals = new int64_t[MAX_LITERAL_SIZE]; - delta = 0; - repeat = false; - tailRunLength = 0; -} - -RleEncoderV1::~RleEncoderV1() { - delete [] literals; -} - -void RleEncoderV1::writeValues() { - if (numLiterals != 0) { - if (repeat) { - writeByte(static_cast<char> - (static_cast<uint64_t>(numLiterals) - MINIMUM_REPEAT)); - writeByte(static_cast<char>(delta)); - if (isSigned) { - writeVslong(literals[0]); - } else { - writeVulong(literals[0]); - } - } else { - writeByte(static_cast<char>(-numLiterals)); - for(size_t i=0; i < numLiterals; ++i) { - if (isSigned) { - writeVslong(literals[i]); - } else { - writeVulong(literals[i]); - } - } - } - repeat = false; - numLiterals = 0; - tailRunLength = 0; - } -} - -uint64_t RleEncoderV1::flush() { - writeValues(); - outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition)); - uint64_t dataSize = outputStream->flush(); - bufferLength = bufferPosition = 0; - return dataSize; -} - -void RleEncoderV1::write(int64_t value) { - if (numLiterals == 0) { - literals[numLiterals++] = value; - tailRunLength = 1; - } else if (repeat) { - if (value == literals[0] + delta * static_cast<int64_t>(numLiterals)) { - numLiterals += 1; - if (numLiterals == MAXIMUM_REPEAT) { - writeValues(); - } - } else { - writeValues(); - literals[numLiterals++] = value; - tailRunLength = 1; - } - } else { - if (tailRunLength == 1) { - delta = value - literals[numLiterals - 1]; - if (delta < MIN_DELTA || delta > MAX_DELTA) { - tailRunLength = 1; - } else { - tailRunLength = 2; - } - } else if (value == literals[numLiterals - 1] + delta) { - tailRunLength += 1; - } else { - delta = value - literals[numLiterals - 1]; - if (delta < MIN_DELTA || delta > MAX_DELTA) { - tailRunLength = 1; - } else { - tailRunLength = 2; - } - } - if (tailRunLength == MINIMUM_REPEAT) { - if (numLiterals + 1 == MINIMUM_REPEAT) { - repeat = true; - numLiterals += 1; - } else { - numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1); - int64_t base = literals[numLiterals]; - writeValues(); - literals[0] = base; - repeat = true; - numLiterals = MINIMUM_REPEAT; - } - } else { - literals[numLiterals++] = value; - if (numLiterals == MAX_LITERAL_SIZE) { - writeValues(); - } - } - } -} - -signed char RleDecoderV1::readByte() { - if (bufferStart == bufferEnd) { - int bufferLength; - const void* bufferPointer; - if (!inputStream->Next(&bufferPointer, &bufferLength)) { - throw ParseError("bad read in readByte"); - } - bufferStart = static_cast<const char*>(bufferPointer); - bufferEnd = bufferStart + bufferLength; - } - return *(bufferStart++); -} - -uint64_t RleDecoderV1::readLong() { - uint64_t result = 0; - int64_t offset = 0; - signed char ch = readByte(); - if (ch >= 0) { - result = static_cast<uint64_t>(ch); - } else { - result = static_cast<uint64_t>(ch) & BASE_128_MASK; - while ((ch = readByte()) < 0) { - offset += 7; - result |= (static_cast<uint64_t>(ch) & BASE_128_MASK) << offset; - } - result |= static_cast<uint64_t>(ch) << (offset + 7); - } - return result; -} - 
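
To make the RLEv1 run layout concrete: RleEncoderV1::writeValues() above emits a repeated run as a control byte (count minus MINIMUM_REPEAT), a signed delta byte, and the varint-encoded base value, and readHeader() further down in this file reverses that. A hand-decoded example (unsigned values, single run, one-byte varint, no error handling):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      // Bytes RleEncoderV1 would emit for the run 1,2,3,...,10 (delta = 1):
      // control = 10 - MINIMUM_REPEAT(3) = 7, delta byte = 1, base vulong = 1.
      const std::vector<uint8_t> run = {0x07, 0x01, 0x01};

      const int8_t control = static_cast<int8_t>(run[0]);
      if (control >= 0) {   // non-negative control byte => repeated run
        const uint64_t count = static_cast<uint64_t>(control) + 3;
        const int64_t delta  = static_cast<int8_t>(run[1]);
        const int64_t base   = run[2];      // single-byte varint
        for (uint64_t i = 0; i < count; ++i) {
          std::printf("%lld ", static_cast<long long>(base + delta * static_cast<int64_t>(i)));
        }
        std::printf("\n");                  // prints: 1 2 3 4 5 6 7 8 9 10
      }                                     // a negative control byte would mean
      return 0;                             // -control literal values follow
    }
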
-void RleDecoderV1::skipLongs(uint64_t numValues) { - while (numValues > 0) { - if (readByte() >= 0) { - --numValues; - } - } -} - -void RleDecoderV1::readHeader() { - signed char ch = readByte(); - if (ch < 0) { - remainingValues = static_cast<uint64_t>(-ch); - repeating = false; - } else { - remainingValues = static_cast<uint64_t>(ch) + MINIMUM_REPEAT; - repeating = true; - delta = readByte(); - value = isSigned - ? unZigZag(readLong()) - : static_cast<int64_t>(readLong()); - } -} - -RleDecoderV1::RleDecoderV1(std::unique_ptr<SeekableInputStream> input, - bool hasSigned) - : inputStream(std::move(input)), - isSigned(hasSigned), - remainingValues(0), - value(0), - bufferStart(nullptr), - bufferEnd(bufferStart), - delta(0), - repeating(false) { -} - -void RleDecoderV1::seek(PositionProvider& location) { - // move the input stream - inputStream->seek(location); - // force a re-read from the stream - bufferEnd = bufferStart; - // read a new header - readHeader(); - // skip ahead the given number of records - skip(location.next()); -} - -void RleDecoderV1::skip(uint64_t numValues) { - while (numValues > 0) { - if (remainingValues == 0) { - readHeader(); - } - uint64_t count = std::min(numValues, remainingValues); - remainingValues -= count; - numValues -= count; - if (repeating) { - value += delta * static_cast<int64_t>(count); - } else { - skipLongs(count); - } - } -} - -void RleDecoderV1::next(int64_t* const data, - const uint64_t numValues, - const char* const notNull) { - uint64_t position = 0; - // skipNulls() - if (notNull) { - // Skip over null values. - while (position < numValues && !notNull[position]) { - ++position; - } - } - while (position < numValues) { - // If we are out of values, read more. - if (remainingValues == 0) { - readHeader(); - } - // How many do we read out of this block? - uint64_t count = std::min(numValues - position, remainingValues); - uint64_t consumed = 0; - if (repeating) { - if (notNull) { - for (uint64_t i = 0; i < count; ++i) { - if (notNull[position + i]) { - data[position + i] = value + static_cast<int64_t>(consumed) * delta; - consumed += 1; - } - } - } else { - for (uint64_t i = 0; i < count; ++i) { - data[position + i] = value + static_cast<int64_t>(i) * delta; - } - consumed = count; - } - value += static_cast<int64_t>(consumed) * delta; - } else { - if (notNull) { - for (uint64_t i = 0 ; i < count; ++i) { - if (notNull[position + i]) { - data[position + i] = isSigned - ? unZigZag(readLong()) - : static_cast<int64_t>(readLong()); - ++consumed; - } - } - } else { - if (isSigned) { - for (uint64_t i = 0; i < count; ++i) { - data[position + i] = unZigZag(readLong()); - } - } else { - for (uint64_t i = 0; i < count; ++i) { - data[position + i] = static_cast<int64_t>(readLong()); - } - } - consumed = count; - } - } - remainingValues -= consumed; - position += count; - - // skipNulls() - if (notNull) { - // Skip over null values. - while (position < numValues && !notNull[position]) { - ++position; - } - } - } -} - -} // namespace orc +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Adaptor.hh" +#include "Compression.hh" +#include "orc/Exceptions.hh" +#include "RLEv1.hh" + +#include <algorithm> + +namespace orc { + +const uint64_t MINIMUM_REPEAT = 3; +const uint64_t MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT; + +const int64_t BASE_128_MASK = 0x7f; + +const int64_t MAX_DELTA = 127; +const int64_t MIN_DELTA = -128; +const uint64_t MAX_LITERAL_SIZE = 128; + +RleEncoderV1::RleEncoderV1( + std::unique_ptr<BufferedOutputStream> outStream, + bool hasSigned): + RleEncoder(std::move(outStream), hasSigned) { + literals = new int64_t[MAX_LITERAL_SIZE]; + delta = 0; + repeat = false; + tailRunLength = 0; +} + +RleEncoderV1::~RleEncoderV1() { + delete [] literals; +} + +void RleEncoderV1::writeValues() { + if (numLiterals != 0) { + if (repeat) { + writeByte(static_cast<char> + (static_cast<uint64_t>(numLiterals) - MINIMUM_REPEAT)); + writeByte(static_cast<char>(delta)); + if (isSigned) { + writeVslong(literals[0]); + } else { + writeVulong(literals[0]); + } + } else { + writeByte(static_cast<char>(-numLiterals)); + for(size_t i=0; i < numLiterals; ++i) { + if (isSigned) { + writeVslong(literals[i]); + } else { + writeVulong(literals[i]); + } + } + } + repeat = false; + numLiterals = 0; + tailRunLength = 0; + } +} + +uint64_t RleEncoderV1::flush() { + writeValues(); + outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition)); + uint64_t dataSize = outputStream->flush(); + bufferLength = bufferPosition = 0; + return dataSize; +} + +void RleEncoderV1::write(int64_t value) { + if (numLiterals == 0) { + literals[numLiterals++] = value; + tailRunLength = 1; + } else if (repeat) { + if (value == literals[0] + delta * static_cast<int64_t>(numLiterals)) { + numLiterals += 1; + if (numLiterals == MAXIMUM_REPEAT) { + writeValues(); + } + } else { + writeValues(); + literals[numLiterals++] = value; + tailRunLength = 1; + } + } else { + if (tailRunLength == 1) { + delta = value - literals[numLiterals - 1]; + if (delta < MIN_DELTA || delta > MAX_DELTA) { + tailRunLength = 1; + } else { + tailRunLength = 2; + } + } else if (value == literals[numLiterals - 1] + delta) { + tailRunLength += 1; + } else { + delta = value - literals[numLiterals - 1]; + if (delta < MIN_DELTA || delta > MAX_DELTA) { + tailRunLength = 1; + } else { + tailRunLength = 2; + } + } + if (tailRunLength == MINIMUM_REPEAT) { + if (numLiterals + 1 == MINIMUM_REPEAT) { + repeat = true; + numLiterals += 1; + } else { + numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1); + int64_t base = literals[numLiterals]; + writeValues(); + literals[0] = base; + repeat = true; + numLiterals = MINIMUM_REPEAT; + } + } else { + literals[numLiterals++] = value; + if (numLiterals == MAX_LITERAL_SIZE) { + writeValues(); + } + } + } +} + +signed char RleDecoderV1::readByte() { + if (bufferStart == bufferEnd) { + int bufferLength; + const void* bufferPointer; + if (!inputStream->Next(&bufferPointer, &bufferLength)) { + throw ParseError("bad read in readByte"); + } + bufferStart = static_cast<const char*>(bufferPointer); + bufferEnd = bufferStart + bufferLength; + } + return *(bufferStart++); +} + +uint64_t 
RleDecoderV1::readLong() { + uint64_t result = 0; + int64_t offset = 0; + signed char ch = readByte(); + if (ch >= 0) { + result = static_cast<uint64_t>(ch); + } else { + result = static_cast<uint64_t>(ch) & BASE_128_MASK; + while ((ch = readByte()) < 0) { + offset += 7; + result |= (static_cast<uint64_t>(ch) & BASE_128_MASK) << offset; + } + result |= static_cast<uint64_t>(ch) << (offset + 7); + } + return result; +} + +void RleDecoderV1::skipLongs(uint64_t numValues) { + while (numValues > 0) { + if (readByte() >= 0) { + --numValues; + } + } +} + +void RleDecoderV1::readHeader() { + signed char ch = readByte(); + if (ch < 0) { + remainingValues = static_cast<uint64_t>(-ch); + repeating = false; + } else { + remainingValues = static_cast<uint64_t>(ch) + MINIMUM_REPEAT; + repeating = true; + delta = readByte(); + value = isSigned + ? unZigZag(readLong()) + : static_cast<int64_t>(readLong()); + } +} + +RleDecoderV1::RleDecoderV1(std::unique_ptr<SeekableInputStream> input, + bool hasSigned) + : inputStream(std::move(input)), + isSigned(hasSigned), + remainingValues(0), + value(0), + bufferStart(nullptr), + bufferEnd(bufferStart), + delta(0), + repeating(false) { +} + +void RleDecoderV1::seek(PositionProvider& location) { + // move the input stream + inputStream->seek(location); + // force a re-read from the stream + bufferEnd = bufferStart; + // read a new header + readHeader(); + // skip ahead the given number of records + skip(location.next()); +} + +void RleDecoderV1::skip(uint64_t numValues) { + while (numValues > 0) { + if (remainingValues == 0) { + readHeader(); + } + uint64_t count = std::min(numValues, remainingValues); + remainingValues -= count; + numValues -= count; + if (repeating) { + value += delta * static_cast<int64_t>(count); + } else { + skipLongs(count); + } + } +} + +void RleDecoderV1::next(int64_t* const data, + const uint64_t numValues, + const char* const notNull) { + uint64_t position = 0; + // skipNulls() + if (notNull) { + // Skip over null values. + while (position < numValues && !notNull[position]) { + ++position; + } + } + while (position < numValues) { + // If we are out of values, read more. + if (remainingValues == 0) { + readHeader(); + } + // How many do we read out of this block? + uint64_t count = std::min(numValues - position, remainingValues); + uint64_t consumed = 0; + if (repeating) { + if (notNull) { + for (uint64_t i = 0; i < count; ++i) { + if (notNull[position + i]) { + data[position + i] = value + static_cast<int64_t>(consumed) * delta; + consumed += 1; + } + } + } else { + for (uint64_t i = 0; i < count; ++i) { + data[position + i] = value + static_cast<int64_t>(i) * delta; + } + consumed = count; + } + value += static_cast<int64_t>(consumed) * delta; + } else { + if (notNull) { + for (uint64_t i = 0 ; i < count; ++i) { + if (notNull[position + i]) { + data[position + i] = isSigned + ? unZigZag(readLong()) + : static_cast<int64_t>(readLong()); + ++consumed; + } + } + } else { + if (isSigned) { + for (uint64_t i = 0; i < count; ++i) { + data[position + i] = unZigZag(readLong()); + } + } else { + for (uint64_t i = 0; i < count; ++i) { + data[position + i] = static_cast<int64_t>(readLong()); + } + } + consumed = count; + } + } + remainingValues -= consumed; + position += count; + + // skipNulls() + if (notNull) { + // Skip over null values. 
+ while (position < numValues && !notNull[position]) { + ++position; + } + } + } +} + +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/RLEv1.hh b/contrib/libs/apache/orc/c++/src/RLEv1.hh index 8e31d70873..eb0cf1d8c2 100644 --- a/contrib/libs/apache/orc/c++/src/RLEv1.hh +++ b/contrib/libs/apache/orc/c++/src/RLEv1.hh @@ -1,91 +1,91 @@ -/** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -#ifndef ORC_RLEV1_HH -#define ORC_RLEV1_HH - -#include "Adaptor.hh" -#include "RLE.hh" - -#include <memory> - -namespace orc { - -class RleEncoderV1 : public RleEncoder { -public: - RleEncoderV1(std::unique_ptr<BufferedOutputStream> outStream, - bool hasSigned); - ~RleEncoderV1() override ; - - /** - * Flushing underlying BufferedOutputStream - */ - uint64_t flush() override; - - void write(int64_t val) override; - -private: - int64_t delta; - bool repeat; - uint64_t tailRunLength; - - void writeValues(); -}; - -class RleDecoderV1 : public RleDecoder { -public: - RleDecoderV1(std::unique_ptr<SeekableInputStream> input, - bool isSigned); - - /** - * Seek to a particular spot. - */ - void seek(PositionProvider&) override; - - /** - * Seek over a given number of values. - */ - void skip(uint64_t numValues) override; - - /** - * Read a number of values into the batch. - */ - void next(int64_t* data, uint64_t numValues, - const char* notNull) override; - -private: - inline signed char readByte(); - - inline void readHeader(); - - inline uint64_t readLong(); - - inline void skipLongs(uint64_t numValues); - - const std::unique_ptr<SeekableInputStream> inputStream; - const bool isSigned; - uint64_t remainingValues; - int64_t value; - const char *bufferStart; - const char *bufferEnd; - int64_t delta; - bool repeating; -}; -} // namespace orc - -#endif // ORC_RLEV1_HH +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#ifndef ORC_RLEV1_HH +#define ORC_RLEV1_HH + +#include "Adaptor.hh" +#include "RLE.hh" + +#include <memory> + +namespace orc { + +class RleEncoderV1 : public RleEncoder { +public: + RleEncoderV1(std::unique_ptr<BufferedOutputStream> outStream, + bool hasSigned); + ~RleEncoderV1() override ; + + /** + * Flushing underlying BufferedOutputStream + */ + uint64_t flush() override; + + void write(int64_t val) override; + +private: + int64_t delta; + bool repeat; + uint64_t tailRunLength; + + void writeValues(); +}; + +class RleDecoderV1 : public RleDecoder { +public: + RleDecoderV1(std::unique_ptr<SeekableInputStream> input, + bool isSigned); + + /** + * Seek to a particular spot. + */ + void seek(PositionProvider&) override; + + /** + * Seek over a given number of values. + */ + void skip(uint64_t numValues) override; + + /** + * Read a number of values into the batch. + */ + void next(int64_t* data, uint64_t numValues, + const char* notNull) override; + +private: + inline signed char readByte(); + + inline void readHeader(); + + inline uint64_t readLong(); + + inline void skipLongs(uint64_t numValues); + + const std::unique_ptr<SeekableInputStream> inputStream; + const bool isSigned; + uint64_t remainingValues; + int64_t value; + const char *bufferStart; + const char *bufferEnd; + int64_t delta; + bool repeating; +}; +} // namespace orc + +#endif // ORC_RLEV1_HH diff --git a/contrib/libs/apache/orc/c++/src/RLEv2.hh b/contrib/libs/apache/orc/c++/src/RLEv2.hh index f85dabd9e6..5c740dfd27 100644 --- a/contrib/libs/apache/orc/c++/src/RLEv2.hh +++ b/contrib/libs/apache/orc/c++/src/RLEv2.hh @@ -1,251 +1,251 @@ -/** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ - -#ifndef ORC_RLEV2_HH -#define ORC_RLEV2_HH - -#include "Adaptor.hh" -#include "orc/Exceptions.hh" -#include "RLE.hh" - -#include <vector> - -#define MIN_REPEAT 3 -#define HIST_LEN 32 -namespace orc { - -struct FixedBitSizes { - enum FBS { - ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE, - THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN, - TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX, - TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR, SIZE - }; -}; - -enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 }; - -struct EncodingOption { - EncodingType encoding; - int64_t fixedDelta; - int64_t gapVsPatchListCount; - int64_t zigzagLiteralsCount; - int64_t baseRedLiteralsCount; - int64_t adjDeltasCount; - uint32_t zzBits90p; - uint32_t zzBits100p; - uint32_t brBits95p; - uint32_t brBits100p; - uint32_t bitsDeltaMax; - uint32_t patchWidth; - uint32_t patchGapWidth; - uint32_t patchLength; - int64_t min; - bool isFixedDelta; -}; - -class RleEncoderV2 : public RleEncoder { -public: - RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, bool alignBitPacking = true); - - ~RleEncoderV2() override { - delete [] literals; - delete [] gapVsPatchList; - delete [] zigzagLiterals; - delete [] baseRedLiterals; - delete [] adjDeltas; - } - /** - * Flushing underlying BufferedOutputStream - */ - uint64_t flush() override; - - void write(int64_t val) override; - -private: - - const bool alignedBitPacking; - uint32_t fixedRunLength; - uint32_t variableRunLength; - int64_t prevDelta; - int32_t histgram[HIST_LEN]; - - // The four list below should actually belong to EncodingOption since it only holds temporal values in write(int64_t val), - // it is move here for performance consideration. - int64_t* gapVsPatchList; - int64_t* zigzagLiterals; - int64_t* baseRedLiterals; - int64_t* adjDeltas; - - uint32_t getOpCode(EncodingType encoding); - void determineEncoding(EncodingOption& option); - void computeZigZagLiterals(EncodingOption& option); - void preparePatchedBlob(EncodingOption& option); - - void writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize); - void initializeLiterals(int64_t val); - void writeValues(EncodingOption& option); - void writeShortRepeatValues(EncodingOption& option); - void writeDirectValues(EncodingOption& option); - void writePatchedBasedValues(EncodingOption& option); - void writeDeltaValues(EncodingOption& option); - uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist = false); -}; - -class RleDecoderV2 : public RleDecoder { -public: - RleDecoderV2(std::unique_ptr<SeekableInputStream> input, - bool isSigned, MemoryPool& pool); - - /** - * Seek to a particular spot. - */ - void seek(PositionProvider&) override; - - /** - * Seek over a given number of values. - */ - void skip(uint64_t numValues) override; - - /** - * Read a number of values into the batch. - */ - void next(int64_t* data, uint64_t numValues, - const char* notNull) override; - -private: - - // Used by PATCHED_BASE - void adjustGapAndPatch() { - curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >> - patchBitSize; - curPatch = unpackedPatch[patchIdx] & patchMask; - actualGap = 0; - - // special case: gap is >255 then patch value will be 0. 
- // if gap is <=255 then patch value cannot be 0 - while (curGap == 255 && curPatch == 0) { - actualGap += 255; - ++patchIdx; - curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >> - patchBitSize; - curPatch = unpackedPatch[patchIdx] & patchMask; - } - // add the left over gap - actualGap += curGap; - } - - void resetReadLongs() { - bitsLeft = 0; - curByte = 0; - } - - void resetRun() { - resetReadLongs(); - bitSize = 0; - } - - unsigned char readByte() { - if (bufferStart == bufferEnd) { - int bufferLength; - const void* bufferPointer; - if (!inputStream->Next(&bufferPointer, &bufferLength)) { - throw ParseError("bad read in RleDecoderV2::readByte"); - } - bufferStart = static_cast<const char*>(bufferPointer); - bufferEnd = bufferStart + bufferLength; - } - - unsigned char result = static_cast<unsigned char>(*bufferStart++); - return result; -} - - int64_t readLongBE(uint64_t bsz); - int64_t readVslong(); - uint64_t readVulong(); - uint64_t readLongs(int64_t *data, uint64_t offset, uint64_t len, - uint64_t fb, const char* notNull = nullptr) { - uint64_t ret = 0; - - // TODO: unroll to improve performance - for(uint64_t i = offset; i < (offset + len); i++) { - // skip null positions - if (notNull && !notNull[i]) { - continue; - } - uint64_t result = 0; - uint64_t bitsLeftToRead = fb; - while (bitsLeftToRead > bitsLeft) { - result <<= bitsLeft; - result |= curByte & ((1 << bitsLeft) - 1); - bitsLeftToRead -= bitsLeft; - curByte = readByte(); - bitsLeft = 8; - } - - // handle the left over bits - if (bitsLeftToRead > 0) { - result <<= bitsLeftToRead; - bitsLeft -= static_cast<uint32_t>(bitsLeftToRead); - result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); - } - data[i] = static_cast<int64_t>(result); - ++ret; - } - - return ret; -} - - uint64_t nextShortRepeats(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - uint64_t nextDirect(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - uint64_t nextPatched(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - uint64_t nextDelta(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - - const std::unique_ptr<SeekableInputStream> inputStream; - const bool isSigned; - - unsigned char firstByte; - uint64_t runLength; - uint64_t runRead; - const char *bufferStart; - const char *bufferEnd; - int64_t deltaBase; // Used by DELTA - uint64_t byteSize; // Used by SHORT_REPEAT and PATCHED_BASE - int64_t firstValue; // Used by SHORT_REPEAT and DELTA - int64_t prevValue; // Used by DELTA - uint32_t bitSize; // Used by DIRECT, PATCHED_BASE and DELTA - uint32_t bitsLeft; // Used by anything that uses readLongs - uint32_t curByte; // Used by anything that uses readLongs - uint32_t patchBitSize; // Used by PATCHED_BASE - uint64_t unpackedIdx; // Used by PATCHED_BASE - uint64_t patchIdx; // Used by PATCHED_BASE - int64_t base; // Used by PATCHED_BASE - uint64_t curGap; // Used by PATCHED_BASE - int64_t curPatch; // Used by PATCHED_BASE - int64_t patchMask; // Used by PATCHED_BASE - int64_t actualGap; // Used by PATCHED_BASE - DataBuffer<int64_t> unpacked; // Used by PATCHED_BASE - DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE -}; -} // namespace orc - -#endif // ORC_RLEV2_HH +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. 
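
The RleDecoderV2::readLongs() loop shown above unpacks fixed-width values that were bit-packed MSB-first across byte boundaries. Below the same loop is lifted into a standalone function over a byte vector (a sketch: no SeekableInputStream, no notNull mask, widths up to 8 bits only), plus one worked input.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Same bit-consuming logic as RleDecoderV2::readLongs(), reading from a
    // vector instead of a SeekableInputStream.
    static std::vector<int64_t> unpack(const std::vector<uint8_t>& bytes,
                                       uint64_t count, uint32_t fb) {
      std::vector<int64_t> out;
      size_t pos = 0;
      uint32_t curByte = 0, bitsLeft = 0;          // unread low bits of curByte
      for (uint64_t i = 0; i < count; ++i) {
        uint64_t result = 0, toRead = fb;
        while (toRead > bitsLeft) {                // use up the partial byte,
          result <<= bitsLeft;                     // then fetch the next one
          result |= curByte & ((UINT64_C(1) << bitsLeft) - 1);
          toRead -= bitsLeft;
          curByte = bytes[pos++];
          bitsLeft = 8;
        }
        if (toRead > 0) {                          // take the top 'toRead' bits
          result <<= toRead;
          bitsLeft -= static_cast<uint32_t>(toRead);
          result |= (curByte >> bitsLeft) & ((UINT64_C(1) << toRead) - 1);
        }
        out.push_back(static_cast<int64_t>(result));
      }
      return out;
    }

    int main() {
      // 5,1,7,2,6,0,4,3 packed MSB-first at 3 bits each is 0xA7 0xAC 0x23.
      const std::vector<int64_t> v = unpack({0xA7, 0xAC, 0x23}, 8, 3);
      assert((v == std::vector<int64_t>{5, 1, 7, 2, 6, 0, 4, 3}));
      return 0;
    }
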
The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#ifndef ORC_RLEV2_HH +#define ORC_RLEV2_HH + +#include "Adaptor.hh" +#include "orc/Exceptions.hh" +#include "RLE.hh" + +#include <vector> + +#define MIN_REPEAT 3 +#define HIST_LEN 32 +namespace orc { + +struct FixedBitSizes { + enum FBS { + ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE, + THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN, + TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX, + TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR, SIZE + }; +}; + +enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 }; + +struct EncodingOption { + EncodingType encoding; + int64_t fixedDelta; + int64_t gapVsPatchListCount; + int64_t zigzagLiteralsCount; + int64_t baseRedLiteralsCount; + int64_t adjDeltasCount; + uint32_t zzBits90p; + uint32_t zzBits100p; + uint32_t brBits95p; + uint32_t brBits100p; + uint32_t bitsDeltaMax; + uint32_t patchWidth; + uint32_t patchGapWidth; + uint32_t patchLength; + int64_t min; + bool isFixedDelta; +}; + +class RleEncoderV2 : public RleEncoder { +public: + RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, bool alignBitPacking = true); + + ~RleEncoderV2() override { + delete [] literals; + delete [] gapVsPatchList; + delete [] zigzagLiterals; + delete [] baseRedLiterals; + delete [] adjDeltas; + } + /** + * Flushing underlying BufferedOutputStream + */ + uint64_t flush() override; + + void write(int64_t val) override; + +private: + + const bool alignedBitPacking; + uint32_t fixedRunLength; + uint32_t variableRunLength; + int64_t prevDelta; + int32_t histgram[HIST_LEN]; + + // The four list below should actually belong to EncodingOption since it only holds temporal values in write(int64_t val), + // it is move here for performance consideration. + int64_t* gapVsPatchList; + int64_t* zigzagLiterals; + int64_t* baseRedLiterals; + int64_t* adjDeltas; + + uint32_t getOpCode(EncodingType encoding); + void determineEncoding(EncodingOption& option); + void computeZigZagLiterals(EncodingOption& option); + void preparePatchedBlob(EncodingOption& option); + + void writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize); + void initializeLiterals(int64_t val); + void writeValues(EncodingOption& option); + void writeShortRepeatValues(EncodingOption& option); + void writeDirectValues(EncodingOption& option); + void writePatchedBasedValues(EncodingOption& option); + void writeDeltaValues(EncodingOption& option); + uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist = false); +}; + +class RleDecoderV2 : public RleDecoder { +public: + RleDecoderV2(std::unique_ptr<SeekableInputStream> input, + bool isSigned, MemoryPool& pool); + + /** + * Seek to a particular spot. + */ + void seek(PositionProvider&) override; + + /** + * Seek over a given number of values. 
+ */ + void skip(uint64_t numValues) override; + + /** + * Read a number of values into the batch. + */ + void next(int64_t* data, uint64_t numValues, + const char* notNull) override; + +private: + + // Used by PATCHED_BASE + void adjustGapAndPatch() { + curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >> + patchBitSize; + curPatch = unpackedPatch[patchIdx] & patchMask; + actualGap = 0; + + // special case: gap is >255 then patch value will be 0. + // if gap is <=255 then patch value cannot be 0 + while (curGap == 255 && curPatch == 0) { + actualGap += 255; + ++patchIdx; + curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >> + patchBitSize; + curPatch = unpackedPatch[patchIdx] & patchMask; + } + // add the left over gap + actualGap += curGap; + } + + void resetReadLongs() { + bitsLeft = 0; + curByte = 0; + } + + void resetRun() { + resetReadLongs(); + bitSize = 0; + } + + unsigned char readByte() { + if (bufferStart == bufferEnd) { + int bufferLength; + const void* bufferPointer; + if (!inputStream->Next(&bufferPointer, &bufferLength)) { + throw ParseError("bad read in RleDecoderV2::readByte"); + } + bufferStart = static_cast<const char*>(bufferPointer); + bufferEnd = bufferStart + bufferLength; + } + + unsigned char result = static_cast<unsigned char>(*bufferStart++); + return result; +} + + int64_t readLongBE(uint64_t bsz); + int64_t readVslong(); + uint64_t readVulong(); + uint64_t readLongs(int64_t *data, uint64_t offset, uint64_t len, + uint64_t fb, const char* notNull = nullptr) { + uint64_t ret = 0; + + // TODO: unroll to improve performance + for(uint64_t i = offset; i < (offset + len); i++) { + // skip null positions + if (notNull && !notNull[i]) { + continue; + } + uint64_t result = 0; + uint64_t bitsLeftToRead = fb; + while (bitsLeftToRead > bitsLeft) { + result <<= bitsLeft; + result |= curByte & ((1 << bitsLeft) - 1); + bitsLeftToRead -= bitsLeft; + curByte = readByte(); + bitsLeft = 8; + } + + // handle the left over bits + if (bitsLeftToRead > 0) { + result <<= bitsLeftToRead; + bitsLeft -= static_cast<uint32_t>(bitsLeftToRead); + result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); + } + data[i] = static_cast<int64_t>(result); + ++ret; + } + + return ret; +} + + uint64_t nextShortRepeats(int64_t* data, uint64_t offset, uint64_t numValues, + const char* notNull); + uint64_t nextDirect(int64_t* data, uint64_t offset, uint64_t numValues, + const char* notNull); + uint64_t nextPatched(int64_t* data, uint64_t offset, uint64_t numValues, + const char* notNull); + uint64_t nextDelta(int64_t* data, uint64_t offset, uint64_t numValues, + const char* notNull); + + const std::unique_ptr<SeekableInputStream> inputStream; + const bool isSigned; + + unsigned char firstByte; + uint64_t runLength; + uint64_t runRead; + const char *bufferStart; + const char *bufferEnd; + int64_t deltaBase; // Used by DELTA + uint64_t byteSize; // Used by SHORT_REPEAT and PATCHED_BASE + int64_t firstValue; // Used by SHORT_REPEAT and DELTA + int64_t prevValue; // Used by DELTA + uint32_t bitSize; // Used by DIRECT, PATCHED_BASE and DELTA + uint32_t bitsLeft; // Used by anything that uses readLongs + uint32_t curByte; // Used by anything that uses readLongs + uint32_t patchBitSize; // Used by PATCHED_BASE + uint64_t unpackedIdx; // Used by PATCHED_BASE + uint64_t patchIdx; // Used by PATCHED_BASE + int64_t base; // Used by PATCHED_BASE + uint64_t curGap; // Used by PATCHED_BASE + int64_t curPatch; // Used by PATCHED_BASE + int64_t patchMask; // Used by PATCHED_BASE + int64_t actualGap; 
// Used by PATCHED_BASE + DataBuffer<int64_t> unpacked; // Used by PATCHED_BASE + DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE +}; +} // namespace orc + +#endif // ORC_RLEV2_HH diff --git a/contrib/libs/apache/orc/c++/src/Reader.cc b/contrib/libs/apache/orc/c++/src/Reader.cc index f35106ee44..a633567a9c 100644 --- a/contrib/libs/apache/orc/c++/src/Reader.cc +++ b/contrib/libs/apache/orc/c++/src/Reader.cc @@ -1,513 +1,513 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Adaptor.hh" -#include "BloomFilter.hh" -#include "Options.hh" -#include "Reader.hh" -#include "Statistics.hh" -#include "StripeStream.hh" - -#include "wrap/coded-stream-wrapper.h" - -#include <algorithm> -#include <iostream> -#include <memory> -#include <sstream> -#include <string> -#include <vector> -#include <iterator> -#include <set> - -namespace orc { - - const WriterVersionImpl &WriterVersionImpl::VERSION_HIVE_8732() { - static const WriterVersionImpl version(WriterVersion_HIVE_8732); - return version; - } - - uint64_t getCompressionBlockSize(const proto::PostScript& ps) { - if (ps.has_compressionblocksize()) { - return ps.compressionblocksize(); - } else { - return 256 * 1024; - } - } - - CompressionKind convertCompressionKind(const proto::PostScript& ps) { - if (ps.has_compression()) { - return static_cast<CompressionKind>(ps.compression()); - } else { - throw ParseError("Unknown compression type"); - } - } - - std::string ColumnSelector::toDotColumnPath() { - if (columns.empty()) { - return std::string(); - } - std::ostringstream columnStream; - std::copy(columns.begin(), columns.end(), - std::ostream_iterator<std::string>(columnStream, ".")); - std::string columnPath = columnStream.str(); - return columnPath.substr(0, columnPath.length() - 1); - } - - - void ColumnSelector::selectChildren(std::vector<bool>& selectedColumns, const Type& type) { - size_t id = static_cast<size_t>(type.getColumnId()); - if (!selectedColumns[id]) { - selectedColumns[id] = true; - for(size_t c = id; c <= type.getMaximumColumnId(); ++c){ - selectedColumns[c] = true; - } - } - } - - /** - * Recurses over a type tree and selects the parents of every selected type. - * @return true if any child was selected. 
- */ - bool ColumnSelector::selectParents(std::vector<bool>& selectedColumns, const Type& type) { - size_t id = static_cast<size_t>(type.getColumnId()); - bool result = selectedColumns[id]; - for(uint64_t c=0; c < type.getSubtypeCount(); ++c) { - result |= selectParents(selectedColumns, *type.getSubtype(c)); - } - selectedColumns[id] = result; - return result; - } - - /** - * Recurses over a type tree and build two maps - * map<TypeName, TypeId>, map<TypeId, Type> - */ - void ColumnSelector::buildTypeNameIdMap(const Type* type) { - // map<type_id, Type*> - idTypeMap[type->getColumnId()] = type; - - if (STRUCT == type->getKind()) { - for (size_t i = 0; i < type->getSubtypeCount(); ++i) { - const std::string& fieldName = type->getFieldName(i); - columns.push_back(fieldName); - nameIdMap[toDotColumnPath()] = type->getSubtype(i)->getColumnId(); - buildTypeNameIdMap(type->getSubtype(i)); - columns.pop_back(); - } - } else { - // other non-primitive type - for (size_t j = 0; j < type->getSubtypeCount(); ++j) { - buildTypeNameIdMap(type->getSubtype(j)); - } - } - } - - void ColumnSelector::updateSelected(std::vector<bool>& selectedColumns, - const RowReaderOptions& options) { - selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); - if (contents->schema->getKind() == STRUCT && options.getIndexesSet()) { - for(std::list<uint64_t>::const_iterator field = options.getInclude().begin(); - field != options.getInclude().end(); ++field) { - updateSelectedByFieldId(selectedColumns, *field); - } - } else if (contents->schema->getKind() == STRUCT && options.getNamesSet()) { - for(std::list<std::string>::const_iterator field = options.getIncludeNames().begin(); - field != options.getIncludeNames().end(); ++field) { - updateSelectedByName(selectedColumns, *field); - } - } else if (options.getTypeIdsSet()) { - for(std::list<uint64_t>::const_iterator typeId = options.getInclude().begin(); - typeId != options.getInclude().end(); ++typeId) { - updateSelectedByTypeId(selectedColumns, *typeId); - } - } else { - // default is to select all columns - std::fill(selectedColumns.begin(), selectedColumns.end(), true); - } - selectParents(selectedColumns, *contents->schema.get()); - selectedColumns[0] = true; // column 0 is selected by default - } - - void ColumnSelector::updateSelectedByFieldId(std::vector<bool>& selectedColumns, - uint64_t fieldId) { - if (fieldId < contents->schema->getSubtypeCount()) { - selectChildren(selectedColumns, *contents->schema->getSubtype(fieldId)); - } else { - std::stringstream buffer; - buffer << "Invalid column selected " << fieldId << " out of " - << contents->schema->getSubtypeCount(); - throw ParseError(buffer.str()); - } - } - - void ColumnSelector::updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId) { - if (typeId < selectedColumns.size()) { - const Type& type = *idTypeMap[typeId]; - selectChildren(selectedColumns, type); - } else { - std::stringstream buffer; - buffer << "Invalid type id selected " << typeId << " out of " - << selectedColumns.size(); - throw ParseError(buffer.str()); - } - } - - void ColumnSelector::updateSelectedByName(std::vector<bool>& selectedColumns, - const std::string& fieldName) { - std::map<std::string, uint64_t>::const_iterator ite = nameIdMap.find(fieldName); - if (ite != nameIdMap.end()) { - updateSelectedByTypeId(selectedColumns, ite->second); - } else { - throw ParseError("Invalid column selected " + fieldName); - } - } - - ColumnSelector::ColumnSelector(const FileContents* _contents): 
contents(_contents) { - buildTypeNameIdMap(contents->schema.get()); - } - - RowReaderImpl::RowReaderImpl(std::shared_ptr<FileContents> _contents, - const RowReaderOptions& opts - ): localTimezone(getLocalTimezone()), - contents(_contents), - throwOnHive11DecimalOverflow(opts.getThrowOnHive11DecimalOverflow()), - forcedScaleOnHive11Decimal(opts.getForcedScaleOnHive11Decimal()), - footer(contents->footer.get()), - firstRowOfStripe(*contents->pool, 0), - enableEncodedBlock(opts.getEnableLazyDecoding()) { - uint64_t numberOfStripes; - numberOfStripes = static_cast<uint64_t>(footer->stripes_size()); - currentStripe = numberOfStripes; - lastStripe = 0; - currentRowInStripe = 0; - rowsInCurrentStripe = 0; - uint64_t rowTotal = 0; - - firstRowOfStripe.resize(numberOfStripes); - for(size_t i=0; i < numberOfStripes; ++i) { - firstRowOfStripe[i] = rowTotal; - proto::StripeInformation stripeInfo = - footer->stripes(static_cast<int>(i)); - rowTotal += stripeInfo.numberofrows(); - bool isStripeInRange = stripeInfo.offset() >= opts.getOffset() && - stripeInfo.offset() < opts.getOffset() + opts.getLength(); - if (isStripeInRange) { - if (i < currentStripe) { - currentStripe = i; - } - if (i >= lastStripe) { - lastStripe = i + 1; - } - } - } - firstStripe = currentStripe; - - if (currentStripe == 0) { - previousRow = (std::numeric_limits<uint64_t>::max)(); - } else if (currentStripe == numberOfStripes) { - previousRow = footer->numberofrows(); - } else { - previousRow = firstRowOfStripe[firstStripe]-1; - } - - ColumnSelector column_selector(contents.get()); - column_selector.updateSelected(selectedColumns, opts); - } - - CompressionKind RowReaderImpl::getCompression() const { - return contents->compression; - } - - uint64_t RowReaderImpl::getCompressionSize() const { - return contents->blockSize; - } - - const std::vector<bool> RowReaderImpl::getSelectedColumns() const { - return selectedColumns; - } - - const Type& RowReaderImpl::getSelectedType() const { - if (selectedSchema.get() == nullptr) { - selectedSchema = buildSelectedType(contents->schema.get(), - selectedColumns); - } - return *(selectedSchema.get()); - } - - uint64_t RowReaderImpl::getRowNumber() const { - return previousRow; - } - - void RowReaderImpl::seekToRow(uint64_t rowNumber) { - // Empty file - if (lastStripe == 0) { - return; - } - - // If we are reading only a portion of the file - // (bounded by firstStripe and lastStripe), - // seeking before or after the portion of interest should return no data. - // Implement this by setting previousRow to the number of rows in the file. 
- - // seeking past lastStripe - uint64_t num_stripes = static_cast<uint64_t>(footer->stripes_size()); - if ( (lastStripe == num_stripes - && rowNumber >= footer->numberofrows()) || - (lastStripe < num_stripes - && rowNumber >= firstRowOfStripe[lastStripe]) ) { - currentStripe = num_stripes; - previousRow = footer->numberofrows(); - return; - } - - uint64_t seekToStripe = 0; - while (seekToStripe+1 < lastStripe && - firstRowOfStripe[seekToStripe+1] <= rowNumber) { - seekToStripe++; - } - - // seeking before the first stripe - if (seekToStripe < firstStripe) { - currentStripe = num_stripes; - previousRow = footer->numberofrows(); - return; - } - - currentStripe = seekToStripe; - currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe]; - previousRow = rowNumber; - startNextStripe(); - - uint64_t rowsToSkip = currentRowInStripe; - - if (footer->rowindexstride() > 0 && - currentStripeInfo.indexlength() > 0) { - uint32_t rowGroupId = - static_cast<uint32_t>(currentRowInStripe / footer->rowindexstride()); - rowsToSkip -= rowGroupId * footer->rowindexstride(); - - if (rowGroupId != 0) { - seekToRowGroup(rowGroupId); - } - } - - reader->skip(rowsToSkip); - } - - void RowReaderImpl::seekToRowGroup(uint32_t rowGroupEntryId) { - // reset all previous row indexes - rowIndexes.clear(); - - // obtain row indexes for selected columns - uint64_t offset = currentStripeInfo.offset(); - for (int i = 0; i < currentStripeFooter.streams_size(); ++i) { - const proto::Stream& pbStream = currentStripeFooter.streams(i); - uint64_t colId = pbStream.column(); - if (selectedColumns[colId] && pbStream.has_kind() - && pbStream.kind() == proto::Stream_Kind_ROW_INDEX) { - std::unique_ptr<SeekableInputStream> inStream = - createDecompressor(getCompression(), - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream - (contents->stream.get(), - offset, - pbStream.length(), - *contents->pool)), - getCompressionSize(), - *contents->pool); - - proto::RowIndex rowIndex; - if (!rowIndex.ParseFromZeroCopyStream(inStream.get())) { - throw ParseError("Failed to parse the row index"); - } - - rowIndexes[colId] = rowIndex; - } - offset += pbStream.length(); - } - - // store positions for selected columns - std::vector<std::list<uint64_t>> positions; - // store position providers for selected colimns - std::unordered_map<uint64_t, PositionProvider> positionProviders; - - for (auto rowIndex = rowIndexes.cbegin(); - rowIndex != rowIndexes.cend(); ++rowIndex) { - uint64_t colId = rowIndex->first; - const proto::RowIndexEntry& entry = - rowIndex->second.entry(static_cast<int32_t>(rowGroupEntryId)); - - // copy index positions for a specific column - positions.push_back({}); - auto& position = positions.back(); - for (int pos = 0; pos != entry.positions_size(); ++pos) { - position.push_back(entry.positions(pos)); - } - positionProviders.insert(std::make_pair(colId, PositionProvider(position))); - } - - reader->seekToRowGroup(positionProviders); - } - - const FileContents& RowReaderImpl::getFileContents() const { - return *contents; - } - - bool RowReaderImpl::getThrowOnHive11DecimalOverflow() const { - return throwOnHive11DecimalOverflow; - } - - int32_t RowReaderImpl::getForcedScaleOnHive11Decimal() const { - return forcedScaleOnHive11Decimal; - } - - proto::StripeFooter getStripeFooter(const proto::StripeInformation& info, - const FileContents& contents) { - uint64_t stripeFooterStart = info.offset() + info.indexlength() + - info.datalength(); - uint64_t stripeFooterLength = info.footerlength(); - 
std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(contents.compression, - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream(contents.stream.get(), - stripeFooterStart, - stripeFooterLength, - *contents.pool)), - contents.blockSize, - *contents.pool); - proto::StripeFooter result; - if (!result.ParseFromZeroCopyStream(pbStream.get())) { - throw ParseError(std::string("bad StripeFooter from ") + - pbStream->getName()); - } - return result; - } - - ReaderImpl::ReaderImpl(std::shared_ptr<FileContents> _contents, - const ReaderOptions& opts, - uint64_t _fileLength, - uint64_t _postscriptLength - ): contents(std::move(_contents)), - options(opts), - fileLength(_fileLength), - postscriptLength(_postscriptLength), - footer(contents->footer.get()) { - isMetadataLoaded = false; - checkOrcVersion(); - numberOfStripes = static_cast<uint64_t>(footer->stripes_size()); - contents->schema = REDUNDANT_MOVE(convertType(footer->types(0), *footer)); - contents->blockSize = getCompressionBlockSize(*contents->postscript); - contents->compression= convertCompressionKind(*contents->postscript); - } - - std::string ReaderImpl::getSerializedFileTail() const { - proto::FileTail tail; - proto::PostScript *mutable_ps = tail.mutable_postscript(); - mutable_ps->CopyFrom(*contents->postscript); - proto::Footer *mutableFooter = tail.mutable_footer(); - mutableFooter->CopyFrom(*footer); - tail.set_filelength(fileLength); - tail.set_postscriptlength(postscriptLength); - TString result; - if (!tail.SerializeToString(&result)) { - throw ParseError("Failed to serialize file tail"); - } - return result; - } - - const ReaderOptions& ReaderImpl::getReaderOptions() const { - return options; - } - - CompressionKind ReaderImpl::getCompression() const { - return contents->compression; - } - - uint64_t ReaderImpl::getCompressionSize() const { - return contents->blockSize; - } - - uint64_t ReaderImpl::getNumberOfStripes() const { - return numberOfStripes; - } - - uint64_t ReaderImpl::getNumberOfStripeStatistics() const { - if (!isMetadataLoaded) { - readMetadata(); - } - return metadata.get() == nullptr ? 0 : - static_cast<uint64_t>(metadata->stripestats_size()); - } - - std::unique_ptr<StripeInformation> - ReaderImpl::getStripe(uint64_t stripeIndex) const { - if (stripeIndex > getNumberOfStripes()) { - throw std::logic_error("stripe index out of range"); - } - proto::StripeInformation stripeInfo = - footer->stripes(static_cast<int>(stripeIndex)); - - return std::unique_ptr<StripeInformation> - (new StripeInformationImpl - (stripeInfo.offset(), - stripeInfo.indexlength(), - stripeInfo.datalength(), - stripeInfo.footerlength(), - stripeInfo.numberofrows(), - contents->stream.get(), - *contents->pool, - contents->compression, - contents->blockSize)); - } - - FileVersion ReaderImpl::getFormatVersion() const { - if (contents->postscript->version_size() != 2) { - return FileVersion::v_0_11(); - } - return FileVersion( - contents->postscript->version(0), - contents->postscript->version(1)); - } - - uint64_t ReaderImpl::getNumberOfRows() const { - return footer->numberofrows(); - } - - WriterId ReaderImpl::getWriterId() const { - if (footer->has_writer()) { - uint32_t id = footer->writer(); +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Adaptor.hh" +#include "BloomFilter.hh" +#include "Options.hh" +#include "Reader.hh" +#include "Statistics.hh" +#include "StripeStream.hh" + +#include "wrap/coded-stream-wrapper.h" + +#include <algorithm> +#include <iostream> +#include <memory> +#include <sstream> +#include <string> +#include <vector> +#include <iterator> +#include <set> + +namespace orc { + + const WriterVersionImpl &WriterVersionImpl::VERSION_HIVE_8732() { + static const WriterVersionImpl version(WriterVersion_HIVE_8732); + return version; + } + + uint64_t getCompressionBlockSize(const proto::PostScript& ps) { + if (ps.has_compressionblocksize()) { + return ps.compressionblocksize(); + } else { + return 256 * 1024; + } + } + + CompressionKind convertCompressionKind(const proto::PostScript& ps) { + if (ps.has_compression()) { + return static_cast<CompressionKind>(ps.compression()); + } else { + throw ParseError("Unknown compression type"); + } + } + + std::string ColumnSelector::toDotColumnPath() { + if (columns.empty()) { + return std::string(); + } + std::ostringstream columnStream; + std::copy(columns.begin(), columns.end(), + std::ostream_iterator<std::string>(columnStream, ".")); + std::string columnPath = columnStream.str(); + return columnPath.substr(0, columnPath.length() - 1); + } + + + void ColumnSelector::selectChildren(std::vector<bool>& selectedColumns, const Type& type) { + size_t id = static_cast<size_t>(type.getColumnId()); + if (!selectedColumns[id]) { + selectedColumns[id] = true; + for(size_t c = id; c <= type.getMaximumColumnId(); ++c){ + selectedColumns[c] = true; + } + } + } + + /** + * Recurses over a type tree and selects the parents of every selected type. + * @return true if any child was selected. 
+ */ + bool ColumnSelector::selectParents(std::vector<bool>& selectedColumns, const Type& type) { + size_t id = static_cast<size_t>(type.getColumnId()); + bool result = selectedColumns[id]; + for(uint64_t c=0; c < type.getSubtypeCount(); ++c) { + result |= selectParents(selectedColumns, *type.getSubtype(c)); + } + selectedColumns[id] = result; + return result; + } + + /** + * Recurses over a type tree and build two maps + * map<TypeName, TypeId>, map<TypeId, Type> + */ + void ColumnSelector::buildTypeNameIdMap(const Type* type) { + // map<type_id, Type*> + idTypeMap[type->getColumnId()] = type; + + if (STRUCT == type->getKind()) { + for (size_t i = 0; i < type->getSubtypeCount(); ++i) { + const std::string& fieldName = type->getFieldName(i); + columns.push_back(fieldName); + nameIdMap[toDotColumnPath()] = type->getSubtype(i)->getColumnId(); + buildTypeNameIdMap(type->getSubtype(i)); + columns.pop_back(); + } + } else { + // other non-primitive type + for (size_t j = 0; j < type->getSubtypeCount(); ++j) { + buildTypeNameIdMap(type->getSubtype(j)); + } + } + } + + void ColumnSelector::updateSelected(std::vector<bool>& selectedColumns, + const RowReaderOptions& options) { + selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); + if (contents->schema->getKind() == STRUCT && options.getIndexesSet()) { + for(std::list<uint64_t>::const_iterator field = options.getInclude().begin(); + field != options.getInclude().end(); ++field) { + updateSelectedByFieldId(selectedColumns, *field); + } + } else if (contents->schema->getKind() == STRUCT && options.getNamesSet()) { + for(std::list<std::string>::const_iterator field = options.getIncludeNames().begin(); + field != options.getIncludeNames().end(); ++field) { + updateSelectedByName(selectedColumns, *field); + } + } else if (options.getTypeIdsSet()) { + for(std::list<uint64_t>::const_iterator typeId = options.getInclude().begin(); + typeId != options.getInclude().end(); ++typeId) { + updateSelectedByTypeId(selectedColumns, *typeId); + } + } else { + // default is to select all columns + std::fill(selectedColumns.begin(), selectedColumns.end(), true); + } + selectParents(selectedColumns, *contents->schema.get()); + selectedColumns[0] = true; // column 0 is selected by default + } + + void ColumnSelector::updateSelectedByFieldId(std::vector<bool>& selectedColumns, + uint64_t fieldId) { + if (fieldId < contents->schema->getSubtypeCount()) { + selectChildren(selectedColumns, *contents->schema->getSubtype(fieldId)); + } else { + std::stringstream buffer; + buffer << "Invalid column selected " << fieldId << " out of " + << contents->schema->getSubtypeCount(); + throw ParseError(buffer.str()); + } + } + + void ColumnSelector::updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId) { + if (typeId < selectedColumns.size()) { + const Type& type = *idTypeMap[typeId]; + selectChildren(selectedColumns, type); + } else { + std::stringstream buffer; + buffer << "Invalid type id selected " << typeId << " out of " + << selectedColumns.size(); + throw ParseError(buffer.str()); + } + } + + void ColumnSelector::updateSelectedByName(std::vector<bool>& selectedColumns, + const std::string& fieldName) { + std::map<std::string, uint64_t>::const_iterator ite = nameIdMap.find(fieldName); + if (ite != nameIdMap.end()) { + updateSelectedByTypeId(selectedColumns, ite->second); + } else { + throw ParseError("Invalid column selected " + fieldName); + } + } + + ColumnSelector::ColumnSelector(const FileContents* _contents): 
contents(_contents) { + buildTypeNameIdMap(contents->schema.get()); + } + + RowReaderImpl::RowReaderImpl(std::shared_ptr<FileContents> _contents, + const RowReaderOptions& opts + ): localTimezone(getLocalTimezone()), + contents(_contents), + throwOnHive11DecimalOverflow(opts.getThrowOnHive11DecimalOverflow()), + forcedScaleOnHive11Decimal(opts.getForcedScaleOnHive11Decimal()), + footer(contents->footer.get()), + firstRowOfStripe(*contents->pool, 0), + enableEncodedBlock(opts.getEnableLazyDecoding()) { + uint64_t numberOfStripes; + numberOfStripes = static_cast<uint64_t>(footer->stripes_size()); + currentStripe = numberOfStripes; + lastStripe = 0; + currentRowInStripe = 0; + rowsInCurrentStripe = 0; + uint64_t rowTotal = 0; + + firstRowOfStripe.resize(numberOfStripes); + for(size_t i=0; i < numberOfStripes; ++i) { + firstRowOfStripe[i] = rowTotal; + proto::StripeInformation stripeInfo = + footer->stripes(static_cast<int>(i)); + rowTotal += stripeInfo.numberofrows(); + bool isStripeInRange = stripeInfo.offset() >= opts.getOffset() && + stripeInfo.offset() < opts.getOffset() + opts.getLength(); + if (isStripeInRange) { + if (i < currentStripe) { + currentStripe = i; + } + if (i >= lastStripe) { + lastStripe = i + 1; + } + } + } + firstStripe = currentStripe; + + if (currentStripe == 0) { + previousRow = (std::numeric_limits<uint64_t>::max)(); + } else if (currentStripe == numberOfStripes) { + previousRow = footer->numberofrows(); + } else { + previousRow = firstRowOfStripe[firstStripe]-1; + } + + ColumnSelector column_selector(contents.get()); + column_selector.updateSelected(selectedColumns, opts); + } + + CompressionKind RowReaderImpl::getCompression() const { + return contents->compression; + } + + uint64_t RowReaderImpl::getCompressionSize() const { + return contents->blockSize; + } + + const std::vector<bool> RowReaderImpl::getSelectedColumns() const { + return selectedColumns; + } + + const Type& RowReaderImpl::getSelectedType() const { + if (selectedSchema.get() == nullptr) { + selectedSchema = buildSelectedType(contents->schema.get(), + selectedColumns); + } + return *(selectedSchema.get()); + } + + uint64_t RowReaderImpl::getRowNumber() const { + return previousRow; + } + + void RowReaderImpl::seekToRow(uint64_t rowNumber) { + // Empty file + if (lastStripe == 0) { + return; + } + + // If we are reading only a portion of the file + // (bounded by firstStripe and lastStripe), + // seeking before or after the portion of interest should return no data. + // Implement this by setting previousRow to the number of rows in the file. 
+ + // seeking past lastStripe + uint64_t num_stripes = static_cast<uint64_t>(footer->stripes_size()); + if ( (lastStripe == num_stripes + && rowNumber >= footer->numberofrows()) || + (lastStripe < num_stripes + && rowNumber >= firstRowOfStripe[lastStripe]) ) { + currentStripe = num_stripes; + previousRow = footer->numberofrows(); + return; + } + + uint64_t seekToStripe = 0; + while (seekToStripe+1 < lastStripe && + firstRowOfStripe[seekToStripe+1] <= rowNumber) { + seekToStripe++; + } + + // seeking before the first stripe + if (seekToStripe < firstStripe) { + currentStripe = num_stripes; + previousRow = footer->numberofrows(); + return; + } + + currentStripe = seekToStripe; + currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe]; + previousRow = rowNumber; + startNextStripe(); + + uint64_t rowsToSkip = currentRowInStripe; + + if (footer->rowindexstride() > 0 && + currentStripeInfo.indexlength() > 0) { + uint32_t rowGroupId = + static_cast<uint32_t>(currentRowInStripe / footer->rowindexstride()); + rowsToSkip -= rowGroupId * footer->rowindexstride(); + + if (rowGroupId != 0) { + seekToRowGroup(rowGroupId); + } + } + + reader->skip(rowsToSkip); + } + + void RowReaderImpl::seekToRowGroup(uint32_t rowGroupEntryId) { + // reset all previous row indexes + rowIndexes.clear(); + + // obtain row indexes for selected columns + uint64_t offset = currentStripeInfo.offset(); + for (int i = 0; i < currentStripeFooter.streams_size(); ++i) { + const proto::Stream& pbStream = currentStripeFooter.streams(i); + uint64_t colId = pbStream.column(); + if (selectedColumns[colId] && pbStream.has_kind() + && pbStream.kind() == proto::Stream_Kind_ROW_INDEX) { + std::unique_ptr<SeekableInputStream> inStream = + createDecompressor(getCompression(), + std::unique_ptr<SeekableInputStream> + (new SeekableFileInputStream + (contents->stream.get(), + offset, + pbStream.length(), + *contents->pool)), + getCompressionSize(), + *contents->pool); + + proto::RowIndex rowIndex; + if (!rowIndex.ParseFromZeroCopyStream(inStream.get())) { + throw ParseError("Failed to parse the row index"); + } + + rowIndexes[colId] = rowIndex; + } + offset += pbStream.length(); + } + + // store positions for selected columns + std::vector<std::list<uint64_t>> positions; + // store position providers for selected colimns + std::unordered_map<uint64_t, PositionProvider> positionProviders; + + for (auto rowIndex = rowIndexes.cbegin(); + rowIndex != rowIndexes.cend(); ++rowIndex) { + uint64_t colId = rowIndex->first; + const proto::RowIndexEntry& entry = + rowIndex->second.entry(static_cast<int32_t>(rowGroupEntryId)); + + // copy index positions for a specific column + positions.push_back({}); + auto& position = positions.back(); + for (int pos = 0; pos != entry.positions_size(); ++pos) { + position.push_back(entry.positions(pos)); + } + positionProviders.insert(std::make_pair(colId, PositionProvider(position))); + } + + reader->seekToRowGroup(positionProviders); + } + + const FileContents& RowReaderImpl::getFileContents() const { + return *contents; + } + + bool RowReaderImpl::getThrowOnHive11DecimalOverflow() const { + return throwOnHive11DecimalOverflow; + } + + int32_t RowReaderImpl::getForcedScaleOnHive11Decimal() const { + return forcedScaleOnHive11Decimal; + } + + proto::StripeFooter getStripeFooter(const proto::StripeInformation& info, + const FileContents& contents) { + uint64_t stripeFooterStart = info.offset() + info.indexlength() + + info.datalength(); + uint64_t stripeFooterLength = info.footerlength(); + 
std::unique_ptr<SeekableInputStream> pbStream = + createDecompressor(contents.compression, + std::unique_ptr<SeekableInputStream> + (new SeekableFileInputStream(contents.stream.get(), + stripeFooterStart, + stripeFooterLength, + *contents.pool)), + contents.blockSize, + *contents.pool); + proto::StripeFooter result; + if (!result.ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError(std::string("bad StripeFooter from ") + + pbStream->getName()); + } + return result; + } + + ReaderImpl::ReaderImpl(std::shared_ptr<FileContents> _contents, + const ReaderOptions& opts, + uint64_t _fileLength, + uint64_t _postscriptLength + ): contents(std::move(_contents)), + options(opts), + fileLength(_fileLength), + postscriptLength(_postscriptLength), + footer(contents->footer.get()) { + isMetadataLoaded = false; + checkOrcVersion(); + numberOfStripes = static_cast<uint64_t>(footer->stripes_size()); + contents->schema = REDUNDANT_MOVE(convertType(footer->types(0), *footer)); + contents->blockSize = getCompressionBlockSize(*contents->postscript); + contents->compression= convertCompressionKind(*contents->postscript); + } + + std::string ReaderImpl::getSerializedFileTail() const { + proto::FileTail tail; + proto::PostScript *mutable_ps = tail.mutable_postscript(); + mutable_ps->CopyFrom(*contents->postscript); + proto::Footer *mutableFooter = tail.mutable_footer(); + mutableFooter->CopyFrom(*footer); + tail.set_filelength(fileLength); + tail.set_postscriptlength(postscriptLength); + TString result; + if (!tail.SerializeToString(&result)) { + throw ParseError("Failed to serialize file tail"); + } + return result; + } + + const ReaderOptions& ReaderImpl::getReaderOptions() const { + return options; + } + + CompressionKind ReaderImpl::getCompression() const { + return contents->compression; + } + + uint64_t ReaderImpl::getCompressionSize() const { + return contents->blockSize; + } + + uint64_t ReaderImpl::getNumberOfStripes() const { + return numberOfStripes; + } + + uint64_t ReaderImpl::getNumberOfStripeStatistics() const { + if (!isMetadataLoaded) { + readMetadata(); + } + return metadata.get() == nullptr ? 
0 : + static_cast<uint64_t>(metadata->stripestats_size()); + } + + std::unique_ptr<StripeInformation> + ReaderImpl::getStripe(uint64_t stripeIndex) const { + if (stripeIndex > getNumberOfStripes()) { + throw std::logic_error("stripe index out of range"); + } + proto::StripeInformation stripeInfo = + footer->stripes(static_cast<int>(stripeIndex)); + + return std::unique_ptr<StripeInformation> + (new StripeInformationImpl + (stripeInfo.offset(), + stripeInfo.indexlength(), + stripeInfo.datalength(), + stripeInfo.footerlength(), + stripeInfo.numberofrows(), + contents->stream.get(), + *contents->pool, + contents->compression, + contents->blockSize)); + } + + FileVersion ReaderImpl::getFormatVersion() const { + if (contents->postscript->version_size() != 2) { + return FileVersion::v_0_11(); + } + return FileVersion( + contents->postscript->version(0), + contents->postscript->version(1)); + } + + uint64_t ReaderImpl::getNumberOfRows() const { + return footer->numberofrows(); + } + + WriterId ReaderImpl::getWriterId() const { + if (footer->has_writer()) { + uint32_t id = footer->writer(); if (id > WriterId::TRINO_WRITER) { - return WriterId::UNKNOWN_WRITER; - } else { - return static_cast<WriterId>(id); - } - } - return WriterId::ORC_JAVA_WRITER; - } - - uint32_t ReaderImpl::getWriterIdValue() const { - if (footer->has_writer()) { - return footer->writer(); - } else { - return WriterId::ORC_JAVA_WRITER; - } - } - + return WriterId::UNKNOWN_WRITER; + } else { + return static_cast<WriterId>(id); + } + } + return WriterId::ORC_JAVA_WRITER; + } + + uint32_t ReaderImpl::getWriterIdValue() const { + if (footer->has_writer()) { + return footer->writer(); + } else { + return WriterId::ORC_JAVA_WRITER; + } + } + std::string ReaderImpl::getSoftwareVersion() const { std::ostringstream buffer; buffer << writerIdToString(getWriterIdValue()); @@ -517,704 +517,704 @@ namespace orc { return buffer.str(); } - WriterVersion ReaderImpl::getWriterVersion() const { - if (!contents->postscript->has_writerversion()) { - return WriterVersion_ORIGINAL; - } - return static_cast<WriterVersion>(contents->postscript->writerversion()); - } - - uint64_t ReaderImpl::getContentLength() const { - return footer->contentlength(); - } - - uint64_t ReaderImpl::getStripeStatisticsLength() const { - return contents->postscript->metadatalength(); - } - - uint64_t ReaderImpl::getFileFooterLength() const { - return contents->postscript->footerlength(); - } - - uint64_t ReaderImpl::getFilePostscriptLength() const { - return postscriptLength; - } - - uint64_t ReaderImpl::getFileLength() const { - return fileLength; - } - - uint64_t ReaderImpl::getRowIndexStride() const { - return footer->rowindexstride(); - } - - const std::string& ReaderImpl::getStreamName() const { - return contents->stream->getName(); - } - - std::list<std::string> ReaderImpl::getMetadataKeys() const { - std::list<std::string> result; - for(int i=0; i < footer->metadata_size(); ++i) { - result.push_back(footer->metadata(i).name()); - } - return result; - } - - std::string ReaderImpl::getMetadataValue(const std::string& key) const { - for(int i=0; i < footer->metadata_size(); ++i) { - if (footer->metadata(i).name() == TString(key)) { - return footer->metadata(i).value(); - } - } - throw std::range_error("key not found"); - } - - void ReaderImpl::getRowIndexStatistics(const proto::StripeInformation& stripeInfo, - uint64_t stripeIndex, const proto::StripeFooter& currentStripeFooter, - std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const { - int num_streams 
= currentStripeFooter.streams_size(); - uint64_t offset = stripeInfo.offset(); - uint64_t indexEnd = stripeInfo.offset() + stripeInfo.indexlength(); - for (int i = 0; i < num_streams; i++) { - const proto::Stream& stream = currentStripeFooter.streams(i); - StreamKind streamKind = static_cast<StreamKind>(stream.kind()); - uint64_t length = static_cast<uint64_t>(stream.length()); - if (streamKind == StreamKind::StreamKind_ROW_INDEX) { - if (offset + length > indexEnd) { - std::stringstream msg; - msg << "Malformed RowIndex stream meta in stripe " << stripeIndex - << ": streamOffset=" << offset << ", streamLength=" << length - << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength=" - << stripeInfo.indexlength(); - throw ParseError(msg.str()); - } - std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(contents->compression, - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream(contents->stream.get(), - offset, - length, - *contents->pool)), - contents->blockSize, - *(contents->pool)); - - proto::RowIndex rowIndex; - if (!rowIndex.ParseFromZeroCopyStream(pbStream.get())) { - throw ParseError("Failed to parse RowIndex from stripe footer"); - } - int num_entries = rowIndex.entry_size(); - size_t column = static_cast<size_t>(stream.column()); - for (int j = 0; j < num_entries; j++) { - const proto::RowIndexEntry& entry = rowIndex.entry(j); - (*indexStats)[column].push_back(entry.statistics()); - } - } - offset += length; - } - } - - bool ReaderImpl::hasMetadataValue(const std::string& key) const { - for(int i=0; i < footer->metadata_size(); ++i) { - if (footer->metadata(i).name() == TString(key)) { - return true; - } - } - return false; - } - - const Type& ReaderImpl::getType() const { - return *(contents->schema.get()); - } - - std::unique_ptr<StripeStatistics> - ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const { - if (!isMetadataLoaded) { - readMetadata(); - } - if (metadata.get() == nullptr) { - throw std::logic_error("No stripe statistics in file"); - } - size_t num_cols = static_cast<size_t>( - metadata->stripestats( - static_cast<int>(stripeIndex)).colstats_size()); - std::vector<std::vector<proto::ColumnStatistics> > indexStats(num_cols); - - proto::StripeInformation currentStripeInfo = - footer->stripes(static_cast<int>(stripeIndex)); - proto::StripeFooter currentStripeFooter = - getStripeFooter(currentStripeInfo, *contents.get()); - - getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats); - - const Timezone& writerTZ = - currentStripeFooter.has_writertimezone() ? 
- getTimezoneByName(currentStripeFooter.writertimezone()) : - getLocalTimezone(); - StatContext statContext(hasCorrectStatistics(), &writerTZ); - return std::unique_ptr<StripeStatistics> - (new StripeStatisticsImpl(metadata->stripestats(static_cast<int>(stripeIndex)), - indexStats, statContext)); - } - - std::unique_ptr<Statistics> ReaderImpl::getStatistics() const { - StatContext statContext(hasCorrectStatistics()); - return std::unique_ptr<Statistics> - (new StatisticsImpl(*footer, statContext)); - } - - std::unique_ptr<ColumnStatistics> - ReaderImpl::getColumnStatistics(uint32_t index) const { - if (index >= static_cast<uint64_t>(footer->statistics_size())) { - throw std::logic_error("column index out of range"); - } - proto::ColumnStatistics col = - footer->statistics(static_cast<int32_t>(index)); - - StatContext statContext(hasCorrectStatistics()); - return std::unique_ptr<ColumnStatistics> (convertColumnStatistics(col, statContext)); - } - - void ReaderImpl::readMetadata() const { - uint64_t metadataSize = contents->postscript->metadatalength(); - uint64_t footerLength = contents->postscript->footerlength(); - if (fileLength < metadataSize + footerLength + postscriptLength + 1) { - std::stringstream msg; - msg << "Invalid Metadata length: fileLength=" << fileLength - << ", metadataLength=" << metadataSize << ", footerLength=" << footerLength - << ", postscriptLength=" << postscriptLength; - throw ParseError(msg.str()); - } - uint64_t metadataStart = fileLength - metadataSize - footerLength - postscriptLength - 1; - if (metadataSize != 0) { - std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(contents->compression, - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream(contents->stream.get(), - metadataStart, - metadataSize, - *contents->pool)), - contents->blockSize, - *contents->pool); - metadata.reset(new proto::Metadata()); - if (!metadata->ParseFromZeroCopyStream(pbStream.get())) { - throw ParseError("Failed to parse the metadata"); - } - } - isMetadataLoaded = true; - } - - bool ReaderImpl::hasCorrectStatistics() const { - return !WriterVersionImpl::VERSION_HIVE_8732().compareGT(getWriterVersion()); - } - - void ReaderImpl::checkOrcVersion() { - FileVersion version = getFormatVersion(); - if (version != FileVersion(0, 11) && version != FileVersion(0, 12)) { - *(options.getErrorStream()) - << "Warning: ORC file " << contents->stream->getName() - << " was written in an unknown format version " - << version.toString() << "\n"; - } - } - - std::unique_ptr<RowReader> ReaderImpl::createRowReader() const { - RowReaderOptions defaultOpts; - return createRowReader(defaultOpts); - } - - std::unique_ptr<RowReader> ReaderImpl::createRowReader( - const RowReaderOptions& opts) const { - return std::unique_ptr<RowReader>(new RowReaderImpl(contents, opts)); - } - - uint64_t maxStreamsForType(const proto::Type& type) { - switch (static_cast<int64_t>(type.kind())) { - case proto::Type_Kind_STRUCT: - return 1; - case proto::Type_Kind_INT: - case proto::Type_Kind_LONG: - case proto::Type_Kind_SHORT: - case proto::Type_Kind_FLOAT: - case proto::Type_Kind_DOUBLE: - case proto::Type_Kind_BOOLEAN: - case proto::Type_Kind_BYTE: - case proto::Type_Kind_DATE: - case proto::Type_Kind_LIST: - case proto::Type_Kind_MAP: - case proto::Type_Kind_UNION: - return 2; - case proto::Type_Kind_BINARY: - case proto::Type_Kind_DECIMAL: - case proto::Type_Kind_TIMESTAMP: - return 3; - case proto::Type_Kind_CHAR: - case proto::Type_Kind_STRING: - case proto::Type_Kind_VARCHAR: - return 
4; - default: - return 0; - } - } - - uint64_t ReaderImpl::getMemoryUse(int stripeIx) { - std::vector<bool> selectedColumns; - selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), true); - return getMemoryUse(stripeIx, selectedColumns); - } - - uint64_t ReaderImpl::getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx) { - std::vector<bool> selectedColumns; - selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); - ColumnSelector column_selector(contents.get()); - if (contents->schema->getKind() == STRUCT && include.begin() != include.end()) { - for(std::list<uint64_t>::const_iterator field = include.begin(); - field != include.end(); ++field) { - column_selector.updateSelectedByFieldId(selectedColumns, *field); - } - } else { - // default is to select all columns - std::fill(selectedColumns.begin(), selectedColumns.end(), true); - } - column_selector.selectParents(selectedColumns, *contents->schema.get()); - selectedColumns[0] = true; // column 0 is selected by default - return getMemoryUse(stripeIx, selectedColumns); - } - - uint64_t ReaderImpl::getMemoryUseByName(const std::list<std::string>& names, int stripeIx) { - std::vector<bool> selectedColumns; - selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); - ColumnSelector column_selector(contents.get()); - if (contents->schema->getKind() == STRUCT && names.begin() != names.end()) { - for(std::list<std::string>::const_iterator field = names.begin(); - field != names.end(); ++field) { - column_selector.updateSelectedByName(selectedColumns, *field); - } - } else { - // default is to select all columns - std::fill(selectedColumns.begin(), selectedColumns.end(), true); - } - column_selector.selectParents(selectedColumns, *contents->schema.get()); - selectedColumns[0] = true; // column 0 is selected by default - return getMemoryUse(stripeIx, selectedColumns); - } - - uint64_t ReaderImpl::getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx) { - std::vector<bool> selectedColumns; - selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); - ColumnSelector column_selector(contents.get()); - if (include.begin() != include.end()) { - for(std::list<uint64_t>::const_iterator field = include.begin(); - field != include.end(); ++field) { - column_selector.updateSelectedByTypeId(selectedColumns, *field); - } - } else { - // default is to select all columns - std::fill(selectedColumns.begin(), selectedColumns.end(), true); - } - column_selector.selectParents(selectedColumns, *contents->schema.get()); - selectedColumns[0] = true; // column 0 is selected by default - return getMemoryUse(stripeIx, selectedColumns); - } - - uint64_t ReaderImpl::getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns) { - uint64_t maxDataLength = 0; - - if (stripeIx >= 0 && stripeIx < footer->stripes_size()) { - uint64_t stripe = footer->stripes(stripeIx).datalength(); - if (maxDataLength < stripe) { - maxDataLength = stripe; - } - } else { - for (int i=0; i < footer->stripes_size(); i++) { - uint64_t stripe = footer->stripes(i).datalength(); - if (maxDataLength < stripe) { - maxDataLength = stripe; - } - } - } - - bool hasStringColumn = false; - uint64_t nSelectedStreams = 0; - for (int i=0; !hasStringColumn && i < footer->types_size(); i++) { - if (selectedColumns[static_cast<size_t>(i)]) { - const proto::Type& type = footer->types(i); - nSelectedStreams += maxStreamsForType(type) ; - switch 
(static_cast<int64_t>(type.kind())) { - case proto::Type_Kind_CHAR: - case proto::Type_Kind_STRING: - case proto::Type_Kind_VARCHAR: - case proto::Type_Kind_BINARY: { - hasStringColumn = true; - break; - } - default: { - break; - } - } - } - } - - /* If a string column is read, use stripe datalength as a memory estimate - * because we don't know the dictionary size. Multiply by 2 because - * a string column requires two buffers: - * in the input stream and in the seekable input stream. - * If no string column is read, estimate from the number of streams. - */ - uint64_t memory = hasStringColumn ? 2 * maxDataLength : - std::min(uint64_t(maxDataLength), - nSelectedStreams * contents->stream->getNaturalReadSize()); - - // Do we need even more memory to read the footer or the metadata? - if (memory < contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS) { - memory = contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS; - } - if (memory < contents->postscript->metadatalength()) { - memory = contents->postscript->metadatalength(); - } - - // Account for firstRowOfStripe. - memory += static_cast<uint64_t>(footer->stripes_size()) * sizeof(uint64_t); - - // Decompressors need buffers for each stream - uint64_t decompressorMemory = 0; - if (contents->compression != CompressionKind_NONE) { - for (int i=0; i < footer->types_size(); i++) { - if (selectedColumns[static_cast<size_t>(i)]) { - const proto::Type& type = footer->types(i); - decompressorMemory += maxStreamsForType(type) * contents->blockSize; - } - } - if (contents->compression == CompressionKind_SNAPPY) { - decompressorMemory *= 2; // Snappy decompressor uses a second buffer - } - } - - return memory + decompressorMemory ; - } - - void RowReaderImpl::startNextStripe() { - reader.reset(); // ColumnReaders use lots of memory; free old memory first - currentStripeInfo = footer->stripes(static_cast<int>(currentStripe)); - uint64_t fileLength = contents->stream->getLength(); - if (currentStripeInfo.offset() + currentStripeInfo.indexlength() + - currentStripeInfo.datalength() + currentStripeInfo.footerlength() >= fileLength) { - std::stringstream msg; - msg << "Malformed StripeInformation at stripe index " << currentStripe << ": fileLength=" - << fileLength << ", StripeInfo=(offset=" << currentStripeInfo.offset() << ", indexLength=" - << currentStripeInfo.indexlength() << ", dataLength=" << currentStripeInfo.datalength() - << ", footerLength=" << currentStripeInfo.footerlength() << ")"; - throw ParseError(msg.str()); - } - currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get()); - rowsInCurrentStripe = currentStripeInfo.numberofrows(); - const Timezone& writerTimezone = - currentStripeFooter.has_writertimezone() ? 
- getTimezoneByName(currentStripeFooter.writertimezone()) : - localTimezone; - StripeStreamsImpl stripeStreams(*this, currentStripe, currentStripeInfo, - currentStripeFooter, - currentStripeInfo.offset(), - *(contents->stream.get()), - writerTimezone); - reader = buildReader(*contents->schema.get(), stripeStreams); - } - - bool RowReaderImpl::next(ColumnVectorBatch& data) { - if (currentStripe >= lastStripe) { - data.numElements = 0; - if (lastStripe > 0) { - previousRow = firstRowOfStripe[lastStripe - 1] + - footer->stripes(static_cast<int>(lastStripe - 1)).numberofrows(); - } else { - previousRow = 0; - } - return false; - } - if (currentRowInStripe == 0) { - startNextStripe(); - } - uint64_t rowsToRead = - std::min(static_cast<uint64_t>(data.capacity), - rowsInCurrentStripe - currentRowInStripe); - data.numElements = rowsToRead; - if (enableEncodedBlock) { - reader->nextEncoded(data, rowsToRead, nullptr); - } - else { - reader->next(data, rowsToRead, nullptr); - } - // update row number - previousRow = firstRowOfStripe[currentStripe] + currentRowInStripe; - currentRowInStripe += rowsToRead; - if (currentRowInStripe >= rowsInCurrentStripe) { - currentStripe += 1; - currentRowInStripe = 0; - } - return rowsToRead != 0; - } - - std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch - (uint64_t capacity) const { - return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock); - } - - void ensureOrcFooter(InputStream* stream, - DataBuffer<char> *buffer, - uint64_t postscriptLength) { - - const std::string MAGIC("ORC"); - const uint64_t magicLength = MAGIC.length(); - const char * const bufferStart = buffer->data(); - const uint64_t bufferLength = buffer->size(); - - if (postscriptLength < magicLength || bufferLength < magicLength) { - throw ParseError("Invalid ORC postscript length"); - } - const char* magicStart = bufferStart + bufferLength - 1 - magicLength; - - // Look for the magic string at the end of the postscript. - if (memcmp(magicStart, MAGIC.c_str(), magicLength) != 0) { - // If there is no magic string at the end, check the beginning. - // Only files written by Hive 0.11.0 don't have the tail ORC string. - std::unique_ptr<char[]> frontBuffer( new char[magicLength] ); - stream->read(frontBuffer.get(), magicLength, 0); - bool foundMatch = memcmp(frontBuffer.get(), MAGIC.c_str(), magicLength) == 0; - - if (!foundMatch) { - throw ParseError("Not an ORC file"); - } - } - } - - /** - * Read the file's postscript from the given buffer. - * @param stream the file stream - * @param buffer the buffer with the tail of the file. 
- * @param postscriptSize the length of postscript in bytes - */ - std::unique_ptr<proto::PostScript> readPostscript(InputStream *stream, - DataBuffer<char> *buffer, - uint64_t postscriptSize) { - char *ptr = buffer->data(); - uint64_t readSize = buffer->size(); - - ensureOrcFooter(stream, buffer, postscriptSize); - - std::unique_ptr<proto::PostScript> postscript = - std::unique_ptr<proto::PostScript>(new proto::PostScript()); - if (readSize < 1 + postscriptSize) { - std::stringstream msg; - msg << "Invalid ORC postscript length: " << postscriptSize << ", file length = " - << stream->getLength(); - throw ParseError(msg.str()); - } - if (!postscript->ParseFromArray(ptr + readSize - 1 - postscriptSize, - static_cast<int>(postscriptSize))) { - throw ParseError("Failed to parse the postscript from " + - stream->getName()); - } - return REDUNDANT_MOVE(postscript); - } - - /** - * Check that indices in the type tree are valid, so we won't crash - * when we convert the proto::Types to TypeImpls. - */ - void checkProtoTypeIds(const proto::Footer &footer) { - std::stringstream msg; - int maxId = footer.types_size(); - if (maxId <= 0) { - throw ParseError("Footer is corrupt: no types found"); - } - for (int i = 0; i < maxId; ++i) { - const proto::Type& type = footer.types(i); - for (int j = 0; j < type.subtypes_size(); ++j) { - int subTypeId = static_cast<int>(type.subtypes(j)); - if (subTypeId <= i) { - msg << "Footer is corrupt: malformed link from type " << i << " to " - << subTypeId; - throw ParseError(msg.str()); - } - if (subTypeId >= maxId) { - msg << "Footer is corrupt: types(" << subTypeId << ") not exists"; - throw ParseError(msg.str()); - } - if (j > 0 && static_cast<int>(type.subtypes(j - 1)) >= subTypeId) { - msg << "Footer is corrupt: subType(" << (j-1) << ") >= subType(" << j - << ") in types(" << i << "). (" << type.subtypes(j - 1) << " >= " - << subTypeId << ")"; - throw ParseError(msg.str()); - } - } - } - } - - /** - * Parse the footer from the given buffer. 
- * @param stream the file's stream - * @param buffer the buffer to parse the footer from - * @param footerOffset the offset within the buffer that contains the footer - * @param ps the file's postscript - * @param memoryPool the memory pool to use - */ - std::unique_ptr<proto::Footer> readFooter(InputStream* stream, - const DataBuffer<char> *buffer, - uint64_t footerOffset, - const proto::PostScript& ps, - MemoryPool& memoryPool) { - const char *footerPtr = buffer->data() + footerOffset; - - std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(convertCompressionKind(ps), - std::unique_ptr<SeekableInputStream> - (new SeekableArrayInputStream(footerPtr, - ps.footerlength())), - getCompressionBlockSize(ps), - memoryPool); - - std::unique_ptr<proto::Footer> footer = - std::unique_ptr<proto::Footer>(new proto::Footer()); - if (!footer->ParseFromZeroCopyStream(pbStream.get())) { - throw ParseError("Failed to parse the footer from " + - stream->getName()); - } - - checkProtoTypeIds(*footer); - return REDUNDANT_MOVE(footer); - } - - std::unique_ptr<Reader> createReader(std::unique_ptr<InputStream> stream, - const ReaderOptions& options) { - std::shared_ptr<FileContents> contents = std::shared_ptr<FileContents>(new FileContents()); - contents->pool = options.getMemoryPool(); - contents->errorStream = options.getErrorStream(); - std::string serializedFooter = options.getSerializedFileTail(); - uint64_t fileLength; - uint64_t postscriptLength; - if (serializedFooter.length() != 0) { - // Parse the file tail from the serialized one. - proto::FileTail tail; - if (!tail.ParseFromString(TString(serializedFooter))) { - throw ParseError("Failed to parse the file tail from string"); - } - contents->postscript.reset(new proto::PostScript(tail.postscript())); - contents->footer.reset(new proto::Footer(tail.footer())); - fileLength = tail.filelength(); - postscriptLength = tail.postscriptlength(); - } else { - // figure out the size of the file using the option or filesystem - fileLength = std::min(options.getTailLocation(), - static_cast<uint64_t>(stream->getLength())); - - //read last bytes into buffer to get PostScript - uint64_t readSize = std::min(fileLength, DIRECTORY_SIZE_GUESS); - if (readSize < 4) { - throw ParseError("File size too small"); - } - std::unique_ptr<DataBuffer<char>> buffer( new DataBuffer<char>(*contents->pool, readSize) ); - stream->read(buffer->data(), readSize, fileLength - readSize); - - postscriptLength = buffer->data()[readSize - 1] & 0xff; - contents->postscript = REDUNDANT_MOVE(readPostscript(stream.get(), - buffer.get(), postscriptLength)); - uint64_t footerSize = contents->postscript->footerlength(); - uint64_t tailSize = 1 + postscriptLength + footerSize; - if (tailSize >= fileLength) { - std::stringstream msg; - msg << "Invalid ORC tailSize=" << tailSize << ", fileLength=" << fileLength; - throw ParseError(msg.str()); - } - uint64_t footerOffset; - - if (tailSize > readSize) { - buffer->resize(footerSize); - stream->read(buffer->data(), footerSize, fileLength - tailSize); - footerOffset = 0; - } else { - footerOffset = readSize - tailSize; - } - - contents->footer = REDUNDANT_MOVE(readFooter(stream.get(), buffer.get(), - footerOffset, *contents->postscript, *contents->pool)); - } - contents->stream = std::move(stream); - return std::unique_ptr<Reader>(new ReaderImpl(std::move(contents), - options, - fileLength, - postscriptLength)); - } - - std::map<uint32_t, BloomFilterIndex> - ReaderImpl::getBloomFilters(uint32_t stripeIndex, - const std::set<uint32_t>& 
included) const { - std::map<uint32_t, BloomFilterIndex> ret; - - // find stripe info - if (stripeIndex >= static_cast<uint32_t>(footer->stripes_size())) { - throw std::logic_error("Illegal stripe index: " + to_string(static_cast<int64_t>(stripeIndex))); - } - const proto::StripeInformation currentStripeInfo = - footer->stripes(static_cast<int>(stripeIndex)); - const proto::StripeFooter currentStripeFooter = - getStripeFooter(currentStripeInfo, *contents); - - // iterate stripe footer to get stream of bloomfilter - uint64_t offset = static_cast<uint64_t>(currentStripeInfo.offset()); - for (int i = 0; i < currentStripeFooter.streams_size(); i++) { - const proto::Stream& stream = currentStripeFooter.streams(i); - uint32_t column = static_cast<uint32_t>(stream.column()); - uint64_t length = static_cast<uint64_t>(stream.length()); - - // a bloom filter stream from a selected column is found - if (stream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8 && - (included.empty() || included.find(column) != included.end())) { - - std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(contents->compression, - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream(contents->stream.get(), - offset, - length, - *contents->pool)), - contents->blockSize, - *(contents->pool)); - - proto::BloomFilterIndex pbBFIndex; - if (!pbBFIndex.ParseFromZeroCopyStream(pbStream.get())) { - throw ParseError("Failed to parse BloomFilterIndex"); - } - - BloomFilterIndex bfIndex; - for (int j = 0; j < pbBFIndex.bloomfilter_size(); j++) { - std::unique_ptr<BloomFilter> entry = BloomFilterUTF8Utils::deserialize( - stream.kind(), - currentStripeFooter.columns(static_cast<int>(stream.column())), - pbBFIndex.bloomfilter(j)); - bfIndex.entries.push_back(std::shared_ptr<BloomFilter>(std::move(entry))); - } - - // add bloom filters to result for one column - ret[column] = bfIndex; - } - - offset += length; - } - - return ret; - } - - RowReader::~RowReader() { - // PASS - } - - Reader::~Reader() { - // PASS - } - - InputStream::~InputStream() { - // PASS - }; - - - -}// namespace + WriterVersion ReaderImpl::getWriterVersion() const { + if (!contents->postscript->has_writerversion()) { + return WriterVersion_ORIGINAL; + } + return static_cast<WriterVersion>(contents->postscript->writerversion()); + } + + uint64_t ReaderImpl::getContentLength() const { + return footer->contentlength(); + } + + uint64_t ReaderImpl::getStripeStatisticsLength() const { + return contents->postscript->metadatalength(); + } + + uint64_t ReaderImpl::getFileFooterLength() const { + return contents->postscript->footerlength(); + } + + uint64_t ReaderImpl::getFilePostscriptLength() const { + return postscriptLength; + } + + uint64_t ReaderImpl::getFileLength() const { + return fileLength; + } + + uint64_t ReaderImpl::getRowIndexStride() const { + return footer->rowindexstride(); + } + + const std::string& ReaderImpl::getStreamName() const { + return contents->stream->getName(); + } + + std::list<std::string> ReaderImpl::getMetadataKeys() const { + std::list<std::string> result; + for(int i=0; i < footer->metadata_size(); ++i) { + result.push_back(footer->metadata(i).name()); + } + return result; + } + + std::string ReaderImpl::getMetadataValue(const std::string& key) const { + for(int i=0; i < footer->metadata_size(); ++i) { + if (footer->metadata(i).name() == TString(key)) { + return footer->metadata(i).value(); + } + } + throw std::range_error("key not found"); + } + + void ReaderImpl::getRowIndexStatistics(const 
proto::StripeInformation& stripeInfo, + uint64_t stripeIndex, const proto::StripeFooter& currentStripeFooter, + std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const { + int num_streams = currentStripeFooter.streams_size(); + uint64_t offset = stripeInfo.offset(); + uint64_t indexEnd = stripeInfo.offset() + stripeInfo.indexlength(); + for (int i = 0; i < num_streams; i++) { + const proto::Stream& stream = currentStripeFooter.streams(i); + StreamKind streamKind = static_cast<StreamKind>(stream.kind()); + uint64_t length = static_cast<uint64_t>(stream.length()); + if (streamKind == StreamKind::StreamKind_ROW_INDEX) { + if (offset + length > indexEnd) { + std::stringstream msg; + msg << "Malformed RowIndex stream meta in stripe " << stripeIndex + << ": streamOffset=" << offset << ", streamLength=" << length + << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength=" + << stripeInfo.indexlength(); + throw ParseError(msg.str()); + } + std::unique_ptr<SeekableInputStream> pbStream = + createDecompressor(contents->compression, + std::unique_ptr<SeekableInputStream> + (new SeekableFileInputStream(contents->stream.get(), + offset, + length, + *contents->pool)), + contents->blockSize, + *(contents->pool)); + + proto::RowIndex rowIndex; + if (!rowIndex.ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError("Failed to parse RowIndex from stripe footer"); + } + int num_entries = rowIndex.entry_size(); + size_t column = static_cast<size_t>(stream.column()); + for (int j = 0; j < num_entries; j++) { + const proto::RowIndexEntry& entry = rowIndex.entry(j); + (*indexStats)[column].push_back(entry.statistics()); + } + } + offset += length; + } + } + + bool ReaderImpl::hasMetadataValue(const std::string& key) const { + for(int i=0; i < footer->metadata_size(); ++i) { + if (footer->metadata(i).name() == TString(key)) { + return true; + } + } + return false; + } + + const Type& ReaderImpl::getType() const { + return *(contents->schema.get()); + } + + std::unique_ptr<StripeStatistics> + ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const { + if (!isMetadataLoaded) { + readMetadata(); + } + if (metadata.get() == nullptr) { + throw std::logic_error("No stripe statistics in file"); + } + size_t num_cols = static_cast<size_t>( + metadata->stripestats( + static_cast<int>(stripeIndex)).colstats_size()); + std::vector<std::vector<proto::ColumnStatistics> > indexStats(num_cols); + + proto::StripeInformation currentStripeInfo = + footer->stripes(static_cast<int>(stripeIndex)); + proto::StripeFooter currentStripeFooter = + getStripeFooter(currentStripeInfo, *contents.get()); + + getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats); + + const Timezone& writerTZ = + currentStripeFooter.has_writertimezone() ? 
+ getTimezoneByName(currentStripeFooter.writertimezone()) : + getLocalTimezone(); + StatContext statContext(hasCorrectStatistics(), &writerTZ); + return std::unique_ptr<StripeStatistics> + (new StripeStatisticsImpl(metadata->stripestats(static_cast<int>(stripeIndex)), + indexStats, statContext)); + } + + std::unique_ptr<Statistics> ReaderImpl::getStatistics() const { + StatContext statContext(hasCorrectStatistics()); + return std::unique_ptr<Statistics> + (new StatisticsImpl(*footer, statContext)); + } + + std::unique_ptr<ColumnStatistics> + ReaderImpl::getColumnStatistics(uint32_t index) const { + if (index >= static_cast<uint64_t>(footer->statistics_size())) { + throw std::logic_error("column index out of range"); + } + proto::ColumnStatistics col = + footer->statistics(static_cast<int32_t>(index)); + + StatContext statContext(hasCorrectStatistics()); + return std::unique_ptr<ColumnStatistics> (convertColumnStatistics(col, statContext)); + } + + void ReaderImpl::readMetadata() const { + uint64_t metadataSize = contents->postscript->metadatalength(); + uint64_t footerLength = contents->postscript->footerlength(); + if (fileLength < metadataSize + footerLength + postscriptLength + 1) { + std::stringstream msg; + msg << "Invalid Metadata length: fileLength=" << fileLength + << ", metadataLength=" << metadataSize << ", footerLength=" << footerLength + << ", postscriptLength=" << postscriptLength; + throw ParseError(msg.str()); + } + uint64_t metadataStart = fileLength - metadataSize - footerLength - postscriptLength - 1; + if (metadataSize != 0) { + std::unique_ptr<SeekableInputStream> pbStream = + createDecompressor(contents->compression, + std::unique_ptr<SeekableInputStream> + (new SeekableFileInputStream(contents->stream.get(), + metadataStart, + metadataSize, + *contents->pool)), + contents->blockSize, + *contents->pool); + metadata.reset(new proto::Metadata()); + if (!metadata->ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError("Failed to parse the metadata"); + } + } + isMetadataLoaded = true; + } + + bool ReaderImpl::hasCorrectStatistics() const { + return !WriterVersionImpl::VERSION_HIVE_8732().compareGT(getWriterVersion()); + } + + void ReaderImpl::checkOrcVersion() { + FileVersion version = getFormatVersion(); + if (version != FileVersion(0, 11) && version != FileVersion(0, 12)) { + *(options.getErrorStream()) + << "Warning: ORC file " << contents->stream->getName() + << " was written in an unknown format version " + << version.toString() << "\n"; + } + } + + std::unique_ptr<RowReader> ReaderImpl::createRowReader() const { + RowReaderOptions defaultOpts; + return createRowReader(defaultOpts); + } + + std::unique_ptr<RowReader> ReaderImpl::createRowReader( + const RowReaderOptions& opts) const { + return std::unique_ptr<RowReader>(new RowReaderImpl(contents, opts)); + } + + uint64_t maxStreamsForType(const proto::Type& type) { + switch (static_cast<int64_t>(type.kind())) { + case proto::Type_Kind_STRUCT: + return 1; + case proto::Type_Kind_INT: + case proto::Type_Kind_LONG: + case proto::Type_Kind_SHORT: + case proto::Type_Kind_FLOAT: + case proto::Type_Kind_DOUBLE: + case proto::Type_Kind_BOOLEAN: + case proto::Type_Kind_BYTE: + case proto::Type_Kind_DATE: + case proto::Type_Kind_LIST: + case proto::Type_Kind_MAP: + case proto::Type_Kind_UNION: + return 2; + case proto::Type_Kind_BINARY: + case proto::Type_Kind_DECIMAL: + case proto::Type_Kind_TIMESTAMP: + return 3; + case proto::Type_Kind_CHAR: + case proto::Type_Kind_STRING: + case proto::Type_Kind_VARCHAR: + return 
4; + default: + return 0; + } + } + + uint64_t ReaderImpl::getMemoryUse(int stripeIx) { + std::vector<bool> selectedColumns; + selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), true); + return getMemoryUse(stripeIx, selectedColumns); + } + + uint64_t ReaderImpl::getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx) { + std::vector<bool> selectedColumns; + selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); + ColumnSelector column_selector(contents.get()); + if (contents->schema->getKind() == STRUCT && include.begin() != include.end()) { + for(std::list<uint64_t>::const_iterator field = include.begin(); + field != include.end(); ++field) { + column_selector.updateSelectedByFieldId(selectedColumns, *field); + } + } else { + // default is to select all columns + std::fill(selectedColumns.begin(), selectedColumns.end(), true); + } + column_selector.selectParents(selectedColumns, *contents->schema.get()); + selectedColumns[0] = true; // column 0 is selected by default + return getMemoryUse(stripeIx, selectedColumns); + } + + uint64_t ReaderImpl::getMemoryUseByName(const std::list<std::string>& names, int stripeIx) { + std::vector<bool> selectedColumns; + selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); + ColumnSelector column_selector(contents.get()); + if (contents->schema->getKind() == STRUCT && names.begin() != names.end()) { + for(std::list<std::string>::const_iterator field = names.begin(); + field != names.end(); ++field) { + column_selector.updateSelectedByName(selectedColumns, *field); + } + } else { + // default is to select all columns + std::fill(selectedColumns.begin(), selectedColumns.end(), true); + } + column_selector.selectParents(selectedColumns, *contents->schema.get()); + selectedColumns[0] = true; // column 0 is selected by default + return getMemoryUse(stripeIx, selectedColumns); + } + + uint64_t ReaderImpl::getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx) { + std::vector<bool> selectedColumns; + selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); + ColumnSelector column_selector(contents.get()); + if (include.begin() != include.end()) { + for(std::list<uint64_t>::const_iterator field = include.begin(); + field != include.end(); ++field) { + column_selector.updateSelectedByTypeId(selectedColumns, *field); + } + } else { + // default is to select all columns + std::fill(selectedColumns.begin(), selectedColumns.end(), true); + } + column_selector.selectParents(selectedColumns, *contents->schema.get()); + selectedColumns[0] = true; // column 0 is selected by default + return getMemoryUse(stripeIx, selectedColumns); + } + + uint64_t ReaderImpl::getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns) { + uint64_t maxDataLength = 0; + + if (stripeIx >= 0 && stripeIx < footer->stripes_size()) { + uint64_t stripe = footer->stripes(stripeIx).datalength(); + if (maxDataLength < stripe) { + maxDataLength = stripe; + } + } else { + for (int i=0; i < footer->stripes_size(); i++) { + uint64_t stripe = footer->stripes(i).datalength(); + if (maxDataLength < stripe) { + maxDataLength = stripe; + } + } + } + + bool hasStringColumn = false; + uint64_t nSelectedStreams = 0; + for (int i=0; !hasStringColumn && i < footer->types_size(); i++) { + if (selectedColumns[static_cast<size_t>(i)]) { + const proto::Type& type = footer->types(i); + nSelectedStreams += maxStreamsForType(type) ; + switch 
(static_cast<int64_t>(type.kind())) { + case proto::Type_Kind_CHAR: + case proto::Type_Kind_STRING: + case proto::Type_Kind_VARCHAR: + case proto::Type_Kind_BINARY: { + hasStringColumn = true; + break; + } + default: { + break; + } + } + } + } + + /* If a string column is read, use stripe datalength as a memory estimate + * because we don't know the dictionary size. Multiply by 2 because + * a string column requires two buffers: + * in the input stream and in the seekable input stream. + * If no string column is read, estimate from the number of streams. + */ + uint64_t memory = hasStringColumn ? 2 * maxDataLength : + std::min(uint64_t(maxDataLength), + nSelectedStreams * contents->stream->getNaturalReadSize()); + + // Do we need even more memory to read the footer or the metadata? + if (memory < contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS) { + memory = contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS; + } + if (memory < contents->postscript->metadatalength()) { + memory = contents->postscript->metadatalength(); + } + + // Account for firstRowOfStripe. + memory += static_cast<uint64_t>(footer->stripes_size()) * sizeof(uint64_t); + + // Decompressors need buffers for each stream + uint64_t decompressorMemory = 0; + if (contents->compression != CompressionKind_NONE) { + for (int i=0; i < footer->types_size(); i++) { + if (selectedColumns[static_cast<size_t>(i)]) { + const proto::Type& type = footer->types(i); + decompressorMemory += maxStreamsForType(type) * contents->blockSize; + } + } + if (contents->compression == CompressionKind_SNAPPY) { + decompressorMemory *= 2; // Snappy decompressor uses a second buffer + } + } + + return memory + decompressorMemory ; + } + + void RowReaderImpl::startNextStripe() { + reader.reset(); // ColumnReaders use lots of memory; free old memory first + currentStripeInfo = footer->stripes(static_cast<int>(currentStripe)); + uint64_t fileLength = contents->stream->getLength(); + if (currentStripeInfo.offset() + currentStripeInfo.indexlength() + + currentStripeInfo.datalength() + currentStripeInfo.footerlength() >= fileLength) { + std::stringstream msg; + msg << "Malformed StripeInformation at stripe index " << currentStripe << ": fileLength=" + << fileLength << ", StripeInfo=(offset=" << currentStripeInfo.offset() << ", indexLength=" + << currentStripeInfo.indexlength() << ", dataLength=" << currentStripeInfo.datalength() + << ", footerLength=" << currentStripeInfo.footerlength() << ")"; + throw ParseError(msg.str()); + } + currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get()); + rowsInCurrentStripe = currentStripeInfo.numberofrows(); + const Timezone& writerTimezone = + currentStripeFooter.has_writertimezone() ? 
+ getTimezoneByName(currentStripeFooter.writertimezone()) : + localTimezone; + StripeStreamsImpl stripeStreams(*this, currentStripe, currentStripeInfo, + currentStripeFooter, + currentStripeInfo.offset(), + *(contents->stream.get()), + writerTimezone); + reader = buildReader(*contents->schema.get(), stripeStreams); + } + + bool RowReaderImpl::next(ColumnVectorBatch& data) { + if (currentStripe >= lastStripe) { + data.numElements = 0; + if (lastStripe > 0) { + previousRow = firstRowOfStripe[lastStripe - 1] + + footer->stripes(static_cast<int>(lastStripe - 1)).numberofrows(); + } else { + previousRow = 0; + } + return false; + } + if (currentRowInStripe == 0) { + startNextStripe(); + } + uint64_t rowsToRead = + std::min(static_cast<uint64_t>(data.capacity), + rowsInCurrentStripe - currentRowInStripe); + data.numElements = rowsToRead; + if (enableEncodedBlock) { + reader->nextEncoded(data, rowsToRead, nullptr); + } + else { + reader->next(data, rowsToRead, nullptr); + } + // update row number + previousRow = firstRowOfStripe[currentStripe] + currentRowInStripe; + currentRowInStripe += rowsToRead; + if (currentRowInStripe >= rowsInCurrentStripe) { + currentStripe += 1; + currentRowInStripe = 0; + } + return rowsToRead != 0; + } + + std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch + (uint64_t capacity) const { + return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock); + } + + void ensureOrcFooter(InputStream* stream, + DataBuffer<char> *buffer, + uint64_t postscriptLength) { + + const std::string MAGIC("ORC"); + const uint64_t magicLength = MAGIC.length(); + const char * const bufferStart = buffer->data(); + const uint64_t bufferLength = buffer->size(); + + if (postscriptLength < magicLength || bufferLength < magicLength) { + throw ParseError("Invalid ORC postscript length"); + } + const char* magicStart = bufferStart + bufferLength - 1 - magicLength; + + // Look for the magic string at the end of the postscript. + if (memcmp(magicStart, MAGIC.c_str(), magicLength) != 0) { + // If there is no magic string at the end, check the beginning. + // Only files written by Hive 0.11.0 don't have the tail ORC string. + std::unique_ptr<char[]> frontBuffer( new char[magicLength] ); + stream->read(frontBuffer.get(), magicLength, 0); + bool foundMatch = memcmp(frontBuffer.get(), MAGIC.c_str(), magicLength) == 0; + + if (!foundMatch) { + throw ParseError("Not an ORC file"); + } + } + } + + /** + * Read the file's postscript from the given buffer. + * @param stream the file stream + * @param buffer the buffer with the tail of the file. 
+ * @param postscriptSize the length of postscript in bytes + */ + std::unique_ptr<proto::PostScript> readPostscript(InputStream *stream, + DataBuffer<char> *buffer, + uint64_t postscriptSize) { + char *ptr = buffer->data(); + uint64_t readSize = buffer->size(); + + ensureOrcFooter(stream, buffer, postscriptSize); + + std::unique_ptr<proto::PostScript> postscript = + std::unique_ptr<proto::PostScript>(new proto::PostScript()); + if (readSize < 1 + postscriptSize) { + std::stringstream msg; + msg << "Invalid ORC postscript length: " << postscriptSize << ", file length = " + << stream->getLength(); + throw ParseError(msg.str()); + } + if (!postscript->ParseFromArray(ptr + readSize - 1 - postscriptSize, + static_cast<int>(postscriptSize))) { + throw ParseError("Failed to parse the postscript from " + + stream->getName()); + } + return REDUNDANT_MOVE(postscript); + } + + /** + * Check that indices in the type tree are valid, so we won't crash + * when we convert the proto::Types to TypeImpls. + */ + void checkProtoTypeIds(const proto::Footer &footer) { + std::stringstream msg; + int maxId = footer.types_size(); + if (maxId <= 0) { + throw ParseError("Footer is corrupt: no types found"); + } + for (int i = 0; i < maxId; ++i) { + const proto::Type& type = footer.types(i); + for (int j = 0; j < type.subtypes_size(); ++j) { + int subTypeId = static_cast<int>(type.subtypes(j)); + if (subTypeId <= i) { + msg << "Footer is corrupt: malformed link from type " << i << " to " + << subTypeId; + throw ParseError(msg.str()); + } + if (subTypeId >= maxId) { + msg << "Footer is corrupt: types(" << subTypeId << ") not exists"; + throw ParseError(msg.str()); + } + if (j > 0 && static_cast<int>(type.subtypes(j - 1)) >= subTypeId) { + msg << "Footer is corrupt: subType(" << (j-1) << ") >= subType(" << j + << ") in types(" << i << "). (" << type.subtypes(j - 1) << " >= " + << subTypeId << ")"; + throw ParseError(msg.str()); + } + } + } + } + + /** + * Parse the footer from the given buffer. 
+ * @param stream the file's stream + * @param buffer the buffer to parse the footer from + * @param footerOffset the offset within the buffer that contains the footer + * @param ps the file's postscript + * @param memoryPool the memory pool to use + */ + std::unique_ptr<proto::Footer> readFooter(InputStream* stream, + const DataBuffer<char> *buffer, + uint64_t footerOffset, + const proto::PostScript& ps, + MemoryPool& memoryPool) { + const char *footerPtr = buffer->data() + footerOffset; + + std::unique_ptr<SeekableInputStream> pbStream = + createDecompressor(convertCompressionKind(ps), + std::unique_ptr<SeekableInputStream> + (new SeekableArrayInputStream(footerPtr, + ps.footerlength())), + getCompressionBlockSize(ps), + memoryPool); + + std::unique_ptr<proto::Footer> footer = + std::unique_ptr<proto::Footer>(new proto::Footer()); + if (!footer->ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError("Failed to parse the footer from " + + stream->getName()); + } + + checkProtoTypeIds(*footer); + return REDUNDANT_MOVE(footer); + } + + std::unique_ptr<Reader> createReader(std::unique_ptr<InputStream> stream, + const ReaderOptions& options) { + std::shared_ptr<FileContents> contents = std::shared_ptr<FileContents>(new FileContents()); + contents->pool = options.getMemoryPool(); + contents->errorStream = options.getErrorStream(); + std::string serializedFooter = options.getSerializedFileTail(); + uint64_t fileLength; + uint64_t postscriptLength; + if (serializedFooter.length() != 0) { + // Parse the file tail from the serialized one. + proto::FileTail tail; + if (!tail.ParseFromString(TString(serializedFooter))) { + throw ParseError("Failed to parse the file tail from string"); + } + contents->postscript.reset(new proto::PostScript(tail.postscript())); + contents->footer.reset(new proto::Footer(tail.footer())); + fileLength = tail.filelength(); + postscriptLength = tail.postscriptlength(); + } else { + // figure out the size of the file using the option or filesystem + fileLength = std::min(options.getTailLocation(), + static_cast<uint64_t>(stream->getLength())); + + //read last bytes into buffer to get PostScript + uint64_t readSize = std::min(fileLength, DIRECTORY_SIZE_GUESS); + if (readSize < 4) { + throw ParseError("File size too small"); + } + std::unique_ptr<DataBuffer<char>> buffer( new DataBuffer<char>(*contents->pool, readSize) ); + stream->read(buffer->data(), readSize, fileLength - readSize); + + postscriptLength = buffer->data()[readSize - 1] & 0xff; + contents->postscript = REDUNDANT_MOVE(readPostscript(stream.get(), + buffer.get(), postscriptLength)); + uint64_t footerSize = contents->postscript->footerlength(); + uint64_t tailSize = 1 + postscriptLength + footerSize; + if (tailSize >= fileLength) { + std::stringstream msg; + msg << "Invalid ORC tailSize=" << tailSize << ", fileLength=" << fileLength; + throw ParseError(msg.str()); + } + uint64_t footerOffset; + + if (tailSize > readSize) { + buffer->resize(footerSize); + stream->read(buffer->data(), footerSize, fileLength - tailSize); + footerOffset = 0; + } else { + footerOffset = readSize - tailSize; + } + + contents->footer = REDUNDANT_MOVE(readFooter(stream.get(), buffer.get(), + footerOffset, *contents->postscript, *contents->pool)); + } + contents->stream = std::move(stream); + return std::unique_ptr<Reader>(new ReaderImpl(std::move(contents), + options, + fileLength, + postscriptLength)); + } + + std::map<uint32_t, BloomFilterIndex> + ReaderImpl::getBloomFilters(uint32_t stripeIndex, + const std::set<uint32_t>& 
included) const { + std::map<uint32_t, BloomFilterIndex> ret; + + // find stripe info + if (stripeIndex >= static_cast<uint32_t>(footer->stripes_size())) { + throw std::logic_error("Illegal stripe index: " + to_string(static_cast<int64_t>(stripeIndex))); + } + const proto::StripeInformation currentStripeInfo = + footer->stripes(static_cast<int>(stripeIndex)); + const proto::StripeFooter currentStripeFooter = + getStripeFooter(currentStripeInfo, *contents); + + // iterate stripe footer to get stream of bloomfilter + uint64_t offset = static_cast<uint64_t>(currentStripeInfo.offset()); + for (int i = 0; i < currentStripeFooter.streams_size(); i++) { + const proto::Stream& stream = currentStripeFooter.streams(i); + uint32_t column = static_cast<uint32_t>(stream.column()); + uint64_t length = static_cast<uint64_t>(stream.length()); + + // a bloom filter stream from a selected column is found + if (stream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8 && + (included.empty() || included.find(column) != included.end())) { + + std::unique_ptr<SeekableInputStream> pbStream = + createDecompressor(contents->compression, + std::unique_ptr<SeekableInputStream> + (new SeekableFileInputStream(contents->stream.get(), + offset, + length, + *contents->pool)), + contents->blockSize, + *(contents->pool)); + + proto::BloomFilterIndex pbBFIndex; + if (!pbBFIndex.ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError("Failed to parse BloomFilterIndex"); + } + + BloomFilterIndex bfIndex; + for (int j = 0; j < pbBFIndex.bloomfilter_size(); j++) { + std::unique_ptr<BloomFilter> entry = BloomFilterUTF8Utils::deserialize( + stream.kind(), + currentStripeFooter.columns(static_cast<int>(stream.column())), + pbBFIndex.bloomfilter(j)); + bfIndex.entries.push_back(std::shared_ptr<BloomFilter>(std::move(entry))); + } + + // add bloom filters to result for one column + ret[column] = bfIndex; + } + + offset += length; + } + + return ret; + } + + RowReader::~RowReader() { + // PASS + } + + Reader::~Reader() { + // PASS + } + + InputStream::~InputStream() { + // PASS + }; + + + +}// namespace diff --git a/contrib/libs/apache/orc/c++/src/Reader.hh b/contrib/libs/apache/orc/c++/src/Reader.hh index 49e9d033d9..b4ce7f6529 100644 --- a/contrib/libs/apache/orc/c++/src/Reader.hh +++ b/contrib/libs/apache/orc/c++/src/Reader.hh @@ -1,155 +1,155 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_READER_IMPL_HH -#define ORC_READER_IMPL_HH - -#include "orc/Int128.hh" -#include "orc/OrcFile.hh" -#include "orc/Reader.hh" - -#include "ColumnReader.hh" -#include "orc/Exceptions.hh" -#include "RLE.hh" -#include "TypeImpl.hh" - -namespace orc { - - static const uint64_t DIRECTORY_SIZE_GUESS = 16 * 1024; - - /** - * WriterVersion Implementation - */ - class WriterVersionImpl { - private: - WriterVersion version; - public: - // Known Versions with issues resolved - // The static method below is to fix global constructors Clang warning - static const WriterVersionImpl& VERSION_HIVE_8732(); - - WriterVersionImpl(WriterVersion ver) : version(ver) {} - - bool compareGT(const WriterVersion other) const { - return version > other; - } - }; - - /** - * State shared between Reader and Row Reader - */ - struct FileContents { - std::unique_ptr<InputStream> stream; - std::unique_ptr<proto::PostScript> postscript; - std::unique_ptr<proto::Footer> footer; - std::unique_ptr<Type> schema; - uint64_t blockSize; - CompressionKind compression; - MemoryPool *pool; - std::ostream *errorStream; - }; - - proto::StripeFooter getStripeFooter(const proto::StripeInformation& info, - const FileContents& contents); - - class ReaderImpl; - - class ColumnSelector { - private: - std::map<std::string, uint64_t> nameIdMap; - std::map<uint64_t, const Type*> idTypeMap; - const FileContents* contents; - std::vector<std::string> columns; - - // build map from type name and id, id to Type - void buildTypeNameIdMap(const Type* type); - std::string toDotColumnPath(); - - public: - // Select a field by name - void updateSelectedByName(std::vector<bool>& selectedColumns, const std::string& name); - // Select a field by id - void updateSelectedByFieldId(std::vector<bool>& selectedColumns, uint64_t fieldId); - // Select a type by id - void updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId); - - // Select all of the recursive children of the given type. - void selectChildren(std::vector<bool>& selectedColumns, const Type& type); - - // For each child of type, select it if one of its children - // is selected. - bool selectParents(std::vector<bool>& selectedColumns, const Type& type); - /** - * Constructor that selects columns. - * @param contents of the file - */ - ColumnSelector(const FileContents* contents); - - // Select the columns from the RowReaderoptions object - void updateSelected(std::vector<bool>& selectedColumns, const RowReaderOptions& options); - - // Select the columns from the Readeroptions object - void updateSelected(std::vector<bool>& selectedColumns, const ReaderOptions& options); - }; - - - class RowReaderImpl : public RowReader { - private: - const Timezone& localTimezone; - - // contents - std::shared_ptr<FileContents> contents; - const bool throwOnHive11DecimalOverflow; - const int32_t forcedScaleOnHive11Decimal; - - // inputs - std::vector<bool> selectedColumns; - - // footer - proto::Footer* footer; - DataBuffer<uint64_t> firstRowOfStripe; - mutable std::unique_ptr<Type> selectedSchema; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_READER_IMPL_HH +#define ORC_READER_IMPL_HH + +#include "orc/Int128.hh" +#include "orc/OrcFile.hh" +#include "orc/Reader.hh" + +#include "ColumnReader.hh" +#include "orc/Exceptions.hh" +#include "RLE.hh" +#include "TypeImpl.hh" + +namespace orc { + + static const uint64_t DIRECTORY_SIZE_GUESS = 16 * 1024; + + /** + * WriterVersion Implementation + */ + class WriterVersionImpl { + private: + WriterVersion version; + public: + // Known Versions with issues resolved + // The static method below is to fix global constructors Clang warning + static const WriterVersionImpl& VERSION_HIVE_8732(); + + WriterVersionImpl(WriterVersion ver) : version(ver) {} + + bool compareGT(const WriterVersion other) const { + return version > other; + } + }; + + /** + * State shared between Reader and Row Reader + */ + struct FileContents { + std::unique_ptr<InputStream> stream; + std::unique_ptr<proto::PostScript> postscript; + std::unique_ptr<proto::Footer> footer; + std::unique_ptr<Type> schema; + uint64_t blockSize; + CompressionKind compression; + MemoryPool *pool; + std::ostream *errorStream; + }; + + proto::StripeFooter getStripeFooter(const proto::StripeInformation& info, + const FileContents& contents); + + class ReaderImpl; + + class ColumnSelector { + private: + std::map<std::string, uint64_t> nameIdMap; + std::map<uint64_t, const Type*> idTypeMap; + const FileContents* contents; + std::vector<std::string> columns; + + // build map from type name and id, id to Type + void buildTypeNameIdMap(const Type* type); + std::string toDotColumnPath(); + + public: + // Select a field by name + void updateSelectedByName(std::vector<bool>& selectedColumns, const std::string& name); + // Select a field by id + void updateSelectedByFieldId(std::vector<bool>& selectedColumns, uint64_t fieldId); + // Select a type by id + void updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId); + + // Select all of the recursive children of the given type. + void selectChildren(std::vector<bool>& selectedColumns, const Type& type); + + // For each child of type, select it if one of its children + // is selected. + bool selectParents(std::vector<bool>& selectedColumns, const Type& type); + /** + * Constructor that selects columns. 
+ * @param contents of the file + */ + ColumnSelector(const FileContents* contents); + + // Select the columns from the RowReaderoptions object + void updateSelected(std::vector<bool>& selectedColumns, const RowReaderOptions& options); + + // Select the columns from the Readeroptions object + void updateSelected(std::vector<bool>& selectedColumns, const ReaderOptions& options); + }; + + + class RowReaderImpl : public RowReader { + private: + const Timezone& localTimezone; + + // contents + std::shared_ptr<FileContents> contents; + const bool throwOnHive11DecimalOverflow; + const int32_t forcedScaleOnHive11Decimal; + + // inputs + std::vector<bool> selectedColumns; + + // footer + proto::Footer* footer; + DataBuffer<uint64_t> firstRowOfStripe; + mutable std::unique_ptr<Type> selectedSchema; bool skipBloomFilters; - - // reading state - uint64_t previousRow; - uint64_t firstStripe; - uint64_t currentStripe; - uint64_t lastStripe; // the stripe AFTER the last one - uint64_t currentRowInStripe; - uint64_t rowsInCurrentStripe; - proto::StripeInformation currentStripeInfo; - proto::StripeFooter currentStripeFooter; - std::unique_ptr<ColumnReader> reader; - - bool enableEncodedBlock; - // internal methods - void startNextStripe(); - - // row index of current stripe with column id as the key - std::unordered_map<uint64_t, proto::RowIndex> rowIndexes; - - /** - * Seek to the start of a row group in the current stripe - * @param rowGroupEntryId the row group id to seek to - */ - void seekToRowGroup(uint32_t rowGroupEntryId); - + + // reading state + uint64_t previousRow; + uint64_t firstStripe; + uint64_t currentStripe; + uint64_t lastStripe; // the stripe AFTER the last one + uint64_t currentRowInStripe; + uint64_t rowsInCurrentStripe; + proto::StripeInformation currentStripeInfo; + proto::StripeFooter currentStripeFooter; + std::unique_ptr<ColumnReader> reader; + + bool enableEncodedBlock; + // internal methods + void startNextStripe(); + + // row index of current stripe with column id as the key + std::unordered_map<uint64_t, proto::RowIndex> rowIndexes; + + /** + * Seek to the start of a row group in the current stripe + * @param rowGroupEntryId the row group id to seek to + */ + void seekToRowGroup(uint32_t rowGroupEntryId); + /** * Check if the file has bad bloom filters. We will skip using them in the * following reads. @@ -157,159 +157,159 @@ namespace orc { */ bool hasBadBloomFilters(); - public: - /** - * Constructor that lets the user specify additional options. 
- * @param contents of the file - * @param options options for reading - */ - RowReaderImpl(std::shared_ptr<FileContents> contents, - const RowReaderOptions& options); - - // Select the columns from the options object - void updateSelected(); - const std::vector<bool> getSelectedColumns() const override; - - const Type& getSelectedType() const override; - - std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size - ) const override; - - bool next(ColumnVectorBatch& data) override; - - CompressionKind getCompression() const; - - uint64_t getCompressionSize() const; - - uint64_t getRowNumber() const override; - - void seekToRow(uint64_t rowNumber) override; - - const FileContents& getFileContents() const; - bool getThrowOnHive11DecimalOverflow() const; - int32_t getForcedScaleOnHive11Decimal() const; - }; - - class ReaderImpl : public Reader { - private: - // FileContents - std::shared_ptr<FileContents> contents; - - // inputs - const ReaderOptions options; - const uint64_t fileLength; - const uint64_t postscriptLength; - - // footer - proto::Footer* footer; - uint64_t numberOfStripes; - uint64_t getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns); - - // internal methods - void readMetadata() const; - void checkOrcVersion(); - void getRowIndexStatistics(const proto::StripeInformation& stripeInfo, uint64_t stripeIndex, - const proto::StripeFooter& currentStripeFooter, - std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const; - - // metadata - mutable std::unique_ptr<proto::Metadata> metadata; - mutable bool isMetadataLoaded; - public: - /** - * Constructor that lets the user specify additional options. - * @param contents of the file - * @param options options for reading - * @param fileLength the length of the file in bytes - * @param postscriptLength the length of the postscript in bytes - */ - ReaderImpl(std::shared_ptr<FileContents> contents, - const ReaderOptions& options, - uint64_t fileLength, - uint64_t postscriptLength); - - const ReaderOptions& getReaderOptions() const; - - CompressionKind getCompression() const override; - - FileVersion getFormatVersion() const override; - - WriterId getWriterId() const override; - - uint32_t getWriterIdValue() const override; - + public: + /** + * Constructor that lets the user specify additional options. 
+ * @param contents of the file + * @param options options for reading + */ + RowReaderImpl(std::shared_ptr<FileContents> contents, + const RowReaderOptions& options); + + // Select the columns from the options object + void updateSelected(); + const std::vector<bool> getSelectedColumns() const override; + + const Type& getSelectedType() const override; + + std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size + ) const override; + + bool next(ColumnVectorBatch& data) override; + + CompressionKind getCompression() const; + + uint64_t getCompressionSize() const; + + uint64_t getRowNumber() const override; + + void seekToRow(uint64_t rowNumber) override; + + const FileContents& getFileContents() const; + bool getThrowOnHive11DecimalOverflow() const; + int32_t getForcedScaleOnHive11Decimal() const; + }; + + class ReaderImpl : public Reader { + private: + // FileContents + std::shared_ptr<FileContents> contents; + + // inputs + const ReaderOptions options; + const uint64_t fileLength; + const uint64_t postscriptLength; + + // footer + proto::Footer* footer; + uint64_t numberOfStripes; + uint64_t getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns); + + // internal methods + void readMetadata() const; + void checkOrcVersion(); + void getRowIndexStatistics(const proto::StripeInformation& stripeInfo, uint64_t stripeIndex, + const proto::StripeFooter& currentStripeFooter, + std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const; + + // metadata + mutable std::unique_ptr<proto::Metadata> metadata; + mutable bool isMetadataLoaded; + public: + /** + * Constructor that lets the user specify additional options. + * @param contents of the file + * @param options options for reading + * @param fileLength the length of the file in bytes + * @param postscriptLength the length of the postscript in bytes + */ + ReaderImpl(std::shared_ptr<FileContents> contents, + const ReaderOptions& options, + uint64_t fileLength, + uint64_t postscriptLength); + + const ReaderOptions& getReaderOptions() const; + + CompressionKind getCompression() const override; + + FileVersion getFormatVersion() const override; + + WriterId getWriterId() const override; + + uint32_t getWriterIdValue() const override; + std::string getSoftwareVersion() const override; - WriterVersion getWriterVersion() const override; - - uint64_t getNumberOfRows() const override; - - uint64_t getRowIndexStride() const override; - - std::list<std::string> getMetadataKeys() const override; - - std::string getMetadataValue(const std::string& key) const override; - - bool hasMetadataValue(const std::string& key) const override; - - uint64_t getCompressionSize() const override; - - uint64_t getNumberOfStripes() const override; - - std::unique_ptr<StripeInformation> getStripe(uint64_t - ) const override; - - uint64_t getNumberOfStripeStatistics() const override; - - const std::string& getStreamName() const override; - - std::unique_ptr<StripeStatistics> - getStripeStatistics(uint64_t stripeIndex) const override; - - std::unique_ptr<RowReader> createRowReader() const override; - - std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options - ) const override; - - uint64_t getContentLength() const override; - uint64_t getStripeStatisticsLength() const override; - uint64_t getFileFooterLength() const override; - uint64_t getFilePostscriptLength() const override; - uint64_t getFileLength() const override; - - std::unique_ptr<Statistics> getStatistics() const override; - - std::unique_ptr<ColumnStatistics> 
getColumnStatistics(uint32_t columnId - ) const override; - - std::string getSerializedFileTail() const override; - - const Type& getType() const override; - - bool hasCorrectStatistics() const override; - - const proto::PostScript* getPostscript() const {return contents->postscript.get();} - - uint64_t getBlockSize() const {return contents->blockSize;} - - const proto::Footer* getFooter() const {return contents->footer.get();} - - const Type* getSchema() const {return contents->schema.get();} - - InputStream* getStream() const {return contents->stream.get();} - - uint64_t getMemoryUse(int stripeIx = -1) override; - - uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) override; - - uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) override; - - uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) override; - - std::map<uint32_t, BloomFilterIndex> - getBloomFilters(uint32_t stripeIndex, const std::set<uint32_t>& included) const override; - }; - -}// namespace - -#endif + WriterVersion getWriterVersion() const override; + + uint64_t getNumberOfRows() const override; + + uint64_t getRowIndexStride() const override; + + std::list<std::string> getMetadataKeys() const override; + + std::string getMetadataValue(const std::string& key) const override; + + bool hasMetadataValue(const std::string& key) const override; + + uint64_t getCompressionSize() const override; + + uint64_t getNumberOfStripes() const override; + + std::unique_ptr<StripeInformation> getStripe(uint64_t + ) const override; + + uint64_t getNumberOfStripeStatistics() const override; + + const std::string& getStreamName() const override; + + std::unique_ptr<StripeStatistics> + getStripeStatistics(uint64_t stripeIndex) const override; + + std::unique_ptr<RowReader> createRowReader() const override; + + std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options + ) const override; + + uint64_t getContentLength() const override; + uint64_t getStripeStatisticsLength() const override; + uint64_t getFileFooterLength() const override; + uint64_t getFilePostscriptLength() const override; + uint64_t getFileLength() const override; + + std::unique_ptr<Statistics> getStatistics() const override; + + std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId + ) const override; + + std::string getSerializedFileTail() const override; + + const Type& getType() const override; + + bool hasCorrectStatistics() const override; + + const proto::PostScript* getPostscript() const {return contents->postscript.get();} + + uint64_t getBlockSize() const {return contents->blockSize;} + + const proto::Footer* getFooter() const {return contents->footer.get();} + + const Type* getSchema() const {return contents->schema.get();} + + InputStream* getStream() const {return contents->stream.get();} + + uint64_t getMemoryUse(int stripeIx = -1) override; + + uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) override; + + uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) override; + + uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) override; + + std::map<uint32_t, BloomFilterIndex> + getBloomFilters(uint32_t stripeIndex, const std::set<uint32_t>& included) const override; + }; + +}// namespace + +#endif diff --git a/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc b/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc index c5c6f6a801..2b7acb0bd5 100644 
--- a/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc +++ b/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc @@ -1,426 +1,426 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Adaptor.hh" -#include "Compression.hh" -#include "RLEv2.hh" -#include "RLEV2Util.hh" - -namespace orc { - -int64_t RleDecoderV2::readLongBE(uint64_t bsz) { - int64_t ret = 0, val; - uint64_t n = bsz; - while (n > 0) { - n--; - val = readByte(); - ret |= (val << (n * 8)); - } - return ret; -} - -inline int64_t RleDecoderV2::readVslong() { - return unZigZag(readVulong()); -} - -uint64_t RleDecoderV2::readVulong() { - uint64_t ret = 0, b; - uint64_t offset = 0; - do { - b = readByte(); - ret |= (0x7f & b) << offset; - offset += 7; - } while (b >= 0x80); - return ret; -} - -RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input, - bool _isSigned, MemoryPool& pool - ): inputStream(std::move(input)), - isSigned(_isSigned), - firstByte(0), - runLength(0), - runRead(0), - bufferStart(nullptr), - bufferEnd(bufferStart), - deltaBase(0), - byteSize(0), - firstValue(0), - prevValue(0), - bitSize(0), - bitsLeft(0), - curByte(0), - patchBitSize(0), - unpackedIdx(0), - patchIdx(0), - base(0), - curGap(0), - curPatch(0), - patchMask(0), - actualGap(0), - unpacked(pool, 0), - unpackedPatch(pool, 0) { - // PASS -} - -void RleDecoderV2::seek(PositionProvider& location) { - // move the input stream - inputStream->seek(location); - // clear state - bufferEnd = bufferStart = nullptr; - runRead = runLength = 0; - // skip ahead the given number of records - skip(location.next()); -} - -void RleDecoderV2::skip(uint64_t numValues) { - // simple for now, until perf tests indicate something encoding specific is - // needed - const uint64_t N = 64; - int64_t dummy[N]; - - while (numValues) { - uint64_t nRead = std::min(N, numValues); - next(dummy, nRead, nullptr); - numValues -= nRead; - } -} - -void RleDecoderV2::next(int64_t* const data, - const uint64_t numValues, - const char* const notNull) { - uint64_t nRead = 0; - - while (nRead < numValues) { - // Skip any nulls before attempting to read first byte. 
- while (notNull && !notNull[nRead]) { - if (++nRead == numValues) { - return; // ended with null values - } - } - - if (runRead == runLength) { - resetRun(); - firstByte = readByte(); - } - - uint64_t offset = nRead, length = numValues - nRead; - - EncodingType enc = static_cast<EncodingType> - ((firstByte >> 6) & 0x03); - switch(static_cast<int64_t>(enc)) { - case SHORT_REPEAT: - nRead += nextShortRepeats(data, offset, length, notNull); - break; - case DIRECT: - nRead += nextDirect(data, offset, length, notNull); - break; - case PATCHED_BASE: - nRead += nextPatched(data, offset, length, notNull); - break; - case DELTA: - nRead += nextDelta(data, offset, length, notNull); - break; - default: - throw ParseError("unknown encoding"); - } - } -} - -uint64_t RleDecoderV2::nextShortRepeats(int64_t* const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bytes - byteSize = (firstByte >> 3) & 0x07; - byteSize += 1; - - runLength = firstByte & 0x07; - // run lengths values are stored only after MIN_REPEAT value is met - runLength += MIN_REPEAT; - runRead = 0; - - // read the repeated value which is store using fixed bytes - firstValue = readLongBE(byteSize); - - if (isSigned) { - firstValue = unZigZag(static_cast<uint64_t>(firstValue)); - } - } - - uint64_t nRead = std::min(runLength - runRead, numValues); - - if (notNull) { - for(uint64_t pos = offset; pos < offset + nRead; ++pos) { - if (notNull[pos]) { - data[pos] = firstValue; - ++runRead; - } - } - } else { - for(uint64_t pos = offset; pos < offset + nRead; ++pos) { - data[pos] = firstValue; - ++runRead; - } - } - - return nRead; -} - -uint64_t RleDecoderV2::nextDirect(int64_t* const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - bitSize = decodeBitWidth(fbo); - - // extract the run length - runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; - runLength |= readByte(); - // runs are one off - runLength += 1; - runRead = 0; - } - - uint64_t nRead = std::min(runLength - runRead, numValues); - - runRead += readLongs(data, offset, nRead, bitSize, notNull); - - if (isSigned) { - if (notNull) { - for (uint64_t pos = offset; pos < offset + nRead; ++pos) { - if (notNull[pos]) { - data[pos] = unZigZag(static_cast<uint64_t>(data[pos])); - } - } - } else { - for (uint64_t pos = offset; pos < offset + nRead; ++pos) { - data[pos] = unZigZag(static_cast<uint64_t>(data[pos])); - } - } - } - - return nRead; -} - -uint64_t RleDecoderV2::nextPatched(int64_t* const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - bitSize = decodeBitWidth(fbo); - - // extract the run length - runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; - runLength |= readByte(); - // runs are one off - runLength += 1; - runRead = 0; - - // extract the number of bytes occupied by base - uint64_t thirdByte = readByte(); - byteSize = (thirdByte >> 5) & 0x07; - // base width is one off - byteSize += 1; - - // extract patch width - uint32_t pwo = thirdByte & 0x1f; - patchBitSize = decodeBitWidth(pwo); - - // read fourth byte and extract patch gap width - uint64_t fourthByte = readByte(); - uint32_t pgw = (fourthByte >> 5) & 0x07; - // patch gap width is one off - pgw += 1; - - // extract the length of 
the patch list - size_t pl = fourthByte & 0x1f; - if (pl == 0) { - throw ParseError("Corrupt PATCHED_BASE encoded data (pl==0)!"); - } - - // read the next base width number of bytes to extract base value - base = readLongBE(byteSize); - int64_t mask = (static_cast<int64_t>(1) << ((byteSize * 8) - 1)); - // if mask of base value is 1 then base is negative value else positive - if ((base & mask) != 0) { - base = base & ~mask; - base = -base; - } - - // TODO: something more efficient than resize - unpacked.resize(runLength); - unpackedIdx = 0; - readLongs(unpacked.data(), 0, runLength, bitSize); - // any remaining bits are thrown out - resetReadLongs(); - - // TODO: something more efficient than resize - unpackedPatch.resize(pl); - patchIdx = 0; - // TODO: Skip corrupt? - // if ((patchBitSize + pgw) > 64 && !skipCorrupt) { - if ((patchBitSize + pgw) > 64) { - throw ParseError("Corrupt PATCHED_BASE encoded data " - "(patchBitSize + pgw > 64)!"); - } - uint32_t cfb = getClosestFixedBits(patchBitSize + pgw); - readLongs(unpackedPatch.data(), 0, pl, cfb); - // any remaining bits are thrown out - resetReadLongs(); - - // apply the patch directly when decoding the packed data - patchMask = ((static_cast<int64_t>(1) << patchBitSize) - 1); - - adjustGapAndPatch(); - } - - uint64_t nRead = std::min(runLength - runRead, numValues); - - for(uint64_t pos = offset; pos < offset + nRead; ++pos) { - // skip null positions - if (notNull && !notNull[pos]) { - continue; - } - if (static_cast<int64_t>(unpackedIdx) != actualGap) { - // no patching required. add base to unpacked value to get final value - data[pos] = base + unpacked[unpackedIdx]; - } else { - // extract the patch value - int64_t patchedVal = unpacked[unpackedIdx] | (curPatch << bitSize); - - // add base to patched value - data[pos] = base + patchedVal; - - // increment the patch to point to next entry in patch list - ++patchIdx; - - if (patchIdx < unpackedPatch.size()) { - adjustGapAndPatch(); - - // next gap is relative to the current gap - actualGap += unpackedIdx; - } - } - - ++runRead; - ++unpackedIdx; - } - - return nRead; -} - -uint64_t RleDecoderV2::nextDelta(int64_t* const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - if (fbo != 0) { - bitSize = decodeBitWidth(fbo); - } else { - bitSize = 0; - } - - // extract the run length - runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; - runLength |= readByte(); - ++runLength; // account for first value - runRead = deltaBase = 0; - - // read the first value stored as vint - if (isSigned) { - firstValue = static_cast<int64_t>(readVslong()); - } else { - firstValue = static_cast<int64_t>(readVulong()); - } - - prevValue = firstValue; - - // read the fixed delta value stored as vint (deltas can be negative even - // if all number are positive) - deltaBase = static_cast<int64_t>(readVslong()); - } - - uint64_t nRead = std::min(runLength - runRead, numValues); - - uint64_t pos = offset; - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (!notNull || notNull[pos]) break; - } - if (runRead == 0 && pos < offset + nRead) { - data[pos++] = firstValue; - ++runRead; - } - - if (bitSize == 0) { - // add fixed deltas to adjacent values - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (notNull && !notNull[pos]) { - continue; - } - prevValue = data[pos] = prevValue + deltaBase; - ++runRead; - } - } else { - for ( ; 
pos < offset + nRead; ++pos) { - // skip null positions - if (!notNull || notNull[pos]) break; - } - if (runRead < 2 && pos < offset + nRead) { - // add delta base and first value - prevValue = data[pos++] = firstValue + deltaBase; - ++runRead; - } - - // write the unpacked values, add it to previous value and store final - // value to result buffer. if the delta base value is negative then it - // is a decreasing sequence else an increasing sequence - uint64_t remaining = (offset + nRead) - pos; - runRead += readLongs(data, pos, remaining, bitSize, notNull); - - if (deltaBase < 0) { - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (notNull && !notNull[pos]) { - continue; - } - prevValue = data[pos] = prevValue - data[pos]; - } - } else { - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (notNull && !notNull[pos]) { - continue; - } - prevValue = data[pos] = prevValue + data[pos]; - } - } - } - return nRead; -} - -} // namespace orc +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Adaptor.hh" +#include "Compression.hh" +#include "RLEv2.hh" +#include "RLEV2Util.hh" + +namespace orc { + +int64_t RleDecoderV2::readLongBE(uint64_t bsz) { + int64_t ret = 0, val; + uint64_t n = bsz; + while (n > 0) { + n--; + val = readByte(); + ret |= (val << (n * 8)); + } + return ret; +} + +inline int64_t RleDecoderV2::readVslong() { + return unZigZag(readVulong()); +} + +uint64_t RleDecoderV2::readVulong() { + uint64_t ret = 0, b; + uint64_t offset = 0; + do { + b = readByte(); + ret |= (0x7f & b) << offset; + offset += 7; + } while (b >= 0x80); + return ret; +} + +RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input, + bool _isSigned, MemoryPool& pool + ): inputStream(std::move(input)), + isSigned(_isSigned), + firstByte(0), + runLength(0), + runRead(0), + bufferStart(nullptr), + bufferEnd(bufferStart), + deltaBase(0), + byteSize(0), + firstValue(0), + prevValue(0), + bitSize(0), + bitsLeft(0), + curByte(0), + patchBitSize(0), + unpackedIdx(0), + patchIdx(0), + base(0), + curGap(0), + curPatch(0), + patchMask(0), + actualGap(0), + unpacked(pool, 0), + unpackedPatch(pool, 0) { + // PASS +} + +void RleDecoderV2::seek(PositionProvider& location) { + // move the input stream + inputStream->seek(location); + // clear state + bufferEnd = bufferStart = nullptr; + runRead = runLength = 0; + // skip ahead the given number of records + skip(location.next()); +} + +void RleDecoderV2::skip(uint64_t numValues) { + // simple for now, until perf tests indicate something encoding specific is + // needed + const uint64_t N = 64; + int64_t dummy[N]; + + while (numValues) { + uint64_t nRead = std::min(N, numValues); + next(dummy, nRead, nullptr); + numValues -= nRead; + } +} + +void RleDecoderV2::next(int64_t* const data, + const uint64_t numValues, + const char* const notNull) { + uint64_t nRead = 0; + + while (nRead < numValues) { + // Skip any nulls before attempting to read first byte. 
+ while (notNull && !notNull[nRead]) { + if (++nRead == numValues) { + return; // ended with null values + } + } + + if (runRead == runLength) { + resetRun(); + firstByte = readByte(); + } + + uint64_t offset = nRead, length = numValues - nRead; + + EncodingType enc = static_cast<EncodingType> + ((firstByte >> 6) & 0x03); + switch(static_cast<int64_t>(enc)) { + case SHORT_REPEAT: + nRead += nextShortRepeats(data, offset, length, notNull); + break; + case DIRECT: + nRead += nextDirect(data, offset, length, notNull); + break; + case PATCHED_BASE: + nRead += nextPatched(data, offset, length, notNull); + break; + case DELTA: + nRead += nextDelta(data, offset, length, notNull); + break; + default: + throw ParseError("unknown encoding"); + } + } +} + +uint64_t RleDecoderV2::nextShortRepeats(int64_t* const data, + uint64_t offset, + uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bytes + byteSize = (firstByte >> 3) & 0x07; + byteSize += 1; + + runLength = firstByte & 0x07; + // run lengths values are stored only after MIN_REPEAT value is met + runLength += MIN_REPEAT; + runRead = 0; + + // read the repeated value which is store using fixed bytes + firstValue = readLongBE(byteSize); + + if (isSigned) { + firstValue = unZigZag(static_cast<uint64_t>(firstValue)); + } + } + + uint64_t nRead = std::min(runLength - runRead, numValues); + + if (notNull) { + for(uint64_t pos = offset; pos < offset + nRead; ++pos) { + if (notNull[pos]) { + data[pos] = firstValue; + ++runRead; + } + } + } else { + for(uint64_t pos = offset; pos < offset + nRead; ++pos) { + data[pos] = firstValue; + ++runRead; + } + } + + return nRead; +} + +uint64_t RleDecoderV2::nextDirect(int64_t* const data, + uint64_t offset, + uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bits + unsigned char fbo = (firstByte >> 1) & 0x1f; + bitSize = decodeBitWidth(fbo); + + // extract the run length + runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; + runLength |= readByte(); + // runs are one off + runLength += 1; + runRead = 0; + } + + uint64_t nRead = std::min(runLength - runRead, numValues); + + runRead += readLongs(data, offset, nRead, bitSize, notNull); + + if (isSigned) { + if (notNull) { + for (uint64_t pos = offset; pos < offset + nRead; ++pos) { + if (notNull[pos]) { + data[pos] = unZigZag(static_cast<uint64_t>(data[pos])); + } + } + } else { + for (uint64_t pos = offset; pos < offset + nRead; ++pos) { + data[pos] = unZigZag(static_cast<uint64_t>(data[pos])); + } + } + } + + return nRead; +} + +uint64_t RleDecoderV2::nextPatched(int64_t* const data, + uint64_t offset, + uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bits + unsigned char fbo = (firstByte >> 1) & 0x1f; + bitSize = decodeBitWidth(fbo); + + // extract the run length + runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; + runLength |= readByte(); + // runs are one off + runLength += 1; + runRead = 0; + + // extract the number of bytes occupied by base + uint64_t thirdByte = readByte(); + byteSize = (thirdByte >> 5) & 0x07; + // base width is one off + byteSize += 1; + + // extract patch width + uint32_t pwo = thirdByte & 0x1f; + patchBitSize = decodeBitWidth(pwo); + + // read fourth byte and extract patch gap width + uint64_t fourthByte = readByte(); + uint32_t pgw = (fourthByte >> 5) & 0x07; + // patch gap width is one off + pgw += 1; + + // extract the length of 
the patch list + size_t pl = fourthByte & 0x1f; + if (pl == 0) { + throw ParseError("Corrupt PATCHED_BASE encoded data (pl==0)!"); + } + + // read the next base width number of bytes to extract base value + base = readLongBE(byteSize); + int64_t mask = (static_cast<int64_t>(1) << ((byteSize * 8) - 1)); + // if mask of base value is 1 then base is negative value else positive + if ((base & mask) != 0) { + base = base & ~mask; + base = -base; + } + + // TODO: something more efficient than resize + unpacked.resize(runLength); + unpackedIdx = 0; + readLongs(unpacked.data(), 0, runLength, bitSize); + // any remaining bits are thrown out + resetReadLongs(); + + // TODO: something more efficient than resize + unpackedPatch.resize(pl); + patchIdx = 0; + // TODO: Skip corrupt? + // if ((patchBitSize + pgw) > 64 && !skipCorrupt) { + if ((patchBitSize + pgw) > 64) { + throw ParseError("Corrupt PATCHED_BASE encoded data " + "(patchBitSize + pgw > 64)!"); + } + uint32_t cfb = getClosestFixedBits(patchBitSize + pgw); + readLongs(unpackedPatch.data(), 0, pl, cfb); + // any remaining bits are thrown out + resetReadLongs(); + + // apply the patch directly when decoding the packed data + patchMask = ((static_cast<int64_t>(1) << patchBitSize) - 1); + + adjustGapAndPatch(); + } + + uint64_t nRead = std::min(runLength - runRead, numValues); + + for(uint64_t pos = offset; pos < offset + nRead; ++pos) { + // skip null positions + if (notNull && !notNull[pos]) { + continue; + } + if (static_cast<int64_t>(unpackedIdx) != actualGap) { + // no patching required. add base to unpacked value to get final value + data[pos] = base + unpacked[unpackedIdx]; + } else { + // extract the patch value + int64_t patchedVal = unpacked[unpackedIdx] | (curPatch << bitSize); + + // add base to patched value + data[pos] = base + patchedVal; + + // increment the patch to point to next entry in patch list + ++patchIdx; + + if (patchIdx < unpackedPatch.size()) { + adjustGapAndPatch(); + + // next gap is relative to the current gap + actualGap += unpackedIdx; + } + } + + ++runRead; + ++unpackedIdx; + } + + return nRead; +} + +uint64_t RleDecoderV2::nextDelta(int64_t* const data, + uint64_t offset, + uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bits + unsigned char fbo = (firstByte >> 1) & 0x1f; + if (fbo != 0) { + bitSize = decodeBitWidth(fbo); + } else { + bitSize = 0; + } + + // extract the run length + runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; + runLength |= readByte(); + ++runLength; // account for first value + runRead = deltaBase = 0; + + // read the first value stored as vint + if (isSigned) { + firstValue = static_cast<int64_t>(readVslong()); + } else { + firstValue = static_cast<int64_t>(readVulong()); + } + + prevValue = firstValue; + + // read the fixed delta value stored as vint (deltas can be negative even + // if all number are positive) + deltaBase = static_cast<int64_t>(readVslong()); + } + + uint64_t nRead = std::min(runLength - runRead, numValues); + + uint64_t pos = offset; + for ( ; pos < offset + nRead; ++pos) { + // skip null positions + if (!notNull || notNull[pos]) break; + } + if (runRead == 0 && pos < offset + nRead) { + data[pos++] = firstValue; + ++runRead; + } + + if (bitSize == 0) { + // add fixed deltas to adjacent values + for ( ; pos < offset + nRead; ++pos) { + // skip null positions + if (notNull && !notNull[pos]) { + continue; + } + prevValue = data[pos] = prevValue + deltaBase; + ++runRead; + } + } else { + for ( ; 
pos < offset + nRead; ++pos) { + // skip null positions + if (!notNull || notNull[pos]) break; + } + if (runRead < 2 && pos < offset + nRead) { + // add delta base and first value + prevValue = data[pos++] = firstValue + deltaBase; + ++runRead; + } + + // write the unpacked values, add it to previous value and store final + // value to result buffer. if the delta base value is negative then it + // is a decreasing sequence else an increasing sequence + uint64_t remaining = (offset + nRead) - pos; + runRead += readLongs(data, pos, remaining, bitSize, notNull); + + if (deltaBase < 0) { + for ( ; pos < offset + nRead; ++pos) { + // skip null positions + if (notNull && !notNull[pos]) { + continue; + } + prevValue = data[pos] = prevValue - data[pos]; + } + } else { + for ( ; pos < offset + nRead; ++pos) { + // skip null positions + if (notNull && !notNull[pos]) { + continue; + } + prevValue = data[pos] = prevValue + data[pos]; + } + } + } + return nRead; +} + +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc index 44e2761b74..f77838a4dd 100644 --- a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc +++ b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc @@ -1,773 +1,773 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with option work for additional information - * regarding copyright ownership. The ASF licenses option file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use option file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Adaptor.hh" -#include "Compression.hh" -#include "RLEv2.hh" -#include "RLEV2Util.hh" - -#define MAX_LITERAL_SIZE 512 -#define MAX_SHORT_REPEAT_LENGTH 10 - -namespace orc { - -/** - * Compute the bits required to represent pth percentile value - * @param data - array - * @param p - percentile value (>=0.0 to <=1.0) - * @return pth percentile bits - */ -uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) { - if ((p > 1.0) || (p <= 0.0)) { - throw InvalidArgument("Invalid p value: " + to_string(p)); - } - - if (!reuseHist) { - // histogram that store the encoded bit requirement for each values. 
- // maximum number of bits that can encoded is 32 (refer FixedBitSizes) - memset(histgram, 0, FixedBitSizes::SIZE * sizeof(int32_t)); - // compute the histogram - for(size_t i = offset; i < (offset + length); i++) { - uint32_t idx = encodeBitWidth(findClosestNumBits(data[i])); - histgram[idx] += 1; - } - } - - int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * (1.0 - p)); - - // return the bits required by pth percentile length - for(int32_t i = HIST_LEN - 1; i >= 0; i--) { - perLen -= histgram[i]; - if (perLen < 0) { - return decodeBitWidth(static_cast<uint32_t>(i)); - } - } - return 0; -} - -RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, - bool hasSigned, bool alignBitPacking) : - RleEncoder(std::move(outStream), hasSigned), - alignedBitPacking(alignBitPacking), - prevDelta(0){ - literals = new int64_t[MAX_LITERAL_SIZE]; - gapVsPatchList = new int64_t[MAX_LITERAL_SIZE]; - zigzagLiterals = new int64_t[MAX_LITERAL_SIZE]; - baseRedLiterals = new int64_t[MAX_LITERAL_SIZE]; - adjDeltas = new int64_t[MAX_LITERAL_SIZE]; -} - -void RleEncoderV2::write(int64_t val) { - if(numLiterals == 0) { - initializeLiterals(val); - return; - } - - if(numLiterals == 1) { - prevDelta = val - literals[0]; - literals[numLiterals++] = val; - - if(val == literals[0]) { - fixedRunLength = 2; - variableRunLength = 0; - } else { - fixedRunLength = 0; - variableRunLength = 2; - } - return; - } - - int64_t currentDelta = val - literals[numLiterals - 1]; - EncodingOption option = {}; - if (prevDelta == 0 && currentDelta == 0) { - // case 1: fixed delta run - literals[numLiterals++] = val; - - if (variableRunLength > 0) { - // if variable run is non-zero then we are seeing repeating - // values at the end of variable run in which case fixed Run - // length is 2 - fixedRunLength = 2; - } - fixedRunLength++; - - // if fixed run met the minimum condition and if variable - // run is non-zero then flush the variable run and shift the - // tail fixed runs to start of the buffer - if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) { - numLiterals -= MIN_REPEAT; - variableRunLength -= (MIN_REPEAT - 1); - - determineEncoding(option); - writeValues(option); - - // shift tail fixed runs to beginning of the buffer - for (size_t i = 0; i < MIN_REPEAT; ++i) { - literals[i] = val; - } - numLiterals = MIN_REPEAT; - } - +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with option work for additional information + * regarding copyright ownership. The ASF licenses option file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use option file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Adaptor.hh" +#include "Compression.hh" +#include "RLEv2.hh" +#include "RLEV2Util.hh" + +#define MAX_LITERAL_SIZE 512 +#define MAX_SHORT_REPEAT_LENGTH 10 + +namespace orc { + +/** + * Compute the bits required to represent pth percentile value + * @param data - array + * @param p - percentile value (>=0.0 to <=1.0) + * @return pth percentile bits + */ +uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) { + if ((p > 1.0) || (p <= 0.0)) { + throw InvalidArgument("Invalid p value: " + to_string(p)); + } + + if (!reuseHist) { + // histogram that store the encoded bit requirement for each values. + // maximum number of bits that can encoded is 32 (refer FixedBitSizes) + memset(histgram, 0, FixedBitSizes::SIZE * sizeof(int32_t)); + // compute the histogram + for(size_t i = offset; i < (offset + length); i++) { + uint32_t idx = encodeBitWidth(findClosestNumBits(data[i])); + histgram[idx] += 1; + } + } + + int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * (1.0 - p)); + + // return the bits required by pth percentile length + for(int32_t i = HIST_LEN - 1; i >= 0; i--) { + perLen -= histgram[i]; + if (perLen < 0) { + return decodeBitWidth(static_cast<uint32_t>(i)); + } + } + return 0; +} + +RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, + bool hasSigned, bool alignBitPacking) : + RleEncoder(std::move(outStream), hasSigned), + alignedBitPacking(alignBitPacking), + prevDelta(0){ + literals = new int64_t[MAX_LITERAL_SIZE]; + gapVsPatchList = new int64_t[MAX_LITERAL_SIZE]; + zigzagLiterals = new int64_t[MAX_LITERAL_SIZE]; + baseRedLiterals = new int64_t[MAX_LITERAL_SIZE]; + adjDeltas = new int64_t[MAX_LITERAL_SIZE]; +} + +void RleEncoderV2::write(int64_t val) { + if(numLiterals == 0) { + initializeLiterals(val); + return; + } + + if(numLiterals == 1) { + prevDelta = val - literals[0]; + literals[numLiterals++] = val; + + if(val == literals[0]) { + fixedRunLength = 2; + variableRunLength = 0; + } else { + fixedRunLength = 0; + variableRunLength = 2; + } + return; + } + + int64_t currentDelta = val - literals[numLiterals - 1]; + EncodingOption option = {}; + if (prevDelta == 0 && currentDelta == 0) { + // case 1: fixed delta run + literals[numLiterals++] = val; + + if (variableRunLength > 0) { + // if variable run is non-zero then we are seeing repeating + // values at the end of variable run in which case fixed Run + // length is 2 + fixedRunLength = 2; + } + fixedRunLength++; + + // if fixed run met the minimum condition and if variable + // run is non-zero then flush the variable run and shift the + // tail fixed runs to start of the buffer + if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) { + numLiterals -= MIN_REPEAT; + variableRunLength -= (MIN_REPEAT - 1); + + determineEncoding(option); + writeValues(option); + + // shift tail fixed runs to beginning of the buffer + for (size_t i = 0; i < MIN_REPEAT; ++i) { + literals[i] = val; + } + numLiterals = MIN_REPEAT; + } + if (fixedRunLength == MAX_LITERAL_SIZE) { - determineEncoding(option); - writeValues(option); - } - return; - } - - // case 2: variable delta run - - // if fixed run length is non-zero and if it satisfies the - // short repeat conditions then write the values as short repeats - // else use delta encoding - if (fixedRunLength >= MIN_REPEAT) { - if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { - option.encoding = SHORT_REPEAT; - } else { - option.encoding = DELTA; - option.isFixedDelta = true; - } - 
writeValues(option); - } - - // if fixed run length is <MIN_REPEAT and current value is - // different from previous then treat it as variable run - if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) { - variableRunLength = fixedRunLength; - fixedRunLength = 0; - } - - // after writing values re-initialize the variables - if (numLiterals == 0) { - initializeLiterals(val); - } else { - prevDelta = val - literals[numLiterals - 1]; - literals[numLiterals++] = val; - variableRunLength++; - - if (variableRunLength == MAX_LITERAL_SIZE) { - determineEncoding(option); - writeValues(option); - } - } -} - -void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) { - int64_t zzEncVal = 0; - for (size_t i = 0; i < numLiterals; i++) { - if (isSigned) { - zzEncVal = zigZag(literals[i]); - } else { - zzEncVal = literals[i]; - } - zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal; - } -} - -void RleEncoderV2::preparePatchedBlob(EncodingOption& option) { - // mask will be max value beyond which patch will be generated - int64_t mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1; - - // since we are considering only 95 percentile, the size of gap and - // patch array can contain only be 5% values - option.patchLength = static_cast<uint32_t>(std::ceil((numLiterals / 20))); - - // #bit for patch - option.patchWidth = option.brBits100p - option.brBits95p; - option.patchWidth = getClosestFixedBits(option.patchWidth); - - // if patch bit requirement is 64 then it will not possible to pack - // gap and patch together in a long. To make sure gap and patch can be - // packed together adjust the patch width - if (option.patchWidth == 64) { - option.patchWidth = 56; - option.brBits95p = 8; - mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1; - } - - uint32_t gapIdx = 0; - uint32_t patchIdx = 0; - size_t prev = 0; - size_t maxGap = 0; - - std::vector<int64_t> gapList; - std::vector<int64_t> patchList; - - for(size_t i = 0; i < numLiterals; i++) { - // if value is above mask then create the patch and record the gap - if (baseRedLiterals[i] > mask) { - size_t gap = i - prev; - if (gap > maxGap) { - maxGap = gap; - } - - // gaps are relative, so store the previous patched value index - prev = i; - gapList.push_back(static_cast<int64_t>(gap)); - gapIdx++; - - // extract the most significant bits that are over mask bits - int64_t patch = baseRedLiterals[i] >> option.brBits95p; - patchList.push_back(patch); - patchIdx++; - - // strip off the MSB to enable safe bit packing - baseRedLiterals[i] &= mask; - } - } - - // adjust the patch length to number of entries in gap list - option.patchLength = gapIdx; - - // if the element to be patched is the first and only element then - // max gap will be 0, but to store the gap as 0 we need atleast 1 bit - if (maxGap == 0 && option.patchLength != 0) { - option.patchGapWidth = 1; - } else { - option.patchGapWidth = findClosestNumBits(static_cast<int64_t>(maxGap)); - } - - // special case: if the patch gap width is greater than 256, then - // we need 9 bits to encode the gap width. But we only have 3 bits in - // header to record the gap width. To deal with this case, we will save - // two entries in patch list in the following way - // 256 gap width => 0 for patch value - // actual gap - 256 => actual patch value - // We will do the same for gap width = 511. If the element to be patched is - // the last element in the scope then gap width will be 511. 
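The packing scheme described in this comment can be illustrated with a small standalone sketch: each 64-bit slot of the gap-vs-patch list carries the gap in its high bits and the patch value in its low patchWidth bits, and any gap larger than 255 is broken into filler slots whose patch part is zero. This is a simplified sketch of the idea, not the encoder's own routine (which also adjusts patchLength while it loops):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Pack one (gap, patch) pair into 64-bit slots: gap in the high bits, patch in
    // the low patchWidth bits. Gaps larger than 255 are split into filler slots
    // whose patch part is zero, mirroring the special case described above.
    static void packGapAndPatch(std::vector<uint64_t>& slots, uint32_t patchWidth,
                                uint64_t gap, uint64_t patch) {
      while (gap > 255) {
        slots.push_back(255ULL << patchWidth);  // filler: gap 255, patch 0
        gap -= 255;
      }
      slots.push_back((gap << patchWidth) | patch);
    }

    int main() {
      std::vector<uint64_t> slots;
      packGapAndPatch(slots, 6, 300, 9);  // gap 300, 6-bit patch value 9
      for (uint64_t s : slots) {
        std::printf("gap=%llu patch=%llu\n",
                    static_cast<unsigned long long>(s >> 6),
                    static_cast<unsigned long long>(s & 0x3f));
      }
      // prints: gap=255 patch=0, then gap=45 patch=9
      return 0;
    }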
In this case we - // will have 3 entries in the patch list in the following way - // 255 gap width => 0 for patch value - // 255 gap width => 0 for patch value - // 1 gap width => actual patch value - if (option.patchGapWidth > 8) { - option.patchGapWidth = 8; - // for gap = 511, we need two additional entries in patch list - if (maxGap == 511) { - option.patchLength += 2; - } else { - option.patchLength += 1; - } - } - - // create gap vs patch list - gapIdx = 0; - patchIdx = 0; - for(size_t i = 0; i < option.patchLength; i++) { - int64_t g = gapList[gapIdx++]; - int64_t p = patchList[patchIdx++]; - while (g > 255) { - gapVsPatchList[option.gapVsPatchListCount++] = (255L << option.patchWidth); - i++; - g -= 255; - } - - // store patch value in LSBs and gap in MSBs - gapVsPatchList[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p); - } -} - -void RleEncoderV2::determineEncoding(EncodingOption& option) { - // We need to compute zigzag values for DIRECT and PATCHED_BASE encodings, - // but not for SHORT_REPEAT or DELTA. So we only perform the zigzag - // computation when it's determined to be necessary. - - // not a big win for shorter runs to determine encoding - if (numLiterals <= MIN_REPEAT) { - // we need to compute zigzag values for DIRECT encoding if we decide to - // break early for delta overflows or for shorter runs - computeZigZagLiterals(option); - option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0); - option.encoding = DIRECT; - return; - } - - // DELTA encoding check - - // for identifying monotonic sequences - bool isIncreasing = true; - bool isDecreasing = true; - option.isFixedDelta = true; - - option.min = literals[0]; - int64_t max = literals[0]; - int64_t initialDelta = literals[1] - literals[0]; - int64_t currDelta = 0; - int64_t deltaMax = 0; - adjDeltas[option.adjDeltasCount++] = initialDelta; - - for (size_t i = 1; i < numLiterals; i++) { - const int64_t l1 = literals[i]; - const int64_t l0 = literals[i - 1]; - currDelta = l1 - l0; - option.min = std::min(option.min, l1); - max = std::max(max, l1); - - isIncreasing &= (l0 <= l1); - isDecreasing &= (l0 >= l1); - - option.isFixedDelta &= (currDelta == initialDelta); - if (i > 1) { - adjDeltas[option.adjDeltasCount++] = std::abs(currDelta); - deltaMax = std::max(deltaMax, adjDeltas[i - 1]); - } - } - - // it's faster to exit under delta overflow condition without checking for - // PATCHED_BASE condition as encoding using DIRECT is faster and has less - // overhead than PATCHED_BASE - if (!isSafeSubtract(max, option.min)) { - computeZigZagLiterals(option); - option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0); - option.encoding = DIRECT; - return; - } - - // invariant - subtracting any number from any other in the literals after - // option point won't overflow - - // if min is equal to max then the delta is 0, option condition happens for - // fixed values run >10 which cannot be encoded with SHORT_REPEAT - if (option.min == max) { - if (!option.isFixedDelta) { - throw InvalidArgument(to_string(option.min) + "==" + - to_string(max) + ", isFixedDelta cannot be false"); - } - - if(currDelta != 0) { - throw InvalidArgument(to_string(option.min) + "==" + - to_string(max) + ", currDelta should be zero"); - } - option.fixedDelta = 0; - option.encoding = DELTA; - return; - } - - if (option.isFixedDelta) { - if (currDelta != initialDelta) { - throw InvalidArgument("currDelta should be equal to initialDelta for fixed delta encoding"); - } - - option.encoding = DELTA; - 
option.fixedDelta = currDelta; - return; - } - - // if initialDelta is 0 then we cannot delta encode as we cannot identify - // the sign of deltas (increasing or decreasing) - if (initialDelta != 0) { - // stores the number of bits required for packing delta blob in - // delta encoding - option.bitsDeltaMax = findClosestNumBits(deltaMax); - - // monotonic condition - if (isIncreasing || isDecreasing) { - option.encoding = DELTA; - return; - } - } - - // PATCHED_BASE encoding check - - // percentile values are computed for the zigzag encoded values. if the - // number of bit requirement between 90th and 100th percentile varies - // beyond a threshold then we need to patch the values. if the variation - // is not significant then we can use direct encoding - - computeZigZagLiterals(option); - option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0); - option.zzBits90p = percentileBits(zigzagLiterals, 0, numLiterals, 0.9, true); - uint32_t diffBitsLH = option.zzBits100p - option.zzBits90p; - - // if the difference between 90th percentile and 100th percentile fixed - // bits is > 1 then we need patch the values - if (diffBitsLH > 1) { - - // patching is done only on base reduced values. - // remove base from literals - for (size_t i = 0; i < numLiterals; i++) { - baseRedLiterals[option.baseRedLiteralsCount++] = (literals[i] - option.min); - } - - // 95th percentile width is used to determine max allowed value - // after which patching will be done - option.brBits95p = percentileBits(baseRedLiterals, 0, numLiterals, 0.95); - - // 100th percentile is used to compute the max patch width - option.brBits100p = percentileBits(baseRedLiterals, 0, numLiterals, 1.0, true); - - // after base reducing the values, if the difference in bits between - // 95th percentile and 100th percentile value is zero then there - // is no point in patching the values, in which case we will - // fallback to DIRECT encoding. - // The decision to use patched base was based on zigzag values, but the - // actual patching is done on base reduced literals. - if ((option.brBits100p - option.brBits95p) != 0) { - option.encoding = PATCHED_BASE; - preparePatchedBlob(option); - return; - } else { - option.encoding = DIRECT; - return; - } - } else { - // if difference in bits between 95th percentile and 100th percentile is - // 0, then patch length will become 0. 
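The decision made here boils down to comparing how many bits the 90th and the 100th percentile of the zigzagged run require. Below is a rough standalone sketch of that check, using a sort instead of the bit-width histogram the encoder actually builds; the helper names are invented and not the library's:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Bits needed to represent v (at least 1).
    static uint32_t bitWidth(uint64_t v) {
      uint32_t bits = 1;
      while (v >>= 1) ++bits;
      return bits;
    }

    // Bit width of the value at the given percentile of the run.
    static uint32_t percentileBitWidth(const std::vector<uint64_t>& zigzagged, double p) {
      std::vector<uint32_t> widths;
      for (uint64_t v : zigzagged) widths.push_back(bitWidth(v));
      std::sort(widths.begin(), widths.end());
      size_t idx = static_cast<size_t>(p * (widths.size() - 1));
      return widths[idx];
    }

    int main() {
      // Mostly small values with a couple of large outliers.
      std::vector<uint64_t> run = {3, 5, 7, 2, 9, 4, 6, 8, 1, 70000, 3, 2, 5, 90000, 4, 6};
      uint32_t bits90 = percentileBitWidth(run, 0.90);
      uint32_t bits100 = percentileBitWidth(run, 1.00);
      // The outliers push the 100th percentile far above the 90th, so patching pays off.
      std::printf("90th=%u bits, 100th=%u bits -> %s\n",
                  static_cast<unsigned>(bits90), static_cast<unsigned>(bits100),
                  (bits100 - bits90 > 1) ? "PATCHED_BASE" : "DIRECT");
      return 0;
    }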
Hence we will fallback to direct - option.encoding = DIRECT; - return; - } -} - -uint64_t RleEncoderV2::flush() { - if (numLiterals != 0) { - EncodingOption option = {}; - if (variableRunLength != 0) { - determineEncoding(option); - writeValues(option); - } else if (fixedRunLength != 0) { - if (fixedRunLength < MIN_REPEAT) { - variableRunLength = fixedRunLength; - fixedRunLength = 0; - determineEncoding(option); - writeValues(option); - } else if (fixedRunLength >= MIN_REPEAT - && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { - option.encoding = SHORT_REPEAT; - writeValues(option); - } else { - option.encoding = DELTA; - option.isFixedDelta = true; - writeValues(option); - } - } - } - - outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition)); - uint64_t dataSize = outputStream->flush(); - bufferLength = bufferPosition = 0; - return dataSize; -} - -void RleEncoderV2::writeValues(EncodingOption& option) { - if (numLiterals != 0) { - switch (option.encoding) { - case SHORT_REPEAT: - writeShortRepeatValues(option); - break; - case DIRECT: - writeDirectValues(option); - break; - case PATCHED_BASE: - writePatchedBasedValues(option); - break; - case DELTA: - writeDeltaValues(option); - break; - default: - throw NotImplementedYet("Not implemented yet"); - } - - numLiterals = 0; - prevDelta = 0; - } -} - -void RleEncoderV2::writeShortRepeatValues(EncodingOption&) { - int64_t repeatVal; - if (isSigned) { - repeatVal = zigZag(literals[0]); - } else { - repeatVal = literals[0]; - } - - const uint32_t numBitsRepeatVal = findClosestNumBits(repeatVal); - const uint32_t numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? (numBitsRepeatVal >> 3) : ((numBitsRepeatVal >> 3) + 1); - - uint32_t header = getOpCode(SHORT_REPEAT); - - fixedRunLength -= MIN_REPEAT; - header |= fixedRunLength; - header |= ((numBytesRepeatVal - 1) << 3); - - writeByte(static_cast<char>(header)); - - for(int32_t i = static_cast<int32_t>(numBytesRepeatVal - 1); i >= 0; i--) { - int64_t b = ((repeatVal >> (i * 8)) & 0xff); - writeByte(static_cast<char>(b)); - } - - fixedRunLength = 0; -} - -void RleEncoderV2::writeDirectValues(EncodingOption& option) { - // write the number of fixed bits required in next 5 bits - uint32_t fb = option.zzBits100p; - if (alignedBitPacking) { - fb = getClosestAlignedFixedBits(fb); - } - - const uint32_t efb = encodeBitWidth(fb) << 1; - - // adjust variable run length - variableRunLength -= 1; - - // extract the 9th bit of run length - const uint32_t tailBits = (variableRunLength & 0x100) >> 8; - - // create first byte of the header - const char headerFirstByte = static_cast<char>(getOpCode(DIRECT) | efb | tailBits); - - // second byte of the header stores the remaining 8 bits of runlength - const char headerSecondByte = static_cast<char>(variableRunLength & 0xff); - - // write header - writeByte(headerFirstByte); - writeByte(headerSecondByte); - - // bit packing the zigzag encoded literals - writeInts(zigzagLiterals, 0, numLiterals, fb); - - // reset run length - variableRunLength = 0; -} - -void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) { - // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding - // because patch is applied to MSB bits. For example: If fixed bit width of - // base value is 7 bits and if patch is 3 bits, the actual value is - // constructed by shifting the patch to left by 7 positions. - // actual_value = patch << 7 | base_value - // So, if we align base_value then actual_value can not be reconstructed. 
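A tiny numeric example of the reconstruction formula in the note above, assuming a 95th-percentile width of 7 bits (the values are illustrative only):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Suppose the base-reduced literal 1000 does not fit in the 7-bit width chosen
      // from the 95th percentile. The encoder keeps only the low 7 bits and stores the
      // overflow (the "patch") separately; the reader rebuilds the value by shifting
      // the patch back above the packed bits.
      const uint32_t bits95p = 7;
      const int64_t value = 1000;
      const int64_t mask = (1LL << bits95p) - 1;     // 127
      const int64_t stored = value & mask;           // 104
      const int64_t patch = value >> bits95p;        // 7
      const int64_t rebuilt = (patch << bits95p) | stored;
      std::printf("stored=%lld patch=%lld rebuilt=%lld\n",
                  static_cast<long long>(stored), static_cast<long long>(patch),
                  static_cast<long long>(rebuilt));  // 104 7 1000
      // If 'stored' were re-packed at a wider, aligned width (say 8 bits), the shift
      // implied by the stream would no longer match bits95p and the original value
      // could not be reconstructed, which is why aligned packing is not used here.
      return 0;
    }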
- - // write the number of fixed bits required in next 5 bits - const uint32_t efb = encodeBitWidth(option.brBits95p) << 1; - - // adjust variable run length, they are one off - variableRunLength -= 1; - - // extract the 9th bit of run length - const uint32_t tailBits = (variableRunLength & 0x100) >> 8; - - // create first byte of the header - const char headerFirstByte = static_cast<char>(getOpCode(PATCHED_BASE) | efb | tailBits); - - // second byte of the header stores the remaining 8 bits of runlength - const char headerSecondByte = static_cast<char>(variableRunLength & 0xff); - - // if the min value is negative toggle the sign - const bool isNegative = (option.min < 0); - if (isNegative) { - option.min = -option.min; - } - - // find the number of bytes required for base and shift it by 5 bits - // to accommodate patch width. The additional bit is used to store the sign - // of the base value. - const uint32_t baseWidth = findClosestNumBits(option.min) + 1; - const uint32_t baseBytes = baseWidth % 8 == 0 ? baseWidth / 8 : (baseWidth / 8) + 1; - const uint32_t bb = (baseBytes - 1) << 5; - - // if the base value is negative then set MSB to 1 - if (isNegative) { - option.min |= (1LL << ((baseBytes * 8) - 1)); - } - - // third byte contains 3 bits for number of bytes occupied by base - // and 5 bits for patchWidth - const char headerThirdByte = static_cast<char>(bb | encodeBitWidth(option.patchWidth)); - - // fourth byte contains 3 bits for page gap width and 5 bits for - // patch length - const char headerFourthByte = static_cast<char>((option.patchGapWidth - 1) << 5 | option.patchLength); - - // write header - writeByte(headerFirstByte); - writeByte(headerSecondByte); - writeByte(headerThirdByte); - writeByte(headerFourthByte); - - // write the base value using fixed bytes in big endian order - for(int32_t i = static_cast<int32_t>(baseBytes - 1); i >= 0; i--) { - char b = static_cast<char>(((option.min >> (i * 8)) & 0xff)); - writeByte(b); - } - - // base reduced literals are bit packed - uint32_t closestFixedBits = getClosestFixedBits(option.brBits95p); - - writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits); - - // write patch list - closestFixedBits = getClosestFixedBits(option.patchGapWidth + option.patchWidth); - - writeInts(gapVsPatchList, 0, option.patchLength, closestFixedBits); - - // reset run length - variableRunLength = 0; -} - -void RleEncoderV2::writeDeltaValues(EncodingOption& option) { - uint32_t len = 0; - uint32_t fb = option.bitsDeltaMax; - uint32_t efb = 0; - - if (alignedBitPacking) { - fb = getClosestAlignedFixedBits(fb); - } - - if (option.isFixedDelta) { - // if fixed run length is greater than threshold then it will be fixed - // delta sequence with delta value 0 else fixed delta sequence with - // non-zero delta value - if (fixedRunLength > MIN_REPEAT) { - // ex. sequence: 2 2 2 2 2 2 2 2 - len = fixedRunLength - 1; - fixedRunLength = 0; - } else { - // ex. sequence: 4 6 8 10 12 14 16 - len = variableRunLength - 1; - variableRunLength = 0; - } - } else { - // fixed width 0 is used for long repeating values. 
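As a concrete illustration of the width-0 case, consider a run of thirty values equal to 7 in an unsigned column. Working through writeDeltaValues above by hand (so treat this as a sketch rather than authoritative output), the whole run should collapse to four bytes:

    0xc0   header byte 1: opcode DELTA in the top two bits, 5-bit width code 0, run-length high bit 0
    0x1d   header byte 2: run length 30, stored as 29
    0x07   first value 7, written as an unsigned varint
    0x00   fixed delta 0, written as a zigzag varint

No bit-packed delta blob follows; the width code 0 is what tells the decoder's nextDelta above to apply the fixed delta instead of reading packed values.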
- // sequences that require only 1 bit to encode will have an additional bit - if (fb == 1) { - fb = 2; - } - efb = encodeBitWidth(fb) << 1; - len = variableRunLength - 1; - variableRunLength = 0; - } - - // extract the 9th bit of run length - const uint32_t tailBits = (len & 0x100) >> 8; - - // create first byte of the header - const char headerFirstByte = static_cast<char>(getOpCode(DELTA) | efb | tailBits); - - // second byte of the header stores the remaining 8 bits of runlength - const char headerSecondByte = static_cast<char>(len & 0xff); - - // write header - writeByte(headerFirstByte); - writeByte(headerSecondByte); - - // store the first value from zigzag literal array - if (isSigned) { - writeVslong(literals[0]); - } else { - writeVulong(literals[0]); - } - - if (option.isFixedDelta) { - // if delta is fixed then we don't need to store delta blob - writeVslong(option.fixedDelta); - } else { - // store the first value as delta value using zigzag encoding - writeVslong(adjDeltas[0]); - - // adjacent delta values are bit packed. The length of adjDeltas array is - // always one less than the number of literals (delta difference for n - // elements is n-1). We have already written one element, write the - // remaining numLiterals - 2 elements here - writeInts(adjDeltas, 1, numLiterals - 2, fb); - } -} - -void RleEncoderV2::writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize) { - if(input == nullptr || len < 1 || bitSize < 1) { - return; - } - - if (getClosestAlignedFixedBits(bitSize) == bitSize) { - uint32_t numBytes; - uint32_t endOffSet = static_cast<uint32_t>(offset + len); + determineEncoding(option); + writeValues(option); + } + return; + } + + // case 2: variable delta run + + // if fixed run length is non-zero and if it satisfies the + // short repeat conditions then write the values as short repeats + // else use delta encoding + if (fixedRunLength >= MIN_REPEAT) { + if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { + option.encoding = SHORT_REPEAT; + } else { + option.encoding = DELTA; + option.isFixedDelta = true; + } + writeValues(option); + } + + // if fixed run length is <MIN_REPEAT and current value is + // different from previous then treat it as variable run + if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) { + variableRunLength = fixedRunLength; + fixedRunLength = 0; + } + + // after writing values re-initialize the variables + if (numLiterals == 0) { + initializeLiterals(val); + } else { + prevDelta = val - literals[numLiterals - 1]; + literals[numLiterals++] = val; + variableRunLength++; + + if (variableRunLength == MAX_LITERAL_SIZE) { + determineEncoding(option); + writeValues(option); + } + } +} + +void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) { + int64_t zzEncVal = 0; + for (size_t i = 0; i < numLiterals; i++) { + if (isSigned) { + zzEncVal = zigZag(literals[i]); + } else { + zzEncVal = literals[i]; + } + zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal; + } +} + +void RleEncoderV2::preparePatchedBlob(EncodingOption& option) { + // mask will be max value beyond which patch will be generated + int64_t mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1; + + // since we are considering only 95 percentile, the size of gap and + // patch array can contain only be 5% values + option.patchLength = static_cast<uint32_t>(std::ceil((numLiterals / 20))); + + // #bit for patch + option.patchWidth = option.brBits100p - option.brBits95p; + option.patchWidth = 
getClosestFixedBits(option.patchWidth); + + // if patch bit requirement is 64 then it will not possible to pack + // gap and patch together in a long. To make sure gap and patch can be + // packed together adjust the patch width + if (option.patchWidth == 64) { + option.patchWidth = 56; + option.brBits95p = 8; + mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1; + } + + uint32_t gapIdx = 0; + uint32_t patchIdx = 0; + size_t prev = 0; + size_t maxGap = 0; + + std::vector<int64_t> gapList; + std::vector<int64_t> patchList; + + for(size_t i = 0; i < numLiterals; i++) { + // if value is above mask then create the patch and record the gap + if (baseRedLiterals[i] > mask) { + size_t gap = i - prev; + if (gap > maxGap) { + maxGap = gap; + } + + // gaps are relative, so store the previous patched value index + prev = i; + gapList.push_back(static_cast<int64_t>(gap)); + gapIdx++; + + // extract the most significant bits that are over mask bits + int64_t patch = baseRedLiterals[i] >> option.brBits95p; + patchList.push_back(patch); + patchIdx++; + + // strip off the MSB to enable safe bit packing + baseRedLiterals[i] &= mask; + } + } + + // adjust the patch length to number of entries in gap list + option.patchLength = gapIdx; + + // if the element to be patched is the first and only element then + // max gap will be 0, but to store the gap as 0 we need atleast 1 bit + if (maxGap == 0 && option.patchLength != 0) { + option.patchGapWidth = 1; + } else { + option.patchGapWidth = findClosestNumBits(static_cast<int64_t>(maxGap)); + } + + // special case: if the patch gap width is greater than 256, then + // we need 9 bits to encode the gap width. But we only have 3 bits in + // header to record the gap width. To deal with this case, we will save + // two entries in patch list in the following way + // 256 gap width => 0 for patch value + // actual gap - 256 => actual patch value + // We will do the same for gap width = 511. If the element to be patched is + // the last element in the scope then gap width will be 511. In this case we + // will have 3 entries in the patch list in the following way + // 255 gap width => 0 for patch value + // 255 gap width => 0 for patch value + // 1 gap width => actual patch value + if (option.patchGapWidth > 8) { + option.patchGapWidth = 8; + // for gap = 511, we need two additional entries in patch list + if (maxGap == 511) { + option.patchLength += 2; + } else { + option.patchLength += 1; + } + } + + // create gap vs patch list + gapIdx = 0; + patchIdx = 0; + for(size_t i = 0; i < option.patchLength; i++) { + int64_t g = gapList[gapIdx++]; + int64_t p = patchList[patchIdx++]; + while (g > 255) { + gapVsPatchList[option.gapVsPatchListCount++] = (255L << option.patchWidth); + i++; + g -= 255; + } + + // store patch value in LSBs and gap in MSBs + gapVsPatchList[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p); + } +} + +void RleEncoderV2::determineEncoding(EncodingOption& option) { + // We need to compute zigzag values for DIRECT and PATCHED_BASE encodings, + // but not for SHORT_REPEAT or DELTA. So we only perform the zigzag + // computation when it's determined to be necessary. 
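The zigzag transform referred to here is the usual one; a minimal sketch for reference (the helper name is illustrative, not the library's):

    #include <cstdint>
    #include <cstdio>

    // Interleave sign: 0,-1,1,-2,2,... becomes 0,1,2,3,4,... so small magnitudes
    // stay small and bit-pack well regardless of sign.
    static uint64_t zigZagEncode(int64_t v) {
      return (static_cast<uint64_t>(v) << 1) ^ (v < 0 ? ~0ULL : 0ULL);
    }

    int main() {
      const int64_t samples[] = {0, -1, 1, -2, 2, -3};
      for (int64_t s : samples) {
        std::printf("%lld -> %llu\n", static_cast<long long>(s),
                    static_cast<unsigned long long>(zigZagEncode(s)));  // 0 1 2 3 4 5
      }
      return 0;
    }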
+ + // not a big win for shorter runs to determine encoding + if (numLiterals <= MIN_REPEAT) { + // we need to compute zigzag values for DIRECT encoding if we decide to + // break early for delta overflows or for shorter runs + computeZigZagLiterals(option); + option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0); + option.encoding = DIRECT; + return; + } + + // DELTA encoding check + + // for identifying monotonic sequences + bool isIncreasing = true; + bool isDecreasing = true; + option.isFixedDelta = true; + + option.min = literals[0]; + int64_t max = literals[0]; + int64_t initialDelta = literals[1] - literals[0]; + int64_t currDelta = 0; + int64_t deltaMax = 0; + adjDeltas[option.adjDeltasCount++] = initialDelta; + + for (size_t i = 1; i < numLiterals; i++) { + const int64_t l1 = literals[i]; + const int64_t l0 = literals[i - 1]; + currDelta = l1 - l0; + option.min = std::min(option.min, l1); + max = std::max(max, l1); + + isIncreasing &= (l0 <= l1); + isDecreasing &= (l0 >= l1); + + option.isFixedDelta &= (currDelta == initialDelta); + if (i > 1) { + adjDeltas[option.adjDeltasCount++] = std::abs(currDelta); + deltaMax = std::max(deltaMax, adjDeltas[i - 1]); + } + } + + // it's faster to exit under delta overflow condition without checking for + // PATCHED_BASE condition as encoding using DIRECT is faster and has less + // overhead than PATCHED_BASE + if (!isSafeSubtract(max, option.min)) { + computeZigZagLiterals(option); + option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0); + option.encoding = DIRECT; + return; + } + + // invariant - subtracting any number from any other in the literals after + // option point won't overflow + + // if min is equal to max then the delta is 0, option condition happens for + // fixed values run >10 which cannot be encoded with SHORT_REPEAT + if (option.min == max) { + if (!option.isFixedDelta) { + throw InvalidArgument(to_string(option.min) + "==" + + to_string(max) + ", isFixedDelta cannot be false"); + } + + if(currDelta != 0) { + throw InvalidArgument(to_string(option.min) + "==" + + to_string(max) + ", currDelta should be zero"); + } + option.fixedDelta = 0; + option.encoding = DELTA; + return; + } + + if (option.isFixedDelta) { + if (currDelta != initialDelta) { + throw InvalidArgument("currDelta should be equal to initialDelta for fixed delta encoding"); + } + + option.encoding = DELTA; + option.fixedDelta = currDelta; + return; + } + + // if initialDelta is 0 then we cannot delta encode as we cannot identify + // the sign of deltas (increasing or decreasing) + if (initialDelta != 0) { + // stores the number of bits required for packing delta blob in + // delta encoding + option.bitsDeltaMax = findClosestNumBits(deltaMax); + + // monotonic condition + if (isIncreasing || isDecreasing) { + option.encoding = DELTA; + return; + } + } + + // PATCHED_BASE encoding check + + // percentile values are computed for the zigzag encoded values. if the + // number of bit requirement between 90th and 100th percentile varies + // beyond a threshold then we need to patch the values. 
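Before this percentile check is reached, the code above has already decided whether DELTA applies. Stripped of the overflow and fixed-delta special cases, that test reduces to something like the sketch below; this is a restatement for clarity, not the library's API:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Rough restatement of the DELTA eligibility test: the run must be monotonic and
    // the first delta must be non-zero (otherwise the direction of the run cannot be
    // recovered), with all-equal runs handled separately as a fixed delta of zero.
    static bool deltaEligible(const std::vector<int64_t>& run) {
      if (run.size() < 2) return false;
      bool increasing = true, decreasing = true;
      int64_t initialDelta = run[1] - run[0];
      for (size_t i = 1; i < run.size(); ++i) {
        increasing &= (run[i - 1] <= run[i]);
        decreasing &= (run[i - 1] >= run[i]);
      }
      return initialDelta != 0 && (increasing || decreasing);
    }

    int main() {
      std::printf("%d\n", deltaEligible({2, 3, 5, 7, 11, 13}));  // 1: increasing run
      std::printf("%d\n", deltaEligible({5, 5, 7, 6, 9, 8}));    // 0: not monotonic
      return 0;
    }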
if the variation + // is not significant then we can use direct encoding + + computeZigZagLiterals(option); + option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0); + option.zzBits90p = percentileBits(zigzagLiterals, 0, numLiterals, 0.9, true); + uint32_t diffBitsLH = option.zzBits100p - option.zzBits90p; + + // if the difference between 90th percentile and 100th percentile fixed + // bits is > 1 then we need patch the values + if (diffBitsLH > 1) { + + // patching is done only on base reduced values. + // remove base from literals + for (size_t i = 0; i < numLiterals; i++) { + baseRedLiterals[option.baseRedLiteralsCount++] = (literals[i] - option.min); + } + + // 95th percentile width is used to determine max allowed value + // after which patching will be done + option.brBits95p = percentileBits(baseRedLiterals, 0, numLiterals, 0.95); + + // 100th percentile is used to compute the max patch width + option.brBits100p = percentileBits(baseRedLiterals, 0, numLiterals, 1.0, true); + + // after base reducing the values, if the difference in bits between + // 95th percentile and 100th percentile value is zero then there + // is no point in patching the values, in which case we will + // fallback to DIRECT encoding. + // The decision to use patched base was based on zigzag values, but the + // actual patching is done on base reduced literals. + if ((option.brBits100p - option.brBits95p) != 0) { + option.encoding = PATCHED_BASE; + preparePatchedBlob(option); + return; + } else { + option.encoding = DIRECT; + return; + } + } else { + // if difference in bits between 95th percentile and 100th percentile is + // 0, then patch length will become 0. Hence we will fallback to direct + option.encoding = DIRECT; + return; + } +} + +uint64_t RleEncoderV2::flush() { + if (numLiterals != 0) { + EncodingOption option = {}; + if (variableRunLength != 0) { + determineEncoding(option); + writeValues(option); + } else if (fixedRunLength != 0) { + if (fixedRunLength < MIN_REPEAT) { + variableRunLength = fixedRunLength; + fixedRunLength = 0; + determineEncoding(option); + writeValues(option); + } else if (fixedRunLength >= MIN_REPEAT + && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { + option.encoding = SHORT_REPEAT; + writeValues(option); + } else { + option.encoding = DELTA; + option.isFixedDelta = true; + writeValues(option); + } + } + } + + outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition)); + uint64_t dataSize = outputStream->flush(); + bufferLength = bufferPosition = 0; + return dataSize; +} + +void RleEncoderV2::writeValues(EncodingOption& option) { + if (numLiterals != 0) { + switch (option.encoding) { + case SHORT_REPEAT: + writeShortRepeatValues(option); + break; + case DIRECT: + writeDirectValues(option); + break; + case PATCHED_BASE: + writePatchedBasedValues(option); + break; + case DELTA: + writeDeltaValues(option); + break; + default: + throw NotImplementedYet("Not implemented yet"); + } + + numLiterals = 0; + prevDelta = 0; + } +} + +void RleEncoderV2::writeShortRepeatValues(EncodingOption&) { + int64_t repeatVal; + if (isSigned) { + repeatVal = zigZag(literals[0]); + } else { + repeatVal = literals[0]; + } + + const uint32_t numBitsRepeatVal = findClosestNumBits(repeatVal); + const uint32_t numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? 
(numBitsRepeatVal >> 3) : ((numBitsRepeatVal >> 3) + 1); + + uint32_t header = getOpCode(SHORT_REPEAT); + + fixedRunLength -= MIN_REPEAT; + header |= fixedRunLength; + header |= ((numBytesRepeatVal - 1) << 3); + + writeByte(static_cast<char>(header)); + + for(int32_t i = static_cast<int32_t>(numBytesRepeatVal - 1); i >= 0; i--) { + int64_t b = ((repeatVal >> (i * 8)) & 0xff); + writeByte(static_cast<char>(b)); + } + + fixedRunLength = 0; +} + +void RleEncoderV2::writeDirectValues(EncodingOption& option) { + // write the number of fixed bits required in next 5 bits + uint32_t fb = option.zzBits100p; + if (alignedBitPacking) { + fb = getClosestAlignedFixedBits(fb); + } + + const uint32_t efb = encodeBitWidth(fb) << 1; + + // adjust variable run length + variableRunLength -= 1; + + // extract the 9th bit of run length + const uint32_t tailBits = (variableRunLength & 0x100) >> 8; + + // create first byte of the header + const char headerFirstByte = static_cast<char>(getOpCode(DIRECT) | efb | tailBits); + + // second byte of the header stores the remaining 8 bits of runlength + const char headerSecondByte = static_cast<char>(variableRunLength & 0xff); + + // write header + writeByte(headerFirstByte); + writeByte(headerSecondByte); + + // bit packing the zigzag encoded literals + writeInts(zigzagLiterals, 0, numLiterals, fb); + + // reset run length + variableRunLength = 0; +} + +void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) { + // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding + // because patch is applied to MSB bits. For example: If fixed bit width of + // base value is 7 bits and if patch is 3 bits, the actual value is + // constructed by shifting the patch to left by 7 positions. + // actual_value = patch << 7 | base_value + // So, if we align base_value then actual_value can not be reconstructed. + + // write the number of fixed bits required in next 5 bits + const uint32_t efb = encodeBitWidth(option.brBits95p) << 1; + + // adjust variable run length, they are one off + variableRunLength -= 1; + + // extract the 9th bit of run length + const uint32_t tailBits = (variableRunLength & 0x100) >> 8; + + // create first byte of the header + const char headerFirstByte = static_cast<char>(getOpCode(PATCHED_BASE) | efb | tailBits); + + // second byte of the header stores the remaining 8 bits of runlength + const char headerSecondByte = static_cast<char>(variableRunLength & 0xff); + + // if the min value is negative toggle the sign + const bool isNegative = (option.min < 0); + if (isNegative) { + option.min = -option.min; + } + + // find the number of bytes required for base and shift it by 5 bits + // to accommodate patch width. The additional bit is used to store the sign + // of the base value. + const uint32_t baseWidth = findClosestNumBits(option.min) + 1; + const uint32_t baseBytes = baseWidth % 8 == 0 ? 
baseWidth / 8 : (baseWidth / 8) + 1; + const uint32_t bb = (baseBytes - 1) << 5; + + // if the base value is negative then set MSB to 1 + if (isNegative) { + option.min |= (1LL << ((baseBytes * 8) - 1)); + } + + // third byte contains 3 bits for number of bytes occupied by base + // and 5 bits for patchWidth + const char headerThirdByte = static_cast<char>(bb | encodeBitWidth(option.patchWidth)); + + // fourth byte contains 3 bits for page gap width and 5 bits for + // patch length + const char headerFourthByte = static_cast<char>((option.patchGapWidth - 1) << 5 | option.patchLength); + + // write header + writeByte(headerFirstByte); + writeByte(headerSecondByte); + writeByte(headerThirdByte); + writeByte(headerFourthByte); + + // write the base value using fixed bytes in big endian order + for(int32_t i = static_cast<int32_t>(baseBytes - 1); i >= 0; i--) { + char b = static_cast<char>(((option.min >> (i * 8)) & 0xff)); + writeByte(b); + } + + // base reduced literals are bit packed + uint32_t closestFixedBits = getClosestFixedBits(option.brBits95p); + + writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits); + + // write patch list + closestFixedBits = getClosestFixedBits(option.patchGapWidth + option.patchWidth); + + writeInts(gapVsPatchList, 0, option.patchLength, closestFixedBits); + + // reset run length + variableRunLength = 0; +} + +void RleEncoderV2::writeDeltaValues(EncodingOption& option) { + uint32_t len = 0; + uint32_t fb = option.bitsDeltaMax; + uint32_t efb = 0; + + if (alignedBitPacking) { + fb = getClosestAlignedFixedBits(fb); + } + + if (option.isFixedDelta) { + // if fixed run length is greater than threshold then it will be fixed + // delta sequence with delta value 0 else fixed delta sequence with + // non-zero delta value + if (fixedRunLength > MIN_REPEAT) { + // ex. sequence: 2 2 2 2 2 2 2 2 + len = fixedRunLength - 1; + fixedRunLength = 0; + } else { + // ex. sequence: 4 6 8 10 12 14 16 + len = variableRunLength - 1; + variableRunLength = 0; + } + } else { + // fixed width 0 is used for long repeating values. + // sequences that require only 1 bit to encode will have an additional bit + if (fb == 1) { + fb = 2; + } + efb = encodeBitWidth(fb) << 1; + len = variableRunLength - 1; + variableRunLength = 0; + } + + // extract the 9th bit of run length + const uint32_t tailBits = (len & 0x100) >> 8; + + // create first byte of the header + const char headerFirstByte = static_cast<char>(getOpCode(DELTA) | efb | tailBits); + + // second byte of the header stores the remaining 8 bits of runlength + const char headerSecondByte = static_cast<char>(len & 0xff); + + // write header + writeByte(headerFirstByte); + writeByte(headerSecondByte); + + // store the first value from zigzag literal array + if (isSigned) { + writeVslong(literals[0]); + } else { + writeVulong(literals[0]); + } + + if (option.isFixedDelta) { + // if delta is fixed then we don't need to store delta blob + writeVslong(option.fixedDelta); + } else { + // store the first value as delta value using zigzag encoding + writeVslong(adjDeltas[0]); + + // adjacent delta values are bit packed. The length of adjDeltas array is + // always one less than the number of literals (delta difference for n + // elements is n-1). 
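Putting this routine together with the bit-packing helper that follows, a hand-worked example may help. For the unsigned run [2, 3, 5, 7, 11, 13, 17, 19, 23, 29] with aligned bit packing, the deltas between consecutive values are [1, 2, 2, 4, 2, 4, 2, 4, 6]; the largest needs 3 bits, which aligns up to 4. Derived by hand from the code above, so treat it as a sketch rather than authoritative output, the stream should be:

    0xc6                  header byte 1: opcode DELTA in the top two bits, 5-bit width code 3 (4-bit deltas), run-length high bit 0
    0x09                  header byte 2: run length 10, stored as 9
    0x02                  first value 2 as an unsigned varint
    0x02                  first delta 1, zigzag-encoded as a signed varint
    0x22 0x42 0x42 0x46   remaining deltas [2, 2, 4, 2, 4, 2, 4, 6] packed at 4 bits each, MSB first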
We have already written one element, write the + // remaining numLiterals - 2 elements here + writeInts(adjDeltas, 1, numLiterals - 2, fb); + } +} + +void RleEncoderV2::writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize) { + if(input == nullptr || len < 1 || bitSize < 1) { + return; + } + + if (getClosestAlignedFixedBits(bitSize) == bitSize) { + uint32_t numBytes; + uint32_t endOffSet = static_cast<uint32_t>(offset + len); if (bitSize < 8 ) { - char bitMask = static_cast<char>((1 << bitSize) - 1); - uint32_t numHops = 8 / bitSize; - uint32_t remainder = static_cast<uint32_t>(len % numHops); - uint32_t endUnroll = endOffSet - remainder; - for (uint32_t i = offset; i < endUnroll; i+=numHops) { - char toWrite = 0; - for (uint32_t j = 0; j < numHops; ++j) { - toWrite |= static_cast<char>((input[i+j] & bitMask) << (8 - (j + 1) * bitSize)); - } - writeByte(toWrite); - } - - if (remainder > 0) { - uint32_t startShift = 8 - bitSize; - char toWrite = 0; - for (uint32_t i = endUnroll; i < endOffSet; ++i) { - toWrite |= static_cast<char>((input[i] & bitMask) << startShift); - startShift -= bitSize; - } - writeByte(toWrite); - } - - } else { - numBytes = bitSize / 8; - - for (uint32_t i = offset; i < endOffSet; ++i) { - for (uint32_t j = 0; j < numBytes; ++j) { - char toWrite = static_cast<char>((input[i] >> (8 * (numBytes - j - 1))) & 255); - writeByte(toWrite); - } - } - } - - return; - } - - // write for unaligned bit size - uint32_t bitsLeft = 8; - char current = 0; - for(uint32_t i = offset; i < (offset + len); i++) { - int64_t value = input[i]; - uint32_t bitsToWrite = bitSize; - while (bitsToWrite > bitsLeft) { - // add the bits to the bottom of the current word - current |= static_cast<char>(value >> (bitsToWrite - bitsLeft)); - // subtract out the bits we just added - bitsToWrite -= bitsLeft; - // zero out the bits above bitsToWrite - value &= (static_cast<uint64_t>(1) << bitsToWrite) - 1; - writeByte(current); - current = 0; - bitsLeft = 8; - } - bitsLeft -= bitsToWrite; - current |= static_cast<char>(value << bitsLeft); - if (bitsLeft == 0) { - writeByte(current); - current = 0; - bitsLeft = 8; - } - } - - // flush - if (bitsLeft != 8) { - writeByte(current); - } -} - -void RleEncoderV2::initializeLiterals(int64_t val) { - literals[numLiterals++] = val; - fixedRunLength = 1; - variableRunLength = 1; -} -} + char bitMask = static_cast<char>((1 << bitSize) - 1); + uint32_t numHops = 8 / bitSize; + uint32_t remainder = static_cast<uint32_t>(len % numHops); + uint32_t endUnroll = endOffSet - remainder; + for (uint32_t i = offset; i < endUnroll; i+=numHops) { + char toWrite = 0; + for (uint32_t j = 0; j < numHops; ++j) { + toWrite |= static_cast<char>((input[i+j] & bitMask) << (8 - (j + 1) * bitSize)); + } + writeByte(toWrite); + } + + if (remainder > 0) { + uint32_t startShift = 8 - bitSize; + char toWrite = 0; + for (uint32_t i = endUnroll; i < endOffSet; ++i) { + toWrite |= static_cast<char>((input[i] & bitMask) << startShift); + startShift -= bitSize; + } + writeByte(toWrite); + } + + } else { + numBytes = bitSize / 8; + + for (uint32_t i = offset; i < endOffSet; ++i) { + for (uint32_t j = 0; j < numBytes; ++j) { + char toWrite = static_cast<char>((input[i] >> (8 * (numBytes - j - 1))) & 255); + writeByte(toWrite); + } + } + } + + return; + } + + // write for unaligned bit size + uint32_t bitsLeft = 8; + char current = 0; + for(uint32_t i = offset; i < (offset + len); i++) { + int64_t value = input[i]; + uint32_t bitsToWrite = bitSize; + while (bitsToWrite > bitsLeft) { 
+ // add the bits to the bottom of the current word + current |= static_cast<char>(value >> (bitsToWrite - bitsLeft)); + // subtract out the bits we just added + bitsToWrite -= bitsLeft; + // zero out the bits above bitsToWrite + value &= (static_cast<uint64_t>(1) << bitsToWrite) - 1; + writeByte(current); + current = 0; + bitsLeft = 8; + } + bitsLeft -= bitsToWrite; + current |= static_cast<char>(value << bitsLeft); + if (bitsLeft == 0) { + writeByte(current); + current = 0; + bitsLeft = 8; + } + } + + // flush + if (bitsLeft != 8) { + writeByte(current); + } +} + +void RleEncoderV2::initializeLiterals(int64_t val) { + literals[numLiterals++] = val; + fixedRunLength = 1; + variableRunLength = 1; +} +} diff --git a/contrib/libs/apache/orc/c++/src/Statistics.cc b/contrib/libs/apache/orc/c++/src/Statistics.cc index 2401f5e0cb..f13381b5b0 100644 --- a/contrib/libs/apache/orc/c++/src/Statistics.cc +++ b/contrib/libs/apache/orc/c++/src/Statistics.cc @@ -1,408 +1,408 @@ - /** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "orc/Exceptions.hh" -#include "RLE.hh" -#include "Statistics.hh" - -#include "wrap/coded-stream-wrapper.h" - -namespace orc { - - ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, - const StatContext& statContext) { - if (s.has_intstatistics()) { - return new IntegerColumnStatisticsImpl(s); - } else if (s.has_doublestatistics()) { - return new DoubleColumnStatisticsImpl(s); - } else if (s.has_stringstatistics()) { - return new StringColumnStatisticsImpl(s, statContext); - } else if (s.has_bucketstatistics()) { - return new BooleanColumnStatisticsImpl(s, statContext); - } else if (s.has_decimalstatistics()) { - return new DecimalColumnStatisticsImpl(s, statContext); - } else if (s.has_timestampstatistics()) { - return new TimestampColumnStatisticsImpl(s, statContext); - } else if (s.has_datestatistics()) { - return new DateColumnStatisticsImpl(s, statContext); - } else if (s.has_binarystatistics()) { - return new BinaryColumnStatisticsImpl(s, statContext); - } else { - return new ColumnStatisticsImpl(s); - } - } - - StatisticsImpl::StatisticsImpl(const proto::StripeStatistics& stripeStats, - const StatContext& statContext) { - for(int i = 0; i < stripeStats.colstats_size(); i++) { - colStats.push_back( - convertColumnStatistics(stripeStats.colstats(i), statContext)); - } - } - - StatisticsImpl::StatisticsImpl(const proto::Footer& footer, - const StatContext& statContext) { - for(int i = 0; i < footer.statistics_size(); i++) { - colStats.push_back( - convertColumnStatistics(footer.statistics(i), statContext)); - } - } - - StatisticsImpl::~StatisticsImpl() { - for(std::vector<ColumnStatistics*>::iterator ptr = colStats.begin(); - ptr != colStats.end(); - ++ptr) { - delete *ptr; - } - } - - Statistics::~Statistics() { - // PASS - } - - StripeStatistics::~StripeStatistics() { - // PASS - } - - StripeStatisticsImpl::~StripeStatisticsImpl() { - // PASS - } - - StripeStatisticsImpl::StripeStatisticsImpl( - const proto::StripeStatistics& stripeStats, - std::vector<std::vector<proto::ColumnStatistics> >& indexStats, - const StatContext& statContext) { - columnStats.reset(new StatisticsImpl(stripeStats, statContext)); - rowIndexStats.resize(indexStats.size()); - for(size_t i = 0; i < rowIndexStats.size(); i++) { - for(size_t j = 0; j < indexStats[i].size(); j++) { - rowIndexStats[i].push_back( - std::shared_ptr<const ColumnStatistics>( - convertColumnStatistics(indexStats[i][j], statContext))); - } - } - } - - - ColumnStatistics::~ColumnStatistics() { - // PASS - } - - BinaryColumnStatistics::~BinaryColumnStatistics() { - // PASS - } - - BooleanColumnStatistics::~BooleanColumnStatistics() { - // PASS - } - - DateColumnStatistics::~DateColumnStatistics() { - // PASS - } - - DecimalColumnStatistics::~DecimalColumnStatistics() { - // PASS - } - - DoubleColumnStatistics::~DoubleColumnStatistics() { - // PASS - } - - IntegerColumnStatistics::~IntegerColumnStatistics() { - // PASS - } - - StringColumnStatistics::~StringColumnStatistics() { - // PASS - } - - TimestampColumnStatistics::~TimestampColumnStatistics() { - // PASS - } - - MutableColumnStatistics::~MutableColumnStatistics() { - // PASS - } - - ColumnStatisticsImpl::~ColumnStatisticsImpl() { - // PASS - } - - BinaryColumnStatisticsImpl::~BinaryColumnStatisticsImpl() { - // PASS - } - - BooleanColumnStatisticsImpl::~BooleanColumnStatisticsImpl() { - // PASS - } - - DateColumnStatisticsImpl::~DateColumnStatisticsImpl() { - // PASS - } - - DecimalColumnStatisticsImpl::~DecimalColumnStatisticsImpl() { - // 
PASS - } - - DoubleColumnStatisticsImpl::~DoubleColumnStatisticsImpl() { - // PASS - } - - IntegerColumnStatisticsImpl::~IntegerColumnStatisticsImpl() { - // PASS - } - - StringColumnStatisticsImpl::~StringColumnStatisticsImpl() { - // PASS - } - - TimestampColumnStatisticsImpl::~TimestampColumnStatisticsImpl() { - // PASS - } - - ColumnStatisticsImpl::ColumnStatisticsImpl - (const proto::ColumnStatistics& pb) { - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - } - - BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (pb.has_binarystatistics() && statContext.correctStats) { - _stats.setHasTotalLength(pb.binarystatistics().has_sum()); - _stats.setTotalLength( - static_cast<uint64_t>(pb.binarystatistics().sum())); - } - } - - BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (pb.has_bucketstatistics() && statContext.correctStats) { - _hasCount = true; - _trueCount = pb.bucketstatistics().count(0); - } else { - _hasCount = false; - _trueCount = 0; - } - } - - DateColumnStatisticsImpl::DateColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (!pb.has_datestatistics() || !statContext.correctStats) { - // hasMinimum_ is false by default; - // hasMaximum_ is false by default; - _stats.setMinimum(0); - _stats.setMaximum(0); - } else { - _stats.setHasMinimum(pb.datestatistics().has_minimum()); - _stats.setHasMaximum(pb.datestatistics().has_maximum()); - _stats.setMinimum(pb.datestatistics().minimum()); - _stats.setMaximum(pb.datestatistics().maximum()); - } - } - - DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (pb.has_decimalstatistics() && statContext.correctStats) { - const proto::DecimalStatistics& stats = pb.decimalstatistics(); - _stats.setHasMinimum(stats.has_minimum()); - _stats.setHasMaximum(stats.has_maximum()); - _stats.setHasSum(stats.has_sum()); - - _stats.setMinimum(Decimal(stats.minimum())); - _stats.setMaximum(Decimal(stats.maximum())); - _stats.setSum(Decimal(stats.sum())); - } - } - - DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl - (const proto::ColumnStatistics& pb){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (!pb.has_doublestatistics()) { - _stats.setMinimum(0); - _stats.setMaximum(0); - _stats.setSum(0); - }else{ - const proto::DoubleStatistics& stats = pb.doublestatistics(); - _stats.setHasMinimum(stats.has_minimum()); - _stats.setHasMaximum(stats.has_maximum()); - _stats.setHasSum(stats.has_sum()); - - _stats.setMinimum(stats.minimum()); - _stats.setMaximum(stats.maximum()); - _stats.setSum(stats.sum()); - } - } - - IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl - (const proto::ColumnStatistics& pb){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (!pb.has_intstatistics()) { - _stats.setMinimum(0); - _stats.setMaximum(0); - _stats.setSum(0); - }else{ - const proto::IntegerStatistics& stats = pb.intstatistics(); - 
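// The Statistics.cc hunk above dispatches on which optional protobuf
// sub-message is present (has_intstatistics(), has_bucketstatistics(), ...),
// and the per-type constructors only trust the serialized min/max/sum fields
// when StatContext::correctStats is set. A minimal usage sketch, assuming the
// orc headers are in scope and that 'pb' and 'writerTz' come from an
// already-decoded file footer (neither is constructed here):
std::unique_ptr<ColumnStatistics> decodeOneColumn(
    const proto::ColumnStatistics& pb, const Timezone* writerTz) {
  // correctStats = true: the constructors may use the serialized statistics;
  // writerTz lets timestamp statistics be shifted from writer-local time to UTC.
  StatContext ctx(true, writerTz);
  return std::unique_ptr<ColumnStatistics>(convertColumnStatistics(pb, ctx));
}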
_stats.setHasMinimum(stats.has_minimum()); - _stats.setHasMaximum(stats.has_maximum()); - _stats.setHasSum(stats.has_sum()); - - _stats.setMinimum(stats.minimum()); - _stats.setMaximum(stats.maximum()); - _stats.setSum(stats.sum()); - } - } - - StringColumnStatisticsImpl::StringColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (!pb.has_stringstatistics() || !statContext.correctStats) { - _stats.setTotalLength(0); - }else{ - const proto::StringStatistics& stats = pb.stringstatistics(); - _stats.setHasMinimum(stats.has_minimum()); - _stats.setHasMaximum(stats.has_maximum()); - _stats.setHasTotalLength(stats.has_sum()); - - _stats.setMinimum(stats.minimum()); - _stats.setMaximum(stats.maximum()); - _stats.setTotalLength(static_cast<uint64_t>(stats.sum())); - } - } - - TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext) { - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (!pb.has_timestampstatistics() || !statContext.correctStats) { - _stats.setMinimum(0); - _stats.setMaximum(0); - _lowerBound = 0; - _upperBound = 0; - }else{ - const proto::TimestampStatistics& stats = pb.timestampstatistics(); - _stats.setHasMinimum( - stats.has_minimumutc() || - (stats.has_minimum() && (statContext.writerTimezone != nullptr))); - _stats.setHasMaximum( - stats.has_maximumutc() || - (stats.has_maximum() && (statContext.writerTimezone != nullptr))); - _hasLowerBound = stats.has_minimumutc() || stats.has_minimum(); - _hasUpperBound = stats.has_maximumutc() || stats.has_maximum(); - - // Timestamp stats are stored in milliseconds - if (stats.has_minimumutc()) { - int64_t minimum = stats.minimumutc(); - _stats.setMinimum(minimum); - _lowerBound = minimum; - } else if (statContext.writerTimezone) { - int64_t writerTimeSec = stats.minimum() / 1000; - // multiply the offset by 1000 to convert to millisecond - int64_t minimum = - stats.minimum() + - (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) - * 1000; - _stats.setMinimum(minimum); - _lowerBound = minimum; - } else { - _stats.setMinimum(0); - // subtract 1 day 1 hour (25 hours) in milliseconds to handle unknown - // TZ and daylight savings - _lowerBound = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000); - } - - // Timestamp stats are stored in milliseconds - if (stats.has_maximumutc()) { - int64_t maximum = stats.maximumutc(); - _stats.setMaximum(maximum); - _upperBound = maximum; - } else if (statContext.writerTimezone) { - int64_t writerTimeSec = stats.maximum() / 1000; - // multiply the offset by 1000 to convert to millisecond - int64_t maximum = stats.maximum() + - (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) - * 1000; - _stats.setMaximum(maximum); - _upperBound = maximum; - } else { - _stats.setMaximum(0); - // add 1 day 1 hour (25 hours) in milliseconds to handle unknown - // TZ and daylight savings - _upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000); - } - // Add 1 millisecond to account for microsecond precision of values - _upperBound += 1; - } - } - - std::unique_ptr<MutableColumnStatistics> createColumnStatistics( - const Type& type) { - switch (static_cast<int64_t>(type.getKind())) { - case BOOLEAN: - return std::unique_ptr<MutableColumnStatistics>( - new BooleanColumnStatisticsImpl()); - case BYTE: - case INT: - case LONG: - case SHORT: - return 
std::unique_ptr<MutableColumnStatistics>( - new IntegerColumnStatisticsImpl()); - case STRUCT: - case MAP: - case LIST: - case UNION: - return std::unique_ptr<MutableColumnStatistics>( - new ColumnStatisticsImpl()); - case FLOAT: - case DOUBLE: - return std::unique_ptr<MutableColumnStatistics>( - new DoubleColumnStatisticsImpl()); - case BINARY: - return std::unique_ptr<MutableColumnStatistics>( - new BinaryColumnStatisticsImpl()); - case STRING: - case CHAR: - case VARCHAR: - return std::unique_ptr<MutableColumnStatistics>( - new StringColumnStatisticsImpl()); - case DATE: - return std::unique_ptr<MutableColumnStatistics>( - new DateColumnStatisticsImpl()); - case TIMESTAMP: - return std::unique_ptr<MutableColumnStatistics>( - new TimestampColumnStatisticsImpl()); - case DECIMAL: - return std::unique_ptr<MutableColumnStatistics>( - new DecimalColumnStatisticsImpl()); - default: - throw NotImplementedYet("Not supported type: " + type.toString()); - } - } - -}// namespace + /** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
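// TimestampColumnStatisticsImpl above keeps min/max in milliseconds: UTC values
// are used directly, writer-local values are shifted by the timezone's
// gmtOffset, and when no timezone is available the bounds are padded by 25
// hours; one extra millisecond is added to the upper bound for truncated
// sub-millisecond precision. A small self-contained sketch of just that
// arithmetic; the function names and the gmtOffsetSeconds parameter are
// illustrative assumptions.
#include <cstdint>

const int64_t kSecondsPerHour = 3600;  // mirrors SECONDS_PER_HOUR

inline int64_t lowerBoundUtcMillis(int64_t storedMinMillis,
                                   int64_t gmtOffsetSeconds,
                                   bool timezoneKnown) {
  if (timezoneKnown) {
    return storedMinMillis + gmtOffsetSeconds * 1000;      // shift to UTC
  }
  return storedMinMillis - 25 * kSecondsPerHour * 1000;    // unknown TZ/DST slack
}

inline int64_t upperBoundUtcMillis(int64_t storedMaxMillis,
                                   int64_t gmtOffsetSeconds,
                                   bool timezoneKnown) {
  int64_t bound = timezoneKnown
      ? storedMaxMillis + gmtOffsetSeconds * 1000
      : storedMaxMillis + 25 * kSecondsPerHour * 1000;
  return bound + 1;   // account for truncated sub-millisecond precision
}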
+ */ + +#include "orc/Exceptions.hh" +#include "RLE.hh" +#include "Statistics.hh" + +#include "wrap/coded-stream-wrapper.h" + +namespace orc { + + ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, + const StatContext& statContext) { + if (s.has_intstatistics()) { + return new IntegerColumnStatisticsImpl(s); + } else if (s.has_doublestatistics()) { + return new DoubleColumnStatisticsImpl(s); + } else if (s.has_stringstatistics()) { + return new StringColumnStatisticsImpl(s, statContext); + } else if (s.has_bucketstatistics()) { + return new BooleanColumnStatisticsImpl(s, statContext); + } else if (s.has_decimalstatistics()) { + return new DecimalColumnStatisticsImpl(s, statContext); + } else if (s.has_timestampstatistics()) { + return new TimestampColumnStatisticsImpl(s, statContext); + } else if (s.has_datestatistics()) { + return new DateColumnStatisticsImpl(s, statContext); + } else if (s.has_binarystatistics()) { + return new BinaryColumnStatisticsImpl(s, statContext); + } else { + return new ColumnStatisticsImpl(s); + } + } + + StatisticsImpl::StatisticsImpl(const proto::StripeStatistics& stripeStats, + const StatContext& statContext) { + for(int i = 0; i < stripeStats.colstats_size(); i++) { + colStats.push_back( + convertColumnStatistics(stripeStats.colstats(i), statContext)); + } + } + + StatisticsImpl::StatisticsImpl(const proto::Footer& footer, + const StatContext& statContext) { + for(int i = 0; i < footer.statistics_size(); i++) { + colStats.push_back( + convertColumnStatistics(footer.statistics(i), statContext)); + } + } + + StatisticsImpl::~StatisticsImpl() { + for(std::vector<ColumnStatistics*>::iterator ptr = colStats.begin(); + ptr != colStats.end(); + ++ptr) { + delete *ptr; + } + } + + Statistics::~Statistics() { + // PASS + } + + StripeStatistics::~StripeStatistics() { + // PASS + } + + StripeStatisticsImpl::~StripeStatisticsImpl() { + // PASS + } + + StripeStatisticsImpl::StripeStatisticsImpl( + const proto::StripeStatistics& stripeStats, + std::vector<std::vector<proto::ColumnStatistics> >& indexStats, + const StatContext& statContext) { + columnStats.reset(new StatisticsImpl(stripeStats, statContext)); + rowIndexStats.resize(indexStats.size()); + for(size_t i = 0; i < rowIndexStats.size(); i++) { + for(size_t j = 0; j < indexStats[i].size(); j++) { + rowIndexStats[i].push_back( + std::shared_ptr<const ColumnStatistics>( + convertColumnStatistics(indexStats[i][j], statContext))); + } + } + } + + + ColumnStatistics::~ColumnStatistics() { + // PASS + } + + BinaryColumnStatistics::~BinaryColumnStatistics() { + // PASS + } + + BooleanColumnStatistics::~BooleanColumnStatistics() { + // PASS + } + + DateColumnStatistics::~DateColumnStatistics() { + // PASS + } + + DecimalColumnStatistics::~DecimalColumnStatistics() { + // PASS + } + + DoubleColumnStatistics::~DoubleColumnStatistics() { + // PASS + } + + IntegerColumnStatistics::~IntegerColumnStatistics() { + // PASS + } + + StringColumnStatistics::~StringColumnStatistics() { + // PASS + } + + TimestampColumnStatistics::~TimestampColumnStatistics() { + // PASS + } + + MutableColumnStatistics::~MutableColumnStatistics() { + // PASS + } + + ColumnStatisticsImpl::~ColumnStatisticsImpl() { + // PASS + } + + BinaryColumnStatisticsImpl::~BinaryColumnStatisticsImpl() { + // PASS + } + + BooleanColumnStatisticsImpl::~BooleanColumnStatisticsImpl() { + // PASS + } + + DateColumnStatisticsImpl::~DateColumnStatisticsImpl() { + // PASS + } + + DecimalColumnStatisticsImpl::~DecimalColumnStatisticsImpl() { + // 
PASS + } + + DoubleColumnStatisticsImpl::~DoubleColumnStatisticsImpl() { + // PASS + } + + IntegerColumnStatisticsImpl::~IntegerColumnStatisticsImpl() { + // PASS + } + + StringColumnStatisticsImpl::~StringColumnStatisticsImpl() { + // PASS + } + + TimestampColumnStatisticsImpl::~TimestampColumnStatisticsImpl() { + // PASS + } + + ColumnStatisticsImpl::ColumnStatisticsImpl + (const proto::ColumnStatistics& pb) { + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + } + + BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl + (const proto::ColumnStatistics& pb, const StatContext& statContext){ + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (pb.has_binarystatistics() && statContext.correctStats) { + _stats.setHasTotalLength(pb.binarystatistics().has_sum()); + _stats.setTotalLength( + static_cast<uint64_t>(pb.binarystatistics().sum())); + } + } + + BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl + (const proto::ColumnStatistics& pb, const StatContext& statContext){ + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (pb.has_bucketstatistics() && statContext.correctStats) { + _hasCount = true; + _trueCount = pb.bucketstatistics().count(0); + } else { + _hasCount = false; + _trueCount = 0; + } + } + + DateColumnStatisticsImpl::DateColumnStatisticsImpl + (const proto::ColumnStatistics& pb, const StatContext& statContext){ + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (!pb.has_datestatistics() || !statContext.correctStats) { + // hasMinimum_ is false by default; + // hasMaximum_ is false by default; + _stats.setMinimum(0); + _stats.setMaximum(0); + } else { + _stats.setHasMinimum(pb.datestatistics().has_minimum()); + _stats.setHasMaximum(pb.datestatistics().has_maximum()); + _stats.setMinimum(pb.datestatistics().minimum()); + _stats.setMaximum(pb.datestatistics().maximum()); + } + } + + DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl + (const proto::ColumnStatistics& pb, const StatContext& statContext){ + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (pb.has_decimalstatistics() && statContext.correctStats) { + const proto::DecimalStatistics& stats = pb.decimalstatistics(); + _stats.setHasMinimum(stats.has_minimum()); + _stats.setHasMaximum(stats.has_maximum()); + _stats.setHasSum(stats.has_sum()); + + _stats.setMinimum(Decimal(stats.minimum())); + _stats.setMaximum(Decimal(stats.maximum())); + _stats.setSum(Decimal(stats.sum())); + } + } + + DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl + (const proto::ColumnStatistics& pb){ + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (!pb.has_doublestatistics()) { + _stats.setMinimum(0); + _stats.setMaximum(0); + _stats.setSum(0); + }else{ + const proto::DoubleStatistics& stats = pb.doublestatistics(); + _stats.setHasMinimum(stats.has_minimum()); + _stats.setHasMaximum(stats.has_maximum()); + _stats.setHasSum(stats.has_sum()); + + _stats.setMinimum(stats.minimum()); + _stats.setMaximum(stats.maximum()); + _stats.setSum(stats.sum()); + } + } + + IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl + (const proto::ColumnStatistics& pb){ + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (!pb.has_intstatistics()) { + _stats.setMinimum(0); + _stats.setMaximum(0); + _stats.setSum(0); + }else{ + const proto::IntegerStatistics& stats = pb.intstatistics(); + 
_stats.setHasMinimum(stats.has_minimum()); + _stats.setHasMaximum(stats.has_maximum()); + _stats.setHasSum(stats.has_sum()); + + _stats.setMinimum(stats.minimum()); + _stats.setMaximum(stats.maximum()); + _stats.setSum(stats.sum()); + } + } + + StringColumnStatisticsImpl::StringColumnStatisticsImpl + (const proto::ColumnStatistics& pb, const StatContext& statContext){ + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (!pb.has_stringstatistics() || !statContext.correctStats) { + _stats.setTotalLength(0); + }else{ + const proto::StringStatistics& stats = pb.stringstatistics(); + _stats.setHasMinimum(stats.has_minimum()); + _stats.setHasMaximum(stats.has_maximum()); + _stats.setHasTotalLength(stats.has_sum()); + + _stats.setMinimum(stats.minimum()); + _stats.setMaximum(stats.maximum()); + _stats.setTotalLength(static_cast<uint64_t>(stats.sum())); + } + } + + TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl + (const proto::ColumnStatistics& pb, const StatContext& statContext) { + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (!pb.has_timestampstatistics() || !statContext.correctStats) { + _stats.setMinimum(0); + _stats.setMaximum(0); + _lowerBound = 0; + _upperBound = 0; + }else{ + const proto::TimestampStatistics& stats = pb.timestampstatistics(); + _stats.setHasMinimum( + stats.has_minimumutc() || + (stats.has_minimum() && (statContext.writerTimezone != nullptr))); + _stats.setHasMaximum( + stats.has_maximumutc() || + (stats.has_maximum() && (statContext.writerTimezone != nullptr))); + _hasLowerBound = stats.has_minimumutc() || stats.has_minimum(); + _hasUpperBound = stats.has_maximumutc() || stats.has_maximum(); + + // Timestamp stats are stored in milliseconds + if (stats.has_minimumutc()) { + int64_t minimum = stats.minimumutc(); + _stats.setMinimum(minimum); + _lowerBound = minimum; + } else if (statContext.writerTimezone) { + int64_t writerTimeSec = stats.minimum() / 1000; + // multiply the offset by 1000 to convert to millisecond + int64_t minimum = + stats.minimum() + + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) + * 1000; + _stats.setMinimum(minimum); + _lowerBound = minimum; + } else { + _stats.setMinimum(0); + // subtract 1 day 1 hour (25 hours) in milliseconds to handle unknown + // TZ and daylight savings + _lowerBound = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000); + } + + // Timestamp stats are stored in milliseconds + if (stats.has_maximumutc()) { + int64_t maximum = stats.maximumutc(); + _stats.setMaximum(maximum); + _upperBound = maximum; + } else if (statContext.writerTimezone) { + int64_t writerTimeSec = stats.maximum() / 1000; + // multiply the offset by 1000 to convert to millisecond + int64_t maximum = stats.maximum() + + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) + * 1000; + _stats.setMaximum(maximum); + _upperBound = maximum; + } else { + _stats.setMaximum(0); + // add 1 day 1 hour (25 hours) in milliseconds to handle unknown + // TZ and daylight savings + _upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000); + } + // Add 1 millisecond to account for microsecond precision of values + _upperBound += 1; + } + } + + std::unique_ptr<MutableColumnStatistics> createColumnStatistics( + const Type& type) { + switch (static_cast<int64_t>(type.getKind())) { + case BOOLEAN: + return std::unique_ptr<MutableColumnStatistics>( + new BooleanColumnStatisticsImpl()); + case BYTE: + case INT: + case LONG: + case SHORT: + return 
std::unique_ptr<MutableColumnStatistics>( + new IntegerColumnStatisticsImpl()); + case STRUCT: + case MAP: + case LIST: + case UNION: + return std::unique_ptr<MutableColumnStatistics>( + new ColumnStatisticsImpl()); + case FLOAT: + case DOUBLE: + return std::unique_ptr<MutableColumnStatistics>( + new DoubleColumnStatisticsImpl()); + case BINARY: + return std::unique_ptr<MutableColumnStatistics>( + new BinaryColumnStatisticsImpl()); + case STRING: + case CHAR: + case VARCHAR: + return std::unique_ptr<MutableColumnStatistics>( + new StringColumnStatisticsImpl()); + case DATE: + return std::unique_ptr<MutableColumnStatistics>( + new DateColumnStatisticsImpl()); + case TIMESTAMP: + return std::unique_ptr<MutableColumnStatistics>( + new TimestampColumnStatisticsImpl()); + case DECIMAL: + return std::unique_ptr<MutableColumnStatistics>( + new DecimalColumnStatisticsImpl()); + default: + throw NotImplementedYet("Not supported type: " + type.toString()); + } + } + +}// namespace diff --git a/contrib/libs/apache/orc/c++/src/Statistics.hh b/contrib/libs/apache/orc/c++/src/Statistics.hh index ee9db23f86..849019d8d7 100644 --- a/contrib/libs/apache/orc/c++/src/Statistics.hh +++ b/contrib/libs/apache/orc/c++/src/Statistics.hh @@ -1,971 +1,971 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
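// createColumnStatistics above maps an ORC Type kind to the matching mutable
// statistics implementation (all integer kinds share one class, compound types
// fall back to the generic ColumnStatisticsImpl, anything else throws
// NotImplementedYet). A short usage sketch, assuming the orc headers are in
// scope and 'column' is a Type obtained elsewhere (e.g. from a file schema):
void collectIntoProto(const Type& column, proto::ColumnStatistics& pb) {
  std::unique_ptr<MutableColumnStatistics> stats =
      createColumnStatistics(column);   // throws for unsupported kinds
  stats->increase(1);                   // count one observed value
  stats->setHasNull(false);
  stats->toProtoBuf(pb);                // serialize the counters back to protobuf
}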
- */ - -#ifndef ORC_STATISTICS_IMPL_HH -#define ORC_STATISTICS_IMPL_HH - -#include "orc/Common.hh" -#include "orc/Int128.hh" -#include "orc/OrcFile.hh" -#include "orc/Reader.hh" - -#include "Timezone.hh" -#include "TypeImpl.hh" - -namespace orc { - -/** - * StatContext contains fields required to compute statistics - */ - - struct StatContext { - const bool correctStats; - const Timezone* const writerTimezone; - StatContext() : correctStats(false), writerTimezone(nullptr) {} - StatContext(bool cStat, const Timezone* const timezone = nullptr) : - correctStats(cStat), writerTimezone(timezone) {} - }; - -/** - * Internal Statistics Implementation - */ - - template <typename T> - class InternalStatisticsImpl { - private: - bool _hasNull; - bool _hasMinimum; - bool _hasMaximum; - bool _hasSum; - bool _hasTotalLength; - uint64_t _totalLength; - uint64_t _valueCount; - T _minimum; - T _maximum; - T _sum; - public: - InternalStatisticsImpl() { - _hasNull = false; - _hasMinimum = false; - _hasMaximum = false; - _hasSum = false; - _hasTotalLength = false; - _totalLength = 0; - _valueCount = 0; - } - - ~InternalStatisticsImpl() {} - - // GET / SET _totalLength - bool hasTotalLength() const { return _hasTotalLength; } - - void setHasTotalLength(bool hasTotalLength) { - _hasTotalLength = hasTotalLength; - } - - uint64_t getTotalLength() const { return _totalLength; } - - void setTotalLength(uint64_t totalLength) { _totalLength = totalLength; } - - // GET / SET _sum - bool hasSum() const { return _hasSum; } - - void setHasSum(bool hasSum) { _hasSum = hasSum; } - - T getSum() const { return _sum; } - - void setSum(T sum) { _sum = sum; } - - // GET / SET _maximum - bool hasMaximum() const { return _hasMaximum; } - +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_STATISTICS_IMPL_HH +#define ORC_STATISTICS_IMPL_HH + +#include "orc/Common.hh" +#include "orc/Int128.hh" +#include "orc/OrcFile.hh" +#include "orc/Reader.hh" + +#include "Timezone.hh" +#include "TypeImpl.hh" + +namespace orc { + +/** + * StatContext contains fields required to compute statistics + */ + + struct StatContext { + const bool correctStats; + const Timezone* const writerTimezone; + StatContext() : correctStats(false), writerTimezone(nullptr) {} + StatContext(bool cStat, const Timezone* const timezone = nullptr) : + correctStats(cStat), writerTimezone(timezone) {} + }; + +/** + * Internal Statistics Implementation + */ + + template <typename T> + class InternalStatisticsImpl { + private: + bool _hasNull; + bool _hasMinimum; + bool _hasMaximum; + bool _hasSum; + bool _hasTotalLength; + uint64_t _totalLength; + uint64_t _valueCount; + T _minimum; + T _maximum; + T _sum; + public: + InternalStatisticsImpl() { + _hasNull = false; + _hasMinimum = false; + _hasMaximum = false; + _hasSum = false; + _hasTotalLength = false; + _totalLength = 0; + _valueCount = 0; + } + + ~InternalStatisticsImpl() {} + + // GET / SET _totalLength + bool hasTotalLength() const { return _hasTotalLength; } + + void setHasTotalLength(bool hasTotalLength) { + _hasTotalLength = hasTotalLength; + } + + uint64_t getTotalLength() const { return _totalLength; } + + void setTotalLength(uint64_t totalLength) { _totalLength = totalLength; } + + // GET / SET _sum + bool hasSum() const { return _hasSum; } + + void setHasSum(bool hasSum) { _hasSum = hasSum; } + + T getSum() const { return _sum; } + + void setSum(T sum) { _sum = sum; } + + // GET / SET _maximum + bool hasMaximum() const { return _hasMaximum; } + const T & getMaximum() const { return _maximum; } - - void setHasMaximum(bool hasMax) { _hasMaximum = hasMax; } - - void setMaximum(T max) { _maximum = max; } - - // GET / SET _minimum - bool hasMinimum() const { return _hasMinimum; } - - void setHasMinimum(bool hasMin) { _hasMinimum = hasMin; } - + + void setHasMaximum(bool hasMax) { _hasMaximum = hasMax; } + + void setMaximum(T max) { _maximum = max; } + + // GET / SET _minimum + bool hasMinimum() const { return _hasMinimum; } + + void setHasMinimum(bool hasMin) { _hasMinimum = hasMin; } + const T & getMinimum() const { return _minimum; } - - void setMinimum(T min) { _minimum = min; } - - // GET / SET _valueCount - uint64_t getNumberOfValues() const { return _valueCount; } - - void setNumberOfValues(uint64_t numValues) { _valueCount = numValues; } - - // GET / SET _hasNullValue - bool hasNull() const { return _hasNull; } - - void setHasNull(bool hasNull) { _hasNull = hasNull; } - - void reset() { - _hasNull = false; - _hasMinimum = false; - _hasMaximum = false; - _hasSum = false; - _hasTotalLength = false; - _totalLength = 0; - _valueCount = 0; - } - - void updateMinMax(T value) { - if (!_hasMinimum) { - _hasMinimum = _hasMaximum = true; - _minimum = _maximum = value; - } else if (compare(value, _minimum)) { - _minimum = value; - } else if (compare(_maximum, value)) { - _maximum = value; - } - } - - // sum is not merged here as we need to check overflow - void merge(const InternalStatisticsImpl& other) { - _hasNull = _hasNull || other._hasNull; - _valueCount += other._valueCount; - - if (other._hasMinimum) { - if (!_hasMinimum) { - _hasMinimum = _hasMaximum = true; - _minimum = other._minimum; - _maximum = other._maximum; - } else { - // all template types should support operator< - if (compare(_maximum, other._maximum)) { - _maximum = 
other._maximum; - } - if (compare(other._minimum, _minimum)) { - _minimum = other._minimum; - } - } - } - - _hasTotalLength = _hasTotalLength && other._hasTotalLength; - _totalLength += other._totalLength; - } - }; - - typedef InternalStatisticsImpl<char> InternalCharStatistics; - typedef InternalStatisticsImpl<char> InternalBooleanStatistics; - typedef InternalStatisticsImpl<int64_t> InternalIntegerStatistics; - typedef InternalStatisticsImpl<int32_t> InternalDateStatistics; - typedef InternalStatisticsImpl<double> InternalDoubleStatistics; - typedef InternalStatisticsImpl<Decimal> InternalDecimalStatistics; - typedef InternalStatisticsImpl<std::string> InternalStringStatistics; - - /** - * Mutable column statistics for use by the writer. - */ - class MutableColumnStatistics { - public: - virtual ~MutableColumnStatistics(); - - virtual void increase(uint64_t count) = 0; - - virtual void setNumberOfValues(uint64_t value) = 0; - - virtual void setHasNull(bool hasNull) = 0; - - virtual void merge(const MutableColumnStatistics& other) = 0; - - virtual void reset() = 0; - - virtual void toProtoBuf(proto::ColumnStatistics& pbStats) const = 0; - }; - -/** - * ColumnStatistics Implementation - */ - - class ColumnStatisticsImpl: public ColumnStatistics, - public MutableColumnStatistics { - private: - InternalCharStatistics _stats; - public: - ColumnStatisticsImpl() { reset(); } - ColumnStatisticsImpl(const proto::ColumnStatistics& stats); - virtual ~ColumnStatisticsImpl() override; - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - void merge(const MutableColumnStatistics& other) override { - _stats.merge(dynamic_cast<const ColumnStatisticsImpl&>(other)._stats); - } - - void reset() override { - _stats.reset(); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Column has " << getNumberOfValues() << " values" - << " and has null value: " << (hasNull() ? 
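// InternalStatisticsImpl above tracks min/max through the compare() helper and
// deliberately leaves the sum out of merge() so callers can add it with their
// own overflow checks. A tiny usage sketch with the int64_t instantiation,
// assuming the declarations in this header are in scope; the function name is
// illustrative.
inline void minMaxMergeExample() {
  InternalStatisticsImpl<int64_t> a;
  InternalStatisticsImpl<int64_t> b;
  a.updateMinMax(7);
  a.updateMinMax(-2);   // a: minimum -2, maximum 7
  b.updateMinMax(40);
  a.merge(b);           // a: minimum -2, maximum 40; sums are not combined here
}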
"yes" : "no") - << std::endl; - return buffer.str(); - } - }; - - class BinaryColumnStatisticsImpl: public BinaryColumnStatistics, - public MutableColumnStatistics { - private: - InternalCharStatistics _stats; - public: - BinaryColumnStatisticsImpl() { reset(); } - BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats, - const StatContext& statContext); - virtual ~BinaryColumnStatisticsImpl() override; - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - bool hasTotalLength() const override { - return _stats.hasTotalLength(); - } - - uint64_t getTotalLength() const override { - if(hasTotalLength()){ - return _stats.getTotalLength(); - }else{ - throw ParseError("Total length is not defined."); - } - } - - void setTotalLength(uint64_t length) { - _stats.setHasTotalLength(true); - _stats.setTotalLength(length); - } - - void update(size_t length) { - _stats.setTotalLength(_stats.getTotalLength() + length); - } - - void merge(const MutableColumnStatistics& other) override { - const BinaryColumnStatisticsImpl& binStats = - dynamic_cast<const BinaryColumnStatisticsImpl&>(other); - _stats.merge(binStats._stats); - } - - void reset() override { - _stats.reset(); - setTotalLength(0); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::BinaryStatistics* binStats = pbStats.mutable_binarystatistics(); - binStats->set_sum(static_cast<int64_t>(_stats.getTotalLength())); - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Data type: Binary" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; - if(hasTotalLength()){ - buffer << "Total length: " << getTotalLength() << std::endl; - }else{ - buffer << "Total length: not defined" << std::endl; - } - return buffer.str(); - } - }; - - class BooleanColumnStatisticsImpl: public BooleanColumnStatistics, - public MutableColumnStatistics { - private: - InternalBooleanStatistics _stats; - bool _hasCount; - uint64_t _trueCount; - - public: - BooleanColumnStatisticsImpl() { reset(); } - BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats, - const StatContext& statContext); - virtual ~BooleanColumnStatisticsImpl() override; - - bool hasCount() const override { - return _hasCount; - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - _hasCount = true; - } - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - uint64_t getFalseCount() const override { - if(hasCount()){ - return getNumberOfValues() - _trueCount; - }else{ - throw ParseError("False count is not defined."); - } - } - - uint64_t getTrueCount() const override { - if(hasCount()){ - return _trueCount; - }else{ - throw ParseError("True count is not defined."); - } - } - - void setTrueCount(uint64_t trueCount) { - _hasCount = true; - _trueCount = trueCount; - } - - void update(bool value, size_t repetitions) { - if (value) { - _trueCount += repetitions; - } - } - - void merge(const MutableColumnStatistics& other) override { - const BooleanColumnStatisticsImpl& boolStats = - dynamic_cast<const BooleanColumnStatisticsImpl&>(other); - _stats.merge(boolStats._stats); - _hasCount = _hasCount && boolStats._hasCount; - _trueCount += boolStats._trueCount; - } - - void reset() override { - _stats.reset(); - setTrueCount(0); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::BucketStatistics* bucketStats = pbStats.mutable_bucketstatistics(); - if (_hasCount) { - bucketStats->add_count(_trueCount); - } else { - bucketStats->clear_count(); - } - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Data type: Boolean" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; - if(hasCount()){ - buffer << "(true: " << getTrueCount() << "; false: " - << getFalseCount() << ")" << std::endl; - } else { - buffer << "(true: not defined; false: not defined)" << std::endl; - buffer << "True and false counts are not defined" << std::endl; - } - return buffer.str(); - } - }; - - class DateColumnStatisticsImpl: public DateColumnStatistics, - public MutableColumnStatistics{ - private: - InternalDateStatistics _stats; - public: - DateColumnStatisticsImpl() { reset(); } - DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, - const StatContext& statContext); - virtual ~DateColumnStatisticsImpl() override; - - bool hasMinimum() const override { - return _stats.hasMinimum(); - } - - bool hasMaximum() const override { - return _stats.hasMaximum(); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - int32_t getMinimum() const override { - if(hasMinimum()){ - return _stats.getMinimum(); - }else{ - throw ParseError("Minimum is not defined."); - } - } - - int32_t getMaximum() const override { - if(hasMaximum()){ - return _stats.getMaximum(); - }else{ - throw ParseError("Maximum is not defined."); - } - } - - void setMinimum(int32_t minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); - } - - void setMaximum(int32_t maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); - } - - void update(int32_t value) { - _stats.updateMinMax(value); - } - - void merge(const MutableColumnStatistics& other) override { - const DateColumnStatisticsImpl& dateStats = - dynamic_cast<const DateColumnStatisticsImpl&>(other); - _stats.merge(dateStats._stats); - } - - void reset() override { - _stats.reset(); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::DateStatistics* dateStatistics = - pbStats.mutable_datestatistics(); - if (_stats.hasMinimum()) { - dateStatistics->set_maximum(_stats.getMaximum()); - dateStatistics->set_minimum(_stats.getMinimum()); - } else { - dateStatistics->clear_minimum(); - dateStatistics->clear_maximum(); - } - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Data type: Date" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; - if(hasMinimum()){ - buffer << "Minimum: " << getMinimum() << std::endl; - }else{ - buffer << "Minimum: not defined" << std::endl; - } - - if(hasMaximum()){ - buffer << "Maximum: " << getMaximum() << std::endl; - }else{ - buffer << "Maximum: not defined" << std::endl; - } - return buffer.str(); - } - }; - - class DecimalColumnStatisticsImpl: public DecimalColumnStatistics, - public MutableColumnStatistics { - private: - InternalDecimalStatistics _stats; - - public: - DecimalColumnStatisticsImpl() { reset(); } - DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats, - const StatContext& statContext); - virtual ~DecimalColumnStatisticsImpl() override; - - bool hasMinimum() const override { - return _stats.hasMinimum(); - } - - bool hasMaximum() const override { - return _stats.hasMaximum(); - } - - bool hasSum() const override { - return _stats.hasSum(); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - Decimal getMinimum() const override { - if(hasMinimum()){ - return _stats.getMinimum(); - }else{ - throw ParseError("Minimum is not defined."); - } - } - - Decimal getMaximum() const override { - if(hasMaximum()){ - return _stats.getMaximum(); - }else{ - throw ParseError("Maximum is not defined."); - } - } - - void setMinimum(Decimal minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); - } - - void setMaximum(Decimal maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); - } - - Decimal getSum() const override { - if(hasSum()){ - return _stats.getSum(); - }else{ - throw ParseError("Sum is not defined."); - } - } - - void setSum(Decimal sum) { - _stats.setHasSum(true); - _stats.setSum(sum); - } - - void update(const Decimal& value) { - _stats.updateMinMax(value); - - if (_stats.hasSum()) { - updateSum(value); - } - } - - void merge(const MutableColumnStatistics& other) override { - const DecimalColumnStatisticsImpl& decStats = - dynamic_cast<const DecimalColumnStatisticsImpl&>(other); - - _stats.merge(decStats._stats); - - _stats.setHasSum(_stats.hasSum() && decStats.hasSum()); - if (_stats.hasSum()) { - updateSum(decStats.getSum()); - } - } - - void reset() override { - _stats.reset(); - setSum(Decimal()); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::DecimalStatistics* decStats = pbStats.mutable_decimalstatistics(); - if (_stats.hasMinimum()) { - decStats->set_minimum(TString(_stats.getMinimum().toString())); - decStats->set_maximum(TString(_stats.getMaximum().toString())); - } else { - decStats->clear_minimum(); - decStats->clear_maximum(); - } - if (_stats.hasSum()) { - decStats->set_sum(TString(_stats.getSum().toString())); - } else { - decStats->clear_sum(); - } - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Data type: Decimal" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; - if(hasMinimum()){ - buffer << "Minimum: " << getMinimum().toString() << std::endl; - }else{ - buffer << "Minimum: not defined" << std::endl; - } - - if(hasMaximum()){ - buffer << "Maximum: " << getMaximum().toString() << std::endl; - }else{ - buffer << "Maximum: not defined" << std::endl; - } - - if(hasSum()){ - buffer << "Sum: " << getSum().toString() << std::endl; - }else{ - buffer << "Sum: not defined" << std::endl; - } - - return buffer.str(); - } - - private: - void updateSum(Decimal value) { - if (_stats.hasSum()) { - bool overflow = false; - Decimal sum = _stats.getSum(); - if (sum.scale > value.scale) { - value.value = scaleUpInt128ByPowerOfTen(value.value, - sum.scale - value.scale, - overflow); - } else if (sum.scale < value.scale) { - sum.value = scaleUpInt128ByPowerOfTen(sum.value, - value.scale - sum.scale, - overflow); - sum.scale = value.scale; - } - - if (!overflow) { - bool wasPositive = sum.value >= 0; - sum.value += value.value; - if ((value.value >= 0) == wasPositive) { - _stats.setHasSum((sum.value >= 0) == wasPositive); - } - } else { - _stats.setHasSum(false); - } - - if (_stats.hasSum()) { - _stats.setSum(sum); - } - } - } - }; - - class DoubleColumnStatisticsImpl: public DoubleColumnStatistics, - public MutableColumnStatistics { - private: - InternalDoubleStatistics _stats; - public: - DoubleColumnStatisticsImpl() { reset(); } - DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats); - virtual ~DoubleColumnStatisticsImpl() override; - - bool hasMinimum() const override { - return _stats.hasMinimum(); - } - - bool hasMaximum() const override { - return _stats.hasMaximum(); - } - - bool hasSum() const override { - return _stats.hasSum(); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - double getMinimum() const override { - if(hasMinimum()){ - return _stats.getMinimum(); - }else{ - throw ParseError("Minimum is not defined."); - } - } - - double getMaximum() const override { - if(hasMaximum()){ - return _stats.getMaximum(); - }else{ - throw ParseError("Maximum is not defined."); - } - } - - void setMinimum(double minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); - } - - void setMaximum(double maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); - } - - double getSum() const override { - if(hasSum()){ - return _stats.getSum(); - }else{ - throw ParseError("Sum is not defined."); - } - } - - void setSum(double sum) { - _stats.setHasSum(true); - _stats.setSum(sum); - } - - void update(double value) { - _stats.updateMinMax(value); - _stats.setSum(_stats.getSum() + value); - } - - void merge(const MutableColumnStatistics& other) override { - const DoubleColumnStatisticsImpl& doubleStats = - dynamic_cast<const DoubleColumnStatisticsImpl&>(other); - _stats.merge(doubleStats._stats); - - _stats.setHasSum(_stats.hasSum() && doubleStats.hasSum()); - if (_stats.hasSum()) { - _stats.setSum(_stats.getSum() + doubleStats.getSum()); - } - } - - void reset() override { - _stats.reset(); - setSum(0.0); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - 
pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::DoubleStatistics* doubleStats = pbStats.mutable_doublestatistics(); - if (_stats.hasMinimum()) { - doubleStats->set_minimum(_stats.getMinimum()); - doubleStats->set_maximum(_stats.getMaximum()); - } else { - doubleStats->clear_minimum(); - doubleStats->clear_maximum(); - } - if (_stats.hasSum()) { - doubleStats->set_sum(_stats.getSum()); - } else { - doubleStats->clear_sum(); - } - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Data type: Double" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; - if(hasMinimum()){ - buffer << "Minimum: " << getMinimum() << std::endl; - }else{ - buffer << "Minimum: not defined" << std::endl; - } - - if(hasMaximum()){ - buffer << "Maximum: " << getMaximum() << std::endl; - }else{ - buffer << "Maximum: not defined" << std::endl; - } - - if(hasSum()){ - buffer << "Sum: " << getSum() << std::endl; - }else{ - buffer << "Sum: not defined" << std::endl; - } - return buffer.str(); - } - }; - - class IntegerColumnStatisticsImpl: public IntegerColumnStatistics, - public MutableColumnStatistics { - private: - InternalIntegerStatistics _stats; - public: - IntegerColumnStatisticsImpl() { reset(); } - IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats); - virtual ~IntegerColumnStatisticsImpl() override; - - bool hasMinimum() const override { - return _stats.hasMinimum(); - } - - bool hasMaximum() const override { - return _stats.hasMaximum(); - } - - bool hasSum() const override { - return _stats.hasSum(); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - int64_t getMinimum() const override { - if(hasMinimum()){ - return _stats.getMinimum(); - }else{ - throw ParseError("Minimum is not defined."); - } - } - - int64_t getMaximum() const override { - if(hasMaximum()){ - return _stats.getMaximum(); - }else{ - throw ParseError("Maximum is not defined."); - } - } - - void setMinimum(int64_t minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); - } - - void setMaximum(int64_t maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); - } - - int64_t getSum() const override { - if(hasSum()){ - return _stats.getSum(); - }else{ - throw ParseError("Sum is not defined."); - } - } - - void setSum(int64_t sum) { - _stats.setHasSum(true); - _stats.setSum(sum); - } - + + void setMinimum(T min) { _minimum = min; } + + // GET / SET _valueCount + uint64_t getNumberOfValues() const { return _valueCount; } + + void setNumberOfValues(uint64_t numValues) { _valueCount = numValues; } + + // GET / SET _hasNullValue + bool hasNull() const { return _hasNull; } + + void setHasNull(bool hasNull) { _hasNull = hasNull; } + + void reset() { + _hasNull = false; + _hasMinimum = false; + _hasMaximum = false; + _hasSum = false; + _hasTotalLength = false; + _totalLength = 0; + _valueCount = 0; + } + + void updateMinMax(T value) { + if (!_hasMinimum) { + _hasMinimum = _hasMaximum = true; + _minimum = _maximum = value; + } else if (compare(value, 
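// DecimalColumnStatisticsImpl::updateSum earlier in this hunk aligns the two
// operands' scales with scaleUpInt128ByPowerOfTen and then detects overflow by
// checking whether adding two same-signed values flipped the sign; on overflow
// the sum is simply dropped (hasSum becomes false). The same sign test, shown
// here self-contained for int64_t instead of the Int128 value; the function
// name is illustrative.
#include <cstdint>

inline bool addWithSignCheck(int64_t a, int64_t b, int64_t& out) {
  bool wasPositive = a >= 0;
  // unsigned addition wraps instead of invoking signed-overflow UB
  int64_t sum = static_cast<int64_t>(static_cast<uint64_t>(a) +
                                     static_cast<uint64_t>(b));
  if ((b >= 0) == wasPositive && (sum >= 0) != wasPositive) {
    return false;   // overflow: caller should drop the sum, like setHasSum(false)
  }
  out = sum;
  return true;
}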
_minimum)) { + _minimum = value; + } else if (compare(_maximum, value)) { + _maximum = value; + } + } + + // sum is not merged here as we need to check overflow + void merge(const InternalStatisticsImpl& other) { + _hasNull = _hasNull || other._hasNull; + _valueCount += other._valueCount; + + if (other._hasMinimum) { + if (!_hasMinimum) { + _hasMinimum = _hasMaximum = true; + _minimum = other._minimum; + _maximum = other._maximum; + } else { + // all template types should support operator< + if (compare(_maximum, other._maximum)) { + _maximum = other._maximum; + } + if (compare(other._minimum, _minimum)) { + _minimum = other._minimum; + } + } + } + + _hasTotalLength = _hasTotalLength && other._hasTotalLength; + _totalLength += other._totalLength; + } + }; + + typedef InternalStatisticsImpl<char> InternalCharStatistics; + typedef InternalStatisticsImpl<char> InternalBooleanStatistics; + typedef InternalStatisticsImpl<int64_t> InternalIntegerStatistics; + typedef InternalStatisticsImpl<int32_t> InternalDateStatistics; + typedef InternalStatisticsImpl<double> InternalDoubleStatistics; + typedef InternalStatisticsImpl<Decimal> InternalDecimalStatistics; + typedef InternalStatisticsImpl<std::string> InternalStringStatistics; + + /** + * Mutable column statistics for use by the writer. + */ + class MutableColumnStatistics { + public: + virtual ~MutableColumnStatistics(); + + virtual void increase(uint64_t count) = 0; + + virtual void setNumberOfValues(uint64_t value) = 0; + + virtual void setHasNull(bool hasNull) = 0; + + virtual void merge(const MutableColumnStatistics& other) = 0; + + virtual void reset() = 0; + + virtual void toProtoBuf(proto::ColumnStatistics& pbStats) const = 0; + }; + +/** + * ColumnStatistics Implementation + */ + + class ColumnStatisticsImpl: public ColumnStatistics, + public MutableColumnStatistics { + private: + InternalCharStatistics _stats; + public: + ColumnStatisticsImpl() { reset(); } + ColumnStatisticsImpl(const proto::ColumnStatistics& stats); + virtual ~ColumnStatisticsImpl() override; + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + void merge(const MutableColumnStatistics& other) override { + _stats.merge(dynamic_cast<const ColumnStatisticsImpl&>(other)._stats); + } + + void reset() override { + _stats.reset(); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Column has " << getNumberOfValues() << " values" + << " and has null value: " << (hasNull() ? 
"yes" : "no") + << std::endl; + return buffer.str(); + } + }; + + class BinaryColumnStatisticsImpl: public BinaryColumnStatistics, + public MutableColumnStatistics { + private: + InternalCharStatistics _stats; + public: + BinaryColumnStatisticsImpl() { reset(); } + BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats, + const StatContext& statContext); + virtual ~BinaryColumnStatisticsImpl() override; + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + bool hasTotalLength() const override { + return _stats.hasTotalLength(); + } + + uint64_t getTotalLength() const override { + if(hasTotalLength()){ + return _stats.getTotalLength(); + }else{ + throw ParseError("Total length is not defined."); + } + } + + void setTotalLength(uint64_t length) { + _stats.setHasTotalLength(true); + _stats.setTotalLength(length); + } + + void update(size_t length) { + _stats.setTotalLength(_stats.getTotalLength() + length); + } + + void merge(const MutableColumnStatistics& other) override { + const BinaryColumnStatisticsImpl& binStats = + dynamic_cast<const BinaryColumnStatisticsImpl&>(other); + _stats.merge(binStats._stats); + } + + void reset() override { + _stats.reset(); + setTotalLength(0); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::BinaryStatistics* binStats = pbStats.mutable_binarystatistics(); + binStats->set_sum(static_cast<int64_t>(_stats.getTotalLength())); + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Data type: Binary" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; + if(hasTotalLength()){ + buffer << "Total length: " << getTotalLength() << std::endl; + }else{ + buffer << "Total length: not defined" << std::endl; + } + return buffer.str(); + } + }; + + class BooleanColumnStatisticsImpl: public BooleanColumnStatistics, + public MutableColumnStatistics { + private: + InternalBooleanStatistics _stats; + bool _hasCount; + uint64_t _trueCount; + + public: + BooleanColumnStatisticsImpl() { reset(); } + BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats, + const StatContext& statContext); + virtual ~BooleanColumnStatisticsImpl() override; + + bool hasCount() const override { + return _hasCount; + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + _hasCount = true; + } + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + uint64_t getFalseCount() const override { + if(hasCount()){ + return getNumberOfValues() - _trueCount; + }else{ + throw ParseError("False count is not defined."); + } + } + + uint64_t getTrueCount() const override { + if(hasCount()){ + return _trueCount; + }else{ + throw ParseError("True count is not defined."); + } + } + + void setTrueCount(uint64_t trueCount) { + _hasCount = true; + _trueCount = trueCount; + } + + void update(bool value, size_t repetitions) { + if (value) { + _trueCount += repetitions; + } + } + + void merge(const MutableColumnStatistics& other) override { + const BooleanColumnStatisticsImpl& boolStats = + dynamic_cast<const BooleanColumnStatisticsImpl&>(other); + _stats.merge(boolStats._stats); + _hasCount = _hasCount && boolStats._hasCount; + _trueCount += boolStats._trueCount; + } + + void reset() override { + _stats.reset(); + setTrueCount(0); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::BucketStatistics* bucketStats = pbStats.mutable_bucketstatistics(); + if (_hasCount) { + bucketStats->add_count(_trueCount); + } else { + bucketStats->clear_count(); + } + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Data type: Boolean" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; + if(hasCount()){ + buffer << "(true: " << getTrueCount() << "; false: " + << getFalseCount() << ")" << std::endl; + } else { + buffer << "(true: not defined; false: not defined)" << std::endl; + buffer << "True and false counts are not defined" << std::endl; + } + return buffer.str(); + } + }; + + class DateColumnStatisticsImpl: public DateColumnStatistics, + public MutableColumnStatistics{ + private: + InternalDateStatistics _stats; + public: + DateColumnStatisticsImpl() { reset(); } + DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, + const StatContext& statContext); + virtual ~DateColumnStatisticsImpl() override; + + bool hasMinimum() const override { + return _stats.hasMinimum(); + } + + bool hasMaximum() const override { + return _stats.hasMaximum(); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + int32_t getMinimum() const override { + if(hasMinimum()){ + return _stats.getMinimum(); + }else{ + throw ParseError("Minimum is not defined."); + } + } + + int32_t getMaximum() const override { + if(hasMaximum()){ + return _stats.getMaximum(); + }else{ + throw ParseError("Maximum is not defined."); + } + } + + void setMinimum(int32_t minimum) { + _stats.setHasMinimum(true); + _stats.setMinimum(minimum); + } + + void setMaximum(int32_t maximum) { + _stats.setHasMaximum(true); + _stats.setMaximum(maximum); + } + + void update(int32_t value) { + _stats.updateMinMax(value); + } + + void merge(const MutableColumnStatistics& other) override { + const DateColumnStatisticsImpl& dateStats = + dynamic_cast<const DateColumnStatisticsImpl&>(other); + _stats.merge(dateStats._stats); + } + + void reset() override { + _stats.reset(); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::DateStatistics* dateStatistics = + pbStats.mutable_datestatistics(); + if (_stats.hasMinimum()) { + dateStatistics->set_maximum(_stats.getMaximum()); + dateStatistics->set_minimum(_stats.getMinimum()); + } else { + dateStatistics->clear_minimum(); + dateStatistics->clear_maximum(); + } + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Data type: Date" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; + if(hasMinimum()){ + buffer << "Minimum: " << getMinimum() << std::endl; + }else{ + buffer << "Minimum: not defined" << std::endl; + } + + if(hasMaximum()){ + buffer << "Maximum: " << getMaximum() << std::endl; + }else{ + buffer << "Maximum: not defined" << std::endl; + } + return buffer.str(); + } + }; + + class DecimalColumnStatisticsImpl: public DecimalColumnStatistics, + public MutableColumnStatistics { + private: + InternalDecimalStatistics _stats; + + public: + DecimalColumnStatisticsImpl() { reset(); } + DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats, + const StatContext& statContext); + virtual ~DecimalColumnStatisticsImpl() override; + + bool hasMinimum() const override { + return _stats.hasMinimum(); + } + + bool hasMaximum() const override { + return _stats.hasMaximum(); + } + + bool hasSum() const override { + return _stats.hasSum(); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + Decimal getMinimum() const override { + if(hasMinimum()){ + return _stats.getMinimum(); + }else{ + throw ParseError("Minimum is not defined."); + } + } + + Decimal getMaximum() const override { + if(hasMaximum()){ + return _stats.getMaximum(); + }else{ + throw ParseError("Maximum is not defined."); + } + } + + void setMinimum(Decimal minimum) { + _stats.setHasMinimum(true); + _stats.setMinimum(minimum); + } + + void setMaximum(Decimal maximum) { + _stats.setHasMaximum(true); + _stats.setMaximum(maximum); + } + + Decimal getSum() const override { + if(hasSum()){ + return _stats.getSum(); + }else{ + throw ParseError("Sum is not defined."); + } + } + + void setSum(Decimal sum) { + _stats.setHasSum(true); + _stats.setSum(sum); + } + + void update(const Decimal& value) { + _stats.updateMinMax(value); + + if (_stats.hasSum()) { + updateSum(value); + } + } + + void merge(const MutableColumnStatistics& other) override { + const DecimalColumnStatisticsImpl& decStats = + dynamic_cast<const DecimalColumnStatisticsImpl&>(other); + + _stats.merge(decStats._stats); + + _stats.setHasSum(_stats.hasSum() && decStats.hasSum()); + if (_stats.hasSum()) { + updateSum(decStats.getSum()); + } + } + + void reset() override { + _stats.reset(); + setSum(Decimal()); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::DecimalStatistics* decStats = pbStats.mutable_decimalstatistics(); + if (_stats.hasMinimum()) { + decStats->set_minimum(TString(_stats.getMinimum().toString())); + decStats->set_maximum(TString(_stats.getMaximum().toString())); + } else { + decStats->clear_minimum(); + decStats->clear_maximum(); + } + if (_stats.hasSum()) { + decStats->set_sum(TString(_stats.getSum().toString())); + } else { + decStats->clear_sum(); + } + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Data type: Decimal" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; + if(hasMinimum()){ + buffer << "Minimum: " << getMinimum().toString() << std::endl; + }else{ + buffer << "Minimum: not defined" << std::endl; + } + + if(hasMaximum()){ + buffer << "Maximum: " << getMaximum().toString() << std::endl; + }else{ + buffer << "Maximum: not defined" << std::endl; + } + + if(hasSum()){ + buffer << "Sum: " << getSum().toString() << std::endl; + }else{ + buffer << "Sum: not defined" << std::endl; + } + + return buffer.str(); + } + + private: + void updateSum(Decimal value) { + if (_stats.hasSum()) { + bool overflow = false; + Decimal sum = _stats.getSum(); + if (sum.scale > value.scale) { + value.value = scaleUpInt128ByPowerOfTen(value.value, + sum.scale - value.scale, + overflow); + } else if (sum.scale < value.scale) { + sum.value = scaleUpInt128ByPowerOfTen(sum.value, + value.scale - sum.scale, + overflow); + sum.scale = value.scale; + } + + if (!overflow) { + bool wasPositive = sum.value >= 0; + sum.value += value.value; + if ((value.value >= 0) == wasPositive) { + _stats.setHasSum((sum.value >= 0) == wasPositive); + } + } else { + _stats.setHasSum(false); + } + + if (_stats.hasSum()) { + _stats.setSum(sum); + } + } + } + }; + + class DoubleColumnStatisticsImpl: public DoubleColumnStatistics, + public MutableColumnStatistics { + private: + InternalDoubleStatistics _stats; + public: + DoubleColumnStatisticsImpl() { reset(); } + DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats); + virtual ~DoubleColumnStatisticsImpl() override; + + bool hasMinimum() const override { + return _stats.hasMinimum(); + } + + bool hasMaximum() const override { + return _stats.hasMaximum(); + } + + bool hasSum() const override { + return _stats.hasSum(); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + double getMinimum() const override { + if(hasMinimum()){ + return _stats.getMinimum(); + }else{ + throw ParseError("Minimum is not defined."); + } + } + + double getMaximum() const override { + if(hasMaximum()){ + return _stats.getMaximum(); + }else{ + throw ParseError("Maximum is not defined."); + } + } + + void setMinimum(double minimum) { + _stats.setHasMinimum(true); + _stats.setMinimum(minimum); + } + + void setMaximum(double maximum) { + _stats.setHasMaximum(true); + _stats.setMaximum(maximum); + } + + double getSum() const override { + if(hasSum()){ + return _stats.getSum(); + }else{ + throw ParseError("Sum is not defined."); + } + } + + void setSum(double sum) { + _stats.setHasSum(true); + _stats.setSum(sum); + } + + void update(double value) { + _stats.updateMinMax(value); + _stats.setSum(_stats.getSum() + value); + } + + void merge(const MutableColumnStatistics& other) override { + const DoubleColumnStatisticsImpl& doubleStats = + dynamic_cast<const DoubleColumnStatisticsImpl&>(other); + _stats.merge(doubleStats._stats); + + _stats.setHasSum(_stats.hasSum() && doubleStats.hasSum()); + if (_stats.hasSum()) { + _stats.setSum(_stats.getSum() + doubleStats.getSum()); + } + } + + void reset() override { + _stats.reset(); + setSum(0.0); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + 
pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::DoubleStatistics* doubleStats = pbStats.mutable_doublestatistics(); + if (_stats.hasMinimum()) { + doubleStats->set_minimum(_stats.getMinimum()); + doubleStats->set_maximum(_stats.getMaximum()); + } else { + doubleStats->clear_minimum(); + doubleStats->clear_maximum(); + } + if (_stats.hasSum()) { + doubleStats->set_sum(_stats.getSum()); + } else { + doubleStats->clear_sum(); + } + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Data type: Double" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; + if(hasMinimum()){ + buffer << "Minimum: " << getMinimum() << std::endl; + }else{ + buffer << "Minimum: not defined" << std::endl; + } + + if(hasMaximum()){ + buffer << "Maximum: " << getMaximum() << std::endl; + }else{ + buffer << "Maximum: not defined" << std::endl; + } + + if(hasSum()){ + buffer << "Sum: " << getSum() << std::endl; + }else{ + buffer << "Sum: not defined" << std::endl; + } + return buffer.str(); + } + }; + + class IntegerColumnStatisticsImpl: public IntegerColumnStatistics, + public MutableColumnStatistics { + private: + InternalIntegerStatistics _stats; + public: + IntegerColumnStatisticsImpl() { reset(); } + IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats); + virtual ~IntegerColumnStatisticsImpl() override; + + bool hasMinimum() const override { + return _stats.hasMinimum(); + } + + bool hasMaximum() const override { + return _stats.hasMaximum(); + } + + bool hasSum() const override { + return _stats.hasSum(); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + int64_t getMinimum() const override { + if(hasMinimum()){ + return _stats.getMinimum(); + }else{ + throw ParseError("Minimum is not defined."); + } + } + + int64_t getMaximum() const override { + if(hasMaximum()){ + return _stats.getMaximum(); + }else{ + throw ParseError("Maximum is not defined."); + } + } + + void setMinimum(int64_t minimum) { + _stats.setHasMinimum(true); + _stats.setMinimum(minimum); + } + + void setMaximum(int64_t maximum) { + _stats.setHasMaximum(true); + _stats.setMaximum(maximum); + } + + int64_t getSum() const override { + if(hasSum()){ + return _stats.getSum(); + }else{ + throw ParseError("Sum is not defined."); + } + } + + void setSum(int64_t sum) { + _stats.setHasSum(true); + _stats.setSum(sum); + } + void update(int64_t value, int repetitions) { _stats.updateMinMax(value); - + if (_stats.hasSum()) { if (repetitions > 1) { _stats.setHasSum(multiplyExact(value, repetitions, &value)); @@ -981,498 +981,498 @@ namespace orc { } } - void merge(const MutableColumnStatistics& other) override { - const IntegerColumnStatisticsImpl& intStats = - dynamic_cast<const IntegerColumnStatisticsImpl&>(other); - - _stats.merge(intStats._stats); - - // update sum and check overflow - _stats.setHasSum(_stats.hasSum() && intStats.hasSum()); - if (_stats.hasSum()) { + void merge(const MutableColumnStatistics& other) override { + const IntegerColumnStatisticsImpl& intStats = + 
dynamic_cast<const IntegerColumnStatisticsImpl&>(other); + + _stats.merge(intStats._stats); + + // update sum and check overflow + _stats.setHasSum(_stats.hasSum() && intStats.hasSum()); + if (_stats.hasSum()) { int64_t value; _stats.setHasSum(addExact(_stats.getSum(), intStats.getSum(), &value)); if (_stats.hasSum()) { _stats.setSum(value); - } - } - } - - void reset() override { - _stats.reset(); - setSum(0); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::IntegerStatistics* intStats = pbStats.mutable_intstatistics(); - if (_stats.hasMinimum()) { - intStats->set_minimum(_stats.getMinimum()); - intStats->set_maximum(_stats.getMaximum()); - } else { - intStats->clear_minimum(); - intStats->clear_maximum(); - } - if (_stats.hasSum()) { - intStats->set_sum(_stats.getSum()); - } else { - intStats->clear_sum(); - } - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Data type: Integer" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; - if(hasMinimum()){ - buffer << "Minimum: " << getMinimum() << std::endl; - }else{ - buffer << "Minimum: not defined" << std::endl; - } - - if(hasMaximum()){ - buffer << "Maximum: " << getMaximum() << std::endl; - }else{ - buffer << "Maximum: not defined" << std::endl; - } - - if(hasSum()){ - buffer << "Sum: " << getSum() << std::endl; - }else{ - buffer << "Sum: not defined" << std::endl; - } - return buffer.str(); - } - }; - - class StringColumnStatisticsImpl: public StringColumnStatistics, - public MutableColumnStatistics{ - private: - InternalStringStatistics _stats; - - public: - StringColumnStatisticsImpl() { - reset(); - } - StringColumnStatisticsImpl(const proto::ColumnStatistics& stats, - const StatContext& statContext); - virtual ~StringColumnStatisticsImpl() override; - - bool hasMinimum() const override { - return _stats.hasMinimum(); - } - - bool hasMaximum() const override { - return _stats.hasMaximum(); - } - - bool hasTotalLength() const override { - return _stats.hasTotalLength(); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - + } + } + } + + void reset() override { + _stats.reset(); + setSum(0); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::IntegerStatistics* intStats = pbStats.mutable_intstatistics(); + if (_stats.hasMinimum()) { + intStats->set_minimum(_stats.getMinimum()); + intStats->set_maximum(_stats.getMaximum()); + } else { + intStats->clear_minimum(); + intStats->clear_maximum(); + } + if (_stats.hasSum()) { + intStats->set_sum(_stats.getSum()); + } else { + intStats->clear_sum(); + } + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Data type: Integer" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; + if(hasMinimum()){ + buffer << "Minimum: " << getMinimum() << std::endl; + }else{ + buffer << "Minimum: not defined" << std::endl; + } + + if(hasMaximum()){ + buffer << "Maximum: " << getMaximum() << std::endl; + }else{ + buffer << "Maximum: not defined" << std::endl; + } + + if(hasSum()){ + buffer << "Sum: " << getSum() << std::endl; + }else{ + buffer << "Sum: not defined" << std::endl; + } + return buffer.str(); + } + }; + + class StringColumnStatisticsImpl: public StringColumnStatistics, + public MutableColumnStatistics{ + private: + InternalStringStatistics _stats; + + public: + StringColumnStatisticsImpl() { + reset(); + } + StringColumnStatisticsImpl(const proto::ColumnStatistics& stats, + const StatContext& statContext); + virtual ~StringColumnStatisticsImpl() override; + + bool hasMinimum() const override { + return _stats.hasMinimum(); + } + + bool hasMaximum() const override { + return _stats.hasMaximum(); + } + + bool hasTotalLength() const override { + return _stats.hasTotalLength(); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + const std::string & getMinimum() const override { - if(hasMinimum()){ - return _stats.getMinimum(); - }else{ - throw ParseError("Minimum is not defined."); - } - } - + if(hasMinimum()){ + return _stats.getMinimum(); + }else{ + throw ParseError("Minimum is not defined."); + } + } + const std::string & getMaximum() const override { - if(hasMaximum()){ - return _stats.getMaximum(); - }else{ - throw ParseError("Maximum is not defined."); - } - } - - void setMinimum(std::string minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); - } - - void setMaximum(std::string maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); - } - - uint64_t getTotalLength() const override { - if(hasTotalLength()){ - return _stats.getTotalLength(); - }else{ - throw ParseError("Total length is not defined."); - } - } - - void setTotalLength(uint64_t length) { - _stats.setHasTotalLength(true); - _stats.setTotalLength(length); - } - - void update(const char* value, size_t length) { - if (value != nullptr) { - if (!_stats.hasMinimum()) { - std::string tempStr(value, value + length); - setMinimum(tempStr); - setMaximum(tempStr); - } else { - // update min - int minCmp = strncmp(_stats.getMinimum().c_str(), - value, - std::min(_stats.getMinimum().length(), length)); - if (minCmp > 0 || - (minCmp == 0 && length < _stats.getMinimum().length())) { - setMinimum(std::string(value, value + length)); - } - - // update max - int maxCmp = strncmp(_stats.getMaximum().c_str(), - value, - std::min(_stats.getMaximum().length(), length)); - if (maxCmp < 0 || - (maxCmp == 0 && length > _stats.getMaximum().length())) { - setMaximum(std::string(value, value + length)); - } - } - } - - _stats.setTotalLength(_stats.getTotalLength() + length); - } - - void update(std::string value) { - update(value.c_str(), value.length()); - } - - void merge(const MutableColumnStatistics& other) override { - const StringColumnStatisticsImpl& strStats = - dynamic_cast<const StringColumnStatisticsImpl&>(other); - _stats.merge(strStats._stats); - } - - void reset() 
override { - _stats.reset(); - setTotalLength(0); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::StringStatistics* strStats = pbStats.mutable_stringstatistics(); - if (_stats.hasMinimum()) { - strStats->set_minimum(TString(_stats.getMinimum())); - strStats->set_maximum(TString(_stats.getMaximum())); - } else { - strStats->clear_minimum(); - strStats->clear_maximum(); - } - if (_stats.hasTotalLength()) { - strStats->set_sum(static_cast<int64_t>(_stats.getTotalLength())); - } else { - strStats->clear_sum(); - } - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Data type: String" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; - if(hasMinimum()){ - buffer << "Minimum: " << getMinimum() << std::endl; - }else{ - buffer << "Minimum is not defined" << std::endl; - } - - if(hasMaximum()){ - buffer << "Maximum: " << getMaximum() << std::endl; - }else{ - buffer << "Maximum is not defined" << std::endl; - } - - if(hasTotalLength()){ - buffer << "Total length: " << getTotalLength() << std::endl; - }else{ - buffer << "Total length is not defined" << std::endl; - } - return buffer.str(); - } - }; - - class TimestampColumnStatisticsImpl: public TimestampColumnStatistics, - public MutableColumnStatistics { - private: - InternalIntegerStatistics _stats; - bool _hasLowerBound; - bool _hasUpperBound; - int64_t _lowerBound; - int64_t _upperBound; - - public: - TimestampColumnStatisticsImpl() { reset(); } - TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats, - const StatContext& statContext); - virtual ~TimestampColumnStatisticsImpl() override; - - bool hasMinimum() const override { - return _stats.hasMinimum(); - } - - bool hasMaximum() const override { - return _stats.hasMaximum(); - } - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - int64_t getMinimum() const override { - if(hasMinimum()){ - return _stats.getMinimum(); - }else{ - throw ParseError("Minimum is not defined."); - } - } - - int64_t getMaximum() const override { - if(hasMaximum()){ - return _stats.getMaximum(); - }else{ - throw ParseError("Maximum is not defined."); - } - } - - void setMinimum(int64_t minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); - } - - void setMaximum(int64_t maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); - } - - void update(int64_t value) { - _stats.updateMinMax(value); - } - - void merge(const MutableColumnStatistics& other) override { - const TimestampColumnStatisticsImpl& tsStats = - dynamic_cast<const TimestampColumnStatisticsImpl&>(other); - _stats.merge(tsStats._stats); - } - - void reset() override { - _stats.reset(); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::TimestampStatistics* tsStats = - pbStats.mutable_timestampstatistics(); - if (_stats.hasMinimum()) { - 
tsStats->set_minimumutc(_stats.getMinimum()); - tsStats->set_maximumutc(_stats.getMaximum()); - } else { - tsStats->clear_minimumutc(); - tsStats->clear_maximumutc(); - } - } - - std::string toString() const override { - std::ostringstream buffer; - struct tm tmValue; - char timeBuffer[20]; - time_t secs = 0; - - buffer << "Data type: Timestamp" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; - if(hasMinimum()){ - secs = static_cast<time_t>(getMinimum() / 1000); - gmtime_r(&secs, &tmValue); - strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); - buffer << "Minimum: " << timeBuffer << "." - << (getMinimum() % 1000) << std::endl; - }else{ - buffer << "Minimum is not defined" << std::endl; - } - - if(hasLowerBound()){ - secs = static_cast<time_t>(getLowerBound() / 1000); - gmtime_r(&secs, &tmValue); - strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); - buffer << "LowerBound: " << timeBuffer << "." - << (getLowerBound() % 1000) << std::endl; - }else{ - buffer << "LowerBound is not defined" << std::endl; - } - - if(hasMaximum()){ - secs = static_cast<time_t>(getMaximum()/1000); - gmtime_r(&secs, &tmValue); - strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); - buffer << "Maximum: " << timeBuffer << "." - << (getMaximum() % 1000) << std::endl; - }else{ - buffer << "Maximum is not defined" << std::endl; - } - - if(hasUpperBound()){ - secs = static_cast<time_t>(getUpperBound() / 1000); - gmtime_r(&secs, &tmValue); - strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); - buffer << "UpperBound: " << timeBuffer << "." - << (getUpperBound() % 1000) << std::endl; - }else{ - buffer << "UpperBound is not defined" << std::endl; - } - - return buffer.str(); - } - - bool hasLowerBound() const override { - return _hasLowerBound; - } - - bool hasUpperBound() const override { - return _hasUpperBound; - } - - int64_t getLowerBound() const override { - if(hasLowerBound()){ - return _lowerBound; - }else{ - throw ParseError("LowerBound is not defined."); - } - } - - int64_t getUpperBound() const override { - if(hasUpperBound()){ - return _upperBound; - }else{ - throw ParseError("UpperBound is not defined."); - } - } - }; - - ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, - const StatContext& statContext); - - class StatisticsImpl: public Statistics { - private: - std::vector<ColumnStatistics*> colStats; - - // DELIBERATELY NOT IMPLEMENTED - StatisticsImpl(const StatisticsImpl&); - StatisticsImpl& operator=(const StatisticsImpl&); - - public: - StatisticsImpl(const proto::StripeStatistics& stripeStats, - const StatContext& statContext); - - StatisticsImpl(const proto::Footer& footer, const StatContext& statContext); - - virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId - ) const override { - return colStats[columnId]; - } - - virtual ~StatisticsImpl() override; - - uint32_t getNumberOfColumns() const override { - return static_cast<uint32_t>(colStats.size()); - } - }; - - class StripeStatisticsImpl: public StripeStatistics { - private: - std::unique_ptr<StatisticsImpl> columnStats; - std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > - rowIndexStats; - - // DELIBERATELY NOT IMPLEMENTED - StripeStatisticsImpl(const StripeStatisticsImpl&); - StripeStatisticsImpl& operator=(const StripeStatisticsImpl&); - - public: - StripeStatisticsImpl( - const proto::StripeStatistics& stripeStats, - 
std::vector<std::vector<proto::ColumnStatistics> >& indexStats, - const StatContext& statContext); - - virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId - ) const override { - return columnStats->getColumnStatistics(columnId); - } - - uint32_t getNumberOfColumns() const override { - return columnStats->getNumberOfColumns(); - } - - virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId, - uint32_t rowIndex - ) const override { - // check id indices are valid - return rowIndexStats[columnId][rowIndex].get(); - } - - virtual ~StripeStatisticsImpl() override; - - uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override { - return static_cast<uint32_t>(rowIndexStats[columnId].size()); - } - }; - - /** - * Create ColumnStatistics for writers - * @param type of column - * @return MutableColumnStatistics instances - */ - std::unique_ptr<MutableColumnStatistics> createColumnStatistics( - const Type& type); - -}// namespace - -#endif + if(hasMaximum()){ + return _stats.getMaximum(); + }else{ + throw ParseError("Maximum is not defined."); + } + } + + void setMinimum(std::string minimum) { + _stats.setHasMinimum(true); + _stats.setMinimum(minimum); + } + + void setMaximum(std::string maximum) { + _stats.setHasMaximum(true); + _stats.setMaximum(maximum); + } + + uint64_t getTotalLength() const override { + if(hasTotalLength()){ + return _stats.getTotalLength(); + }else{ + throw ParseError("Total length is not defined."); + } + } + + void setTotalLength(uint64_t length) { + _stats.setHasTotalLength(true); + _stats.setTotalLength(length); + } + + void update(const char* value, size_t length) { + if (value != nullptr) { + if (!_stats.hasMinimum()) { + std::string tempStr(value, value + length); + setMinimum(tempStr); + setMaximum(tempStr); + } else { + // update min + int minCmp = strncmp(_stats.getMinimum().c_str(), + value, + std::min(_stats.getMinimum().length(), length)); + if (minCmp > 0 || + (minCmp == 0 && length < _stats.getMinimum().length())) { + setMinimum(std::string(value, value + length)); + } + + // update max + int maxCmp = strncmp(_stats.getMaximum().c_str(), + value, + std::min(_stats.getMaximum().length(), length)); + if (maxCmp < 0 || + (maxCmp == 0 && length > _stats.getMaximum().length())) { + setMaximum(std::string(value, value + length)); + } + } + } + + _stats.setTotalLength(_stats.getTotalLength() + length); + } + + void update(std::string value) { + update(value.c_str(), value.length()); + } + + void merge(const MutableColumnStatistics& other) override { + const StringColumnStatisticsImpl& strStats = + dynamic_cast<const StringColumnStatisticsImpl&>(other); + _stats.merge(strStats._stats); + } + + void reset() override { + _stats.reset(); + setTotalLength(0); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::StringStatistics* strStats = pbStats.mutable_stringstatistics(); + if (_stats.hasMinimum()) { + strStats->set_minimum(TString(_stats.getMinimum())); + strStats->set_maximum(TString(_stats.getMaximum())); + } else { + strStats->clear_minimum(); + strStats->clear_maximum(); + } + if (_stats.hasTotalLength()) { + strStats->set_sum(static_cast<int64_t>(_stats.getTotalLength())); + } else { + strStats->clear_sum(); + } + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Data type: String" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has 
null: " << (hasNull() ? "yes" : "no") << std::endl; + if(hasMinimum()){ + buffer << "Minimum: " << getMinimum() << std::endl; + }else{ + buffer << "Minimum is not defined" << std::endl; + } + + if(hasMaximum()){ + buffer << "Maximum: " << getMaximum() << std::endl; + }else{ + buffer << "Maximum is not defined" << std::endl; + } + + if(hasTotalLength()){ + buffer << "Total length: " << getTotalLength() << std::endl; + }else{ + buffer << "Total length is not defined" << std::endl; + } + return buffer.str(); + } + }; + + class TimestampColumnStatisticsImpl: public TimestampColumnStatistics, + public MutableColumnStatistics { + private: + InternalIntegerStatistics _stats; + bool _hasLowerBound; + bool _hasUpperBound; + int64_t _lowerBound; + int64_t _upperBound; + + public: + TimestampColumnStatisticsImpl() { reset(); } + TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats, + const StatContext& statContext); + virtual ~TimestampColumnStatisticsImpl() override; + + bool hasMinimum() const override { + return _stats.hasMinimum(); + } + + bool hasMaximum() const override { + return _stats.hasMaximum(); + } + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + int64_t getMinimum() const override { + if(hasMinimum()){ + return _stats.getMinimum(); + }else{ + throw ParseError("Minimum is not defined."); + } + } + + int64_t getMaximum() const override { + if(hasMaximum()){ + return _stats.getMaximum(); + }else{ + throw ParseError("Maximum is not defined."); + } + } + + void setMinimum(int64_t minimum) { + _stats.setHasMinimum(true); + _stats.setMinimum(minimum); + } + + void setMaximum(int64_t maximum) { + _stats.setHasMaximum(true); + _stats.setMaximum(maximum); + } + + void update(int64_t value) { + _stats.updateMinMax(value); + } + + void merge(const MutableColumnStatistics& other) override { + const TimestampColumnStatisticsImpl& tsStats = + dynamic_cast<const TimestampColumnStatisticsImpl&>(other); + _stats.merge(tsStats._stats); + } + + void reset() override { + _stats.reset(); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::TimestampStatistics* tsStats = + pbStats.mutable_timestampstatistics(); + if (_stats.hasMinimum()) { + tsStats->set_minimumutc(_stats.getMinimum()); + tsStats->set_maximumutc(_stats.getMaximum()); + } else { + tsStats->clear_minimumutc(); + tsStats->clear_maximumutc(); + } + } + + std::string toString() const override { + std::ostringstream buffer; + struct tm tmValue; + char timeBuffer[20]; + time_t secs = 0; + + buffer << "Data type: Timestamp" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; + if(hasMinimum()){ + secs = static_cast<time_t>(getMinimum() / 1000); + gmtime_r(&secs, &tmValue); + strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); + buffer << "Minimum: " << timeBuffer << "." 
+ << (getMinimum() % 1000) << std::endl; + }else{ + buffer << "Minimum is not defined" << std::endl; + } + + if(hasLowerBound()){ + secs = static_cast<time_t>(getLowerBound() / 1000); + gmtime_r(&secs, &tmValue); + strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); + buffer << "LowerBound: " << timeBuffer << "." + << (getLowerBound() % 1000) << std::endl; + }else{ + buffer << "LowerBound is not defined" << std::endl; + } + + if(hasMaximum()){ + secs = static_cast<time_t>(getMaximum()/1000); + gmtime_r(&secs, &tmValue); + strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); + buffer << "Maximum: " << timeBuffer << "." + << (getMaximum() % 1000) << std::endl; + }else{ + buffer << "Maximum is not defined" << std::endl; + } + + if(hasUpperBound()){ + secs = static_cast<time_t>(getUpperBound() / 1000); + gmtime_r(&secs, &tmValue); + strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); + buffer << "UpperBound: " << timeBuffer << "." + << (getUpperBound() % 1000) << std::endl; + }else{ + buffer << "UpperBound is not defined" << std::endl; + } + + return buffer.str(); + } + + bool hasLowerBound() const override { + return _hasLowerBound; + } + + bool hasUpperBound() const override { + return _hasUpperBound; + } + + int64_t getLowerBound() const override { + if(hasLowerBound()){ + return _lowerBound; + }else{ + throw ParseError("LowerBound is not defined."); + } + } + + int64_t getUpperBound() const override { + if(hasUpperBound()){ + return _upperBound; + }else{ + throw ParseError("UpperBound is not defined."); + } + } + }; + + ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, + const StatContext& statContext); + + class StatisticsImpl: public Statistics { + private: + std::vector<ColumnStatistics*> colStats; + + // DELIBERATELY NOT IMPLEMENTED + StatisticsImpl(const StatisticsImpl&); + StatisticsImpl& operator=(const StatisticsImpl&); + + public: + StatisticsImpl(const proto::StripeStatistics& stripeStats, + const StatContext& statContext); + + StatisticsImpl(const proto::Footer& footer, const StatContext& statContext); + + virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId + ) const override { + return colStats[columnId]; + } + + virtual ~StatisticsImpl() override; + + uint32_t getNumberOfColumns() const override { + return static_cast<uint32_t>(colStats.size()); + } + }; + + class StripeStatisticsImpl: public StripeStatistics { + private: + std::unique_ptr<StatisticsImpl> columnStats; + std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > + rowIndexStats; + + // DELIBERATELY NOT IMPLEMENTED + StripeStatisticsImpl(const StripeStatisticsImpl&); + StripeStatisticsImpl& operator=(const StripeStatisticsImpl&); + + public: + StripeStatisticsImpl( + const proto::StripeStatistics& stripeStats, + std::vector<std::vector<proto::ColumnStatistics> >& indexStats, + const StatContext& statContext); + + virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId + ) const override { + return columnStats->getColumnStatistics(columnId); + } + + uint32_t getNumberOfColumns() const override { + return columnStats->getNumberOfColumns(); + } + + virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId, + uint32_t rowIndex + ) const override { + // check id indices are valid + return rowIndexStats[columnId][rowIndex].get(); + } + + virtual ~StripeStatisticsImpl() override; + + uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override { + return 
static_cast<uint32_t>(rowIndexStats[columnId].size()); + } + }; + + /** + * Create ColumnStatistics for writers + * @param type of column + * @return MutableColumnStatistics instances + */ + std::unique_ptr<MutableColumnStatistics> createColumnStatistics( + const Type& type); + +}// namespace + +#endif diff --git a/contrib/libs/apache/orc/c++/src/StripeStream.cc b/contrib/libs/apache/orc/c++/src/StripeStream.cc index b63f19d28e..f9d82f30e0 100644 --- a/contrib/libs/apache/orc/c++/src/StripeStream.cc +++ b/contrib/libs/apache/orc/c++/src/StripeStream.cc @@ -1,161 +1,161 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "orc/Exceptions.hh" -#include "RLE.hh" -#include "Reader.hh" -#include "StripeStream.hh" - -#include "wrap/coded-stream-wrapper.h" - -namespace orc { - - StripeStreamsImpl::StripeStreamsImpl(const RowReaderImpl& _reader, uint64_t _index, - const proto::StripeInformation& _stripeInfo, - const proto::StripeFooter& _footer, - uint64_t _stripeStart, - InputStream& _input, - const Timezone& _writerTimezone - ): reader(_reader), - stripeInfo(_stripeInfo), - footer(_footer), - stripeIndex(_index), - stripeStart(_stripeStart), - input(_input), - writerTimezone(_writerTimezone) { - // PASS - } - - StripeStreamsImpl::~StripeStreamsImpl() { - // PASS - } - - StreamInformation::~StreamInformation() { - // PASS - } - - StripeInformation::~StripeInformation() { - // PASS - } - - - StreamInformationImpl::~StreamInformationImpl() { - // PASS - } - - const std::vector<bool> StripeStreamsImpl::getSelectedColumns() const { - return reader.getSelectedColumns(); - } - - proto::ColumnEncoding StripeStreamsImpl::getEncoding(uint64_t columnId - ) const { - return footer.columns(static_cast<int>(columnId)); - } - - const Timezone& StripeStreamsImpl::getWriterTimezone() const { - return writerTimezone; - } - - std::ostream* StripeStreamsImpl::getErrorStream() const { - return reader.getFileContents().errorStream; - } - - std::unique_ptr<SeekableInputStream> - StripeStreamsImpl::getStream(uint64_t columnId, - proto::Stream_Kind kind, - bool shouldStream) const { - uint64_t offset = stripeStart; - uint64_t dataEnd = stripeInfo.offset() + stripeInfo.indexlength() + stripeInfo.datalength(); - MemoryPool *pool = reader.getFileContents().pool; - for(int i = 0; i < footer.streams_size(); ++i) { - const proto::Stream& stream = footer.streams(i); - if (stream.has_kind() && - stream.kind() == kind && - stream.column() == static_cast<uint64_t>(columnId)) { - uint64_t streamLength = stream.length(); - uint64_t myBlock = shouldStream ? 
input.getNaturalReadSize(): streamLength; - if (offset + streamLength > dataEnd) { - std::stringstream msg; - msg << "Malformed stream meta at stream index " << i << " in stripe " << stripeIndex - << ": streamOffset=" << offset << ", streamLength=" << streamLength - << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength=" - << stripeInfo.indexlength() << ", stripeDataLength=" << stripeInfo.datalength(); - throw ParseError(msg.str()); - } - return createDecompressor(reader.getCompression(), - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream - (&input, - offset, - stream.length(), - *pool, - myBlock)), - reader.getCompressionSize(), - *pool); - } - offset += stream.length(); - } - return std::unique_ptr<SeekableInputStream>(); - } - - MemoryPool& StripeStreamsImpl::getMemoryPool() const { - return *reader.getFileContents().pool; - } - - bool StripeStreamsImpl::getThrowOnHive11DecimalOverflow() const { - return reader.getThrowOnHive11DecimalOverflow(); - } - - int32_t StripeStreamsImpl::getForcedScaleOnHive11Decimal() const { - return reader.getForcedScaleOnHive11Decimal(); - } - - void StripeInformationImpl::ensureStripeFooterLoaded() const { - if (stripeFooter.get() == nullptr) { - std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(compression, - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream(stream, - offset + - indexLength + - dataLength, - footerLength, - memory)), - blockSize, - memory); - stripeFooter.reset(new proto::StripeFooter()); - if (!stripeFooter->ParseFromZeroCopyStream(pbStream.get())) { - throw ParseError("Failed to parse the stripe footer"); - } - } - } - - std::unique_ptr<StreamInformation> - StripeInformationImpl::getStreamInformation(uint64_t streamId) const { - ensureStripeFooterLoaded(); - uint64_t streamOffset = offset; - for(uint64_t s=0; s < streamId; ++s) { - streamOffset += stripeFooter->streams(static_cast<int>(s)).length(); - } - return ORC_UNIQUE_PTR<StreamInformation> - (new StreamInformationImpl(streamOffset, - stripeFooter-> - streams(static_cast<int>(streamId)))); - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Exceptions.hh" +#include "RLE.hh" +#include "Reader.hh" +#include "StripeStream.hh" + +#include "wrap/coded-stream-wrapper.h" + +namespace orc { + + StripeStreamsImpl::StripeStreamsImpl(const RowReaderImpl& _reader, uint64_t _index, + const proto::StripeInformation& _stripeInfo, + const proto::StripeFooter& _footer, + uint64_t _stripeStart, + InputStream& _input, + const Timezone& _writerTimezone + ): reader(_reader), + stripeInfo(_stripeInfo), + footer(_footer), + stripeIndex(_index), + stripeStart(_stripeStart), + input(_input), + writerTimezone(_writerTimezone) { + // PASS + } + + StripeStreamsImpl::~StripeStreamsImpl() { + // PASS + } + + StreamInformation::~StreamInformation() { + // PASS + } + + StripeInformation::~StripeInformation() { + // PASS + } + + + StreamInformationImpl::~StreamInformationImpl() { + // PASS + } + + const std::vector<bool> StripeStreamsImpl::getSelectedColumns() const { + return reader.getSelectedColumns(); + } + + proto::ColumnEncoding StripeStreamsImpl::getEncoding(uint64_t columnId + ) const { + return footer.columns(static_cast<int>(columnId)); + } + + const Timezone& StripeStreamsImpl::getWriterTimezone() const { + return writerTimezone; + } + + std::ostream* StripeStreamsImpl::getErrorStream() const { + return reader.getFileContents().errorStream; + } + + std::unique_ptr<SeekableInputStream> + StripeStreamsImpl::getStream(uint64_t columnId, + proto::Stream_Kind kind, + bool shouldStream) const { + uint64_t offset = stripeStart; + uint64_t dataEnd = stripeInfo.offset() + stripeInfo.indexlength() + stripeInfo.datalength(); + MemoryPool *pool = reader.getFileContents().pool; + for(int i = 0; i < footer.streams_size(); ++i) { + const proto::Stream& stream = footer.streams(i); + if (stream.has_kind() && + stream.kind() == kind && + stream.column() == static_cast<uint64_t>(columnId)) { + uint64_t streamLength = stream.length(); + uint64_t myBlock = shouldStream ? 
input.getNaturalReadSize(): streamLength; + if (offset + streamLength > dataEnd) { + std::stringstream msg; + msg << "Malformed stream meta at stream index " << i << " in stripe " << stripeIndex + << ": streamOffset=" << offset << ", streamLength=" << streamLength + << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength=" + << stripeInfo.indexlength() << ", stripeDataLength=" << stripeInfo.datalength(); + throw ParseError(msg.str()); + } + return createDecompressor(reader.getCompression(), + std::unique_ptr<SeekableInputStream> + (new SeekableFileInputStream + (&input, + offset, + stream.length(), + *pool, + myBlock)), + reader.getCompressionSize(), + *pool); + } + offset += stream.length(); + } + return std::unique_ptr<SeekableInputStream>(); + } + + MemoryPool& StripeStreamsImpl::getMemoryPool() const { + return *reader.getFileContents().pool; + } + + bool StripeStreamsImpl::getThrowOnHive11DecimalOverflow() const { + return reader.getThrowOnHive11DecimalOverflow(); + } + + int32_t StripeStreamsImpl::getForcedScaleOnHive11Decimal() const { + return reader.getForcedScaleOnHive11Decimal(); + } + + void StripeInformationImpl::ensureStripeFooterLoaded() const { + if (stripeFooter.get() == nullptr) { + std::unique_ptr<SeekableInputStream> pbStream = + createDecompressor(compression, + std::unique_ptr<SeekableInputStream> + (new SeekableFileInputStream(stream, + offset + + indexLength + + dataLength, + footerLength, + memory)), + blockSize, + memory); + stripeFooter.reset(new proto::StripeFooter()); + if (!stripeFooter->ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError("Failed to parse the stripe footer"); + } + } + } + + std::unique_ptr<StreamInformation> + StripeInformationImpl::getStreamInformation(uint64_t streamId) const { + ensureStripeFooterLoaded(); + uint64_t streamOffset = offset; + for(uint64_t s=0; s < streamId; ++s) { + streamOffset += stripeFooter->streams(static_cast<int>(s)).length(); + } + return ORC_UNIQUE_PTR<StreamInformation> + (new StreamInformationImpl(streamOffset, + stripeFooter-> + streams(static_cast<int>(streamId)))); + } + +} diff --git a/contrib/libs/apache/orc/c++/src/StripeStream.hh b/contrib/libs/apache/orc/c++/src/StripeStream.hh index 5cbaf60a69..da5cb16f37 100644 --- a/contrib/libs/apache/orc/c++/src/StripeStream.hh +++ b/contrib/libs/apache/orc/c++/src/StripeStream.hh @@ -1,213 +1,213 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_STRIPE_STREAM_HH -#define ORC_STRIPE_STREAM_HH - -#include "orc/Int128.hh" -#include "orc/OrcFile.hh" -#include "orc/Reader.hh" - -#include "Timezone.hh" -#include "TypeImpl.hh" - -namespace orc { - - class RowReaderImpl; - - /** - * StripeStream Implementation - */ - - class StripeStreamsImpl: public StripeStreams { - private: - const RowReaderImpl& reader; - const proto::StripeInformation& stripeInfo; - const proto::StripeFooter& footer; - const uint64_t stripeIndex; - const uint64_t stripeStart; - InputStream& input; - const Timezone& writerTimezone; - - public: - StripeStreamsImpl(const RowReaderImpl& reader, uint64_t index, - const proto::StripeInformation& stripeInfo, - const proto::StripeFooter& footer, - uint64_t stripeStart, - InputStream& input, - const Timezone& writerTimezone); - - virtual ~StripeStreamsImpl() override; - - virtual const std::vector<bool> getSelectedColumns() const override; - - virtual proto::ColumnEncoding getEncoding(uint64_t columnId - ) const override; - - virtual std::unique_ptr<SeekableInputStream> - getStream(uint64_t columnId, - proto::Stream_Kind kind, - bool shouldStream) const override; - - MemoryPool& getMemoryPool() const override; - - const Timezone& getWriterTimezone() const override; - - std::ostream* getErrorStream() const override; - - bool getThrowOnHive11DecimalOverflow() const override; - - int32_t getForcedScaleOnHive11Decimal() const override; - }; - - /** - * StreamInformation Implementation - */ - - class StreamInformationImpl: public StreamInformation { - private: - StreamKind kind; - uint64_t column; - uint64_t offset; - uint64_t length; - public: - StreamInformationImpl(uint64_t _offset, - const proto::Stream& stream - ): kind(static_cast<StreamKind>(stream.kind())), - column(stream.column()), - offset(_offset), - length(stream.length()) { - // PASS - } - - ~StreamInformationImpl() override; - - StreamKind getKind() const override { - return kind; - } - - uint64_t getColumnId() const override { - return column; - } - - uint64_t getOffset() const override { - return offset; - } - - uint64_t getLength() const override { - return length; - } - }; - - /** - * StripeInformation Implementation - */ - - class StripeInformationImpl : public StripeInformation { - uint64_t offset; - uint64_t indexLength; - uint64_t dataLength; - uint64_t footerLength; - uint64_t numRows; - InputStream* stream; - MemoryPool& memory; - CompressionKind compression; - uint64_t blockSize; - mutable std::unique_ptr<proto::StripeFooter> stripeFooter; - void ensureStripeFooterLoaded() const; - public: - - StripeInformationImpl(uint64_t _offset, - uint64_t _indexLength, - uint64_t _dataLength, - uint64_t _footerLength, - uint64_t _numRows, - InputStream* _stream, - MemoryPool& _memory, - CompressionKind _compression, - uint64_t _blockSize - ) : offset(_offset), - indexLength(_indexLength), - dataLength(_dataLength), - footerLength(_footerLength), - numRows(_numRows), - stream(_stream), - memory(_memory), - compression(_compression), - blockSize(_blockSize) { - // PASS - } - - virtual ~StripeInformationImpl() override { - // PASS - } - - uint64_t getOffset() const override { - return offset; - } - - uint64_t getLength() const override { - return indexLength + dataLength + footerLength; - } - uint64_t getIndexLength() const override { - return indexLength; - } - - uint64_t getDataLength()const override { - return dataLength; - } - - uint64_t getFooterLength() const override { - return footerLength; - } - - uint64_t getNumberOfRows() const override { 
- return numRows; - } - - uint64_t getNumberOfStreams() const override { - ensureStripeFooterLoaded(); - return static_cast<uint64_t>(stripeFooter->streams_size()); - } - - std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId - ) const override; - - ColumnEncodingKind getColumnEncoding(uint64_t colId) const override { - ensureStripeFooterLoaded(); - return static_cast<ColumnEncodingKind>(stripeFooter-> - columns(static_cast<int>(colId)) - .kind()); - } - - uint64_t getDictionarySize(uint64_t colId) const override { - ensureStripeFooterLoaded(); - return static_cast<ColumnEncodingKind>(stripeFooter-> - columns(static_cast<int>(colId)) - .dictionarysize()); - } - - const std::string& getWriterTimezone() const override { - ensureStripeFooterLoaded(); - return stripeFooter->writertimezone(); - } - }; - -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_STRIPE_STREAM_HH +#define ORC_STRIPE_STREAM_HH + +#include "orc/Int128.hh" +#include "orc/OrcFile.hh" +#include "orc/Reader.hh" + +#include "Timezone.hh" +#include "TypeImpl.hh" + +namespace orc { + + class RowReaderImpl; + + /** + * StripeStream Implementation + */ + + class StripeStreamsImpl: public StripeStreams { + private: + const RowReaderImpl& reader; + const proto::StripeInformation& stripeInfo; + const proto::StripeFooter& footer; + const uint64_t stripeIndex; + const uint64_t stripeStart; + InputStream& input; + const Timezone& writerTimezone; + + public: + StripeStreamsImpl(const RowReaderImpl& reader, uint64_t index, + const proto::StripeInformation& stripeInfo, + const proto::StripeFooter& footer, + uint64_t stripeStart, + InputStream& input, + const Timezone& writerTimezone); + + virtual ~StripeStreamsImpl() override; + + virtual const std::vector<bool> getSelectedColumns() const override; + + virtual proto::ColumnEncoding getEncoding(uint64_t columnId + ) const override; + + virtual std::unique_ptr<SeekableInputStream> + getStream(uint64_t columnId, + proto::Stream_Kind kind, + bool shouldStream) const override; + + MemoryPool& getMemoryPool() const override; + + const Timezone& getWriterTimezone() const override; + + std::ostream* getErrorStream() const override; + + bool getThrowOnHive11DecimalOverflow() const override; + + int32_t getForcedScaleOnHive11Decimal() const override; + }; + + /** + * StreamInformation Implementation + */ + + class StreamInformationImpl: public StreamInformation { + private: + StreamKind kind; + uint64_t column; + uint64_t offset; + uint64_t length; + public: + StreamInformationImpl(uint64_t _offset, + const proto::Stream& stream + ): kind(static_cast<StreamKind>(stream.kind())), + column(stream.column()), + offset(_offset), + length(stream.length()) { + // PASS + } + + ~StreamInformationImpl() 
override; + + StreamKind getKind() const override { + return kind; + } + + uint64_t getColumnId() const override { + return column; + } + + uint64_t getOffset() const override { + return offset; + } + + uint64_t getLength() const override { + return length; + } + }; + + /** + * StripeInformation Implementation + */ + + class StripeInformationImpl : public StripeInformation { + uint64_t offset; + uint64_t indexLength; + uint64_t dataLength; + uint64_t footerLength; + uint64_t numRows; + InputStream* stream; + MemoryPool& memory; + CompressionKind compression; + uint64_t blockSize; + mutable std::unique_ptr<proto::StripeFooter> stripeFooter; + void ensureStripeFooterLoaded() const; + public: + + StripeInformationImpl(uint64_t _offset, + uint64_t _indexLength, + uint64_t _dataLength, + uint64_t _footerLength, + uint64_t _numRows, + InputStream* _stream, + MemoryPool& _memory, + CompressionKind _compression, + uint64_t _blockSize + ) : offset(_offset), + indexLength(_indexLength), + dataLength(_dataLength), + footerLength(_footerLength), + numRows(_numRows), + stream(_stream), + memory(_memory), + compression(_compression), + blockSize(_blockSize) { + // PASS + } + + virtual ~StripeInformationImpl() override { + // PASS + } + + uint64_t getOffset() const override { + return offset; + } + + uint64_t getLength() const override { + return indexLength + dataLength + footerLength; + } + uint64_t getIndexLength() const override { + return indexLength; + } + + uint64_t getDataLength()const override { + return dataLength; + } + + uint64_t getFooterLength() const override { + return footerLength; + } + + uint64_t getNumberOfRows() const override { + return numRows; + } + + uint64_t getNumberOfStreams() const override { + ensureStripeFooterLoaded(); + return static_cast<uint64_t>(stripeFooter->streams_size()); + } + + std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId + ) const override; + + ColumnEncodingKind getColumnEncoding(uint64_t colId) const override { + ensureStripeFooterLoaded(); + return static_cast<ColumnEncodingKind>(stripeFooter-> + columns(static_cast<int>(colId)) + .kind()); + } + + uint64_t getDictionarySize(uint64_t colId) const override { + ensureStripeFooterLoaded(); + return static_cast<ColumnEncodingKind>(stripeFooter-> + columns(static_cast<int>(colId)) + .dictionarysize()); + } + + const std::string& getWriterTimezone() const override { + ensureStripeFooterLoaded(); + return stripeFooter->writertimezone(); + } + }; + +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/Timezone.cc b/contrib/libs/apache/orc/c++/src/Timezone.cc index 318e5bcc12..0aa66ef71c 100644 --- a/contrib/libs/apache/orc/c++/src/Timezone.cc +++ b/contrib/libs/apache/orc/c++/src/Timezone.cc @@ -1,936 +1,936 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "orc/OrcFile.hh" -#include "Timezone.hh" - -#include <errno.h> -#include <map> -#include <sstream> -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include <time.h> - -namespace orc { - - // default location of the timezone files - static const char DEFAULT_TZDIR[] = "/usr/share/zoneinfo"; - - // location of a symlink to the local timezone - static const char LOCAL_TIMEZONE[] = "/etc/localtime"; - - enum TransitionKind { - TRANSITION_JULIAN, - TRANSITION_DAY, - TRANSITION_MONTH - }; - - static const int64_t MONTHS_PER_YEAR = 12; - /** - * The number of days in each month in non-leap and leap years. - */ - static const int64_t DAYS_PER_MONTH[2][MONTHS_PER_YEAR] = - {{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, - {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; - static const int64_t DAYS_PER_WEEK = 7; - - // Leap years and day of the week repeat every 400 years, which makes it - // a good cycle length. - static const int64_t SECONDS_PER_400_YEARS = - SECONDS_PER_DAY * (365 * (300 + 3) + 366 * (100 - 3)); - - /** - * Is the given year a leap year? - */ - bool isLeap(int64_t year) { - return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0)); - } - - /** - * Find the position that is the closest and less than or equal to the - * target. - * @return -1 if the target < array[0] or array is empty or - * i if array[i] <= target and (i == n or array[i] < array[i+1]) - */ - int64_t binarySearch(const std::vector<int64_t> &array, int64_t target) { - uint64_t size = array.size(); - if (size == 0) { - return -1; - } - uint64_t min = 0; - uint64_t max = size - 1; - uint64_t mid = (min + max) / 2; - while ((array[mid] != target) && (min < max)) { - if (array[mid] < target) { - min = mid + 1; - } else if (mid == 0) { - max = 0; - } else { - max = mid - 1; - } - mid = (min + max) / 2; - } - if (target < array[mid]) { - return static_cast<int64_t>(mid) - 1; - } else { - return static_cast<int64_t>(mid); - } - } - - struct Transition { - TransitionKind kind; - int64_t day; - int64_t week; - int64_t month; - int64_t time; - - std::string toString() const { - std::stringstream buffer; - switch (kind) { - case TRANSITION_JULIAN: - buffer << "julian " << day; - break; - case TRANSITION_DAY: - buffer << "day " << day; - break; - case TRANSITION_MONTH: - buffer << "month " << month << " week " << week << " day " << day; - break; - } - buffer << " at " << (time / (60 * 60)) << ":" << ((time / 60) % 60) - << ":" << (time % 60); - return buffer.str(); - } - - /** - * Get the transition time for the given year. - * @param year the year - * @return the number of seconds past local Jan 1 00:00:00 that the - * transition happens. - */ - int64_t getTime(int64_t year) const { - int64_t result = time; - switch (kind) { - case TRANSITION_JULIAN: - result += SECONDS_PER_DAY * day; - if (day > 60 && isLeap(year)) { - result += SECONDS_PER_DAY; - } - break; - case TRANSITION_DAY: - result += SECONDS_PER_DAY * day; - break; - case TRANSITION_MONTH: { - bool inLeap = isLeap(year); - int64_t adjustedMonth = (month + 9) % 12 + 1; - int64_t adjustedYear = (month <= 2) ? 
(year - 1) : year; - int64_t adjustedCentury = adjustedYear / 100; - int64_t adjustedRemainder = adjustedYear % 100; - - // day of the week of the first day of month - int64_t dayOfWeek = ((26 * adjustedMonth - 2) / 10 + - 1 + adjustedRemainder + adjustedRemainder / 4 + - adjustedCentury / 4 - 2 * adjustedCentury) % 7; - if (dayOfWeek < 0) { - dayOfWeek += DAYS_PER_WEEK; - } - - int64_t d = day - dayOfWeek; - if (d < 0) { - d += DAYS_PER_WEEK; - } - for (int w = 1; w < week; ++w) { - if (d + DAYS_PER_WEEK >= DAYS_PER_MONTH[inLeap][month - 1]) { - break; - } - d += DAYS_PER_WEEK; - } - result += d * SECONDS_PER_DAY; - - // Add in the time for the month - for(int m=0; m < month - 1; ++m) { - result += DAYS_PER_MONTH[inLeap][m] * SECONDS_PER_DAY; - } - break; - } - } - return result; - } - }; - - /** - * The current rule for finding timezone variants arbitrarily far in - * the future. They are based on a string representation that - * specifies the standard name and offset. For timezones with - * daylight savings, the string specifies the daylight variant name - * and offset and the rules for switching between them. - * - * rule = <standard name><standard offset><daylight>? - * name = string with no numbers or '+', '-', or ',' - * offset = [-+]?hh(:mm(:ss)?)? - * daylight = <name><offset>,<start day>(/<offset>)?,<end day>(/<offset>)? - * day = J<day without 2/29>|<day with 2/29>|M<month>.<week>.<day of week> - */ - class FutureRuleImpl: public FutureRule { - std::string ruleString; - TimezoneVariant standard; - bool hasDst; - TimezoneVariant dst; - Transition start; - Transition end; - - // expanded time_t offsets of transitions - std::vector<int64_t> offsets; - - // Is the epoch (1 Jan 1970 00:00) in standard time? - // This code assumes that the transition dates fall in the same order - // each year. Hopefully no timezone regions decide to move across the - // equator, which is about what it would take. - bool startInStd; - - void computeOffsets() { - if (!hasDst) { - startInStd = true; - offsets.resize(1); - } else { - // Insert a transition for the epoch and two per a year for the next - // 400 years. We assume that the all even positions are in standard - // time if and only if startInStd and the odd ones are the reverse. - offsets.resize(400 * 2 + 1); - startInStd = start.getTime(1970) < end.getTime(1970); - int64_t base = 0; - for(int64_t year = 1970; year < 1970 + 400; ++year) { - if (startInStd) { - offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] = - base + start.getTime(year) - standard.gmtOffset; - offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] = - base + end.getTime(year) - dst.gmtOffset; - } else { - offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] = - base + end.getTime(year) - dst.gmtOffset; - offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] = - base + start.getTime(year) - standard.gmtOffset; - } - base += (isLeap(year) ? 
366 : 365) * SECONDS_PER_DAY; - } - } - offsets[0] = 0; - } - - public: - virtual ~FutureRuleImpl() override; - bool isDefined() const override; - const TimezoneVariant& getVariant(int64_t clk) const override; - void print(std::ostream& out) const override; - - friend class FutureRuleParser; - }; - - FutureRule::~FutureRule() { - // PASS - } - - FutureRuleImpl::~FutureRuleImpl() { - // PASS - } - - bool FutureRuleImpl::isDefined() const { - return ruleString.size() > 0; - } - - const TimezoneVariant& FutureRuleImpl::getVariant(int64_t clk) const { - if (!hasDst) { - return standard; - } else { - int64_t adjusted = clk % SECONDS_PER_400_YEARS; - if (adjusted < 0) { - adjusted += SECONDS_PER_400_YEARS; - } - int64_t idx = binarySearch(offsets, adjusted); - if (startInStd == (idx % 2 == 0)) { - return standard; - } else { - return dst; - } - } - } - - void FutureRuleImpl::print(std::ostream& out) const { - if (isDefined()) { - out << " Future rule: " << ruleString << "\n"; - out << " standard " << standard.toString() << "\n"; - if (hasDst) { - out << " dst " << dst.toString() << "\n"; - out << " start " << start.toString() << "\n"; - out << " end " << end.toString() << "\n"; - } - } - } - - /** - * A parser for the future rule strings. - */ - class FutureRuleParser { - public: - FutureRuleParser(const std::string& str, - FutureRuleImpl* rule - ): ruleString(str), - length(str.size()), - position(0), - output(*rule) { - output.ruleString = str; - if (position != length) { - parseName(output.standard.name); - output.standard.gmtOffset = -parseOffset(); - output.standard.isDst = false; - output.hasDst = position < length; - if (output.hasDst) { - parseName(output.dst.name); - output.dst.isDst = true; - if (ruleString[position] != ',') { - output.dst.gmtOffset = -parseOffset(); - } else { - output.dst.gmtOffset = output.standard.gmtOffset + 60 * 60; - } - parseTransition(output.start); - parseTransition(output.end); - } - if (position != length) { - throwError("Extra text"); - } - output.computeOffsets(); - } - } - - private: - - const std::string& ruleString; - size_t length; - size_t position; - FutureRuleImpl &output; - - void throwError(const char *msg) { - std::stringstream buffer; - buffer << msg << " at " << position << " in '" << ruleString << "'"; - throw TimezoneError(buffer.str()); - } - - /** - * Parse the names of the form: - * ([^-+0-9,]+|<[^>]+>) - * and set the output string. - */ - void parseName(std::string& result) { - if (position == length) { - throwError("name required"); - } - size_t start = position; - if (ruleString[position] == '<') { - while (position < length && ruleString[position] != '>') { - position += 1; - } - if (position == length) { - throwError("missing close '>'"); - } - position +=1; - } else { - while (position < length) { - char ch = ruleString[position]; - if (isdigit(ch) || ch == '-' || ch == '+' || ch == ',') { - break; - } - position += 1; - } - } - if (position == start) { - throwError("empty string not allowed"); - } - result = ruleString.substr(start, position - start); - } - - /** - * Parse an integer of the form [0-9]+ and return it. - */ - int64_t parseNumber() { - if (position >= length) { - throwError("missing number"); - } - int64_t result = 0; - while (position < length) { - char ch = ruleString[position]; - if (isdigit(ch)) { - result = result * 10 + (ch - '0'); - position += 1; - } else { - break; - } - } - return result; - } - - /** - * Parse the offsets of the form: - * [-+]?[0-9]+(:[0-9]+(:[0-9]+)?)? 
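- * (for example "8" -> 28800 or "-5:30" -> -19800)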
- * and convert it into a number of seconds. - */ - int64_t parseOffset() { - int64_t scale = 3600; - bool isNegative = false; - if (position < length) { - char ch = ruleString[position]; - isNegative = ch == '-'; - if (ch == '-' || ch == '+') { - position += 1; - } - } - int64_t result = parseNumber() * scale; - while (position < length && scale > 1 && ruleString[position] == ':') { - scale /= 60; - position += 1; - result += parseNumber() * scale; - } - if (isNegative) { - result = -result; - } - return result; - } - - /** - * Parse a transition of the following form: - * ,(J<number>|<number>|M<number>.<number>.<number>)(/<offset>)? - */ - void parseTransition(Transition& transition) { - if (length - position < 2 || ruleString[position] != ',') { - throwError("missing transition"); - } - position += 1; - char ch = ruleString[position]; - if (ch == 'J') { - transition.kind = TRANSITION_JULIAN; - position += 1; - transition.day = parseNumber(); - } else if (ch == 'M') { - transition.kind = TRANSITION_MONTH; - position += 1; - transition.month = parseNumber(); - if (position == length || ruleString[position] != '.') { - throwError("missing first ."); - } - position += 1; - transition.week = parseNumber(); - if (position == length || ruleString[position] != '.') { - throwError("missing second ."); - } - position += 1; - transition.day = parseNumber(); - } else { - transition.kind = TRANSITION_DAY; - transition.day = parseNumber(); - } - if (position < length && ruleString[position] == '/') { - position += 1; - transition.time = parseOffset(); - } else { - transition.time = 2 * 60 * 60; - } - } - }; - - /** - * Parse the POSIX TZ string. - */ - std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString) { - std::shared_ptr<FutureRule> result(new FutureRuleImpl()); - FutureRuleParser parser(ruleString, - dynamic_cast<FutureRuleImpl*>(result.get())); - return result; - } - - std::string TimezoneVariant::toString() const { - std::stringstream buffer; - buffer << name << " " << gmtOffset; - if (isDst) { - buffer << " (dst)"; - } - return buffer.str(); - } - - /** - * An abstraction of the differences between versions. - */ - class VersionParser { - public: - virtual ~VersionParser(); - - /** - * Get the version number. - */ - virtual uint64_t getVersion() const = 0; - - /** - * Get the number of bytes - */ - virtual uint64_t getTimeSize() const = 0; - - /** - * Parse the time at the given location. - */ - virtual int64_t parseTime(const unsigned char* ptr) const = 0; - - /** - * Parse the future string - */ - virtual std::string parseFutureString(const unsigned char *ptr, - uint64_t offset, - uint64_t length) const = 0; - }; - - VersionParser::~VersionParser() { - // PASS - } - - static uint32_t decode32(const unsigned char* ptr) { - return static_cast<uint32_t>(ptr[0] << 24) | - static_cast<uint32_t>(ptr[1] << 16) | - static_cast<uint32_t>(ptr[2] << 8) | - static_cast<uint32_t>(ptr[3]); - } - - class Version1Parser: public VersionParser { - public: - virtual ~Version1Parser() override; - - virtual uint64_t getVersion() const override { - return 1; - } - - /** - * Get the number of bytes - */ - virtual uint64_t getTimeSize() const override { - return 4; - } - - /** - * Parse the time at the given location. 
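- * (Version 1 stores each transition time as a 4-byte big-endian
- * value, which is sign-extended to 64 bits here.)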
- */ - virtual int64_t parseTime(const unsigned char* ptr) const override { - // sign extend from 32 bits - return static_cast<int32_t>(decode32(ptr)); - } - - virtual std::string parseFutureString(const unsigned char *, - uint64_t, - uint64_t) const override { - return ""; - } - }; - - Version1Parser::~Version1Parser() { - // PASS - } - - class Version2Parser: public VersionParser { - public: - virtual ~Version2Parser() override; - - virtual uint64_t getVersion() const override { - return 2; - } - - /** - * Get the number of bytes - */ - virtual uint64_t getTimeSize() const override { - return 8; - } - - /** - * Parse the time at the given location. - */ - virtual int64_t parseTime(const unsigned char* ptr) const override { - return static_cast<int64_t>(decode32(ptr)) << 32 | decode32(ptr + 4); - } - - virtual std::string parseFutureString(const unsigned char *ptr, - uint64_t offset, - uint64_t length) const override { - return std::string(reinterpret_cast<const char*>(ptr) + offset + 1, - length - 2); - } - }; - - Version2Parser::~Version2Parser() { - // PASS - } - - class TimezoneImpl: public Timezone { - public: - TimezoneImpl(const std::string& name, - const std::vector<unsigned char> bytes); - virtual ~TimezoneImpl() override; - - /** - * Get the variant for the given time (time_t). - */ - const TimezoneVariant& getVariant(int64_t clk) const override; - - void print(std::ostream&) const override; - - uint64_t getVersion() const override { - return version; - } - - int64_t getEpoch() const override { - return epoch; - } - - int64_t convertToUTC(int64_t clk) const override { - return clk + getVariant(clk).gmtOffset; - } - - private: - void parseTimeVariants(const unsigned char* ptr, - uint64_t variantOffset, - uint64_t variantCount, - uint64_t nameOffset, - uint64_t nameCount); - void parseZoneFile(const unsigned char* ptr, - uint64_t sectionOffset, - uint64_t fileLength, - const VersionParser& version); - // filename - std::string filename; - - // the version of the file - uint64_t version; - - // the list of variants for this timezone - std::vector<TimezoneVariant> variants; - - // the list of the times where the local rules change - std::vector<int64_t> transitions; - - // the variant that starts at this transition. - std::vector<uint64_t> currentVariant; - - // the variant before the first transition - uint64_t ancientVariant; - - // the rule for future times - std::shared_ptr<FutureRule> futureRule; - - // the last explicit transition after which we use the future rule - int64_t lastTransition; - - // The ORC epoch time in this timezone. 
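- // (Illustrative: for UTC this is 1420070400, the Unix time of
- // 2015-01-01 00:00:00; the constructor subtracts the local variant's
- // gmtOffset from that UTC value for other zones.)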
- int64_t epoch; - }; - - DIAGNOSTIC_PUSH - #ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wglobal-constructors") - DIAGNOSTIC_IGNORE("-Wexit-time-destructors") - #endif - static std::mutex timezone_mutex; - static std::map<std::string, std::shared_ptr<Timezone> > timezoneCache; - DIAGNOSTIC_POP - - Timezone::~Timezone() { - // PASS - } - - TimezoneImpl::TimezoneImpl(const std::string& _filename, - const std::vector<unsigned char> buffer - ): filename(_filename) { - parseZoneFile(&buffer[0], 0, buffer.size(), Version1Parser()); - // Build the literal for the ORC epoch - // 2015 Jan 1 00:00:00 - tm epochStruct; - epochStruct.tm_sec = 0; - epochStruct.tm_min = 0; - epochStruct.tm_hour = 0; - epochStruct.tm_mday = 1; - epochStruct.tm_mon = 0; - epochStruct.tm_year = 2015 - 1900; - epochStruct.tm_isdst = 0; - time_t utcEpoch = timegm(&epochStruct); - epoch = utcEpoch - getVariant(utcEpoch).gmtOffset; - } - - const char* getTimezoneDirectory() { - const char *dir = getenv("TZDIR"); - if (!dir) { - dir = DEFAULT_TZDIR; - } - return dir; - } - - /** - * Get a timezone by absolute filename. - * Results are cached. - */ - const Timezone& getTimezoneByFilename(const std::string& filename) { - // ORC-110 - std::lock_guard<std::mutex> timezone_lock(timezone_mutex); - std::map<std::string, std::shared_ptr<Timezone> >::iterator itr = - timezoneCache.find(filename); - if (itr != timezoneCache.end()) { - return *(itr->second).get(); - } - try { - ORC_UNIQUE_PTR<InputStream> file = readFile(filename); - size_t size = static_cast<size_t>(file->getLength()); - std::vector<unsigned char> buffer(size); - file->read(&buffer[0], size, 0); - timezoneCache[filename] = std::shared_ptr<Timezone>(new TimezoneImpl(filename, buffer)); - } catch(ParseError& err) { - throw TimezoneError(err.what()); - } - return *timezoneCache[filename].get(); - } - - /** - * Get the local timezone. - */ - const Timezone& getLocalTimezone() { -#ifdef _MSC_VER - return getTimezoneByName("UTC"); -#else - return getTimezoneByFilename(LOCAL_TIMEZONE); -#endif - } - - /** - * Get a timezone by name (eg. America/Los_Angeles). - * Results are cached. - */ - const Timezone& getTimezoneByName(const std::string& zone) { - std::string filename(getTimezoneDirectory()); - filename += "/"; - filename += zone; - return getTimezoneByFilename(filename); - } - - /** - * Parse a set of bytes as a timezone file as if they came from filename. - */ - std::unique_ptr<Timezone> getTimezone(const std::string& filename, - const std::vector<unsigned char>& b){ - return std::unique_ptr<Timezone>(new TimezoneImpl(filename, b)); - } - - TimezoneImpl::~TimezoneImpl() { - // PASS - } - - void TimezoneImpl::parseTimeVariants(const unsigned char* ptr, - uint64_t variantOffset, - uint64_t variantCount, - uint64_t nameOffset, - uint64_t nameCount) { - for(uint64_t variant=0; variant < variantCount; ++variant) { - variants[variant].gmtOffset = - static_cast<int32_t>(decode32(ptr + variantOffset + 6 * variant)); - variants[variant].isDst = ptr[variantOffset + 6 * variant + 4] != 0; - uint64_t nameStart = ptr[variantOffset + 6 * variant + 5]; - if (nameStart >= nameCount) { - std::stringstream buffer; - buffer << "name out of range in variant " << variant - << " - " << nameStart << " >= " << nameCount; - throw TimezoneError(buffer.str()); - } - variants[variant].name = std::string(reinterpret_cast<const char*>(ptr) - + nameOffset + nameStart); - } - } - - /** - * Parse the zone file to get the bits we need. 
- * There are two versions of the timezone file: - * - * Version 1(version = 0x00): - * Magic(version) - * Header - * TransitionTimes(4 byte) - * TransitionRules - * Rules - * LeapSeconds(4 byte) - * IsStd - * IsGmt - * - * Version2: - * Version1(0x32) = a version 1 copy of the data for old clients - * Magic(0x32) - * Header - * TransitionTimes(8 byte) - * TransitionRules - * Rules - * LeapSeconds(8 byte) - * IsStd - * IsGmt - * FutureString - */ - void TimezoneImpl::parseZoneFile(const unsigned char *ptr, - uint64_t sectionOffset, - uint64_t fileLength, - const VersionParser& versionParser) { - const uint64_t magicOffset = sectionOffset + 0; - const uint64_t headerOffset = magicOffset + 20; - - // check for validity before we start parsing - if (fileLength < headerOffset + 6 * 4 || - strncmp(reinterpret_cast<const char*>(ptr) + magicOffset, "TZif", 4) - != 0) { - std::stringstream buffer; - buffer << "non-tzfile " << filename; - throw TimezoneError(buffer.str()); - } - - const uint64_t isGmtCount = decode32(ptr + headerOffset + 0); - const uint64_t isStdCount = decode32(ptr + headerOffset + 4); - const uint64_t leapCount = decode32(ptr + headerOffset + 8); - const uint64_t timeCount = decode32(ptr + headerOffset + 12); - const uint64_t variantCount = decode32(ptr + headerOffset + 16); - const uint64_t nameCount = decode32(ptr + headerOffset + 20); - - const uint64_t timeOffset = headerOffset + 24; - const uint64_t timeVariantOffset = - timeOffset + versionParser.getTimeSize() * timeCount; - const uint64_t variantOffset = timeVariantOffset + timeCount; - const uint64_t nameOffset = variantOffset + variantCount * 6; - const uint64_t sectionLength = nameOffset + nameCount - + (versionParser.getTimeSize() + 4) * leapCount - + isGmtCount + isStdCount; - - if (sectionLength > fileLength) { - std::stringstream buffer; - buffer << "tzfile too short " << filename - << " needs " << sectionLength << " and has " << fileLength; - throw TimezoneError(buffer.str()); - } - - // if it is version 2, skip over the old layout and read the new one. 
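- // (The byte after the "TZif" magic holds the version: '\0' for version 1
- // files and an ASCII digit such as '2' or '3' for newer ones, so a
- // non-zero value means a second, 64-bit section follows at sectionLength.)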
- if (sectionOffset == 0 && ptr[magicOffset + 4] != 0) { - parseZoneFile(ptr, sectionLength, fileLength, Version2Parser()); - return; - } - version = versionParser.getVersion(); - variants.resize(variantCount); - transitions.resize(timeCount); - currentVariant.resize(timeCount); - parseTimeVariants(ptr, variantOffset, variantCount, nameOffset, - nameCount); - bool foundAncient = false; - for(uint64_t t=0; t < timeCount; ++t) { - transitions[t] = - versionParser.parseTime(ptr + timeOffset + - t * versionParser.getTimeSize()); - currentVariant[t] = ptr[timeVariantOffset + t]; - if (currentVariant[t] >= variantCount) { - std::stringstream buffer; - buffer << "tzfile rule out of range " << filename - << " references rule " << currentVariant[t] - << " of " << variantCount; - throw TimezoneError(buffer.str()); - } - // find the oldest standard time and use that as the ancient value - if (!foundAncient && - !variants[currentVariant[t]].isDst) { - foundAncient = true; - ancientVariant = currentVariant[t]; - } - } - if (!foundAncient) { - ancientVariant = 0; - } - futureRule = parseFutureRule(versionParser.parseFutureString - (ptr, sectionLength, - fileLength - sectionLength)); - - // find the lower bound for applying the future rule - if (futureRule->isDefined()) { - if (timeCount > 0) { - lastTransition = transitions[timeCount - 1]; - } else { - lastTransition = INT64_MIN; - } - } else { - lastTransition = INT64_MAX; - } - } - - const TimezoneVariant& TimezoneImpl::getVariant(int64_t clk) const { - // if it is after the last explicit entry in the table, - // use the future rule to get an answer - if (clk > lastTransition) { - return futureRule->getVariant(clk); - } else { - int64_t transition = binarySearch(transitions, clk); - uint64_t idx; - if (transition < 0) { - idx = ancientVariant; - } else { - idx = currentVariant[static_cast<size_t>(transition)]; - } - return variants[idx]; - } - } - - void TimezoneImpl::print(std::ostream& out) const { - out << "Timezone file: " << filename << "\n"; - out << " Version: " << version << "\n"; - futureRule->print(out); - for(uint64_t r=0; r < variants.size(); ++r) { - out << " Variant " << r << ": " - << variants[r].toString() << "\n"; - } - for(uint64_t t=0; t < transitions.size(); ++t) { - tm timeStruct; - tm* result = nullptr; - char buffer[25]; - if (sizeof(time_t) >= 8) { - time_t val = transitions[t]; - result = gmtime_r(&val, &timeStruct); - if (result) { - strftime(buffer, sizeof(buffer), "%F %H:%M:%S", &timeStruct); - } - } - std::cout << " Transition: " << (result == nullptr ? "null" : buffer) - << " (" << transitions[t] << ") -> " - << variants[currentVariant[t]].name - << "\n"; - } - } - - TimezoneError::TimezoneError(const std::string& what - ): std::runtime_error(what) { - // PASS - } - - TimezoneError::TimezoneError(const TimezoneError& other - ): std::runtime_error(other) { - // PASS - } - - TimezoneError::~TimezoneError() ORC_NOEXCEPT { - // PASS - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/OrcFile.hh" +#include "Timezone.hh" + +#include <errno.h> +#include <map> +#include <sstream> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +namespace orc { + + // default location of the timezone files + static const char DEFAULT_TZDIR[] = "/usr/share/zoneinfo"; + + // location of a symlink to the local timezone + static const char LOCAL_TIMEZONE[] = "/etc/localtime"; + + enum TransitionKind { + TRANSITION_JULIAN, + TRANSITION_DAY, + TRANSITION_MONTH + }; + + static const int64_t MONTHS_PER_YEAR = 12; + /** + * The number of days in each month in non-leap and leap years. + */ + static const int64_t DAYS_PER_MONTH[2][MONTHS_PER_YEAR] = + {{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, + {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; + static const int64_t DAYS_PER_WEEK = 7; + + // Leap years and day of the week repeat every 400 years, which makes it + // a good cycle length. + static const int64_t SECONDS_PER_400_YEARS = + SECONDS_PER_DAY * (365 * (300 + 3) + 366 * (100 - 3)); + + /** + * Is the given year a leap year? + */ + bool isLeap(int64_t year) { + return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0)); + } + + /** + * Find the position that is the closest and less than or equal to the + * target. + * @return -1 if the target < array[0] or array is empty or + * i if array[i] <= target and (i == n or array[i] < array[i+1]) + */ + int64_t binarySearch(const std::vector<int64_t> &array, int64_t target) { + uint64_t size = array.size(); + if (size == 0) { + return -1; + } + uint64_t min = 0; + uint64_t max = size - 1; + uint64_t mid = (min + max) / 2; + while ((array[mid] != target) && (min < max)) { + if (array[mid] < target) { + min = mid + 1; + } else if (mid == 0) { + max = 0; + } else { + max = mid - 1; + } + mid = (min + max) / 2; + } + if (target < array[mid]) { + return static_cast<int64_t>(mid) - 1; + } else { + return static_cast<int64_t>(mid); + } + } + + struct Transition { + TransitionKind kind; + int64_t day; + int64_t week; + int64_t month; + int64_t time; + + std::string toString() const { + std::stringstream buffer; + switch (kind) { + case TRANSITION_JULIAN: + buffer << "julian " << day; + break; + case TRANSITION_DAY: + buffer << "day " << day; + break; + case TRANSITION_MONTH: + buffer << "month " << month << " week " << week << " day " << day; + break; + } + buffer << " at " << (time / (60 * 60)) << ":" << ((time / 60) % 60) + << ":" << (time % 60); + return buffer.str(); + } + + /** + * Get the transition time for the given year. + * @param year the year + * @return the number of seconds past local Jan 1 00:00:00 that the + * transition happens. + */ + int64_t getTime(int64_t year) const { + int64_t result = time; + switch (kind) { + case TRANSITION_JULIAN: + result += SECONDS_PER_DAY * day; + if (day > 60 && isLeap(year)) { + result += SECONDS_PER_DAY; + } + break; + case TRANSITION_DAY: + result += SECONDS_PER_DAY * day; + break; + case TRANSITION_MONTH: { + bool inLeap = isLeap(year); + int64_t adjustedMonth = (month + 9) % 12 + 1; + int64_t adjustedYear = (month <= 2) ? 
(year - 1) : year; + int64_t adjustedCentury = adjustedYear / 100; + int64_t adjustedRemainder = adjustedYear % 100; + + // day of the week of the first day of month + int64_t dayOfWeek = ((26 * adjustedMonth - 2) / 10 + + 1 + adjustedRemainder + adjustedRemainder / 4 + + adjustedCentury / 4 - 2 * adjustedCentury) % 7; + if (dayOfWeek < 0) { + dayOfWeek += DAYS_PER_WEEK; + } + + int64_t d = day - dayOfWeek; + if (d < 0) { + d += DAYS_PER_WEEK; + } + for (int w = 1; w < week; ++w) { + if (d + DAYS_PER_WEEK >= DAYS_PER_MONTH[inLeap][month - 1]) { + break; + } + d += DAYS_PER_WEEK; + } + result += d * SECONDS_PER_DAY; + + // Add in the time for the month + for(int m=0; m < month - 1; ++m) { + result += DAYS_PER_MONTH[inLeap][m] * SECONDS_PER_DAY; + } + break; + } + } + return result; + } + }; + + /** + * The current rule for finding timezone variants arbitrarily far in + * the future. They are based on a string representation that + * specifies the standard name and offset. For timezones with + * daylight savings, the string specifies the daylight variant name + * and offset and the rules for switching between them. + * + * rule = <standard name><standard offset><daylight>? + * name = string with no numbers or '+', '-', or ',' + * offset = [-+]?hh(:mm(:ss)?)? + * daylight = <name><offset>,<start day>(/<offset>)?,<end day>(/<offset>)? + * day = J<day without 2/29>|<day with 2/29>|M<month>.<week>.<day of week> + */ + class FutureRuleImpl: public FutureRule { + std::string ruleString; + TimezoneVariant standard; + bool hasDst; + TimezoneVariant dst; + Transition start; + Transition end; + + // expanded time_t offsets of transitions + std::vector<int64_t> offsets; + + // Is the epoch (1 Jan 1970 00:00) in standard time? + // This code assumes that the transition dates fall in the same order + // each year. Hopefully no timezone regions decide to move across the + // equator, which is about what it would take. + bool startInStd; + + void computeOffsets() { + if (!hasDst) { + startInStd = true; + offsets.resize(1); + } else { + // Insert a transition for the epoch and two per a year for the next + // 400 years. We assume that the all even positions are in standard + // time if and only if startInStd and the odd ones are the reverse. + offsets.resize(400 * 2 + 1); + startInStd = start.getTime(1970) < end.getTime(1970); + int64_t base = 0; + for(int64_t year = 1970; year < 1970 + 400; ++year) { + if (startInStd) { + offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] = + base + start.getTime(year) - standard.gmtOffset; + offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] = + base + end.getTime(year) - dst.gmtOffset; + } else { + offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] = + base + end.getTime(year) - dst.gmtOffset; + offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] = + base + start.getTime(year) - standard.gmtOffset; + } + base += (isLeap(year) ? 
366 : 365) * SECONDS_PER_DAY; + } + } + offsets[0] = 0; + } + + public: + virtual ~FutureRuleImpl() override; + bool isDefined() const override; + const TimezoneVariant& getVariant(int64_t clk) const override; + void print(std::ostream& out) const override; + + friend class FutureRuleParser; + }; + + FutureRule::~FutureRule() { + // PASS + } + + FutureRuleImpl::~FutureRuleImpl() { + // PASS + } + + bool FutureRuleImpl::isDefined() const { + return ruleString.size() > 0; + } + + const TimezoneVariant& FutureRuleImpl::getVariant(int64_t clk) const { + if (!hasDst) { + return standard; + } else { + int64_t adjusted = clk % SECONDS_PER_400_YEARS; + if (adjusted < 0) { + adjusted += SECONDS_PER_400_YEARS; + } + int64_t idx = binarySearch(offsets, adjusted); + if (startInStd == (idx % 2 == 0)) { + return standard; + } else { + return dst; + } + } + } + + void FutureRuleImpl::print(std::ostream& out) const { + if (isDefined()) { + out << " Future rule: " << ruleString << "\n"; + out << " standard " << standard.toString() << "\n"; + if (hasDst) { + out << " dst " << dst.toString() << "\n"; + out << " start " << start.toString() << "\n"; + out << " end " << end.toString() << "\n"; + } + } + } + + /** + * A parser for the future rule strings. + */ + class FutureRuleParser { + public: + FutureRuleParser(const std::string& str, + FutureRuleImpl* rule + ): ruleString(str), + length(str.size()), + position(0), + output(*rule) { + output.ruleString = str; + if (position != length) { + parseName(output.standard.name); + output.standard.gmtOffset = -parseOffset(); + output.standard.isDst = false; + output.hasDst = position < length; + if (output.hasDst) { + parseName(output.dst.name); + output.dst.isDst = true; + if (ruleString[position] != ',') { + output.dst.gmtOffset = -parseOffset(); + } else { + output.dst.gmtOffset = output.standard.gmtOffset + 60 * 60; + } + parseTransition(output.start); + parseTransition(output.end); + } + if (position != length) { + throwError("Extra text"); + } + output.computeOffsets(); + } + } + + private: + + const std::string& ruleString; + size_t length; + size_t position; + FutureRuleImpl &output; + + void throwError(const char *msg) { + std::stringstream buffer; + buffer << msg << " at " << position << " in '" << ruleString << "'"; + throw TimezoneError(buffer.str()); + } + + /** + * Parse the names of the form: + * ([^-+0-9,]+|<[^>]+>) + * and set the output string. + */ + void parseName(std::string& result) { + if (position == length) { + throwError("name required"); + } + size_t start = position; + if (ruleString[position] == '<') { + while (position < length && ruleString[position] != '>') { + position += 1; + } + if (position == length) { + throwError("missing close '>'"); + } + position +=1; + } else { + while (position < length) { + char ch = ruleString[position]; + if (isdigit(ch) || ch == '-' || ch == '+' || ch == ',') { + break; + } + position += 1; + } + } + if (position == start) { + throwError("empty string not allowed"); + } + result = ruleString.substr(start, position - start); + } + + /** + * Parse an integer of the form [0-9]+ and return it. + */ + int64_t parseNumber() { + if (position >= length) { + throwError("missing number"); + } + int64_t result = 0; + while (position < length) { + char ch = ruleString[position]; + if (isdigit(ch)) { + result = result * 10 + (ch - '0'); + position += 1; + } else { + break; + } + } + return result; + } + + /** + * Parse the offsets of the form: + * [-+]?[0-9]+(:[0-9]+(:[0-9]+)?)? 
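+ * (for example "8" -> 28800 or "-5:30" -> -19800)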
+ * and convert it into a number of seconds. + */ + int64_t parseOffset() { + int64_t scale = 3600; + bool isNegative = false; + if (position < length) { + char ch = ruleString[position]; + isNegative = ch == '-'; + if (ch == '-' || ch == '+') { + position += 1; + } + } + int64_t result = parseNumber() * scale; + while (position < length && scale > 1 && ruleString[position] == ':') { + scale /= 60; + position += 1; + result += parseNumber() * scale; + } + if (isNegative) { + result = -result; + } + return result; + } + + /** + * Parse a transition of the following form: + * ,(J<number>|<number>|M<number>.<number>.<number>)(/<offset>)? + */ + void parseTransition(Transition& transition) { + if (length - position < 2 || ruleString[position] != ',') { + throwError("missing transition"); + } + position += 1; + char ch = ruleString[position]; + if (ch == 'J') { + transition.kind = TRANSITION_JULIAN; + position += 1; + transition.day = parseNumber(); + } else if (ch == 'M') { + transition.kind = TRANSITION_MONTH; + position += 1; + transition.month = parseNumber(); + if (position == length || ruleString[position] != '.') { + throwError("missing first ."); + } + position += 1; + transition.week = parseNumber(); + if (position == length || ruleString[position] != '.') { + throwError("missing second ."); + } + position += 1; + transition.day = parseNumber(); + } else { + transition.kind = TRANSITION_DAY; + transition.day = parseNumber(); + } + if (position < length && ruleString[position] == '/') { + position += 1; + transition.time = parseOffset(); + } else { + transition.time = 2 * 60 * 60; + } + } + }; + + /** + * Parse the POSIX TZ string. + */ + std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString) { + std::shared_ptr<FutureRule> result(new FutureRuleImpl()); + FutureRuleParser parser(ruleString, + dynamic_cast<FutureRuleImpl*>(result.get())); + return result; + } + + std::string TimezoneVariant::toString() const { + std::stringstream buffer; + buffer << name << " " << gmtOffset; + if (isDst) { + buffer << " (dst)"; + } + return buffer.str(); + } + + /** + * An abstraction of the differences between versions. + */ + class VersionParser { + public: + virtual ~VersionParser(); + + /** + * Get the version number. + */ + virtual uint64_t getVersion() const = 0; + + /** + * Get the number of bytes + */ + virtual uint64_t getTimeSize() const = 0; + + /** + * Parse the time at the given location. + */ + virtual int64_t parseTime(const unsigned char* ptr) const = 0; + + /** + * Parse the future string + */ + virtual std::string parseFutureString(const unsigned char *ptr, + uint64_t offset, + uint64_t length) const = 0; + }; + + VersionParser::~VersionParser() { + // PASS + } + + static uint32_t decode32(const unsigned char* ptr) { + return static_cast<uint32_t>(ptr[0] << 24) | + static_cast<uint32_t>(ptr[1] << 16) | + static_cast<uint32_t>(ptr[2] << 8) | + static_cast<uint32_t>(ptr[3]); + } + + class Version1Parser: public VersionParser { + public: + virtual ~Version1Parser() override; + + virtual uint64_t getVersion() const override { + return 1; + } + + /** + * Get the number of bytes + */ + virtual uint64_t getTimeSize() const override { + return 4; + } + + /** + * Parse the time at the given location. 
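+ * (Version 1 stores each transition time as a 4-byte big-endian
+ * value, which is sign-extended to 64 bits here.)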
+ */ + virtual int64_t parseTime(const unsigned char* ptr) const override { + // sign extend from 32 bits + return static_cast<int32_t>(decode32(ptr)); + } + + virtual std::string parseFutureString(const unsigned char *, + uint64_t, + uint64_t) const override { + return ""; + } + }; + + Version1Parser::~Version1Parser() { + // PASS + } + + class Version2Parser: public VersionParser { + public: + virtual ~Version2Parser() override; + + virtual uint64_t getVersion() const override { + return 2; + } + + /** + * Get the number of bytes + */ + virtual uint64_t getTimeSize() const override { + return 8; + } + + /** + * Parse the time at the given location. + */ + virtual int64_t parseTime(const unsigned char* ptr) const override { + return static_cast<int64_t>(decode32(ptr)) << 32 | decode32(ptr + 4); + } + + virtual std::string parseFutureString(const unsigned char *ptr, + uint64_t offset, + uint64_t length) const override { + return std::string(reinterpret_cast<const char*>(ptr) + offset + 1, + length - 2); + } + }; + + Version2Parser::~Version2Parser() { + // PASS + } + + class TimezoneImpl: public Timezone { + public: + TimezoneImpl(const std::string& name, + const std::vector<unsigned char> bytes); + virtual ~TimezoneImpl() override; + + /** + * Get the variant for the given time (time_t). + */ + const TimezoneVariant& getVariant(int64_t clk) const override; + + void print(std::ostream&) const override; + + uint64_t getVersion() const override { + return version; + } + + int64_t getEpoch() const override { + return epoch; + } + + int64_t convertToUTC(int64_t clk) const override { + return clk + getVariant(clk).gmtOffset; + } + + private: + void parseTimeVariants(const unsigned char* ptr, + uint64_t variantOffset, + uint64_t variantCount, + uint64_t nameOffset, + uint64_t nameCount); + void parseZoneFile(const unsigned char* ptr, + uint64_t sectionOffset, + uint64_t fileLength, + const VersionParser& version); + // filename + std::string filename; + + // the version of the file + uint64_t version; + + // the list of variants for this timezone + std::vector<TimezoneVariant> variants; + + // the list of the times where the local rules change + std::vector<int64_t> transitions; + + // the variant that starts at this transition. + std::vector<uint64_t> currentVariant; + + // the variant before the first transition + uint64_t ancientVariant; + + // the rule for future times + std::shared_ptr<FutureRule> futureRule; + + // the last explicit transition after which we use the future rule + int64_t lastTransition; + + // The ORC epoch time in this timezone. 
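+ // (Illustrative: for UTC this is 1420070400, the Unix time of
+ // 2015-01-01 00:00:00; the constructor subtracts the local variant's
+ // gmtOffset from that UTC value for other zones.)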
+ int64_t epoch; + }; + + DIAGNOSTIC_PUSH + #ifdef __clang__ + DIAGNOSTIC_IGNORE("-Wglobal-constructors") + DIAGNOSTIC_IGNORE("-Wexit-time-destructors") + #endif + static std::mutex timezone_mutex; + static std::map<std::string, std::shared_ptr<Timezone> > timezoneCache; + DIAGNOSTIC_POP + + Timezone::~Timezone() { + // PASS + } + + TimezoneImpl::TimezoneImpl(const std::string& _filename, + const std::vector<unsigned char> buffer + ): filename(_filename) { + parseZoneFile(&buffer[0], 0, buffer.size(), Version1Parser()); + // Build the literal for the ORC epoch + // 2015 Jan 1 00:00:00 + tm epochStruct; + epochStruct.tm_sec = 0; + epochStruct.tm_min = 0; + epochStruct.tm_hour = 0; + epochStruct.tm_mday = 1; + epochStruct.tm_mon = 0; + epochStruct.tm_year = 2015 - 1900; + epochStruct.tm_isdst = 0; + time_t utcEpoch = timegm(&epochStruct); + epoch = utcEpoch - getVariant(utcEpoch).gmtOffset; + } + + const char* getTimezoneDirectory() { + const char *dir = getenv("TZDIR"); + if (!dir) { + dir = DEFAULT_TZDIR; + } + return dir; + } + + /** + * Get a timezone by absolute filename. + * Results are cached. + */ + const Timezone& getTimezoneByFilename(const std::string& filename) { + // ORC-110 + std::lock_guard<std::mutex> timezone_lock(timezone_mutex); + std::map<std::string, std::shared_ptr<Timezone> >::iterator itr = + timezoneCache.find(filename); + if (itr != timezoneCache.end()) { + return *(itr->second).get(); + } + try { + ORC_UNIQUE_PTR<InputStream> file = readFile(filename); + size_t size = static_cast<size_t>(file->getLength()); + std::vector<unsigned char> buffer(size); + file->read(&buffer[0], size, 0); + timezoneCache[filename] = std::shared_ptr<Timezone>(new TimezoneImpl(filename, buffer)); + } catch(ParseError& err) { + throw TimezoneError(err.what()); + } + return *timezoneCache[filename].get(); + } + + /** + * Get the local timezone. + */ + const Timezone& getLocalTimezone() { +#ifdef _MSC_VER + return getTimezoneByName("UTC"); +#else + return getTimezoneByFilename(LOCAL_TIMEZONE); +#endif + } + + /** + * Get a timezone by name (eg. America/Los_Angeles). + * Results are cached. + */ + const Timezone& getTimezoneByName(const std::string& zone) { + std::string filename(getTimezoneDirectory()); + filename += "/"; + filename += zone; + return getTimezoneByFilename(filename); + } + + /** + * Parse a set of bytes as a timezone file as if they came from filename. + */ + std::unique_ptr<Timezone> getTimezone(const std::string& filename, + const std::vector<unsigned char>& b){ + return std::unique_ptr<Timezone>(new TimezoneImpl(filename, b)); + } + + TimezoneImpl::~TimezoneImpl() { + // PASS + } + + void TimezoneImpl::parseTimeVariants(const unsigned char* ptr, + uint64_t variantOffset, + uint64_t variantCount, + uint64_t nameOffset, + uint64_t nameCount) { + for(uint64_t variant=0; variant < variantCount; ++variant) { + variants[variant].gmtOffset = + static_cast<int32_t>(decode32(ptr + variantOffset + 6 * variant)); + variants[variant].isDst = ptr[variantOffset + 6 * variant + 4] != 0; + uint64_t nameStart = ptr[variantOffset + 6 * variant + 5]; + if (nameStart >= nameCount) { + std::stringstream buffer; + buffer << "name out of range in variant " << variant + << " - " << nameStart << " >= " << nameCount; + throw TimezoneError(buffer.str()); + } + variants[variant].name = std::string(reinterpret_cast<const char*>(ptr) + + nameOffset + nameStart); + } + } + + /** + * Parse the zone file to get the bits we need. 
+ * There are two versions of the timezone file: + * + * Version 1(version = 0x00): + * Magic(version) + * Header + * TransitionTimes(4 byte) + * TransitionRules + * Rules + * LeapSeconds(4 byte) + * IsStd + * IsGmt + * + * Version2: + * Version1(0x32) = a version 1 copy of the data for old clients + * Magic(0x32) + * Header + * TransitionTimes(8 byte) + * TransitionRules + * Rules + * LeapSeconds(8 byte) + * IsStd + * IsGmt + * FutureString + */ + void TimezoneImpl::parseZoneFile(const unsigned char *ptr, + uint64_t sectionOffset, + uint64_t fileLength, + const VersionParser& versionParser) { + const uint64_t magicOffset = sectionOffset + 0; + const uint64_t headerOffset = magicOffset + 20; + + // check for validity before we start parsing + if (fileLength < headerOffset + 6 * 4 || + strncmp(reinterpret_cast<const char*>(ptr) + magicOffset, "TZif", 4) + != 0) { + std::stringstream buffer; + buffer << "non-tzfile " << filename; + throw TimezoneError(buffer.str()); + } + + const uint64_t isGmtCount = decode32(ptr + headerOffset + 0); + const uint64_t isStdCount = decode32(ptr + headerOffset + 4); + const uint64_t leapCount = decode32(ptr + headerOffset + 8); + const uint64_t timeCount = decode32(ptr + headerOffset + 12); + const uint64_t variantCount = decode32(ptr + headerOffset + 16); + const uint64_t nameCount = decode32(ptr + headerOffset + 20); + + const uint64_t timeOffset = headerOffset + 24; + const uint64_t timeVariantOffset = + timeOffset + versionParser.getTimeSize() * timeCount; + const uint64_t variantOffset = timeVariantOffset + timeCount; + const uint64_t nameOffset = variantOffset + variantCount * 6; + const uint64_t sectionLength = nameOffset + nameCount + + (versionParser.getTimeSize() + 4) * leapCount + + isGmtCount + isStdCount; + + if (sectionLength > fileLength) { + std::stringstream buffer; + buffer << "tzfile too short " << filename + << " needs " << sectionLength << " and has " << fileLength; + throw TimezoneError(buffer.str()); + } + + // if it is version 2, skip over the old layout and read the new one. 
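+ // (The byte after the "TZif" magic holds the version: '\0' for version 1
+ // files and an ASCII digit such as '2' or '3' for newer ones, so a
+ // non-zero value means a second, 64-bit section follows at sectionLength.)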
+ if (sectionOffset == 0 && ptr[magicOffset + 4] != 0) { + parseZoneFile(ptr, sectionLength, fileLength, Version2Parser()); + return; + } + version = versionParser.getVersion(); + variants.resize(variantCount); + transitions.resize(timeCount); + currentVariant.resize(timeCount); + parseTimeVariants(ptr, variantOffset, variantCount, nameOffset, + nameCount); + bool foundAncient = false; + for(uint64_t t=0; t < timeCount; ++t) { + transitions[t] = + versionParser.parseTime(ptr + timeOffset + + t * versionParser.getTimeSize()); + currentVariant[t] = ptr[timeVariantOffset + t]; + if (currentVariant[t] >= variantCount) { + std::stringstream buffer; + buffer << "tzfile rule out of range " << filename + << " references rule " << currentVariant[t] + << " of " << variantCount; + throw TimezoneError(buffer.str()); + } + // find the oldest standard time and use that as the ancient value + if (!foundAncient && + !variants[currentVariant[t]].isDst) { + foundAncient = true; + ancientVariant = currentVariant[t]; + } + } + if (!foundAncient) { + ancientVariant = 0; + } + futureRule = parseFutureRule(versionParser.parseFutureString + (ptr, sectionLength, + fileLength - sectionLength)); + + // find the lower bound for applying the future rule + if (futureRule->isDefined()) { + if (timeCount > 0) { + lastTransition = transitions[timeCount - 1]; + } else { + lastTransition = INT64_MIN; + } + } else { + lastTransition = INT64_MAX; + } + } + + const TimezoneVariant& TimezoneImpl::getVariant(int64_t clk) const { + // if it is after the last explicit entry in the table, + // use the future rule to get an answer + if (clk > lastTransition) { + return futureRule->getVariant(clk); + } else { + int64_t transition = binarySearch(transitions, clk); + uint64_t idx; + if (transition < 0) { + idx = ancientVariant; + } else { + idx = currentVariant[static_cast<size_t>(transition)]; + } + return variants[idx]; + } + } + + void TimezoneImpl::print(std::ostream& out) const { + out << "Timezone file: " << filename << "\n"; + out << " Version: " << version << "\n"; + futureRule->print(out); + for(uint64_t r=0; r < variants.size(); ++r) { + out << " Variant " << r << ": " + << variants[r].toString() << "\n"; + } + for(uint64_t t=0; t < transitions.size(); ++t) { + tm timeStruct; + tm* result = nullptr; + char buffer[25]; + if (sizeof(time_t) >= 8) { + time_t val = transitions[t]; + result = gmtime_r(&val, &timeStruct); + if (result) { + strftime(buffer, sizeof(buffer), "%F %H:%M:%S", &timeStruct); + } + } + std::cout << " Transition: " << (result == nullptr ? "null" : buffer) + << " (" << transitions[t] << ") -> " + << variants[currentVariant[t]].name + << "\n"; + } + } + + TimezoneError::TimezoneError(const std::string& what + ): std::runtime_error(what) { + // PASS + } + + TimezoneError::TimezoneError(const TimezoneError& other + ): std::runtime_error(other) { + // PASS + } + + TimezoneError::~TimezoneError() ORC_NOEXCEPT { + // PASS + } + +} diff --git a/contrib/libs/apache/orc/c++/src/Timezone.hh b/contrib/libs/apache/orc/c++/src/Timezone.hh index 136b7a18b7..6bcb6586d0 100644 --- a/contrib/libs/apache/orc/c++/src/Timezone.hh +++ b/contrib/libs/apache/orc/c++/src/Timezone.hh @@ -1,130 +1,130 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef TIMEZONE_HH -#define TIMEZONE_HH - -// This file is for timezone routines. - -#include "Adaptor.hh" - -#include <memory> -#include <stdexcept> -#include <stdint.h> -#include <string> -#include <vector> - -namespace orc { - - static const int64_t SECONDS_PER_HOUR = 60 * 60; - static const int64_t SECONDS_PER_DAY = SECONDS_PER_HOUR * 24; - - /** - * A variant (eg. PST or PDT) of a timezone (eg. America/Los_Angeles). - */ - struct TimezoneVariant { - int64_t gmtOffset; - bool isDst; - std::string name; - - std::string toString() const; - }; - - /** - * A region that shares the same legal rules for wall clock time and - * day light savings transitions. They are typically named for the largest - * city in the region (eg. America/Los_Angeles or America/Mexico_City). - */ - class Timezone { - public: - virtual ~Timezone(); - - /** - * Get the variant for the given time (time_t). - */ - virtual const TimezoneVariant& getVariant(int64_t clk) const = 0; - - /** - * Get the number of seconds between the ORC epoch in this timezone - * and Unix epoch. - * ORC epoch is 1 Jan 2015 00:00:00 local. - * Unix epoch is 1 Jan 1970 00:00:00 UTC. - */ - virtual int64_t getEpoch() const = 0; - - /** - * Print the timezone to the stream. - */ - virtual void print(std::ostream&) const = 0; - - /** - * Get the version of the zone file. - */ - virtual uint64_t getVersion() const =0; - - /** - * Convert wall clock time of current timezone to UTC timezone - */ - virtual int64_t convertToUTC(int64_t clk) const = 0; - }; - - /** - * Get the local timezone. - * Results are cached. - */ - const Timezone& getLocalTimezone(); - - /** - * Get a timezone by name (eg. America/Los_Angeles). - * Results are cached. - */ - const Timezone& getTimezoneByName(const std::string& zone); - - /** - * Parse a set of bytes as a timezone file as if they came from filename. - */ - std::unique_ptr<Timezone> getTimezone(const std::string& filename, - const std::vector<unsigned char>& b); - - class TimezoneError: public std::runtime_error { - public: - TimezoneError(const std::string& what); - TimezoneError(const TimezoneError&); - virtual ~TimezoneError() ORC_NOEXCEPT; - }; - - /** - * Represents the parsed POSIX timezone rule strings that are used to - * describe the future transitions, because they can go arbitrarily far into - * the future. - */ - class FutureRule { - public: - virtual ~FutureRule(); - virtual bool isDefined() const = 0; - virtual const TimezoneVariant& getVariant(int64_t clk) const = 0; - virtual void print(std::ostream& out) const = 0; - }; - - /** - * Parse the POSIX TZ string. - */ - std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TIMEZONE_HH +#define TIMEZONE_HH + +// This file is for timezone routines. + +#include "Adaptor.hh" + +#include <memory> +#include <stdexcept> +#include <stdint.h> +#include <string> +#include <vector> + +namespace orc { + + static const int64_t SECONDS_PER_HOUR = 60 * 60; + static const int64_t SECONDS_PER_DAY = SECONDS_PER_HOUR * 24; + + /** + * A variant (eg. PST or PDT) of a timezone (eg. America/Los_Angeles). + */ + struct TimezoneVariant { + int64_t gmtOffset; + bool isDst; + std::string name; + + std::string toString() const; + }; + + /** + * A region that shares the same legal rules for wall clock time and + * day light savings transitions. They are typically named for the largest + * city in the region (eg. America/Los_Angeles or America/Mexico_City). + */ + class Timezone { + public: + virtual ~Timezone(); + + /** + * Get the variant for the given time (time_t). + */ + virtual const TimezoneVariant& getVariant(int64_t clk) const = 0; + + /** + * Get the number of seconds between the ORC epoch in this timezone + * and Unix epoch. + * ORC epoch is 1 Jan 2015 00:00:00 local. + * Unix epoch is 1 Jan 1970 00:00:00 UTC. + */ + virtual int64_t getEpoch() const = 0; + + /** + * Print the timezone to the stream. + */ + virtual void print(std::ostream&) const = 0; + + /** + * Get the version of the zone file. + */ + virtual uint64_t getVersion() const =0; + + /** + * Convert wall clock time of current timezone to UTC timezone + */ + virtual int64_t convertToUTC(int64_t clk) const = 0; + }; + + /** + * Get the local timezone. + * Results are cached. + */ + const Timezone& getLocalTimezone(); + + /** + * Get a timezone by name (eg. America/Los_Angeles). + * Results are cached. + */ + const Timezone& getTimezoneByName(const std::string& zone); + + /** + * Parse a set of bytes as a timezone file as if they came from filename. + */ + std::unique_ptr<Timezone> getTimezone(const std::string& filename, + const std::vector<unsigned char>& b); + + class TimezoneError: public std::runtime_error { + public: + TimezoneError(const std::string& what); + TimezoneError(const TimezoneError&); + virtual ~TimezoneError() ORC_NOEXCEPT; + }; + + /** + * Represents the parsed POSIX timezone rule strings that are used to + * describe the future transitions, because they can go arbitrarily far into + * the future. + */ + class FutureRule { + public: + virtual ~FutureRule(); + virtual bool isDefined() const = 0; + virtual const TimezoneVariant& getVariant(int64_t clk) const = 0; + virtual void print(std::ostream& out) const = 0; + }; + + /** + * Parse the POSIX TZ string. 
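+ * Illustrative example (the rule string is a common POSIX value, not
+ * one taken from this header):
+ * parseFutureRule("PST8PDT,M3.2.0,M11.1.0")
+ * returns a rule whose getVariant() reports PST (gmtOffset -28800)
+ * outside daylight time and PDT (gmtOffset -25200) between the second
+ * Sunday of March and the first Sunday of November.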
+ */ + std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.cc b/contrib/libs/apache/orc/c++/src/TypeImpl.cc index c154f2af04..78a0e00686 100644 --- a/contrib/libs/apache/orc/c++/src/TypeImpl.cc +++ b/contrib/libs/apache/orc/c++/src/TypeImpl.cc @@ -1,707 +1,707 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Adaptor.hh" -#include "orc/Exceptions.hh" -#include "TypeImpl.hh" - -#include <iostream> -#include <sstream> - -namespace orc { - - Type::~Type() { - // PASS - } - - TypeImpl::TypeImpl(TypeKind _kind) { - parent = nullptr; - columnId = -1; - maximumColumnId = -1; - kind = _kind; - maxLength = 0; - precision = 0; - scale = 0; - subtypeCount = 0; - } - - TypeImpl::TypeImpl(TypeKind _kind, uint64_t _maxLength) { - parent = nullptr; - columnId = -1; - maximumColumnId = -1; - kind = _kind; - maxLength = _maxLength; - precision = 0; - scale = 0; - subtypeCount = 0; - } - - TypeImpl::TypeImpl(TypeKind _kind, uint64_t _precision, - uint64_t _scale) { - parent = nullptr; - columnId = -1; - maximumColumnId = -1; - kind = _kind; - maxLength = 0; - precision = _precision; - scale = _scale; - subtypeCount = 0; - } - - uint64_t TypeImpl::assignIds(uint64_t root) const { - columnId = static_cast<int64_t>(root); - uint64_t current = root + 1; - for(uint64_t i=0; i < subtypeCount; ++i) { - current = dynamic_cast<TypeImpl*>(subTypes[i])->assignIds(current); - } - maximumColumnId = static_cast<int64_t>(current) - 1; - return current; - } - - TypeImpl::~TypeImpl() { - for (std::vector<Type*>::iterator it = subTypes.begin(); - it != subTypes.end(); it++) { - delete (*it) ; - } - } - - void TypeImpl::ensureIdAssigned() const { - if (columnId == -1) { - const TypeImpl* root = this; - while (root->parent != nullptr) { - root = root->parent; - } - root->assignIds(0); - } - } - - uint64_t TypeImpl::getColumnId() const { - ensureIdAssigned(); - return static_cast<uint64_t>(columnId); - } - - uint64_t TypeImpl::getMaximumColumnId() const { - ensureIdAssigned(); - return static_cast<uint64_t>(maximumColumnId); - } - - TypeKind TypeImpl::getKind() const { - return kind; - } - - uint64_t TypeImpl::getSubtypeCount() const { - return subtypeCount; - } - - const Type* TypeImpl::getSubtype(uint64_t i) const { - return subTypes[i]; - } - - const std::string& TypeImpl::getFieldName(uint64_t i) const { - return fieldNames[i]; - } - - uint64_t TypeImpl::getMaximumLength() const { - return maxLength; - } - - uint64_t TypeImpl::getPrecision() const { - return precision; - } - - uint64_t TypeImpl::getScale() const { - return scale; - } - - void TypeImpl::setIds(uint64_t _columnId, uint64_t _maxColumnId) { - columnId = static_cast<int64_t>(_columnId); - maximumColumnId 
= static_cast<int64_t>(_maxColumnId); - } - - void TypeImpl::addChildType(std::unique_ptr<Type> childType) { - TypeImpl* child = dynamic_cast<TypeImpl*>(childType.release()); - subTypes.push_back(child); - if (child != nullptr) { - child->parent = this; - } - subtypeCount += 1; - } - - Type* TypeImpl::addStructField(const std::string& fieldName, - std::unique_ptr<Type> fieldType) { - addChildType(std::move(fieldType)); - fieldNames.push_back(fieldName); - return this; - } - - Type* TypeImpl::addUnionChild(std::unique_ptr<Type> fieldType) { - addChildType(std::move(fieldType)); - return this; - } - - std::string TypeImpl::toString() const { - switch (static_cast<int64_t>(kind)) { - case BOOLEAN: - return "boolean"; - case BYTE: - return "tinyint"; - case SHORT: - return "smallint"; - case INT: - return "int"; - case LONG: - return "bigint"; - case FLOAT: - return "float"; - case DOUBLE: - return "double"; - case STRING: - return "string"; - case BINARY: - return "binary"; - case TIMESTAMP: - return "timestamp"; - case LIST: - return "array<" + (subTypes[0] ? subTypes[0]->toString() : "void") + ">"; - case MAP: - return "map<" + (subTypes[0] ? subTypes[0]->toString() : "void") + "," + - (subTypes[1] ? subTypes[1]->toString() : "void") + ">"; - case STRUCT: { - std::string result = "struct<"; - for(size_t i=0; i < subTypes.size(); ++i) { - if (i != 0) { - result += ","; - } - result += fieldNames[i]; - result += ":"; - result += subTypes[i]->toString(); - } - result += ">"; - return result; - } - case UNION: { - std::string result = "uniontype<"; - for(size_t i=0; i < subTypes.size(); ++i) { - if (i != 0) { - result += ","; - } - result += subTypes[i]->toString(); - } - result += ">"; - return result; - } - case DECIMAL: { - std::stringstream result; - result << "decimal(" << precision << "," << scale << ")"; - return result.str(); - } - case DATE: - return "date"; - case VARCHAR: { - std::stringstream result; - result << "varchar(" << maxLength << ")"; - return result.str(); - } - case CHAR: { - std::stringstream result; - result << "char(" << maxLength << ")"; - return result.str(); - } - default: - throw NotImplementedYet("Unknown type"); - } - } - - std::unique_ptr<ColumnVectorBatch> - TypeImpl::createRowBatch(uint64_t capacity, - MemoryPool& memoryPool, - bool encoded) const { - switch (static_cast<int64_t>(kind)) { - case BOOLEAN: - case BYTE: - case SHORT: - case INT: - case LONG: - case DATE: - return std::unique_ptr<ColumnVectorBatch> - (new LongVectorBatch(capacity, memoryPool)); - - case FLOAT: - case DOUBLE: - return std::unique_ptr<ColumnVectorBatch> - (new DoubleVectorBatch(capacity, memoryPool)); - - case STRING: - case BINARY: - case CHAR: - case VARCHAR: - return encoded ? 
- std::unique_ptr<ColumnVectorBatch> - (new EncodedStringVectorBatch(capacity, memoryPool)) - : std::unique_ptr<ColumnVectorBatch> - (new StringVectorBatch(capacity, memoryPool)); - - case TIMESTAMP: - return std::unique_ptr<ColumnVectorBatch> - (new TimestampVectorBatch(capacity, memoryPool)); - - case STRUCT: { - StructVectorBatch *result = new StructVectorBatch(capacity, memoryPool); - std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); - for(uint64_t i=0; i < getSubtypeCount(); ++i) { - result->fields.push_back(getSubtype(i)-> - createRowBatch(capacity, - memoryPool, encoded).release()); - } - return return_value; - } - - case LIST: { - ListVectorBatch* result = new ListVectorBatch(capacity, memoryPool); - std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); - if (getSubtype(0) != nullptr) { - result->elements = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded); - } - return return_value; - } - - case MAP: { - MapVectorBatch* result = new MapVectorBatch(capacity, memoryPool); - std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); - if (getSubtype(0) != nullptr) { - result->keys = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded); - } - if (getSubtype(1) != nullptr) { - result->elements = getSubtype(1)->createRowBatch(capacity, memoryPool, encoded); - } - return return_value; - } - - case DECIMAL: { - if (getPrecision() == 0 || getPrecision() > 18) { - return std::unique_ptr<ColumnVectorBatch> - (new Decimal128VectorBatch(capacity, memoryPool)); - } else { - return std::unique_ptr<ColumnVectorBatch> - (new Decimal64VectorBatch(capacity, memoryPool)); - } - } - - case UNION: { - UnionVectorBatch *result = new UnionVectorBatch(capacity, memoryPool); - std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); - for(uint64_t i=0; i < getSubtypeCount(); ++i) { - result->children.push_back(getSubtype(i)->createRowBatch(capacity, - memoryPool, encoded) - .release()); - } - return return_value; - } - - default: - throw NotImplementedYet("not supported yet"); - } - } - - std::unique_ptr<Type> createPrimitiveType(TypeKind kind) { - return std::unique_ptr<Type>(new TypeImpl(kind)); - } - - std::unique_ptr<Type> createCharType(TypeKind kind, - uint64_t maxLength) { - return std::unique_ptr<Type>(new TypeImpl(kind, maxLength)); - } - - std::unique_ptr<Type> createDecimalType(uint64_t precision, - uint64_t scale) { - return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale)); - } - - std::unique_ptr<Type> createStructType() { - return std::unique_ptr<Type>(new TypeImpl(STRUCT)); - } - - std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements) { - TypeImpl* result = new TypeImpl(LIST); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); - result->addChildType(std::move(elements)); - return return_value; - } - - std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key, - std::unique_ptr<Type> value) { - TypeImpl* result = new TypeImpl(MAP); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); - result->addChildType(std::move(key)); - result->addChildType(std::move(value)); - return return_value; - } - - std::unique_ptr<Type> createUnionType() { - return std::unique_ptr<Type>(new TypeImpl(UNION)); - } - - std::string printProtobufMessage(const google::protobuf::Message& message); - std::unique_ptr<Type> convertType(const proto::Type& type, - const proto::Footer& 
footer) { - switch (static_cast<int64_t>(type.kind())) { - - case proto::Type_Kind_BOOLEAN: - case proto::Type_Kind_BYTE: - case proto::Type_Kind_SHORT: - case proto::Type_Kind_INT: - case proto::Type_Kind_LONG: - case proto::Type_Kind_FLOAT: - case proto::Type_Kind_DOUBLE: - case proto::Type_Kind_STRING: - case proto::Type_Kind_BINARY: - case proto::Type_Kind_TIMESTAMP: - case proto::Type_Kind_DATE: - return std::unique_ptr<Type> - (new TypeImpl(static_cast<TypeKind>(type.kind()))); - - case proto::Type_Kind_CHAR: - case proto::Type_Kind_VARCHAR: - return std::unique_ptr<Type> - (new TypeImpl(static_cast<TypeKind>(type.kind()), - type.maximumlength())); - - case proto::Type_Kind_DECIMAL: - return std::unique_ptr<Type> - (new TypeImpl(DECIMAL, type.precision(), type.scale())); - - case proto::Type_Kind_LIST: - case proto::Type_Kind_MAP: - case proto::Type_Kind_UNION: { - TypeImpl* result = new TypeImpl(static_cast<TypeKind>(type.kind())); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); - if (type.kind() == proto::Type_Kind_LIST && type.subtypes_size() != 1) - throw ParseError("Illegal LIST type that doesn't contain one subtype"); - if (type.kind() == proto::Type_Kind_MAP && type.subtypes_size() != 2) - throw ParseError("Illegal MAP type that doesn't contain two subtypes"); - if (type.kind() == proto::Type_Kind_UNION && type.subtypes_size() == 0) - throw ParseError("Illegal UNION type that doesn't contain any subtypes"); - for(int i=0; i < type.subtypes_size(); ++i) { - result->addUnionChild(convertType(footer.types(static_cast<int> - (type.subtypes(i))), - footer)); - } - return return_value; - } - - case proto::Type_Kind_STRUCT: { - TypeImpl* result = new TypeImpl(STRUCT); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); - for(int i=0; i < type.subtypes_size(); ++i) { - result->addStructField(type.fieldnames(i), - convertType(footer.types(static_cast<int> - (type.subtypes(i))), - footer)); - } - return return_value; - } - default: - throw NotImplementedYet("Unknown type kind"); - } - } - - /** - * Build a clone of the file type, projecting columns from the selected - * vector. This routine assumes that the parent of any selected column - * is also selected. The column ids are copied from the fileType. 
- * @param fileType the type in the file - * @param selected is each column by id selected - * @return a clone of the fileType filtered by the selection array - */ - std::unique_ptr<Type> buildSelectedType(const Type *fileType, - const std::vector<bool>& selected) { - if (fileType == nullptr || !selected[fileType->getColumnId()]) { - return std::unique_ptr<Type>(); - } - - TypeImpl* result; - switch (static_cast<int>(fileType->getKind())) { - case BOOLEAN: - case BYTE: - case SHORT: - case INT: - case LONG: - case FLOAT: - case DOUBLE: - case STRING: - case BINARY: - case TIMESTAMP: - case DATE: - result = new TypeImpl(fileType->getKind()); - break; - - case DECIMAL: - result= new TypeImpl(fileType->getKind(), - fileType->getPrecision(), fileType->getScale()); - break; - - case VARCHAR: - case CHAR: - result = new TypeImpl(fileType->getKind(), fileType->getMaximumLength()); - break; - - case LIST: - result = new TypeImpl(fileType->getKind()); - result->addChildType(buildSelectedType(fileType->getSubtype(0), - selected)); - break; - - case MAP: - result = new TypeImpl(fileType->getKind()); - result->addChildType(buildSelectedType(fileType->getSubtype(0), - selected)); - result->addChildType(buildSelectedType(fileType->getSubtype(1), - selected)); - break; - - case STRUCT: { - result = new TypeImpl(fileType->getKind()); - for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) { - std::unique_ptr<Type> childType = - buildSelectedType(fileType->getSubtype(child), selected); - if (childType.get() != nullptr) { - result->addStructField(fileType->getFieldName(child), - std::move(childType)); - } - } - break; - } - - case UNION: { - result = new TypeImpl(fileType->getKind()); - for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) { - std::unique_ptr<Type> childType = - buildSelectedType(fileType->getSubtype(child), selected); - if (childType.get() != nullptr) { - result->addUnionChild(std::move(childType)); - } - } - break; - } - - default: - throw NotImplementedYet("Unknown type kind"); - } - result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId()); - return std::unique_ptr<Type>(result); - } - - ORC_UNIQUE_PTR<Type> Type::buildTypeFromString(const std::string& input) { - std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > res = - TypeImpl::parseType(input, 0, input.size()); - if (res.size() != 1) { - throw std::logic_error("Invalid type string."); - } - return std::move(res[0].second); - } - - std::unique_ptr<Type> TypeImpl::parseArrayType(const std::string &input, - size_t start, - size_t end) { - TypeImpl* arrayType = new TypeImpl(LIST); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(arrayType); - std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v = - TypeImpl::parseType(input, start, end); - if (v.size() != 1) { - throw std::logic_error("Array type must contain exactly one sub type."); - } - arrayType->addChildType(std::move(v[0].second)); - return return_value; - } - - std::unique_ptr<Type> TypeImpl::parseMapType(const std::string &input, - size_t start, - size_t end) { - TypeImpl * mapType = new TypeImpl(MAP); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(mapType); - std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v = - TypeImpl::parseType(input, start, end); - if (v.size() != 2) { - throw std::logic_error( - "Map type must contain exactly two sub types."); - } - mapType->addChildType(std::move(v[0].second)); - mapType->addChildType(std::move(v[1].second)); - return return_value; - } - - 
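The string parser in this file (Type::buildTypeFromString dispatching to TypeImpl::parseType and TypeImpl::parseCategory) accepts Hive-style type names such as struct<a:int>. A minimal, illustrative usage sketch follows; it is not part of this change and uses only the factory functions and methods defined in this file:

#include <iostream>
#include <memory>
#include "orc/Type.hh"

int main() {
  // Parsed form: goes through TypeImpl::parseType / parseCategory.
  std::unique_ptr<orc::Type> parsed = orc::Type::buildTypeFromString(
      "struct<a:int,b:decimal(10,2),c:array<string>>");

  // Equivalent schema assembled with the factory functions defined here.
  std::unique_ptr<orc::Type> built = orc::createStructType();
  built->addStructField("a", orc::createPrimitiveType(orc::INT));
  built->addStructField("b", orc::createDecimalType(10, 2));
  built->addStructField("c",
      orc::createListType(orc::createPrimitiveType(orc::STRING)));

  // Both print the same Hive-style string via Type::toString().
  std::cout << parsed->toString() << "\n" << built->toString() << std::endl;
  return 0;
}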
std::unique_ptr<Type> TypeImpl::parseStructType(const std::string &input, - size_t start, - size_t end) { - TypeImpl* structType = new TypeImpl(STRUCT); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(structType); - std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type>> > v = - TypeImpl::parseType(input, start, end); - if (v.size() == 0) { - throw std::logic_error( - "Struct type must contain at least one sub type."); - } - for (size_t i = 0; i < v.size(); ++i) { - structType->addStructField(v[i].first, std::move(v[i].second)); - } - return return_value; - } - - std::unique_ptr<Type> TypeImpl::parseUnionType(const std::string &input, - size_t start, - size_t end) { - TypeImpl* unionType = new TypeImpl(UNION); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(unionType); - std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v = - TypeImpl::parseType(input, start, end); - if (v.size() == 0) { - throw std::logic_error("Union type must contain at least one sub type."); - } - for (size_t i = 0; i < v.size(); ++i) { - unionType->addChildType(std::move(v[i].second)); - } - return return_value; - } - - std::unique_ptr<Type> TypeImpl::parseDecimalType(const std::string &input, - size_t start, - size_t end) { - size_t sep = input.find(',', start); - if (sep + 1 >= end || sep == std::string::npos) { - throw std::logic_error("Decimal type must specify precision and scale."); - } - uint64_t precision = - static_cast<uint64_t>(atoi(input.substr(start, sep - start).c_str())); - uint64_t scale = - static_cast<uint64_t>(atoi(input.substr(sep + 1, end - sep - 1).c_str())); - return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale)); - } - - std::unique_ptr<Type> TypeImpl::parseCategory(std::string category, - const std::string &input, - size_t start, - size_t end) { - if (category == "boolean") { - return std::unique_ptr<Type>(new TypeImpl(BOOLEAN)); - } else if (category == "tinyint") { - return std::unique_ptr<Type>(new TypeImpl(BYTE)); - } else if (category == "smallint") { - return std::unique_ptr<Type>(new TypeImpl(SHORT)); - } else if (category == "int") { - return std::unique_ptr<Type>(new TypeImpl(INT)); - } else if (category == "bigint") { - return std::unique_ptr<Type>(new TypeImpl(LONG)); - } else if (category == "float") { - return std::unique_ptr<Type>(new TypeImpl(FLOAT)); - } else if (category == "double") { - return std::unique_ptr<Type>(new TypeImpl(DOUBLE)); - } else if (category == "string") { - return std::unique_ptr<Type>(new TypeImpl(STRING)); - } else if (category == "binary") { - return std::unique_ptr<Type>(new TypeImpl(BINARY)); - } else if (category == "timestamp") { - return std::unique_ptr<Type>(new TypeImpl(TIMESTAMP)); - } else if (category == "array") { - return parseArrayType(input, start, end); - } else if (category == "map") { - return parseMapType(input, start, end); - } else if (category == "struct") { - return parseStructType(input, start, end); - } else if (category == "uniontype") { - return parseUnionType(input, start, end); - } else if (category == "decimal") { - return parseDecimalType(input, start, end); - } else if (category == "date") { - return std::unique_ptr<Type>(new TypeImpl(DATE)); - } else if (category == "varchar") { - uint64_t maxLength = static_cast<uint64_t>( - atoi(input.substr(start, end - start).c_str())); - return std::unique_ptr<Type>(new TypeImpl(VARCHAR, maxLength)); - } else if (category == "char") { - uint64_t maxLength = static_cast<uint64_t>( - atoi(input.substr(start, end - start).c_str())); - 
return std::unique_ptr<Type>(new TypeImpl(CHAR, maxLength)); - } else { - throw std::logic_error("Unknown type " + category); - } - } - - std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > TypeImpl::parseType( - const std::string &input, - size_t start, - size_t end) { - std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > res; - size_t pos = start; - - while (pos < end) { - size_t endPos = pos; - while (endPos < end && (isalnum(input[endPos]) || input[endPos] == '_')) { - ++endPos; - } - - std::string fieldName; - if (input[endPos] == ':') { - fieldName = input.substr(pos, endPos - pos); - pos = ++endPos; - while (endPos < end && isalpha(input[endPos])) { - ++endPos; - } - } - - size_t nextPos = endPos + 1; - if (input[endPos] == '<') { - int count = 1; - while (nextPos < end) { - if (input[nextPos] == '<') { - ++count; - } else if (input[nextPos] == '>') { - --count; - } - if (count == 0) { - break; - } - ++nextPos; - } - if (nextPos == end) { - throw std::logic_error("Invalid type string. Cannot find closing >"); - } - } else if (input[endPos] == '(') { - while (nextPos < end && input[nextPos] != ')') { - ++nextPos; - } - if (nextPos == end) { - throw std::logic_error("Invalid type string. Cannot find closing )"); - } - } else if (input[endPos] != ',' && endPos != end) { - throw std::logic_error("Unrecognized character."); - } - - std::string category = input.substr(pos, endPos - pos); - res.push_back(std::make_pair(fieldName, parseCategory(category, input, endPos + 1, nextPos))); - - if (nextPos < end && (input[nextPos] == ')' || input[nextPos] == '>')) { - pos = nextPos + 2; - } else { - pos = nextPos; - } - } - - return res; - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Adaptor.hh" +#include "orc/Exceptions.hh" +#include "TypeImpl.hh" + +#include <iostream> +#include <sstream> + +namespace orc { + + Type::~Type() { + // PASS + } + + TypeImpl::TypeImpl(TypeKind _kind) { + parent = nullptr; + columnId = -1; + maximumColumnId = -1; + kind = _kind; + maxLength = 0; + precision = 0; + scale = 0; + subtypeCount = 0; + } + + TypeImpl::TypeImpl(TypeKind _kind, uint64_t _maxLength) { + parent = nullptr; + columnId = -1; + maximumColumnId = -1; + kind = _kind; + maxLength = _maxLength; + precision = 0; + scale = 0; + subtypeCount = 0; + } + + TypeImpl::TypeImpl(TypeKind _kind, uint64_t _precision, + uint64_t _scale) { + parent = nullptr; + columnId = -1; + maximumColumnId = -1; + kind = _kind; + maxLength = 0; + precision = _precision; + scale = _scale; + subtypeCount = 0; + } + + uint64_t TypeImpl::assignIds(uint64_t root) const { + columnId = static_cast<int64_t>(root); + uint64_t current = root + 1; + for(uint64_t i=0; i < subtypeCount; ++i) { + current = dynamic_cast<TypeImpl*>(subTypes[i])->assignIds(current); + } + maximumColumnId = static_cast<int64_t>(current) - 1; + return current; + } + + TypeImpl::~TypeImpl() { + for (std::vector<Type*>::iterator it = subTypes.begin(); + it != subTypes.end(); it++) { + delete (*it) ; + } + } + + void TypeImpl::ensureIdAssigned() const { + if (columnId == -1) { + const TypeImpl* root = this; + while (root->parent != nullptr) { + root = root->parent; + } + root->assignIds(0); + } + } + + uint64_t TypeImpl::getColumnId() const { + ensureIdAssigned(); + return static_cast<uint64_t>(columnId); + } + + uint64_t TypeImpl::getMaximumColumnId() const { + ensureIdAssigned(); + return static_cast<uint64_t>(maximumColumnId); + } + + TypeKind TypeImpl::getKind() const { + return kind; + } + + uint64_t TypeImpl::getSubtypeCount() const { + return subtypeCount; + } + + const Type* TypeImpl::getSubtype(uint64_t i) const { + return subTypes[i]; + } + + const std::string& TypeImpl::getFieldName(uint64_t i) const { + return fieldNames[i]; + } + + uint64_t TypeImpl::getMaximumLength() const { + return maxLength; + } + + uint64_t TypeImpl::getPrecision() const { + return precision; + } + + uint64_t TypeImpl::getScale() const { + return scale; + } + + void TypeImpl::setIds(uint64_t _columnId, uint64_t _maxColumnId) { + columnId = static_cast<int64_t>(_columnId); + maximumColumnId = static_cast<int64_t>(_maxColumnId); + } + + void TypeImpl::addChildType(std::unique_ptr<Type> childType) { + TypeImpl* child = dynamic_cast<TypeImpl*>(childType.release()); + subTypes.push_back(child); + if (child != nullptr) { + child->parent = this; + } + subtypeCount += 1; + } + + Type* TypeImpl::addStructField(const std::string& fieldName, + std::unique_ptr<Type> fieldType) { + addChildType(std::move(fieldType)); + fieldNames.push_back(fieldName); + return this; + } + + Type* TypeImpl::addUnionChild(std::unique_ptr<Type> fieldType) { + addChildType(std::move(fieldType)); + return this; + } + + std::string TypeImpl::toString() const { + switch (static_cast<int64_t>(kind)) { + case BOOLEAN: + return "boolean"; + case BYTE: + return "tinyint"; + case SHORT: + return "smallint"; + case INT: + return "int"; + case LONG: + return "bigint"; + case FLOAT: + return "float"; + case DOUBLE: + return "double"; + case STRING: + return "string"; + case BINARY: + return "binary"; + case TIMESTAMP: + return "timestamp"; + case LIST: + return "array<" + (subTypes[0] ? subTypes[0]->toString() : "void") + ">"; + case MAP: + return "map<" + (subTypes[0] ? 
subTypes[0]->toString() : "void") + "," + + (subTypes[1] ? subTypes[1]->toString() : "void") + ">"; + case STRUCT: { + std::string result = "struct<"; + for(size_t i=0; i < subTypes.size(); ++i) { + if (i != 0) { + result += ","; + } + result += fieldNames[i]; + result += ":"; + result += subTypes[i]->toString(); + } + result += ">"; + return result; + } + case UNION: { + std::string result = "uniontype<"; + for(size_t i=0; i < subTypes.size(); ++i) { + if (i != 0) { + result += ","; + } + result += subTypes[i]->toString(); + } + result += ">"; + return result; + } + case DECIMAL: { + std::stringstream result; + result << "decimal(" << precision << "," << scale << ")"; + return result.str(); + } + case DATE: + return "date"; + case VARCHAR: { + std::stringstream result; + result << "varchar(" << maxLength << ")"; + return result.str(); + } + case CHAR: { + std::stringstream result; + result << "char(" << maxLength << ")"; + return result.str(); + } + default: + throw NotImplementedYet("Unknown type"); + } + } + + std::unique_ptr<ColumnVectorBatch> + TypeImpl::createRowBatch(uint64_t capacity, + MemoryPool& memoryPool, + bool encoded) const { + switch (static_cast<int64_t>(kind)) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + case DATE: + return std::unique_ptr<ColumnVectorBatch> + (new LongVectorBatch(capacity, memoryPool)); + + case FLOAT: + case DOUBLE: + return std::unique_ptr<ColumnVectorBatch> + (new DoubleVectorBatch(capacity, memoryPool)); + + case STRING: + case BINARY: + case CHAR: + case VARCHAR: + return encoded ? + std::unique_ptr<ColumnVectorBatch> + (new EncodedStringVectorBatch(capacity, memoryPool)) + : std::unique_ptr<ColumnVectorBatch> + (new StringVectorBatch(capacity, memoryPool)); + + case TIMESTAMP: + return std::unique_ptr<ColumnVectorBatch> + (new TimestampVectorBatch(capacity, memoryPool)); + + case STRUCT: { + StructVectorBatch *result = new StructVectorBatch(capacity, memoryPool); + std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); + for(uint64_t i=0; i < getSubtypeCount(); ++i) { + result->fields.push_back(getSubtype(i)-> + createRowBatch(capacity, + memoryPool, encoded).release()); + } + return return_value; + } + + case LIST: { + ListVectorBatch* result = new ListVectorBatch(capacity, memoryPool); + std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); + if (getSubtype(0) != nullptr) { + result->elements = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded); + } + return return_value; + } + + case MAP: { + MapVectorBatch* result = new MapVectorBatch(capacity, memoryPool); + std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); + if (getSubtype(0) != nullptr) { + result->keys = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded); + } + if (getSubtype(1) != nullptr) { + result->elements = getSubtype(1)->createRowBatch(capacity, memoryPool, encoded); + } + return return_value; + } + + case DECIMAL: { + if (getPrecision() == 0 || getPrecision() > 18) { + return std::unique_ptr<ColumnVectorBatch> + (new Decimal128VectorBatch(capacity, memoryPool)); + } else { + return std::unique_ptr<ColumnVectorBatch> + (new Decimal64VectorBatch(capacity, memoryPool)); + } + } + + case UNION: { + UnionVectorBatch *result = new UnionVectorBatch(capacity, memoryPool); + std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); + for(uint64_t i=0; i < getSubtypeCount(); ++i) { + 
result->children.push_back(getSubtype(i)->createRowBatch(capacity, + memoryPool, encoded) + .release()); + } + return return_value; + } + + default: + throw NotImplementedYet("not supported yet"); + } + } + + std::unique_ptr<Type> createPrimitiveType(TypeKind kind) { + return std::unique_ptr<Type>(new TypeImpl(kind)); + } + + std::unique_ptr<Type> createCharType(TypeKind kind, + uint64_t maxLength) { + return std::unique_ptr<Type>(new TypeImpl(kind, maxLength)); + } + + std::unique_ptr<Type> createDecimalType(uint64_t precision, + uint64_t scale) { + return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale)); + } + + std::unique_ptr<Type> createStructType() { + return std::unique_ptr<Type>(new TypeImpl(STRUCT)); + } + + std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements) { + TypeImpl* result = new TypeImpl(LIST); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); + result->addChildType(std::move(elements)); + return return_value; + } + + std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key, + std::unique_ptr<Type> value) { + TypeImpl* result = new TypeImpl(MAP); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); + result->addChildType(std::move(key)); + result->addChildType(std::move(value)); + return return_value; + } + + std::unique_ptr<Type> createUnionType() { + return std::unique_ptr<Type>(new TypeImpl(UNION)); + } + + std::string printProtobufMessage(const google::protobuf::Message& message); + std::unique_ptr<Type> convertType(const proto::Type& type, + const proto::Footer& footer) { + switch (static_cast<int64_t>(type.kind())) { + + case proto::Type_Kind_BOOLEAN: + case proto::Type_Kind_BYTE: + case proto::Type_Kind_SHORT: + case proto::Type_Kind_INT: + case proto::Type_Kind_LONG: + case proto::Type_Kind_FLOAT: + case proto::Type_Kind_DOUBLE: + case proto::Type_Kind_STRING: + case proto::Type_Kind_BINARY: + case proto::Type_Kind_TIMESTAMP: + case proto::Type_Kind_DATE: + return std::unique_ptr<Type> + (new TypeImpl(static_cast<TypeKind>(type.kind()))); + + case proto::Type_Kind_CHAR: + case proto::Type_Kind_VARCHAR: + return std::unique_ptr<Type> + (new TypeImpl(static_cast<TypeKind>(type.kind()), + type.maximumlength())); + + case proto::Type_Kind_DECIMAL: + return std::unique_ptr<Type> + (new TypeImpl(DECIMAL, type.precision(), type.scale())); + + case proto::Type_Kind_LIST: + case proto::Type_Kind_MAP: + case proto::Type_Kind_UNION: { + TypeImpl* result = new TypeImpl(static_cast<TypeKind>(type.kind())); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); + if (type.kind() == proto::Type_Kind_LIST && type.subtypes_size() != 1) + throw ParseError("Illegal LIST type that doesn't contain one subtype"); + if (type.kind() == proto::Type_Kind_MAP && type.subtypes_size() != 2) + throw ParseError("Illegal MAP type that doesn't contain two subtypes"); + if (type.kind() == proto::Type_Kind_UNION && type.subtypes_size() == 0) + throw ParseError("Illegal UNION type that doesn't contain any subtypes"); + for(int i=0; i < type.subtypes_size(); ++i) { + result->addUnionChild(convertType(footer.types(static_cast<int> + (type.subtypes(i))), + footer)); + } + return return_value; + } + + case proto::Type_Kind_STRUCT: { + TypeImpl* result = new TypeImpl(STRUCT); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); + for(int i=0; i < type.subtypes_size(); ++i) { + result->addStructField(type.fieldnames(i), + convertType(footer.types(static_cast<int> + (type.subtypes(i))), + footer)); + } + 
return return_value; + } + default: + throw NotImplementedYet("Unknown type kind"); + } + } + + /** + * Build a clone of the file type, projecting columns from the selected + * vector. This routine assumes that the parent of any selected column + * is also selected. The column ids are copied from the fileType. + * @param fileType the type in the file + * @param selected is each column by id selected + * @return a clone of the fileType filtered by the selection array + */ + std::unique_ptr<Type> buildSelectedType(const Type *fileType, + const std::vector<bool>& selected) { + if (fileType == nullptr || !selected[fileType->getColumnId()]) { + return std::unique_ptr<Type>(); + } + + TypeImpl* result; + switch (static_cast<int>(fileType->getKind())) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + case FLOAT: + case DOUBLE: + case STRING: + case BINARY: + case TIMESTAMP: + case DATE: + result = new TypeImpl(fileType->getKind()); + break; + + case DECIMAL: + result= new TypeImpl(fileType->getKind(), + fileType->getPrecision(), fileType->getScale()); + break; + + case VARCHAR: + case CHAR: + result = new TypeImpl(fileType->getKind(), fileType->getMaximumLength()); + break; + + case LIST: + result = new TypeImpl(fileType->getKind()); + result->addChildType(buildSelectedType(fileType->getSubtype(0), + selected)); + break; + + case MAP: + result = new TypeImpl(fileType->getKind()); + result->addChildType(buildSelectedType(fileType->getSubtype(0), + selected)); + result->addChildType(buildSelectedType(fileType->getSubtype(1), + selected)); + break; + + case STRUCT: { + result = new TypeImpl(fileType->getKind()); + for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) { + std::unique_ptr<Type> childType = + buildSelectedType(fileType->getSubtype(child), selected); + if (childType.get() != nullptr) { + result->addStructField(fileType->getFieldName(child), + std::move(childType)); + } + } + break; + } + + case UNION: { + result = new TypeImpl(fileType->getKind()); + for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) { + std::unique_ptr<Type> childType = + buildSelectedType(fileType->getSubtype(child), selected); + if (childType.get() != nullptr) { + result->addUnionChild(std::move(childType)); + } + } + break; + } + + default: + throw NotImplementedYet("Unknown type kind"); + } + result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId()); + return std::unique_ptr<Type>(result); + } + + ORC_UNIQUE_PTR<Type> Type::buildTypeFromString(const std::string& input) { + std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > res = + TypeImpl::parseType(input, 0, input.size()); + if (res.size() != 1) { + throw std::logic_error("Invalid type string."); + } + return std::move(res[0].second); + } + + std::unique_ptr<Type> TypeImpl::parseArrayType(const std::string &input, + size_t start, + size_t end) { + TypeImpl* arrayType = new TypeImpl(LIST); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(arrayType); + std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v = + TypeImpl::parseType(input, start, end); + if (v.size() != 1) { + throw std::logic_error("Array type must contain exactly one sub type."); + } + arrayType->addChildType(std::move(v[0].second)); + return return_value; + } + + std::unique_ptr<Type> TypeImpl::parseMapType(const std::string &input, + size_t start, + size_t end) { + TypeImpl * mapType = new TypeImpl(MAP); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(mapType); + 
std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v = + TypeImpl::parseType(input, start, end); + if (v.size() != 2) { + throw std::logic_error( + "Map type must contain exactly two sub types."); + } + mapType->addChildType(std::move(v[0].second)); + mapType->addChildType(std::move(v[1].second)); + return return_value; + } + + std::unique_ptr<Type> TypeImpl::parseStructType(const std::string &input, + size_t start, + size_t end) { + TypeImpl* structType = new TypeImpl(STRUCT); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(structType); + std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type>> > v = + TypeImpl::parseType(input, start, end); + if (v.size() == 0) { + throw std::logic_error( + "Struct type must contain at least one sub type."); + } + for (size_t i = 0; i < v.size(); ++i) { + structType->addStructField(v[i].first, std::move(v[i].second)); + } + return return_value; + } + + std::unique_ptr<Type> TypeImpl::parseUnionType(const std::string &input, + size_t start, + size_t end) { + TypeImpl* unionType = new TypeImpl(UNION); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(unionType); + std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v = + TypeImpl::parseType(input, start, end); + if (v.size() == 0) { + throw std::logic_error("Union type must contain at least one sub type."); + } + for (size_t i = 0; i < v.size(); ++i) { + unionType->addChildType(std::move(v[i].second)); + } + return return_value; + } + + std::unique_ptr<Type> TypeImpl::parseDecimalType(const std::string &input, + size_t start, + size_t end) { + size_t sep = input.find(',', start); + if (sep + 1 >= end || sep == std::string::npos) { + throw std::logic_error("Decimal type must specify precision and scale."); + } + uint64_t precision = + static_cast<uint64_t>(atoi(input.substr(start, sep - start).c_str())); + uint64_t scale = + static_cast<uint64_t>(atoi(input.substr(sep + 1, end - sep - 1).c_str())); + return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale)); + } + + std::unique_ptr<Type> TypeImpl::parseCategory(std::string category, + const std::string &input, + size_t start, + size_t end) { + if (category == "boolean") { + return std::unique_ptr<Type>(new TypeImpl(BOOLEAN)); + } else if (category == "tinyint") { + return std::unique_ptr<Type>(new TypeImpl(BYTE)); + } else if (category == "smallint") { + return std::unique_ptr<Type>(new TypeImpl(SHORT)); + } else if (category == "int") { + return std::unique_ptr<Type>(new TypeImpl(INT)); + } else if (category == "bigint") { + return std::unique_ptr<Type>(new TypeImpl(LONG)); + } else if (category == "float") { + return std::unique_ptr<Type>(new TypeImpl(FLOAT)); + } else if (category == "double") { + return std::unique_ptr<Type>(new TypeImpl(DOUBLE)); + } else if (category == "string") { + return std::unique_ptr<Type>(new TypeImpl(STRING)); + } else if (category == "binary") { + return std::unique_ptr<Type>(new TypeImpl(BINARY)); + } else if (category == "timestamp") { + return std::unique_ptr<Type>(new TypeImpl(TIMESTAMP)); + } else if (category == "array") { + return parseArrayType(input, start, end); + } else if (category == "map") { + return parseMapType(input, start, end); + } else if (category == "struct") { + return parseStructType(input, start, end); + } else if (category == "uniontype") { + return parseUnionType(input, start, end); + } else if (category == "decimal") { + return parseDecimalType(input, start, end); + } else if (category == "date") { + return std::unique_ptr<Type>(new 
TypeImpl(DATE)); + } else if (category == "varchar") { + uint64_t maxLength = static_cast<uint64_t>( + atoi(input.substr(start, end - start).c_str())); + return std::unique_ptr<Type>(new TypeImpl(VARCHAR, maxLength)); + } else if (category == "char") { + uint64_t maxLength = static_cast<uint64_t>( + atoi(input.substr(start, end - start).c_str())); + return std::unique_ptr<Type>(new TypeImpl(CHAR, maxLength)); + } else { + throw std::logic_error("Unknown type " + category); + } + } + + std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > TypeImpl::parseType( + const std::string &input, + size_t start, + size_t end) { + std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > res; + size_t pos = start; + + while (pos < end) { + size_t endPos = pos; + while (endPos < end && (isalnum(input[endPos]) || input[endPos] == '_')) { + ++endPos; + } + + std::string fieldName; + if (input[endPos] == ':') { + fieldName = input.substr(pos, endPos - pos); + pos = ++endPos; + while (endPos < end && isalpha(input[endPos])) { + ++endPos; + } + } + + size_t nextPos = endPos + 1; + if (input[endPos] == '<') { + int count = 1; + while (nextPos < end) { + if (input[nextPos] == '<') { + ++count; + } else if (input[nextPos] == '>') { + --count; + } + if (count == 0) { + break; + } + ++nextPos; + } + if (nextPos == end) { + throw std::logic_error("Invalid type string. Cannot find closing >"); + } + } else if (input[endPos] == '(') { + while (nextPos < end && input[nextPos] != ')') { + ++nextPos; + } + if (nextPos == end) { + throw std::logic_error("Invalid type string. Cannot find closing )"); + } + } else if (input[endPos] != ',' && endPos != end) { + throw std::logic_error("Unrecognized character."); + } + + std::string category = input.substr(pos, endPos - pos); + res.push_back(std::make_pair(fieldName, parseCategory(category, input, endPos + 1, nextPos))); + + if (nextPos < end && (input[nextPos] == ')' || input[nextPos] == '>')) { + pos = nextPos + 2; + } else { + pos = nextPos; + } + } + + return res; + } + +} diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.hh b/contrib/libs/apache/orc/c++/src/TypeImpl.hh index 054ceab5dc..cee52006b7 100644 --- a/contrib/libs/apache/orc/c++/src/TypeImpl.hh +++ b/contrib/libs/apache/orc/c++/src/TypeImpl.hh @@ -1,198 +1,198 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef TYPE_IMPL_HH -#define TYPE_IMPL_HH - -#include "orc/Type.hh" - -#include "Adaptor.hh" -#include "wrap/orc-proto-wrapper.hh" - -#include <vector> - -namespace orc { - - class TypeImpl: public Type { - private: - TypeImpl* parent; - mutable int64_t columnId; - mutable int64_t maximumColumnId; - TypeKind kind; - std::vector<Type*> subTypes; - std::vector<std::string> fieldNames; - uint64_t subtypeCount; - uint64_t maxLength; - uint64_t precision; - uint64_t scale; - - public: - /** - * Create most of the primitive types. - */ - TypeImpl(TypeKind kind); - - /** - * Create char and varchar type. - */ - TypeImpl(TypeKind kind, uint64_t maxLength); - - /** - * Create decimal type. - */ - TypeImpl(TypeKind kind, uint64_t precision, - uint64_t scale); - - virtual ~TypeImpl() override; - - uint64_t getColumnId() const override; - - uint64_t getMaximumColumnId() const override; - - TypeKind getKind() const override; - - uint64_t getSubtypeCount() const override; - - const Type* getSubtype(uint64_t i) const override; - - const std::string& getFieldName(uint64_t i) const override; - - uint64_t getMaximumLength() const override; - - uint64_t getPrecision() const override; - - uint64_t getScale() const override; - - std::string toString() const override; - - Type* addStructField(const std::string& fieldName, - std::unique_ptr<Type> fieldType) override; - Type* addUnionChild(std::unique_ptr<Type> fieldType) override; - - std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size, - MemoryPool& memoryPool, - bool encoded = false - ) const override; - - /** - * Explicitly set the column ids. Only for internal usage. - */ - void setIds(uint64_t columnId, uint64_t maxColumnId); - - /** - * Add a child type. - */ - void addChildType(std::unique_ptr<Type> childType); - - static std::vector<std::pair<std::string, std::unique_ptr<Type> > > parseType( - const std::string &input, - size_t start, - size_t end); - - private: - /** - * Assign ids to this node and its children giving this - * node rootId. - * @param rootId the column id that should be assigned to this node. - */ - uint64_t assignIds(uint64_t rootId) const; - - /** - * Ensure that ids are assigned to all of the nodes. 
- */ - void ensureIdAssigned() const; - - /** - * Parse array type from string - * @param input the input string of an array type - * @param start start position of the input string - * @param end end position of the input string - */ - static std::unique_ptr<Type> parseArrayType(const std::string &input, - size_t start, - size_t end); - - /** - * Parse map type from string - * @param input the input string of a map type - * @param start start position of the input string - * @param end end position of the input string - */ - static std::unique_ptr<Type> parseMapType(const std::string &input, - size_t start, - size_t end); - - /** - * Parse struct type from string - * @param input the input string of a struct type - * @param start start position of the input string - * @param end end position of the input string - */ - static std::unique_ptr<Type> parseStructType(const std::string &input, - size_t start, - size_t end); - - /** - * Parse union type from string - * @param input the input string of an union type - * @param start start position of the input string - * @param end end position of the input string - */ - static std::unique_ptr<Type> parseUnionType(const std::string &input, - size_t start, - size_t end); - - /** - * Parse decimal type from string - * @param input the input string of a decimal type - * @param start start position of the input string - * @param end end position of the input string - */ - static std::unique_ptr<Type> parseDecimalType(const std::string &input, - size_t start, - size_t end); - - /** - * Parse type for a category - * @param category type name - * @param input the input string of the category - * @param start start position of the input string - * @param end end position of the input string - */ - static std::unique_ptr<Type> parseCategory(std::string category, - const std::string &input, - size_t start, - size_t end); - }; - - std::unique_ptr<Type> convertType(const proto::Type& type, - const proto::Footer& footer); - - /** - * Build a clone of the file type, projecting columns from the selected - * vector. This routine assumes that the parent of any selected column - * is also selected. - * @param fileType the type in the file - * @param selected is each column by id selected - * @return a clone of the fileType filtered by the selection array - */ - std::unique_ptr<Type> buildSelectedType(const Type *fileType, - const std::vector<bool>& selected); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef TYPE_IMPL_HH +#define TYPE_IMPL_HH + +#include "orc/Type.hh" + +#include "Adaptor.hh" +#include "wrap/orc-proto-wrapper.hh" + +#include <vector> + +namespace orc { + + class TypeImpl: public Type { + private: + TypeImpl* parent; + mutable int64_t columnId; + mutable int64_t maximumColumnId; + TypeKind kind; + std::vector<Type*> subTypes; + std::vector<std::string> fieldNames; + uint64_t subtypeCount; + uint64_t maxLength; + uint64_t precision; + uint64_t scale; + + public: + /** + * Create most of the primitive types. + */ + TypeImpl(TypeKind kind); + + /** + * Create char and varchar type. + */ + TypeImpl(TypeKind kind, uint64_t maxLength); + + /** + * Create decimal type. + */ + TypeImpl(TypeKind kind, uint64_t precision, + uint64_t scale); + + virtual ~TypeImpl() override; + + uint64_t getColumnId() const override; + + uint64_t getMaximumColumnId() const override; + + TypeKind getKind() const override; + + uint64_t getSubtypeCount() const override; + + const Type* getSubtype(uint64_t i) const override; + + const std::string& getFieldName(uint64_t i) const override; + + uint64_t getMaximumLength() const override; + + uint64_t getPrecision() const override; + + uint64_t getScale() const override; + + std::string toString() const override; + + Type* addStructField(const std::string& fieldName, + std::unique_ptr<Type> fieldType) override; + Type* addUnionChild(std::unique_ptr<Type> fieldType) override; + + std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size, + MemoryPool& memoryPool, + bool encoded = false + ) const override; + + /** + * Explicitly set the column ids. Only for internal usage. + */ + void setIds(uint64_t columnId, uint64_t maxColumnId); + + /** + * Add a child type. + */ + void addChildType(std::unique_ptr<Type> childType); + + static std::vector<std::pair<std::string, std::unique_ptr<Type> > > parseType( + const std::string &input, + size_t start, + size_t end); + + private: + /** + * Assign ids to this node and its children giving this + * node rootId. + * @param rootId the column id that should be assigned to this node. + */ + uint64_t assignIds(uint64_t rootId) const; + + /** + * Ensure that ids are assigned to all of the nodes. 
+ */ + void ensureIdAssigned() const; + + /** + * Parse array type from string + * @param input the input string of an array type + * @param start start position of the input string + * @param end end position of the input string + */ + static std::unique_ptr<Type> parseArrayType(const std::string &input, + size_t start, + size_t end); + + /** + * Parse map type from string + * @param input the input string of a map type + * @param start start position of the input string + * @param end end position of the input string + */ + static std::unique_ptr<Type> parseMapType(const std::string &input, + size_t start, + size_t end); + + /** + * Parse struct type from string + * @param input the input string of a struct type + * @param start start position of the input string + * @param end end position of the input string + */ + static std::unique_ptr<Type> parseStructType(const std::string &input, + size_t start, + size_t end); + + /** + * Parse union type from string + * @param input the input string of an union type + * @param start start position of the input string + * @param end end position of the input string + */ + static std::unique_ptr<Type> parseUnionType(const std::string &input, + size_t start, + size_t end); + + /** + * Parse decimal type from string + * @param input the input string of a decimal type + * @param start start position of the input string + * @param end end position of the input string + */ + static std::unique_ptr<Type> parseDecimalType(const std::string &input, + size_t start, + size_t end); + + /** + * Parse type for a category + * @param category type name + * @param input the input string of the category + * @param start start position of the input string + * @param end end position of the input string + */ + static std::unique_ptr<Type> parseCategory(std::string category, + const std::string &input, + size_t start, + size_t end); + }; + + std::unique_ptr<Type> convertType(const proto::Type& type, + const proto::Footer& footer); + + /** + * Build a clone of the file type, projecting columns from the selected + * vector. This routine assumes that the parent of any selected column + * is also selected. + * @param fileType the type in the file + * @param selected is each column by id selected + * @return a clone of the fileType filtered by the selection array + */ + std::unique_ptr<Type> buildSelectedType(const Type *fileType, + const std::vector<bool>& selected); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/Vector.cc b/contrib/libs/apache/orc/c++/src/Vector.cc index 14c0ded030..6ba2f8ae7d 100644 --- a/contrib/libs/apache/orc/c++/src/Vector.cc +++ b/contrib/libs/apache/orc/c++/src/Vector.cc @@ -1,518 +1,518 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "orc/Vector.hh" - -#include "Adaptor.hh" -#include "orc/Exceptions.hh" - -#include <iostream> -#include <sstream> -#include <cstdlib> - -namespace orc { - - ColumnVectorBatch::ColumnVectorBatch(uint64_t cap, - MemoryPool& pool - ): capacity(cap), - numElements(0), - notNull(pool, cap), - hasNulls(false), - isEncoded(false), - memoryPool(pool) { - std::memset(notNull.data(), 1, capacity); - } - - ColumnVectorBatch::~ColumnVectorBatch() { - // PASS - } - - void ColumnVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - capacity = cap; - notNull.resize(cap); - } - } - - void ColumnVectorBatch::clear() { - numElements = 0; - } - - uint64_t ColumnVectorBatch::getMemoryUsage() { - return static_cast<uint64_t>(notNull.capacity() * sizeof(char)); - } - - bool ColumnVectorBatch::hasVariableLength() { - return false; - } - - LongVectorBatch::LongVectorBatch(uint64_t _capacity, MemoryPool& pool - ): ColumnVectorBatch(_capacity, pool), - data(pool, _capacity) { - // PASS - } - - LongVectorBatch::~LongVectorBatch() { - // PASS - } - - std::string LongVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Long vector <" << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void LongVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - data.resize(cap); - } - } - - void LongVectorBatch::clear() { - numElements = 0; - } - - uint64_t LongVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() + - static_cast<uint64_t>(data.capacity() * sizeof(int64_t)); - } - - DoubleVectorBatch::DoubleVectorBatch(uint64_t _capacity, MemoryPool& pool - ): ColumnVectorBatch(_capacity, pool), - data(pool, _capacity) { - // PASS - } - - DoubleVectorBatch::~DoubleVectorBatch() { - // PASS - } - - std::string DoubleVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Double vector <" << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void DoubleVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - data.resize(cap); - } - } - - void DoubleVectorBatch::clear() { - numElements = 0; - } - - uint64_t DoubleVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(data.capacity() * sizeof(double)); - } - - StringDictionary::StringDictionary(MemoryPool& pool) - : dictionaryBlob(pool), - dictionaryOffset(pool) { - // PASS - } - - EncodedStringVectorBatch::EncodedStringVectorBatch(uint64_t _capacity, - MemoryPool& pool) - : StringVectorBatch(_capacity, pool), - dictionary(), - index(pool, _capacity) { - // PASS - } - - EncodedStringVectorBatch::~EncodedStringVectorBatch() { - // PASS - } - - std::string EncodedStringVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Encoded string vector <" << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - StringVectorBatch::StringVectorBatch(uint64_t _capacity, MemoryPool& pool - ): ColumnVectorBatch(_capacity, pool), - data(pool, _capacity), - length(pool, _capacity), - blob(pool) { - // PASS - } - - StringVectorBatch::~StringVectorBatch() { - // PASS - } - - std::string StringVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Byte vector <" << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void StringVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - data.resize(cap); - length.resize(cap); - } - } - - 
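Each batch class in this file owns DataBuffer-backed arrays sized by capacity. The following hedged sketch shows how a caller typically obtains and fills one of these batches; it assumes orc::getDefaultMemoryPool() from orc/MemoryPool.hh, which is not shown in this diff:

#include <memory>
#include "orc/MemoryPool.hh"
#include "orc/Type.hh"
#include "orc/Vector.hh"

void fillLongColumn() {
  std::unique_ptr<orc::Type> schema =
      orc::Type::buildTypeFromString("struct<x:bigint>");
  std::unique_ptr<orc::ColumnVectorBatch> batch =
      schema->createRowBatch(1024, *orc::getDefaultMemoryPool());  // assumed pool accessor

  // For STRUCT schemas createRowBatch() returns a StructVectorBatch whose
  // fields vector holds one child batch per column, in declaration order.
  auto* root = dynamic_cast<orc::StructVectorBatch*>(batch.get());
  auto* x = dynamic_cast<orc::LongVectorBatch*>(root->fields[0]);
  for (uint64_t i = 0; i < 10; ++i) {
    x->data[i] = static_cast<int64_t>(i);  // DataBuffer<int64_t> element access
  }
  x->numElements = 10;
  root->numElements = 10;
}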
void StringVectorBatch::clear() { - numElements = 0; - } - - uint64_t StringVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(data.capacity() * sizeof(char*) - + length.capacity() * sizeof(int64_t)); - } - - StructVectorBatch::StructVectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool) { - // PASS - } - - StructVectorBatch::~StructVectorBatch() { - for (uint64_t i=0; i<this->fields.size(); i++) { - delete this->fields[i]; - } - } - - std::string StructVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Struct vector <" << numElements << " of " << capacity - << "; "; - for(std::vector<ColumnVectorBatch*>::const_iterator ptr=fields.begin(); - ptr != fields.end(); ++ptr) { - buffer << (*ptr)->toString() << "; "; - } - buffer << ">"; - return buffer.str(); - } - - void StructVectorBatch::resize(uint64_t cap) { - ColumnVectorBatch::resize(cap); - } - - void StructVectorBatch::clear() { - for(size_t i=0; i < fields.size(); i++) { - fields[i]->clear(); - } - numElements = 0; - } - - uint64_t StructVectorBatch::getMemoryUsage() { - uint64_t memory = ColumnVectorBatch::getMemoryUsage(); - for (unsigned int i=0; i < fields.size(); i++) { - memory += fields[i]->getMemoryUsage(); - } - return memory; - } - - bool StructVectorBatch::hasVariableLength() { - for (unsigned int i=0; i < fields.size(); i++) { - if (fields[i]->hasVariableLength()) { - return true; - } - } - return false; - } - - ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool), - offsets(pool, cap+1) { - // PASS - } - - ListVectorBatch::~ListVectorBatch() { - // PASS - } - - std::string ListVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "List vector <" << elements->toString() << " with " - << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void ListVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - offsets.resize(cap + 1); - } - } - - void ListVectorBatch::clear() { - numElements = 0; - elements->clear(); - } - - uint64_t ListVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) - + elements->getMemoryUsage(); - } - - bool ListVectorBatch::hasVariableLength() { - return true; - } - - MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool), - offsets(pool, cap+1) { - // PASS - } - - MapVectorBatch::~MapVectorBatch() { - // PASS - } - - std::string MapVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Map vector <" << keys->toString() << ", " - << elements->toString() << " with " - << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void MapVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - offsets.resize(cap + 1); - } - } - - void MapVectorBatch::clear() { - keys->clear(); - elements->clear(); - numElements = 0; - } - - uint64_t MapVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) - + keys->getMemoryUsage() - + elements->getMemoryUsage(); - } - - bool MapVectorBatch::hasVariableLength() { - return true; - } - - UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool), - tags(pool, cap), - offsets(pool, cap) { - // PASS - } - - 
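The container batches defined here (struct, list, map, union) forward getMemoryUsage() and hasVariableLength() to their children, so one call on the root batch covers the whole column tree. A short illustrative sketch, under the same assumption about orc::getDefaultMemoryPool():

#include <iostream>
#include <memory>
#include "orc/MemoryPool.hh"
#include "orc/Type.hh"
#include "orc/Vector.hh"

void reportBatchFootprint() {
  std::unique_ptr<orc::Type> schema = orc::Type::buildTypeFromString(
      "struct<id:bigint,tags:array<string>>");
  std::unique_ptr<orc::ColumnVectorBatch> batch =
      schema->createRowBatch(4096, *orc::getDefaultMemoryPool());  // assumed pool accessor

  // getMemoryUsage() sums the root's buffers plus every child column;
  // hasVariableLength() is true here because of the array<string> column.
  std::cout << batch->getMemoryUsage() << " bytes, variable-length: "
            << std::boolalpha << batch->hasVariableLength() << std::endl;
}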
UnionVectorBatch::~UnionVectorBatch() { - for (uint64_t i=0; i < children.size(); i++) { - delete children[i]; - } - } - - std::string UnionVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Union vector <"; - for(size_t i=0; i < children.size(); ++i) { - if (i != 0) { - buffer << ", "; - } - buffer << children[i]->toString(); - } - buffer << "; with " << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void UnionVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - tags.resize(cap); - offsets.resize(cap); - } - } - - void UnionVectorBatch::clear() { - for(size_t i=0; i < children.size(); i++) { - children[i]->clear(); - } - numElements = 0; - } - - uint64_t UnionVectorBatch::getMemoryUsage() { - uint64_t memory = ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(tags.capacity() * sizeof(unsigned char) - + offsets.capacity() * sizeof(uint64_t)); - for(size_t i=0; i < children.size(); ++i) { - memory += children[i]->getMemoryUsage(); - } - return memory; - } - - bool UnionVectorBatch::hasVariableLength() { - for(size_t i=0; i < children.size(); ++i) { - if (children[i]->hasVariableLength()) { - return true; - } - } - return false; - } - - Decimal64VectorBatch::Decimal64VectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool), - precision(0), - scale(0), - values(pool, cap), - readScales(pool, cap) { - // PASS - } - - Decimal64VectorBatch::~Decimal64VectorBatch() { - // PASS - } - - std::string Decimal64VectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Decimal64 vector with " - << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void Decimal64VectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - values.resize(cap); - readScales.resize(cap); - } - } - - void Decimal64VectorBatch::clear() { - numElements = 0; - } - - uint64_t Decimal64VectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>( - (values.capacity() + readScales.capacity()) * sizeof(int64_t)); - } - - Decimal128VectorBatch::Decimal128VectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool), - precision(0), - scale(0), - values(pool, cap), - readScales(pool, cap) { - // PASS - } - - Decimal128VectorBatch::~Decimal128VectorBatch() { - // PASS - } - - std::string Decimal128VectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Decimal128 vector with " - << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void Decimal128VectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - values.resize(cap); - readScales.resize(cap); - } - } - - void Decimal128VectorBatch::clear() { - numElements = 0; - } - - uint64_t Decimal128VectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(values.capacity() * sizeof(Int128) - + readScales.capacity() * sizeof(int64_t)); - } - - Decimal::Decimal(const Int128& _value, - int32_t _scale): value(_value), scale(_scale) { - // PASS - } - - Decimal::Decimal(const std::string& str) { - std::size_t foundPoint = str.find("."); - // no decimal point, it is int - if(foundPoint == std::string::npos){ - value = Int128(str); - scale = 0; - }else{ - std::string copy(str); - scale = static_cast<int32_t>(str.length() - foundPoint - 1); - value = Int128(copy.replace(foundPoint, 1, "")); - } - } - - Decimal::Decimal() : 
value(0), scale(0) { - // PASS - } - - std::string Decimal::toString() const { - return value.toDecimalString(scale); - } - - TimestampVectorBatch::TimestampVectorBatch(uint64_t _capacity, - MemoryPool& pool - ): ColumnVectorBatch(_capacity, - pool), - data(pool, _capacity), - nanoseconds(pool, _capacity) { - // PASS - } - - TimestampVectorBatch::~TimestampVectorBatch() { - // PASS - } - - std::string TimestampVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Timestamp vector <" << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void TimestampVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - data.resize(cap); - nanoseconds.resize(cap); - } - } - - void TimestampVectorBatch::clear() { - numElements = 0; - } - - uint64_t TimestampVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>( - (data.capacity() + nanoseconds.capacity()) * sizeof(int64_t)); - } -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Vector.hh" + +#include "Adaptor.hh" +#include "orc/Exceptions.hh" + +#include <iostream> +#include <sstream> +#include <cstdlib> + +namespace orc { + + ColumnVectorBatch::ColumnVectorBatch(uint64_t cap, + MemoryPool& pool + ): capacity(cap), + numElements(0), + notNull(pool, cap), + hasNulls(false), + isEncoded(false), + memoryPool(pool) { + std::memset(notNull.data(), 1, capacity); + } + + ColumnVectorBatch::~ColumnVectorBatch() { + // PASS + } + + void ColumnVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + capacity = cap; + notNull.resize(cap); + } + } + + void ColumnVectorBatch::clear() { + numElements = 0; + } + + uint64_t ColumnVectorBatch::getMemoryUsage() { + return static_cast<uint64_t>(notNull.capacity() * sizeof(char)); + } + + bool ColumnVectorBatch::hasVariableLength() { + return false; + } + + LongVectorBatch::LongVectorBatch(uint64_t _capacity, MemoryPool& pool + ): ColumnVectorBatch(_capacity, pool), + data(pool, _capacity) { + // PASS + } + + LongVectorBatch::~LongVectorBatch() { + // PASS + } + + std::string LongVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Long vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void LongVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + data.resize(cap); + } + } + + void LongVectorBatch::clear() { + numElements = 0; + } + + uint64_t LongVectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(data.capacity() * sizeof(int64_t)); + } + + DoubleVectorBatch::DoubleVectorBatch(uint64_t _capacity, MemoryPool& pool + ): ColumnVectorBatch(_capacity, pool), + data(pool, _capacity) { + // PASS + } + + DoubleVectorBatch::~DoubleVectorBatch() { + // PASS + } + + std::string DoubleVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Double vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void DoubleVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + data.resize(cap); + } + } + + void DoubleVectorBatch::clear() { + numElements = 0; + } + + uint64_t DoubleVectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(data.capacity() * sizeof(double)); + } + + StringDictionary::StringDictionary(MemoryPool& pool) + : dictionaryBlob(pool), + dictionaryOffset(pool) { + // PASS + } + + EncodedStringVectorBatch::EncodedStringVectorBatch(uint64_t _capacity, + MemoryPool& pool) + : StringVectorBatch(_capacity, pool), + dictionary(), + index(pool, _capacity) { + // PASS + } + + EncodedStringVectorBatch::~EncodedStringVectorBatch() { + // PASS + } + + std::string EncodedStringVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Encoded string vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + StringVectorBatch::StringVectorBatch(uint64_t _capacity, MemoryPool& pool + ): ColumnVectorBatch(_capacity, pool), + data(pool, _capacity), + length(pool, _capacity), + blob(pool) { + // PASS + } + + StringVectorBatch::~StringVectorBatch() { + // PASS + } + + std::string StringVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Byte vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void StringVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + data.resize(cap); + length.resize(cap); + } + } + + 
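
As a reminder of how the string batch above is laid out: `data` holds one `char*` per row and `length` one byte count per row, and the batch does not own the referenced bytes unless they live in its `blob` buffer. A small sketch under that assumption; the sample strings are hypothetical.

```cpp
// Sketch: a StringVectorBatch stores a char* and a byte length per row; the
// caller keeps the backing storage alive (or copies it into `blob`).
// The sample values are invented for illustration.
#include <orc/Vector.hh>
#include <orc/MemoryPool.hh>
#include <iostream>
#include <string>
#include <vector>

int main() {
  orc::MemoryPool& pool = *orc::getDefaultPool();
  orc::StringVectorBatch batch(4, pool);

  // Backing storage must outlive any use of the batch.
  std::vector<std::string> rows = {"orc", "vector", "batch"};
  for (uint64_t i = 0; i < rows.size(); ++i) {
    batch.data[i]   = const_cast<char*>(rows[i].data());
    batch.length[i] = static_cast<int64_t>(rows[i].size());
  }
  batch.numElements = rows.size();

  std::cout << batch.toString() << "\n";        // "Byte vector <3 of 4>"
  std::cout << batch.getMemoryUsage() << "\n";  // pointer + length buffers only
  return 0;
}
```
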
void StringVectorBatch::clear() { + numElements = 0; + } + + uint64_t StringVectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(data.capacity() * sizeof(char*) + + length.capacity() * sizeof(int64_t)); + } + + StructVectorBatch::StructVectorBatch(uint64_t cap, MemoryPool& pool + ): ColumnVectorBatch(cap, pool) { + // PASS + } + + StructVectorBatch::~StructVectorBatch() { + for (uint64_t i=0; i<this->fields.size(); i++) { + delete this->fields[i]; + } + } + + std::string StructVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Struct vector <" << numElements << " of " << capacity + << "; "; + for(std::vector<ColumnVectorBatch*>::const_iterator ptr=fields.begin(); + ptr != fields.end(); ++ptr) { + buffer << (*ptr)->toString() << "; "; + } + buffer << ">"; + return buffer.str(); + } + + void StructVectorBatch::resize(uint64_t cap) { + ColumnVectorBatch::resize(cap); + } + + void StructVectorBatch::clear() { + for(size_t i=0; i < fields.size(); i++) { + fields[i]->clear(); + } + numElements = 0; + } + + uint64_t StructVectorBatch::getMemoryUsage() { + uint64_t memory = ColumnVectorBatch::getMemoryUsage(); + for (unsigned int i=0; i < fields.size(); i++) { + memory += fields[i]->getMemoryUsage(); + } + return memory; + } + + bool StructVectorBatch::hasVariableLength() { + for (unsigned int i=0; i < fields.size(); i++) { + if (fields[i]->hasVariableLength()) { + return true; + } + } + return false; + } + + ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool + ): ColumnVectorBatch(cap, pool), + offsets(pool, cap+1) { + // PASS + } + + ListVectorBatch::~ListVectorBatch() { + // PASS + } + + std::string ListVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "List vector <" << elements->toString() << " with " + << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void ListVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + offsets.resize(cap + 1); + } + } + + void ListVectorBatch::clear() { + numElements = 0; + elements->clear(); + } + + uint64_t ListVectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) + + elements->getMemoryUsage(); + } + + bool ListVectorBatch::hasVariableLength() { + return true; + } + + MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool + ): ColumnVectorBatch(cap, pool), + offsets(pool, cap+1) { + // PASS + } + + MapVectorBatch::~MapVectorBatch() { + // PASS + } + + std::string MapVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Map vector <" << keys->toString() << ", " + << elements->toString() << " with " + << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void MapVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + offsets.resize(cap + 1); + } + } + + void MapVectorBatch::clear() { + keys->clear(); + elements->clear(); + numElements = 0; + } + + uint64_t MapVectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) + + keys->getMemoryUsage() + + elements->getMemoryUsage(); + } + + bool MapVectorBatch::hasVariableLength() { + return true; + } + + UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool + ): ColumnVectorBatch(cap, pool), + tags(pool, cap), + offsets(pool, cap) { + // PASS + } + + 
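
The list batch above keeps a single flattened child batch plus `numElements + 1` offsets, so row `i` spans child elements `[offsets[i], offsets[i + 1])`. A hedged sketch of that layout, assuming `elements` is the `std::unique_ptr<ColumnVectorBatch>` child declared in `orc/Vector.hh`; the row values are invented.

```cpp
// Sketch of the ListVectorBatch layout: row i covers child elements
// [offsets[i], offsets[i + 1]). Assumes `elements` is the
// std::unique_ptr<ColumnVectorBatch> child from orc/Vector.hh.
#include <orc/Vector.hh>
#include <orc/MemoryPool.hh>
#include <iostream>

int main() {
  orc::MemoryPool& pool = *orc::getDefaultPool();
  orc::ListVectorBatch lists(4, pool);                      // offsets buffer holds 4 + 1 slots
  lists.elements.reset(new orc::LongVectorBatch(16, pool)); // flattened values

  auto* child = static_cast<orc::LongVectorBatch*>(lists.elements.get());
  // Two rows: [1, 2, 3] and [4, 5].
  const int64_t flat[] = {1, 2, 3, 4, 5};
  for (uint64_t i = 0; i < 5; ++i) {
    child->data[i] = flat[i];
  }
  child->numElements = 5;

  lists.offsets[0] = 0;
  lists.offsets[1] = 3;   // end of row 0
  lists.offsets[2] = 5;   // end of row 1
  lists.numElements = 2;

  std::cout << lists.toString() << "\n";  // recurses into the child batch
  return 0;
}
```

The `MapVectorBatch` follows the same convention with two flattened children, `keys` and `elements`, sharing one offsets buffer.
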
UnionVectorBatch::~UnionVectorBatch() { + for (uint64_t i=0; i < children.size(); i++) { + delete children[i]; + } + } + + std::string UnionVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Union vector <"; + for(size_t i=0; i < children.size(); ++i) { + if (i != 0) { + buffer << ", "; + } + buffer << children[i]->toString(); + } + buffer << "; with " << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void UnionVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + tags.resize(cap); + offsets.resize(cap); + } + } + + void UnionVectorBatch::clear() { + for(size_t i=0; i < children.size(); i++) { + children[i]->clear(); + } + numElements = 0; + } + + uint64_t UnionVectorBatch::getMemoryUsage() { + uint64_t memory = ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(tags.capacity() * sizeof(unsigned char) + + offsets.capacity() * sizeof(uint64_t)); + for(size_t i=0; i < children.size(); ++i) { + memory += children[i]->getMemoryUsage(); + } + return memory; + } + + bool UnionVectorBatch::hasVariableLength() { + for(size_t i=0; i < children.size(); ++i) { + if (children[i]->hasVariableLength()) { + return true; + } + } + return false; + } + + Decimal64VectorBatch::Decimal64VectorBatch(uint64_t cap, MemoryPool& pool + ): ColumnVectorBatch(cap, pool), + precision(0), + scale(0), + values(pool, cap), + readScales(pool, cap) { + // PASS + } + + Decimal64VectorBatch::~Decimal64VectorBatch() { + // PASS + } + + std::string Decimal64VectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Decimal64 vector with " + << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void Decimal64VectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + values.resize(cap); + readScales.resize(cap); + } + } + + void Decimal64VectorBatch::clear() { + numElements = 0; + } + + uint64_t Decimal64VectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>( + (values.capacity() + readScales.capacity()) * sizeof(int64_t)); + } + + Decimal128VectorBatch::Decimal128VectorBatch(uint64_t cap, MemoryPool& pool + ): ColumnVectorBatch(cap, pool), + precision(0), + scale(0), + values(pool, cap), + readScales(pool, cap) { + // PASS + } + + Decimal128VectorBatch::~Decimal128VectorBatch() { + // PASS + } + + std::string Decimal128VectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Decimal128 vector with " + << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void Decimal128VectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + values.resize(cap); + readScales.resize(cap); + } + } + + void Decimal128VectorBatch::clear() { + numElements = 0; + } + + uint64_t Decimal128VectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(values.capacity() * sizeof(Int128) + + readScales.capacity() * sizeof(int64_t)); + } + + Decimal::Decimal(const Int128& _value, + int32_t _scale): value(_value), scale(_scale) { + // PASS + } + + Decimal::Decimal(const std::string& str) { + std::size_t foundPoint = str.find("."); + // no decimal point, it is int + if(foundPoint == std::string::npos){ + value = Int128(str); + scale = 0; + }else{ + std::string copy(str); + scale = static_cast<int32_t>(str.length() - foundPoint - 1); + value = Int128(copy.replace(foundPoint, 1, "")); + } + } + + Decimal::Decimal() : 
value(0), scale(0) { + // PASS + } + + std::string Decimal::toString() const { + return value.toDecimalString(scale); + } + + TimestampVectorBatch::TimestampVectorBatch(uint64_t _capacity, + MemoryPool& pool + ): ColumnVectorBatch(_capacity, + pool), + data(pool, _capacity), + nanoseconds(pool, _capacity) { + // PASS + } + + TimestampVectorBatch::~TimestampVectorBatch() { + // PASS + } + + std::string TimestampVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Timestamp vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void TimestampVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + data.resize(cap); + nanoseconds.resize(cap); + } + } + + void TimestampVectorBatch::clear() { + numElements = 0; + } + + uint64_t TimestampVectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>( + (data.capacity() + nanoseconds.capacity()) * sizeof(int64_t)); + } +} diff --git a/contrib/libs/apache/orc/c++/src/Writer.cc b/contrib/libs/apache/orc/c++/src/Writer.cc index b5bd19b304..8b13750865 100644 --- a/contrib/libs/apache/orc/c++/src/Writer.cc +++ b/contrib/libs/apache/orc/c++/src/Writer.cc @@ -1,641 +1,641 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "orc/Common.hh" -#include "orc/OrcFile.hh" - -#include "ColumnWriter.hh" -#include "Timezone.hh" - -#include <memory> - -namespace orc { - - struct WriterOptionsPrivate { - uint64_t stripeSize; - uint64_t compressionBlockSize; - uint64_t rowIndexStride; - CompressionKind compression; - CompressionStrategy compressionStrategy; - MemoryPool* memoryPool; - double paddingTolerance; - std::ostream* errorStream; - FileVersion fileVersion; - double dictionaryKeySizeThreshold; - bool enableIndex; - std::set<uint64_t> columnsUseBloomFilter; - double bloomFilterFalsePositiveProb; - BloomFilterVersion bloomFilterVersion; - - WriterOptionsPrivate() : - fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12 - stripeSize = 64 * 1024 * 1024; // 64M - compressionBlockSize = 64 * 1024; // 64K - rowIndexStride = 10000; - compression = CompressionKind_ZLIB; - compressionStrategy = CompressionStrategy_SPEED; - memoryPool = getDefaultPool(); - paddingTolerance = 0.0; - errorStream = &std::cerr; - dictionaryKeySizeThreshold = 0.0; - enableIndex = true; - bloomFilterFalsePositiveProb = 0.05; - bloomFilterVersion = UTF8; - } - }; - - WriterOptions::WriterOptions(): - privateBits(std::unique_ptr<WriterOptionsPrivate> - (new WriterOptionsPrivate())) { - // PASS - } - - WriterOptions::WriterOptions(const WriterOptions& rhs): - privateBits(std::unique_ptr<WriterOptionsPrivate> - (new WriterOptionsPrivate(*(rhs.privateBits.get())))) { - // PASS - } - - WriterOptions::WriterOptions(WriterOptions& rhs) { - // swap privateBits with rhs - WriterOptionsPrivate* l = privateBits.release(); - privateBits.reset(rhs.privateBits.release()); - rhs.privateBits.reset(l); - } - - WriterOptions& WriterOptions::operator=(const WriterOptions& rhs) { - if (this != &rhs) { - privateBits.reset(new WriterOptionsPrivate(*(rhs.privateBits.get()))); - } - return *this; - } - - WriterOptions::~WriterOptions() { - // PASS - } - RleVersion WriterOptions::getRleVersion() const { - if(privateBits->fileVersion == FileVersion::v_0_11()) - { - return RleVersion_1; - } - - return RleVersion_2; - } - - WriterOptions& WriterOptions::setStripeSize(uint64_t size) { - privateBits->stripeSize = size; - return *this; - } - - uint64_t WriterOptions::getStripeSize() const { - return privateBits->stripeSize; - } - - WriterOptions& WriterOptions::setCompressionBlockSize(uint64_t size) { - privateBits->compressionBlockSize = size; - return *this; - } - - uint64_t WriterOptions::getCompressionBlockSize() const { - return privateBits->compressionBlockSize; - } - - WriterOptions& WriterOptions::setRowIndexStride(uint64_t stride) { - privateBits->rowIndexStride = stride; - privateBits->enableIndex = (stride != 0); - return *this; - } - - uint64_t WriterOptions::getRowIndexStride() const { - return privateBits->rowIndexStride; - } - - WriterOptions& WriterOptions::setDictionaryKeySizeThreshold(double val) { - privateBits->dictionaryKeySizeThreshold = val; - return *this; - } - - double WriterOptions::getDictionaryKeySizeThreshold() const { - return privateBits->dictionaryKeySizeThreshold; - } - - WriterOptions& WriterOptions::setFileVersion(const FileVersion& version) { - // Only Hive_0_11 and Hive_0_12 version are supported currently - if (version.getMajor() == 0 && (version.getMinor() == 11 || version.getMinor() == 12)) { - privateBits->fileVersion = version; - return *this; - } - throw std::logic_error("Unsupported file version specified."); - } - - FileVersion WriterOptions::getFileVersion() const { - return privateBits->fileVersion; - } - - 
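
Taken together, the options above are usually configured fluently and passed to `createWriter`, whose implementation follows later in this file. A hedged end-to-end sketch; the output path and the `struct<x:bigint>` schema are placeholders, and the explicit setter calls simply restate the defaults from `WriterOptionsPrivate`.

```cpp
// Sketch: configuring WriterOptions and writing one small batch.
// "/tmp/example.orc" and the struct<x:bigint> schema are placeholders.
#include <orc/OrcFile.hh>
#include <memory>

int main() {
  std::unique_ptr<orc::OutputStream> out =
      orc::writeLocalFile("/tmp/example.orc");
  std::unique_ptr<orc::Type> schema =
      orc::Type::buildTypeFromString("struct<x:bigint>");

  orc::WriterOptions options;
  options.setStripeSize(64 * 1024 * 1024)      // same as the defaults above
         .setCompressionBlockSize(64 * 1024)
         .setCompression(orc::CompressionKind_ZLIB)
         .setRowIndexStride(10000);

  std::unique_ptr<orc::Writer> writer =
      orc::createWriter(*schema, out.get(), options);

  std::unique_ptr<orc::ColumnVectorBatch> batch = writer->createRowBatch(1024);
  auto* root = static_cast<orc::StructVectorBatch*>(batch.get());
  auto* x    = static_cast<orc::LongVectorBatch*>(root->fields[0]);
  for (uint64_t i = 0; i < 1024; ++i) {
    x->data[i] = static_cast<int64_t>(i);
  }
  x->numElements = root->numElements = 1024;

  writer->add(*batch);   // buffered until the stripe size threshold is reached
  writer->close();       // flushes stripe, metadata, footer and postscript
  return 0;
}
```
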
WriterOptions& WriterOptions::setCompression(CompressionKind comp) { - privateBits->compression = comp; - return *this; - } - - CompressionKind WriterOptions::getCompression() const { - return privateBits->compression; - } - - WriterOptions& WriterOptions::setCompressionStrategy( - CompressionStrategy strategy) { - privateBits->compressionStrategy = strategy; - return *this; - } - - CompressionStrategy WriterOptions::getCompressionStrategy() const { - return privateBits->compressionStrategy; - } - - bool WriterOptions::getAlignedBitpacking() const { - return privateBits->compressionStrategy == CompressionStrategy ::CompressionStrategy_SPEED; - } - - WriterOptions& WriterOptions::setPaddingTolerance(double tolerance) { - privateBits->paddingTolerance = tolerance; - return *this; - } - - double WriterOptions::getPaddingTolerance() const { - return privateBits->paddingTolerance; - } - - WriterOptions& WriterOptions::setMemoryPool(MemoryPool* memoryPool) { - privateBits->memoryPool = memoryPool; - return *this; - } - - MemoryPool* WriterOptions::getMemoryPool() const { - return privateBits->memoryPool; - } - - WriterOptions& WriterOptions::setErrorStream(std::ostream& errStream) { - privateBits->errorStream = &errStream; - return *this; - } - - std::ostream* WriterOptions::getErrorStream() const { - return privateBits->errorStream; - } - - bool WriterOptions::getEnableIndex() const { - return privateBits->enableIndex; - } - - bool WriterOptions::getEnableDictionary() const { - return privateBits->dictionaryKeySizeThreshold > 0.0; - } - - WriterOptions& WriterOptions::setColumnsUseBloomFilter( - const std::set<uint64_t>& columns) { - privateBits->columnsUseBloomFilter = columns; - return *this; - } - - bool WriterOptions::isColumnUseBloomFilter(uint64_t column) const { - return privateBits->columnsUseBloomFilter.find(column) != - privateBits->columnsUseBloomFilter.end(); - } - - WriterOptions& WriterOptions::setBloomFilterFPP(double fpp) { - privateBits->bloomFilterFalsePositiveProb = fpp; - return *this; - } - - double WriterOptions::getBloomFilterFPP() const { - return privateBits->bloomFilterFalsePositiveProb; - } - - // delibrately not provide setter to write bloom filter version because - // we only support UTF8 for now. 
- BloomFilterVersion WriterOptions::getBloomFilterVersion() const { - return privateBits->bloomFilterVersion; - } - - Writer::~Writer() { - // PASS - } - - class WriterImpl : public Writer { - private: - std::unique_ptr<ColumnWriter> columnWriter; - std::unique_ptr<BufferedOutputStream> compressionStream; - std::unique_ptr<BufferedOutputStream> bufferedStream; - std::unique_ptr<StreamsFactory> streamsFactory; - OutputStream* outStream; - WriterOptions options; - const Type& type; - uint64_t stripeRows, totalRows, indexRows; - uint64_t currentOffset; - proto::Footer fileFooter; - proto::PostScript postScript; - proto::StripeInformation stripeInfo; - proto::Metadata metadata; - - static const char* magicId; - static const WriterId writerId; - - public: - WriterImpl( - const Type& type, - OutputStream* stream, - const WriterOptions& options); - - std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size) - const override; - - void add(ColumnVectorBatch& rowsToAdd) override; - - void close() override; - - void addUserMetadata(const std::string name, const std::string value) override; - - private: - void init(); - void initStripe(); - void writeStripe(); - void writeMetadata(); - void writeFileFooter(); - void writePostscript(); - void buildFooterType(const Type& t, proto::Footer& footer, uint32_t& index); - static proto::CompressionKind convertCompressionKind( - const CompressionKind& kind); - }; - - const char * WriterImpl::magicId = "ORC"; - - const WriterId WriterImpl::writerId = WriterId::ORC_CPP_WRITER; - - WriterImpl::WriterImpl( - const Type& t, - OutputStream* stream, - const WriterOptions& opts) : - outStream(stream), - options(opts), - type(t) { - streamsFactory = createStreamsFactory(options, outStream); - columnWriter = buildWriter(type, *streamsFactory, options); - stripeRows = totalRows = indexRows = 0; - currentOffset = 0; - - // compression stream for stripe footer, file footer and metadata - compressionStream = createCompressor( - options.getCompression(), - outStream, - options.getCompressionStrategy(), - 1 * 1024 * 1024, // buffer capacity: 1M - options.getCompressionBlockSize(), - *options.getMemoryPool()); - - // uncompressed stream for post script - bufferedStream.reset(new BufferedOutputStream( - *options.getMemoryPool(), - outStream, - 1024, // buffer capacity: 1024 bytes - options.getCompressionBlockSize())); - - init(); - } - - std::unique_ptr<ColumnVectorBatch> WriterImpl::createRowBatch(uint64_t size) - const { - return type.createRowBatch(size, *options.getMemoryPool()); - } - - void WriterImpl::add(ColumnVectorBatch& rowsToAdd) { - if (options.getEnableIndex()) { - uint64_t pos = 0; - uint64_t chunkSize = 0; - uint64_t rowIndexStride = options.getRowIndexStride(); - while (pos < rowsToAdd.numElements) { - chunkSize = std::min(rowsToAdd.numElements - pos, - rowIndexStride - indexRows); - columnWriter->add(rowsToAdd, pos, chunkSize, nullptr); - - pos += chunkSize; - indexRows += chunkSize; - stripeRows += chunkSize; - - if (indexRows >= rowIndexStride) { - columnWriter->createRowIndexEntry(); - indexRows = 0; - } - } - } else { - stripeRows += rowsToAdd.numElements; - columnWriter->add(rowsToAdd, 0, rowsToAdd.numElements, nullptr); - } - - if (columnWriter->getEstimatedSize() >= options.getStripeSize()) { - writeStripe(); - } - } - - void WriterImpl::close() { - if (stripeRows > 0) { - writeStripe(); - } - writeMetadata(); - writeFileFooter(); - writePostscript(); - outStream->close(); - } - - void WriterImpl::addUserMetadata(const std::string name, const 
std::string value){ - proto::UserMetadataItem* userMetadataItem = fileFooter.add_metadata(); - userMetadataItem->set_name(TString(name)); - userMetadataItem->set_value(TString(value)); - } - - void WriterImpl::init() { - // Write file header - const static size_t magicIdLength = strlen(WriterImpl::magicId); - outStream->write(WriterImpl::magicId, magicIdLength); - currentOffset += magicIdLength; - - // Initialize file footer - fileFooter.set_headerlength(currentOffset); - fileFooter.set_contentlength(0); - fileFooter.set_numberofrows(0); - fileFooter.set_rowindexstride( - static_cast<uint32_t>(options.getRowIndexStride())); - fileFooter.set_writer(writerId); +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/Common.hh" +#include "orc/OrcFile.hh" + +#include "ColumnWriter.hh" +#include "Timezone.hh" + +#include <memory> + +namespace orc { + + struct WriterOptionsPrivate { + uint64_t stripeSize; + uint64_t compressionBlockSize; + uint64_t rowIndexStride; + CompressionKind compression; + CompressionStrategy compressionStrategy; + MemoryPool* memoryPool; + double paddingTolerance; + std::ostream* errorStream; + FileVersion fileVersion; + double dictionaryKeySizeThreshold; + bool enableIndex; + std::set<uint64_t> columnsUseBloomFilter; + double bloomFilterFalsePositiveProb; + BloomFilterVersion bloomFilterVersion; + + WriterOptionsPrivate() : + fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12 + stripeSize = 64 * 1024 * 1024; // 64M + compressionBlockSize = 64 * 1024; // 64K + rowIndexStride = 10000; + compression = CompressionKind_ZLIB; + compressionStrategy = CompressionStrategy_SPEED; + memoryPool = getDefaultPool(); + paddingTolerance = 0.0; + errorStream = &std::cerr; + dictionaryKeySizeThreshold = 0.0; + enableIndex = true; + bloomFilterFalsePositiveProb = 0.05; + bloomFilterVersion = UTF8; + } + }; + + WriterOptions::WriterOptions(): + privateBits(std::unique_ptr<WriterOptionsPrivate> + (new WriterOptionsPrivate())) { + // PASS + } + + WriterOptions::WriterOptions(const WriterOptions& rhs): + privateBits(std::unique_ptr<WriterOptionsPrivate> + (new WriterOptionsPrivate(*(rhs.privateBits.get())))) { + // PASS + } + + WriterOptions::WriterOptions(WriterOptions& rhs) { + // swap privateBits with rhs + WriterOptionsPrivate* l = privateBits.release(); + privateBits.reset(rhs.privateBits.release()); + rhs.privateBits.reset(l); + } + + WriterOptions& WriterOptions::operator=(const WriterOptions& rhs) { + if (this != &rhs) { + privateBits.reset(new WriterOptionsPrivate(*(rhs.privateBits.get()))); + } + return *this; + } + + WriterOptions::~WriterOptions() { + // PASS + } + RleVersion WriterOptions::getRleVersion() const { + if(privateBits->fileVersion == FileVersion::v_0_11()) + { + return RleVersion_1; + } + + 
return RleVersion_2; + } + + WriterOptions& WriterOptions::setStripeSize(uint64_t size) { + privateBits->stripeSize = size; + return *this; + } + + uint64_t WriterOptions::getStripeSize() const { + return privateBits->stripeSize; + } + + WriterOptions& WriterOptions::setCompressionBlockSize(uint64_t size) { + privateBits->compressionBlockSize = size; + return *this; + } + + uint64_t WriterOptions::getCompressionBlockSize() const { + return privateBits->compressionBlockSize; + } + + WriterOptions& WriterOptions::setRowIndexStride(uint64_t stride) { + privateBits->rowIndexStride = stride; + privateBits->enableIndex = (stride != 0); + return *this; + } + + uint64_t WriterOptions::getRowIndexStride() const { + return privateBits->rowIndexStride; + } + + WriterOptions& WriterOptions::setDictionaryKeySizeThreshold(double val) { + privateBits->dictionaryKeySizeThreshold = val; + return *this; + } + + double WriterOptions::getDictionaryKeySizeThreshold() const { + return privateBits->dictionaryKeySizeThreshold; + } + + WriterOptions& WriterOptions::setFileVersion(const FileVersion& version) { + // Only Hive_0_11 and Hive_0_12 version are supported currently + if (version.getMajor() == 0 && (version.getMinor() == 11 || version.getMinor() == 12)) { + privateBits->fileVersion = version; + return *this; + } + throw std::logic_error("Unsupported file version specified."); + } + + FileVersion WriterOptions::getFileVersion() const { + return privateBits->fileVersion; + } + + WriterOptions& WriterOptions::setCompression(CompressionKind comp) { + privateBits->compression = comp; + return *this; + } + + CompressionKind WriterOptions::getCompression() const { + return privateBits->compression; + } + + WriterOptions& WriterOptions::setCompressionStrategy( + CompressionStrategy strategy) { + privateBits->compressionStrategy = strategy; + return *this; + } + + CompressionStrategy WriterOptions::getCompressionStrategy() const { + return privateBits->compressionStrategy; + } + + bool WriterOptions::getAlignedBitpacking() const { + return privateBits->compressionStrategy == CompressionStrategy ::CompressionStrategy_SPEED; + } + + WriterOptions& WriterOptions::setPaddingTolerance(double tolerance) { + privateBits->paddingTolerance = tolerance; + return *this; + } + + double WriterOptions::getPaddingTolerance() const { + return privateBits->paddingTolerance; + } + + WriterOptions& WriterOptions::setMemoryPool(MemoryPool* memoryPool) { + privateBits->memoryPool = memoryPool; + return *this; + } + + MemoryPool* WriterOptions::getMemoryPool() const { + return privateBits->memoryPool; + } + + WriterOptions& WriterOptions::setErrorStream(std::ostream& errStream) { + privateBits->errorStream = &errStream; + return *this; + } + + std::ostream* WriterOptions::getErrorStream() const { + return privateBits->errorStream; + } + + bool WriterOptions::getEnableIndex() const { + return privateBits->enableIndex; + } + + bool WriterOptions::getEnableDictionary() const { + return privateBits->dictionaryKeySizeThreshold > 0.0; + } + + WriterOptions& WriterOptions::setColumnsUseBloomFilter( + const std::set<uint64_t>& columns) { + privateBits->columnsUseBloomFilter = columns; + return *this; + } + + bool WriterOptions::isColumnUseBloomFilter(uint64_t column) const { + return privateBits->columnsUseBloomFilter.find(column) != + privateBits->columnsUseBloomFilter.end(); + } + + WriterOptions& WriterOptions::setBloomFilterFPP(double fpp) { + privateBits->bloomFilterFalsePositiveProb = fpp; + return *this; + } + + double 
WriterOptions::getBloomFilterFPP() const { + return privateBits->bloomFilterFalsePositiveProb; + } + + // delibrately not provide setter to write bloom filter version because + // we only support UTF8 for now. + BloomFilterVersion WriterOptions::getBloomFilterVersion() const { + return privateBits->bloomFilterVersion; + } + + Writer::~Writer() { + // PASS + } + + class WriterImpl : public Writer { + private: + std::unique_ptr<ColumnWriter> columnWriter; + std::unique_ptr<BufferedOutputStream> compressionStream; + std::unique_ptr<BufferedOutputStream> bufferedStream; + std::unique_ptr<StreamsFactory> streamsFactory; + OutputStream* outStream; + WriterOptions options; + const Type& type; + uint64_t stripeRows, totalRows, indexRows; + uint64_t currentOffset; + proto::Footer fileFooter; + proto::PostScript postScript; + proto::StripeInformation stripeInfo; + proto::Metadata metadata; + + static const char* magicId; + static const WriterId writerId; + + public: + WriterImpl( + const Type& type, + OutputStream* stream, + const WriterOptions& options); + + std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size) + const override; + + void add(ColumnVectorBatch& rowsToAdd) override; + + void close() override; + + void addUserMetadata(const std::string name, const std::string value) override; + + private: + void init(); + void initStripe(); + void writeStripe(); + void writeMetadata(); + void writeFileFooter(); + void writePostscript(); + void buildFooterType(const Type& t, proto::Footer& footer, uint32_t& index); + static proto::CompressionKind convertCompressionKind( + const CompressionKind& kind); + }; + + const char * WriterImpl::magicId = "ORC"; + + const WriterId WriterImpl::writerId = WriterId::ORC_CPP_WRITER; + + WriterImpl::WriterImpl( + const Type& t, + OutputStream* stream, + const WriterOptions& opts) : + outStream(stream), + options(opts), + type(t) { + streamsFactory = createStreamsFactory(options, outStream); + columnWriter = buildWriter(type, *streamsFactory, options); + stripeRows = totalRows = indexRows = 0; + currentOffset = 0; + + // compression stream for stripe footer, file footer and metadata + compressionStream = createCompressor( + options.getCompression(), + outStream, + options.getCompressionStrategy(), + 1 * 1024 * 1024, // buffer capacity: 1M + options.getCompressionBlockSize(), + *options.getMemoryPool()); + + // uncompressed stream for post script + bufferedStream.reset(new BufferedOutputStream( + *options.getMemoryPool(), + outStream, + 1024, // buffer capacity: 1024 bytes + options.getCompressionBlockSize())); + + init(); + } + + std::unique_ptr<ColumnVectorBatch> WriterImpl::createRowBatch(uint64_t size) + const { + return type.createRowBatch(size, *options.getMemoryPool()); + } + + void WriterImpl::add(ColumnVectorBatch& rowsToAdd) { + if (options.getEnableIndex()) { + uint64_t pos = 0; + uint64_t chunkSize = 0; + uint64_t rowIndexStride = options.getRowIndexStride(); + while (pos < rowsToAdd.numElements) { + chunkSize = std::min(rowsToAdd.numElements - pos, + rowIndexStride - indexRows); + columnWriter->add(rowsToAdd, pos, chunkSize, nullptr); + + pos += chunkSize; + indexRows += chunkSize; + stripeRows += chunkSize; + + if (indexRows >= rowIndexStride) { + columnWriter->createRowIndexEntry(); + indexRows = 0; + } + } + } else { + stripeRows += rowsToAdd.numElements; + columnWriter->add(rowsToAdd, 0, rowsToAdd.numElements, nullptr); + } + + if (columnWriter->getEstimatedSize() >= options.getStripeSize()) { + writeStripe(); + } + } + + void 
WriterImpl::close() { + if (stripeRows > 0) { + writeStripe(); + } + writeMetadata(); + writeFileFooter(); + writePostscript(); + outStream->close(); + } + + void WriterImpl::addUserMetadata(const std::string name, const std::string value){ + proto::UserMetadataItem* userMetadataItem = fileFooter.add_metadata(); + userMetadataItem->set_name(TString(name)); + userMetadataItem->set_value(TString(value)); + } + + void WriterImpl::init() { + // Write file header + const static size_t magicIdLength = strlen(WriterImpl::magicId); + outStream->write(WriterImpl::magicId, magicIdLength); + currentOffset += magicIdLength; + + // Initialize file footer + fileFooter.set_headerlength(currentOffset); + fileFooter.set_contentlength(0); + fileFooter.set_numberofrows(0); + fileFooter.set_rowindexstride( + static_cast<uint32_t>(options.getRowIndexStride())); + fileFooter.set_writer(writerId); fileFooter.set_softwareversion(ORC_VERSION); - - uint32_t index = 0; - buildFooterType(type, fileFooter, index); - - // Initialize post script - postScript.set_footerlength(0); - postScript.set_compression( - WriterImpl::convertCompressionKind(options.getCompression())); - postScript.set_compressionblocksize(options.getCompressionBlockSize()); - - postScript.add_version(options.getFileVersion().getMajor()); - postScript.add_version(options.getFileVersion().getMinor()); - - postScript.set_writerversion(WriterVersion_ORC_135); - postScript.set_magic("ORC"); - - // Initialize first stripe - initStripe(); - } - - void WriterImpl::initStripe() { - stripeInfo.set_offset(currentOffset); - stripeInfo.set_indexlength(0); - stripeInfo.set_datalength(0); - stripeInfo.set_footerlength(0); - stripeInfo.set_numberofrows(0); - - stripeRows = indexRows = 0; - } - - void WriterImpl::writeStripe() { - if (options.getEnableIndex() && indexRows != 0) { - columnWriter->createRowIndexEntry(); - indexRows = 0; - } else { - columnWriter->mergeRowGroupStatsIntoStripeStats(); - } - - // dictionary should be written before any stream is flushed - columnWriter->writeDictionary(); - - std::vector<proto::Stream> streams; - // write ROW_INDEX streams - if (options.getEnableIndex()) { - columnWriter->writeIndex(streams); - } - // write streams like PRESENT, DATA, etc. 
- columnWriter->flush(streams); - - // generate and write stripe footer - proto::StripeFooter stripeFooter; - for (uint32_t i = 0; i < streams.size(); ++i) { - *stripeFooter.add_streams() = streams[i]; - } - - std::vector<proto::ColumnEncoding> encodings; - columnWriter->getColumnEncoding(encodings); - - for (uint32_t i = 0; i < encodings.size(); ++i) { - *stripeFooter.add_columns() = encodings[i]; - } - - // use GMT to guarantee TimestampVectorBatch from reader can write - // same wall clock time - stripeFooter.set_writertimezone("GMT"); - - // add stripe statistics to metadata - proto::StripeStatistics* stripeStats = metadata.add_stripestats(); - std::vector<proto::ColumnStatistics> colStats; - columnWriter->getStripeStatistics(colStats); - for (uint32_t i = 0; i != colStats.size(); ++i) { - *stripeStats->add_colstats() = colStats[i]; - } - // merge stripe stats into file stats and clear stripe stats - columnWriter->mergeStripeStatsIntoFileStats(); - - if (!stripeFooter.SerializeToZeroCopyStream(compressionStream.get())) { - throw std::logic_error("Failed to write stripe footer."); - } - uint64_t footerLength = compressionStream->flush(); - - // calculate data length and index length - uint64_t dataLength = 0; - uint64_t indexLength = 0; - for (uint32_t i = 0; i < streams.size(); ++i) { - if (streams[i].kind() == proto::Stream_Kind_ROW_INDEX || - streams[i].kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8) { - indexLength += streams[i].length(); - } else { - dataLength += streams[i].length(); - } - } - - // update stripe info - stripeInfo.set_indexlength(indexLength); - stripeInfo.set_datalength(dataLength); - stripeInfo.set_footerlength(footerLength); - stripeInfo.set_numberofrows(stripeRows); - - *fileFooter.add_stripes() = stripeInfo; - - currentOffset = currentOffset + indexLength + dataLength + footerLength; - totalRows += stripeRows; - - columnWriter->reset(); - - initStripe(); - } - - void WriterImpl::writeMetadata() { - if (!metadata.SerializeToZeroCopyStream(compressionStream.get())) { - throw std::logic_error("Failed to write metadata."); - } - postScript.set_metadatalength(compressionStream.get()->flush()); - } - - void WriterImpl::writeFileFooter() { - fileFooter.set_contentlength(currentOffset - fileFooter.headerlength()); - fileFooter.set_numberofrows(totalRows); - - // update file statistics - std::vector<proto::ColumnStatistics> colStats; - columnWriter->getFileStatistics(colStats); - for (uint32_t i = 0; i != colStats.size(); ++i) { - *fileFooter.add_statistics() = colStats[i]; - } - - if (!fileFooter.SerializeToZeroCopyStream(compressionStream.get())) { - throw std::logic_error("Failed to write file footer."); - } - postScript.set_footerlength(compressionStream->flush()); - } - - void WriterImpl::writePostscript() { - if (!postScript.SerializeToZeroCopyStream(bufferedStream.get())) { - throw std::logic_error("Failed to write post script."); - } - unsigned char psLength = - static_cast<unsigned char>(bufferedStream->flush()); - outStream->write(&psLength, sizeof(unsigned char)); - } - - void WriterImpl::buildFooterType( - const Type& t, - proto::Footer& footer, - uint32_t & index) { - proto::Type protoType; - protoType.set_maximumlength(static_cast<uint32_t>(t.getMaximumLength())); - protoType.set_precision(static_cast<uint32_t>(t.getPrecision())); - protoType.set_scale(static_cast<uint32_t>(t.getScale())); - - switch (t.getKind()) { - case BOOLEAN: { - protoType.set_kind(proto::Type_Kind_BOOLEAN); - break; - } - case BYTE: { - protoType.set_kind(proto::Type_Kind_BYTE); - 
break; - } - case SHORT: { - protoType.set_kind(proto::Type_Kind_SHORT); - break; - } - case INT: { - protoType.set_kind(proto::Type_Kind_INT); - break; - } - case LONG: { - protoType.set_kind(proto::Type_Kind_LONG); - break; - } - case FLOAT: { - protoType.set_kind(proto::Type_Kind_FLOAT); - break; - } - case DOUBLE: { - protoType.set_kind(proto::Type_Kind_DOUBLE); - break; - } - case STRING: { - protoType.set_kind(proto::Type_Kind_STRING); - break; - } - case BINARY: { - protoType.set_kind(proto::Type_Kind_BINARY); - break; - } - case TIMESTAMP: { - protoType.set_kind(proto::Type_Kind_TIMESTAMP); - break; - } - case LIST: { - protoType.set_kind(proto::Type_Kind_LIST); - break; - } - case MAP: { - protoType.set_kind(proto::Type_Kind_MAP); - break; - } - case STRUCT: { - protoType.set_kind(proto::Type_Kind_STRUCT); - break; - } - case UNION: { - protoType.set_kind(proto::Type_Kind_UNION); - break; - } - case DECIMAL: { - protoType.set_kind(proto::Type_Kind_DECIMAL); - break; - } - case DATE: { - protoType.set_kind(proto::Type_Kind_DATE); - break; - } - case VARCHAR: { - protoType.set_kind(proto::Type_Kind_VARCHAR); - break; - } - case CHAR: { - protoType.set_kind(proto::Type_Kind_CHAR); - break; - } - default: - throw std::logic_error("Unknown type."); - } - - int pos = static_cast<int>(index); - *footer.add_types() = protoType; - - for (uint64_t i = 0; i < t.getSubtypeCount(); ++i) { - // only add subtypes' field names if this type is STRUCT - if (t.getKind() == STRUCT) { - footer.mutable_types(pos)->add_fieldnames(TString(t.getFieldName(i))); - } - footer.mutable_types(pos)->add_subtypes(++index); - buildFooterType(*t.getSubtype(i), footer, index); - } - } - - proto::CompressionKind WriterImpl::convertCompressionKind( - const CompressionKind& kind) { - return static_cast<proto::CompressionKind>(kind); - } - - std::unique_ptr<Writer> createWriter( - const Type& type, - OutputStream* stream, - const WriterOptions& options) { - return std::unique_ptr<Writer>( - new WriterImpl( - type, - stream, - options)); - } - -} - + + uint32_t index = 0; + buildFooterType(type, fileFooter, index); + + // Initialize post script + postScript.set_footerlength(0); + postScript.set_compression( + WriterImpl::convertCompressionKind(options.getCompression())); + postScript.set_compressionblocksize(options.getCompressionBlockSize()); + + postScript.add_version(options.getFileVersion().getMajor()); + postScript.add_version(options.getFileVersion().getMinor()); + + postScript.set_writerversion(WriterVersion_ORC_135); + postScript.set_magic("ORC"); + + // Initialize first stripe + initStripe(); + } + + void WriterImpl::initStripe() { + stripeInfo.set_offset(currentOffset); + stripeInfo.set_indexlength(0); + stripeInfo.set_datalength(0); + stripeInfo.set_footerlength(0); + stripeInfo.set_numberofrows(0); + + stripeRows = indexRows = 0; + } + + void WriterImpl::writeStripe() { + if (options.getEnableIndex() && indexRows != 0) { + columnWriter->createRowIndexEntry(); + indexRows = 0; + } else { + columnWriter->mergeRowGroupStatsIntoStripeStats(); + } + + // dictionary should be written before any stream is flushed + columnWriter->writeDictionary(); + + std::vector<proto::Stream> streams; + // write ROW_INDEX streams + if (options.getEnableIndex()) { + columnWriter->writeIndex(streams); + } + // write streams like PRESENT, DATA, etc. 
+ columnWriter->flush(streams); + + // generate and write stripe footer + proto::StripeFooter stripeFooter; + for (uint32_t i = 0; i < streams.size(); ++i) { + *stripeFooter.add_streams() = streams[i]; + } + + std::vector<proto::ColumnEncoding> encodings; + columnWriter->getColumnEncoding(encodings); + + for (uint32_t i = 0; i < encodings.size(); ++i) { + *stripeFooter.add_columns() = encodings[i]; + } + + // use GMT to guarantee TimestampVectorBatch from reader can write + // same wall clock time + stripeFooter.set_writertimezone("GMT"); + + // add stripe statistics to metadata + proto::StripeStatistics* stripeStats = metadata.add_stripestats(); + std::vector<proto::ColumnStatistics> colStats; + columnWriter->getStripeStatistics(colStats); + for (uint32_t i = 0; i != colStats.size(); ++i) { + *stripeStats->add_colstats() = colStats[i]; + } + // merge stripe stats into file stats and clear stripe stats + columnWriter->mergeStripeStatsIntoFileStats(); + + if (!stripeFooter.SerializeToZeroCopyStream(compressionStream.get())) { + throw std::logic_error("Failed to write stripe footer."); + } + uint64_t footerLength = compressionStream->flush(); + + // calculate data length and index length + uint64_t dataLength = 0; + uint64_t indexLength = 0; + for (uint32_t i = 0; i < streams.size(); ++i) { + if (streams[i].kind() == proto::Stream_Kind_ROW_INDEX || + streams[i].kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8) { + indexLength += streams[i].length(); + } else { + dataLength += streams[i].length(); + } + } + + // update stripe info + stripeInfo.set_indexlength(indexLength); + stripeInfo.set_datalength(dataLength); + stripeInfo.set_footerlength(footerLength); + stripeInfo.set_numberofrows(stripeRows); + + *fileFooter.add_stripes() = stripeInfo; + + currentOffset = currentOffset + indexLength + dataLength + footerLength; + totalRows += stripeRows; + + columnWriter->reset(); + + initStripe(); + } + + void WriterImpl::writeMetadata() { + if (!metadata.SerializeToZeroCopyStream(compressionStream.get())) { + throw std::logic_error("Failed to write metadata."); + } + postScript.set_metadatalength(compressionStream.get()->flush()); + } + + void WriterImpl::writeFileFooter() { + fileFooter.set_contentlength(currentOffset - fileFooter.headerlength()); + fileFooter.set_numberofrows(totalRows); + + // update file statistics + std::vector<proto::ColumnStatistics> colStats; + columnWriter->getFileStatistics(colStats); + for (uint32_t i = 0; i != colStats.size(); ++i) { + *fileFooter.add_statistics() = colStats[i]; + } + + if (!fileFooter.SerializeToZeroCopyStream(compressionStream.get())) { + throw std::logic_error("Failed to write file footer."); + } + postScript.set_footerlength(compressionStream->flush()); + } + + void WriterImpl::writePostscript() { + if (!postScript.SerializeToZeroCopyStream(bufferedStream.get())) { + throw std::logic_error("Failed to write post script."); + } + unsigned char psLength = + static_cast<unsigned char>(bufferedStream->flush()); + outStream->write(&psLength, sizeof(unsigned char)); + } + + void WriterImpl::buildFooterType( + const Type& t, + proto::Footer& footer, + uint32_t & index) { + proto::Type protoType; + protoType.set_maximumlength(static_cast<uint32_t>(t.getMaximumLength())); + protoType.set_precision(static_cast<uint32_t>(t.getPrecision())); + protoType.set_scale(static_cast<uint32_t>(t.getScale())); + + switch (t.getKind()) { + case BOOLEAN: { + protoType.set_kind(proto::Type_Kind_BOOLEAN); + break; + } + case BYTE: { + protoType.set_kind(proto::Type_Kind_BYTE); + 
break; + } + case SHORT: { + protoType.set_kind(proto::Type_Kind_SHORT); + break; + } + case INT: { + protoType.set_kind(proto::Type_Kind_INT); + break; + } + case LONG: { + protoType.set_kind(proto::Type_Kind_LONG); + break; + } + case FLOAT: { + protoType.set_kind(proto::Type_Kind_FLOAT); + break; + } + case DOUBLE: { + protoType.set_kind(proto::Type_Kind_DOUBLE); + break; + } + case STRING: { + protoType.set_kind(proto::Type_Kind_STRING); + break; + } + case BINARY: { + protoType.set_kind(proto::Type_Kind_BINARY); + break; + } + case TIMESTAMP: { + protoType.set_kind(proto::Type_Kind_TIMESTAMP); + break; + } + case LIST: { + protoType.set_kind(proto::Type_Kind_LIST); + break; + } + case MAP: { + protoType.set_kind(proto::Type_Kind_MAP); + break; + } + case STRUCT: { + protoType.set_kind(proto::Type_Kind_STRUCT); + break; + } + case UNION: { + protoType.set_kind(proto::Type_Kind_UNION); + break; + } + case DECIMAL: { + protoType.set_kind(proto::Type_Kind_DECIMAL); + break; + } + case DATE: { + protoType.set_kind(proto::Type_Kind_DATE); + break; + } + case VARCHAR: { + protoType.set_kind(proto::Type_Kind_VARCHAR); + break; + } + case CHAR: { + protoType.set_kind(proto::Type_Kind_CHAR); + break; + } + default: + throw std::logic_error("Unknown type."); + } + + int pos = static_cast<int>(index); + *footer.add_types() = protoType; + + for (uint64_t i = 0; i < t.getSubtypeCount(); ++i) { + // only add subtypes' field names if this type is STRUCT + if (t.getKind() == STRUCT) { + footer.mutable_types(pos)->add_fieldnames(TString(t.getFieldName(i))); + } + footer.mutable_types(pos)->add_subtypes(++index); + buildFooterType(*t.getSubtype(i), footer, index); + } + } + + proto::CompressionKind WriterImpl::convertCompressionKind( + const CompressionKind& kind) { + return static_cast<proto::CompressionKind>(kind); + } + + std::unique_ptr<Writer> createWriter( + const Type& type, + OutputStream* stream, + const WriterOptions& options) { + return std::unique_ptr<Writer>( + new WriterImpl( + type, + stream, + options)); + } + +} + diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.cc b/contrib/libs/apache/orc/c++/src/io/InputStream.cc index 6e54b1412f..201f6f9c1d 100644 --- a/contrib/libs/apache/orc/c++/src/io/InputStream.cc +++ b/contrib/libs/apache/orc/c++/src/io/InputStream.cc @@ -1,222 +1,222 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "orc/Exceptions.hh" -#include "InputStream.hh" - -#include <algorithm> -#include <iomanip> - -namespace orc { - - void printBuffer(std::ostream& out, - const char *buffer, - uint64_t length) { - const uint64_t width = 24; - out << std::hex; - for(uint64_t line = 0; line < (length + width - 1) / width; ++line) { - out << std::setfill('0') << std::setw(7) << (line * width); - for(uint64_t byte = 0; - byte < width && line * width + byte < length; ++byte) { - out << " " << std::setfill('0') << std::setw(2) - << static_cast<uint64_t>(0xff & buffer[line * width + - byte]); - } - out << "\n"; - } - out << std::dec; - } - - PositionProvider::PositionProvider(const std::list<uint64_t>& posns) { - position = posns.begin(); - } - - uint64_t PositionProvider::next() { - uint64_t result = *position; - ++position; - return result; - } - - SeekableInputStream::~SeekableInputStream() { - // PASS - } - - SeekableArrayInputStream::~SeekableArrayInputStream() { - // PASS - } - - SeekableArrayInputStream::SeekableArrayInputStream - (const unsigned char* values, - uint64_t size, - uint64_t blkSize - ): data(reinterpret_cast<const char*>(values)) { - length = size; - position = 0; - blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize); - } - - SeekableArrayInputStream::SeekableArrayInputStream(const char* values, - uint64_t size, - uint64_t blkSize - ): data(values) { - length = size; - position = 0; - blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize); - } - - bool SeekableArrayInputStream::Next(const void** buffer, int*size) { - uint64_t currentSize = std::min(length - position, blockSize); - if (currentSize > 0) { - *buffer = data + position; - *size = static_cast<int>(currentSize); - position += currentSize; - return true; - } - *size = 0; - return false; - } - - void SeekableArrayInputStream::BackUp(int count) { - if (count >= 0) { - uint64_t unsignedCount = static_cast<uint64_t>(count); - if (unsignedCount <= blockSize && unsignedCount <= position) { - position -= unsignedCount; - } else { - throw std::logic_error("Can't backup that much!"); - } - } - } - - bool SeekableArrayInputStream::Skip(int count) { - if (count >= 0) { - uint64_t unsignedCount = static_cast<uint64_t>(count); - if (unsignedCount + position <= length) { - position += unsignedCount; - return true; - } else { - position = length; - } - } - return false; - } - +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Exceptions.hh" +#include "InputStream.hh" + +#include <algorithm> +#include <iomanip> + +namespace orc { + + void printBuffer(std::ostream& out, + const char *buffer, + uint64_t length) { + const uint64_t width = 24; + out << std::hex; + for(uint64_t line = 0; line < (length + width - 1) / width; ++line) { + out << std::setfill('0') << std::setw(7) << (line * width); + for(uint64_t byte = 0; + byte < width && line * width + byte < length; ++byte) { + out << " " << std::setfill('0') << std::setw(2) + << static_cast<uint64_t>(0xff & buffer[line * width + + byte]); + } + out << "\n"; + } + out << std::dec; + } + + PositionProvider::PositionProvider(const std::list<uint64_t>& posns) { + position = posns.begin(); + } + + uint64_t PositionProvider::next() { + uint64_t result = *position; + ++position; + return result; + } + + SeekableInputStream::~SeekableInputStream() { + // PASS + } + + SeekableArrayInputStream::~SeekableArrayInputStream() { + // PASS + } + + SeekableArrayInputStream::SeekableArrayInputStream + (const unsigned char* values, + uint64_t size, + uint64_t blkSize + ): data(reinterpret_cast<const char*>(values)) { + length = size; + position = 0; + blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize); + } + + SeekableArrayInputStream::SeekableArrayInputStream(const char* values, + uint64_t size, + uint64_t blkSize + ): data(values) { + length = size; + position = 0; + blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize); + } + + bool SeekableArrayInputStream::Next(const void** buffer, int*size) { + uint64_t currentSize = std::min(length - position, blockSize); + if (currentSize > 0) { + *buffer = data + position; + *size = static_cast<int>(currentSize); + position += currentSize; + return true; + } + *size = 0; + return false; + } + + void SeekableArrayInputStream::BackUp(int count) { + if (count >= 0) { + uint64_t unsignedCount = static_cast<uint64_t>(count); + if (unsignedCount <= blockSize && unsignedCount <= position) { + position -= unsignedCount; + } else { + throw std::logic_error("Can't backup that much!"); + } + } + } + + bool SeekableArrayInputStream::Skip(int count) { + if (count >= 0) { + uint64_t unsignedCount = static_cast<uint64_t>(count); + if (unsignedCount + position <= length) { + position += unsignedCount; + return true; + } else { + position = length; + } + } + return false; + } + int64_t SeekableArrayInputStream::ByteCount() const { - return static_cast<google::protobuf::int64>(position); - } - - void SeekableArrayInputStream::seek(PositionProvider& seekPosition) { - position = seekPosition.next(); - } - - std::string SeekableArrayInputStream::getName() const { - std::ostringstream result; - result << "SeekableArrayInputStream " << position << " of " << length; - return result.str(); - } - - static uint64_t computeBlock(uint64_t request, uint64_t length) { - return std::min(length, request == 0 ? 
256 * 1024 : request); - } - - SeekableFileInputStream::SeekableFileInputStream(InputStream* stream, - uint64_t offset, - uint64_t byteCount, - MemoryPool& _pool, - uint64_t _blockSize - ):pool(_pool), - input(stream), - start(offset), - length(byteCount), - blockSize(computeBlock - (_blockSize, - length)) { - - position = 0; - buffer.reset(new DataBuffer<char>(pool)); - pushBack = 0; - } - - SeekableFileInputStream::~SeekableFileInputStream() { - // PASS - } - - bool SeekableFileInputStream::Next(const void** data, int*size) { - uint64_t bytesRead; - if (pushBack != 0) { - *data = buffer->data() + (buffer->size() - pushBack); - bytesRead = pushBack; - } else { - bytesRead = std::min(length - position, blockSize); - buffer->resize(bytesRead); - if (bytesRead > 0) { - input->read(buffer->data(), bytesRead, start+position); - *data = static_cast<void*>(buffer->data()); - } - } - position += bytesRead; - pushBack = 0; - *size = static_cast<int>(bytesRead); - return bytesRead != 0; - } - - void SeekableFileInputStream::BackUp(int signedCount) { - if (signedCount < 0) { - throw std::logic_error("can't backup negative distances"); - } - uint64_t count = static_cast<uint64_t>(signedCount); - if (pushBack > 0) { - throw std::logic_error("can't backup unless we just called Next"); - } - if (count > blockSize || count > position) { - throw std::logic_error("can't backup that far"); - } - pushBack = static_cast<uint64_t>(count); - position -= pushBack; - } - - bool SeekableFileInputStream::Skip(int signedCount) { - if (signedCount < 0) { - return false; - } - uint64_t count = static_cast<uint64_t>(signedCount); - position = std::min(position + count, length); - pushBack = 0; - return position < length; - } - - int64_t SeekableFileInputStream::ByteCount() const { - return static_cast<int64_t>(position); - } - - void SeekableFileInputStream::seek(PositionProvider& location) { - position = location.next(); - if (position > length) { - position = length; - throw std::logic_error("seek too far"); - } - pushBack = 0; - } - - std::string SeekableFileInputStream::getName() const { - std::ostringstream result; - result << input->getName() << " from " << start << " for " - << length; - return result.str(); - } - -} + return static_cast<google::protobuf::int64>(position); + } + + void SeekableArrayInputStream::seek(PositionProvider& seekPosition) { + position = seekPosition.next(); + } + + std::string SeekableArrayInputStream::getName() const { + std::ostringstream result; + result << "SeekableArrayInputStream " << position << " of " << length; + return result.str(); + } + + static uint64_t computeBlock(uint64_t request, uint64_t length) { + return std::min(length, request == 0 ? 
256 * 1024 : request); + } + + SeekableFileInputStream::SeekableFileInputStream(InputStream* stream, + uint64_t offset, + uint64_t byteCount, + MemoryPool& _pool, + uint64_t _blockSize + ):pool(_pool), + input(stream), + start(offset), + length(byteCount), + blockSize(computeBlock + (_blockSize, + length)) { + + position = 0; + buffer.reset(new DataBuffer<char>(pool)); + pushBack = 0; + } + + SeekableFileInputStream::~SeekableFileInputStream() { + // PASS + } + + bool SeekableFileInputStream::Next(const void** data, int*size) { + uint64_t bytesRead; + if (pushBack != 0) { + *data = buffer->data() + (buffer->size() - pushBack); + bytesRead = pushBack; + } else { + bytesRead = std::min(length - position, blockSize); + buffer->resize(bytesRead); + if (bytesRead > 0) { + input->read(buffer->data(), bytesRead, start+position); + *data = static_cast<void*>(buffer->data()); + } + } + position += bytesRead; + pushBack = 0; + *size = static_cast<int>(bytesRead); + return bytesRead != 0; + } + + void SeekableFileInputStream::BackUp(int signedCount) { + if (signedCount < 0) { + throw std::logic_error("can't backup negative distances"); + } + uint64_t count = static_cast<uint64_t>(signedCount); + if (pushBack > 0) { + throw std::logic_error("can't backup unless we just called Next"); + } + if (count > blockSize || count > position) { + throw std::logic_error("can't backup that far"); + } + pushBack = static_cast<uint64_t>(count); + position -= pushBack; + } + + bool SeekableFileInputStream::Skip(int signedCount) { + if (signedCount < 0) { + return false; + } + uint64_t count = static_cast<uint64_t>(signedCount); + position = std::min(position + count, length); + pushBack = 0; + return position < length; + } + + int64_t SeekableFileInputStream::ByteCount() const { + return static_cast<int64_t>(position); + } + + void SeekableFileInputStream::seek(PositionProvider& location) { + position = location.next(); + if (position > length) { + position = length; + throw std::logic_error("seek too far"); + } + pushBack = 0; + } + + std::string SeekableFileInputStream::getName() const { + std::ostringstream result; + result << input->getName() << " from " << start << " for " + << length; + return result.str(); + } + +} diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.hh b/contrib/libs/apache/orc/c++/src/io/InputStream.hh index d8bd3d4d8c..797049a300 100644 --- a/contrib/libs/apache/orc/c++/src/io/InputStream.hh +++ b/contrib/libs/apache/orc/c++/src/io/InputStream.hh @@ -1,116 +1,116 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_INPUTSTREAM_HH -#define ORC_INPUTSTREAM_HH - -#include "Adaptor.hh" -#include "orc/OrcFile.hh" -#include "wrap/zero-copy-stream-wrapper.h" - -#include <list> -#include <fstream> -#include <iostream> -#include <sstream> -#include <vector> - -namespace orc { - - void printBuffer(std::ostream& out, - const char *buffer, - uint64_t length); - - class PositionProvider { - private: - std::list<uint64_t>::const_iterator position; - public: - PositionProvider(const std::list<uint64_t>& positions); - uint64_t next(); - }; - - /** - * A subclass of Google's ZeroCopyInputStream that supports seek. - * By extending Google's class, we get the ability to pass it directly - * to the protobuf readers. - */ - class SeekableInputStream: public google::protobuf::io::ZeroCopyInputStream { - public: - virtual ~SeekableInputStream(); - virtual void seek(PositionProvider& position) = 0; - virtual std::string getName() const = 0; - }; - - /** - * Create a seekable input stream based on a memory range. - */ - class SeekableArrayInputStream: public SeekableInputStream { - private: - const char* data; - uint64_t length; - uint64_t position; - uint64_t blockSize; - - public: - SeekableArrayInputStream(const unsigned char* list, - uint64_t length, - uint64_t block_size = 0); - SeekableArrayInputStream(const char* list, - uint64_t length, - uint64_t block_size = 0); - virtual ~SeekableArrayInputStream() override; - virtual bool Next(const void** data, int*size) override; - virtual void BackUp(int count) override; - virtual bool Skip(int count) override; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_INPUTSTREAM_HH +#define ORC_INPUTSTREAM_HH + +#include "Adaptor.hh" +#include "orc/OrcFile.hh" +#include "wrap/zero-copy-stream-wrapper.h" + +#include <list> +#include <fstream> +#include <iostream> +#include <sstream> +#include <vector> + +namespace orc { + + void printBuffer(std::ostream& out, + const char *buffer, + uint64_t length); + + class PositionProvider { + private: + std::list<uint64_t>::const_iterator position; + public: + PositionProvider(const std::list<uint64_t>& positions); + uint64_t next(); + }; + + /** + * A subclass of Google's ZeroCopyInputStream that supports seek. + * By extending Google's class, we get the ability to pass it directly + * to the protobuf readers. + */ + class SeekableInputStream: public google::protobuf::io::ZeroCopyInputStream { + public: + virtual ~SeekableInputStream(); + virtual void seek(PositionProvider& position) = 0; + virtual std::string getName() const = 0; + }; + + /** + * Create a seekable input stream based on a memory range. 
+ */ + class SeekableArrayInputStream: public SeekableInputStream { + private: + const char* data; + uint64_t length; + uint64_t position; + uint64_t blockSize; + + public: + SeekableArrayInputStream(const unsigned char* list, + uint64_t length, + uint64_t block_size = 0); + SeekableArrayInputStream(const char* list, + uint64_t length, + uint64_t block_size = 0); + virtual ~SeekableArrayInputStream() override; + virtual bool Next(const void** data, int*size) override; + virtual void BackUp(int count) override; + virtual bool Skip(int count) override; virtual int64_t ByteCount() const override; - virtual void seek(PositionProvider& position) override; - virtual std::string getName() const override; - }; - - /** - * Create a seekable input stream based on an input stream. - */ - class SeekableFileInputStream: public SeekableInputStream { - private: - MemoryPool& pool; - InputStream* const input; - const uint64_t start; - const uint64_t length; - const uint64_t blockSize; - std::unique_ptr<DataBuffer<char> > buffer; - uint64_t position; - uint64_t pushBack; - - public: - SeekableFileInputStream(InputStream* input, - uint64_t offset, - uint64_t byteCount, - MemoryPool& pool, - uint64_t blockSize = 0); - virtual ~SeekableFileInputStream() override; - - virtual bool Next(const void** data, int*size) override; - virtual void BackUp(int count) override; - virtual bool Skip(int count) override; - virtual int64_t ByteCount() const override; - virtual void seek(PositionProvider& position) override; - virtual std::string getName() const override; - }; - -} - -#endif //ORC_INPUTSTREAM_HH + virtual void seek(PositionProvider& position) override; + virtual std::string getName() const override; + }; + + /** + * Create a seekable input stream based on an input stream. + */ + class SeekableFileInputStream: public SeekableInputStream { + private: + MemoryPool& pool; + InputStream* const input; + const uint64_t start; + const uint64_t length; + const uint64_t blockSize; + std::unique_ptr<DataBuffer<char> > buffer; + uint64_t position; + uint64_t pushBack; + + public: + SeekableFileInputStream(InputStream* input, + uint64_t offset, + uint64_t byteCount, + MemoryPool& pool, + uint64_t blockSize = 0); + virtual ~SeekableFileInputStream() override; + + virtual bool Next(const void** data, int*size) override; + virtual void BackUp(int count) override; + virtual bool Skip(int count) override; + virtual int64_t ByteCount() const override; + virtual void seek(PositionProvider& position) override; + virtual std::string getName() const override; + }; + +} + +#endif //ORC_INPUTSTREAM_HH diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc index 11a21c0bd3..dd9327adf9 100644 --- a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc +++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc @@ -1,147 +1,147 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "orc/Exceptions.hh" -#include "OutputStream.hh" - -#include <sstream> - -namespace orc { - - PositionRecorder::~PositionRecorder() { - // PASS - } - - BufferedOutputStream::BufferedOutputStream( - MemoryPool& pool, - OutputStream * outStream, - uint64_t capacity_, - uint64_t blockSize_) - : outputStream(outStream), - blockSize(blockSize_) { - dataBuffer.reset(new DataBuffer<char>(pool)); - dataBuffer->reserve(capacity_); - } - - BufferedOutputStream::~BufferedOutputStream() { - // PASS - } - - bool BufferedOutputStream::Next(void** buffer, int* size) { - *size = static_cast<int>(blockSize); - uint64_t oldSize = dataBuffer->size(); - uint64_t newSize = oldSize + blockSize; - uint64_t newCapacity = dataBuffer->capacity(); - while (newCapacity < newSize) { - newCapacity += dataBuffer->capacity(); - } - dataBuffer->reserve(newCapacity); - dataBuffer->resize(newSize); - *buffer = dataBuffer->data() + oldSize; - return true; - } - - void BufferedOutputStream::BackUp(int count) { - if (count >= 0) { - uint64_t unsignedCount = static_cast<uint64_t>(count); - if (unsignedCount <= dataBuffer->size()) { - dataBuffer->resize(dataBuffer->size() - unsignedCount); - } else { - throw std::logic_error("Can't backup that much!"); - } - } - } - +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Exceptions.hh" +#include "OutputStream.hh" + +#include <sstream> + +namespace orc { + + PositionRecorder::~PositionRecorder() { + // PASS + } + + BufferedOutputStream::BufferedOutputStream( + MemoryPool& pool, + OutputStream * outStream, + uint64_t capacity_, + uint64_t blockSize_) + : outputStream(outStream), + blockSize(blockSize_) { + dataBuffer.reset(new DataBuffer<char>(pool)); + dataBuffer->reserve(capacity_); + } + + BufferedOutputStream::~BufferedOutputStream() { + // PASS + } + + bool BufferedOutputStream::Next(void** buffer, int* size) { + *size = static_cast<int>(blockSize); + uint64_t oldSize = dataBuffer->size(); + uint64_t newSize = oldSize + blockSize; + uint64_t newCapacity = dataBuffer->capacity(); + while (newCapacity < newSize) { + newCapacity += dataBuffer->capacity(); + } + dataBuffer->reserve(newCapacity); + dataBuffer->resize(newSize); + *buffer = dataBuffer->data() + oldSize; + return true; + } + + void BufferedOutputStream::BackUp(int count) { + if (count >= 0) { + uint64_t unsignedCount = static_cast<uint64_t>(count); + if (unsignedCount <= dataBuffer->size()) { + dataBuffer->resize(dataBuffer->size() - unsignedCount); + } else { + throw std::logic_error("Can't backup that much!"); + } + } + } + int64_t BufferedOutputStream::ByteCount() const { - return static_cast<google::protobuf::int64>(dataBuffer->size()); - } - - bool BufferedOutputStream::WriteAliasedRaw(const void *, int) { - throw NotImplementedYet("WriteAliasedRaw is not supported."); - } - - bool BufferedOutputStream::AllowsAliasing() const { - return false; - } - - std::string BufferedOutputStream::getName() const { - std::ostringstream result; - result << "BufferedOutputStream " << dataBuffer->size() << " of " - << dataBuffer->capacity(); - return result.str(); - } - - uint64_t BufferedOutputStream::getSize() const { - return dataBuffer->size(); - } - - uint64_t BufferedOutputStream::flush() { - uint64_t dataSize = dataBuffer->size(); - outputStream->write(dataBuffer->data(), dataSize); - dataBuffer->resize(0); - return dataSize; - } - - void AppendOnlyBufferedStream::write(const char * data, size_t size) { - size_t dataOffset = 0; - while (size > 0) { - if (bufferOffset == bufferLength) { - if (!outStream->Next( - reinterpret_cast<void **>(&buffer), - &bufferLength)) { - throw std::logic_error("Failed to allocate buffer."); - } - bufferOffset = 0; - } - size_t len = std::min( - static_cast<size_t>(bufferLength - bufferOffset), - size); - memcpy(buffer + bufferOffset, data + dataOffset, len); - bufferOffset += static_cast<int>(len); - dataOffset += len; - size -= len; - } - } - - uint64_t AppendOnlyBufferedStream::getSize() const { - return outStream->getSize(); - } - - uint64_t AppendOnlyBufferedStream::flush() { - outStream->BackUp(bufferLength - bufferOffset); - bufferOffset = bufferLength = 0; - buffer = nullptr; - return outStream->flush(); - } - - void AppendOnlyBufferedStream::recordPosition(PositionRecorder* recorder) const { - uint64_t flushedSize = outStream->getSize(); - uint64_t unflushedSize = static_cast<uint64_t>(bufferOffset); - if (outStream->isCompressed()) { - // start of the compression chunk in the stream - recorder->add(flushedSize); - // number of decompressed bytes that need to be consumed - recorder->add(unflushedSize); - } else { - flushedSize -= static_cast<uint64_t>(bufferLength); - // byte offset of the start location - recorder->add(flushedSize + unflushedSize); - } - } - -} + return static_cast<google::protobuf::int64>(dataBuffer->size()); + } + + 
bool BufferedOutputStream::WriteAliasedRaw(const void *, int) { + throw NotImplementedYet("WriteAliasedRaw is not supported."); + } + + bool BufferedOutputStream::AllowsAliasing() const { + return false; + } + + std::string BufferedOutputStream::getName() const { + std::ostringstream result; + result << "BufferedOutputStream " << dataBuffer->size() << " of " + << dataBuffer->capacity(); + return result.str(); + } + + uint64_t BufferedOutputStream::getSize() const { + return dataBuffer->size(); + } + + uint64_t BufferedOutputStream::flush() { + uint64_t dataSize = dataBuffer->size(); + outputStream->write(dataBuffer->data(), dataSize); + dataBuffer->resize(0); + return dataSize; + } + + void AppendOnlyBufferedStream::write(const char * data, size_t size) { + size_t dataOffset = 0; + while (size > 0) { + if (bufferOffset == bufferLength) { + if (!outStream->Next( + reinterpret_cast<void **>(&buffer), + &bufferLength)) { + throw std::logic_error("Failed to allocate buffer."); + } + bufferOffset = 0; + } + size_t len = std::min( + static_cast<size_t>(bufferLength - bufferOffset), + size); + memcpy(buffer + bufferOffset, data + dataOffset, len); + bufferOffset += static_cast<int>(len); + dataOffset += len; + size -= len; + } + } + + uint64_t AppendOnlyBufferedStream::getSize() const { + return outStream->getSize(); + } + + uint64_t AppendOnlyBufferedStream::flush() { + outStream->BackUp(bufferLength - bufferOffset); + bufferOffset = bufferLength = 0; + buffer = nullptr; + return outStream->flush(); + } + + void AppendOnlyBufferedStream::recordPosition(PositionRecorder* recorder) const { + uint64_t flushedSize = outStream->getSize(); + uint64_t unflushedSize = static_cast<uint64_t>(bufferOffset); + if (outStream->isCompressed()) { + // start of the compression chunk in the stream + recorder->add(flushedSize); + // number of decompressed bytes that need to be consumed + recorder->add(unflushedSize); + } else { + flushedSize -= static_cast<uint64_t>(bufferLength); + // byte offset of the start location + recorder->add(flushedSize + unflushedSize); + } + } + +} diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh index 7ce9fafa24..e40263fdfb 100644 --- a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh +++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh @@ -1,96 +1,96 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_OUTPUTSTREAM_HH -#define ORC_OUTPUTSTREAM_HH - -#include "Adaptor.hh" -#include "orc/OrcFile.hh" -#include "wrap/zero-copy-stream-wrapper.h" - -namespace orc { - - /** - * Record write position for creating index stream - */ - class PositionRecorder { - public: - virtual ~PositionRecorder(); - virtual void add(uint64_t pos) = 0; - }; - - /** - * A subclass of Google's ZeroCopyOutputStream that supports output to memory - * buffer, and flushing to OutputStream. - * By extending Google's class, we get the ability to pass it directly - * to the protobuf writers. - */ - class BufferedOutputStream: public google::protobuf::io::ZeroCopyOutputStream { - private: - OutputStream * outputStream; - std::unique_ptr<DataBuffer<char> > dataBuffer; - uint64_t blockSize; - - public: - BufferedOutputStream(MemoryPool& pool, - OutputStream * outStream, - uint64_t capacity, - uint64_t block_size); - virtual ~BufferedOutputStream() override; - - virtual bool Next(void** data, int*size) override; - virtual void BackUp(int count) override; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_OUTPUTSTREAM_HH +#define ORC_OUTPUTSTREAM_HH + +#include "Adaptor.hh" +#include "orc/OrcFile.hh" +#include "wrap/zero-copy-stream-wrapper.h" + +namespace orc { + + /** + * Record write position for creating index stream + */ + class PositionRecorder { + public: + virtual ~PositionRecorder(); + virtual void add(uint64_t pos) = 0; + }; + + /** + * A subclass of Google's ZeroCopyOutputStream that supports output to memory + * buffer, and flushing to OutputStream. + * By extending Google's class, we get the ability to pass it directly + * to the protobuf writers. + */ + class BufferedOutputStream: public google::protobuf::io::ZeroCopyOutputStream { + private: + OutputStream * outputStream; + std::unique_ptr<DataBuffer<char> > dataBuffer; + uint64_t blockSize; + + public: + BufferedOutputStream(MemoryPool& pool, + OutputStream * outStream, + uint64_t capacity, + uint64_t block_size); + virtual ~BufferedOutputStream() override; + + virtual bool Next(void** data, int*size) override; + virtual void BackUp(int count) override; virtual int64_t ByteCount() const override; - virtual bool WriteAliasedRaw(const void * data, int size) override; - virtual bool AllowsAliasing() const override; - - virtual std::string getName() const; - virtual uint64_t getSize() const; - virtual uint64_t flush(); - - virtual bool isCompressed() const { return false; } - }; - - /** - * An append only buffered stream that allows - * buffer, and flushing to OutputStream. - * By extending Google's class, we get the ability to pass it directly - * to the protobuf writers. 
- */ - class AppendOnlyBufferedStream { - private: - std::unique_ptr<BufferedOutputStream> outStream; - char * buffer; - int bufferOffset, bufferLength; - - public: - AppendOnlyBufferedStream(std::unique_ptr<BufferedOutputStream> _outStream) : - outStream(std::move(_outStream)) { - buffer = nullptr; - bufferOffset = bufferLength = 0; - } - - void write(const char * data, size_t size); - uint64_t getSize() const; - uint64_t flush(); - - void recordPosition(PositionRecorder* recorder) const; - }; -} - -#endif // ORC_OUTPUTSTREAM_HH + virtual bool WriteAliasedRaw(const void * data, int size) override; + virtual bool AllowsAliasing() const override; + + virtual std::string getName() const; + virtual uint64_t getSize() const; + virtual uint64_t flush(); + + virtual bool isCompressed() const { return false; } + }; + + /** + * An append only buffered stream that allows + * buffer, and flushing to OutputStream. + * By extending Google's class, we get the ability to pass it directly + * to the protobuf writers. + */ + class AppendOnlyBufferedStream { + private: + std::unique_ptr<BufferedOutputStream> outStream; + char * buffer; + int bufferOffset, bufferLength; + + public: + AppendOnlyBufferedStream(std::unique_ptr<BufferedOutputStream> _outStream) : + outStream(std::move(_outStream)) { + buffer = nullptr; + bufferOffset = bufferLength = 0; + } + + void write(const char * data, size_t size); + uint64_t getSize() const; + uint64_t flush(); + + void recordPosition(PositionRecorder* recorder) const; + }; +} + +#endif // ORC_OUTPUTSTREAM_HH diff --git a/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h index 605fbf826c..8d1eab50b4 100644 --- a/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h +++ b/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h @@ -1,35 +1,35 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef CODED_STREAM_WRAPPER_HH -#define CODED_STREAM_WRAPPER_HH - -#include "Adaptor.hh" - -DIAGNOSTIC_PUSH - -#ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") - DIAGNOSTIC_IGNORE("-Wreserved-id-macro") -#endif - -#if defined(__GNUC__) || defined(__clang__) - DIAGNOSTIC_IGNORE("-Wconversion") -#endif - +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CODED_STREAM_WRAPPER_HH +#define CODED_STREAM_WRAPPER_HH + +#include "Adaptor.hh" + +DIAGNOSTIC_PUSH + +#ifdef __clang__ + DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") + DIAGNOSTIC_IGNORE("-Wreserved-id-macro") +#endif + +#if defined(__GNUC__) || defined(__clang__) + DIAGNOSTIC_IGNORE("-Wconversion") +#endif + #include <google/protobuf/io/coded_stream.h> - -DIAGNOSTIC_POP - -#endif + +DIAGNOSTIC_POP + +#endif diff --git a/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh b/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh index 5c161660cc..dc8e9de7f6 100644 --- a/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh +++ b/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh @@ -1,47 +1,47 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_PROTO_WRAPPER_HH -#define ORC_PROTO_WRAPPER_HH - -#include "Adaptor.hh" - -DIAGNOSTIC_PUSH - -#if defined(__GNUC__) || defined(__clang__) - DIAGNOSTIC_IGNORE("-Wconversion") - DIAGNOSTIC_IGNORE("-Wdeprecated") - DIAGNOSTIC_IGNORE("-Wsign-conversion") - DIAGNOSTIC_IGNORE("-Wunused-parameter") -#endif - -#ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wnested-anon-types") - DIAGNOSTIC_IGNORE("-Wreserved-id-macro") - DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") - DIAGNOSTIC_IGNORE("-Wunknown-warning-option") - DIAGNOSTIC_IGNORE("-Wweak-vtables") - DIAGNOSTIC_IGNORE("-Wzero-as-null-pointer-constant") -#endif - -#if defined(_MSC_VER) - DIAGNOSTIC_IGNORE(4146) // unary minus operator applied to unsigned type, result still unsigned - DIAGNOSTIC_IGNORE(4800) // forcing value to bool 'true' or 'false' -#endif - +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_PROTO_WRAPPER_HH +#define ORC_PROTO_WRAPPER_HH + +#include "Adaptor.hh" + +DIAGNOSTIC_PUSH + +#if defined(__GNUC__) || defined(__clang__) + DIAGNOSTIC_IGNORE("-Wconversion") + DIAGNOSTIC_IGNORE("-Wdeprecated") + DIAGNOSTIC_IGNORE("-Wsign-conversion") + DIAGNOSTIC_IGNORE("-Wunused-parameter") +#endif + +#ifdef __clang__ + DIAGNOSTIC_IGNORE("-Wnested-anon-types") + DIAGNOSTIC_IGNORE("-Wreserved-id-macro") + DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") + DIAGNOSTIC_IGNORE("-Wunknown-warning-option") + DIAGNOSTIC_IGNORE("-Wweak-vtables") + DIAGNOSTIC_IGNORE("-Wzero-as-null-pointer-constant") +#endif + +#if defined(_MSC_VER) + DIAGNOSTIC_IGNORE(4146) // unary minus operator applied to unsigned type, result still unsigned + DIAGNOSTIC_IGNORE(4800) // forcing value to bool 'true' or 'false' +#endif + #include "contrib/libs/apache/orc/proto/orc_proto.pb.h" - -DIAGNOSTIC_POP - -#endif + +DIAGNOSTIC_POP + +#endif diff --git a/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h index aeab0f0033..497ae6f508 100644 --- a/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h +++ b/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h @@ -1,30 +1,30 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef SNAPPY_WRAPPER_HH -#define SNAPPY_WRAPPER_HH - -#include "Adaptor.hh" - -DIAGNOSTIC_PUSH - -#ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wreserved-id-macro") -#endif - -#include <snappy.h> - -DIAGNOSTIC_POP - -#endif +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SNAPPY_WRAPPER_HH +#define SNAPPY_WRAPPER_HH + +#include "Adaptor.hh" + +DIAGNOSTIC_PUSH + +#ifdef __clang__ + DIAGNOSTIC_IGNORE("-Wreserved-id-macro") +#endif + +#include <snappy.h> + +DIAGNOSTIC_POP + +#endif diff --git a/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h index 1af0bd002d..7cf1491d3d 100644 --- a/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h +++ b/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h @@ -1,36 +1,36 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ZERO_COPY_STREAM_WRAPPER_HH -#define ZERO_COPY_STREAM_WRAPPER_HH - -#include "Adaptor.hh" - -DIAGNOSTIC_PUSH - -#if defined(__GNUC__) || defined(__clang__) - DIAGNOSTIC_IGNORE("-Wdeprecated") - DIAGNOSTIC_IGNORE("-Wpadded") - DIAGNOSTIC_IGNORE("-Wunused-parameter") -#endif - -#ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wreserved-id-macro") -#endif - +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ZERO_COPY_STREAM_WRAPPER_HH +#define ZERO_COPY_STREAM_WRAPPER_HH + +#include "Adaptor.hh" + +DIAGNOSTIC_PUSH + +#if defined(__GNUC__) || defined(__clang__) + DIAGNOSTIC_IGNORE("-Wdeprecated") + DIAGNOSTIC_IGNORE("-Wpadded") + DIAGNOSTIC_IGNORE("-Wunused-parameter") +#endif + +#ifdef __clang__ + DIAGNOSTIC_IGNORE("-Wreserved-id-macro") +#endif + #include <google/protobuf/io/zero_copy_stream.h> - -DIAGNOSTIC_POP - -#endif + +DIAGNOSTIC_POP + +#endif |
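The patch above restores the seekable zero-copy stream classes used throughout the ORC C++ reader and writer. As a reading aid only (it is not part of the patch), the sketch below exercises the SeekableArrayInputStream contract exactly as the restored code defines it: Next() hands out up to blockSize bytes without copying, ByteCount() reports how many bytes have been consumed, seek() repositions via a PositionProvider, and BackUp() returns part of the last block. The include path "io/InputStream.hh" and linking against a local build of the vendored library are assumptions; this is an internal header, so the exact path depends on your build layout.

    // Illustrative sketch, assuming the vendored ORC sources are built locally
    // and c++/src is on the include path. Not part of the patch itself.
    #include <cassert>
    #include <cstdint>
    #include <iostream>
    #include <list>

    #include "io/InputStream.hh"   // internal ORC header; path is an assumption

    int main() {
      const char text[] = "0123456789ABCDEF";
      // Serve the 16-byte buffer in 4-byte blocks (third argument is blockSize).
      orc::SeekableArrayInputStream stream(text, sizeof(text) - 1, 4);

      const void* chunk = nullptr;
      int size = 0;

      // Next() exposes up to blockSize bytes per call without copying.
      while (stream.Next(&chunk, &size)) {
        std::cout << "got " << size << " bytes at offset "
                  << (stream.ByteCount() - size) << "\n";
      }
      assert(stream.ByteCount() == 16);

      // seek() repositions the stream using a PositionProvider over a
      // std::list of byte offsets, as in the restored implementation.
      std::list<uint64_t> positions = {8};
      orc::PositionProvider provider(positions);
      stream.seek(provider);

      stream.Next(&chunk, &size);   // bytes 8..11 of the buffer
      stream.BackUp(2);             // give back the last 2 bytes of that block
      std::cout << "byte count after backup: " << stream.ByteCount() << "\n";
      return 0;
    }

Because these classes subclass google::protobuf::io::ZeroCopyInputStream and ZeroCopyOutputStream, the same objects can be handed directly to the protobuf readers and writers that parse and serialize the ORC footer and stripe metadata, which is the design rationale stated in the header comments restored by this patch.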